From b0c3013f2ea2c82a43248e43a0abfaebd5bb105a Mon Sep 17 00:00:00 2001
From: "zhou.weiguo"
Date: Wed, 24 Apr 2024 16:28:18 +0800
Subject: [PATCH 001/143] ggml: add Qualcomm QNN (Qualcomm Neural Network, aka Qualcomm AI Engine Direct) backend

---
 ggml-qnn.cpp | 4874 ++++++++++++++++++++++++++++++++++++++++++++++++++
 ggml-qnn.h   |   55 +
 ggml.c       |    3 +-
 llama.cpp    |   30 +-
 4 files changed, 4960 insertions(+), 2 deletions(-)
 create mode 100644 ggml-qnn.cpp
 create mode 100644 ggml-qnn.h

diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp
new file mode 100644
index 0000000000000..5d698f184c25d
--- /dev/null
+++ b/ggml-qnn.cpp
@@ -0,0 +1,4874 @@
+/*
+ * MIT license
+ * Copyright (C) 2024 GGML Authors
+ * SPDX-License-Identifier: MIT
+ *
+ * this is the implementation of the ggml QNN (Qualcomm Neural Network, aka AI Engine Direct) backend
+ *
+ * status:
+ *
+ * 1. core implementation (the data path works as expected with whisper.cpp using the QNN CPU/GPU backend on a low-end phone with a Qualcomm SoC)
+ *
+ * 2. core implementation (the data path works as expected with whisper.cpp using the QNN HTP (aka DSP) backend on a high-end phone with a Qualcomm SoC)
+ *
+ * 3. core implementation (the data path works as expected with llama.cpp using the QNN CPU/GPU/HTP (aka DSP) backend on a high-end phone with a Qualcomm SoC)
+ *
+ * 4. GGML_OP_MUL_MAT, GGML_OP_MUL and GGML_OP_ADD using the QNN API have been completed
+ *
+ * todo:
+ *
+ * 1. the other GGML OPs are not yet implemented using the QNN API
+ *
+ * 2. only FP32 / FP16 are supported, and the input and output tensors must be of the same data type
+ *
+ * 3. QNN's RPC feature (which is useful for the QNN HTP (aka DSP) backend) is not used yet
+ *
+ * 4. using multiple QNN backends (CPU/GPU/DSP) simultaneously is not supported
+ *
+ * 5. multithreading does not work with the QNN GPU/HTP (aka DSP) backend
+ *
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "QnnTypes.h"
+#include "QnnCommon.h"
+#include "QnnContext.h"
+#include "QnnBackend.h"
+#include "QnnGraph.h"
+#include "QnnProperty.h"
+#include "QnnTensor.h"
+#include "QnnInterface.h"
+#include "Saver/QnnSaver.h"
+#include "System/QnnSystemInterface.h"
+#include "HTP/QnnHtpDevice.h"
+
+#include "ggml-qnn.h"
+
+#include "ggml-backend-impl.h"
+
+
+// =================================================================================================
+//
+// forward/external/helper declaration
+//
+// =================================================================================================
+class qnn_instance;
+
+//TODO: should be removed because this is a workaround during the development stage
+extern "C" void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
+
+#if (defined __ANDROID__) || (defined ANDROID) // Qualcomm's QNN can also run on Windows on ARM (aka WoA)
+extern "C" int __android_log_print(int prio, const char * tag, const char * fmt, ...)
+__attribute__((__format__(printf, 3, 4))); +#endif + +static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...); + + + +// ================================================================================================= +// +// self-defined macro / data structure +// +// ================================================================================================= +#define RPCMEM_DEFAULT_FLAGS 1 +#define RPCMEM_HEAP_ID_SYSTEM 25 + +#define GGML_DUMP_TENSOR(tensor) ggml_tensor_dump(tensor, #tensor) + +#define GGML_QNN_LOGBUF_LEN 4096 +#define GGML_QNN_MAX_BUFFERS 128 +#define MATRIX_ROW_PADDING 512 + +#define BUF_MAJOR_MASK 0xFF000000 +#define BUF_CONTROL_BASE 0xEE000000 + +#define GGML_QNN_DEBUG 1 + +#define QNN_LOG_ERROR(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#define QNN_LOG_WARN(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#define QNN_LOG_INFO(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) + +#if GGML_QNN_DEBUG +#define QNN_LOG_DEBUG(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#else +#define QNN_LOG_DEBUG(...) +#endif + + +#define VALIDATE(value, status) \ + do { \ + status = value; \ + if (status != QNN_SUCCESS) { \ + QNN_LOG_WARN("%s expected QNN_SUCCESS\n", #value); \ + return status; \ + } \ + } while (0) + +#define VALIDATE_TENSOR_VERSION(tensor, err) VALIDATE(validate_tensor_version(tensor), err) + +#define VALIDATE_OP_CONFIG_VERSION(op, err) VALIDATE(validate_opconfig_version(op), err) + +#define QNN_VER_PTR(x) (&((x).v1)) +#define QNN_OP_CFG_VALID(opConfig) ((opConfig).version == QNN_OPCONFIG_VERSION_1) + +#define QNN_OP_CFG_GET_NAME(opConfig) get_qnn_oponfig_name(opConfig) +#define QNN_OP_CFG_GET_PACKAGE_NAME(opConfig) get_qnn_opconfig_packagename(opConfig) +#define QNN_OP_CFG_GET_TYPE_NAME(opConfig) get_qnn_opconfig_typename(opConfig) +#define QNN_OP_CFG_GET_NUM_PARAMS(opConfig) get_qnn_opconfig_numparams(opConfig) +#define QNN_OP_CFG_GET_PARAMS(opConfig) get_qnn_opconfig_params(opConfig) +#define QNN_OP_CFG_GET_NUM_INPUTS(opConfig) get_qnn_opconfig_numinputs(opConfig) +#define QNN_OP_CFG_GET_INPUTS(opConfig) get_qnn_opconfig_inputs(opConfig) +#define QNN_OP_CFG_GET_NUM_OUTPUTS(opConfig) get_qnn_opconfig_numoutputs(opConfig) +#define QNN_OP_CFG_GET_OUTPUTS(opConfig) get_qnn_opconfig_outputs(opConfig) + +#define QNN_OP_CFG_SET_NAME(opConfig, value) set_qnn_opconfig_name(opConfig, value) +#define QNN_OP_CFG_SET_PACKAGE_NAME(opConfig, value) set_qnn_opconfig_packagename(opConfig, value) +#define QNN_OP_CFG_SET_TYPE_NAME(opConfig, value) set_qnn_opconfig_typename(opConfig, value) + +#define QNN_OP_CFG_SET_PARAMS(opConfig, numOfParams, params) \ + set_qnn_opconfig_params(opConfig, numOfParams, params) + +#define QNN_OP_CFG_SET_INPUTS(opConfig, numOfInputs, inputTensors) \ + set_qnn_opconfig_inputs(opConfig, numOfInputs, inputTensors) + +#define QNN_OP_CFG_SET_OUTPUTS(opConfig, numOfOutputs, outputTensors) \ + set_qnn_opconfig_outputs(opConfig, numOfOutputs, outputTensors) + +#define QNN_TENSOR_GET_ID(tensor) get_qnn_tensorid(tensor) +#define QNN_TENSOR_GET_NAME(tensor) get_qnn_tensorname(tensor) +#define QNN_TENSOR_GET_TYPE(tensor) get_qnn_tensortype(tensor) +#define QNN_TENSOR_GET_DATA_FORMAT(tensor) get_qnn_tensor_dataformat(tensor) +#define QNN_TENSOR_GET_DATA_TYPE(tensor) 
get_qnn_tensor_datatype(tensor) +#define QNN_TENSOR_GET_QUANT_PARAMS(tensor) get_qnn_tensor_quantparams(tensor) +#define QNN_TENSOR_GET_RANK(tensor) get_qnn_tensor_rank(tensor) +#define QNN_TENSOR_GET_DIMENSIONS(tensor) get_qnn_tensor_dimensions(tensor) +#define QNN_TENSOR_GET_MEM_TYPE(tensor) get_qnn_tensor_memtype(tensor) +#define QNN_TENSOR_GET_CLIENT_BUF(tensor) get_qnn_tensor_clientbuf(tensor) +#define QNN_TENSOR_GET_MEM_HANDLE(tensor) get_qnn_tensor_memhandle(tensor) + +#define QNN_TENSOR_SET_ID(tensor, value) set_qnn_tensor_id(tensor, value) +#define QNN_TENSOR_SET_NAME(tensor, value) set_qnn_tensor_name(tensor, value) +#define QNN_TENSOR_SET_TYPE(tensor, value) set_qnn_tensor_type(tensor, value) +#define QNN_TENSOR_SET_DATA_FORMAT(tensor, value) set_qnn_tensor_dataformat(tensor, value) +#define QNN_TENSOR_SET_DATA_TYPE(tensor, value) set_qnn_tensor_datatype(tensor, value) +#define QNN_TENSOR_SET_QUANT_PARAMS(tensor, value) set_qnn_tensor_quantparams(tensor, value) +#define QNN_TENSOR_SET_RANK(tensor, value) set_qnn_tensor_rank(tensor, value) +#define QNN_TENSOR_SET_DIMENSIONS(tensor, value) set_qnn_tensor_dimensions(tensor, value) +#define QNN_TENSOR_SET_MEM_TYPE(tensor, value) set_qnn_tensor_memtype(tensor, value) +#define QNN_TENSOR_SET_CLIENT_BUF(tensor, value) set_qnn_tensor_clientbuf(tensor, value) +#define QNN_TENSOR_SET_MEM_HANDLE(tensor, value) set_qnn_tensor_memhandle(tensor, value) + + + +using pfn_rpc_mem_init = void (*)(void); +using pfn_rpc_mem_deinit = void (*)(void); +using pfn_rpc_mem_alloc = void *(*)(int, uint32_t, int); +using pfn_rpc_mem_free = void (*)(void *); +using pfn_rpc_mem_to_fd = int (*)(void *); + +using _pfn_QnnSaver_initialize = decltype(QnnSaver_initialize); +using _pfn_QnnInterface_getProviders = decltype(QnnInterface_getProviders); +using _pfn_QnnSystemInterface_getProviders = decltype(QnnSystemInterface_getProviders); + + + +typedef struct qnn_buf_s qnn_buf_t; +typedef struct qnn_buf_s qnn_buf_buffer_t; +typedef struct buf_element_s buf_element_t; +typedef void (*ggml_qnn_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); +typedef void (*ggml_qnn_func_common_t)(const ggml_op ggmlop, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); + +enum class ggml_qnn_profile_level { + profile_off = 0, + profile_basic = 1, + profile_detail = 2 +}; + + +struct buf_element_s { + buf_element_t * next; + + unsigned char * mem; + unsigned char * content; /* start of raw content in mem */ + + uint32_t size ; /* size of content */ + int32_t max_size; /* size of pre-allocated memory pointed to by mem */ + uint32_t type; + void (*free_buffer) (buf_element_t * buf); + void * source; /* CPU, GPU, DSP, ... 
*/ + int id; +} ; + + +struct qnn_buf_s { + buf_element_t * first, * last; + + size_t qnn_buf_size; + uint32_t qnn_buf_data_size; + void * qnn_buf_empty_cb_data; + const char * name; + + pthread_mutex_t mutex; + pthread_cond_t not_empty; + + void (*put) (qnn_buf_t * fifo, buf_element_t * buf); + + buf_element_t *(*get) (qnn_buf_t * fifo); + + void (*clear) (qnn_buf_t * fifo) ; + + int (*size) (qnn_buf_t * fifo); + + int (*num_free) (qnn_buf_t * fifo); + + uint32_t (*data_size) (qnn_buf_t * fifo); + + void (*destroy) (qnn_buf_t * fifo); + + buf_element_t * (*buffer_alloc) (qnn_buf_t * self); + + buf_element_t * (*buffer_try_alloc) (qnn_buf_t * self); + + buf_element_t * buffer_pool_top; + pthread_mutex_t buffer_pool_mutex; + pthread_cond_t buffer_pool_cond_not_empty; + int buffer_pool_num_free; + int buffer_pool_capacity; + int buffer_pool_buf_size; + void * buffer_pool_base; /* used to free mem pool */ +} ; + + +struct ggml_backend_qnn_context { + int device; + int threads; + char name[GGML_MAX_NAME]; + char lib[GGML_MAX_NAME]; + qnn_instance * instance; + qnn_buf_t * buffer_pool; + struct ggml_backend * backend; + QNN_INTERFACE_VER_TYPE raw_interface; + QNN_SYSTEM_INTERFACE_VER_TYPE raw_system_interface; +} ; + + +// ================================================================================================= +// +// static global variables +// +// ================================================================================================= +//TODO: should be removed for support multi QNN backend simultaneously +static ggml_backend_t g_qnn_backend = nullptr; + +//TODO: should be removed for support multi QNN backend simultaneously +static int g_current_device = 3; // 3 is the default ggml backend + +static bool GGML_OP_HAS_INIT [GGML_OP_COUNT] = { 0 }; +static bool GGML_OP_HAS_FINALIZE[GGML_OP_COUNT] = { 0 }; +static void ggml_setup_op_has_task_pass(void) { + { // INIT + bool * p = GGML_OP_HAS_INIT; + + p[GGML_OP_ACC ] = true; + p[GGML_OP_MUL_MAT ] = true; + p[GGML_OP_MUL_MAT_ID ] = true; + p[GGML_OP_OUT_PROD ] = true; + p[GGML_OP_SET ] = true; + p[GGML_OP_GET_ROWS_BACK ] = true; + p[GGML_OP_DIAG_MASK_INF ] = true; + p[GGML_OP_DIAG_MASK_ZERO ] = true; + p[GGML_OP_CONV_TRANSPOSE_1D ] = true; + p[GGML_OP_CONV_TRANSPOSE_2D ] = true; + p[GGML_OP_FLASH_ATTN_BACK ] = true; + p[GGML_OP_CROSS_ENTROPY_LOSS ] = true; + p[GGML_OP_ADD_REL_POS ] = true; + } + + { // FINALIZE + bool * p = GGML_OP_HAS_FINALIZE; + + p[GGML_OP_CROSS_ENTROPY_LOSS ] = true; + } +} + + +//QNN cDSP and HTA backend would not be used currently, just focus on QNN CPU/GPU/HTP(aka DSP) backend currently +static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { + [QNN_CPU] = {.device = 0, .threads = 1, .name = "qnn-cpu", .lib = "libQnnCpu.so", .instance = nullptr, .buffer_pool = nullptr, .backend = nullptr, .raw_interface = nullptr, .raw_system_interface = nullptr}, + [QNN_GPU] = {.device = 1, .threads = 1, .name = "qnn-gpu", .lib = "libQnnGpu.so", .instance = nullptr, .buffer_pool = nullptr, .backend = nullptr, .raw_interface = nullptr, .raw_system_interface = nullptr}, + [QNN_HTP] = {.device = 2, .threads = 1, .name = "qnn-htp(aka dsp)", .lib = "libQnnHtp.so", .instance = nullptr, .buffer_pool = nullptr, .backend = nullptr, .raw_interface = nullptr, .raw_system_interface = nullptr}, +}; + + + +// ================================================================================================= +// +// internal helper functions +// +// 
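(version-guarded getters and setters for the Qnn_OpConfig_t / Qnn_Tensor_t unions, plus small memory, buffer-pool and logging utilities)
+//
+// 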
================================================================================================= +static inline int validate_tensor_version(Qnn_Tensor_t tensor) { + if (tensor.version != QNN_TENSOR_VERSION_1) { + QNN_LOG_WARN("validate_tensor_version() tensor %s, got unsupported version %d\n", + tensor.v1.name, + tensor.version); + return 1; + } + return 0; +} + + +static inline int validate_opconfig_version(Qnn_OpConfig_t opConfig) { + if (opConfig.version != QNN_OPCONFIG_VERSION_1) { + QNN_LOG_WARN("validate_opconfig_version() op %s, got unsupported version %d\n", + opConfig.v1.name, + opConfig.version); + return 1; + } + return 0; +} + + +static inline const char * get_qnn_oponfig_name(const Qnn_OpConfig_t & opConfig) { + if (opConfig.version == QNN_OPCONFIG_VERSION_1) { + return opConfig.v1.name; + } + return nullptr; +} + + +static inline const char * get_qnn_oponfig_name(const Qnn_OpConfig_t * opConfig) { + return get_qnn_oponfig_name(*opConfig); +} + + +static inline const char * get_qnn_opconfig_packagename(const Qnn_OpConfig_t & opConfig) { + if (opConfig.version == QNN_OPCONFIG_VERSION_1) { + return opConfig.v1.packageName; + } + return nullptr; +} + + +static inline const char * get_qnn_opconfig_packagename(const Qnn_OpConfig_t * opConfig) { + return get_qnn_opconfig_packagename(*opConfig); +} + + +static inline const char * get_qnn_opconfig_typename(const Qnn_OpConfig_t & opConfig) { + if (opConfig.version == QNN_OPCONFIG_VERSION_1) { + return opConfig.v1.typeName; + } + return nullptr; +} + + +static inline const char * get_qnn_opconfig_typename(const Qnn_OpConfig_t * opConfig) { + return get_qnn_opconfig_typename(*opConfig); +} + + +static inline uint32_t get_qnn_opconfig_numparams(const Qnn_OpConfig_t & opConfig) { + if (opConfig.version == QNN_OPCONFIG_VERSION_1) { + return opConfig.v1.numOfParams; + } + return 0u; +} + + +static inline uint32_t get_qnn_opconfig_numparams(const Qnn_OpConfig_t * opConfig) { + return get_qnn_opconfig_numparams(*opConfig); +} + + +static inline const Qnn_Param_t * get_qnn_opconfig_params(const Qnn_OpConfig_t & opConfig) { + if (opConfig.version == QNN_OPCONFIG_VERSION_1) { + return opConfig.v1.params; + } + return nullptr; +} + + +static inline const Qnn_Param_t * get_qnn_opconfig_params(const Qnn_OpConfig_t * opConfig) { + return get_qnn_opconfig_params(*opConfig); +} + + +static inline uint32_t get_qnn_opconfig_numinputs(const Qnn_OpConfig_t & opConfig) { + if (opConfig.version == QNN_OPCONFIG_VERSION_1) { + return opConfig.v1.numOfInputs; + } + return 0u; +} + + +static inline uint32_t get_qnn_opconfig_numinputs(const Qnn_OpConfig_t * opConfig) { + return get_qnn_opconfig_numinputs(*opConfig); +} + + +static inline const Qnn_Tensor_t * get_qnn_opconfig_inputs(const Qnn_OpConfig_t & opConfig) { + if (opConfig.version == QNN_OPCONFIG_VERSION_1) { + return opConfig.v1.inputTensors; + } + return nullptr; +} + + +static inline const Qnn_Tensor_t * get_qnn_opconfig_inputs(const Qnn_OpConfig_t * opConfig) { + return get_qnn_opconfig_inputs(*opConfig); +} + + +static inline uint32_t get_qnn_opconfig_numoutputs(const Qnn_OpConfig_t & opConfig) { + if (opConfig.version == QNN_OPCONFIG_VERSION_1) { + return opConfig.v1.numOfOutputs; + } + return 0u; +} + + +static inline uint32_t get_qnn_opconfig_numoutputs(const Qnn_OpConfig_t * opConfig) { + return get_qnn_opconfig_numoutputs(*opConfig); +} + + +static inline const Qnn_Tensor_t * get_qnn_opconfig_outputs(const Qnn_OpConfig_t & opConfig) { + if (opConfig.version == QNN_OPCONFIG_VERSION_1) { + return 
opConfig.v1.outputTensors; + } + return nullptr; +} + + +static inline const Qnn_Tensor_t * get_qnn_opconfig_outputs(const Qnn_OpConfig_t * opConfig) { + return get_qnn_opconfig_outputs(*opConfig); +} + + +static inline void set_qnn_opconfig_name(Qnn_OpConfig_t & opConfig, const char * name) { + if (opConfig.version == QNN_OPCONFIG_VERSION_1) { + opConfig.v1.name = name; + } +} + + +static inline void set_qnn_opconfig_name(Qnn_OpConfig_t * opConfig, const char * name) { + set_qnn_opconfig_name(*opConfig, name); +} + + +static inline void set_qnn_opconfig_packagename(Qnn_OpConfig_t & opConfig, const char * packageName) { + if (opConfig.version == QNN_OPCONFIG_VERSION_1) { + opConfig.v1.packageName = packageName; + } +} + + +static inline void set_qnn_opconfig_packagename(Qnn_OpConfig_t * opConfig, const char * packageName) { + set_qnn_opconfig_packagename(*opConfig, packageName); +} + + +static inline void set_qnn_opconfig_typename(Qnn_OpConfig_t & opConfig, const char * typeName) { + if (opConfig.version == QNN_OPCONFIG_VERSION_1) { + opConfig.v1.typeName = typeName; + } +} + + +static inline void set_qnn_opconfig_typename(Qnn_OpConfig_t * opConfig, const char * typeName) { + set_qnn_opconfig_typename(*opConfig, typeName); +} + + +static inline void set_qnn_opconfig_params(Qnn_OpConfig_t & opConfig, + uint32_t numOfParams, + Qnn_Param_t * params) { + if (opConfig.version == QNN_OPCONFIG_VERSION_1) { + opConfig.v1.numOfParams = numOfParams; + opConfig.v1.params = params; + } +} + + +static inline void set_qnn_opconfig_params(Qnn_OpConfig_t * opConfig, + uint32_t numOfParams, + Qnn_Param_t * params) { + set_qnn_opconfig_params(*opConfig, numOfParams, params); +} + + +static inline void set_qnn_opconfig_inputs(Qnn_OpConfig_t & opConfig, + uint32_t numOfInputs, + Qnn_Tensor_t * inputTensors) { + if (opConfig.version == QNN_OPCONFIG_VERSION_1) { + opConfig.v1.numOfInputs = numOfInputs; + opConfig.v1.inputTensors = inputTensors; + } +} + + +static inline void set_qnn_opconfig_inputs(Qnn_OpConfig_t * opConfig, + uint32_t numOfInputs, + Qnn_Tensor_t * inputTensors) { + set_qnn_opconfig_inputs(*opConfig, numOfInputs, inputTensors); +} + + +static inline void set_qnn_opconfig_outputs(Qnn_OpConfig_t & opConfig, + uint32_t numOfOutputs, + Qnn_Tensor_t * outputTensors) { + if (opConfig.version == QNN_OPCONFIG_VERSION_1) { + opConfig.v1.numOfOutputs = numOfOutputs; + opConfig.v1.outputTensors = outputTensors; + } +} + + +static inline void set_qnn_opconfig_outputs(Qnn_OpConfig_t * opConfig, + uint32_t numOfOutputs, + Qnn_Tensor_t * outputTensors) { + set_qnn_opconfig_outputs(*opConfig, numOfOutputs, outputTensors); +} + + +static inline uint32_t get_qnn_tensorid(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.id; + } + return 0u; +} + + +static inline uint32_t get_qnn_tensorid(const Qnn_Tensor_t * tensor) { return get_qnn_tensorid(*tensor); } + + +static inline const char * get_qnn_tensorname(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.name; + } + return nullptr; +} + + +static inline const char * get_qnn_tensorname(const Qnn_Tensor_t * tensor) { + return get_qnn_tensorname(*tensor); +} + + +static inline Qnn_TensorType_t get_qnn_tensortype(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.type; + } + return QNN_TENSOR_TYPE_UNDEFINED; +} + + +static inline Qnn_TensorType_t get_qnn_tensortype(const Qnn_Tensor_t * tensor) { + return 
get_qnn_tensortype(*tensor); +} + + +static inline Qnn_TensorDataFormat_t get_qnn_tensor_dataformat(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.dataFormat; + } + return QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER; +} + + +static inline Qnn_TensorDataFormat_t get_qnn_tensor_dataformat(const Qnn_Tensor_t * tensor) { + return get_qnn_tensor_dataformat(*tensor); +} + + +static inline Qnn_DataType_t get_qnn_tensor_datatype(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.dataType; + } + return QNN_DATATYPE_UNDEFINED; +} + + +static inline Qnn_DataType_t get_qnn_tensor_datatype(const Qnn_Tensor_t * tensor) { + return get_qnn_tensor_datatype(*tensor); +} + + +static inline Qnn_QuantizeParams_t get_qnn_tensor_quantparams(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.quantizeParams; + } + return QNN_QUANTIZE_PARAMS_INIT; +} + + +static inline Qnn_QuantizeParams_t get_qnn_tensor_quantparams(const Qnn_Tensor_t * tensor) { + return get_qnn_tensor_quantparams(*tensor); +} + + +static inline uint32_t get_qnn_tensor_rank(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.rank; + } + return 0u; +} + + +static inline uint32_t get_qnn_tensor_rank(const Qnn_Tensor_t * tensor) { return get_qnn_tensor_rank(*tensor); } + + +static inline uint32_t * get_qnn_tensor_dimensions(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.dimensions; + } + return nullptr; +} + + +static inline uint32_t * get_qnn_tensor_dimensions(const Qnn_Tensor_t * tensor) { + return get_qnn_tensor_dimensions(*tensor); +} + + +static inline Qnn_TensorMemType_t get_qnn_tensor_memtype(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.memType; + } + return QNN_TENSORMEMTYPE_UNDEFINED; +} + + +static inline Qnn_TensorMemType_t get_qnn_tensor_memtype(const Qnn_Tensor_t * tensor) { + return get_qnn_tensor_memtype(*tensor); +} + + +static inline Qnn_ClientBuffer_t get_qnn_tensor_clientbuf(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.clientBuf; + } + return QNN_CLIENT_BUFFER_INIT; +} + + +static inline Qnn_ClientBuffer_t get_qnn_tensor_clientbuf(const Qnn_Tensor_t * tensor) { + return get_qnn_tensor_clientbuf(*tensor); +} + + +static inline Qnn_MemHandle_t get_qnn_tensor_memhandle(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.memHandle; + } + return nullptr; +} + + +static inline Qnn_MemHandle_t get_qnn_tensor_memhandle(const Qnn_Tensor_t * tensor) { + return get_qnn_tensor_memhandle(*tensor); +} + + +static inline void set_qnn_tensor_id(Qnn_Tensor_t & tensor, uint32_t id) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.id = id; + } +} + + +static inline void set_qnn_tensor_id(Qnn_Tensor_t * tensor, uint32_t id) { set_qnn_tensor_id(*tensor, id); } + + +static inline void set_qnn_tensor_name(Qnn_Tensor_t & tensor, const char * name) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.name = name; + } +} + + +static inline void set_qnn_tensor_name(Qnn_Tensor_t * tensor, const char * name) { + set_qnn_tensor_name(*tensor, name); +} + + +static inline void set_qnn_tensor_type(Qnn_Tensor_t & tensor, Qnn_TensorType_t type) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.type = type; + } +} + + +static inline void 
set_qnn_tensor_type(Qnn_Tensor_t * tensor, Qnn_TensorType_t type) { + set_qnn_tensor_type(*tensor, type); +} + + +static inline void set_qnn_tensor_dataformat(Qnn_Tensor_t & tensor, Qnn_TensorDataFormat_t format) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.dataFormat = format; + } +} + + +static inline void set_qnn_tensor_dataformat(Qnn_Tensor_t * tensor, Qnn_TensorDataFormat_t format) { + set_qnn_tensor_dataformat(*tensor, format); +} + + +static inline void set_qnn_tensor_datatype(Qnn_Tensor_t & tensor, Qnn_DataType_t dataType) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.dataType = dataType; + } +} + + +static inline void set_qnn_tensor_datatype(Qnn_Tensor_t * tensor, Qnn_DataType_t dataType) { + set_qnn_tensor_datatype(*tensor, dataType); +} + + +static inline void set_qnn_tensor_quantparams(Qnn_Tensor_t & tensor, Qnn_QuantizeParams_t params) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.quantizeParams = params; + } +} + + +static inline void set_qnn_tensor_quantparams(Qnn_Tensor_t * tensor, Qnn_QuantizeParams_t params) { + set_qnn_tensor_quantparams(*tensor, params); +} + + +static inline void set_qnn_tensor_rank(Qnn_Tensor_t & tensor, uint32_t rank) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.rank = rank; + } +} + + +static inline void set_qnn_tensor_rank(Qnn_Tensor_t * tensor, uint32_t rank) { + set_qnn_tensor_rank(*tensor, rank); +} + + +static inline void set_qnn_tensor_dimensions(Qnn_Tensor_t & tensor, uint32_t * dims) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.dimensions = dims; + } +} + + +static inline void set_qnn_tensor_dimensions(Qnn_Tensor_t * tensor, uint32_t * dims) { + set_qnn_tensor_dimensions(*tensor, dims); +} + + +static inline void set_qnn_tensor_memtype(Qnn_Tensor_t & tensor, Qnn_TensorMemType_t memType) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.memType = memType; + } +} + + +static inline void set_qnn_tensor_memtype(Qnn_Tensor_t * tensor, Qnn_TensorMemType_t memType) { + set_qnn_tensor_memtype(*tensor, memType); +} + + +static inline void set_qnn_tensor_clientbuf(Qnn_Tensor_t & tensor, Qnn_ClientBuffer_t clientBuf) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.clientBuf = clientBuf; + } +} + + +static inline void set_qnn_tensor_clientbuf(Qnn_Tensor_t * tensor, Qnn_ClientBuffer_t clientBuf) { + set_qnn_tensor_clientbuf(*tensor, clientBuf); +} + + +static inline void set_qnn_tensor_memhandle(Qnn_Tensor_t & tensor, Qnn_MemHandle_t handle) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.memHandle = handle; + } +} + + +static inline void set_qnn_tensor_memhandle(Qnn_Tensor_t * tensor, Qnn_MemHandle_t handle) { + set_qnn_tensor_memhandle(*tensor, handle); +} + + + +static size_t memscpy(void * dst, size_t dstSize, const void * src, size_t copySize) { + if (!dst || !src || !dstSize || !copySize) + return 0; + + size_t minSize = dstSize < copySize ? 
dstSize : copySize; + + memcpy(dst, src, minSize); + + return minSize; +} + + +static char * ggml_qnn_strndup(const char * source, size_t maxlen) { + return ::strndup(source, maxlen); +} + + +static int deep_copy_qnn_tensors(Qnn_Tensor_t & src, Qnn_Tensor_t & dst) { + int err = 0; + VALIDATE_TENSOR_VERSION(src, err); + + dst.version = src.version; + QNN_TENSOR_SET_NAME( + dst, ggml_qnn_strndup(QNN_TENSOR_GET_NAME(src), std::string(QNN_TENSOR_GET_NAME(src)).size())); + if (QNN_TENSOR_GET_NAME(dst) == nullptr) { + return 1; + } + QNN_TENSOR_SET_ID(dst, QNN_TENSOR_GET_ID(src)); + QNN_TENSOR_SET_TYPE(dst, QNN_TENSOR_GET_TYPE(src)); + QNN_TENSOR_SET_DATA_FORMAT(dst, QNN_TENSOR_GET_DATA_FORMAT(src)); + QNN_TENSOR_SET_DATA_TYPE(dst, QNN_TENSOR_GET_DATA_TYPE(src)); + QNN_TENSOR_SET_MEM_TYPE(dst, QNN_TENSOR_GET_MEM_TYPE(src)); + + // Only metadata (i.e. non-static data) is copied from source to destination. The union still + // must be initialized so that the clientBuf/memHandle do not contain garbage data + if (QNN_TENSOR_GET_MEM_TYPE(src) == QNN_TENSORMEMTYPE_RAW) { + Qnn_ClientBuffer_t clientBuf = {nullptr, 0}; + QNN_TENSOR_SET_CLIENT_BUF(dst, clientBuf); + } else if (QNN_TENSOR_GET_MEM_TYPE(src) == QNN_TENSORMEMTYPE_MEMHANDLE) { + QNN_TENSOR_SET_MEM_HANDLE(dst, nullptr); + } else { + return 1; + } + + Qnn_QuantizeParams_t srcQParam = QNN_TENSOR_GET_QUANT_PARAMS(src); + Qnn_QuantizationEncoding_t encoding = srcQParam.quantizationEncoding; + if (encoding == QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET) { + // need to allocate and copy memory for scaleOffset as it is a pointer array + Qnn_QuantizeParams_t srcQParamCpy = srcQParam; + Qnn_AxisScaleOffset_t &axisScaleOffset = srcQParamCpy.axisScaleOffsetEncoding; + Qnn_ScaleOffset_t **scaleOffset = &axisScaleOffset.scaleOffset; + size_t scaleOffsetSize = axisScaleOffset.numScaleOffsets * sizeof(Qnn_ScaleOffset_t); + *scaleOffset = (Qnn_ScaleOffset_t *)malloc(scaleOffsetSize); + memscpy(*scaleOffset, + scaleOffsetSize, + srcQParam.axisScaleOffsetEncoding.scaleOffset, + scaleOffsetSize); + QNN_TENSOR_SET_QUANT_PARAMS(dst, srcQParamCpy); + } else if (encoding == QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET) { + // need to allocate and copy memory for scaleOffset as it is a pointer array + Qnn_QuantizeParams_t srcQParamCpy = srcQParam; + Qnn_BwAxisScaleOffset_t &bwAxisScaleOffset = srcQParamCpy.bwAxisScaleOffsetEncoding; + size_t scaleSize = bwAxisScaleOffset.numElements * sizeof(float); + float **scales = &bwAxisScaleOffset.scales; + int32_t **offsets = &bwAxisScaleOffset.offsets; + *scales = (float *)malloc(scaleSize); + memscpy(*scales, scaleSize, srcQParam.bwAxisScaleOffsetEncoding.scales, scaleSize); + + // Only copy offsets if present, nullptr implies all offsets are 0 + if (bwAxisScaleOffset.offsets != nullptr) { + size_t offsetSize = bwAxisScaleOffset.numElements * sizeof(int32_t); + *offsets = (int32_t *)malloc(offsetSize); + memscpy(*offsets, offsetSize, srcQParam.bwAxisScaleOffsetEncoding.offsets, offsetSize); + } + QNN_TENSOR_SET_QUANT_PARAMS(dst, srcQParamCpy); + } else { + QNN_TENSOR_SET_QUANT_PARAMS(dst, srcQParam); + } + + // need to allocate and copy memory for all the pointer members + uint32_t rank = QNN_TENSOR_GET_RANK(src); + QNN_TENSOR_SET_RANK(dst, rank); + size_t dim_size = rank * sizeof(uint32_t); + uint32_t * dimensions = (uint32_t *)malloc(dim_size); + if (dimensions == nullptr) { + QNN_LOG_WARN("deep_copy_qnn_tensors() allocation error while copying tensor %s\n", QNN_TENSOR_GET_NAME(src)); + return 1; + } + 
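// the dimensions array is the last deep-copied field: duplicate it and hand ownership to the destination tensor
+    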
memscpy(dimensions, dim_size, QNN_TENSOR_GET_DIMENSIONS(src), dim_size); + QNN_TENSOR_SET_DIMENSIONS(dst, dimensions); + + return err; +} + + +static int free_qnn_tensor(Qnn_Tensor_t & tensor) { + int err = 0; + VALIDATE_TENSOR_VERSION(tensor, err); + + if (nullptr == QNN_TENSOR_GET_NAME(tensor)) { + QNN_LOG_INFO("it should not happen, pls check"); + } else { + //QNN_LOG_DEBUG("QNN tensor name %s", QNN_TENSOR_GET_NAME(tensor)); + free((void *) QNN_TENSOR_GET_NAME(tensor)); + } + if (nullptr == QNN_TENSOR_GET_DIMENSIONS(tensor)) { + QNN_LOG_INFO("it should not happen, pls check"); + } else { + //TODO:why crash in here? why pointer changed with mul_mat? + //memory leak after comment above line + //free(QNN_TENSOR_GET_DIMENSIONS(tensor)); + } + + return err; +} + + +static int free_qnn_tensors(Qnn_Tensor_t *& tensors, uint32_t numTensors) { + int err = 0; + + // free all pointer allocations in struct + for (size_t i = 0; i < numTensors; i++) { + free_qnn_tensor(tensors[i]); + } + free(tensors); + + return err; +} + + +static float ggml_tensor_sum_elements(const ggml_tensor * tensor) { + double sum = 0; + float value = 0; + std::ostringstream tmposs; + if (tensor->type == GGML_TYPE_F32) { + for (int h = 0; h < tensor->ne[3]; h++) { + for (int i = 0; i < tensor->ne[2]; i++) { + for (int j = 0; j < tensor->ne[1]; j++) { + for (int k = 0; k < tensor->ne[0]; k++) { + value = ((float *) tensor->data)[h * tensor->ne[2] + i * tensor->ne[1] + j * tensor->ne[0] + k]; + sum += value; + //QNN_LOG_DEBUG("[%d][%d][%d][%d]%.2f \t", h, i, j, k, value); + tmposs << std::setw(8) << std::fixed << std::setprecision(2) << value << "\t"; + } + if (strlen(tmposs.str().c_str()) > 4000) { + + } else { + QNN_LOG_DEBUG("%s", tmposs.str().c_str()); + } + tmposs.clear(); + tmposs.str(""); + QNN_LOG_DEBUG("\n"); + } + } + } + } + QNN_LOG_DEBUG("\n"); + return sum; +} + + +static void ggml_dump_tensor(const ggml_tensor * tensor, const char * name) { + QNN_LOG_DEBUG("dump ggml tensor %s\n", name); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", name, + tensor->type, ggml_type_name(tensor->type), + tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->nb[0], tensor->nb[1], tensor->nb[2]); + float sum = ggml_tensor_sum_elements(tensor); + + //QNN_LOG_DEBUG("\n"); + //QNN_LOG_DEBUG("Sum of tensor %s is %6.2f\n", name, sum); +} + + +static uint32_t ggml_get_tensor_rank(const ggml_tensor * tensor) { + uint32_t rank = 0; + for (int i = 0; i < GGML_MAX_DIMS; i++) { + if ((0 != tensor->ne[i]) && (1 != tensor->ne[i])) { + rank++; + } + } + return rank; +} + + +//TODO: +//ref:explanation of k-quants, https://github.com/ggerganov/llama.cpp/pull/1684 +static Qnn_DataType_t qnn_datatype_from_ggml_datatype(enum ggml_type ggmltype) { + switch (ggmltype) { + case GGML_TYPE_Q4_0: + return QNN_DATATYPE_UFIXED_POINT_4; + case GGML_TYPE_Q4_1: + return QNN_DATATYPE_SFIXED_POINT_4; + case GGML_TYPE_Q8_0: + return QNN_DATATYPE_UFIXED_POINT_8; + case GGML_TYPE_Q8_1: + return QNN_DATATYPE_SFIXED_POINT_8; + case GGML_TYPE_F16: + return QNN_DATATYPE_FLOAT_16; + case GGML_TYPE_F32: + return QNN_DATATYPE_FLOAT_32; + + } + return QNN_DATATYPE_FLOAT_32; +} + + +//TODO: +static const char * qnn_opname_from_ggmlop(enum ggml_op ggmlop) { + switch (ggmlop) { + case GGML_OP_ADD: + return QNN_OP_ELEMENT_WISE_ADD; + case GGML_OP_MUL: + return QNN_OP_ELEMENT_WISE_MULTIPLY; + case GGML_OP_MUL_MAT: + return QNN_OP_MAT_MUL; + } + + return nullptr; +} + + +static uint32_t 
ggml_get_tensor_data_size(const ggml_tensor * tensor) { + /* + size_t data_size = ggml_row_size(tensor->type, tensor->ne[0]); + size_t n_dims = ggml_get_tensor_rank(tensor); + for (int i = 1; i < n_dims; i++) { + data_size *= tensor->ne[i]; + } + + return data_size; + */ + return ggml_nbytes(tensor); +} + + +template +Fn load_qnn_functionpointers(void * handle, const char * function_name) { + return reinterpret_cast(dlsym(handle, function_name)); +} + + +static void qnn_xfree(void * ptr) { + if (nullptr != ptr) { + free(ptr); + ptr = nullptr; + } +} + + +static void * qnn_xmalloc(size_t size) { + void * ptr; + + if (!size) + size++; + + if ((ptr = calloc(1, size)) == nullptr) { + QNN_LOG_WARN("malloc(%d) failed: %s\n",size, strerror(errno)); + return nullptr; + } + + return ptr; +} + + +static void * qnn_xmalloc_aligned(size_t alignment, size_t size, void ** base) { + char * ptr; + + *base = ptr = static_cast(qnn_xmalloc(size + alignment)); + + while ((size_t) ptr % alignment) + ptr++; + + return ptr; +} + + +static void buffer_pool_free (buf_element_t * element) { + qnn_buf_t * self = (qnn_buf_t *) element->source; + + pthread_mutex_lock(&self->buffer_pool_mutex); + + element->next = self->buffer_pool_top; + self->buffer_pool_top = element; + + self->buffer_pool_num_free++; + if (self->buffer_pool_num_free > self->buffer_pool_capacity) { + QNN_LOG_DEBUG("TOO MANY FREE\n"); + } + + pthread_cond_signal (&self->buffer_pool_cond_not_empty); + + pthread_mutex_unlock (&self->buffer_pool_mutex); +} + + +static buf_element_t * buffer_pool_alloc (qnn_buf_t * self) { + buf_element_t * buf = nullptr; + int i; + + pthread_mutex_lock (&self->buffer_pool_mutex); + + while (self->buffer_pool_num_free < 2) { + pthread_cond_wait (&self->buffer_pool_cond_not_empty, &self->buffer_pool_mutex); + } + + buf = self->buffer_pool_top; + self->buffer_pool_top = self->buffer_pool_top->next; + self->buffer_pool_num_free--; + + buf->content = buf->mem; + buf->size = 0; + buf->type = 0; + + pthread_mutex_unlock (&self->buffer_pool_mutex); + + return buf; +} + + +static buf_element_t * buffer_pool_try_alloc (qnn_buf_t * self) { + buf_element_t * buf = nullptr; + + pthread_mutex_lock (&self->buffer_pool_mutex); + + if (self->buffer_pool_top) { + buf = self->buffer_pool_top; + self->buffer_pool_top = self->buffer_pool_top->next; + self->buffer_pool_num_free--; + } else { + buf = nullptr; + } + + pthread_mutex_unlock (&self->buffer_pool_mutex); + + if (buf) { + buf->content = buf->mem; + buf->size = 0; + } + + return buf; +} + + +static void qnn_buf_buffer_put(qnn_buf_t * fifo, buf_element_t * element) { + pthread_mutex_lock (&fifo->mutex); + + if (fifo->last) + fifo->last->next = element; + else + fifo->first = element; + + fifo->last = element; + element->next = nullptr; + fifo->qnn_buf_size++; + fifo->qnn_buf_data_size += element->size; + + LOGJ("put:index %d, fifo->size is %d, self->buffer_pool_num_free %d\n", element->id, fifo->qnn_buf_size, fifo->buffer_pool_num_free); + pthread_cond_signal (&fifo->not_empty); + + pthread_mutex_unlock (&fifo->mutex); +} + + +static buf_element_t * qnn_buf_buffer_get (qnn_buf_t * fifo) { + buf_element_t * buf = nullptr; + + pthread_mutex_lock (&fifo->mutex); +#if 0 + while (fifo->first == nullptr) { + pthread_cond_wait (&fifo->not_empty, &fifo->mutex); + } +#else + if (fifo->first == nullptr) { + pthread_mutex_unlock (&fifo->mutex); + return nullptr; + } +#endif + + buf = fifo->first; + + fifo->first = fifo->first->next; + if (fifo->first==nullptr) + fifo->last = nullptr; + + 
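// the element has been unlinked from the FIFO; update the counters before releasing the lock
+    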
fifo->qnn_buf_size--; + fifo->qnn_buf_data_size -= buf->size; + + pthread_mutex_unlock (&fifo->mutex); + + return buf; +} + + +static void qnn_buf_buffer_clear (qnn_buf_t * fifo) { + buf_element_t * buf, * next, * prev; + + pthread_mutex_lock (&fifo->mutex); + + buf = fifo->first; + prev = nullptr; + + while (buf != nullptr) { + next = buf->next; + if ((buf->type & BUF_MAJOR_MASK) != BUF_CONTROL_BASE) { + if (prev) + prev->next = next; + else + fifo->first = next; + + if (!next) + fifo->last = prev; + + fifo->qnn_buf_size--; + fifo->qnn_buf_data_size -= buf->size; + + buf->free_buffer(buf); + } else { + prev = buf; + } + + buf = next; + } + + QNN_LOG_DEBUG("free buffers after clear: %d\n", fifo->buffer_pool_num_free); + pthread_mutex_unlock (&fifo->mutex); +} + + +static int qnn_buf_buffer_size (qnn_buf_t * self) { + int size = 0; + + pthread_mutex_lock(&self->mutex); + size = self->qnn_buf_size; + pthread_mutex_unlock(&self->mutex); + + return size; +} + + +static uint32_t qnn_buf_buffer_data_size (qnn_buf_t * self) { + uint32_t data_size; + + pthread_mutex_lock(&self->mutex); + data_size = self->qnn_buf_data_size; + pthread_mutex_unlock(&self->mutex); + + return data_size; +} + + +static int qnn_buf_buffer_num_free (qnn_buf_t * self) { + int buffer_pool_num_free = 0; + + pthread_mutex_lock(&self->mutex); + buffer_pool_num_free = self->buffer_pool_num_free; + pthread_mutex_unlock(&self->mutex); + + return buffer_pool_num_free; +} + + +static void qnn_buf_buffer_dispose (qnn_buf_t * self) { + buf_element_t * buf, * next; + int received = 0; + + self->clear( self ); + buf = self->buffer_pool_top; + + while (buf != nullptr) { + next = buf->next; + qnn_xfree(buf); + received++; + + buf = next; + } + + while (received < self->buffer_pool_capacity) { + buf = self->get(self); + qnn_xfree(buf); + received++; + } + + qnn_xfree(self->buffer_pool_base); + pthread_mutex_destroy(&self->mutex); + pthread_cond_destroy(&self->not_empty); + pthread_mutex_destroy(&self->buffer_pool_mutex); + pthread_cond_destroy(&self->buffer_pool_cond_not_empty); + qnn_xfree((void *)self->name); + qnn_xfree (self); +} + + +static qnn_buf_t * qnn_buf_new(const char * name, int num_buffers, uint32_t buf_size) { + int i = 0; + int alignment = 4; + qnn_buf_t * self = nullptr; + uint8_t * multi_buffer = nullptr; + + self = (qnn_buf_t*)qnn_xmalloc(sizeof(qnn_buf_t)); + if (nullptr == self) { + QNN_LOG_WARN("malloc memory failed\n"); + return nullptr; + } + + self->name = strdup(name); + self->first = nullptr; + self->last = nullptr; + self->qnn_buf_size = 0; + self->put = qnn_buf_buffer_put; + self->get = qnn_buf_buffer_get; + self->clear = qnn_buf_buffer_clear; + self->size = qnn_buf_buffer_size; + self->num_free = qnn_buf_buffer_num_free; + self->data_size = qnn_buf_buffer_data_size; + self->destroy = qnn_buf_buffer_dispose; + pthread_mutex_init (&self->mutex, nullptr); + pthread_cond_init (&self->not_empty, nullptr); + + + if (buf_size % alignment != 0) + buf_size += alignment - (buf_size % alignment); + + QNN_LOG_INFO("[%s]allocating %d Mbytes memory(alignment = %d)\n", name, (num_buffers * buf_size) / (1 << 20), alignment); + + multi_buffer = (uint8_t *)qnn_xmalloc_aligned (alignment, num_buffers * buf_size, &self->buffer_pool_base); + if (nullptr == multi_buffer) { + QNN_LOG_WARN("malloc memory failed\n"); + free(self); + return nullptr; + } + + self->buffer_pool_top = nullptr; + + pthread_mutex_init (&self->buffer_pool_mutex, nullptr); + pthread_cond_init (&self->buffer_pool_cond_not_empty, nullptr); + + 
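// the free list starts out empty; buffer_pool_free() below pushes every pre-allocated element onto it
+    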
self->buffer_pool_num_free  = 0;
+    self->buffer_pool_capacity  = num_buffers;
+    self->buffer_pool_buf_size  = buf_size;
+    self->buffer_alloc          = buffer_pool_alloc;
+    self->buffer_try_alloc      = buffer_pool_try_alloc;
+
+    for (i = 0; i < num_buffers; i++) {
+        buf_element_t * buf = nullptr;
+
+        buf = (buf_element_t *)qnn_xmalloc(sizeof (buf_element_t));
+        if (nullptr == buf) {
+            QNN_LOG_WARN("malloc memory failed");
+            free(multi_buffer);
+            free(self);
+            return nullptr;
+        }
+
+        buf->id          = i;
+        buf->mem         = multi_buffer;
+        multi_buffer    += buf_size;
+
+        buf->max_size    = buf_size;
+        buf->free_buffer = buffer_pool_free;
+        buf->source      = self;
+
+        buffer_pool_free(buf);
+    }
+
+    return self;
+}
+
+
+static const char * get_qnn_backend_name(int n_backend_type) {
+    switch (n_backend_type) {
+        case 0:
+            return "QNN-CPU";
+        case 1:
+            return "QNN-GPU";
+        case 2:
+            return "QNN-HTP(DSP)";
+        case 3:
+            return "ggml"; // the default GGML backend, used to compare performance between the QNN backends and the default GGML backend
+
+#if 0 // the QNN cDSP and HTA backends are not used currently, focus on the QNN CPU/GPU/HTP (aka DSP) backends
+        case 3:
+            return "QNN-cDSP";
+        case 4:
+            return "QNN-HTA";
+#endif
+
+        default:
+            return "unknown";
+    }
+}
+
+
+static intptr_t align_to(size_t alignment, intptr_t offset) {
+    return offset % alignment == 0 ? offset
+                                   : offset +
+                                     (static_cast<intptr_t>(alignment) -
+                                      offset % static_cast<intptr_t>(alignment));
+}
+
+
+static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...) {
+    static std::mutex ggml_qnn_log_internal_mutex;
+    static char s_ggml_qnn_log_internal_buf[GGML_QNN_LOGBUF_LEN];
+
+    {
+        std::lock_guard<std::mutex> lock(ggml_qnn_log_internal_mutex);
+        va_list args;
+        va_start(args, format);
+        int len_prefix = snprintf(s_ggml_qnn_log_internal_buf, GGML_QNN_LOGBUF_LEN, "[%s, %d]: ", func, line);
+        int len = vsnprintf(s_ggml_qnn_log_internal_buf + len_prefix, GGML_QNN_LOGBUF_LEN - len_prefix, format, args);
+        if (len < (GGML_QNN_LOGBUF_LEN - len_prefix)) {
+#if (defined __ANDROID__) || (defined ANDROID)
+            __android_log_print(level, "llamacpp", "%s", s_ggml_qnn_log_internal_buf);
+#else
+            printf("%s", s_ggml_qnn_log_internal_buf); // Qualcomm's QNN can also run on Windows on ARM
+#endif
+        }
+        va_end(args);
+    }
+}
+
+
+// =================================================================================================
+//
+// wrapper class of Qualcomm QNN (Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK
+//
+// =================================================================================================
+class qnn_interface {
+
+#define DEFINE_SHIM_FUNCTION_INTERFACE(F, pointer_name)                        \
+    template <typename... Args>                                                \
+    inline auto qnn_##F(Args... args) const {                                  \
+        return (_qnn_interface->QNN_INTERFACE_VER_NAME.pointer_name)(          \
+            std::forward<Args>(args)...);                                      \
+    }
+
+
+#define DEFINE_SHIM_FUNCTION_SYS_INTERFACE(F, pointer_name)                    \
+    template <typename... Args>                                                \
+    inline auto qnn_##F(Args...
args) const { \ + return (_qnn_sys_interface->QNN_SYSTEM_INTERFACE_VER_NAME.pointer_name)( \ + std::forward(args)...); \ + } + + friend class qnn_instance; + +public: + qnn_interface() = default; + + // QnnBackend + DEFINE_SHIM_FUNCTION_INTERFACE(backend_create, backendCreate); + + DEFINE_SHIM_FUNCTION_INTERFACE(backend_free, backendFree); + + DEFINE_SHIM_FUNCTION_INTERFACE(backend_register_op_package, backendRegisterOpPackage); + + DEFINE_SHIM_FUNCTION_INTERFACE(backend_validate_op_config, backendValidateOpConfig); + + DEFINE_SHIM_FUNCTION_INTERFACE(backend_get_api_version, backendGetApiVersion); + + // QnnDevice + DEFINE_SHIM_FUNCTION_INTERFACE(device_create, deviceCreate); + + DEFINE_SHIM_FUNCTION_INTERFACE(device_free, deviceFree); + + DEFINE_SHIM_FUNCTION_INTERFACE(device_get_infrastructure, deviceGetInfrastructure); + + DEFINE_SHIM_FUNCTION_INTERFACE(device_get_platform_info, deviceGetPlatformInfo); + + DEFINE_SHIM_FUNCTION_INTERFACE(device_get_info, deviceGetInfo); + + // QnnContext + DEFINE_SHIM_FUNCTION_INTERFACE(context_create, contextCreate); + + DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary_size, contextGetBinarySize); + + DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary, contextGetBinary); + + DEFINE_SHIM_FUNCTION_INTERFACE(context_create_from_binary, contextCreateFromBinary); + + DEFINE_SHIM_FUNCTION_INTERFACE(context_free, contextFree); + + // QnnGraph + DEFINE_SHIM_FUNCTION_INTERFACE(graph_create, graphCreate); + + DEFINE_SHIM_FUNCTION_INTERFACE(graph_add_node, graphAddNode); + + DEFINE_SHIM_FUNCTION_INTERFACE(graph_finalize, graphFinalize); + + DEFINE_SHIM_FUNCTION_INTERFACE(graph_execute, graphExecute); + + DEFINE_SHIM_FUNCTION_INTERFACE(graph_retrieve, graphRetrieve); + + // QnnLog + DEFINE_SHIM_FUNCTION_INTERFACE(log_create, logCreate); + + DEFINE_SHIM_FUNCTION_INTERFACE(log_free, logFree); + + DEFINE_SHIM_FUNCTION_INTERFACE(log_set_log_level, logSetLogLevel); + + // QnnProfile + DEFINE_SHIM_FUNCTION_INTERFACE(profile_create, profileCreate); + + DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_events, profileGetEvents); + + DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_sub_events, profileGetSubEvents); + + DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_event_data, profileGetEventData); + + DEFINE_SHIM_FUNCTION_INTERFACE(profile_free, profileFree); + + // QnnMem + DEFINE_SHIM_FUNCTION_INTERFACE(mem_register, memRegister); + + DEFINE_SHIM_FUNCTION_INTERFACE(mem_de_register, memDeRegister); + + // QnnProperty + DEFINE_SHIM_FUNCTION_INTERFACE(property_has_capability, propertyHasCapability); + + // QnnTensor + DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_context_tensor, tensorCreateContextTensor); + + DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_graph_tensor, tensorCreateGraphTensor); + + // QnnSystem + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_create, systemContextCreate); + + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_get_binary_info, systemContextGetBinaryInfo); + + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_free, systemContextFree); + + void set_qnn_interface(const QnnInterface_t * qnn_interface) { + _qnn_interface = qnn_interface; + } + + void set_qnn_system_interface(const QnnSystemInterface_t * qnn_sys_interface) { + _qnn_sys_interface = qnn_sys_interface; + } + + uint32_t get_backend_id() const { + return _qnn_interface->backendId; + } + + bool is_loaded() const { + return ((_qnn_sys_interface != nullptr) && (_qnn_interface != nullptr)); + } + +private: + const QnnInterface_t *_qnn_interface = nullptr; + + const QnnSystemInterface_t *_qnn_sys_interface 
= nullptr; +}; + + + +// ================================================================================================= +// +// wrapper class of Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK +// +// and +// +// resource management of QNN resources for GGML's QNN backend +// ================================================================================================= +class qnn_instance { +public: + using BackendIdType = decltype(QnnInterface_t{}.backendId); + + explicit qnn_instance(const std::string & lib_path, const std::string & backend_name, + const std::string & model_name) : + _lib_path(std::move(lib_path)), + _backend_name(std::move(backend_name)), + _model_name(std::move(model_name)) {}; + + ~qnn_instance() { + } + + int qnn_init(const QnnSaver_Config_t ** saver_config); + + int qnn_finalize(); + + const qnn_interface &get_qnn_interface() { + if (!_qnn_interface.is_loaded()) { + QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + } + return _qnn_interface; + } + + + const QNN_INTERFACE_VER_TYPE &get_qnn_raw_interface() { + if (!_qnn_interface.is_loaded()) { + QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + } + return _qnn_raw_interface; + } + + const QNN_SYSTEM_INTERFACE_VER_TYPE &get_qnn_raw_system_interface() { + if (!_qnn_interface.is_loaded()) { + QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + } + return _qnn_raw_system_interface; + } + + const Qnn_LogHandle_t get_qnn_log_handle() { return _qnn_log_handle; } + + const Qnn_ProfileHandle_t get_qnn_profile_handle() { return _qnn_profile_handle; } + + const Qnn_DeviceHandle_t get_qnn_device_handle() { return _qnn_device_handle; } + + const Qnn_BackendHandle_t get_qnn_backend_handle() { return _qnn_backend_handle; } + + const Qnn_ContextHandle_t get_qnn_context_handle() { return _qnn_context_handle; } + + const QnnSystemContext_Handle_t get_qnn_system_handle() { return _qnn_system_handle; } + + const Qnn_GraphHandle_t get_qnn_graph_handle() { return _qnn_graph_handle; } + + + int init_qnn_graph(const char * graph_name, + bool debug, + uint8_t do_node_validation = 1, + const QnnGraph_Config_t ** graph_configs = nullptr + ); + + int finalize_qnn_graph(); + + int init_htp_perfinfra() { + QnnDevice_Infrastructure_t device_infra = nullptr; + int error = _qnn_raw_interface.deviceGetInfrastructure(&device_infra); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to get qnn device infra\n"); + return 1; + } + + QnnHtpDevice_Infrastructure_t *htp_infra = static_cast(device_infra); + QnnHtpDevice_PerfInfrastructure_t *htp_perfinfra = &htp_infra->perfInfra; + uint32_t power_configid = 1; + uint32_t device_id = 0; + uint32_t core_id = 0; + htp_perfinfra->createPowerConfigId(device_id, core_id, &power_configid); + _qnn_htp_perfinfra = htp_perfinfra; + _qnn_power_configid = power_configid; + + return 0; + } + + + int set_rpc_polling() { + if (_qnn_rpc_pollingtime > 0) { + QnnHtpPerfInfrastructure_PowerConfig_t rpc_pollingTime; + memset(&rpc_pollingTime, 0, sizeof(rpc_pollingTime)); + rpc_pollingTime.option = + QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME; + rpc_pollingTime.rpcPollingTimeConfig = _qnn_rpc_pollingtime; + const QnnHtpPerfInfrastructure_PowerConfig_t *powerConfigs[] = {&rpc_pollingTime, nullptr}; + if (_qnn_htp_perfinfra) { + _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, powerConfigs); + } + } + return 0; + } + + + int set_high_performance_mode() { + if (nullptr == _qnn_htp_perfinfra) { + QNN_LOG_DEBUG("perf intra is 
null\n"); + return 1; + } + + QnnHtpPerfInfrastructure_PowerConfig_t powerConfig; + memset(&powerConfig, 0, sizeof(powerConfig)); + powerConfig.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_V3; + powerConfig.dcvsV3Config.dcvsEnable = 0; + powerConfig.dcvsV3Config.setDcvsEnable = 1; + powerConfig.dcvsV3Config.contextId = _qnn_power_configid; + powerConfig.dcvsV3Config.powerMode = QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_PERFORMANCE_MODE; + powerConfig.dcvsV3Config.setSleepLatency = 1; // True to consider Latency parameter otherwise False + powerConfig.dcvsV3Config.setBusParams = 1; // True to consider Bus parameter otherwise False + powerConfig.dcvsV3Config.setCoreParams = 1; // True to consider Core parameter otherwise False + powerConfig.dcvsV3Config.sleepDisable = 0; // True to consider sleep/LPM modes, False to enable + powerConfig.dcvsV3Config.setSleepDisable = 0; // True to consider sleep disable/enable parameter otherwise False + // set Sleep latency parameter + uint32_t latencyValue = 40; + powerConfig.dcvsV3Config.sleepLatency = latencyValue; // range 40-2000 micro sec + // set Bus Clock Parameters (refer QnnHtpPerfInfrastructure_VoltageCorner_t enum) + powerConfig.dcvsV3Config.busVoltageCornerMin = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + powerConfig.dcvsV3Config.busVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + powerConfig.dcvsV3Config.busVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + // set Core Clock Parameters (refer QnnHtpPerfInfrastructure_VoltageCorner_t enum) + powerConfig.dcvsV3Config.coreVoltageCornerMin = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + powerConfig.dcvsV3Config.coreVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + powerConfig.dcvsV3Config.coreVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + // set power config with different performance parameters + const QnnHtpPerfInfrastructure_PowerConfig_t *powerConfigs[] = {&powerConfig, nullptr}; + + _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, powerConfigs); + + return 0; + } + + std::string &get_qnn_graph_name() { return _graph_name; } + + bool is_rpcmem_initialized() { + return _rpcmem_initialized; + } + + void set_rpcmem_initialized(bool initialized) { + _rpcmem_initialized = initialized; + } + + int32_t rpcmem_to_fd(void * buf); + + int register_rpcmem(void * p_data, Qnn_Tensor_t * p_tensor); + + void unregister_rpcmem(); + + void *alloc_rpcmem(size_t bytes, size_t alignment); + + void free_rpcmem(void * buf); + + bool is_rpcmem_allocated(void * buf); + + bool is_rpcmem_registered(Qnn_MemHandle_t handle) { + return _qnn_mem_set.count(handle) != 0U; + } + +public: + std::map> _qnn_graph_map; + +private: + int load_system(); + + int unload_system(); + + int load_backend(std::string &lib_path, const QnnSaver_Config_t ** saver_config); + + int unload_backend(); + + void set_qnn_raw_interface(QNN_INTERFACE_VER_TYPE & raw_interface) { + _qnn_raw_interface = raw_interface; + } + + void set_qnn_raw_system_interface(QNN_SYSTEM_INTERFACE_VER_TYPE &raw_interface) { + _qnn_raw_system_interface = raw_interface; + } + +private: + static constexpr const int _required_num_providers = 1; + +private: + std::string _lib_path; + std::string _backend_name; + std::string _model_name; // prebuilt QNN model name, not used in currently + BackendIdType _backend_id; + + bool _debug_tensor = false; // flag to indicate if requested graph is to be run in debug mode + bool _do_node_validations = true; // flag to indicate whether all add_node calls need to be 
validated + QnnLog_Level_t _qnn_log_level = QNN_LOG_LEVEL_DEBUG; + + ggml_qnn_profile_level _profile_level = ggml_qnn_profile_level::profile_detail; + + qnn_interface _qnn_interface; + + void *_system_lib_handle = nullptr; + void *_model_lib_handle = nullptr; + + Qnn_GraphHandle_t _qnn_graph_handle = nullptr; + + Qnn_LogHandle_t _qnn_log_handle = nullptr; + + Qnn_ProfileHandle_t _qnn_profile_handle = nullptr; + + Qnn_DeviceHandle_t _qnn_device_handle = nullptr; + + Qnn_BackendHandle_t _qnn_backend_handle = nullptr; + + Qnn_ContextHandle_t _qnn_context_handle = nullptr; + + QnnSystemContext_Handle_t _qnn_system_handle = nullptr; + + QnnHtpDevice_PerfInfrastructure_t *_qnn_htp_perfinfra = nullptr; + uint32_t _qnn_power_configid = 1; + uint32_t _qnn_rpc_pollingtime = 9999; // 0-10000 us for high performing + + QNN_INTERFACE_VER_TYPE _qnn_raw_interface; + QNN_SYSTEM_INTERFACE_VER_TYPE _qnn_raw_system_interface; + + std::unordered_set _qnn_mem_set; + + static std::mutex _init_mutex; + static std::unordered_map _loaded_lib_handle; + static std::unordered_map _lib_path_to_backend_id; + static std::unordered_map _loaded_backend; + + void *_rpc_lib_handle = nullptr; + std::atomic_bool _rpcmem_initialized{false}; + pfn_rpc_mem_alloc _pfn_rpc_mem_alloc; + pfn_rpc_mem_free _pfn_rpc_mem_free; + pfn_rpc_mem_to_fd _pfn_rpc_mem_to_fd; + pfn_rpc_mem_init _pfn_rpc_mem_init; + pfn_rpc_mem_deinit _pfn_rpc_mem_deinit; + std::unordered_map _rpcmem_store_map; + + + std::string _graph_name; +}; + + + +// ================================================================================================= +// +// implementation of wrapper class +// +// ================================================================================================= +std::mutex qnn_instance::_init_mutex; + +std::unordered_map qnn_instance::_loaded_lib_handle; + +std::unordered_map qnn_instance::_lib_path_to_backend_id; + +std::unordered_map qnn_instance::_loaded_backend; + + +void * qnn_instance::alloc_rpcmem(size_t bytes, size_t alignment) { + if (!_rpcmem_initialized) { + QNN_LOG_WARN("rpc memory not initialized\n"); + return nullptr; + } + + auto allocate_bytes = static_cast(bytes + alignment); + void *buf = _pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, allocate_bytes); + if (buf == nullptr) { + QNN_LOG_WARN("failed to allocate rpc memory\n"); + return nullptr; + } + + auto aligned_buf = reinterpret_cast(align_to(alignment, + reinterpret_cast(buf))); + bool status = _rpcmem_store_map.insert(std::pair(aligned_buf, buf)).second; + if (!status) { + QNN_LOG_WARN("failed to allocate rpc memory\n"); + _pfn_rpc_mem_free(buf); + } + + return aligned_buf; +} + + +void qnn_instance::free_rpcmem(void * buf) { + if (!_rpcmem_initialized) { + QNN_LOG_WARN("rpc memory not initialized\n"); + } else if (0 == _rpcmem_store_map.count(buf)) { + QNN_LOG_WARN("no allocated tensor\n"); + } else { + _pfn_rpc_mem_free(_rpcmem_store_map[buf]); + _rpcmem_store_map.erase(buf); + } +} + + +int32_t qnn_instance::rpcmem_to_fd(void *buf) { + int32_t mem_fd = -1; + if (!is_rpcmem_initialized()) { + QNN_LOG_WARN("rpc memory not initialized\n"); + } else { + mem_fd = _pfn_rpc_mem_to_fd(buf); + } + + return mem_fd; +} + + +int qnn_instance::register_rpcmem(void * p_data, Qnn_Tensor_t * p_tensor) { + if (nullptr == p_data || (nullptr == p_tensor)) { + QNN_LOG_WARN("invalid param\n"); + return 1; + } + + if (!is_rpcmem_initialized()) { + QNN_LOG_WARN("rpc memory not initialized\n"); + return 2; + } + + if (is_rpcmem_allocated(p_data)) { + 
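// a buffer already known to the rpcmem allocator is only reported as a warning; registration still proceeds (the early return below is disabled)
+        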
QNN_LOG_WARN("rpc memory already allocated\n"); + //return 3; + } + if (is_rpcmem_registered((QNN_VER_PTR(*p_tensor)->memHandle))) { + QNN_LOG_WARN("tensor %s has been registered shared memory\n", (QNN_VER_PTR(*p_tensor)->name)); + return 4; + } + + int32_t mem_fd = rpcmem_to_fd(p_data); + if (-1 == mem_fd) { + QNN_LOG_WARN("failed to get file descriptor\n"); + return 5; + } + QNN_LOG_DEBUG("mem_fd %d\n", mem_fd); + Qnn_MemDescriptor_t descriptor = { + {QNN_VER_PTR(*p_tensor)->rank, QNN_VER_PTR(*p_tensor)->dimensions, nullptr}, + QNN_VER_PTR(*p_tensor)->dataType, + QNN_MEM_TYPE_ION, + {{mem_fd}}}; + Qnn_MemHandle_t handle = nullptr; + int error = QNN_SUCCESS; + error = _qnn_interface.qnn_mem_register( + _qnn_context_handle, + &descriptor, + /*numDescriptors=*/1, + &handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to register shared memory, error %d, %s\n", QNN_GET_ERROR_CODE(error), + strerror(error)); + return 6; + } else { + QNN_LOG_INFO("tensor %s successfully register shared memory\n", (QNN_VER_PTR(*p_tensor)->name)); + } + QNN_VER_PTR(*p_tensor)->memHandle = handle; + _qnn_mem_set.insert(handle); + + return 0; +} + + +void qnn_instance::unregister_rpcmem() { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + + if (_qnn_mem_set.empty()) { + QNN_LOG_WARN("no rpcmem registered\n"); + } + + for (auto &mem_handle : _qnn_mem_set) { + error = _qnn_interface.qnn_mem_de_register(&mem_handle, 1); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to unregister shared memory, error %d\n", QNN_GET_ERROR_CODE(error)); + } + } + _qnn_mem_set.clear(); +} + + +bool qnn_instance::is_rpcmem_allocated(void * buf) { + return _rpcmem_store_map.count(buf) != 0U; +} + + +int qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t ** saver_config) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + QNN_LOG_DEBUG("lib_path:%s\n", lib_path.c_str()); + + void *lib_handle = dlopen(lib_path.c_str(), RTLD_NOW | RTLD_GLOBAL); + if (nullptr == lib_handle) { + QNN_LOG_WARN("can not open QNN library %s, with error: %s", lib_path.c_str(), dlerror()); + return 1; + } + + // load get_provider function + auto get_providers = load_qnn_functionpointers<_pfn_QnnInterface_getProviders *>(lib_handle, + "QnnInterface_getProviders"); + if (nullptr == get_providers) { + QNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s", dlerror()); + return 2; + } + + // get QnnInterface Providers + std::uint32_t num_providers = 0; + const QnnInterface_t **provider_list = nullptr; + error = get_providers(&provider_list, &num_providers); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to get providers, error %d", QNN_GET_ERROR_CODE(error)); + return 3; + } + QNN_LOG_DEBUG("num_providers=%d\n", num_providers); + if (num_providers != _required_num_providers) { + QNN_LOG_WARN("providers is %d instead of required %d", num_providers, _required_num_providers); + return 4; + } + + if (nullptr == provider_list) { + QNN_LOG_WARN("failed to get qnn interface providers\n"); + return 5; + } + bool found_valid_interface = false; + QNN_INTERFACE_VER_TYPE qnn_interface; + for (size_t idx = 0; idx < num_providers; idx++) { + if (QNN_API_VERSION_MAJOR == provider_list[idx]->apiVersion.coreApiVersion.major && + QNN_API_VERSION_MINOR <= provider_list[idx]->apiVersion.coreApiVersion.minor) { + found_valid_interface = true; + qnn_interface = provider_list[idx]->QNN_INTERFACE_VER_NAME; + break; + } + } + + if (!found_valid_interface) { + QNN_LOG_WARN("unable to find a valid qnn interface\n"); + return 6; + } else { + 
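+        // the loop above picks the first provider whose core API major version matches exactly
+        // and whose minor version is at least the one this file was compiled against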
QNN_LOG_INFO("find a valid qnn interface\n"); + } + set_qnn_raw_interface(qnn_interface); + + BackendIdType backend_id = provider_list[0]->backendId; + _lib_path_to_backend_id[lib_path] = backend_id; + if (_loaded_backend.count(backend_id) > 0) { + QNN_LOG_WARN("lib_path %s is loaded, but backend %d already exists\n", + lib_path.c_str(), backend_id); + } + _loaded_backend[backend_id] = provider_list[0]; + if (_loaded_lib_handle.count(backend_id) > 0) { + QNN_LOG_WARN("closing %p\n", _loaded_lib_handle[backend_id]); + int dlclose_error = dlclose(_loaded_lib_handle[backend_id]); + if (dlclose_error != 0) { + QNN_LOG_WARN("fail to close %p with error %s\n", _loaded_lib_handle[backend_id], dlerror()); + } + } + _loaded_lib_handle[backend_id] = lib_handle; + _backend_id = backend_id; + + auto saver_initialize = load_qnn_functionpointers<_pfn_QnnSaver_initialize *>( + _loaded_lib_handle[backend_id], "QnnSaver_initialize"); + if (nullptr != saver_initialize) { + error = saver_initialize(saver_config); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to saver_initialize,error %d", QNN_GET_ERROR_CODE(error)); + return 7; + } + } else { + QNN_LOG_WARN("saver_initialize is null\n"); + } + + return 0; +} + + +int qnn_instance::unload_backend() { + int dlclose_error = 0; + for (auto &it : _loaded_lib_handle) { + dlclose_error = dlclose(it.second); + if (dlclose_error != 0) { + QNN_LOG_WARN("failed to close QNN backend %d, error %s\n", it.first, dlerror()); + } + } + + _loaded_lib_handle.clear(); + _lib_path_to_backend_id.clear(); + _loaded_backend.clear(); + + return 0; +} + + +int qnn_instance::load_system() { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + + std::string system_lib_path = _lib_path + "libQnnSystem.so"; + QNN_LOG_DEBUG("system_lib_path:%s\n", system_lib_path.c_str()); + + _system_lib_handle = dlopen(system_lib_path.c_str(), RTLD_NOW | RTLD_LOCAL); + if (nullptr == _system_lib_handle) { + QNN_LOG_WARN("can not pen QNN library %s, error: %s\n", system_lib_path.c_str(), dlerror()); + return 1; + } + + auto *get_providers = reinterpret_cast<_pfn_QnnSystemInterface_getProviders *>(dlsym( + _system_lib_handle, "QnnSystemInterface_getProviders")); + if (nullptr == get_providers) { + QNN_LOG_WARN("can not load QNN symbol QnnSystemInterface_getProviders: %s\n", dlerror()); + return 2; + } + + uint32_t num_providers = 0; + const QnnSystemInterface_t ** provider_list = nullptr; + error = get_providers(&provider_list, &num_providers); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to get providers, error %d\n", QNN_GET_ERROR_CODE(error)); + return 3; + } + + if (num_providers != _required_num_providers) { + QNN_LOG_WARN("providers is %d instead of required %d\n", num_providers, _required_num_providers); + return 4; + } + + if (nullptr == provider_list) { + QNN_LOG_WARN("can not get providers\n"); + return 5; + } + + QNN_SYSTEM_INTERFACE_VER_TYPE qnn_system_interface; + bool found_valid_system_interface = false; + for (size_t idx = 0; idx < num_providers; idx++) { + if (QNN_SYSTEM_API_VERSION_MAJOR == + provider_list[idx]->systemApiVersion.major && + QNN_SYSTEM_API_VERSION_MINOR <= + provider_list[idx]->systemApiVersion.minor) { + found_valid_system_interface = true; + qnn_system_interface = provider_list[idx]->QNN_SYSTEM_INTERFACE_VER_NAME; + break; + } + } + if (!found_valid_system_interface) { + QNN_LOG_WARN("unable to find a valid qnn system interface\n"); + return 6; + } else { + QNN_LOG_INFO("find a valid qnn system interface\n"); + } + set_qnn_raw_system_interface(qnn_system_interface); 
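+    // next we bind the selected system provider and create the QnnSystemContext handle;
+    // a failure there is only logged, not treated as fatal.
+    //
+    // for reference, a rough sketch of how this class is driven from the backend code
+    // (assuming an already constructed qnn_instance named "instance"; not executed here):
+    //   instance->qnn_init(nullptr);                                    // load system + backend libs, log, device, context
+    //   instance->init_qnn_graph("ggml_qnn_graph", false, 1, nullptr);  // optional prebuilt graph
+    //   ... build and execute ops through the raw interface ...
+    //   instance->finalize_qnn_graph();
+    //   instance->qnn_finalize();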
+
+    _qnn_interface.set_qnn_system_interface(provider_list[0]);
+
+    _qnn_interface.qnn_system_context_create(&_qnn_system_handle);
+    if (nullptr == _qnn_system_handle) {
+        QNN_LOG_WARN("can not create QNN system context\n");
+    } else {
+        QNN_LOG_DEBUG("initialize qnn system successfully\n");
+    }
+
+    return 0;
+}
+
+
+int qnn_instance::unload_system() {
+    int result = 0;
+
+    if (nullptr == _system_lib_handle) {
+        QNN_LOG_DEBUG("system lib handle is null\n");
+        return 1;
+    }
+
+    if (nullptr != _qnn_system_handle) {
+        result = _qnn_interface.qnn_system_context_free(_qnn_system_handle);
+        if (result != QNN_SUCCESS) {
+            QNN_LOG_WARN("failed to free QNN system context\n");
+        }
+        _qnn_system_handle = nullptr;
+    }
+
+    int dlclose_error = dlclose(_system_lib_handle);
+    if (dlclose_error != 0) {
+        QNN_LOG_WARN("failed to close QnnSystem library, error %s\n", dlerror());
+        return 2;
+    }
+
+    _system_lib_handle = nullptr;
+
+    return 0;
+}
+
+
+static void ggml_qnn_logcallback(const char * fmt,
+                                 QnnLog_Level_t level,
+                                 uint64_t timestamp,
+                                 va_list argp) {
+
+    static std::mutex log_mutex;
+    static unsigned char s_ggml_qnn_logbuf[GGML_QNN_LOGBUF_LEN];
+
+    const char * levelStr = "";
+    switch (level) {
+        case QNN_LOG_LEVEL_ERROR:
+            levelStr = " ERROR ";
+            break;
+        case QNN_LOG_LEVEL_WARN:
+            levelStr = "WARNING";
+            break;
+        case QNN_LOG_LEVEL_INFO:
+            levelStr = " INFO ";
+            break;
+        case QNN_LOG_LEVEL_DEBUG:
+            levelStr = " DEBUG ";
+            break;
+        case QNN_LOG_LEVEL_VERBOSE:
+            levelStr = "VERBOSE";
+            break;
+        case QNN_LOG_LEVEL_MAX:
+            levelStr = "UNKNOWN";
+            break;
+    }
+
+    double ms = (double) timestamp / 1000000.0;
+
+    {
+        std::lock_guard<std::mutex> lock(log_mutex);
+
+        int len_content = 0;
+        memset(s_ggml_qnn_logbuf, 0, GGML_QNN_LOGBUF_LEN);
+        len_content = vsnprintf(reinterpret_cast<char *>(s_ggml_qnn_logbuf), GGML_QNN_LOGBUF_LEN, fmt, argp);
+        //QNN_LOG_DEBUG("%8.1fms [%-7s] %s ", ms, levelStr, s_ggml_qnn_logbuf);
+    }
+}
+
+
+int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) {
+    BackendIdType backend_id = QNN_BACKEND_ID_NULL;
+    QNN_LOG_DEBUG("enter qnn_init\n");
+
+    const std::lock_guard<std::mutex> lock(_init_mutex);
+
+    if (0 != load_system()) {
+        QNN_LOG_WARN("failed to load QNN system lib\n");
+        return 1;
+    } else {
+        QNN_LOG_DEBUG("load QNN system lib successfully\n");
+    }
+
+    std::string backend_lib_path = _lib_path + _backend_name;
+    if (0 == _lib_path_to_backend_id.count(backend_lib_path)) {
+        int is_load_ok = load_backend(backend_lib_path, saver_config);
+        if (0 != is_load_ok) {
+            QNN_LOG_WARN("failed to load QNN backend\n");
+            return 2;
+        }
+    }
+
+    backend_id = _lib_path_to_backend_id[backend_lib_path];
+    if (0 == _loaded_backend.count(backend_id) ||
+        0 == _loaded_lib_handle.count(backend_id)) {
+        QNN_LOG_WARN("library %s is loaded but loaded backend count=%zu, loaded lib_handle count=%zu\n",
+                     backend_lib_path.c_str(),
+                     _loaded_backend.count(backend_id),
+                     _loaded_lib_handle.count(backend_id));
+        return 3;
+    }
+
+    _qnn_interface.set_qnn_interface(_loaded_backend[backend_id]);
+
+#if 1
+    _qnn_interface.qnn_log_create(ggml_qnn_logcallback, _qnn_log_level, &_qnn_log_handle);
+#else
+    _qnn_raw_interface.logCreate(ggml_qnn_logcallback, _qnn_log_level, &_qnn_log_handle);
+#endif
+    if (nullptr == _qnn_log_handle) {
+        QNN_LOG_WARN("failed to initialize qnn log\n"); //the DSP backend does not work on some low-end Qualcomm SoC based phones
+        return 4;
+    } else {
+        QNN_LOG_DEBUG("initialize qnn log successfully\n");
+    }
+
+
+    std::vector<const QnnBackend_Config_t *> temp_backend_config;
+    _qnn_interface.qnn_backend_create(_qnn_log_handle, 
temp_backend_config.empty() ? nullptr + : temp_backend_config.data(), + &_qnn_backend_handle); + if (nullptr == _qnn_backend_handle) { + QNN_LOG_WARN("why failed to initialize qnn backend\n"); + return 5; + } else { + QNN_LOG_DEBUG("initialize qnn backend successfully\n"); + } + + if (nullptr != _qnn_raw_interface.propertyHasCapability) { + auto qnnStatus = _qnn_raw_interface.propertyHasCapability(QNN_PROPERTY_GROUP_DEVICE); + if (QNN_PROPERTY_NOT_SUPPORTED == qnnStatus) { + QNN_LOG_WARN("device property is not supported\n"); + } + if (QNN_PROPERTY_ERROR_UNKNOWN_KEY == qnnStatus) { + QNN_LOG_WARN("device property is not known to backend\n"); + } + } + + auto qnnStatus = _qnn_raw_interface.deviceCreate(_qnn_log_handle, nullptr, &_qnn_device_handle); + if (QNN_SUCCESS != qnnStatus && QNN_DEVICE_ERROR_UNSUPPORTED_FEATURE != qnnStatus) { + QNN_LOG_WARN("failed to create QNN device\n"); + } else { + QNN_LOG_INFO("create device successfully\n"); + } + + /* + std::vector temp_device_config; + _qnn_interface.qnn_device_create(_qnn_log_handle, temp_device_config.empty() ? nullptr : temp_device_config.data(), &_qnn_device_handle); + if (nullptr == _qnn_device_handle) { + QNN_LOG_WARN("why failed to initialize qnn device\n"); + //return 6; + } + */ + + if (ggml_qnn_profile_level::profile_off != _profile_level) { + QNN_LOG_INFO("profiling turned on; level = %d", _profile_level); + if (ggml_qnn_profile_level::profile_basic == _profile_level) { + QNN_LOG_INFO("basic profiling requested. creating Qnn Profile object\n"); + if (QNN_PROFILE_NO_ERROR != _qnn_raw_interface.profileCreate( + _qnn_backend_handle, QNN_PROFILE_LEVEL_BASIC, &_qnn_profile_handle)) { + QNN_LOG_WARN("unable to create profile handle in the backend\n"); + return 7; + } else { + QNN_LOG_DEBUG("initialize qnn profile successfully\n"); + } + } else if (ggml_qnn_profile_level::profile_detail == _profile_level) { + QNN_LOG_INFO("detailed profiling requested. Creating Qnn Profile object\n"); + if (QNN_PROFILE_NO_ERROR != _qnn_raw_interface.profileCreate( + _qnn_backend_handle, QNN_PROFILE_LEVEL_DETAILED, &_qnn_profile_handle)) { + QNN_LOG_WARN("unable to create profile handle in the backend\n"); + return 7; + } else { + QNN_LOG_DEBUG("initialize qnn profile successfully\n"); + } + } + } + + + _rpc_lib_handle = dlopen("libcdsprpc.so", RTLD_NOW | RTLD_LOCAL); + if (nullptr == _rpc_lib_handle) { + QNN_LOG_WARN("failed to load qualcomm's rpc lib, error:%s\n", dlerror()); + return 9; + } else { + QNN_LOG_DEBUG("load rpcmem lib successfully\n"); + set_rpcmem_initialized(true); + } + _pfn_rpc_mem_init = reinterpret_cast(dlsym(_rpc_lib_handle, "rpcmem_init")); + _pfn_rpc_mem_deinit = reinterpret_cast(dlsym(_rpc_lib_handle, "rpcmem_deinit")); + _pfn_rpc_mem_alloc = reinterpret_cast(dlsym(_rpc_lib_handle,"rpcmem_alloc")); + _pfn_rpc_mem_free = reinterpret_cast(dlsym(_rpc_lib_handle, "rpcmem_free")); + _pfn_rpc_mem_to_fd = reinterpret_cast(dlsym(_rpc_lib_handle,"rpcmem_to_fd")); + if (nullptr == _pfn_rpc_mem_alloc || nullptr == _pfn_rpc_mem_free + || nullptr == _pfn_rpc_mem_to_fd) { + QNN_LOG_WARN("unable to access symbols in QNN RPC lib. dlerror(): %s", dlerror()); + dlclose(_rpc_lib_handle); + return 10; + } + + if (nullptr != _pfn_rpc_mem_init) // make Qualcomm's SoC based low-end phone happy + _pfn_rpc_mem_init(); + + std::vector temp_context_config; + _qnn_interface.qnn_context_create(_qnn_backend_handle, _qnn_device_handle, + temp_context_config.empty() ? 
nullptr + : temp_context_config.data(), + &_qnn_context_handle); + if (nullptr == _qnn_context_handle) { + QNN_LOG_WARN("why failed to initialize qnn context\n"); + return 8; + } else { + QNN_LOG_DEBUG("initialize qnn context successfully\n"); + } + + QNN_LOG_DEBUG("leave qni_init\n"); + + return 0; +} + + +//QNN SDK would/might/should release all allocated resource in SDK's internal +int qnn_instance::qnn_finalize() { + int ret_status = 0; + Qnn_ErrorHandle_t error = QNN_SUCCESS; + + if (nullptr != _pfn_rpc_mem_deinit) // make Qualcomm's SoC based low-end phone happy + _pfn_rpc_mem_deinit(); + + if (dlclose(_rpc_lib_handle) != 0) { + QNN_LOG_WARN("failed to unload qualcomm's rpc lib, error:%s\n", dlerror()); + } else { + QNN_LOG_DEBUG("succeed to close rpcmem lib\n"); + } + + if (nullptr != _qnn_context_handle) { + error = _qnn_interface.qnn_context_free(_qnn_context_handle, _qnn_profile_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN context_handle: ID %u, error %d\n", + _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error)); + + } + _qnn_context_handle = nullptr; + } + + if (nullptr != _qnn_profile_handle) { + error = _qnn_interface.qnn_profile_free(_qnn_profile_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN profile_handle: ID %u, error %d\n", + _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error)); + + } + _qnn_profile_handle = nullptr; + } + + if (nullptr != _qnn_device_handle) { + error = _qnn_interface.qnn_device_free(_qnn_device_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN device_handle: ID %u, error %d\n", + _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error)); + + } + _qnn_device_handle = nullptr; + } + + if (nullptr != _qnn_backend_handle) { + error = _qnn_interface.qnn_backend_free(_qnn_backend_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN backend_handle: ID %u, error %d\n", + _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error)); + } + _qnn_backend_handle = nullptr; + + } + + if (nullptr != _qnn_log_handle) { + error = _qnn_interface.qnn_log_free(_qnn_log_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN log_handle: ID %u, error %d\n", + _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error)); + } + _qnn_log_handle = nullptr; + } + + unload_backend(); + + unload_system(); + + return ret_status; +} + + +int qnn_instance::init_qnn_graph(const char * graph_name, bool debug, uint8_t do_node_validation, + const QnnGraph_Config_t ** graph_configs) { + int result = 0; + + if (nullptr == graph_name) { + QNN_LOG_WARN("graph name is null\n"); + return 1; + } + + if (!_graph_name.empty()) { + QNN_LOG_WARN("qnn model for graph %s already initialized\n", graph_name); + return 2; + } + + if (!do_node_validation) { + QNN_LOG_WARN("node validation disabled, backend will not perform op validation prior to adding node\n"); + } + + _graph_name = graph_name; + _debug_tensor = debug; + _do_node_validations = do_node_validation; + + result = _qnn_raw_interface.graphCreate(_qnn_context_handle, graph_name, graph_configs, + &_qnn_graph_handle); + if (result != QNN_GRAPH_NO_ERROR || nullptr == _qnn_graph_handle) { + QNN_LOG_WARN("failed to create graph in qnn context\n"); + return 3; + } else { + QNN_LOG_INFO("succeed to create graph %s, %p\n", graph_name, _qnn_graph_handle); + } + + return 0; +} + + +int qnn_instance::finalize_qnn_graph() { + if (nullptr != _qnn_graph_handle) { + if (_qnn_raw_interface.graphFinalize(_qnn_graph_handle, 
_qnn_profile_handle, nullptr) != + QNN_GRAPH_NO_ERROR) { + QNN_LOG_WARN("finalizing graph failure\n"); + //return 1; + } + } else { + QNN_LOG_DEBUG("qnn graph handle is null\n"); + } + + return 0; +} + + + +// ================================================================================================= +// +// implementation of GGML's QNN backend +// +// ================================================================================================= +static bool ggml_qnn_can_handle_op(const struct ggml_tensor * src0, const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + + const int64_t ne0 = dst->ne[0]; + const int64_t ne1 = dst->ne[1]; + + //double check + bool supported_op = ((dst->op == GGML_OP_ADD) || (dst->op == GGML_OP_MUL) || (dst->op == GGML_OP_MUL_MAT)); + if (!supported_op) { + QNN_LOG_DEBUG("op %d(%s)not support", dst->op, ggml_op_name(dst->op)); + return false; + } + + + //make QNN SDK happy + if (dst->op == GGML_OP_ADD) { + return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) && + (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16) && + (dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16) && ((ne00 > 1 && ne01 > 1 && ne10 > 1 && ne11 > 1)) && + (src0->rank == src1->rank); + + } + + if (dst->op == GGML_OP_MUL_MAT) { +#if 1 // log output have significant effect to performance but useful during development stage + QNN_LOG_DEBUG("GGML_OP_MUL_MAT"); + QNN_LOG_INFO("%15s: rank = %d, type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src0->name, src0->rank, + src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], + src0->nb[0], src0->nb[1], src0->nb[2]); + QNN_LOG_INFO("%15s: rank = %d, type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src1->name, src1->rank, + src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], + src1->nb[0], src1->nb[1], src1->nb[2]); + QNN_LOG_INFO("%15s: rank = %d, type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + dst->name, dst->rank, + dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], + dst->nb[1], dst->nb[2]); +#endif + } + + //make QNN SDK happy + return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) && + (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16) && + (src0->type == src1->type) && (src0->type == dst->type) && ((ne00 > 1 && ne01 > 1 && ne10 > 1 && ne11 > 1)); + + +} + + +static void ggml_qnn_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + bool graph_initialized = false; + int64_t n_begin_time = 0LL; + int64_t n_end_time = 0LL; + int64_t n_durtion = 0LL; + + qnn_instance * instance = nullptr; + struct ggml_backend_qnn_context * ctx = nullptr; + + std::string graph_name = "ggml_op_qnn_add"; + Qnn_GraphHandle_t graph_handle = nullptr; + Qnn_Tensor_t * tensor_0 = nullptr; + Qnn_Tensor_t * tensor_1 = nullptr; + Qnn_Tensor_t * tensor_2 = nullptr; + + Qnn_QuantizeParams_t quantize_param = QNN_QUANTIZE_PARAMS_INIT; + Qnn_OpConfig_t qnn_opconfig = QNN_OPCONFIG_INIT; + Qnn_Param_t qnn_params[] = {}; + + enum ggml_op ggmlop = GGML_OP_ADD; + Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_DataType_t 
dst_qnn_type = QNN_DATATYPE_FLOAT_32; + + + if ((nullptr == src0) || (nullptr == src1) || (nullptr == dst)) { + QNN_LOG_WARN("pls check why GGML tensor is null"); + return; + } + tensor_0 = (Qnn_Tensor_t *)src0->extra; + tensor_1 = (Qnn_Tensor_t *)src1->extra; + tensor_2 = (Qnn_Tensor_t *)dst->extra; + if ((nullptr == tensor_0) || (nullptr == tensor_1) || (nullptr == tensor_2)) { + QNN_LOG_WARN("pls check why QNN tensor is null"); + return; + } + ctx = (struct ggml_backend_qnn_context *)g_qnn_backend->context; + if (nullptr == ctx) { + QNN_LOG_WARN("pls check why backend ctx is null"); + return; + } + instance = ctx->instance; + if (nullptr == instance) { + QNN_LOG_WARN("pls check why qnn instance is null"); + return; + } + QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; + + n_begin_time = ggml_time_us(); +#if 0 //it works fine with whisper.cpp and llama.cpp. comment them because focus on mulmat in llama.cpp inference since 04-23-2024 + QNN_LOG_DEBUG("call %s\n", __func__); + QNN_LOG_INFO("%15s: rank = %d, type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src0->name, src0->rank, + src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], + src0->nb[0], src0->nb[1], src0->nb[2]); + QNN_LOG_INFO("%15s: rank = %d, type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src1->name, src1->rank, + src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], + src1->nb[0], src1->nb[1], src1->nb[2]); + QNN_LOG_INFO("%15s: rank = %d, type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + dst->name, dst->rank, + dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], + dst->nb[1], dst->nb[2]); + QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); + QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(tensor_0)); + QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(tensor_1)); + QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(tensor_2)); +#endif + + QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; + QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; + QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; + + uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], + (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; + uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], + (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; + uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], + (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; + + + src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type); + src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); + dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type); + + std::string map_entry = std::string(ggml_op_name(ggmlop)); + if (instance->_qnn_graph_map.find(map_entry) != instance->_qnn_graph_map.end()) { + graph_initialized = true; + auto & graph_item = instance->_qnn_graph_map[map_entry]; + graph_handle = std::get<0>(graph_item); + } + + if (!graph_initialized) { + graph_name = graph_name + "_" + std::to_string(ctx->threads) + src0->name + "_" + src1->name; + QNN_LOG_DEBUG("graph name %s", graph_name.c_str()); + //QnnGraph_Config_t graph_config; + //graph_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + //graph_config.customConfig = strdup(graph_name.c_str()); + //const QnnGraph_Config_t * p_graph_config = 
&graph_config; + error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), graph_name.c_str(), nullptr, &graph_handle); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("can't create qnn graph handle with graph name %s, error = %d\n", graph_name.c_str(), error); + return; + } + error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_0); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_1); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_2); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + + Qnn_Tensor_t tensor_inputs[] = { + *tensor_0, + *tensor_1 + }; + Qnn_Tensor_t tensor_outputs[] = { + *tensor_2 + }; + Qnn_OpConfig_t opconfig = { + (Qnn_OpConfigVersion_t) 1, .v1 = { + "ggml_op_add", + QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_ELEMENT_WISE_ADD, + 0, + qnn_params, + 2, + tensor_inputs, + 1, + tensor_outputs + } + }; + error = qnn_raw_interface.graphAddNode(graph_handle, opconfig); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2); + instance->_qnn_graph_map[map_entry] = graph_item; + } else { + auto & graph_item = instance->_qnn_graph_map[map_entry]; + graph_handle = std::get<0>(graph_item); + tensor_0 = std::get<1>(graph_item); + tensor_1 = std::get<2>(graph_item); + tensor_2 = std::get<3>(graph_item); + + //comment them because focus on mulmat in llama.cpp inference since 04-23-2024 + //QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); + uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], + (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; + uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], + (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; + uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], + (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; + QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; + QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); + QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; + QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; + QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); + QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; + QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; + QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); + QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; + + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + + Qnn_Tensor_t tensor_inputs[] = { + *tensor_0, + 
*tensor_1
+        };
+        Qnn_Tensor_t tensor_outputs[] = {
+            *tensor_2
+        };
+        error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr);
+        if (QNN_SUCCESS != error) {
+            QNN_LOG_INFO("error = %d\n", error);
+        }
+    }
+    n_end_time = ggml_time_us();
+    n_durtion = (n_end_time - n_begin_time) / 1000;
+    //commented out to focus on mul_mat in llama.cpp inference since 04-23-2024
+    //QNN_LOG_DEBUG("duration of ggml_qnn_add : %lld milliseconds\n", n_durtion);
+    //QNN_LOG_DEBUG("call %s done\n", __func__);
+}
+
+
+
+/*
+ * ggml_qnn_mul_mat is kept as a standalone function because of the following notes
+ * from https://github.com/ggerganov/llama.cpp/pull/1632:
+ * MUL_MAT takes most of the compute time (about 95%), so to speed up llama we have to focus on MUL_MAT.
+ * There are three kinds of MUL_MAT to compute:
+ * mul_mat_f32:     both src0 and src1 are F32.
+ * mul_mat_f16_f32: src0 is F16 and src1 is F32.
+ * mul_mat_q_f32:   src0 is quantized (Q4_0, Q4_1, ...), and src1 is F32.
+*/
+
+static void ggml_qnn_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    Qnn_ErrorHandle_t error = QNN_SUCCESS;
+    bool graph_initialized = false;
+    int64_t n_begin_time = 0LL;
+    int64_t n_end_time = 0LL;
+    int64_t n_durtion = 0LL;
+
+    qnn_instance * instance = nullptr;
+    struct ggml_backend_qnn_context * ctx = nullptr;
+
+    std::string graph_name = "ggml_op_qnn_mul_mat";
+    Qnn_GraphHandle_t graph_handle = nullptr;
+    Qnn_Tensor_t * tensor_0 = nullptr;
+    Qnn_Tensor_t * tensor_1 = nullptr;
+    Qnn_Tensor_t * tensor_2 = nullptr;
+
+    Qnn_QuantizeParams_t quantize_param = QNN_QUANTIZE_PARAMS_INIT;
+    Qnn_OpConfig_t qnn_opconfig = QNN_OPCONFIG_INIT;
+    Qnn_Param_t qnn_params[] = {};
+
+    enum ggml_op ggmlop = GGML_OP_MUL_MAT; // was GGML_OP_ADD, which made the graph-cache key collide with ggml_qnn_add
+    Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32;
+    Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32;
+    Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32;
+
+
+    if ((nullptr == src0) || (nullptr == src1) || (nullptr == dst)) {
+        QNN_LOG_WARN("pls check why GGML tensor is null");
+        return;
+    }
+    tensor_0 = (Qnn_Tensor_t *)src0->extra;
+    tensor_1 = (Qnn_Tensor_t *)src1->extra;
+    tensor_2 = (Qnn_Tensor_t *)dst->extra;
+    if ((nullptr == tensor_0) || (nullptr == tensor_1) || (nullptr == tensor_2)) {
+        QNN_LOG_WARN("pls check why QNN tensor is null");
+        return;
+    }
+    ctx = (struct ggml_backend_qnn_context *)g_qnn_backend->context;
+    if (nullptr == ctx) {
+        QNN_LOG_WARN("pls check why backend ctx is null");
+        return;
+    }
+    instance = ctx->instance;
+    if (nullptr == instance) {
+        QNN_LOG_WARN("pls check why qnn instance is null");
+        return;
+    }
+    QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface;
+
+    n_begin_time = ggml_time_us();
+    QNN_LOG_DEBUG("call %s\n", __func__);
+    QNN_LOG_INFO("%15s: rank = %d, type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
+                 src0->name, src0->rank,
+                 src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2],
+                 src0->nb[0], src0->nb[1], src0->nb[2]);
+    QNN_LOG_INFO("%15s: rank = %d, type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
+                 src1->name, src1->rank,
+                 src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2],
+                 src1->nb[0], src1->nb[1], src1->nb[2]);
+    QNN_LOG_INFO("%15s: rank = %d, type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
+                 dst->name, dst->rank,
+                 dst->type, ggml_type_name(dst->type), dst->ne[0], 
dst->ne[1], dst->ne[2], dst->nb[0], + dst->nb[1], dst->nb[2]); + QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); + QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(tensor_0)); + QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(tensor_1)); + QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(tensor_2)); + + QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; + QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; + QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; + + uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], + (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; + uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], + (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; + uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], + (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; + + src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type); + src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); + dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type); + + std::string map_entry = std::string(ggml_op_name(ggmlop)); + if (instance->_qnn_graph_map.find(map_entry) != instance->_qnn_graph_map.end()) { + graph_initialized = true; + auto & graph_item = instance->_qnn_graph_map[map_entry]; + graph_handle = std::get<0>(graph_item); + } + + if (!graph_initialized) { + graph_name = graph_name + "_" + std::to_string(ctx->threads) + src0->name + "_" + src1->name; + QNN_LOG_DEBUG("graph name %s", graph_name.c_str()); + error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), graph_name.c_str(), nullptr, &graph_handle); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("can't create qnn graph handle with graph name %s, error = %d\n", graph_name.c_str(), error); + return; + } + error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_0); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_1); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_2); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + + Qnn_Tensor_t tensor_inputs[] = { + *tensor_0, + *tensor_1 + }; + Qnn_Tensor_t tensor_outputs[] = { + *tensor_2 + }; + Qnn_OpConfig_t opconfig = { + (Qnn_OpConfigVersion_t) 1, .v1 = { + "ggml_op_mul_mat", + QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_MAT_MUL, + 0, + qnn_params, + 2, + tensor_inputs, + 1, + tensor_outputs + } + }; + error = qnn_raw_interface.graphAddNode(graph_handle, opconfig); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2); + instance->_qnn_graph_map[map_entry] = graph_item; + } else { + auto & graph_item = 
instance->_qnn_graph_map[map_entry]; + graph_handle = std::get<0>(graph_item); + tensor_0 = std::get<1>(graph_item); + tensor_1 = std::get<2>(graph_item); + tensor_2 = std::get<3>(graph_item); + + QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); + uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], + (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; + uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], + (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; + uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], + (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; + QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; + QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); + QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; + QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; + QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); + QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; + QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; + QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); + QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; + + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + + Qnn_Tensor_t tensor_inputs[] = { + *tensor_0, + *tensor_1 + }; + Qnn_Tensor_t tensor_outputs[] = { + *tensor_2 + }; + error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + } + n_end_time = ggml_time_us(); + n_durtion = (n_end_time - n_begin_time) / 1000; + QNN_LOG_DEBUG("duration of ggml_qnn_mul_mat : %lld milliseconds\n", n_durtion); + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +//common function for GGML OPs using QNN API +static void ggml_qnn_hanlde_op(const enum ggml_op ggmlop, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + bool graph_initialized = false; + int64_t n_begin_time = 0LL; + int64_t n_end_time = 0LL; + int64_t n_durtion = 0LL; + + qnn_instance * instance = nullptr; + struct ggml_backend_qnn_context * ctx = nullptr; + + std::string qnn_graph_name = "ggml_qnn_graph"; + std::string qnn_opconfig_name = "ggml_qnn_opconfig"; + const char * qnn_op_name = nullptr; + Qnn_GraphHandle_t graph_handle = nullptr; + Qnn_Tensor_t * tensor_0 = nullptr; + Qnn_Tensor_t * tensor_1 = nullptr; + Qnn_Tensor_t * tensor_2 = nullptr; + + Qnn_QuantizeParams_t quantize_param = QNN_QUANTIZE_PARAMS_INIT; + Qnn_OpConfig_t qnn_opconfig = QNN_OPCONFIG_INIT; + Qnn_Param_t qnn_params[] = {}; + + Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32; + + + if ((nullptr == src0) || (nullptr == src1) || (nullptr == dst)) { + QNN_LOG_WARN("pls check why GGML tensor is null"); + return; + } + tensor_0 = (Qnn_Tensor_t *)src0->extra; + tensor_1 = (Qnn_Tensor_t *)src1->extra; + tensor_2 = (Qnn_Tensor_t *)dst->extra; + if ((nullptr == tensor_0) || (nullptr == tensor_1) || (nullptr == tensor_2)) { + QNN_LOG_WARN("pls check why QNN tensor is null"); + return; + } + ctx = (struct ggml_backend_qnn_context *)g_qnn_backend->context; + if (nullptr == ctx) { + QNN_LOG_WARN("pls check why 
backend ctx is null"); + return; + } + instance = ctx->instance; + if (nullptr == instance) { + QNN_LOG_WARN("pls check why qnn instance is null"); + return; + } + QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; + + src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type); + src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); + dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type); + qnn_op_name = qnn_opname_from_ggmlop(ggmlop); + if (nullptr == qnn_op_name) { + QNN_LOG_WARN("pls check why can not get QNN OP name with ggml op %d(%s)", ggmlop, ggml_op_name(ggmlop)); + return; + } + + n_begin_time = ggml_time_us(); + QNN_LOG_DEBUG("call %s\n", __func__); + QNN_LOG_INFO("%15s: rank = %d, type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src0->name, src0->rank, + src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], + src0->nb[0], src0->nb[1], src0->nb[2]); + QNN_LOG_INFO("%15s: rank = %d, type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src1->name, src1->rank, + src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], + src1->nb[0], src1->nb[1], src1->nb[2]); + QNN_LOG_INFO("%15s: rank = %d, type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + dst->name, dst->rank, + dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], + dst->nb[1], dst->nb[2]); + QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); + QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(tensor_0)); + QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(tensor_1)); + QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(tensor_2)); + + QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; + QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; + QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; + + uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], + (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; + uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], + (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; + uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], + (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; + + std::string map_entry = std::string(ggml_op_name(ggmlop)); + if (instance->_qnn_graph_map.find(map_entry) != instance->_qnn_graph_map.end()) { + graph_initialized = true; + auto & graph_item = instance->_qnn_graph_map[map_entry]; + graph_handle = std::get<0>(graph_item); + } + + if (!graph_initialized) { + qnn_graph_name = qnn_graph_name + "_" + ggml_op_name(ggmlop) + std::to_string(ctx->threads) + src0->name + "_" + src1->name; + qnn_opconfig_name = qnn_opconfig_name + "_" + ggml_op_name(ggmlop) + std::to_string(ctx->threads) + src0->name + "_" + src1->name; + QNN_LOG_DEBUG("qnn graph name %s", qnn_graph_name.c_str()); + QNN_LOG_DEBUG("qnn opconfig name %s", qnn_opconfig_name.c_str()); + error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), qnn_graph_name.c_str(), nullptr, &graph_handle); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("can't create qnn graph handle with ggml op %s, graph name %s, error = %d\n", ggml_op_name(ggmlop), qnn_graph_name.c_str(), error); + return; + } + + error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_0); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + error = 
qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_1); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_2); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + + Qnn_Tensor_t tensor_inputs[] = { + *tensor_0, + *tensor_1 + }; + Qnn_Tensor_t tensor_outputs[] = { + *tensor_2 + }; + Qnn_OpConfig_t opconfig = { + (Qnn_OpConfigVersion_t) 1, .v1 = { + qnn_opconfig_name.c_str(), + QNN_OP_PACKAGE_NAME_QTI_AISW, + qnn_op_name, + 0, + qnn_params, + 2, + tensor_inputs, + 1, + tensor_outputs + } + }; + error = qnn_raw_interface.graphAddNode(graph_handle, opconfig); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2); + instance->_qnn_graph_map[map_entry] = graph_item; + } else { + auto & graph_item = instance->_qnn_graph_map[map_entry]; + graph_handle = std::get<0>(graph_item); + tensor_0 = std::get<1>(graph_item); + tensor_1 = std::get<2>(graph_item); + tensor_2 = std::get<3>(graph_item); + + QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); + uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], + (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; + uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], + (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; + uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], + (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; + QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; + QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); + QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; + QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; + QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); + QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; + QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; + QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); + QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; + + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + + Qnn_Tensor_t tensor_inputs[] = { + *tensor_0, + *tensor_1 + }; + Qnn_Tensor_t tensor_outputs[] = { + *tensor_2 + }; + error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + } + n_end_time = ggml_time_us(); + n_durtion = (n_end_time - n_begin_time) / 1000; + QNN_LOG_DEBUG("duration of ggml_qnn_%s : %lld milliseconds\n", ggml_op_name(ggmlop), n_durtion); + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_repeat(const ggml_tensor * 
src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_get_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_acc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + + + + +static void ggml_qnn_div(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_gelu_quick(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_tanh(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_hardsigmoid(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_hardswish(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_leaky_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_sqr(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_group_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_concat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_upscale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_pad(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_rms_norm(const ggml_tensor * src0, const 
ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_qnn_cpy(src0, dst, nullptr); + (void) src1; +} + + +static void ggml_qnn_mul_mat_id(const ggml_tensor * src0, + const ggml_tensor * src1, + ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_clamp(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous(src0)); + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); + +} + + +static void ggml_qnn_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_pool2d(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_im2col(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_sum_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous(src0)); + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_argsort(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous(src0)); + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + (void) src0; + (void) src1; + (void) dst; + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +bool ggml_qnn_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) { + ggml_qnn_func_t func = nullptr; + ggml_qnn_func_common_t func_common = nullptr; + + bool supported_op = false; + + bool use_hwaccel = false; + + //begin sanity check + if (nullptr == g_qnn_backend) { + QNN_LOG_ERROR("pls check why qnn subsystem not initialized"); + return false; + } + + //this is special scenario for UT function qnn_ggml_op + //borrow some advantages from PyTorch:the user or the upper layer codes could specify 
whether a GGML OP(such as add/mul/mulmat) is accelerated by a specify backend) + //otherwise ggml-qnn.cpp don't known whether current caller is whisper.cpp or other scenario(for example, JNI function...) + + //in the all, use_hwaccel is different with supported_op + //this feature is heavily depend on PR in upstream whisper.cpp https://github.com/ggerganov/whisper.cpp/pull/2073 + use_hwaccel = (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU); + + supported_op = ((tensor->op == GGML_OP_ADD) || (tensor->op == GGML_OP_MUL) || (tensor->op == GGML_OP_MUL_MAT)); + //supported_op = (tensor->op == GGML_OP_ADD); //works very good with whisper.cpp(asr result is correct) + + if ((!use_hwaccel) && (!supported_op)) { + //TODO: should be removed because this is a workaround method during development stage + ggml_compute_forward(params, tensor); + return false; + } + + if ((!use_hwaccel) && (!ggml_qnn_can_handle_op(tensor->src[0], tensor->src[1], tensor))) { + //TODO: should be removed because this is a workaround method during development stage + ggml_compute_forward(params, tensor); + return false; + } + //end sanity check + + switch (tensor->op) { + case GGML_OP_ADD: + func = ggml_qnn_add; + //func_common = ggml_qnn_hanlde_op; + break; + + case GGML_OP_MUL: + func_common = ggml_qnn_hanlde_op; + break; + + case GGML_OP_MUL_MAT: + func = ggml_qnn_mul_mat; + //func_common = ggml_qnn_hanlde_op; + break; + + case GGML_OP_REPEAT: + func = ggml_qnn_repeat; + break; + case GGML_OP_GET_ROWS: + func = ggml_qnn_get_rows; + break; + case GGML_OP_DUP: + func = ggml_qnn_dup; + break; + + case GGML_OP_ACC: + func = ggml_qnn_acc; + break; + + case GGML_OP_DIV: + func = ggml_qnn_div; + break; + + case GGML_OP_UNARY: + switch (ggml_get_unary_op(tensor)) { + case GGML_UNARY_OP_GELU: + func = ggml_qnn_gelu; + break; + case GGML_UNARY_OP_SILU: + func = ggml_qnn_silu; + break; + case GGML_UNARY_OP_GELU_QUICK: + func = ggml_qnn_gelu_quick; + break; + case GGML_UNARY_OP_TANH: + func = ggml_qnn_tanh; + break; + case GGML_UNARY_OP_RELU: + func = ggml_qnn_relu; + break; + case GGML_UNARY_OP_HARDSIGMOID: + func = ggml_qnn_hardsigmoid; + break; + case GGML_UNARY_OP_HARDSWISH: + func = ggml_qnn_hardswish; + break; + default: + return false; + } + break; + case GGML_OP_NORM: + func = ggml_qnn_norm; + break; + case GGML_OP_GROUP_NORM: + func = ggml_qnn_group_norm; + break; + case GGML_OP_CONCAT: + func = ggml_qnn_concat; + break; + case GGML_OP_UPSCALE: + func = ggml_qnn_upscale; + break; + case GGML_OP_PAD: + func = ggml_qnn_pad; + break; + case GGML_OP_LEAKY_RELU: + func = ggml_qnn_leaky_relu; + break; + case GGML_OP_RMS_NORM: + func = ggml_qnn_rms_norm; + break; + + case GGML_OP_MUL_MAT_ID: + func = ggml_qnn_mul_mat_id; + break; + case GGML_OP_SCALE: + func = ggml_qnn_scale; + break; + case GGML_OP_SQR: + func = ggml_qnn_sqr; + break; + case GGML_OP_CLAMP: + func = ggml_qnn_clamp; + break; + case GGML_OP_CPY: + func = ggml_qnn_cpy; + break; + case GGML_OP_CONT: + func = ggml_qnn_dup; + break; + case GGML_OP_NONE: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + func = ggml_qnn_nop; + break; + case GGML_OP_DIAG_MASK_INF: + func = ggml_qnn_diag_mask_inf; + break; + case GGML_OP_SOFT_MAX: + func = ggml_qnn_soft_max; + break; + case GGML_OP_ROPE: + func = ggml_qnn_rope; + break; + case GGML_OP_ALIBI: + func = ggml_qnn_alibi; + break; + case GGML_OP_IM2COL: + func = ggml_qnn_im2col; + break; + case GGML_OP_POOL_2D: + func = ggml_qnn_pool2d; + break; + case GGML_OP_SUM_ROWS: + func = 
ggml_qnn_sum_rows; + break; + case GGML_OP_ARGSORT: + func = ggml_qnn_argsort; + break; + default: + return false; + } + + + //ok, real show time in Qualcomm's QNN internal + if (nullptr != func) + func(tensor->src[0], tensor->src[1], tensor); + if (nullptr != func_common) + func_common(tensor->op, tensor->src[0], tensor->src[1], tensor); + + return true; +} + + +struct ggml_backend_qnn_buffer_context { + ~ggml_backend_qnn_buffer_context() { + if (buffer) { + free(buffer); + } + for (auto * sub_buffer : sub_buffers) { + free(sub_buffer); + } + + for (auto * qnn_tensor : qnn_tensors) { + free_qnn_tensor(*qnn_tensor); + free(qnn_tensor); + } + + std::map>::iterator graph_it; + struct ggml_backend_qnn_context * ctx = (struct ggml_backend_qnn_context *) g_qnn_backend->context; + QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->instance->get_qnn_raw_interface(); + for (graph_it = backend_ctx->instance->_qnn_graph_map.begin(); graph_it != backend_ctx->instance->_qnn_graph_map.end(); graph_it++) { + auto & graph_item = graph_it->second; + Qnn_GraphHandle_t & graph_handle = std::get<0>(graph_item); + QNN_LOG_DEBUG("graph type:%s", graph_it->first.c_str()); + } + backend_ctx->instance->_qnn_graph_map.clear(); + + sub_buffers.clear(); + qnn_tensors.clear(); + } + void * buffer = nullptr; + + struct ggml_backend_qnn_context * backend_ctx = nullptr; + + size_t buffer_size = 0; + std::vector sub_buffers; + std::vector qnn_tensors; +}; + +static const char * ggml_backend_qnn_buffer_get_name(ggml_backend_buffer_t buffer) { + GGML_UNUSED(buffer); + return "QNN"; +} + + +GGML_CALL static bool ggml_backend_buffer_is_qnn(ggml_backend_buffer_t buffer) { + return buffer->iface.get_name == ggml_backend_qnn_buffer_get_name; +} + + +static void ggml_backend_qnn_buffer_free_buffer(ggml_backend_buffer_t buffer) { + ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *) buffer->context; + delete ctx; +} + + +//TODO:not used +static void * ggml_backend_qnn_buffer_get_base(ggml_backend_buffer_t buffer) { + ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *) buffer->context; + + return ctx->buffer; +} + + +static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *)buffer->context; + + /* + if (tensor->view_src != nullptr && tensor->view_offs == 0) { + assert(tensor->view_src->buffer->buft == buffer->buft); + tensor->backend = tensor->view_src->backend; + tensor->extra = tensor->view_src->extra; + return; + } + */ + + uint32_t dimensions[] = {(uint32_t) tensor->ne[0], (uint32_t) tensor->ne[1], (uint32_t) tensor->ne[2], (uint32_t) tensor->ne[3]}; + //TODO:only support FP32 & FP16 + Qnn_DataType_t qnn_data_type = QNN_DATATYPE_FLOAT_32; + Qnn_TensorType_t qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; + + + if (tensor->flags & GGML_TENSOR_FLAG_INPUT) { + qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; + } else if (tensor->flags & GGML_TENSOR_FLAG_OUTPUT) { + qnn_tensor_type = QNN_TENSOR_TYPE_APP_READ; + } + Qnn_Tensor_t qnn_tensor = { + .version= QNN_TENSOR_VERSION_1, + {.v1= { + .id=0, + .name= tensor->name, + .type= qnn_tensor_type, + .dataFormat= QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, + .dataType= qnn_data_type, + .quantizeParams= {QNN_DEFINITION_UNDEFINED, + QNN_QUANTIZATION_ENCODING_UNDEFINED, + {.scaleOffsetEncoding= {.scale= 0.0000000000000000f, .offset= 0}}}, + .rank= ggml_get_tensor_rank(tensor), + 
.dimensions=dimensions, + .memType= QNN_TENSORMEMTYPE_RAW, + {.clientBuf= {.data=nullptr, + .dataSize=0}}}} + }; + Qnn_Tensor_t * p_qnn_tensor = (Qnn_Tensor_t *)malloc(sizeof(Qnn_Tensor_t)); + if (nullptr == p_qnn_tensor) { + QNN_LOG_WARN("init tensor failed"); + return; + } + Qnn_Tensor_t tensor_copy; + error = deep_copy_qnn_tensors(qnn_tensor, *p_qnn_tensor); + if (error != QNN_SUCCESS) { + free(p_qnn_tensor); + QNN_LOG_DEBUG("init tensor failed"); + return; + } + tensor->extra = p_qnn_tensor; + ctx->qnn_tensors.push_back(p_qnn_tensor); + + if (ggml_is_quantized(tensor->type)) { + //TODO + QNN_LOG_DEBUG("is quantized"); + } +} + + +static void ggml_backend_qnn_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + GGML_UNUSED(buffer); + + memcpy((char *)tensor->data + offset, data, size); +} + + +static void ggml_backend_qnn_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { + GGML_UNUSED(buffer); + memcpy(data, (const char *)tensor->data + offset, size); +} + + +static bool ggml_backend_qnn_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) { + GGML_UNUSED(buffer); + if (ggml_backend_buffer_is_host(src->buffer)) { + memcpy(dst->data, src->data, ggml_nbytes(src)); + return true; + } + + return false; +} + + +static void ggml_backend_qnn_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *) buffer->context; + + memset(ctx->buffer, value, ctx->buffer_size); +} + + + +static void ggml_backend_qnn_buffer_reset(ggml_backend_buffer_t buffer) { + ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *) buffer->context; + for (auto * sub_buffer : ctx->sub_buffers) { + free(sub_buffer); + } + ctx->sub_buffers.clear(); +} + + +static ggml_backend_buffer_i ggml_backend_qnn_buffer_interface = { + /* .get_name = */ ggml_backend_qnn_buffer_get_name, + /* .free_buffer = */ ggml_backend_qnn_buffer_free_buffer, + /* .get_base = */ ggml_backend_qnn_buffer_get_base, + /* .init_tensor = */ ggml_backend_qnn_buffer_init_tensor, + /* .set_tensor = */ ggml_backend_qnn_buffer_set_tensor, + /* .get_tensor = */ ggml_backend_qnn_buffer_get_tensor, + /* .cpy_tensor = */ ggml_backend_qnn_buffer_cpy_tensor, + /* .clear = */ ggml_backend_qnn_buffer_clear, + /* .reset = */ nullptr, +}; + + +static const char * ggml_backend_qnn_buffer_type_name(ggml_backend_buffer_type_t buft) { + return "QNN"; +} + + +static void * ggml_qnn_host_malloc(size_t n) { + void * data = nullptr; + const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n); + if (result != 0) { + QNN_LOG_WARN("%s: error: posix_memalign failed\n", __func__); + return nullptr; + } + + return data; +} + + +static ggml_backend_buffer_t ggml_backend_qnn_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + ggml_backend_qnn_buffer_context * ctx = new ggml_backend_qnn_buffer_context; + + const size_t size_page = sysconf(_SC_PAGESIZE); + + size_t size_aligned = size; + if ((size_aligned % size_page) != 0) { + size_aligned += (size_page - (size_aligned % size_page)); + } + + //TODO:use pre-allocated buffer in internal memory pool + ctx->buffer = ggml_qnn_host_malloc(size_aligned); + ctx->buffer_size = size_aligned; + + ctx->backend_ctx = &g_qnn_mgr[g_current_device]; + + if (nullptr == ctx->buffer) { + QNN_LOG_WARN("%s: failed to allocate %.2f 
MiB\n", __func__, size / (1 << 20)); + return nullptr; + } + + return ggml_backend_buffer_init(buft, ggml_backend_qnn_buffer_interface, ctx, size); +} + + +static size_t ggml_backend_qnn_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + GGML_UNUSED(buft); + return 32; +} + + +//TODO: this value is an experimental value +static size_t ggml_backend_qnn_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { + GGML_UNUSED(buft); + + return (38 * 1024 * 1024); +} + + +static bool ggml_backend_qnn_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, + ggml_backend_t backend) { + GGML_UNUSED(buft); + + return ggml_backend_is_qnn(backend) || ggml_backend_is_cpu(backend); +} + + +// attention here because Qualcomm's QNN SDK is a highly well-designed SDK +// +// refer to https://developer.qualcomm.com/sites/default/files/attachments/qnn_software_stack.png +// https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/overview.html +static bool ggml_backend_qnn_buffer_is_host(ggml_backend_buffer_type_t buft) { + GGML_UNUSED(buft); + return true; +} + +static ggml_backend_buffer_type_i ggml_backend_qnn_buffer_type_interface = { + /* .get_name = */ ggml_backend_qnn_buffer_type_name, + /* .alloc_buffer = */ ggml_backend_qnn_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_qnn_buffer_type_get_alignment, + /* .get_max_size = */ ggml_backend_qnn_buffer_type_get_max_size, + /* .get_alloc_size = */ nullptr, + /* .supports_backend = */ ggml_backend_qnn_buffer_type_supports_backend, + /* .is_host = */ ggml_backend_qnn_buffer_is_host +}; + + +static const char * ggml_backend_qnn_name(ggml_backend_t backend) { + return "QNN"; +} + + +static void ggml_backend_qnn_free(ggml_backend_t backend) { + QNN_LOG_INFO("enter %s", __func__ ); + ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; + QNN_LOG_DEBUG("idx %d, name:%s", ctx->device, g_qnn_mgr[ctx->device].name); + + qnn_instance * instance = (qnn_instance*)g_qnn_mgr[ctx->device].instance; + if (instance != nullptr) { + instance->qnn_finalize(); + delete instance; + g_qnn_mgr[ctx->device].instance = nullptr; + } + + qnn_buf_t * buffer_pool = (qnn_buf_t*)g_qnn_mgr[ctx->device].buffer_pool; + if (buffer_pool != nullptr) { + buffer_pool->destroy(buffer_pool); + g_qnn_mgr[ctx->device].buffer_pool = nullptr; + } + + if (g_qnn_mgr[ctx->device].backend != nullptr) { + delete backend; + g_qnn_backend = nullptr; + g_qnn_mgr[ctx->device].backend = nullptr; + } + QNN_LOG_INFO("leave %s", __func__ ); +} + + +static ggml_backend_buffer_type_t ggml_backend_qnn_get_default_buffer_type(ggml_backend_t backend) { + ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; + + return ggml_backend_qnn_buffer_type(ctx->device); +} + + +#if 0 +static bool ggml_backend_qnn_supports_op(ggml_backend_t backend, const ggml_tensor * op) { + GGML_UNUSED(backend); + + switch (op->op) { + case GGML_OP_UNARY: + switch (ggml_get_unary_op(op)) { + case GGML_UNARY_OP_GELU: + case GGML_UNARY_OP_SILU: + case GGML_UNARY_OP_RELU: + case GGML_UNARY_OP_HARDSIGMOID: + case GGML_UNARY_OP_HARDSWISH: + case GGML_UNARY_OP_GELU_QUICK: + case GGML_UNARY_OP_TANH: + return true; + default: + return false; + } + break; + case GGML_OP_MUL_MAT: + case GGML_OP_MUL_MAT_ID: { + struct ggml_tensor *a; + struct ggml_tensor *b; + if (op->op == GGML_OP_MUL_MAT) { + a = op->src[0]; + b = op->src[1]; + } else { + a = op->src[2]; + b = op->src[1]; + } + if (a->ne[3] != b->ne[3]) { + return false; + } + ggml_type a_type = 
a->type; + if (a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ2_S || + a_type == GGML_TYPE_IQ4_XS) { + return false; + } + return true; + } + break; + case GGML_OP_GET_ROWS: { + switch (op->src[0]->type) { + case GGML_TYPE_F16: + case GGML_TYPE_F32: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + return true; + default: + return false; + } + } + break; + case GGML_OP_CPY: { + ggml_type src0_type = op->src[0]->type; + ggml_type src1_type = op->src[1]->type; + if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) { + return true; + } + if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) { + return true; + } + if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q8_0) { + return true; + } + if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_0) { + return true; + } + if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_1) { + return true; + } + if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) { + return true; + } + if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) { + return true; + } + return false; + } + break; + case GGML_OP_CONCAT: { + ggml_type src0_type = op->src[0]->type; + return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16; + } + break; + case GGML_OP_DUP: + case GGML_OP_NONE: + case GGML_OP_RESHAPE: + case GGML_OP_REPEAT: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + case GGML_OP_NORM: + case GGML_OP_ADD: + case GGML_OP_MUL: + case GGML_OP_DIV: + case GGML_OP_RMS_NORM: + case GGML_OP_SCALE: + case GGML_OP_SQR: + case GGML_OP_CLAMP: + case GGML_OP_CONT: + case GGML_OP_DIAG_MASK_INF: + case GGML_OP_SOFT_MAX: + case GGML_OP_ROPE: + case GGML_OP_ALIBI: + case GGML_OP_IM2COL: + case GGML_OP_POOL_2D: + case GGML_OP_SUM_ROWS: + case GGML_OP_ARGSORT: + case GGML_OP_ACC: + case GGML_OP_GROUP_NORM: + case GGML_OP_UPSCALE: + case GGML_OP_PAD: + case GGML_OP_LEAKY_RELU: + return true; + default: + return false; + } +} +# else +static bool ggml_backend_qnn_supports_op(ggml_backend_t backend, const ggml_tensor * op) { + GGML_UNUSED(backend); + + switch (op->op) { + case GGML_OP_MUL_MAT: + return true; + default: + return false; + } +} +#endif + + +static ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { + enum ggml_status result = GGML_STATUS_SUCCESS; + int node_n = -1; + int task_phase = GGML_TASK_TYPE_FINALIZE; + ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; + + struct ggml_cplan plan = ggml_graph_plan(cgraph, 1); + + buf_element_t * qnn_buf = nullptr; + + if (plan.work_size > 0) { + //plan.work_data = static_cast(malloc(plan.work_size)); + plan.work_data = static_cast(ctx->buffer_pool->buffer_pool_base); + if (plan.work_data == nullptr) { + QNN_LOG_ERROR("malloc failed"); + return GGML_STATUS_FAILED; + } + } + struct ggml_cplan * cplan = &plan; + GGML_ASSERT(cplan->n_threads > 0); + if (cplan->work_size > 0) { + GGML_ASSERT(cplan->work_data); + } + + while (true) { + if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { + result = GGML_STATUS_ABORTED; + break; + } + struct ggml_compute_params params = { + /*.type =*/ GGML_TASK_TYPE_FINALIZE, + /*.ith =*/ 0, + /*.nth =*/ 0, + /*.wsize =*/ cplan->work_size, + /*.wdata =*/ cplan->work_data, + }; + + if (node_n != -1) { + /* FINALIZE */ + struct ggml_tensor * node = cgraph->nodes[node_n]; + if (GGML_OP_HAS_FINALIZE[node->op]) { + params.nth = 1; + 
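+                // params.type is still GGML_TASK_TYPE_FINALIZE here, so this call runs the
+                // finalize pass for the previously dispatched node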
ggml_qnn_compute_forward(¶ms, node); + } + } + + while (++node_n < cgraph->n_nodes) { + struct ggml_tensor * node = cgraph->nodes[node_n]; + params.nth = 1; + if (GGML_OP_HAS_INIT[node->op]) { + params.type = GGML_TASK_TYPE_INIT; + ggml_qnn_compute_forward(¶ms, node); + } + params.type = GGML_TASK_TYPE_COMPUTE; + ggml_qnn_compute_forward(¶ms, node); + if (GGML_OP_HAS_FINALIZE[node->op]) { + params.type = GGML_TASK_TYPE_FINALIZE; + ggml_qnn_compute_forward(¶ms, node); + } + if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { + result = GGML_STATUS_ABORTED; + break; + } + } + task_phase = GGML_TASK_TYPE_INIT; + if (node_n >= cgraph->n_nodes) { + //QNN_LOG_INFO("node_n %d", node_n); + //QNN_LOG_INFO("cgraph->n_nodes %d", cgraph->n_nodes); + break; + } + } + + //free(plan.work_data); + + return result; +} + + +struct ggml_compute_state_shared { + const struct ggml_cgraph * cgraph; + const struct ggml_cplan * cplan; + + int64_t perf_node_start_cycles; + int64_t perf_node_start_time_us; + + const int n_threads; + + // synchronization primitives + atomic_int n_active; // num active threads + atomic_int node_n; // active graph node + atomic_int node_task; // active graph node task phase + + ggml_abort_callback abort_callback; // abort ggml_graph_compute when true + void * abort_callback_data; +}; + +struct ggml_compute_state { + pthread_t thrd; + int ith; + struct ggml_compute_state_shared * shared; + enum ggml_status ec; +}; + + +#ifdef GGML_PERF +#define ggml_perf_time_ms() ggml_time_ms() +#define ggml_perf_time_us() ggml_time_us() +#define ggml_perf_cycles() ggml_cycles() +#define ggml_perf_cycles_per_ms() ggml_cycles_per_ms() +#else +#define ggml_perf_time_ms() 0 +#define ggml_perf_time_us() 0 +#define ggml_perf_cycles() 0 +#define ggml_perf_cycles_per_ms() 0 +#endif +#undef MIN +#undef MAX + +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#define MAX(a, b) ((a) > (b) ? 
(a) : (b)) + + +static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) { + int64_t cycles_cur = ggml_perf_cycles() - st->perf_node_start_cycles; + int64_t time_us_cur = ggml_perf_time_us() - st->perf_node_start_time_us; + + node->perf_runs++; + node->perf_cycles += cycles_cur; + node->perf_time_us += time_us_cur; +} + + +static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_compute_state * state, const bool do_yield) { + // wait for other threads to finish + const int last_node_n = * node_n; + + while (true) { + if (do_yield) { + sched_yield(); + } + + * node_n = atomic_load(&state->shared->node_n); + if (* node_n != last_node_n) break; + } +} + + +static void ggml_graph_compute_thread_sync_task(int * task_phase, struct ggml_compute_state * state, const bool do_yield) { + // wait for other threads to finish + const int last_task_phase = * task_phase; + + while (true) { + if (do_yield) { + sched_yield(); + } + + * task_phase = atomic_load(&state->shared->node_task); + if (* task_phase != last_task_phase) break; + } +} + + +static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_threads) { + int n_tasks = 0; + + if (ggml_is_empty(node)) { + // no need to multi-thread a no-op + n_tasks = 1; + return n_tasks; + } + + switch (node->op) { + case GGML_OP_CPY: + case GGML_OP_DUP: + case GGML_OP_ADD: + case GGML_OP_ADD1: + case GGML_OP_ACC: { + n_tasks = n_threads; + } + break; + case GGML_OP_SUB: + case GGML_OP_SQR: + case GGML_OP_SQRT: + case GGML_OP_LOG: + case GGML_OP_SUM: + case GGML_OP_SUM_ROWS: + case GGML_OP_MEAN: + case GGML_OP_ARGMAX: + case GGML_OP_REPEAT: + case GGML_OP_REPEAT_BACK: + case GGML_OP_LEAKY_RELU: { + n_tasks = 1; + } + break; + case GGML_OP_UNARY: + switch (ggml_get_unary_op(node)) { + case GGML_UNARY_OP_ABS: + case GGML_UNARY_OP_SGN: + case GGML_UNARY_OP_NEG: + case GGML_UNARY_OP_STEP: + case GGML_UNARY_OP_TANH: + case GGML_UNARY_OP_ELU: + case GGML_UNARY_OP_RELU: + case GGML_UNARY_OP_HARDSWISH: + case GGML_UNARY_OP_HARDSIGMOID: { + n_tasks = 1; + } + break; + + case GGML_UNARY_OP_GELU: + case GGML_UNARY_OP_GELU_QUICK: + case GGML_UNARY_OP_SILU: { + n_tasks = n_threads; + } + break; + default: + GGML_ASSERT(false); + } + break; + case GGML_OP_SILU_BACK: + case GGML_OP_MUL: + case GGML_OP_DIV: + case GGML_OP_NORM: + case GGML_OP_RMS_NORM: + case GGML_OP_RMS_NORM_BACK: + case GGML_OP_GROUP_NORM: + case GGML_OP_CONCAT: { + n_tasks = n_threads; + } + break; + case GGML_OP_MUL_MAT: { + n_tasks = n_threads; + } + break; + case GGML_OP_MUL_MAT_ID: { + n_tasks = n_threads; + } + break; + case GGML_OP_OUT_PROD: { + n_tasks = n_threads; + } + break; + case GGML_OP_GET_ROWS: { + n_tasks = MIN(n_cur_threads, ggml_nelements(node->src[1])); + } + break; + case GGML_OP_SCALE: + case GGML_OP_SET: + case GGML_OP_CONT: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + case GGML_OP_GET_ROWS_BACK: + case GGML_OP_DIAG: { + n_tasks = 1; + } + break; + case GGML_OP_DIAG_MASK_ZERO: + case GGML_OP_DIAG_MASK_INF: + case GGML_OP_SOFT_MAX_BACK: + case GGML_OP_ROPE: + case GGML_OP_ROPE_BACK: + case GGML_OP_ADD_REL_POS: { + n_tasks = n_threads; + } + break; + case GGML_OP_ALIBI: { + n_tasks = 1; + } + break; + case GGML_OP_CLAMP: { + n_tasks = 1; + } + break; + case GGML_OP_SOFT_MAX: { + n_tasks = MIN(n_threads, ggml_nrows(node->src[0])); + } + break; + case GGML_OP_CONV_TRANSPOSE_1D: { + n_tasks = n_threads; + } + break; + case GGML_OP_IM2COL: { + 
n_tasks = n_threads; + } + break; + case GGML_OP_CONV_TRANSPOSE_2D: { + n_tasks = n_threads; + } + break; + case GGML_OP_POOL_1D: + case GGML_OP_POOL_2D: { + n_tasks = 1; + } + break; + case GGML_OP_UPSCALE: { + n_tasks = n_threads; + } + break; + case GGML_OP_PAD: { + n_tasks = n_threads; + } + break; + case GGML_OP_ARANGE: { + n_tasks = n_threads; + } + break; + case GGML_OP_TIMESTEP_EMBEDDING: { + n_tasks = n_threads; + } + break; + case GGML_OP_ARGSORT: { + n_tasks = n_threads; + } + break; + case GGML_OP_FLASH_ATTN: { + n_tasks = n_threads; + } + break; + case GGML_OP_FLASH_FF: { + n_tasks = n_threads; + } + break; + case GGML_OP_FLASH_ATTN_BACK: { + n_tasks = n_threads; + } + break; + case GGML_OP_SSM_CONV: + case GGML_OP_SSM_SCAN: { + n_tasks = n_threads; + } + break; + case GGML_OP_WIN_PART: + case GGML_OP_WIN_UNPART: + case GGML_OP_GET_REL_POS: + case GGML_OP_MAP_UNARY: + case GGML_OP_MAP_BINARY: + case GGML_OP_MAP_CUSTOM1_F32: + case GGML_OP_MAP_CUSTOM2_F32: + case GGML_OP_MAP_CUSTOM3_F32: { + n_tasks = 1; + } + break; + case GGML_OP_MAP_CUSTOM1: { + QNN_LOG_ERROR("not support"); + } + break; + case GGML_OP_MAP_CUSTOM2: { + QNN_LOG_ERROR("not support"); + } + break; + case GGML_OP_MAP_CUSTOM3: { + QNN_LOG_ERROR("not support"); + } + break; + case GGML_OP_CROSS_ENTROPY_LOSS: { + n_tasks = n_threads; + } + break; + case GGML_OP_CROSS_ENTROPY_LOSS_BACK: { + n_tasks = n_threads; + } + break; + case GGML_OP_NONE: { + n_tasks = 1; + } + break; + case GGML_OP_COUNT: { + GGML_ASSERT(false); + } + break; + default: { + QNN_LOG_WARN("%s: op not implemented: ", __func__); + if (node->op < GGML_OP_COUNT) { + QNN_LOG_DEBUG("%s\n", ggml_op_name(node->op)); + } else { + QNN_LOG_DEBUG("%d\n", node->op); + } + GGML_ASSERT(false); + } + break; + } + + assert(n_tasks > 0); + + return n_tasks; +} + + +static void * ggml_graph_compute_thread(void * data) { + struct ggml_compute_state * state = (struct ggml_compute_state *) data; + + const struct ggml_cgraph * cgraph = state->shared->cgraph; + const struct ggml_cplan * cplan = state->shared->cplan; + + const int n_threads = state->shared->n_threads; + + int node_n = -1; + int task_phase = GGML_TASK_TYPE_FINALIZE; + + while (true) { + if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { + state->shared->node_n += 1; + state->ec = GGML_STATUS_ABORTED; + return 0; + } + + if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) { + // all other threads are finished and spinning + // do finalize and init here so we don't have synchronize again + struct ggml_compute_params params = { + /*.type =*/ GGML_TASK_TYPE_FINALIZE, + /*.ith =*/ 0, + /*.nth =*/ 0, + /*.wsize =*/ cplan->work_size, + /*.wdata =*/ cplan->work_data, + }; + + if (node_n != -1) { + /* FINALIZE */ + struct ggml_tensor * node = cgraph->nodes[node_n]; + if (GGML_OP_HAS_FINALIZE[node->op]) { + params.nth = ggml_get_n_tasks(node, n_threads, state->shared->n_threads); + ggml_qnn_compute_forward(¶ms, node); + } + ggml_graph_compute_perf_stats_node(node, state->shared); + } + + // distribute new work or execute it direct if 1T + while (++node_n < cgraph->n_nodes) { + //QNN_LOG_INFO("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes); + struct ggml_tensor * node = cgraph->nodes[node_n]; + const int n_tasks = ggml_get_n_tasks(node, n_threads, state->shared->n_threads); + + state->shared->perf_node_start_cycles = ggml_perf_cycles(); + state->shared->perf_node_start_time_us = ggml_perf_time_us(); + + params.nth = n_tasks; + + if (n_tasks == 1) { + /* INIT */ + if 
(GGML_OP_HAS_INIT[node->op]) { + params.type = GGML_TASK_TYPE_INIT; + ggml_qnn_compute_forward(¶ms, node); + } + + // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1, + // they do something more efficient than spinning (?) + params.type = GGML_TASK_TYPE_COMPUTE; + ggml_qnn_compute_forward(¶ms, node); + + if (GGML_OP_HAS_FINALIZE[node->op]) { + params.type = GGML_TASK_TYPE_FINALIZE; + ggml_qnn_compute_forward(¶ms, node); + } + + ggml_graph_compute_perf_stats_node(node, state->shared); + } else { + break; + } + + if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { + break; + } + } + + task_phase = GGML_TASK_TYPE_INIT; + atomic_store(&state->shared->n_active, n_threads); + atomic_store(&state->shared->node_n, node_n); + atomic_store(&state->shared->node_task, task_phase); + } else { + ggml_graph_compute_thread_sync_node(&node_n, state, false); + ggml_graph_compute_thread_sync_task(&task_phase, state, false); + } + + // check if we should stop + if (node_n >= cgraph->n_nodes) break; + + /* INIT & COMPUTE */ + struct ggml_tensor * node = cgraph->nodes[node_n]; + const int n_tasks = ggml_get_n_tasks(node, n_threads, state->shared->n_threads); + + struct ggml_compute_params params = { + /*.type =*/ GGML_TASK_TYPE_INIT, + /*.ith =*/ state->ith, + /*.nth =*/ n_tasks, + /*.wsize =*/ cplan->work_size, + /*.wdata =*/ cplan->work_data, + }; + + if (state->ith < n_tasks) { + if (GGML_OP_HAS_INIT[node->op]) { + ggml_qnn_compute_forward(¶ms, node); + } + } + + if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) { + task_phase = GGML_TASK_TYPE_COMPUTE; + atomic_store(&state->shared->n_active, n_threads); + atomic_store(&state->shared->node_task, task_phase); + } + else { + const bool do_yield = node_n < 0 || cgraph->nodes[node_n]->op == GGML_OP_MUL_MAT; + ggml_graph_compute_thread_sync_task(&task_phase, state, do_yield); + } + + if (state->ith < n_tasks) { + params.type = GGML_TASK_TYPE_COMPUTE; + ggml_qnn_compute_forward(¶ms, node); + } + + if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) { + task_phase = GGML_TASK_TYPE_FINALIZE; + atomic_store(&state->shared->n_active, n_threads); + atomic_store(&state->shared->node_task, task_phase); + } + else { + ggml_graph_compute_thread_sync_task(&task_phase, state, false); + } + } + + return 0; +} + + +static ggml_status ggml_backend_qnn_graph_compute_multithread(ggml_backend_t backend, ggml_cgraph * cgraph) { + ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; + + int num_threads = ctx->threads; + + if (QNN_GPU == ctx->device || QNN_HTP == ctx->device) { + //TODO:multithreading not supported using QNN GPU/HTP(aka DSP) backend + num_threads = 1; + } + struct ggml_cplan plan = ggml_graph_plan(cgraph, num_threads); + + + if (plan.work_size > 0) { + //QNN_LOG_INFO("work size %d(%d MB)", plan.work_size, plan.work_size / (1 << 20)); + plan.work_data = static_cast(malloc(plan.work_size)); + if (plan.work_data == nullptr) { + QNN_LOG_ERROR("malloc failed"); + return GGML_STATUS_FAILED; + } + } + + struct ggml_cplan * cplan = &plan; + GGML_ASSERT(cplan->n_threads > 0); + if (cplan->work_size > 0) { + GGML_ASSERT(cplan->work_data); + } + + //QNN_LOG_DEBUG("cgraph %p, cplan %p, work size %d, work data %p", cgraph, cplan, cplan->work_size, cplan->work_data); + const int n_threads = cplan->n_threads; + + struct ggml_compute_state_shared state_shared = { + /*.cgraph =*/ cgraph, + /*.cgraph_plan =*/ cplan, + /*.perf_node_start_cycles =*/ 0, + /*.perf_node_start_time_us =*/ 0, + 
/*.n_threads =*/ n_threads, + /*.n_active =*/ n_threads, + /*.node_n =*/ -1, + /*.node_task =*/ GGML_TASK_TYPE_FINALIZE, + /*.abort_callback =*/ nullptr, + /*.abort_callback_data =*/ nullptr, + }; + struct ggml_compute_state * workers = (struct ggml_compute_state*)alloca(sizeof(struct ggml_compute_state) * n_threads); + if (nullptr == workers) { + QNN_LOG_ERROR("malloc failed"); + if (plan.work_data != nullptr) { + free(plan.work_data); + } + return GGML_STATUS_FAILED; + } + + // create thread pool + if (n_threads > 1) { + for (int j = 1; j < n_threads; ++j) { + workers[j] = (struct ggml_compute_state) { + .thrd = 0, + .ith = j, + .shared = &state_shared, + .ec = GGML_STATUS_SUCCESS, + }; + + const int rc = pthread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]); + GGML_ASSERT(rc == 0); + } + } + + workers[0].ith = 0; + workers[0].shared = &state_shared; + workers[0].ec = GGML_STATUS_SUCCESS; + + // this is a work thread too + ggml_graph_compute_thread(&workers[0]); + enum ggml_status compute_status = workers[0].ec; + + // join or kill thread pool + if (n_threads > 1) { + for (int j = 1; j < n_threads; j++) { + const int rc = pthread_join(workers[j].thrd, NULL); + GGML_ASSERT(rc == 0); + if (workers[j].ec != GGML_STATUS_SUCCESS) + compute_status = workers[j].ec; + } + } + + if (plan.work_data != nullptr) { + free(plan.work_data); + } + + return compute_status; +} + + +static bool ggml_backend_qnn_offload_op(ggml_backend_t backend, const ggml_tensor * op) { + GGML_UNUSED(backend); + + const int min_batch_size = 32; + + return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS; +} + + +static ggml_backend_i ggml_backend_qnn_interface = { + /* .get_name = */ ggml_backend_qnn_name, + /* .free = */ ggml_backend_qnn_free, + /* .get_default_buffer_type = */ ggml_backend_qnn_get_default_buffer_type, + /* .set_tensor_async = */ nullptr, + /* .get_tensor_async = */ nullptr, + /* .cpy_tensor_async = */ nullptr, + /* .synchronize = */ nullptr, + /* .graph_plan_create = */ nullptr, + /* .graph_plan_free = */ nullptr, + /* .graph_plan_compute = */ nullptr, + /* .graph_compute = */ ggml_backend_qnn_graph_compute_multithread, + /* .supports_op = */ ggml_backend_qnn_supports_op, + /* .offload_op = */ nullptr, + /* .event_new = */ nullptr, + /* .event_free = */ nullptr, + /* .event_record = */ nullptr, + /* .event_wait = */ nullptr, + /* .event_synchronize = */ nullptr, +}; + + +static ggml_guid_t ggml_backend_qnn_guid() { + static ggml_guid guid = {0x1a, 0x2b, 0x3c, 0x4d, 0x5e, 0x6f, 0x70, 0x81, 0x92, 0xa3, 0xb4, 0xc5, + 0xd6, 0xe7, 0xf8, 0x09}; + return &guid; +} + + +static ggml_backend_t ggml_backend_qnn_reg_init(const char * params, void * user_data) { + ggml_backend_t qnn_backend = ggml_backend_qnn_init((int) (intptr_t) user_data, params); + + return qnn_backend; +} + + +bool ggml_backend_is_qnn(ggml_backend_t backend) { + return backend != nullptr && ggml_guid_matches(backend->guid, ggml_backend_qnn_guid()); +} + + +void ggml_backend_qnn_set_n_threads(ggml_backend_t backend, int n_threads) { + GGML_ASSERT(ggml_backend_is_qnn(backend)); + + struct ggml_backend_qnn_context * ctx = (struct ggml_backend_qnn_context *)backend->context; + ctx->threads = n_threads; +} + +const char * ggml_backend_qnn_get_name(ggml_backend_t backend) { + return backend->iface.get_name(backend); +} + +int ggml_backend_qnn_get_device_count() { + return GGML_QNN_MAX_DEVICES; +} + + +void ggml_backend_qnn_get_device_description(int device, char * description, size_t description_size) { + if (nullptr 
== description || 0 == description_size) { + QNN_LOG_WARN("invalid param"); + return; + } + + if (device >= GGML_QNN_MAX_DEVICES) { + QNN_LOG_WARN("invalid param"); + return; + } + + snprintf(description, description_size, "%s", g_qnn_mgr[device].name); + QNN_LOG_DEBUG("description:%s", description); +} + + +ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t device_index) { + if (device_index >= GGML_QNN_MAX_DEVICES) { + QNN_LOG_DEBUG("ggml_backend_qnn_buffer_type error: device_index:%d is out of range [0, %d]\n", + device_index, GGML_QNN_MAX_DEVICES - 1); + return nullptr; + } + + static struct ggml_backend_buffer_type ggml_backend_buffer_type_qnn = { + /* .iface = */ { + /* .get_name = */ ggml_backend_qnn_buffer_type_name, + /* .alloc_buffer = */ ggml_backend_qnn_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_qnn_buffer_type_get_alignment, + /* .get_max_size = */ ggml_backend_qnn_buffer_type_get_max_size, + /* .get_alloc_size = */ nullptr,// defaults to ggml_nbytes + /* .supports_backend = */ ggml_backend_qnn_buffer_type_supports_backend, + /* .is_host = */ ggml_backend_qnn_buffer_is_host + }, + /* .context = */ nullptr, + }; + + return &ggml_backend_buffer_type_qnn; +} + + +/** + * + * @param device 0: QNN_CPU 1: QNN_GPU 2: QNN_HTP(aka DSP) + * @param qnn_lib_path qnn library path, such as "/data/data/com.ggml.llamacpp/" on Android which can got by JNI from Java layer + * @return + */ +ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { + int result = 0; + + if (nullptr == qnn_lib_path) + return nullptr; + + QNN_LOG_DEBUG("device %d", device); + QNN_LOG_DEBUG("qnn_lib_path %s", qnn_lib_path); + if (device >= GGML_QNN_MAX_DEVICES) { + QNN_LOG_ERROR("invalid device %d", device); + return nullptr; + } + + if (nullptr != g_qnn_mgr[device].backend) { + QNN_LOG_ERROR("qnn backend %d(%s) already loaded, it should not happened, pls check why?", device, get_qnn_backend_name(device)); + if (device == g_current_device) { + g_qnn_backend = g_qnn_mgr[device].backend; + QNN_LOG_INFO("re-use cached backend %d(%s)", device, get_qnn_backend_name(device)); + return g_qnn_mgr[device].backend; + } else { + QNN_LOG_INFO("delete previous backend %d(%s)", device, get_qnn_backend_name(device)); + ggml_backend_qnn_free(g_qnn_backend); + } + } + + static bool is_first_call = true; + if (is_first_call) { + ggml_setup_op_has_task_pass(); + is_first_call = false; + } + + if (QNN_HTP == device) { + std::string path = qnn_lib_path; + if (0 == setenv("LD_LIBRARY_PATH", + (path + + ":/vendor/dsp/cdsp:/vendor/lib64:/vendor/dsp/dsp:/vendor/dsp/images").c_str(), + 1)) { + QNN_LOG_INFO("QNN DSP backend setenv successfully"); + } else { + QNN_LOG_ERROR("QNN DSP backend setenv failure"); + } + if (0 == setenv("ADSP_LIBRARY_PATH", + (path + + ";/vendor/dsp/cdsp;/vendor/lib/rfsa/adsp;/system/lib/rfsa/adsp;/vendor/dsp/dsp;/vendor/dsp/images;/dsp").c_str(), + 1)) { + QNN_LOG_INFO("QNN DSP backend setenv successfully"); + } else { + QNN_LOG_ERROR("QNN DSP backend setenv failure"); + } + } + + qnn_instance * instance = nullptr; + instance = new qnn_instance(qnn_lib_path, g_qnn_mgr[device].lib, ""); + result = instance->qnn_init(nullptr); + if (0 != result) { + QNN_LOG_WARN("init qnn subsystem failed with qnn backend %s, pls check why\n", get_qnn_backend_name(device)); + delete instance; + return nullptr; + } + qnn_interface qnn_interface = instance->get_qnn_interface(); + if (!qnn_interface.is_loaded()) { + QNN_LOG_WARN("qnn subsystem failure\n"); + delete instance; + 
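+        // no usable QNN interface could be obtained from the backend library, so backend creation is abandoned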
return nullptr; + } + + std::string device_name = GGML_QNN_NAME + std::string("_") + std::to_string(device) + std::string("_") + get_qnn_backend_name(device); + QNN_LOG_INFO("qnn device name %s", device_name.c_str()); + instance->init_qnn_graph(device_name.c_str(), false); + g_qnn_mgr[device].instance = instance; + g_qnn_mgr[device].raw_interface = instance->get_qnn_raw_interface(); + g_qnn_mgr[device].raw_system_interface = instance->get_qnn_raw_system_interface(); + //TODO:refine internal buffer management + g_qnn_mgr[device].buffer_pool = qnn_buf_new(get_qnn_backend_name(device), GGML_QNN_MAX_BUFFERS, (1 << 20)); + GGML_ASSERT(g_qnn_mgr[device].buffer_pool != nullptr); + + ggml_backend_t qnn_backend = new ggml_backend{ + /* .guid = */ ggml_backend_qnn_guid(), + /* .iface = */ ggml_backend_qnn_interface, + /* .context = */ &g_qnn_mgr[device] + }; + g_qnn_mgr[device].backend = qnn_backend; + g_qnn_backend = g_qnn_mgr[device].backend; + g_current_device = device; + + return qnn_backend; +} + + +extern "C" int ggml_backend_qnn_reg_devices(); + + +int ggml_backend_qnn_reg_devices() { + for (size_t idx = 0; idx < GGML_QNN_MAX_DEVICES; idx++) { + int id = g_qnn_mgr[idx].device; + char name[GGML_MAX_NAME]; + ggml_backend_qnn_get_device_description(idx, name, GGML_MAX_NAME); + ggml_backend_register(name, ggml_backend_qnn_reg_init, ggml_backend_qnn_buffer_type(idx), + (void *) (intptr_t)idx); + } + + return GGML_QNN_MAX_DEVICES; +} diff --git a/ggml-qnn.h b/ggml-qnn.h new file mode 100644 index 0000000000000..51f02d4ba3078 --- /dev/null +++ b/ggml-qnn.h @@ -0,0 +1,55 @@ +/* + * MIT license + * Copyright (C) 2024 GGML Authors + * SPDX-License-Identifier: MIT + * + * this is implementation of ggml QNN(Qualcomm Nerual Network, aka AI Engine Direct) backend + */ +#pragma once + +#include "ggml.h" +#include "ggml-backend.h" + +#ifdef __cplusplus +extern "C" { +#endif + + +#define GGML_QNN_NAME "QNN" +#define GGML_QNN_MAX_DEVICES 3 + +//QNN cDSP and HTA backend would not be used currently, just focus on QNN CPU/GPU/HTP(aka DSP) backend currently +enum QNNBackend { + QNN_CPU, + QNN_GPU, + QNN_HTP, +}; + +GGML_API int ggml_backend_qnn_reg_devices(); + +/** + * + * @param device 0: QNN_CPU 1: QNN_GPU 2: QNN_HTP(aka DSP) + * @param qnn_lib_path qnn library path, such as "/data/data/com.ggml.llamacpp/" on Android which can got by JNI from Java layer + * @return + */ +GGML_API ggml_backend_t ggml_backend_qnn_init(size_t dev_num, const char * qnn_lib_path); + +GGML_API bool ggml_backend_is_qnn(ggml_backend_t backend); + +GGML_API void ggml_backend_qnn_set_n_threads(ggml_backend_t backend, int n_threads); + +GGML_API int ggml_backend_qnn_get_device_count(void); +GGML_API void ggml_backend_qnn_get_device_description(int device, char * description, size_t description_size); + + +GGML_API ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t dev_num); + + +//temporary API, should be removed in the future +GGML_API bool ggml_qnn_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor); + + +#ifdef __cplusplus +} +#endif diff --git a/ggml.c b/ggml.c index 086db96af7fcd..919eb0b7b1ff1 100644 --- a/ggml.c +++ b/ggml.c @@ -16153,7 +16153,8 @@ static void ggml_compute_forward_cross_entropy_loss_back( ///////////////////////////////// -static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) { +//workaround for Qualcomm QNN backend +void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) { GGML_ASSERT(params); 
if (tensor->op == GGML_OP_NONE || ggml_is_empty(tensor)) { diff --git a/llama.cpp b/llama.cpp index 30fe190373b43..a10c3e1fc8488 100644 --- a/llama.cpp +++ b/llama.cpp @@ -17,6 +17,8 @@ # include "ggml-sycl.h" #elif defined(GGML_USE_KOMPUTE) # include "ggml-kompute.h" +#elif defined(GGML_USE_QNN) +# include "ggml-qnn.h" #endif #ifdef GGML_USE_METAL @@ -1680,6 +1682,8 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) { if (buft == nullptr) { LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu); } +#elif defined(GGML_USE_QNN) + buft = ggml_backend_qnn_buffer_type(gpu); #endif if (buft == nullptr) { @@ -1720,6 +1724,8 @@ static size_t llama_get_device_count() { return ggml_backend_sycl_get_device_count(); #elif defined(GGML_USE_VULKAN) return ggml_backend_vk_get_device_count(); +#elif defined(GGML_USE_QNN) + return ggml_backend_qnn_get_device_count(); #else return 1; #endif @@ -15090,6 +15096,8 @@ size_t llama_max_devices(void) { return GGML_SYCL_MAX_DEVICES; #elif defined(GGML_USE_VULKAN) return GGML_VK_MAX_DEVICES; +#elif defined(GGML_USE_QNN) + return GGML_QNN_MAX_DEVICES; #else return 1; #endif @@ -15105,7 +15113,7 @@ bool llama_supports_mlock(void) { bool llama_supports_gpu_offload(void) { #if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \ - defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) + defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_QNN) // Defined when llama.cpp is compiled with support for offloading model layers to GPU. return true; #else @@ -15392,6 +15400,17 @@ struct llama_context * llama_new_context_with_model( } ctx->backends.push_back(backend); } +#elif defined(GGML_USE_QNN) + if (model->n_gpu_layers > 0) { + //the second param is package name of Andorid app, can be got by JNI from Java layer + ggml_backend_t backend = ggml_backend_qnn_init(QNN_CPU, "/data/data/com.ggml.llamacpp/"); + if (nullptr == backend) { + LLAMA_LOG_ERROR("%s: failed to initialize QNN backend\n", __func__); + llama_free(ctx); + return nullptr; + } + ctx->backends.push_back(backend); + } #endif ctx->backend_cpu = ggml_backend_cpu_init(); if (ctx->backend_cpu == nullptr) { @@ -17558,6 +17577,14 @@ void llama_reset_timings(struct llama_context * ctx) { ctx->t_p_eval_us = ctx->n_p_eval = 0; } +static int llama_has_qnn(void) { +#ifdef GGML_USE_QNN + return 1; +#else + return 0; +#endif +} + const char * llama_print_system_info(void) { static std::string s; @@ -17579,6 +17606,7 @@ const char * llama_print_system_info(void) { s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | "; s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | "; s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | "; + s += "QNN = " + std::to_string(llama_has_qnn()) + " | "; return s.c_str(); } From d325088dbf8e86722a41b37ef44549b86211742d Mon Sep 17 00:00:00 2001 From: "zhou.weiguo" Date: Wed, 24 Apr 2024 16:28:18 +0800 Subject: [PATCH 002/143] ggml: add Qualcomm QNN(Qualcomm Neural Network,aka Qualcomm AI Engine Direct) backend --- ggml-qnn.cpp | 3590 ++++++++++++++++++++++++++++++ ggml-qnn.h | 43 + llama.cpp | 23 +- tests/ggml-qnn/CMakeLists.txt | 60 + tests/ggml-qnn/build-ggml-qnn.sh | 95 + tests/ggml-qnn/run-ggml-qnn.sh | 108 + tests/ggml-qnn/test-qnn-ops.cpp | 450 ++++ 7 files changed, 4368 insertions(+), 1 deletion(-) create mode 100644 ggml-qnn.cpp create mode 100644 ggml-qnn.h create mode 100644 tests/ggml-qnn/CMakeLists.txt create mode 
100755 tests/ggml-qnn/build-ggml-qnn.sh create mode 100755 tests/ggml-qnn/run-ggml-qnn.sh create mode 100644 tests/ggml-qnn/test-qnn-ops.cpp diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp new file mode 100644 index 0000000000000..9319db227795d --- /dev/null +++ b/ggml-qnn.cpp @@ -0,0 +1,3590 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "QnnTypes.h" +#include "QnnCommon.h" +#include "QnnContext.h" +#include "QnnBackend.h" +#include "QnnGraph.h" +#include "QnnProperty.h" +#include "QnnTensor.h" +#include "QnnInterface.h" +#include "Saver/QnnSaver.h" +#include "System/QnnSystemInterface.h" +#include "HTP/QnnHtpDevice.h" + +#include "ggml-qnn.h" + +#include "ggml-backend-impl.h" + + +// ================================================================================================= +// +// forward/external/helper declaration +// +// ================================================================================================= +class qnn_instance; + + +#if (defined __ANDROID__) || (defined ANDROID) +extern "C" int __android_log_print(int prio, const char * tag, const char * fmt, ...) +__attribute__((__format__(printf, 3, 4))); +#endif +static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...); + + +// ================================================================================================= +// +// self-defined macro / data structure +// +// ================================================================================================= +#define RPCMEM_DEFAULT_FLAGS 1 +#define RPCMEM_HEAP_ID_SYSTEM 25 + +#define GGML_DUMP_TENSOR(tensor) ggml_tensor_dump(tensor, #tensor) + +#define GGML_QNN_LOGBUF_LEN 4096 + +#define GGML_QNN_DEBUG 1 //for troubleshooting QNN backend + +#define QNN_LOG_ERROR(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#define QNN_LOG_WARN(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#define QNN_LOG_INFO(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) + +#if GGML_QNN_DEBUG +#define QNN_LOG_DEBUG(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#else +#define QNN_LOG_DEBUG(...) 
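+// when GGML_QNN_DEBUG is 0, QNN_LOG_DEBUG expands to nothing and debug logging is compiled out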
+#endif + + +#define VALIDATE(value, status) \ + do { \ + status = value; \ + if (status != QNN_SUCCESS) { \ + QNN_LOG_WARN("%s expected QNN_SUCCESS\n", #value); \ + return status; \ + } \ + } while (0) + +#define VALIDATE_TENSOR_VERSION(tensor, err) VALIDATE(validate_tensor_version(tensor), err) + +#define VALIDATE_OP_CONFIG_VERSION(op, err) VALIDATE(validate_op_config_version(op), err) + +#define QNN_VER_PTR(x) (&((x).v1)) +#define QNN_OP_CFG_VALID(op_config) ((op_config).version == QNN_OPCONFIG_VERSION_1) + +#define QNN_OP_CFG_GET_NAME(op_config) get_qnn_oponfig_name(op_config) +#define QNN_OP_CFG_GET_PACKAGE_NAME(op_config) get_qnn_op_config_packagename(op_config) +#define QNN_OP_CFG_GET_TYPE_NAME(op_config) get_qnn_op_config_typename(op_config) +#define QNN_OP_CFG_GET_NUM_PARAMS(op_config) get_qnn_op_config_numparams(op_config) +#define QNN_OP_CFG_GET_PARAMS(op_config) get_qnn_op_config_params(op_config) +#define QNN_OP_CFG_GET_NUM_INPUTS(op_config) get_qnn_op_config_numinputs(op_config) +#define QNN_OP_CFG_GET_INPUTS(op_config) get_qnn_op_config_inputs(op_config) +#define QNN_OP_CFG_GET_NUM_OUTPUTS(op_config) get_qnn_op_config_numoutputs(op_config) +#define QNN_OP_CFG_GET_OUTPUTS(op_config) get_qnn_op_config_outputs(op_config) + +#define QNN_OP_CFG_SET_NAME(op_config, value) set_qnn_op_config_name(op_config, value) +#define QNN_OP_CFG_SET_PACKAGE_NAME(op_config, value) set_qnn_op_config_packagename(op_config, value) +#define QNN_OP_CFG_SET_TYPE_NAME(op_config, value) set_qnn_op_config_typename(op_config, value) + +#define QNN_OP_CFG_SET_PARAMS(op_config, num_of_params, params) \ + set_qnn_op_config_params(op_config, num_of_params, params) + +#define QNN_OP_CFG_SET_INPUTS(op_config, num_of_inputs, inputTensors) \ + set_qnn_op_config_inputs(op_config, num_of_inputs, inputTensors) + +#define QNN_OP_CFG_SET_OUTPUTS(op_config, num_of_outputs, output_tensors) \ + set_qnn_op_config_outputs(op_config, num_of_outputs, output_tensors) + +#define QNN_TENSOR_GET_ID(tensor) get_qnn_tensorid(tensor) +#define QNN_TENSOR_GET_NAME(tensor) get_qnn_tensorname(tensor) +#define QNN_TENSOR_GET_TYPE(tensor) get_qnn_tensortype(tensor) +#define QNN_TENSOR_GET_DATA_FORMAT(tensor) get_qnn_tensor_dataformat(tensor) +#define QNN_TENSOR_GET_DATA_TYPE(tensor) get_qnn_tensor_datatype(tensor) +#define QNN_TENSOR_GET_QUANT_PARAMS(tensor) get_qnn_tensor_quantparams(tensor) +#define QNN_TENSOR_GET_RANK(tensor) get_qnn_tensor_rank(tensor) +#define QNN_TENSOR_GET_DIMENSIONS(tensor) get_qnn_tensor_dimensions(tensor) +#define QNN_TENSOR_GET_MEM_TYPE(tensor) get_qnn_tensor_memtype(tensor) +#define QNN_TENSOR_GET_CLIENT_BUF(tensor) get_qnn_tensor_clientbuf(tensor) +#define QNN_TENSOR_GET_MEM_HANDLE(tensor) get_qnn_tensor_memhandle(tensor) + +#define QNN_TENSOR_SET_ID(tensor, value) set_qnn_tensor_id(tensor, value) +#define QNN_TENSOR_SET_NAME(tensor, value) set_qnn_tensor_name(tensor, value) +#define QNN_TENSOR_SET_TYPE(tensor, value) set_qnn_tensor_type(tensor, value) +#define QNN_TENSOR_SET_DATA_FORMAT(tensor, value) set_qnn_tensor_dataformat(tensor, value) +#define QNN_TENSOR_SET_DATA_TYPE(tensor, value) set_qnn_tensor_datatype(tensor, value) +#define QNN_TENSOR_SET_QUANT_PARAMS(tensor, value) set_qnn_tensor_quantparams(tensor, value) +#define QNN_TENSOR_SET_RANK(tensor, value) set_qnn_tensor_rank(tensor, value) +#define QNN_TENSOR_SET_DIMENSIONS(tensor, value) set_qnn_tensor_dimensions(tensor, value) +#define QNN_TENSOR_SET_MEM_TYPE(tensor, value) set_qnn_tensor_memtype(tensor, value) +#define 
QNN_TENSOR_SET_CLIENT_BUF(tensor, value) set_qnn_tensor_clientbuf(tensor, value) +#define QNN_TENSOR_SET_MEM_HANDLE(tensor, value) set_qnn_tensor_memhandle(tensor, value) + + +using pfn_rpc_mem_init = void (*)(void); +using pfn_rpc_mem_deinit = void (*)(void); +using pfn_rpc_mem_alloc = void *(*)(int, uint32_t, int); +using pfn_rpc_mem_free = void (*)(void *); +using pfn_rpc_mem_to_fd = int (*)(void *); + +using _pfn_QnnSaver_initialize = decltype(QnnSaver_initialize); +using _pfn_QnnInterface_getProviders = decltype(QnnInterface_getProviders); +using _pfn_QnnSystemInterface_getProviders = decltype(QnnSystemInterface_getProviders); + + +typedef void (* ggml_qnn_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); +typedef void (* ggml_qnn_func_common_t)(const ggml_op ggml_op, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); + +enum class ggml_qnn_profile_level { + profile_off = 0, + profile_basic = 1, + profile_detail = 2 +}; + + +struct ggml_backend_qnn_context { + int device; + int threads; + char name[GGML_MAX_NAME]; + char lib[GGML_MAX_NAME]; + qnn_instance * instance; + struct ggml_backend * backend; + QNN_INTERFACE_VER_TYPE raw_interface; + QNN_SYSTEM_INTERFACE_VER_TYPE raw_system_interface; +} ; + + +// ================================================================================================= +// +// static global variables +// +// ================================================================================================= +static ggml_backend_t g_qnn_backend = nullptr; + +static int g_current_device = QNN_BACKEND_GGML; + + +//QNN cDSP and HTA backend would not be used currently, just focus on QNN CPU/GPU/NPU(aka HTP/DSP) backend currently +static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { + [QNN_BACKEND_CPU] = {.device = 0, .threads = 1, .name = "qnn-cpu", .lib = "libQnnCpu.so", .instance = nullptr, .backend = nullptr, .raw_interface = {}, .raw_system_interface = {}}, + [QNN_BACKEND_GPU] = {.device = 1, .threads = 1, .name = "qnn-gpu", .lib = "libQnnGpu.so", .instance = nullptr, .backend = nullptr, .raw_interface = {}, .raw_system_interface = {}}, + [QNN_BACKEND_NPU] = {.device = 2, .threads = 1, .name = "qnn-npu", .lib = "libQnnHtp.so", .instance = nullptr, .backend = nullptr, .raw_interface = {}, .raw_system_interface = {}}, +}; + + +// ================================================================================================= +// +// QNN helper functions and other internal helper functions +// +// ================================================================================================= +static inline int validate_tensor_version(Qnn_Tensor_t tensor) { + if (tensor.version != QNN_TENSOR_VERSION_1) { + QNN_LOG_WARN("validate_tensor_version() tensor %s, got unsupported version %d\n", + tensor.v1.name, + tensor.version); + return 1; + } + return 0; +} + + +[[maybe_unused]] static inline int validate_op_config_version(Qnn_OpConfig_t op_config) { + if (op_config.version != QNN_OPCONFIG_VERSION_1) { + QNN_LOG_WARN("validate_op_config_version() op %s, got unsupported version %d\n", + op_config.v1.name, + op_config.version); + return 1; + } + return 0; +} + + +static inline const char * get_qnn_oponfig_name(const Qnn_OpConfig_t & op_config) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + return op_config.v1.name; + } + return nullptr; +} + + +[[maybe_unused]] static inline const char * get_qnn_oponfig_name(const Qnn_OpConfig_t * op_config) { + return 
get_qnn_oponfig_name(*op_config); +} + + +static inline const char * get_qnn_op_config_packagename(const Qnn_OpConfig_t & op_config) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + return op_config.v1.packageName; + } + return nullptr; +} + + +[[maybe_unused]] static inline const char * get_qnn_op_config_packagename(const Qnn_OpConfig_t * op_config) { + return get_qnn_op_config_packagename(*op_config); +} + + +static inline const char * get_qnn_op_config_typename(const Qnn_OpConfig_t & op_config) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + return op_config.v1.typeName; + } + return nullptr; +} + + +[[maybe_unused]] static inline const char * get_qnn_op_config_typename(const Qnn_OpConfig_t * op_config) { + return get_qnn_op_config_typename(*op_config); +} + + +static inline uint32_t get_qnn_op_config_numparams(const Qnn_OpConfig_t & op_config) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + return op_config.v1.numOfParams; + } + return 0u; +} + + +[[maybe_unused]] static inline uint32_t get_qnn_op_config_numparams(const Qnn_OpConfig_t * op_config) { + return get_qnn_op_config_numparams(*op_config); +} + + +static inline const Qnn_Param_t * get_qnn_op_config_params(const Qnn_OpConfig_t & op_config) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + return op_config.v1.params; + } + return nullptr; +} + + +[[maybe_unused]] static inline const Qnn_Param_t * get_qnn_op_config_params(const Qnn_OpConfig_t * op_config) { + return get_qnn_op_config_params(*op_config); +} + + +static inline uint32_t get_qnn_op_config_numinputs(const Qnn_OpConfig_t & op_config) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + return op_config.v1.numOfInputs; + } + return 0u; +} + + +[[maybe_unused]] static inline uint32_t get_qnn_op_config_numinputs(const Qnn_OpConfig_t * op_config) { + return get_qnn_op_config_numinputs(*op_config); +} + + +static inline const Qnn_Tensor_t * get_qnn_op_config_inputs(const Qnn_OpConfig_t & op_config) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + return op_config.v1.inputTensors; + } + return nullptr; +} + + +[[maybe_unused]] static inline const Qnn_Tensor_t * get_qnn_op_config_inputs(const Qnn_OpConfig_t * op_config) { + return get_qnn_op_config_inputs(*op_config); +} + + +static inline uint32_t get_qnn_op_config_numoutputs(const Qnn_OpConfig_t & op_config) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + return op_config.v1.numOfOutputs; + } + return 0u; +} + + +[[maybe_unused]] static inline uint32_t get_qnn_op_config_numoutputs(const Qnn_OpConfig_t * op_config) { + return get_qnn_op_config_numoutputs(*op_config); +} + + +static inline const Qnn_Tensor_t * get_qnn_op_config_outputs(const Qnn_OpConfig_t & op_config) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + return op_config.v1.outputTensors; + } + return nullptr; +} + + +[[maybe_unused]] static inline const Qnn_Tensor_t * get_qnn_op_config_outputs(const Qnn_OpConfig_t * op_config) { + return get_qnn_op_config_outputs(*op_config); +} + + +static inline void set_qnn_op_config_name(Qnn_OpConfig_t & op_config, const char * name) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + op_config.v1.name = name; + } +} + + +[[maybe_unused]] static inline void set_qnn_op_config_name(Qnn_OpConfig_t * op_config, const char * name) { + set_qnn_op_config_name(*op_config, name); +} + + +static inline void set_qnn_op_config_packagename(Qnn_OpConfig_t & op_config, const char * package_name) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + 
op_config.v1.packageName = package_name; + } +} + + +[[maybe_unused]] static inline void set_qnn_op_config_packagename(Qnn_OpConfig_t * op_config, const char * package_name) { + set_qnn_op_config_packagename(*op_config, package_name); +} + + +static inline void set_qnn_op_config_typename(Qnn_OpConfig_t & op_config, const char * type_name) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + op_config.v1.typeName = type_name; + } +} + + +[[maybe_unused]] static inline void set_qnn_op_config_typename(Qnn_OpConfig_t * op_config, const char * type_name) { + set_qnn_op_config_typename(*op_config, type_name); +} + + +static inline void set_qnn_op_config_params(Qnn_OpConfig_t & op_config, + uint32_t num_of_params, + Qnn_Param_t * params) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + op_config.v1.numOfParams = num_of_params; + op_config.v1.params = params; + } +} + + +[[maybe_unused]] static inline void set_qnn_op_config_params(Qnn_OpConfig_t * op_config, + uint32_t num_of_params, + Qnn_Param_t * params) { + set_qnn_op_config_params(*op_config, num_of_params, params); +} + + +static inline void set_qnn_op_config_inputs(Qnn_OpConfig_t & op_config, + uint32_t num_of_inputs, + Qnn_Tensor_t * input_tensors) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + op_config.v1.numOfInputs = num_of_inputs; + op_config.v1.inputTensors = input_tensors; + } +} + + +[[maybe_unused]] static inline void set_qnn_op_config_inputs(Qnn_OpConfig_t * op_config, + uint32_t num_of_inputs, + Qnn_Tensor_t * input_tensors) { + set_qnn_op_config_inputs(*op_config, num_of_inputs, input_tensors); +} + + +static inline void set_qnn_op_config_outputs(Qnn_OpConfig_t & op_config, + uint32_t num_of_outputs, + Qnn_Tensor_t * output_tensors) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + op_config.v1.numOfOutputs = num_of_outputs; + op_config.v1.outputTensors = output_tensors; + } +} + + +[[maybe_unused]] static inline void set_qnn_op_config_outputs(Qnn_OpConfig_t * op_config, + uint32_t num_of_outputs, + Qnn_Tensor_t * output_tensors) { + set_qnn_op_config_outputs(*op_config, num_of_outputs, output_tensors); +} + + +static inline uint32_t get_qnn_tensorid(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.id; + } + + return 0u; +} + + +[[maybe_unused]] static inline uint32_t get_qnn_tensorid(const Qnn_Tensor_t * tensor) { + return get_qnn_tensorid(*tensor); +} + + +static inline const char * get_qnn_tensorname(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.name; + } + return nullptr; +} + + +static inline const char * get_qnn_tensorname(const Qnn_Tensor_t * tensor) { + return get_qnn_tensorname(*tensor); +} + + +static inline Qnn_TensorType_t get_qnn_tensortype(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.type; + } + return QNN_TENSOR_TYPE_UNDEFINED; +} + + +[[maybe_unused]] static inline Qnn_TensorType_t get_qnn_tensortype(const Qnn_Tensor_t * tensor) { + return get_qnn_tensortype(*tensor); +} + + +static inline Qnn_TensorDataFormat_t get_qnn_tensor_dataformat(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.dataFormat; + } + return QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER; +} + + +[[maybe_unused]] static inline Qnn_TensorDataFormat_t get_qnn_tensor_dataformat(const Qnn_Tensor_t * tensor) { + return get_qnn_tensor_dataformat(*tensor); +} + + +static inline Qnn_DataType_t get_qnn_tensor_datatype(const Qnn_Tensor_t 
& tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.dataType; + } + return QNN_DATATYPE_UNDEFINED; +} + + +[[maybe_unused]] static inline Qnn_DataType_t get_qnn_tensor_datatype(const Qnn_Tensor_t * tensor) { + return get_qnn_tensor_datatype(*tensor); +} + + +static inline Qnn_QuantizeParams_t get_qnn_tensor_quantparams(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.quantizeParams; + } + return QNN_QUANTIZE_PARAMS_INIT; +} + + +[[maybe_unused]] static inline Qnn_QuantizeParams_t get_qnn_tensor_quantparams(const Qnn_Tensor_t * tensor) { + return get_qnn_tensor_quantparams(*tensor); +} + + +static inline uint32_t get_qnn_tensor_rank(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.rank; + } + return 0u; +} + + +[[maybe_unused]] static inline uint32_t get_qnn_tensor_rank(const Qnn_Tensor_t * tensor) { + return get_qnn_tensor_rank(*tensor); +} + + +static inline uint32_t * get_qnn_tensor_dimensions(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.dimensions; + } + return nullptr; +} + + +[[maybe_unused]] static inline uint32_t * get_qnn_tensor_dimensions(const Qnn_Tensor_t * tensor) { + return get_qnn_tensor_dimensions(*tensor); +} + + +static inline Qnn_TensorMemType_t get_qnn_tensor_memtype(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.memType; + } + return QNN_TENSORMEMTYPE_UNDEFINED; +} + + +[[maybe_unused]] static inline Qnn_TensorMemType_t get_qnn_tensor_memtype(const Qnn_Tensor_t * tensor) { + return get_qnn_tensor_memtype(*tensor); +} + + +static inline Qnn_ClientBuffer_t get_qnn_tensor_clientbuf(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.clientBuf; + } + return QNN_CLIENT_BUFFER_INIT; +} + + +[[maybe_unused]] static inline Qnn_ClientBuffer_t get_qnn_tensor_clientbuf(const Qnn_Tensor_t * tensor) { + return get_qnn_tensor_clientbuf(*tensor); +} + + +static inline Qnn_MemHandle_t get_qnn_tensor_memhandle(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.memHandle; + } + return nullptr; +} + + +[[maybe_unused]] static inline Qnn_MemHandle_t get_qnn_tensor_memhandle(const Qnn_Tensor_t * tensor) { + return get_qnn_tensor_memhandle(*tensor); +} + + +static inline void set_qnn_tensor_id(Qnn_Tensor_t & tensor, uint32_t id) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.id = id; + } +} + + +[[maybe_unused]] static inline void set_qnn_tensor_id(Qnn_Tensor_t * tensor, uint32_t id) { + set_qnn_tensor_id(*tensor, id); +} + + +static inline void set_qnn_tensor_name(Qnn_Tensor_t & tensor, const char * name) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.name = name; + } +} + + +[[maybe_unused]] static inline void set_qnn_tensor_name(Qnn_Tensor_t * tensor, const char * name) { + set_qnn_tensor_name(*tensor, name); +} + + +static inline void set_qnn_tensor_type(Qnn_Tensor_t & tensor, Qnn_TensorType_t type) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.type = type; + } +} + + +[[maybe_unused]] static inline void set_qnn_tensor_type(Qnn_Tensor_t * tensor, Qnn_TensorType_t type) { + set_qnn_tensor_type(*tensor, type); +} + + +static inline void set_qnn_tensor_dataformat(Qnn_Tensor_t & tensor, Qnn_TensorDataFormat_t format) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.dataFormat = format; + } +} + + +[[maybe_unused]] 
static inline void set_qnn_tensor_dataformat(Qnn_Tensor_t * tensor, Qnn_TensorDataFormat_t format) { + set_qnn_tensor_dataformat(*tensor, format); +} + + +static inline void set_qnn_tensor_datatype(Qnn_Tensor_t & tensor, Qnn_DataType_t dataType) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.dataType = dataType; + } +} + + +[[maybe_unused]] static inline void set_qnn_tensor_datatype(Qnn_Tensor_t * tensor, Qnn_DataType_t dataType) { + set_qnn_tensor_datatype(*tensor, dataType); +} + + +static inline void set_qnn_tensor_quantparams(Qnn_Tensor_t & tensor, Qnn_QuantizeParams_t params) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.quantizeParams = params; + } +} + + +[[maybe_unused]] static inline void set_qnn_tensor_quantparams(Qnn_Tensor_t * tensor, Qnn_QuantizeParams_t params) { + set_qnn_tensor_quantparams(*tensor, params); +} + + +static inline void set_qnn_tensor_rank(Qnn_Tensor_t & tensor, uint32_t rank) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.rank = rank; + } +} + + +[[maybe_unused]] static inline void set_qnn_tensor_rank(Qnn_Tensor_t * tensor, uint32_t rank) { + set_qnn_tensor_rank(*tensor, rank); +} + + +static inline void set_qnn_tensor_dimensions(Qnn_Tensor_t & tensor, uint32_t * dims) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.dimensions = dims; + } +} + + +[[maybe_unused]] static inline void set_qnn_tensor_dimensions(Qnn_Tensor_t * tensor, uint32_t * dims) { + set_qnn_tensor_dimensions(*tensor, dims); +} + + +static inline void set_qnn_tensor_memtype(Qnn_Tensor_t & tensor, Qnn_TensorMemType_t memType) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.memType = memType; + } +} + + +[[maybe_unused]] static inline void set_qnn_tensor_memtype(Qnn_Tensor_t * tensor, Qnn_TensorMemType_t memType) { + set_qnn_tensor_memtype(*tensor, memType); +} + + +static inline void set_qnn_tensor_clientbuf(Qnn_Tensor_t & tensor, Qnn_ClientBuffer_t clientBuf) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.clientBuf = clientBuf; + } +} + + +[[maybe_unused]] static inline void set_qnn_tensor_clientbuf(Qnn_Tensor_t * tensor, Qnn_ClientBuffer_t clientBuf) { + set_qnn_tensor_clientbuf(*tensor, clientBuf); +} + + +static inline void set_qnn_tensor_memhandle(Qnn_Tensor_t & tensor, Qnn_MemHandle_t handle) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.memHandle = handle; + } +} + + +[[maybe_unused]] static inline void set_qnn_tensor_memhandle(Qnn_Tensor_t * tensor, Qnn_MemHandle_t handle) { + set_qnn_tensor_memhandle(*tensor, handle); +} + + +static size_t memscpy(void * dst, size_t dstSize, const void * src, size_t copySize) { + if (!dst || !src || !dstSize || !copySize) + return 0; + + size_t minSize = dstSize < copySize ? 
dstSize : copySize; + + memcpy(dst, src, minSize); + + return minSize; +} + + +static char * ggml_qnn_strndup(const char * source, size_t maxlen) { + return ::strndup(source, maxlen); +} + + +static int deep_copy_qnn_tensors(Qnn_Tensor_t & src, Qnn_Tensor_t & dst) { + int err = 0; + VALIDATE_TENSOR_VERSION(src, err); + + dst.version = src.version; + QNN_TENSOR_SET_NAME( + dst, ggml_qnn_strndup(QNN_TENSOR_GET_NAME(src), std::string(QNN_TENSOR_GET_NAME(src)).size())); + if (QNN_TENSOR_GET_NAME(dst) == nullptr) { + return 1; + } + QNN_TENSOR_SET_ID(dst, QNN_TENSOR_GET_ID(src)); + QNN_TENSOR_SET_TYPE(dst, QNN_TENSOR_GET_TYPE(src)); + QNN_TENSOR_SET_DATA_FORMAT(dst, QNN_TENSOR_GET_DATA_FORMAT(src)); + QNN_TENSOR_SET_DATA_TYPE(dst, QNN_TENSOR_GET_DATA_TYPE(src)); + QNN_TENSOR_SET_MEM_TYPE(dst, QNN_TENSOR_GET_MEM_TYPE(src)); + + // Only metadata (i.e. non-static data) is copied from source to destination. The union still + // must be initialized so that the clientBuf/memHandle do not contain garbage data + if (QNN_TENSOR_GET_MEM_TYPE(src) == QNN_TENSORMEMTYPE_RAW) { + Qnn_ClientBuffer_t client_buf = {nullptr, 0}; + QNN_TENSOR_SET_CLIENT_BUF(dst, client_buf); + } else if (QNN_TENSOR_GET_MEM_TYPE(src) == QNN_TENSORMEMTYPE_MEMHANDLE) { + QNN_TENSOR_SET_MEM_HANDLE(dst, nullptr); + } else { + return 1; + } + + Qnn_QuantizeParams_t src_qparam = QNN_TENSOR_GET_QUANT_PARAMS(src); + Qnn_QuantizationEncoding_t encoding = src_qparam.quantizationEncoding; + if (encoding == QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET) { + // need to allocate and copy memory for scaleOffset as it is a pointer array + Qnn_QuantizeParams_t src_qparam_cpy = src_qparam; + Qnn_AxisScaleOffset_t &axis_scale_offset = src_qparam_cpy.axisScaleOffsetEncoding; + Qnn_ScaleOffset_t **scaleOffset = &axis_scale_offset.scaleOffset; + size_t scaleOffsetSize = axis_scale_offset.numScaleOffsets * sizeof(Qnn_ScaleOffset_t); + *scaleOffset = (Qnn_ScaleOffset_t *)malloc(scaleOffsetSize); + memscpy(*scaleOffset, + scaleOffsetSize, + src_qparam.axisScaleOffsetEncoding.scaleOffset, + scaleOffsetSize); + QNN_TENSOR_SET_QUANT_PARAMS(dst, src_qparam_cpy); + } else if (encoding == QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET) { + // need to allocate and copy memory for scaleOffset as it is a pointer array + Qnn_QuantizeParams_t src_qparam_cpy = src_qparam; + Qnn_BwAxisScaleOffset_t &bwaxis_scale_offset = src_qparam_cpy.bwAxisScaleOffsetEncoding; + size_t scaleSize = bwaxis_scale_offset.numElements * sizeof(float); + float **scales = &bwaxis_scale_offset.scales; + int32_t **offsets = &bwaxis_scale_offset.offsets; + *scales = (float *)malloc(scaleSize); + memscpy(*scales, scaleSize, src_qparam.bwAxisScaleOffsetEncoding.scales, scaleSize); + + // only copy offsets if present, nullptr implies all offsets are 0 + if (bwaxis_scale_offset.offsets != nullptr) { + size_t offsetSize = bwaxis_scale_offset.numElements * sizeof(int32_t); + *offsets = (int32_t *)malloc(offsetSize); + memscpy(*offsets, offsetSize, src_qparam.bwAxisScaleOffsetEncoding.offsets, offsetSize); + } + QNN_TENSOR_SET_QUANT_PARAMS(dst, src_qparam_cpy); + } else { + QNN_TENSOR_SET_QUANT_PARAMS(dst, src_qparam); + } + + // allocate and copy memory for all the pointer members + uint32_t rank = QNN_TENSOR_GET_RANK(src); + QNN_TENSOR_SET_RANK(dst, rank); + size_t dim_size = rank * sizeof(uint32_t); + uint32_t * dimensions = (uint32_t *)malloc(dim_size); + if (dimensions == nullptr) { + QNN_LOG_WARN("deep_copy_qnn_tensors() allocation error while copying tensor %s\n", QNN_TENSOR_GET_NAME(src)); + 
return 1; + } + memscpy(dimensions, dim_size, QNN_TENSOR_GET_DIMENSIONS(src), dim_size); + QNN_TENSOR_SET_DIMENSIONS(dst, dimensions); + + return err; +} + + +static int free_qnn_tensor(Qnn_Tensor_t & tensor) { + int err = 0; + VALIDATE_TENSOR_VERSION(tensor, err); + + free((void *) QNN_TENSOR_GET_NAME(tensor)); + free(QNN_TENSOR_GET_DIMENSIONS(tensor)); + + return err; +} + + +[[maybe_unused]] static int free_qnn_tensors(Qnn_Tensor_t *& tensors, uint32_t num_tensors) { + int err = 0; + + // free all pointer allocations in struct + for (size_t i = 0; i < num_tensors; i++) { + free_qnn_tensor(tensors[i]); + } + free(tensors); + + return err; +} + + +static uint32_t ggml_get_tensor_rank(const ggml_tensor * tensor) { + uint32_t rank = 0; + for (int i = 0; i < GGML_MAX_DIMS; i++) { + if ((0 != tensor->ne[i]) && (1 != tensor->ne[i])) { + rank++; + } + } + return rank; +} + + +//TODO: mapping more ggml data type to QNN data type +//ref:explanation of k-quants, https://github.com/ggerganov/llama.cpp/pull/1684 +static Qnn_DataType_t qnn_datatype_from_ggml_datatype(enum ggml_type ggmltype) { + switch (ggmltype) { + case GGML_TYPE_F16: + return QNN_DATATYPE_FLOAT_16; + case GGML_TYPE_F32: + return QNN_DATATYPE_FLOAT_32; + default: + break; + + } + return QNN_DATATYPE_UNDEFINED; +} + + +//TODO: only support GGML_OP_ADD/GGML_OP_MUL/GGML_OP_MUL_MAT +static const char * qnn_opname_from_ggmlop(enum ggml_op ggmlop) { + switch (ggmlop) { + case GGML_OP_ADD: + return QNN_OP_ELEMENT_WISE_ADD; + case GGML_OP_MUL: + return QNN_OP_ELEMENT_WISE_MULTIPLY; + case GGML_OP_MUL_MAT: + return QNN_OP_MAT_MUL; + default: + break; + } + + return nullptr; +} + + +static uint32_t ggml_get_tensor_data_size(const ggml_tensor * tensor) { + /* + size_t data_size = ggml_row_size(tensor->type, tensor->ne[0]); + size_t n_dims = ggml_get_tensor_rank(tensor); + for (int i = 1; i < n_dims; i++) { + data_size *= tensor->ne[i]; + } + + return data_size; + */ + return ggml_nbytes(tensor); +} + + +template +Fn load_qnn_functionpointers(void * handle, const char * function_name) { + return reinterpret_cast(dlsym(handle, function_name)); +} + + +static const char * get_qnn_backend_name(int n_backend_type) { + switch (n_backend_type) { + case 0: + return "QNN-CPU"; + case 1: + return "QNN-GPU"; + case 2: + return "QNN-NPU"; + case 3: + return "ggml"; //the default GGML backend, used to compare performance between QNN backend and the default GGML backend + +#if 0 //QNN cDSP and HTA backend would not be used currently, focus on QNN CPU/GPU/NPU(aka HTP/DSP) backend currently + case 3: + return "QNN-cDSP"; + case 4: + return "QNN-HTA"; +#endif + default: + return "unknown"; + } +} + + +static intptr_t align_to(size_t alignment, intptr_t offset) { + return offset % alignment == 0 ? offset + : offset + + (static_cast(alignment) - + offset % static_cast(alignment)); +} + + +static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...) 
{ + static std::mutex ggml_qnn_log_internal_mutex; + static char s_ggml_qnn_log_internal_buf[GGML_QNN_LOGBUF_LEN]; + + { + std::lock_guard lock(ggml_qnn_log_internal_mutex); + va_list args; + va_start(args, format); + int len_prefix = snprintf(s_ggml_qnn_log_internal_buf, GGML_QNN_LOGBUF_LEN, "[%s, %d]: ", func, line); + int len = vsnprintf(s_ggml_qnn_log_internal_buf + len_prefix, GGML_QNN_LOGBUF_LEN - len_prefix, format, args); + if (len < (GGML_QNN_LOGBUF_LEN - len_prefix)) { +#if (defined __ANDROID__) || (defined ANDROID) + //for Android APK + __android_log_print(level, "ggml-qnn", "%s\n", s_ggml_qnn_log_internal_buf); +#endif + //for Android command line application or WoA + printf("%s\n", s_ggml_qnn_log_internal_buf); + } + va_end(args); + } +} + + +// ================================================================================================= +// +// wrapper class of Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK +// +// ================================================================================================= +class qnn_interface { + +#define DEFINE_SHIM_FUNCTION_INTERFACE(F, pointer_name) \ + template \ + inline auto qnn_##F(Args... args) const { \ + return (_qnn_interface->QNN_INTERFACE_VER_NAME.pointer_name)( \ + std::forward(args)...); \ + } + + +#define DEFINE_SHIM_FUNCTION_SYS_INTERFACE(F, pointer_name) \ + template \ + inline auto qnn_##F(Args... args) const { \ + return (_qnn_sys_interface->QNN_SYSTEM_INTERFACE_VER_NAME.pointer_name)( \ + std::forward(args)...); \ + } + + friend class qnn_instance; + +public: + qnn_interface() = default; + + // QnnBackend + DEFINE_SHIM_FUNCTION_INTERFACE(backend_create, backendCreate); + + DEFINE_SHIM_FUNCTION_INTERFACE(backend_free, backendFree); + + DEFINE_SHIM_FUNCTION_INTERFACE(backend_register_op_package, backendRegisterOpPackage); + + DEFINE_SHIM_FUNCTION_INTERFACE(backend_validate_op_config, backendValidateOpConfig); + + DEFINE_SHIM_FUNCTION_INTERFACE(backend_get_api_version, backendGetApiVersion); + + // QnnDevice + DEFINE_SHIM_FUNCTION_INTERFACE(device_create, deviceCreate); + + DEFINE_SHIM_FUNCTION_INTERFACE(device_free, deviceFree); + + DEFINE_SHIM_FUNCTION_INTERFACE(device_get_infrastructure, deviceGetInfrastructure); + + DEFINE_SHIM_FUNCTION_INTERFACE(device_get_platform_info, deviceGetPlatformInfo); + + DEFINE_SHIM_FUNCTION_INTERFACE(device_get_info, deviceGetInfo); + + // QnnContext + DEFINE_SHIM_FUNCTION_INTERFACE(context_create, contextCreate); + + DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary_size, contextGetBinarySize); + + DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary, contextGetBinary); + + DEFINE_SHIM_FUNCTION_INTERFACE(context_create_from_binary, contextCreateFromBinary); + + DEFINE_SHIM_FUNCTION_INTERFACE(context_free, contextFree); + + // QnnGraph + DEFINE_SHIM_FUNCTION_INTERFACE(graph_create, graphCreate); + + DEFINE_SHIM_FUNCTION_INTERFACE(graph_add_node, graphAddNode); + + DEFINE_SHIM_FUNCTION_INTERFACE(graph_finalize, graphFinalize); + + DEFINE_SHIM_FUNCTION_INTERFACE(graph_execute, graphExecute); + + DEFINE_SHIM_FUNCTION_INTERFACE(graph_retrieve, graphRetrieve); + + // QnnLog + DEFINE_SHIM_FUNCTION_INTERFACE(log_create, logCreate); + + DEFINE_SHIM_FUNCTION_INTERFACE(log_free, logFree); + + DEFINE_SHIM_FUNCTION_INTERFACE(log_set_log_level, logSetLogLevel); + + // QnnProfile + DEFINE_SHIM_FUNCTION_INTERFACE(profile_create, profileCreate); + + DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_events, profileGetEvents); + + 
DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_sub_events, profileGetSubEvents); + + DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_event_data, profileGetEventData); + + DEFINE_SHIM_FUNCTION_INTERFACE(profile_free, profileFree); + + // QnnMem + DEFINE_SHIM_FUNCTION_INTERFACE(mem_register, memRegister); + + DEFINE_SHIM_FUNCTION_INTERFACE(mem_de_register, memDeRegister); + + // QnnProperty + DEFINE_SHIM_FUNCTION_INTERFACE(property_has_capability, propertyHasCapability); + + // QnnTensor + DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_context_tensor, tensorCreateContextTensor); + + DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_graph_tensor, tensorCreateGraphTensor); + + // QnnSystem + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_create, systemContextCreate); + + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_get_binary_info, systemContextGetBinaryInfo); + + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_free, systemContextFree); + + void set_qnn_interface(const QnnInterface_t * qnn_interface) { + _qnn_interface = qnn_interface; + } + + void set_qnn_system_interface(const QnnSystemInterface_t * qnn_sys_interface) { + _qnn_sys_interface = qnn_sys_interface; + } + + uint32_t get_backend_id() const { + return _qnn_interface->backendId; + } + + bool is_loaded() const { + return ((_qnn_sys_interface != nullptr) && (_qnn_interface != nullptr)); + } + +private: + const QnnInterface_t *_qnn_interface = nullptr; + + const QnnSystemInterface_t *_qnn_sys_interface = nullptr; +}; + + + +// ================================================================================================= +// +// wrapper class of Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK +// +// and +// +// resource management of QNN resources for GGML's QNN backend +// ================================================================================================= +class qnn_instance { +public: + using BackendIdType = decltype(QnnInterface_t{}.backendId); + + explicit qnn_instance(const std::string & lib_path, const std::string & backend_name, + const std::string & model_name) : + _lib_path(std::move(lib_path)), + _backend_name(std::move(backend_name)), + _model_name(std::move(model_name)) {}; + + ~qnn_instance() { + } + + int qnn_init(const QnnSaver_Config_t ** saver_config); + + int qnn_finalize(); + + const qnn_interface &get_qnn_interface() { + if (!_qnn_interface.is_loaded()) { + QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + } + return _qnn_interface; + } + + + const QNN_INTERFACE_VER_TYPE &get_qnn_raw_interface() { + if (!_qnn_interface.is_loaded()) { + QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + } + return _qnn_raw_interface; + } + + const QNN_SYSTEM_INTERFACE_VER_TYPE &get_qnn_raw_system_interface() { + if (!_qnn_interface.is_loaded()) { + QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + } + return _qnn_raw_system_interface; + } + + const Qnn_LogHandle_t get_qnn_log_handle() { return _qnn_log_handle; } + + const Qnn_ProfileHandle_t get_qnn_profile_handle() { return _qnn_profile_handle; } + + const Qnn_DeviceHandle_t get_qnn_device_handle() { return _qnn_device_handle; } + + const Qnn_BackendHandle_t get_qnn_backend_handle() { return _qnn_backend_handle; } + + const Qnn_ContextHandle_t get_qnn_context_handle() { return _qnn_context_handle; } + + const QnnSystemContext_Handle_t get_qnn_system_handle() { return _qnn_system_handle; } + + const Qnn_GraphHandle_t get_qnn_graph_handle() { return _qnn_graph_handle; } + + + int init_qnn_graph(const char * 
graph_name, + bool debug, + uint8_t do_node_validation = 1, + const QnnGraph_Config_t ** graph_configs = nullptr + ); + + int finalize_qnn_graph(); + + int init_htp_perfinfra() { + QnnDevice_Infrastructure_t device_infra = nullptr; + int error = _qnn_raw_interface.deviceGetInfrastructure(&device_infra); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to get qnn device infra\n"); + return 1; + } + + QnnHtpDevice_Infrastructure_t *htp_infra = static_cast(device_infra); + QnnHtpDevice_PerfInfrastructure_t *htp_perfinfra = &htp_infra->perfInfra; + uint32_t power_configid = 1; + uint32_t device_id = 0; + uint32_t core_id = 0; + htp_perfinfra->createPowerConfigId(device_id, core_id, &power_configid); + _qnn_htp_perfinfra = htp_perfinfra; + _qnn_power_configid = power_configid; + + return 0; + } + + + int set_rpc_polling() { + if (_qnn_rpc_pollingtime > 0) { + QnnHtpPerfInfrastructure_PowerConfig_t rpc_pollingTime; + memset(&rpc_pollingTime, 0, sizeof(rpc_pollingTime)); + rpc_pollingTime.option = + QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME; + rpc_pollingTime.rpcPollingTimeConfig = _qnn_rpc_pollingtime; + const QnnHtpPerfInfrastructure_PowerConfig_t *powerConfigs[] = {&rpc_pollingTime, nullptr}; + if (_qnn_htp_perfinfra) { + _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, powerConfigs); + } + } + return 0; + } + + + int set_high_performance_mode() { + if (nullptr == _qnn_htp_perfinfra) { + QNN_LOG_DEBUG("perf intra is null\n"); + return 1; + } + + QnnHtpPerfInfrastructure_PowerConfig_t powerConfig; + memset(&powerConfig, 0, sizeof(powerConfig)); + powerConfig.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_V3; + powerConfig.dcvsV3Config.dcvsEnable = 0; + powerConfig.dcvsV3Config.setDcvsEnable = 1; + powerConfig.dcvsV3Config.contextId = _qnn_power_configid; + powerConfig.dcvsV3Config.powerMode = QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_PERFORMANCE_MODE; + powerConfig.dcvsV3Config.setSleepLatency = 1; // True to consider Latency parameter otherwise False + powerConfig.dcvsV3Config.setBusParams = 1; // True to consider Bus parameter otherwise False + powerConfig.dcvsV3Config.setCoreParams = 1; // True to consider Core parameter otherwise False + powerConfig.dcvsV3Config.sleepDisable = 0; // True to consider sleep/LPM modes, False to enable + powerConfig.dcvsV3Config.setSleepDisable = 0; // True to consider sleep disable/enable parameter otherwise False + // set Sleep latency parameter + uint32_t latencyValue = 40; + powerConfig.dcvsV3Config.sleepLatency = latencyValue; // range 40-2000 micro sec + // set Bus Clock Parameters (refer QnnHtpPerfInfrastructure_VoltageCorner_t enum) + powerConfig.dcvsV3Config.busVoltageCornerMin = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + powerConfig.dcvsV3Config.busVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + powerConfig.dcvsV3Config.busVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + // set Core Clock Parameters (refer QnnHtpPerfInfrastructure_VoltageCorner_t enum) + powerConfig.dcvsV3Config.coreVoltageCornerMin = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + powerConfig.dcvsV3Config.coreVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + powerConfig.dcvsV3Config.coreVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + // set power config with different performance parameters + const QnnHtpPerfInfrastructure_PowerConfig_t *powerConfigs[] = {&powerConfig, nullptr}; + + _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, powerConfigs); + + return 0; + } + + std::string 
&get_qnn_graph_name() { return _graph_name; } + + bool is_rpcmem_initialized() { + return _rpcmem_initialized; + } + + void set_rpcmem_initialized(bool initialized) { + _rpcmem_initialized = initialized; + } + + int32_t rpcmem_to_fd(void * buf); + + int register_rpcmem(void * p_data, Qnn_Tensor_t * p_tensor); + + void unregister_rpcmem(); + + void *alloc_rpcmem(size_t bytes, size_t alignment); + + void free_rpcmem(void * buf); + + bool is_rpcmem_allocated(void * buf); + + bool is_rpcmem_registered(Qnn_MemHandle_t handle) { + return _qnn_mem_set.count(handle) != 0U; + } + +public: + std::map> _qnn_graph_map; + +private: + int load_system(); + + int unload_system(); + + int load_backend(std::string &lib_path, const QnnSaver_Config_t ** saver_config); + + int unload_backend(); + + void set_qnn_raw_interface(QNN_INTERFACE_VER_TYPE & raw_interface) { + _qnn_raw_interface = raw_interface; + } + + void set_qnn_raw_system_interface(QNN_SYSTEM_INTERFACE_VER_TYPE &raw_interface) { + _qnn_raw_system_interface = raw_interface; + } + +private: + static constexpr const int _required_num_providers = 1; + +private: + std::string _lib_path; + std::string _backend_name; + std::string _model_name; // prebuilt QNN model name, not used in currently + BackendIdType _backend_id; + + bool _debug_tensor = false; // flag to indicate if requested graph is to be run in debug mode + bool _do_node_validations = true; // flag to indicate whether all add_node calls need to be validated + QnnLog_Level_t _qnn_log_level = QNN_LOG_LEVEL_DEBUG; + + ggml_qnn_profile_level _profile_level = ggml_qnn_profile_level::profile_detail; + + qnn_interface _qnn_interface; + + void *_system_lib_handle = nullptr; + + Qnn_GraphHandle_t _qnn_graph_handle = nullptr; + + Qnn_LogHandle_t _qnn_log_handle = nullptr; + + Qnn_ProfileHandle_t _qnn_profile_handle = nullptr; + + Qnn_DeviceHandle_t _qnn_device_handle = nullptr; + + Qnn_BackendHandle_t _qnn_backend_handle = nullptr; + + Qnn_ContextHandle_t _qnn_context_handle = nullptr; + + QnnSystemContext_Handle_t _qnn_system_handle = nullptr; + + QnnHtpDevice_PerfInfrastructure_t *_qnn_htp_perfinfra = nullptr; + uint32_t _qnn_power_configid = 1; + uint32_t _qnn_rpc_pollingtime = 9999; // 0-10000 us for high performing + + QNN_INTERFACE_VER_TYPE _qnn_raw_interface; + QNN_SYSTEM_INTERFACE_VER_TYPE _qnn_raw_system_interface; + + std::unordered_set _qnn_mem_set; + + static std::mutex _init_mutex; + static std::unordered_map _loaded_lib_handle; + static std::unordered_map _lib_path_to_backend_id; + static std::unordered_map _loaded_backend; + + void *_rpc_lib_handle = nullptr; + std::atomic_bool _rpcmem_initialized{false}; + pfn_rpc_mem_alloc _pfn_rpc_mem_alloc; + pfn_rpc_mem_free _pfn_rpc_mem_free; + pfn_rpc_mem_to_fd _pfn_rpc_mem_to_fd; + pfn_rpc_mem_init _pfn_rpc_mem_init; + pfn_rpc_mem_deinit _pfn_rpc_mem_deinit; + std::unordered_map _rpcmem_store_map; + + + std::string _graph_name; +}; + + + +// ================================================================================================= +// +// implementation of wrapper class +// +// ================================================================================================= +std::mutex qnn_instance::_init_mutex; + +std::unordered_map qnn_instance::_loaded_lib_handle; + +std::unordered_map qnn_instance::_lib_path_to_backend_id; + +std::unordered_map qnn_instance::_loaded_backend; + + +void * qnn_instance::alloc_rpcmem(size_t bytes, size_t alignment) { + if (!_rpcmem_initialized) { + QNN_LOG_WARN("rpc memory not initialized\n"); + return 
nullptr; + } + + auto allocate_bytes = static_cast(bytes + alignment); + void *buf = _pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, allocate_bytes); + if (buf == nullptr) { + QNN_LOG_WARN("failed to allocate rpc memory\n"); + return nullptr; + } + + auto aligned_buf = reinterpret_cast(align_to(alignment, + reinterpret_cast(buf))); + bool status = _rpcmem_store_map.insert(std::pair(aligned_buf, buf)).second; + if (!status) { + QNN_LOG_WARN("failed to allocate rpc memory\n"); + _pfn_rpc_mem_free(buf); + } + + return aligned_buf; +} + + +void qnn_instance::free_rpcmem(void * buf) { + if (!_rpcmem_initialized) { + QNN_LOG_WARN("rpc memory not initialized\n"); + } else if (0 == _rpcmem_store_map.count(buf)) { + QNN_LOG_WARN("no allocated tensor\n"); + } else { + _pfn_rpc_mem_free(_rpcmem_store_map[buf]); + _rpcmem_store_map.erase(buf); + } +} + + +int32_t qnn_instance::rpcmem_to_fd(void *buf) { + int32_t mem_fd = -1; + if (!is_rpcmem_initialized()) { + QNN_LOG_WARN("rpc memory not initialized\n"); + } else { + mem_fd = _pfn_rpc_mem_to_fd(buf); + } + + return mem_fd; +} + + +int qnn_instance::register_rpcmem(void * p_data, Qnn_Tensor_t * p_tensor) { + if (nullptr == p_data || (nullptr == p_tensor)) { + QNN_LOG_WARN("invalid param\n"); + return 1; + } + + if (!is_rpcmem_initialized()) { + QNN_LOG_WARN("rpc memory not initialized\n"); + return 2; + } + + if (is_rpcmem_allocated(p_data)) { + QNN_LOG_WARN("rpc memory already allocated\n"); + //return 3; + } + if (is_rpcmem_registered((QNN_VER_PTR(*p_tensor)->memHandle))) { + QNN_LOG_WARN("tensor %s has been registered shared memory\n", (QNN_VER_PTR(*p_tensor)->name)); + return 4; + } + + int32_t mem_fd = rpcmem_to_fd(p_data); + if (-1 == mem_fd) { + QNN_LOG_WARN("failed to get file descriptor\n"); + return 5; + } + QNN_LOG_DEBUG("mem_fd %d\n", mem_fd); + Qnn_MemDescriptor_t descriptor = { + {QNN_VER_PTR(*p_tensor)->rank, QNN_VER_PTR(*p_tensor)->dimensions, nullptr}, + QNN_VER_PTR(*p_tensor)->dataType, + QNN_MEM_TYPE_ION, + {{mem_fd}}}; + Qnn_MemHandle_t handle = nullptr; + int error = QNN_SUCCESS; + error = _qnn_interface.qnn_mem_register( + _qnn_context_handle, + &descriptor, + /*numDescriptors=*/1, + &handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to register shared memory, error %d, %s\n", QNN_GET_ERROR_CODE(error), + strerror(error)); + return 6; + } else { + QNN_LOG_INFO("tensor %s successfully register shared memory\n", (QNN_VER_PTR(*p_tensor)->name)); + } + QNN_VER_PTR(*p_tensor)->memHandle = handle; + _qnn_mem_set.insert(handle); + + return 0; +} + + +void qnn_instance::unregister_rpcmem() { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + + if (_qnn_mem_set.empty()) { + QNN_LOG_WARN("no rpcmem registered\n"); + } + + for (auto &mem_handle : _qnn_mem_set) { + error = _qnn_interface.qnn_mem_de_register(&mem_handle, 1); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to unregister shared memory, error %d\n", QNN_GET_ERROR_CODE(error)); + } + } + _qnn_mem_set.clear(); +} + + +bool qnn_instance::is_rpcmem_allocated(void * buf) { + return _rpcmem_store_map.count(buf) != 0U; +} + + +int qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t ** saver_config) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + QNN_LOG_DEBUG("lib_path:%s\n", lib_path.c_str()); + + void *lib_handle = dlopen(lib_path.c_str(), RTLD_NOW | RTLD_GLOBAL); + if (nullptr == lib_handle) { + QNN_LOG_WARN("can not open QNN library %s, with error: %s", lib_path.c_str(), dlerror()); + return 1; + } + + // load get_provider function + 
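+    // QnnInterface_getProviders returns the list of interface providers exposed by the backend library;
+    // the loop below selects the first provider whose core API major version matches, and whose minor
+    // version is at least, the version in the QNN SDK headers this backend was built against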
auto get_providers = load_qnn_functionpointers<_pfn_QnnInterface_getProviders *>(lib_handle, + "QnnInterface_getProviders"); + if (nullptr == get_providers) { + QNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s", dlerror()); + return 2; + } + + // get QnnInterface Providers + std::uint32_t num_providers = 0; + const QnnInterface_t **provider_list = nullptr; + error = get_providers(&provider_list, &num_providers); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to get providers, error %d", QNN_GET_ERROR_CODE(error)); + return 3; + } + QNN_LOG_DEBUG("num_providers=%d\n", num_providers); + if (num_providers != _required_num_providers) { + QNN_LOG_WARN("providers is %d instead of required %d", num_providers, _required_num_providers); + return 4; + } + + if (nullptr == provider_list) { + QNN_LOG_WARN("failed to get qnn interface providers\n"); + return 5; + } + bool found_valid_interface = false; + QNN_INTERFACE_VER_TYPE qnn_interface; + for (size_t idx = 0; idx < num_providers; idx++) { + if (QNN_API_VERSION_MAJOR == provider_list[idx]->apiVersion.coreApiVersion.major && + QNN_API_VERSION_MINOR <= provider_list[idx]->apiVersion.coreApiVersion.minor) { + found_valid_interface = true; + qnn_interface = provider_list[idx]->QNN_INTERFACE_VER_NAME; + break; + } + } + + if (!found_valid_interface) { + QNN_LOG_WARN("unable to find a valid qnn interface\n"); + return 6; + } else { + QNN_LOG_INFO("find a valid qnn interface\n"); + } + set_qnn_raw_interface(qnn_interface); + + BackendIdType backend_id = provider_list[0]->backendId; + _lib_path_to_backend_id[lib_path] = backend_id; + if (_loaded_backend.count(backend_id) > 0) { + QNN_LOG_WARN("lib_path %s is loaded, but backend %d already exists\n", + lib_path.c_str(), backend_id); + } + _loaded_backend[backend_id] = provider_list[0]; + if (_loaded_lib_handle.count(backend_id) > 0) { + QNN_LOG_WARN("closing %p\n", _loaded_lib_handle[backend_id]); + int dlclose_error = dlclose(_loaded_lib_handle[backend_id]); + if (dlclose_error != 0) { + QNN_LOG_WARN("fail to close %p with error %s\n", _loaded_lib_handle[backend_id], dlerror()); + } + } + _loaded_lib_handle[backend_id] = lib_handle; + _backend_id = backend_id; + +#if 0 //comment it for purpose of reduce size of APK + QnnSaver_Config_t outputdir_cfg; + outputdir_cfg.option = QNN_SAVER_CONFIG_OPTION_OUTPUT_DIRECTORY; + outputdir_cfg.outputDirectory = "/data/local/tmp/"; + + QnnSaver_Config_t backendid_cfg; + backendid_cfg.option = QNN_SAVER_CONFIG_OPTION_BACKEND_ID; + backendid_cfg.backendId = _backend_id; + const QnnSaver_Config_t *saverCfg[] = {&outputdir_cfg, &backendid_cfg, nullptr}; + if (0 == QnnSaver_initialize(saverCfg)) { + QNN_LOG_INFO("QnnSaver_initialize successfully"); + } else { + QNN_LOG_WARN("QnnSaver_initialize failure"); + } +#endif + auto saver_initialize = load_qnn_functionpointers<_pfn_QnnSaver_initialize *>( + _loaded_lib_handle[backend_id], "QnnSaver_initialize"); + if (nullptr != saver_initialize) { + error = saver_initialize(saver_config); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to saver_initialize,error %d", QNN_GET_ERROR_CODE(error)); + return 7; + } + } else { + QNN_LOG_WARN("saver_initialize is null\n"); + } + + return 0; +} + + +int qnn_instance::unload_backend() { + int dlclose_error = 0; + for (auto &it : _loaded_lib_handle) { + dlclose_error = dlclose(it.second); + if (dlclose_error != 0) { + QNN_LOG_WARN("failed to close QNN backend %d, error %s\n", it.first, dlerror()); + } + } + + _loaded_lib_handle.clear(); + 
_lib_path_to_backend_id.clear(); + _loaded_backend.clear(); + + return 0; +} + + +int qnn_instance::load_system() { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + + std::string system_lib_path = _lib_path + "libQnnSystem.so"; + QNN_LOG_DEBUG("system_lib_path:%s\n", system_lib_path.c_str()); + + _system_lib_handle = dlopen(system_lib_path.c_str(), RTLD_NOW | RTLD_LOCAL); + if (nullptr == _system_lib_handle) { + QNN_LOG_WARN("can not open QNN library %s, error: %s\n", system_lib_path.c_str(), dlerror()); + return 1; + } + + auto * get_providers = reinterpret_cast<_pfn_QnnSystemInterface_getProviders *>(dlsym( + _system_lib_handle, "QnnSystemInterface_getProviders")); + if (nullptr == get_providers) { + QNN_LOG_WARN("can not load QNN symbol QnnSystemInterface_getProviders: %s\n", dlerror()); + return 2; + } + + uint32_t num_providers = 0; + const QnnSystemInterface_t ** provider_list = nullptr; + error = get_providers(&provider_list, &num_providers); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to get providers, error %d\n", QNN_GET_ERROR_CODE(error)); + return 3; + } + + if (num_providers != _required_num_providers) { + QNN_LOG_WARN("providers is %d instead of required %d\n", num_providers, _required_num_providers); + return 4; + } + + if (nullptr == provider_list) { + QNN_LOG_WARN("can not get providers\n"); + return 5; + } + + QNN_SYSTEM_INTERFACE_VER_TYPE qnn_system_interface; + bool found_valid_system_interface = false; + for (size_t idx = 0; idx < num_providers; idx++) { + if (QNN_SYSTEM_API_VERSION_MAJOR == + provider_list[idx]->systemApiVersion.major && + QNN_SYSTEM_API_VERSION_MINOR <= + provider_list[idx]->systemApiVersion.minor) { + found_valid_system_interface = true; + qnn_system_interface = provider_list[idx]->QNN_SYSTEM_INTERFACE_VER_NAME; + break; + } + } + if (!found_valid_system_interface) { + QNN_LOG_WARN("unable to find a valid qnn system interface\n"); + return 6; + } else { + QNN_LOG_INFO("find a valid qnn system interface\n"); + } + set_qnn_raw_system_interface(qnn_system_interface); + + _qnn_interface.set_qnn_system_interface(provider_list[0]); + + _qnn_interface.qnn_system_context_create(&_qnn_system_handle); + if (nullptr == _qnn_system_handle) { + QNN_LOG_WARN("can not create QNN system contenxt\n"); + } else { + QNN_LOG_INFO("initialize qnn system successfully\n"); + } + + return 0; +} + + +int qnn_instance::unload_system() { + int result = 0; + + if (nullptr == _system_lib_handle) { + QNN_LOG_DEBUG("system lib handle is null\n"); + return 1; + } + + if (nullptr != _qnn_system_handle) { + result = _qnn_interface.qnn_system_context_free(_qnn_system_handle); + if (result != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN system context\n"); + } + _qnn_system_handle = nullptr; + } + + int dlclose_error = dlclose(_system_lib_handle); + if (dlclose_error != 0) { + QNN_LOG_WARN("failed to close QnnSystem library, error %s\n", dlerror()); + return 2; + } + + _system_lib_handle = nullptr; + + return result; +} + + +static void ggml_qnn_logcallback(const char * fmt, + QnnLog_Level_t level, + uint64_t timestamp, + va_list argp) { + + static std::mutex log_mutex; + static unsigned char s_ggml_qnn_logbuf[GGML_QNN_LOGBUF_LEN]; + + const char * log_level_desc = ""; + switch (level) { + case QNN_LOG_LEVEL_ERROR: + log_level_desc = " ERROR "; + break; + case QNN_LOG_LEVEL_WARN: + log_level_desc = "WARNING"; + break; + case QNN_LOG_LEVEL_INFO: + log_level_desc = " INFO "; + break; + case QNN_LOG_LEVEL_DEBUG: + log_level_desc = " DEBUG "; + break; + case QNN_LOG_LEVEL_VERBOSE: 
+ log_level_desc = "VERBOSE"; + break; + case QNN_LOG_LEVEL_MAX: + log_level_desc = "UNKNOWN"; + break; + } + + double ms = (double) timestamp / 1000000.0; + + { + std::lock_guard lock(log_mutex); + + memset(s_ggml_qnn_logbuf, 0, GGML_QNN_LOGBUF_LEN); + vsnprintf(reinterpret_cast(s_ggml_qnn_logbuf), GGML_QNN_LOGBUF_LEN, fmt, argp); + QNN_LOG_DEBUG("%8.1fms [%-7s] %s\n", ms, log_level_desc, s_ggml_qnn_logbuf); + } +} + + +int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { + BackendIdType backend_id = QNN_BACKEND_ID_NULL; + QNN_LOG_DEBUG("enter qni_init\n"); + + const std::lock_guard lock(_init_mutex); + + if (0 != load_system()) { + QNN_LOG_WARN("can not load QNN system lib, pls check why?\n"); + return 1; + } else { + QNN_LOG_DEBUG("load QNN system lib successfully\n"); + } + + std::string bakend_lib_path = _lib_path + _backend_name; + if (0 == _lib_path_to_backend_id.count(bakend_lib_path)) { + int is_load_ok = load_backend(bakend_lib_path, saver_config); + if (0 != is_load_ok) { + QNN_LOG_WARN("failed to load QNN backend\n"); + return 2; + } + } + + backend_id = _lib_path_to_backend_id[bakend_lib_path]; + if (0 == _loaded_backend.count(backend_id) || + 0 == _loaded_lib_handle.count(backend_id)) { + QNN_LOG_WARN("library %s is loaded but loaded backend count=%zu, loaded lib_handle count=%zu\n", + bakend_lib_path.c_str(), + _loaded_backend.count(backend_id), + _loaded_lib_handle.count(backend_id)); + return 3; + } + + _qnn_interface.set_qnn_interface(_loaded_backend[backend_id]); + +#if 1 + _qnn_interface.qnn_log_create(ggml_qnn_logcallback, _qnn_log_level, &_qnn_log_handle); +#else + _qnn_raw_interface.logCreate(ggml_qnn_logcallback, _qnn_log_level, &_qnn_log_handle); +#endif + if (nullptr == _qnn_log_handle) { + QNN_LOG_WARN("why failed to initialize qnn log\n"); //DSP backend not work on Qualcomm SoC based low-end phone + return 4; + } else { + QNN_LOG_DEBUG("initialize qnn log successfully\n"); + } + + std::vector temp_backend_config; + _qnn_interface.qnn_backend_create(_qnn_log_handle, temp_backend_config.empty() ? nullptr + : temp_backend_config.data(), + &_qnn_backend_handle); + if (nullptr == _qnn_backend_handle) { + QNN_LOG_WARN("why failed to initialize qnn backend\n"); + return 5; + } else { + QNN_LOG_DEBUG("initialize qnn backend successfully\n"); + } + + if (nullptr != _qnn_raw_interface.propertyHasCapability) { + auto qnnStatus = _qnn_raw_interface.propertyHasCapability(QNN_PROPERTY_GROUP_DEVICE); + if (QNN_PROPERTY_NOT_SUPPORTED == qnnStatus) { + QNN_LOG_WARN("device property is not supported\n"); + } + if (QNN_PROPERTY_ERROR_UNKNOWN_KEY == qnnStatus) { + QNN_LOG_WARN("device property is not known to backend\n"); + } + } + + auto qnnStatus = _qnn_raw_interface.deviceCreate(_qnn_log_handle, nullptr, &_qnn_device_handle); + if (QNN_SUCCESS != qnnStatus && QNN_DEVICE_ERROR_UNSUPPORTED_FEATURE != qnnStatus) { + QNN_LOG_WARN("failed to create QNN device\n"); + } else { + QNN_LOG_INFO("create device successfully\n"); + } + + if (ggml_qnn_profile_level::profile_off != _profile_level) { + QNN_LOG_INFO("profiling turned on; level = %d", _profile_level); + if (ggml_qnn_profile_level::profile_basic == _profile_level) { + QNN_LOG_INFO("basic profiling requested. 
creating Qnn Profile object\n");
+            if (QNN_PROFILE_NO_ERROR != _qnn_raw_interface.profileCreate(
+                    _qnn_backend_handle, QNN_PROFILE_LEVEL_BASIC, &_qnn_profile_handle)) {
+                QNN_LOG_WARN("unable to create profile handle in the backend\n");
+                return 7;
+            } else {
+                QNN_LOG_DEBUG("initialize qnn profile successfully\n");
+            }
+        } else if (ggml_qnn_profile_level::profile_detail == _profile_level) {
+            QNN_LOG_INFO("detailed profiling requested. Creating Qnn Profile object\n");
+            if (QNN_PROFILE_NO_ERROR != _qnn_raw_interface.profileCreate(
+                    _qnn_backend_handle, QNN_PROFILE_LEVEL_DETAILED, &_qnn_profile_handle)) {
+                QNN_LOG_WARN("unable to create profile handle in the backend\n");
+                return 7;
+            } else {
+                QNN_LOG_DEBUG("initialize qnn profile successfully\n");
+            }
+        }
+    }
+
+    _rpc_lib_handle = dlopen("libcdsprpc.so", RTLD_NOW | RTLD_LOCAL);
+    if (nullptr == _rpc_lib_handle) {
+        QNN_LOG_WARN("failed to load qualcomm's rpc lib, error:%s\n", dlerror());
+        return 9;
+    } else {
+        QNN_LOG_DEBUG("load rpcmem lib successfully\n");
+        set_rpcmem_initialized(true);
+    }
+    _pfn_rpc_mem_init   = reinterpret_cast<pfn_rpc_mem_init>(dlsym(_rpc_lib_handle, "rpcmem_init"));
+    _pfn_rpc_mem_deinit = reinterpret_cast<pfn_rpc_mem_deinit>(dlsym(_rpc_lib_handle, "rpcmem_deinit"));
+    _pfn_rpc_mem_alloc  = reinterpret_cast<pfn_rpc_mem_alloc>(dlsym(_rpc_lib_handle, "rpcmem_alloc"));
+    _pfn_rpc_mem_free   = reinterpret_cast<pfn_rpc_mem_free>(dlsym(_rpc_lib_handle, "rpcmem_free"));
+    _pfn_rpc_mem_to_fd  = reinterpret_cast<pfn_rpc_mem_to_fd>(dlsym(_rpc_lib_handle, "rpcmem_to_fd"));
+    if (nullptr == _pfn_rpc_mem_alloc || nullptr == _pfn_rpc_mem_free
+        || nullptr == _pfn_rpc_mem_to_fd) {
+        QNN_LOG_WARN("unable to access symbols in QNN RPC lib. dlerror(): %s", dlerror());
+        dlclose(_rpc_lib_handle);
+        return 10;
+    }
+
+    if (nullptr != _pfn_rpc_mem_init) // make Qualcomm's SoC based low-end phone happy
+        _pfn_rpc_mem_init();
+
+    std::vector<const QnnContext_Config_t *> temp_context_config;
+    _qnn_interface.qnn_context_create(_qnn_backend_handle, _qnn_device_handle,
+                                      temp_context_config.empty() ?
nullptr + : temp_context_config.data(), + &_qnn_context_handle); + if (nullptr == _qnn_context_handle) { + QNN_LOG_WARN("why failed to initialize qnn context\n"); + return 8; + } else { + QNN_LOG_DEBUG("initialize qnn context successfully\n"); + } + + QNN_LOG_DEBUG("leave qni_init\n"); + + return 0; +} + + +int qnn_instance::qnn_finalize() { + int ret_status = 0; + Qnn_ErrorHandle_t error = QNN_SUCCESS; + + if (nullptr != _pfn_rpc_mem_deinit) // make Qualcomm's SoC based low-end phone happy + _pfn_rpc_mem_deinit(); + + if (dlclose(_rpc_lib_handle) != 0) { + QNN_LOG_WARN("failed to unload qualcomm's rpc lib, error:%s\n", dlerror()); + } else { + QNN_LOG_DEBUG("succeed to close rpcmem lib\n"); + } + + if (nullptr != _qnn_context_handle) { + error = _qnn_interface.qnn_context_free(_qnn_context_handle, _qnn_profile_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN context_handle: ID %u, error %d\n", + _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error)); + + } + _qnn_context_handle = nullptr; + } + + if (nullptr != _qnn_profile_handle) { + error = _qnn_interface.qnn_profile_free(_qnn_profile_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN profile_handle: ID %u, error %d\n", + _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error)); + + } + _qnn_profile_handle = nullptr; + } + + if (nullptr != _qnn_device_handle) { + error = _qnn_interface.qnn_device_free(_qnn_device_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN device_handle: ID %u, error %d\n", + _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error)); + + } + _qnn_device_handle = nullptr; + } + + if (nullptr != _qnn_backend_handle) { + error = _qnn_interface.qnn_backend_free(_qnn_backend_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN backend_handle: ID %u, error %d\n", + _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error)); + } + _qnn_backend_handle = nullptr; + + } + + if (nullptr != _qnn_log_handle) { + error = _qnn_interface.qnn_log_free(_qnn_log_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN log_handle: ID %u, error %d\n", + _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error)); + } + _qnn_log_handle = nullptr; + } + + unload_backend(); + + unload_system(); + + return ret_status; +} + + +int qnn_instance::init_qnn_graph(const char * graph_name, bool debug, uint8_t do_node_validation, + const QnnGraph_Config_t ** graph_configs) { + int result = 0; + + if (nullptr == graph_name) { + QNN_LOG_WARN("graph name is null\n"); + return 1; + } + + if (!_graph_name.empty()) { + QNN_LOG_WARN("qnn model for graph %s already initialized\n", graph_name); + return 2; + } + + if (!do_node_validation) { + QNN_LOG_WARN("node validation disabled, backend will not perform op validation prior to adding node\n"); + } + + _graph_name = graph_name; + _debug_tensor = debug; + _do_node_validations = do_node_validation; + + result = _qnn_raw_interface.graphCreate(_qnn_context_handle, graph_name, graph_configs, + &_qnn_graph_handle); + if (result != QNN_GRAPH_NO_ERROR || nullptr == _qnn_graph_handle) { + QNN_LOG_WARN("failed to create graph in qnn context\n"); + return 3; + } else { + QNN_LOG_INFO("succeed to create graph %s, %p\n", graph_name, _qnn_graph_handle); + } + + return 0; +} + + +int qnn_instance::finalize_qnn_graph() { + if (nullptr != _qnn_graph_handle) { + if (_qnn_raw_interface.graphFinalize(_qnn_graph_handle, _qnn_profile_handle, nullptr) != + QNN_GRAPH_NO_ERROR) { + QNN_LOG_WARN("finalizing 
graph failed\n");
+            //return 1;
+        }
+    } else {
+        QNN_LOG_DEBUG("qnn graph handle is null\n");
+    }
+
+    return 0;
+}
+
+
+
+// =================================================================================================
+//
+//  implementation of GGML's QNN backend
+//
+// =================================================================================================
+static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor, bool b_dump_tensor_info) {
+    if (nullptr == tensor)
+        return false;
+    if (b_dump_tensor_info) {
+        QNN_LOG_DEBUG("op name:%s, tensor type:%s", ggml_op_name(tensor->op),
+                      ggml_type_name(tensor->type));
+    }
+    // only the following 3 OPs are supported currently; tensor->src[0] and tensor->src[1] must not be nullptr
+    bool supported_op = ((tensor->op == GGML_OP_ADD) || (tensor->op == GGML_OP_MUL) || (tensor->op == GGML_OP_MUL_MAT));
+    if (!supported_op) {
+        return false;
+    }
+
+    const struct ggml_tensor * src0 = tensor->src[0];
+    const struct ggml_tensor * src1 = tensor->src[1];
+
+    const int64_t ne00 = tensor->src[0]->ne[0];
+    const int64_t ne01 = tensor->src[0]->ne[1];
+
+    const int64_t ne10 = tensor->src[1]->ne[0];
+    const int64_t ne11 = tensor->src[1]->ne[1];
+
+    const int64_t ne0 = tensor->ne[0];
+    const int64_t ne1 = tensor->ne[1];
+
+    GGML_UNUSED(ne0);
+    GGML_UNUSED(ne1);
+
+    if (b_dump_tensor_info) {
+        QNN_LOG_DEBUG("src0 type:%s", ggml_type_name(tensor->src[0]->type));
+        QNN_LOG_DEBUG("src1 type:%s", ggml_type_name(tensor->src[1]->type));
+
+        if (tensor->op == GGML_OP_MUL_MAT) {
+            QNN_LOG_DEBUG("GGML_OP_MUL_MAT");
+            QNN_LOG_DEBUG(
+                    "src0 %15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
+                    src0->name,
+                    src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2],
+                    src0->nb[0], src0->nb[1], src0->nb[2]);
+            QNN_LOG_DEBUG(
+                    "src1 %15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
+                    src1->name,
+                    src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2],
+                    src1->nb[0], src1->nb[1], src1->nb[2]);
+            QNN_LOG_DEBUG(
+                    "     %15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
+                    tensor->name,
+                    tensor->type, ggml_type_name(tensor->type), tensor->ne[0], tensor->ne[1], tensor->ne[2],
+                    tensor->nb[0],
+                    tensor->nb[1], tensor->nb[2]);
+
+        }
+    }
+
+    if (ggml_is_empty(tensor) || tensor->op == GGML_OP_RESHAPE || tensor->op == GGML_OP_TRANSPOSE || tensor->op == GGML_OP_VIEW || tensor->op == GGML_OP_PERMUTE || tensor->op == GGML_OP_NONE) {
+        return false;
+    }
+
+    // make ggml_get_tensor_rank and QNN SDK happy
+    if ((ne00 <= 1 || ne01 <= 1 || ne10 <= 1 || ne11 <= 1)) {
+        return false;
+    }
+
+    if (tensor->op == GGML_OP_ADD) {
+        //TODO: this is a limitation
+        return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16)
+               && (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16)
+               && (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_F16);
+
+    }
+
+    if (tensor->op == GGML_OP_MUL_MAT) {
+        if (tensor->ne[1] < 32) { // GPU/NPU inference will be slower than CPU inference when tensor->ne[1] < min batch size
+            return false;
+        }
+        //TODO: this is a limitation
+        return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16)
+               && (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16)
+               && (src0->type == src1->type) && (src0->type == tensor->type);
+    }
+
+    //TODO: this is a limitation
+    return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16)
+           && (src1->type ==
GGML_TYPE_F32 || src1->type == GGML_TYPE_F16) + && (src0->type == src1->type) && (src0->type == tensor->type); +} + + +static void ggml_qnn_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + bool graph_initialized = false; + int64_t n_begin_time = 0LL; + int64_t n_end_time = 0LL; + int64_t n_duration = 0LL; + + qnn_instance * instance = nullptr; + struct ggml_backend_qnn_context * ctx = nullptr; + + std::string graph_name = "ggml_op_qnn_add"; + Qnn_GraphHandle_t graph_handle = nullptr; + Qnn_Tensor_t * tensor_0 = nullptr; + Qnn_Tensor_t * tensor_1 = nullptr; + Qnn_Tensor_t * tensor_2 = nullptr; + Qnn_Param_t qnn_params[] = {}; + enum ggml_op ggmlop = GGML_OP_ADD; + Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32; + + if ((nullptr == src0) || (nullptr == src1) || (nullptr == dst)) { + QNN_LOG_WARN("pls check why GGML tensor is null"); + return; + } + tensor_0 = (Qnn_Tensor_t *)src0->extra; + tensor_1 = (Qnn_Tensor_t *)src1->extra; + tensor_2 = (Qnn_Tensor_t *)dst->extra; + if ((nullptr == tensor_0) || (nullptr == tensor_1) || (nullptr == tensor_2)) { + QNN_LOG_WARN("pls check why QNN tensor is null"); + return; + } + ctx = (struct ggml_backend_qnn_context *)g_qnn_backend->context; + if (nullptr == ctx) { + QNN_LOG_WARN("pls check why backend ctx is null"); + return; + } + instance = ctx->instance; + if (nullptr == instance) { + QNN_LOG_WARN("pls check why qnn instance is null"); + return; + } + QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; + + n_begin_time = ggml_time_us(); +#if 1 + QNN_LOG_DEBUG("call %s\n", __func__); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src0->name, + src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], + src0->nb[0], src0->nb[1], src0->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src1->name, + src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], + src1->nb[0], src1->nb[1], src1->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + dst->name, + dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], + dst->nb[1], dst->nb[2]); + QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); + QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(tensor_0)); + QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(tensor_1)); + QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(tensor_2)); +#endif + + QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; + QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; + QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; + + src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type); + src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); + dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type); + + std::string map_entry = std::string(ggml_op_name(ggmlop)); + if (instance->_qnn_graph_map.find(map_entry) != instance->_qnn_graph_map.end()) { + graph_initialized = true; + auto & graph_item = instance->_qnn_graph_map[map_entry]; + graph_handle = std::get<0>(graph_item); + } + + uint32_t * tensor_0_dimensions = QNN_VER_PTR(*tensor_0)->dimensions; + uint32_t * tensor_1_dimensions = 
QNN_VER_PTR(*tensor_1)->dimensions; + uint32_t * tensor_2_dimensions = QNN_VER_PTR(*tensor_2)->dimensions; + + if (!graph_initialized) { + graph_name = graph_name + "_" + std::to_string(ctx->threads) + src0->name + "_" + src1->name; + QNN_LOG_DEBUG("graph name %s", graph_name.c_str()); + //QnnGraph_Config_t graph_config; + //graph_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + //graph_config.customConfig = strdup(graph_name.c_str()); + //const QnnGraph_Config_t * p_graph_config = &graph_config; + error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), graph_name.c_str(), nullptr, &graph_handle); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("can't create qnn graph handle with graph name %s, error = %d\n", graph_name.c_str(), error); + return; + } + error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_0); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_1); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_2); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + + Qnn_Tensor_t tensor_inputs[] = { + *tensor_0, + *tensor_1 + }; + Qnn_Tensor_t tensor_outputs[] = { + *tensor_2 + }; + Qnn_OpConfig_t op_config = { + (Qnn_OpConfigVersion_t) 1, .v1 = { + "ggml_op_add", + QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_ELEMENT_WISE_ADD, + 0, + qnn_params, + 2, + tensor_inputs, + 1, + tensor_outputs + } + }; + error = qnn_raw_interface.graphAddNode(graph_handle, op_config); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2); + instance->_qnn_graph_map[map_entry] = graph_item; + } else { + auto & graph_item = instance->_qnn_graph_map[map_entry]; + graph_handle = std::get<0>(graph_item); + tensor_0 = std::get<1>(graph_item); + tensor_1 = std::get<2>(graph_item); + tensor_2 = std::get<3>(graph_item); + + //QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); + uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], + (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; + uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], + (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; + uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], + (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; + QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; + QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); + QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; + QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; + QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); + QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; + QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; + 
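+        // the QNN graph for this op is cached and reused across calls; the dimensions above and the
+        // rank, data type and client buffers below re-point the cached QNN tensors at the current
+        // ggml tensors before the graph is executed again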
+        QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst);
+        QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type;
+
+        QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)};
+        QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)};
+        QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)};
+
+        Qnn_Tensor_t tensor_inputs[] = {
+            *tensor_0,
+            *tensor_1
+        };
+        Qnn_Tensor_t tensor_outputs[] = {
+            *tensor_2
+        };
+        error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr);
+        if (QNN_SUCCESS != error) {
+            QNN_LOG_INFO("error = %d\n", error);
+        }
+    }
+    // restore the original dimension pointers that were saved before execution
+    QNN_VER_PTR(*tensor_0)->dimensions = tensor_0_dimensions;
+    QNN_VER_PTR(*tensor_1)->dimensions = tensor_1_dimensions;
+    QNN_VER_PTR(*tensor_2)->dimensions = tensor_2_dimensions;
+    n_end_time = ggml_time_us();
+    n_duration = (n_end_time - n_begin_time) / 1000;
+    QNN_LOG_DEBUG("duration of ggml_qnn_add : %lld milliseconds\n", n_duration);
+}
+
+
+
+/*
+ * ggml_qnn_mul_mat was re-added as a standalone function; the following comments come from
+ * https://github.com/ggerganov/llama.cpp/pull/1632
+ * MUL_MAT takes most of the compute time (about 95%), so to speed up llama we have to focus on MUL_MAT.
+ * We have three kinds of MUL_MAT to compute:
+ * mul_mat_f32:     both src0 and src1 are F32.
+ * mul_mat_f16_f32: src0 is F16 and src1 is F32.
+ * mul_mat_q_f32:   src0 is quantized (Q4_0, Q4_1, ...), and src1 is F32.
+*/
+static void ggml_qnn_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    Qnn_ErrorHandle_t error = QNN_SUCCESS;
+    bool graph_initialized = false;
+    int64_t n_begin_time = 0LL;
+    int64_t n_end_time = 0LL;
+    int64_t n_duration = 0LL;
+
+    qnn_instance * instance = nullptr;
+    struct ggml_backend_qnn_context * ctx = nullptr;
+
+    std::string graph_name = "ggml_op_qnn_mul_mat";
+    Qnn_GraphHandle_t graph_handle = nullptr;
+    Qnn_Tensor_t * tensor_0 = nullptr;
+    Qnn_Tensor_t * tensor_1 = nullptr;
+    Qnn_Tensor_t * tensor_2 = nullptr;
+
+    Qnn_Param_t qnn_params[] = {};
+
+    enum ggml_op ggmlop = GGML_OP_MUL_MAT; // graph-cache key; GGML_OP_ADD here would collide with ggml_qnn_add's cached graph
+    Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32;
+    Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32;
+    Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32;
+
+    if ((nullptr == src0) || (nullptr == src1) || (nullptr == dst)) {
+        QNN_LOG_WARN("pls check why GGML tensor is null");
+        return;
+    }
+    tensor_0 = (Qnn_Tensor_t *)src0->extra;
+    tensor_1 = (Qnn_Tensor_t *)src1->extra;
+    tensor_2 = (Qnn_Tensor_t *)dst->extra;
+    if ((nullptr == tensor_0) || (nullptr == tensor_1) || (nullptr == tensor_2)) {
+        QNN_LOG_WARN("pls check why QNN tensor is null");
+        return;
+    }
+    ctx = (struct ggml_backend_qnn_context *)g_qnn_backend->context;
+    if (nullptr == ctx) {
+        QNN_LOG_WARN("pls check why backend ctx is null");
+        return;
+    }
+    instance = ctx->instance;
+    if (nullptr == instance) {
+        QNN_LOG_WARN("pls check why qnn instance is null");
+        return;
+    }
+    QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface;
+
+    n_begin_time = ggml_time_us();
+#if 1
+    QNN_LOG_DEBUG("call %s\n", __func__);
+    QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
+                  src0->name,
+                  src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2],
+                  src0->nb[0], src0->nb[1], src0->nb[2]);
+    QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
+                  src1->name,
+                  src1->type,
ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], + src1->nb[0], src1->nb[1], src1->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + dst->name, + dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], + dst->nb[1], dst->nb[2]); + QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); + QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(tensor_0)); + QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(tensor_1)); + QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(tensor_2)); +#endif + + QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; + QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; + QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; + + src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type); + src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); + dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type); + + std::string map_entry = std::string(ggml_op_name(ggmlop)); + if (instance->_qnn_graph_map.find(map_entry) != instance->_qnn_graph_map.end()) { + graph_initialized = true; + auto & graph_item = instance->_qnn_graph_map[map_entry]; + graph_handle = std::get<0>(graph_item); + } + + uint32_t * tensor_0_dimensions = QNN_VER_PTR(*tensor_0)->dimensions; + uint32_t * tensor_1_dimensions = QNN_VER_PTR(*tensor_1)->dimensions; + uint32_t * tensor_2_dimensions = QNN_VER_PTR(*tensor_2)->dimensions; + + if (!graph_initialized) { + graph_name = graph_name + "_" + std::to_string(ctx->threads) + src0->name + "_" + src1->name; + QNN_LOG_DEBUG("graph name %s", graph_name.c_str()); + error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), graph_name.c_str(), nullptr, &graph_handle); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("can't create qnn graph handle with graph name %s, error = %d\n", graph_name.c_str(), error); + return; + } + error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_0); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_1); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_2); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + + Qnn_Tensor_t tensor_inputs[] = { + *tensor_0, + *tensor_1 + }; + Qnn_Tensor_t tensor_outputs[] = { + *tensor_2 + }; + Qnn_OpConfig_t op_config = { + (Qnn_OpConfigVersion_t) 1, .v1 = { + "ggml_op_mul_mat", + QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_MAT_MUL, + 0, + qnn_params, + 2, + tensor_inputs, + 1, + tensor_outputs + } + }; + error = qnn_raw_interface.graphAddNode(graph_handle, op_config); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, 
tensor_2); + instance->_qnn_graph_map[map_entry] = graph_item; + } else { + auto & graph_item = instance->_qnn_graph_map[map_entry]; + graph_handle = std::get<0>(graph_item); + tensor_0 = std::get<1>(graph_item); + tensor_1 = std::get<2>(graph_item); + tensor_2 = std::get<3>(graph_item); + + QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); + uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], + (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; + uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], + (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; + uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], + (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; + QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; + QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); + QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; + QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; + QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); + QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; + QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; + QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); + QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; + + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + + Qnn_Tensor_t tensor_inputs[] = { + *tensor_0, + *tensor_1 + }; + Qnn_Tensor_t tensor_outputs[] = { + *tensor_2 + }; + error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + } + QNN_VER_PTR(*tensor_0)->dimensions = tensor_0_dimensions; + QNN_VER_PTR(*tensor_1)->dimensions = tensor_1_dimensions; + QNN_VER_PTR(*tensor_2)->dimensions = tensor_2_dimensions; + n_end_time = ggml_time_us(); + n_duration = (n_end_time - n_begin_time) / 1000; + QNN_LOG_DEBUG("duration of ggml_qnn_mul_mat : %lld milliseconds\n", n_duration); + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +//common function for GGML OPs using QNN API +static void ggml_qnn_hanlde_op(const enum ggml_op ggmlop, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + bool graph_initialized = false; + int64_t n_begin_time = 0LL; + int64_t n_end_time = 0LL; + int64_t n_duration = 0LL; + + qnn_instance * instance = nullptr; + struct ggml_backend_qnn_context * ctx = nullptr; + + std::string qnn_graph_name = "ggml_qnn_graph"; + std::string qnn_op_config_name = "ggml_qnn_op_config"; + const char * qnn_op_name = nullptr; + Qnn_GraphHandle_t graph_handle = nullptr; + Qnn_Tensor_t * tensor_0 = nullptr; + Qnn_Tensor_t * tensor_1 = nullptr; + Qnn_Tensor_t * tensor_2 = nullptr; + + Qnn_Param_t qnn_params[] = {}; + + Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32; + + if ((nullptr == src0) || (nullptr == src1) || (nullptr == dst)) { + QNN_LOG_WARN("pls check why GGML tensor is null"); + return; + } + tensor_0 = (Qnn_Tensor_t *)src0->extra; + tensor_1 = (Qnn_Tensor_t *)src1->extra; + tensor_2 = (Qnn_Tensor_t *)dst->extra; + if ((nullptr == tensor_0) || (nullptr == tensor_1) || (nullptr == tensor_2)) { + QNN_LOG_WARN("pls check why 
QNN tensor is null"); + return; + } + ctx = (struct ggml_backend_qnn_context *)g_qnn_backend->context; + if (nullptr == ctx) { + QNN_LOG_WARN("pls check why backend ctx is null"); + return; + } + instance = ctx->instance; + if (nullptr == instance) { + QNN_LOG_WARN("pls check why qnn instance is null"); + return; + } + QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; + + src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type); + src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); + dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type); + qnn_op_name = qnn_opname_from_ggmlop(ggmlop); + if (nullptr == qnn_op_name) { + QNN_LOG_WARN("pls check why can not get QNN OP name with ggml op %d(%s)", ggmlop, ggml_op_name(ggmlop)); + return; + } + + n_begin_time = ggml_time_us(); +#if 1 + QNN_LOG_DEBUG("call %s\n", __func__); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src0->name, + src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], + src0->nb[0], src0->nb[1], src0->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src1->name, + src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], + src1->nb[0], src1->nb[1], src1->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + dst->name, + dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], + dst->nb[1], dst->nb[2]); + QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); + QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(tensor_0)); + QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(tensor_1)); + QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(tensor_2)); +#endif + + QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; + QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; + QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; + + std::string map_entry = std::string(ggml_op_name(ggmlop)); + if (instance->_qnn_graph_map.find(map_entry) != instance->_qnn_graph_map.end()) { + graph_initialized = true; + auto & graph_item = instance->_qnn_graph_map[map_entry]; + graph_handle = std::get<0>(graph_item); + } + + uint32_t * tensor_0_dimensions = QNN_VER_PTR(*tensor_0)->dimensions; + uint32_t * tensor_1_dimensions = QNN_VER_PTR(*tensor_1)->dimensions; + uint32_t * tensor_2_dimensions = QNN_VER_PTR(*tensor_2)->dimensions; + + if (!graph_initialized) { + qnn_graph_name = qnn_graph_name + "_" + ggml_op_name(ggmlop) + std::to_string(ctx->threads) + src0->name + "_" + src1->name; + qnn_op_config_name = qnn_op_config_name + "_" + ggml_op_name(ggmlop) + std::to_string(ctx->threads) + src0->name + "_" + src1->name; + QNN_LOG_DEBUG("qnn graph name %s", qnn_graph_name.c_str()); + QNN_LOG_DEBUG("qnn op_config name %s", qnn_op_config_name.c_str()); + error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), qnn_graph_name.c_str(), nullptr, &graph_handle); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("can't create qnn graph handle with ggml op %s, graph name %s, error = %d\n", ggml_op_name(ggmlop), qnn_graph_name.c_str(), error); + return; + } + + error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_0); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_1); + if 
(QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_2); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + + Qnn_Tensor_t tensor_inputs[] = { + *tensor_0, + *tensor_1 + }; + Qnn_Tensor_t tensor_outputs[] = { + *tensor_2 + }; + Qnn_OpConfig_t op_config = { + (Qnn_OpConfigVersion_t) 1, .v1 = { + qnn_op_config_name.c_str(), + QNN_OP_PACKAGE_NAME_QTI_AISW, + qnn_op_name, + 0, + qnn_params, + 2, + tensor_inputs, + 1, + tensor_outputs + } + }; + error = qnn_raw_interface.graphAddNode(graph_handle, op_config); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2); + instance->_qnn_graph_map[map_entry] = graph_item; + } else { + auto & graph_item = instance->_qnn_graph_map[map_entry]; + graph_handle = std::get<0>(graph_item); + tensor_0 = std::get<1>(graph_item); + tensor_1 = std::get<2>(graph_item); + tensor_2 = std::get<3>(graph_item); + + QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); + uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], + (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; + uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], + (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; + uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], + (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; + QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; + QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); + QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; + QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; + QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); + QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; + QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; + QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); + QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; + + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + + Qnn_Tensor_t tensor_inputs[] = { + *tensor_0, + *tensor_1 + }; + Qnn_Tensor_t tensor_outputs[] = { + *tensor_2 + }; + error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + } + QNN_VER_PTR(*tensor_0)->dimensions = tensor_0_dimensions; + QNN_VER_PTR(*tensor_1)->dimensions = tensor_1_dimensions; + QNN_VER_PTR(*tensor_2)->dimensions = tensor_2_dimensions; + n_end_time = ggml_time_us(); + n_duration = (n_end_time - n_begin_time) / 1000; + QNN_LOG_DEBUG("duration of ggml_qnn_%s : %lld milliseconds\n", ggml_op_name(ggmlop), 
n_duration); + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_repeat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_get_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_acc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_div(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_gelu_quick(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_tanh(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_hardsigmoid(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_hardswish(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_leaky_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_sqr(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_group_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_concat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_upscale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_pad(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + 
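+    //TODO: GGML_OP_PAD is not mapped to a QNN op yet; like the other placeholders in this section, this stub only logs the call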
QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_qnn_cpy(src0, dst, nullptr); + (void) src1; +} + + +static void ggml_qnn_mul_mat_id(const ggml_tensor * src0, + const ggml_tensor * src1, + ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_clamp(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous(src0)); + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); + +} + + +static void ggml_qnn_pool2d(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_im2col(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_sum_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous(src0)); + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_argsort(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous(src0)); + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + (void) src0; + (void) src1; + (void) dst; + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +bool ggml_qnn_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) { + ggml_qnn_func_t func = nullptr; + ggml_qnn_func_common_t func_common = nullptr; + + switch (tensor->op) { + case GGML_OP_ADD: + func = ggml_qnn_add; + break; + + case GGML_OP_MUL: + func_common = ggml_qnn_hanlde_op; + break; + + case GGML_OP_MUL_MAT: + func = ggml_qnn_mul_mat; + break; + + case GGML_OP_REPEAT: + func = ggml_qnn_repeat; + break; + case GGML_OP_GET_ROWS: + func = ggml_qnn_get_rows; + break; + case GGML_OP_DUP: + func = ggml_qnn_dup; + break; + + case GGML_OP_ACC: + func = ggml_qnn_acc; + break; + 
+ case GGML_OP_DIV: + func = ggml_qnn_div; + break; + + case GGML_OP_UNARY: + switch (ggml_get_unary_op(tensor)) { + case GGML_UNARY_OP_GELU: + func = ggml_qnn_gelu; + break; + case GGML_UNARY_OP_SILU: + func = ggml_qnn_silu; + break; + case GGML_UNARY_OP_GELU_QUICK: + func = ggml_qnn_gelu_quick; + break; + case GGML_UNARY_OP_TANH: + func = ggml_qnn_tanh; + break; + case GGML_UNARY_OP_RELU: + func = ggml_qnn_relu; + break; + case GGML_UNARY_OP_HARDSIGMOID: + func = ggml_qnn_hardsigmoid; + break; + case GGML_UNARY_OP_HARDSWISH: + func = ggml_qnn_hardswish; + break; + default: + return false; + } + break; + case GGML_OP_NORM: + func = ggml_qnn_norm; + break; + case GGML_OP_GROUP_NORM: + func = ggml_qnn_group_norm; + break; + case GGML_OP_CONCAT: + func = ggml_qnn_concat; + break; + case GGML_OP_UPSCALE: + func = ggml_qnn_upscale; + break; + case GGML_OP_PAD: + func = ggml_qnn_pad; + break; + case GGML_OP_LEAKY_RELU: + func = ggml_qnn_leaky_relu; + break; + case GGML_OP_RMS_NORM: + func = ggml_qnn_rms_norm; + break; + case GGML_OP_MUL_MAT_ID: + func = ggml_qnn_mul_mat_id; + break; + case GGML_OP_SCALE: + func = ggml_qnn_scale; + break; + case GGML_OP_SQR: + func = ggml_qnn_sqr; + break; + case GGML_OP_CLAMP: + func = ggml_qnn_clamp; + break; + case GGML_OP_CPY: + func = ggml_qnn_cpy; + break; + case GGML_OP_CONT: + func = ggml_qnn_dup; + break; + case GGML_OP_NONE: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + func = ggml_qnn_nop; + break; + case GGML_OP_DIAG_MASK_INF: + func = ggml_qnn_diag_mask_inf; + break; + case GGML_OP_SOFT_MAX: + func = ggml_qnn_soft_max; + break; + case GGML_OP_ROPE: + func = ggml_qnn_rope; + break; + case GGML_OP_IM2COL: + func = ggml_qnn_im2col; + break; + case GGML_OP_POOL_2D: + func = ggml_qnn_pool2d; + break; + case GGML_OP_SUM_ROWS: + func = ggml_qnn_sum_rows; + break; + case GGML_OP_ARGSORT: + func = ggml_qnn_argsort; + break; + default: + return false; + } + + if (nullptr != func) + func(tensor->src[0], tensor->src[1], tensor); + + if (nullptr != func_common) + func_common(tensor->op, tensor->src[0], tensor->src[1], tensor); + + return true; +} + + +struct ggml_backend_qnn_buffer_context { + ~ggml_backend_qnn_buffer_context() { + if (buffer) { + free(buffer); + } + + for (auto * sub_buffer : sub_buffers) { + free(sub_buffer); + } + + for (auto * qnn_tensor : qnn_tensors) { + free_qnn_tensor(*qnn_tensor); + free(qnn_tensor); + } + + sub_buffers.clear(); + qnn_tensors.clear(); + } + void * buffer = nullptr; + + struct ggml_backend_qnn_context * backend_ctx = nullptr; + + size_t buffer_size = 0; + std::vector sub_buffers; + std::vector qnn_tensors; +}; + + +static const char * ggml_backend_qnn_buffer_get_name(ggml_backend_buffer_t buffer) { + GGML_UNUSED(buffer); + return "QNN"; +} + + +[[maybe_unused]] GGML_CALL static bool ggml_backend_buffer_is_qnn(ggml_backend_buffer_t buffer) { + return buffer->iface.get_name == ggml_backend_qnn_buffer_get_name; +} + + +GGML_CALL static void ggml_backend_qnn_buffer_free_buffer(ggml_backend_buffer_t buffer) { + ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *) buffer->context; + delete ctx; +} + + +GGML_CALL static void * ggml_backend_qnn_buffer_get_base(ggml_backend_buffer_t buffer) { + ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *) buffer->context; + + return ctx->buffer; +} + + +GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { + Qnn_ErrorHandle_t error = 
QNN_SUCCESS; + ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *)buffer->context; + + static int idx = 0; + char tensor_name[GGML_MAX_NAME] = { 0 }; + snprintf(tensor_name, GGML_MAX_NAME, "tensor_%2d", idx++); + + uint32_t dimensions[] = {(uint32_t) tensor->ne[0], (uint32_t) tensor->ne[1], (uint32_t) tensor->ne[2], (uint32_t) tensor->ne[3]}; + //TODO:only support FP32 & FP16 + Qnn_DataType_t qnn_data_type = QNN_DATATYPE_FLOAT_32; + Qnn_TensorType_t qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; + + if (tensor->flags & GGML_TENSOR_FLAG_INPUT) { + qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; + } else if (tensor->flags & GGML_TENSOR_FLAG_OUTPUT) { + qnn_tensor_type = QNN_TENSOR_TYPE_APP_READ; + } + Qnn_Tensor_t qnn_tensor = { + .version= QNN_TENSOR_VERSION_1, + {.v1= { + .id = 0, + .name = tensor_name, + .type = qnn_tensor_type, + .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, + .dataType = qnn_data_type, + .quantizeParams = {QNN_DEFINITION_UNDEFINED, + QNN_QUANTIZATION_ENCODING_UNDEFINED, + {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, .offset = 0}}}, + .rank = ggml_get_tensor_rank(tensor), + .dimensions = dimensions, + .memType = QNN_TENSORMEMTYPE_RAW, + {.clientBuf = {.data = nullptr, + .dataSize = 0}}}} + }; + Qnn_Tensor_t * p_qnn_tensor = (Qnn_Tensor_t *)calloc(1, sizeof(Qnn_Tensor_t)); + if (nullptr == p_qnn_tensor) { + QNN_LOG_WARN("calloc failed"); + return; + } + error = deep_copy_qnn_tensors(qnn_tensor, *p_qnn_tensor); + if (error != QNN_SUCCESS) { + free(p_qnn_tensor); + QNN_LOG_DEBUG("init tensor failed"); + return; + } + tensor->extra = p_qnn_tensor; + ctx->qnn_tensors.push_back(p_qnn_tensor); +} + + +GGML_CALL static void ggml_backend_qnn_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + GGML_UNUSED(buffer); + + memcpy((char *)tensor->data + offset, data, size); +} + + +GGML_CALL static void ggml_backend_qnn_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { + GGML_UNUSED(buffer); + memcpy(data, (const char *)tensor->data + offset, size); +} + + +GGML_CALL static bool ggml_backend_qnn_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) { + GGML_UNUSED(buffer); + if (ggml_backend_buffer_is_host(src->buffer)) { + memcpy(dst->data, src->data, ggml_nbytes(src)); + return true; + } + + return false; +} + + +GGML_CALL static void ggml_backend_qnn_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *) buffer->context; + + memset(ctx->buffer, value, ctx->buffer_size); +} + + +[[maybe_unused]] GGML_CALL static void ggml_backend_qnn_buffer_reset(ggml_backend_buffer_t buffer) { + ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *) buffer->context; + for (auto * sub_buffer : ctx->sub_buffers) { + free(sub_buffer); + } + ctx->sub_buffers.clear(); +} + + +static ggml_backend_buffer_i ggml_backend_qnn_buffer_interface = { + /* .get_name = */ ggml_backend_qnn_buffer_get_name, + /* .free_buffer = */ ggml_backend_qnn_buffer_free_buffer, + /* .get_base = */ ggml_backend_qnn_buffer_get_base, + /* .init_tensor = */ ggml_backend_qnn_buffer_init_tensor, + /* .set_tensor = */ ggml_backend_qnn_buffer_set_tensor, + /* .get_tensor = */ ggml_backend_qnn_buffer_get_tensor, + /* .cpy_tensor = */ ggml_backend_qnn_buffer_cpy_tensor, + /* .clear = */ ggml_backend_qnn_buffer_clear, + 
/* .reset = */ nullptr, +}; + + +GGML_CALL static const char * ggml_backend_qnn_buffer_type_name(ggml_backend_buffer_type_t buft) { + return "QNN"; +} + + +static void * ggml_qnn_host_malloc(size_t n) { + void * data = nullptr; + const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n); + if (result != 0) { + QNN_LOG_WARN("%s: error: posix_memalign failed\n", __func__); + return nullptr; + } + + return data; +} + + +GGML_CALL static ggml_backend_buffer_t ggml_backend_qnn_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + ggml_backend_qnn_buffer_context * ctx = new ggml_backend_qnn_buffer_context; + + const size_t size_page = sysconf(_SC_PAGESIZE); + + size_t size_aligned = size; + if ((size_aligned % size_page) != 0) { + size_aligned += (size_page - (size_aligned % size_page)); + } + + //TODO:use pre-allocated buffer in internal memory pool + ctx->buffer = ggml_qnn_host_malloc(size_aligned); + ctx->buffer_size = size_aligned; + + ctx->backend_ctx = &g_qnn_mgr[g_current_device]; + + if (nullptr == ctx->buffer) { + QNN_LOG_WARN("%s: failed to allocate %.2f MiB\n", __func__, size / (1 << 20)); + return nullptr; + } + + return ggml_backend_buffer_init(buft, ggml_backend_qnn_buffer_interface, ctx, size); +} + + +GGML_CALL static size_t ggml_backend_qnn_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + GGML_UNUSED(buft); + return 32; +} + + +//TODO: this value is an experimental value, works fine with whisper/llm/minicpm-v inference on Android +GGML_CALL static size_t ggml_backend_qnn_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { + GGML_UNUSED(buft); + + return (96 * 1024 * 1024); +} + + +GGML_CALL static bool ggml_backend_qnn_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, + ggml_backend_t backend) { + GGML_UNUSED(buft); + + return ggml_backend_is_qnn(backend) || ggml_backend_is_cpu(backend); +} + + +GGML_CALL static bool ggml_backend_qnn_buffer_is_host(ggml_backend_buffer_type_t buft) { + GGML_UNUSED(buft); + return true; +} + + +GGML_CALL static const char * ggml_backend_qnn_name(ggml_backend_t backend) { + return "QNN"; +} + + +GGML_CALL static void ggml_backend_qnn_free(ggml_backend_t backend) { + QNN_LOG_INFO("enter %s", __func__ ); + ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; + QNN_LOG_DEBUG("idx %d, name:%s", ctx->device, g_qnn_mgr[ctx->device].name); + + qnn_instance * instance = (qnn_instance*)g_qnn_mgr[ctx->device].instance; + if (instance != nullptr) { + std::map>::iterator graph_it; + for (graph_it = instance->_qnn_graph_map.begin(); graph_it != instance->_qnn_graph_map.end(); graph_it++) { + auto & graph_item = graph_it->second; + Qnn_GraphHandle_t & graph_handle = std::get<0>(graph_item); + GGML_UNUSED(graph_handle); + QNN_LOG_DEBUG("graph type:%s", graph_it->first.c_str()); + } + instance->_qnn_graph_map.clear(); + + instance->qnn_finalize(); + delete instance; + g_qnn_mgr[ctx->device].instance = nullptr; + } + + if (g_qnn_mgr[ctx->device].backend != nullptr) { + delete backend; + g_qnn_backend = nullptr; + g_qnn_mgr[ctx->device].backend = nullptr; + } + QNN_LOG_INFO("leave %s", __func__ ); +} + + +GGML_CALL static ggml_backend_buffer_type_t ggml_backend_qnn_get_default_buffer_type(ggml_backend_t backend) { + ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; + + return ggml_backend_qnn_buffer_type(ctx->device); +} + + +GGML_CALL static ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { + 
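+    // walk the compute graph node by node: empty nodes and view-like ops (RESHAPE / VIEW / PERMUTE / TRANSPOSE / NONE) are skipped,
+    // every other node is dispatched through ggml_qnn_compute_forward; an unsupported op is only logged and the overall status stays GGML_STATUS_SUCCESS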
enum ggml_status result = GGML_STATUS_SUCCESS; + ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; + GGML_UNUSED(ctx); + + ggml_compute_params params = {}; + params.type = GGML_TASK_TYPE_COMPUTE; + params.ith = 0; + for (int i = 0; i < cgraph->n_nodes; i++) { + ggml_tensor * node = cgraph->nodes[i]; + if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { + continue; + } + bool ok = ggml_qnn_compute_forward(¶ms, node); + if (!ok) { + QNN_LOG_DEBUG("%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); + } + } + + return result; +} + + +GGML_CALL static bool ggml_backend_qnn_supports_op(ggml_backend_t backend, const ggml_tensor * op) { + GGML_UNUSED(backend); + + return (ggml_qnn_can_handle_op(op, true)); +} + + +//note: this function be used with proposal/refined ggml backend subsystem in this PR: +// https://github.com/ggerganov/llama.cpp/pull/7641 +// new ggml backend(only using system memory: ggml_backend_xxx_buffer_is_host return true) +// can following this style for mixed inference between CPU&GPU / CPU&NPU very easily +GGML_CALL static bool ggml_backend_qnn_offload_op(ggml_backend_t backend, const ggml_tensor * tensor) { + GGML_UNUSED(backend); + + return ggml_qnn_compute_forward(nullptr, (ggml_tensor*)tensor); +} + + +static ggml_backend_i ggml_backend_qnn_interface = { + /* .get_name = */ ggml_backend_qnn_name, + /* .free = */ ggml_backend_qnn_free, + /* .get_default_buffer_type = */ ggml_backend_qnn_get_default_buffer_type, + /* .set_tensor_async = */ nullptr, + /* .get_tensor_async = */ nullptr, + /* .cpy_tensor_async = */ nullptr, + /* .synchronize = */ nullptr, + /* .graph_plan_create = */ nullptr, + /* .graph_plan_free = */ nullptr, + /* .graph_plan_compute = */ nullptr, + /* .graph_compute = */ ggml_backend_qnn_graph_compute, + /* .supports_op = */ ggml_backend_qnn_supports_op, + /* .offload_op = */ ggml_backend_qnn_offload_op, + /* .event_new = */ nullptr, + /* .event_free = */ nullptr, + /* .event_record = */ nullptr, + /* .event_wait = */ nullptr, + /* .event_synchronize = */ nullptr, +}; + + +static ggml_guid_t ggml_backend_qnn_guid() { + static ggml_guid guid = {0x1a, 0x2b, 0x3c, 0x4d, 0x5e, 0x6f, 0x70, 0x81, 0x92, 0xa3, 0xb4, 0xc5, + 0xd6, 0xe7, 0xf8, 0x09}; + return &guid; +} + + +static ggml_backend_t ggml_backend_qnn_reg_init(const char * params, void * user_data) { + if (nullptr == params) { + //QNN library path + //can be hardcoded to "/data/local/tmp/" for Android command line application + //or specified in JNI layer for Android APK + params = "/data/local/tmp/"; + } + ggml_backend_t qnn_backend = ggml_backend_qnn_init((int) (intptr_t) user_data, params); + + return qnn_backend; +} + + +bool ggml_backend_is_qnn(ggml_backend_t backend) { + return backend != nullptr && ggml_guid_matches(backend->guid, ggml_backend_qnn_guid()); +} + + +void ggml_backend_qnn_set_n_threads(ggml_backend_t backend, int n_threads) { + GGML_ASSERT(ggml_backend_is_qnn(backend)); + + struct ggml_backend_qnn_context * ctx = (struct ggml_backend_qnn_context *)backend->context; + ctx->threads = n_threads; +} + + +const char * ggml_backend_qnn_get_name(ggml_backend_t backend) { + return backend->iface.get_name(backend); +} + + +int ggml_backend_qnn_get_device_count() { + return GGML_QNN_MAX_DEVICES; +} + + +void ggml_backend_qnn_get_device_description(size_t dev_num, char * description, size_t 
description_size) { + if (nullptr == description || 0 == description_size) { + QNN_LOG_WARN("invalid param"); + return; + } + + if (dev_num >= GGML_QNN_MAX_DEVICES) { + QNN_LOG_WARN("invalid param"); + return; + } + + snprintf(description, description_size, "%s", g_qnn_mgr[dev_num].name); + QNN_LOG_DEBUG("description:%s", description); +} + + +ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t device_index) { + if (device_index >= GGML_QNN_MAX_DEVICES) { + QNN_LOG_DEBUG("ggml_backend_qnn_buffer_type error: device_index:%d is out of range [0, %d]\n", + device_index, GGML_QNN_MAX_DEVICES - 1); + return nullptr; + } + + static struct ggml_backend_buffer_type ggml_backend_buffer_type_qnn = { + /* .iface = */ { + /* .get_name = */ ggml_backend_qnn_buffer_type_name, + /* .alloc_buffer = */ ggml_backend_qnn_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_qnn_buffer_type_get_alignment, + /* .get_max_size = */ ggml_backend_qnn_buffer_type_get_max_size, + /* .get_alloc_size = */ nullptr,// defaults to ggml_nbytes + /* .supports_backend = */ ggml_backend_qnn_buffer_type_supports_backend, + /* .is_host = */ ggml_backend_qnn_buffer_is_host + }, + /* .context = */ nullptr, + }; + + return &ggml_backend_buffer_type_qnn; +} + + +/** + * + * @param device 0: QNN_BACKEND_CPU 1: QNN_BACKEND_GPU 2: QNN_BACKEND_NPU(aka HTP/DSP) + * @param qnn_lib_path qnn library path, such as "/data/local/tmp/" on Android or specified in JNI layer + * @return + */ +ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { + int result = 0; + + if (nullptr == qnn_lib_path) + return nullptr; + + QNN_LOG_DEBUG("device %d", device); + QNN_LOG_DEBUG("qnn_lib_path %s", qnn_lib_path); + if (device >= GGML_QNN_MAX_DEVICES) { + QNN_LOG_ERROR("invalid device %d", device); + return nullptr; + } + + if (nullptr != g_qnn_mgr[device].backend) { + QNN_LOG_ERROR("qnn backend %d(%s) already loaded", device, get_qnn_backend_name(device)); + if (device == g_current_device) { + g_qnn_backend = g_qnn_mgr[device].backend; + QNN_LOG_INFO("re-use cached backend %d(%s)", device, get_qnn_backend_name(device)); + return g_qnn_mgr[device].backend; + } else { + QNN_LOG_INFO("delete previous backend %d(%s)", device, get_qnn_backend_name(device)); + ggml_backend_qnn_free(g_qnn_backend); + } + } + + std::string path = qnn_lib_path; + if (QNN_BACKEND_NPU == device) { + if (0 == setenv("LD_LIBRARY_PATH", + (path + + ":/vendor/dsp/cdsp:/vendor/lib64:/vendor/dsp/dsp:/vendor/dsp/images").c_str(), + 1)) { + QNN_LOG_INFO("QNN DSP backend setenv successfully"); + } else { + QNN_LOG_ERROR("QNN DSP backend setenv failure"); + } + if (0 == setenv("ADSP_LIBRARY_PATH", + (path + + ";/vendor/dsp/cdsp;/vendor/lib/rfsa/adsp;/system/lib/rfsa/adsp;/vendor/dsp/dsp;/vendor/dsp/images;/dsp").c_str(), + 1)) { + QNN_LOG_INFO("QNN DSP backend setenv successfully"); + } else { + QNN_LOG_ERROR("QNN DSP backend setenv failure"); + } + } else { + if (0 == setenv("LD_LIBRARY_PATH", + (path + + ":/vendor/dsp/cdsp:/vendor/lib64:/vendor/dsp/dsp:/vendor/dsp/images").c_str(), + 1)) { + QNN_LOG_INFO("%s backend setenv successfully\n", get_qnn_backend_name(device)); + } else { + QNN_LOG_ERROR("%s backend setenv failure\n", get_qnn_backend_name(device)); + } + } + + qnn_instance * instance = nullptr; + instance = new qnn_instance(qnn_lib_path, g_qnn_mgr[device].lib, ""); + result = instance->qnn_init(nullptr); + if (0 != result) { + QNN_LOG_WARN("init qnn subsystem failed with qnn backend %s, pls check why\n", get_qnn_backend_name(device)); + 
delete instance; + return nullptr; + } + qnn_interface qnn_interface = instance->get_qnn_interface(); + if (!qnn_interface.is_loaded()) { + QNN_LOG_WARN("qnn subsystem failure\n"); + delete instance; + return nullptr; + } + + std::string device_name = get_qnn_backend_name(device); + QNN_LOG_INFO("qnn device name %s", device_name.c_str()); + instance->init_qnn_graph(device_name.c_str(), false); + g_qnn_mgr[device].instance = instance; + g_qnn_mgr[device].raw_interface = instance->get_qnn_raw_interface(); + g_qnn_mgr[device].raw_system_interface = instance->get_qnn_raw_system_interface(); + + ggml_backend_t qnn_backend = new ggml_backend{ + /* .guid = */ ggml_backend_qnn_guid(), + /* .iface = */ ggml_backend_qnn_interface, + /* .context = */ &g_qnn_mgr[device] + }; + g_qnn_mgr[device].backend = qnn_backend; + g_qnn_backend = g_qnn_mgr[device].backend; + g_current_device = device; + + return qnn_backend; +} + + +extern "C" GGML_CALL int ggml_backend_qnn_reg_devices(void); + +GGML_CALL int ggml_backend_qnn_reg_devices() { + for (size_t idx = 0; idx < GGML_QNN_MAX_DEVICES; idx++) { + char name[GGML_MAX_NAME]; + ggml_backend_qnn_get_device_description(idx, name, GGML_MAX_NAME); + ggml_backend_register(name, ggml_backend_qnn_reg_init, ggml_backend_qnn_buffer_type(idx), + (void *) (intptr_t)idx); + } + + return GGML_QNN_MAX_DEVICES; +} diff --git a/ggml-qnn.h b/ggml-qnn.h new file mode 100644 index 0000000000000..c61ebd25d9ba6 --- /dev/null +++ b/ggml-qnn.h @@ -0,0 +1,43 @@ +#pragma once + +#include "ggml.h" +#include "ggml-backend.h" + +#ifdef __cplusplus +extern "C" { +#endif + + +#define GGML_QNN_MAX_DEVICES 3 + +//QNN cDSP and HTA backend would not be used currently, just focus on QNN CPU/GPU/NPU(aka HTP/DSP) backend currently +enum QNNBackend { + QNN_BACKEND_CPU, + QNN_BACKEND_GPU, + QNN_BACKEND_NPU, + QNN_BACKEND_GGML, //"fake" QNN backend just for compare performance between QNN and original GGML +}; + +GGML_API int ggml_backend_qnn_reg_devices(void); + +/** + * + * @param device 0: QNN_BACKEND_CPU 1: QNN_BACKEND_GPU 2: QNN_BACKEND_NPU(aka HTP/DSP) + * @param qnn_lib_path qnn library path, such as "/data/local/tmp/" on Android or specified in JNI layer + * @return + */ +GGML_API ggml_backend_t ggml_backend_qnn_init(size_t dev_num, const char * qnn_lib_path); + +GGML_API bool ggml_backend_is_qnn(ggml_backend_t backend); + +GGML_API void ggml_backend_qnn_set_n_threads(ggml_backend_t backend, int thread_counts); + +GGML_API int ggml_backend_qnn_get_device_count(void); + +GGML_API void ggml_backend_qnn_get_device_description(size_t dev_num, char * description, size_t description_size); + +GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t dev_num); + +#ifdef __cplusplus +} +#endif diff --git a/llama.cpp b/llama.cpp index 06889126ecdc4..42a9cb2a44981 100644 --- a/llama.cpp +++ b/llama.cpp @@ -19,6 +19,8 @@ # include "ggml-sycl.h" #elif defined(GGML_USE_KOMPUTE) # include "ggml-kompute.h" +#elif defined(GGML_USE_QNN) +# include "ggml-qnn.h" #endif #ifdef GGML_USE_METAL @@ -2377,6 +2379,8 @@ static size_t llama_get_device_count(const llama_model & model) { count = ggml_backend_sycl_get_device_count(); #elif defined(GGML_USE_VULKAN) count = ggml_backend_vk_get_device_count(); +#elif defined(GGML_USE_QNN) + count = ggml_backend_qnn_get_device_count(); #endif #if defined(GGML_USE_RPC) count += model.rpc_servers.size(); @@ -2409,6 +2413,8 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_ if (buft == nullptr) { LLAMA_LOG_WARN("%s: cannot 
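+# build helper for the ggml-qnn test program: cross-compiles it with the Android NDK against the Qualcomm QNN SDK;
+# adjust QNN_SDK_PATH below to match the local QNN SDK installation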
use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu); } +#elif defined(GGML_USE_QNN) + buft = ggml_backend_qnn_buffer_type(gpu); #endif if (buft == nullptr) { @@ -15899,6 +15905,8 @@ size_t llama_max_devices(void) { return GGML_SYCL_MAX_DEVICES; #elif defined(GGML_USE_VULKAN) return GGML_VK_MAX_DEVICES; +#elif defined(GGML_USE_QNN) + return GGML_QNN_MAX_DEVICES; #else return 1; #endif @@ -15914,7 +15922,7 @@ bool llama_supports_mlock(void) { bool llama_supports_gpu_offload(void) { #if defined(GGML_USE_CUDA) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \ - defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_RPC) + defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_RPC) || defined(GGML_USE_QNN) // Defined when llama.cpp is compiled with support for offloading model layers to GPU. return true; #else @@ -16225,6 +16233,19 @@ struct llama_context * llama_new_context_with_model( } ctx->backends.push_back(backend); } +#elif defined(GGML_USE_QNN) + if (model->n_gpu_layers > 0) { + //the second param is data path of prebuit QNN libs provided by Qualcomm + //can be hardcoded to "/data/local/tmp/" for Android command line application + //or specified in JNI layer for Android APK application + ggml_backend_t backend = ggml_backend_qnn_init(model->main_gpu, "/data/local/tmp/"); + if (nullptr == backend) { + LLAMA_LOG_ERROR("%s: failed to initialize QNN backend\n", __func__); + llama_free(ctx); + return nullptr; + } + ctx->backends.push_back(backend); + } #endif #if defined(GGML_USE_RPC) if (model->n_gpu_layers > 0) { diff --git a/tests/ggml-qnn/CMakeLists.txt b/tests/ggml-qnn/CMakeLists.txt new file mode 100644 index 0000000000000..15ad7be6f6c88 --- /dev/null +++ b/tests/ggml-qnn/CMakeLists.txt @@ -0,0 +1,60 @@ +cmake_minimum_required(VERSION 3.22.1) +project(ggml-qnn-test) + +set(CMAKE_VERBOSE_MAKEFILE on) +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_POSITION_INDEPENDENT_CODE ON) + +#set to ON if target Android phone is based on Qualcomm Snapdragon 8 Gen 3 +set(TARGET_SNAPDRAGON_8_GEN3 OFF) + +set(QNN_INC_PATH ${QNN_SDK_PATH}/include/QNN) +set(QNN_LIB_PATH ${QNN_SDK_PATH}/lib/aarch64-android) + +include_directories(${QNN_INC_PATH}) +include_directories(../../) # ggml.h + +set(SOURCE_FILES + ../../ggml.c + ../../ggml-alloc.c + ../../ggml-backend.c + ../../ggml-quants.c + ../../ggml-qnn.cpp + test-qnn-ops.cpp +) + + +message("QNN_SDK_PATH : ${QNN_SDK_PATH}") +message("QNN_INC_PATH : ${QNN_INC_PATH}") +message("QNN_LIB_PATH : ${QNN_LIB_PATH}") + +add_definitions(-D__ARM_NEON) +add_definitions(-DGGML_USE_QNN) + +if(CMAKE_BUILD_TYPE STREQUAL "Release") +add_definitions(-DNDEBUG) +add_definitions(-O3) +endif() + +if (TARGET_SNAPDRAGON_8_GEN3) +# the below build optimization only verified and works well on Qualcomm SM8650-AB Snapdragon 8 Gen 3 +add_definitions(-march=armv8.7-a) +add_definitions(-mcpu=cortex-x1) +add_definitions(-mtune=cortex-x1) + +else() +# the below build optimization might be works well on ALL mainstream Android phone based on Qualcomm mobile SoC +add_definitions(-mcpu=cortex-a72) + +endif() + +add_compile_options("-Wall" "-Wno-sign-compare") + +find_library(LOG_LIB log) + +link_libraries(${LOG_LIB} android) + +add_executable(${TARGET_NAME} + ${SOURCE_FILES} +) diff --git a/tests/ggml-qnn/build-ggml-qnn.sh b/tests/ggml-qnn/build-ggml-qnn.sh new file mode 100755 index 0000000000000..baca02f91347d --- /dev/null +++ b/tests/ggml-qnn/build-ggml-qnn.sh @@ -0,0 +1,95 @@ +#!/bin/bash + +set -e + 
+#https://qpm.qualcomm.com/#/main/tools/details/qualcomm_ai_engine_direct +#https://developer.qualcomm.com/software/hexagon-dsp-sdk/tools +QNN_SDK_PATH=/opt/qcom/aistack/qnn/2.20.0.240223/ + +ANDROID_NDK=`pwd`/android-ndk-r26c +ANDROID_PLATFORM=android-34 +TARGET=ggml-qnn-test + + +function dump_vars() +{ + echo -e "ANDROID_NDK: ${ANDROID_NDK}" + echo -e "QNN_SDK_PATH: ${QNN_SDK_PATH}" +} + + +function show_pwd() +{ + echo -e "current working path:$(pwd)\n" +} + + +function check_qnn_sdk() +{ + if [ ! -d ${QNN_SDK_PATH} ]; then + echo -e "QNN_SDK_PATH ${QNN_SDK_PATH} not exist, pls check...\n" + exit 1 + fi +} + + +function check_and_download_ndk() +{ + is_android_ndk_exist=1 + + if [ ! -d ${ANDROID_NDK} ]; then + is_android_ndk_exist=0 + fi + + if [ ! -f ${ANDROID_NDK}/build/cmake/android.toolchain.cmake ]; then + is_android_ndk_exist=0 + fi + + if [ ${is_android_ndk_exist} -eq 0 ]; then + + if [ ! -f android-ndk-r26c-linux.zip ]; then + wget --no-config --quiet --show-progress -O android-ndk-r26c-linux.zip https://dl.google.com/android/repository/android-ndk-r26c-linux.zip + fi + + unzip android-ndk-r26c-linux.zip + + if [ $? -ne 0 ]; then + printf "failed to download android ndk to %s \n" "${ANDROID_NDK}" + exit 1 + fi + + printf "android ndk saved to ${ANDROID_NDK} \n\n" + else + printf "android ndk already exist:${ANDROID_NDK} \n\n" + fi +} + + +function build_arm64 +{ + cmake -H. -B./out/arm64-v8a -DTARGET_NAME=${TARGET} -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=${ANDROID_PLATFORM} -DANDROID_NDK=${ANDROID_NDK} -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake -DQNN_SDK_PATH=${QNN_SDK_PATH} + + cd ./out/arm64-v8a + make + + ls -lah ${TARGET} + /bin/cp ${TARGET} ../../ + cd - +} + + +function remove_temp_dir() +{ + if [ -d out ]; then + echo "remove out directory in `pwd`" + rm -rf out + fi +} + + +show_pwd +check_and_download_ndk +check_qnn_sdk +dump_vars +remove_temp_dir +build_arm64 diff --git a/tests/ggml-qnn/run-ggml-qnn.sh b/tests/ggml-qnn/run-ggml-qnn.sh new file mode 100755 index 0000000000000..a4c1f22ad70cd --- /dev/null +++ b/tests/ggml-qnn/run-ggml-qnn.sh @@ -0,0 +1,108 @@ +#!/bin/bash + +#https://qpm.qualcomm.com/#/main/tools/details/qualcomm_ai_engine_direct +#https://developer.qualcomm.com/software/hexagon-dsp-sdk/tools +QNN_SDK_PATH=/opt/qcom/aistack/qnn/2.20.0.240223/ +GGML_QNN_TEST=ggml-qnn-test +REMOTE_PATH=/data/local/tmp/ + + +function check_qnn_sdk() +{ + if [ ! -d ${QNN_SDK_PATH} ]; then + echo -e "QNN_SDK_PATH ${QNN_SDK_PATH} not exist, pls check or download it from https://qpm.qualcomm.com/#/main/tools/details/qualcomm_ai_engine_direct...\n" + exit 1 + fi +} + + +function check_qnn_libs() +{ + #reuse the cached qnn libs in Android phone + adb shell ls ${REMOTE_PATH}/libQnnCpu.so + if [ $? 
-eq 0 ]; then + printf "QNN libs already exist on Android phone\n" + else + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnSystem.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnCpu.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnGpu.so ${REMOTE_PATH}/ + + #the QNN NPU(aka HTP/DSP) backend only verified on Xiaomi14(Qualcomm SM8650-AB Snapdragon 8 Gen 3) successfully + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtp.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpNetRunExtensions.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpPrepare.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpV75Stub.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/hexagon-v75/unsigned/libQnnHtpV75Skel.so ${REMOTE_PATH}/ + fi +} + + +function show_usage() +{ + echo "Usage:" + echo " $0 GGML_OP_ADD 0/1/2" + echo " $0 GGML_OP_MUL 0/1/2" + echo " $0 GGML_OP_MUL_MAT 0/1/2" + echo -e "\n\n\n" +} + + +function main() +{ + check_qnn_libs + + #upload the latest ggml_qnn_test + adb push ${GGML_QNN_TEST} ${REMOTE_PATH} + adb shell chmod +x ${REMOTE_PATH}/${GGML_QNN_TEST} + + case "$ggmlop" in + GGML_OP_ADD) + echo "adb shell ${REMOTE_PATH}/${GGML_QNN_TEST} -t GGML_OP_ADD -b $qnnbackend" + adb shell ${REMOTE_PATH}/${GGML_QNN_TEST} -t GGML_OP_ADD -b $qnnbackend + ;; + + GGML_OP_MUL) + adb shell ${REMOTE_PATH}/${GGML_QNN_TEST} -t GGML_OP_MUL -b $qnnbackend + ;; + + GGML_OP_MUL_MAT) + adb shell ${REMOTE_PATH}/${GGML_QNN_TEST} -t GGML_OP_MUL_MAT -b $qnnbackend + ;; + + *) + printf " \n$arg not supported currently\n" + show_usage + exit 1 + ;; + esac +} + + +check_qnn_sdk + +unset ggmlop +unset qnnbackend +if [ $# == 0 ]; then + show_usage + exit 1 +elif [ $# == 1 ]; then + if [ "$1" == "-h" ]; then + #avoid upload command line program to Android phone in this scenario + show_usage + exit 1 + elif [ "$1" == "help" ]; then + #avoid upload command line program to Android phone in this scenario + show_usage + exit 1 + else + ggmlop=$1 + qnnbackend=0 + fi +elif [ $# == 2 ]; then + ggmlop=$1 + qnnbackend=$2 +else + show_usage + exit 1 +fi +main $arg diff --git a/tests/ggml-qnn/test-qnn-ops.cpp b/tests/ggml-qnn/test-qnn-ops.cpp new file mode 100644 index 0000000000000..27967270bdcd4 --- /dev/null +++ b/tests/ggml-qnn/test-qnn-ops.cpp @@ -0,0 +1,450 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ggml.h" +#include "ggml-alloc.h" +#include "ggml-backend.h" +#include "ggml-qnn.h" + +#define GGML_QNN_DEBUG 1 +#define GGML_QNN_LOGBUF_LEN 4096 + +#define QNN_LOG_ERROR(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#define QNN_LOG_WARN(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#define QNN_LOG_INFO(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) + +#if GGML_QNN_DEBUG +#define QNN_LOG_DEBUG(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#else +#define QNN_LOG_DEBUG(...) 
+#endif + +static void tensor_dump(const ggml_tensor * tensor, const char * name); + +#define TENSOR_DUMP(tensor) tensor_dump(tensor, #tensor) + +static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...) { + static std::mutex ggml_qnn_log_internal_mutex; + static char s_ggml_qnn_log_internal_buf[GGML_QNN_LOGBUF_LEN]; + + { + std::lock_guard lock(ggml_qnn_log_internal_mutex); + va_list args; + va_start(args, format); + int len_prefix = snprintf(s_ggml_qnn_log_internal_buf, GGML_QNN_LOGBUF_LEN, "[%s, %d]: ", func, line); + int len = vsnprintf(s_ggml_qnn_log_internal_buf + len_prefix, GGML_QNN_LOGBUF_LEN - len_prefix, format, args); + if (len < (GGML_QNN_LOGBUF_LEN - len_prefix)) { + //for Android command line application or WoA + printf("%s\n", s_ggml_qnn_log_internal_buf); + } + va_end(args); + } +} + + +static const char * get_qnn_backend_name(int n_backend_type) { + switch (n_backend_type) { + case 0: + return "QNN-CPU"; + case 1: + return "QNN-GPU"; + case 2: + return "QNN-NPU(HTP/DSP)"; + case 3: + return "ggml"; + default: + return "unknown"; + } +} + + +static bool ggml_graph_compute_helper( + struct ggml_backend * backend, + struct ggml_cgraph * graph, + std::vector & buf, + int n_threads, + ggml_abort_callback abort_callback, + void * abort_callback_data) { + struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); + + plan.abort_callback = abort_callback; + plan.abort_callback_data = abort_callback_data; + + if (plan.work_size > 0) { + buf.resize(plan.work_size); + plan.work_data = buf.data(); + } + + if (ggml_backend_is_cpu(backend)) { + ggml_backend_cpu_set_n_threads(backend, n_threads); + } + +#ifdef GGML_USE_QNN + if (ggml_backend_is_qnn(backend)) { + ggml_backend_qnn_set_n_threads(backend, n_threads); + } +#endif + + //a new approch of mixed inference + if (nullptr != backend) + return ggml_backend_graph_compute(backend, graph) == GGML_STATUS_SUCCESS; + else + return ggml_graph_compute(graph, &plan); +} + + +static void tensor_dump_elements(const ggml_tensor * tensor) { + float value = 0; + std::ostringstream tmposs; + if (tensor->type == GGML_TYPE_F32) { + for (int h = 0; h < tensor->ne[3]; h++) { + for (int i = 0; i < tensor->ne[2]; i++) { + for (int j = 0; j < tensor->ne[1]; j++) { + for (int k = 0; k < tensor->ne[0]; k++) { + value = ((float *) tensor->data)[h * tensor->ne[2] + i * tensor->ne[1] + + j * tensor->ne[0] + k]; + tmposs << std::setw(8) << std::fixed << std::setprecision(2) << value + << " "; + } + if (strlen(tmposs.str().c_str()) <= (GGML_QNN_LOGBUF_LEN - 96)) { + QNN_LOG_DEBUG("%s", tmposs.str().c_str()); + } + tmposs.clear(); + tmposs.str(""); + //QNN_LOG_DEBUG("\n"); + } + } + } + } + + //QNN_LOG_DEBUG("\n"); +} + + +static void tensor_dump(const ggml_tensor * tensor, const char * name) { + QNN_LOG_DEBUG("dump ggml tensor %s(%s)", name, tensor->name); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)", + name, + tensor->type, ggml_type_name(tensor->type), + tensor->ne[0], tensor->ne[1], tensor->ne[2], + tensor->nb[0], tensor->nb[1], tensor->nb[2]); + tensor_dump_elements(tensor); + + QNN_LOG_DEBUG("\n"); +} + + +static uint32_t get_tensor_rank(const ggml_tensor * tensor) { + uint32_t rank = 0; + for (int i = 0; i < GGML_MAX_DIMS; i++) { + if ((0 != tensor->ne[i]) && (1 != tensor->ne[i])) { + rank++; + } + } + return rank; +} + + +static uint32_t get_tensor_data_size(const ggml_tensor * tensor) { + size_t data_size = 
ggml_row_size(tensor->type, tensor->ne[0]); + size_t n_dims = get_tensor_rank(tensor); + for (int i = 1; i < n_dims; i++) { + data_size *= tensor->ne[i]; + } + + QNN_LOG_DEBUG("get_tensor_data_size %d", data_size); + QNN_LOG_DEBUG("ggml_nbytes(tensor) %d", ggml_nbytes(tensor)); + + return ggml_nbytes(tensor); +} + + +//ref: https://github.com/ggerganov/llama.cpp/blob/master/tests/test-backend-ops.cpp#L20 +static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float max = 1.0f) { + // static RNG initialization (revisit if n_threads stops being constant) + static const size_t n_threads = std::thread::hardware_concurrency(); + static std::vector generators = []() { + std::random_device rd; + std::vector vec; + vec.reserve(n_threads); + //for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(1234 + i); } // fixed seed + for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(rd()); } + return vec; + }(); + + size_t size = ggml_nelements(tensor); + std::vector data(size); + + auto init_thread = [&](size_t ith, size_t start, size_t end) { + std::uniform_real_distribution distribution(min, max); + for (size_t i = start; i < end; i++) { + data[i] = distribution(generators[ith]); + } + }; + + std::vector threads; + threads.reserve(n_threads); + for (size_t i = 0; i < n_threads; i++) { + size_t start = i*size/n_threads; + size_t end = (i+1)*size/n_threads; + threads.emplace_back(init_thread, i, start, end); + } + for (auto & t : threads) { + t.join(); + } + if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_I32) { + ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float)); + } else if (ggml_is_quantized(tensor->type) || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_BF16) { + GGML_ASSERT(size % ggml_blck_size(tensor->type) == 0); + std::vector dataq(ggml_row_size(tensor->type, size)); + std::vector imatrix(tensor->ne[0], 1.0f); // dummy importance matrix + const float * im = imatrix.data(); + if (!ggml_quantize_requires_imatrix(tensor->type)) { + // when the imatrix is optional, we want to test both quantization with and without imatrix + // use one of the random numbers to decide + if (data[0] > 0.5f*(min + max)) { + im = nullptr; + } + } + ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size/tensor->ne[0], tensor->ne[0], im); + GGML_ASSERT(ggml_validate_row_data(tensor->type, dataq.data(), dataq.size())); + ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size()); + } else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) { + // This is going to create some weird integers though. 
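+        // (the float buffer is copied to the tensor byte-for-byte, with no conversion to the integer type)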
+ ggml_backend_tensor_set(tensor, data.data(), 0, ggml_nbytes(tensor)); + } else { + GGML_ASSERT(false); + } +} + + +//ref: https://github.com/ggerganov/llama.cpp/blob/master/tests/test-backend-ops.cpp#L310 +static void initialize_tensors(ggml_context * ctx) { + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { + init_tensor_uniform(t); + } +} + + +static void show_usage() { + printf(" " \ + "\nUsage: test_qnn_ops [options]\n" \ + "\n" \ + "Options:\n" \ + " -t GGML_OP_ADD / GGML_OP_MUL / GGML_OP_MULMAT\n" \ + " -b 0(QNN_CPU) 1(QNN_GPU) 2(QNN_NPU)\n" \ + " ?/h print usage infomation\n\n" + ); +} + + +int main(int argc, char * argv[]) { + int64_t n_begin_time = 0LL; + int64_t n_end_time = 0LL; + int64_t n_duration = 0LL; + size_t ctx_size = 0; + int sizey = 4; + int sizex = 4; + int num_threads = 4; + int n_backend_type = QNN_BACKEND_CPU; + int n_ggml_op_type = GGML_OP_ADD; + + struct ggml_context * ctx = nullptr; + struct ggml_cgraph * gf = nullptr; + struct ggml_tensor * src0 = nullptr; + struct ggml_tensor * src1 = nullptr; + struct ggml_tensor * dst = nullptr; + ggml_backend_t backend = nullptr; + ggml_backend_buffer_t buffer= nullptr; + ggml_type qtype = GGML_TYPE_F32; + std::vector work_buffer; + + for (int i = 1; i < argc; i++) { + if (0 == strcmp(argv[i], "-t")) { + if (i + 1 < argc) { + if (0 == memcmp(argv[i + 1], "GGML_OP_ADD", 11)) { + n_ggml_op_type = GGML_OP_ADD; + } else if (0 == memcmp(argv[i + 1], "GGML_OP_MUL_MAT", 15)) { + n_ggml_op_type = GGML_OP_MUL_MAT; + } else if (0 == memcmp(argv[i + 1], "GGML_OP_MUL", 11)) { + n_ggml_op_type = GGML_OP_MUL; + } else { + show_usage(); + return 1; + } + i++; + } + } else if (0 == strcmp(argv[i], "-b")) { + if (i + 1 < argc) { + int backend = atoi(argv[i + 1]); + if (backend <= QNN_BACKEND_NPU) + n_backend_type = backend; + else { + show_usage(); + return 1; + } + i++; + } + } else { + show_usage(); + return 1; + } + } + + QNN_LOG_DEBUG("enter qnn_ggml_op\n"); + QNN_LOG_DEBUG("ggml op:%d(%s)", n_ggml_op_type, ggml_op_name((enum ggml_op) n_ggml_op_type)); + + n_begin_time = ggml_time_us(); + srand(time(NULL)); + + ctx_size += 1024 * 1024 * 32; + QNN_LOG_DEBUG("Allocating Memory of size %zi bytes, %zi MB\n", ctx_size, + (ctx_size / 1024 / 1024)); + + struct ggml_init_params params = { + /*.mem_size =*/ ctx_size, + /*.mem_buffer =*/ NULL, + /* no_alloc =*/ 0 + }; + + if (n_backend_type != QNN_BACKEND_GGML) { + params.no_alloc = true; + backend = ggml_backend_qnn_init(n_backend_type, "/data/local/tmp/"); + if (nullptr == backend) { + QNN_LOG_ERROR("create qnn backend %d(%s) failed", n_backend_type, get_qnn_backend_name(n_backend_type)); + return 1; + } + } + + ctx = ggml_init(params); + if (!ctx) { + QNN_LOG_ERROR("%s: ggml_init() failed\n"); + return 2; + } + + QNN_LOG_DEBUG("creating new tensors\n"); + QNN_LOG_DEBUG("ggml_blck_size(%s) %d", ggml_type_name(qtype), ggml_blck_size(qtype)); + QNN_LOG_DEBUG("ggml_type_size(%s) %d", ggml_type_name(qtype), ggml_type_size(qtype)); + if (qtype != GGML_TYPE_F32) { + sizex = ggml_blck_size(qtype); + } + + src0 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey); + ggml_set_input(src0); + src1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey); + ggml_set_input(src1); + + switch (n_ggml_op_type) { + case GGML_OP_ADD: + dst = ggml_add(ctx, src0, src1); + break; + case GGML_OP_MUL: + dst = ggml_mul(ctx, src0, src1); + break; + case GGML_OP_MUL_MAT: + dst = ggml_mul_mat(ctx, src0, src1); + break; + default: + QNN_LOG_WARN("ggml op %d(%s) not 
supported", n_ggml_op_type, + ggml_op_name((enum ggml_op) n_ggml_op_type)); + ggml_free(ctx); + ggml_backend_free(backend); + return 3; + } + + ggml_set_output(dst); +#ifdef GGML_USE_QNN + if (n_backend_type != QNN_BACKEND_GGML) { + buffer = ggml_backend_alloc_ctx_tensors(ctx, backend); + if (!buffer) { + QNN_LOG_ERROR("%s: failed to allocate backend buffer\n", __func__); + ggml_free(ctx); + ggml_backend_free(backend); + return 4; + } + } +#endif + + QNN_LOG_DEBUG("creating compute graph\n"); + gf = ggml_new_graph(ctx); + ggml_build_forward_expand(gf, dst); + +#if 0 + ggml_set_f32(src0, (rand() % 100 + 1)); + ggml_set_f32(src1, (rand() % 100 + 1)); + ggml_set_f32(dst, 0.0f); +#else + if (n_backend_type != QNN_BACKEND_GGML) { + initialize_tensors(ctx); + } +#endif + + ggml_graph_compute_helper(backend, gf, work_buffer, num_threads, nullptr, nullptr); + if (get_tensor_data_size(dst) < (32 * 32)) { + QNN_LOG_DEBUG("dump tensors:\n"); + TENSOR_DUMP(src0); + TENSOR_DUMP(src1); + TENSOR_DUMP(dst); + } else { + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src0->name, + src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], + src0->nb[0], src0->nb[1], src0->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src1->name, + src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], + src1->nb[0], src1->nb[1], src1->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + dst->name, + dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], + dst->nb[1], dst->nb[2]); + } + + ggml_free(ctx); + ggml_backend_buffer_free(buffer); + ggml_backend_free(backend); + + n_end_time = ggml_time_us(); + n_duration = (n_end_time - n_begin_time) / 1000; + QNN_LOG_DEBUG("duration of ut GGML_OP_%s using QNN backend %s: %lld milliseconds\n", ggml_op_name((enum ggml_op)n_ggml_op_type), get_qnn_backend_name(n_backend_type), n_duration); + + return 0; +} From 9c872cbbce2fb76b11766fb4012e9206b27726b9 Mon Sep 17 00:00:00 2001 From: "zhou.weiguo" Date: Wed, 5 Jun 2024 12:06:17 +0800 Subject: [PATCH 003/143] refine ggml-qnn-ut program and script to make reviewers happy --- tests/ggml-qnn/CMakeLists.txt | 2 +- tests/ggml-qnn/build-ggml-qnn.sh | 95 --------- tests/ggml-qnn/ggml-qnn-ut-build-run.sh | 198 ++++++++++++++++++ .../{test-qnn-ops.cpp => ggml-qnn-ut.cpp} | 0 tests/ggml-qnn/run-ggml-qnn.sh | 108 ---------- 5 files changed, 199 insertions(+), 204 deletions(-) delete mode 100755 tests/ggml-qnn/build-ggml-qnn.sh create mode 100755 tests/ggml-qnn/ggml-qnn-ut-build-run.sh rename tests/ggml-qnn/{test-qnn-ops.cpp => ggml-qnn-ut.cpp} (100%) delete mode 100755 tests/ggml-qnn/run-ggml-qnn.sh diff --git a/tests/ggml-qnn/CMakeLists.txt b/tests/ggml-qnn/CMakeLists.txt index 15ad7be6f6c88..a78bdaeaf8009 100644 --- a/tests/ggml-qnn/CMakeLists.txt +++ b/tests/ggml-qnn/CMakeLists.txt @@ -21,7 +21,7 @@ set(SOURCE_FILES ../../ggml-backend.c ../../ggml-quants.c ../../ggml-qnn.cpp - test-qnn-ops.cpp + ggml-qnn-ut.cpp ) diff --git a/tests/ggml-qnn/build-ggml-qnn.sh b/tests/ggml-qnn/build-ggml-qnn.sh deleted file mode 100755 index baca02f91347d..0000000000000 --- a/tests/ggml-qnn/build-ggml-qnn.sh +++ /dev/null @@ -1,95 +0,0 @@ -#!/bin/bash - -set -e - -#https://qpm.qualcomm.com/#/main/tools/details/qualcomm_ai_engine_direct 
-#https://developer.qualcomm.com/software/hexagon-dsp-sdk/tools -QNN_SDK_PATH=/opt/qcom/aistack/qnn/2.20.0.240223/ - -ANDROID_NDK=`pwd`/android-ndk-r26c -ANDROID_PLATFORM=android-34 -TARGET=ggml-qnn-test - - -function dump_vars() -{ - echo -e "ANDROID_NDK: ${ANDROID_NDK}" - echo -e "QNN_SDK_PATH: ${QNN_SDK_PATH}" -} - - -function show_pwd() -{ - echo -e "current working path:$(pwd)\n" -} - - -function check_qnn_sdk() -{ - if [ ! -d ${QNN_SDK_PATH} ]; then - echo -e "QNN_SDK_PATH ${QNN_SDK_PATH} not exist, pls check...\n" - exit 1 - fi -} - - -function check_and_download_ndk() -{ - is_android_ndk_exist=1 - - if [ ! -d ${ANDROID_NDK} ]; then - is_android_ndk_exist=0 - fi - - if [ ! -f ${ANDROID_NDK}/build/cmake/android.toolchain.cmake ]; then - is_android_ndk_exist=0 - fi - - if [ ${is_android_ndk_exist} -eq 0 ]; then - - if [ ! -f android-ndk-r26c-linux.zip ]; then - wget --no-config --quiet --show-progress -O android-ndk-r26c-linux.zip https://dl.google.com/android/repository/android-ndk-r26c-linux.zip - fi - - unzip android-ndk-r26c-linux.zip - - if [ $? -ne 0 ]; then - printf "failed to download android ndk to %s \n" "${ANDROID_NDK}" - exit 1 - fi - - printf "android ndk saved to ${ANDROID_NDK} \n\n" - else - printf "android ndk already exist:${ANDROID_NDK} \n\n" - fi -} - - -function build_arm64 -{ - cmake -H. -B./out/arm64-v8a -DTARGET_NAME=${TARGET} -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=${ANDROID_PLATFORM} -DANDROID_NDK=${ANDROID_NDK} -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake -DQNN_SDK_PATH=${QNN_SDK_PATH} - - cd ./out/arm64-v8a - make - - ls -lah ${TARGET} - /bin/cp ${TARGET} ../../ - cd - -} - - -function remove_temp_dir() -{ - if [ -d out ]; then - echo "remove out directory in `pwd`" - rm -rf out - fi -} - - -show_pwd -check_and_download_ndk -check_qnn_sdk -dump_vars -remove_temp_dir -build_arm64 diff --git a/tests/ggml-qnn/ggml-qnn-ut-build-run.sh b/tests/ggml-qnn/ggml-qnn-ut-build-run.sh new file mode 100755 index 0000000000000..c7bff2ee9c20e --- /dev/null +++ b/tests/ggml-qnn/ggml-qnn-ut-build-run.sh @@ -0,0 +1,198 @@ +#!/bin/bash + +set -e + +#https://qpm.qualcomm.com/#/main/tools/details/qualcomm_ai_engine_direct +#https://developer.qualcomm.com/software/hexagon-dsp-sdk/tools +QNN_SDK_PATH=/opt/qcom/aistack/qnn/2.20.0.240223/ + +ANDROID_NDK=`pwd`/android-ndk-r26c +ANDROID_PLATFORM=android-34 + +GGML_QNN_UT=ggml-qnn-ut +REMOTE_PATH=/data/local/tmp/ + + +function dump_vars() +{ + echo -e "ANDROID_NDK: ${ANDROID_NDK}" + echo -e "QNN_SDK_PATH: ${QNN_SDK_PATH}" +} + + +function show_pwd() +{ + echo -e "current working path:$(pwd)\n" +} + + +function check_qnn_sdk() +{ + if [ ! -d ${QNN_SDK_PATH} ]; then + echo -e "QNN_SDK_PATH ${QNN_SDK_PATH} not exist, pls check or download it from https://qpm.qualcomm.com/#/main/tools/details/qualcomm_ai_engine_direct...\n" + exit 1 + fi +} + + +function check_and_download_ndk() +{ + is_android_ndk_exist=1 + + if [ ! -d ${ANDROID_NDK} ]; then + is_android_ndk_exist=0 + fi + + if [ ! -f ${ANDROID_NDK}/build/cmake/android.toolchain.cmake ]; then + is_android_ndk_exist=0 + fi + + if [ ${is_android_ndk_exist} -eq 0 ]; then + + if [ ! -f android-ndk-r26c-linux.zip ]; then + wget --no-config --quiet --show-progress -O android-ndk-r26c-linux.zip https://dl.google.com/android/repository/android-ndk-r26c-linux.zip + fi + + unzip android-ndk-r26c-linux.zip + + if [ $? 
-ne 0 ]; then + printf "failed to download android ndk to %s \n" "${ANDROID_NDK}" + exit 1 + fi + + printf "android ndk saved to ${ANDROID_NDK} \n\n" + else + printf "android ndk already exist:${ANDROID_NDK} \n\n" + fi +} + + +function build_arm64 +{ + cmake -H. -B./out/arm64-v8a -DTARGET_NAME=${GGML_QNN_UT} -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=${ANDROID_PLATFORM} -DANDROID_NDK=${ANDROID_NDK} -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake -DQNN_SDK_PATH=${QNN_SDK_PATH} + + cd ./out/arm64-v8a + make + + ls -lah ${GGML_QNN_UT} + /bin/cp ${GGML_QNN_UT} ../../ + cd - +} + + +function remove_temp_dir() +{ + if [ -d out ]; then + echo "remove out directory in `pwd`" + rm -rf out + fi +} + + +function check_qnn_libs() +{ + #reuse the cached qnn libs in Android phone + adb shell ls ${REMOTE_PATH}/libQnnCpu.so + if [ $? -eq 0 ]; then + printf "QNN libs already exist on Android phone\n" + else + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnSystem.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnCpu.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnGpu.so ${REMOTE_PATH}/ + + #the QNN NPU(aka HTP/DSP) backend only verified on Xiaomi14(Qualcomm SM8650-AB Snapdragon 8 Gen 3) successfully + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtp.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpNetRunExtensions.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpPrepare.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpV75Stub.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/hexagon-v75/unsigned/libQnnHtpV75Skel.so ${REMOTE_PATH}/ + fi +} + + +function build_ggml_qnn_ut() +{ + show_pwd + check_and_download_ndk + check_qnn_sdk + dump_vars + remove_temp_dir + build_arm64 +} + + +function run_ggml_qnn_ut() +{ + check_qnn_libs + + #upload the latest ggml_qnn_test + adb push ${GGML_QNN_UT} ${REMOTE_PATH} + adb shell chmod +x ${REMOTE_PATH}/${GGML_QNN_UT} + + case "$ggmlop" in + GGML_OP_ADD) + echo "adb shell ${REMOTE_PATH}/${GGML_QNN_UT} -t GGML_OP_ADD -b $qnnbackend" + adb shell ${REMOTE_PATH}/${GGML_QNN_UT} -t GGML_OP_ADD -b $qnnbackend + ;; + + GGML_OP_MUL) + adb shell ${REMOTE_PATH}/${GGML_QNN_UT} -t GGML_OP_MUL -b $qnnbackend + ;; + + GGML_OP_MUL_MAT) + adb shell ${REMOTE_PATH}/${GGML_QNN_UT} -t GGML_OP_MUL_MAT -b $qnnbackend + ;; + + *) + printf " \n$arg not supported currently\n" + show_usage + exit 1 + ;; + esac +} + + +function show_usage() +{ + echo "Usage:" + echo " $0 build" + echo " $0 GGML_OP_ADD 0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU)" + echo " $0 GGML_OP_MUL 0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU)" + echo " $0 GGML_OP_MUL_MAT 0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU)" + echo -e "\n\n\n" +} + + +unset ggmlop +unset qnnbackend + +check_qnn_sdk + +if [ $# == 0 ]; then + show_usage + exit 1 +elif [ $# == 1 ]; then + if [ "$1" == "-h" ]; then + #avoid upload command line program to Android phone in this scenario + show_usage + exit 1 + elif [ "$1" == "help" ]; then + #avoid upload command line program to Android phone in this scenario + show_usage + exit 1 + elif [ "$1" == "build" ]; then + build_ggml_qnn_ut + exit 0 + else + ggmlop=$1 + qnnbackend=0 + run_ggml_qnn_ut + fi +elif [ $# == 2 ]; then + ggmlop=$1 + qnnbackend=$2 + run_ggml_qnn_ut +else + show_usage + exit 1 +fi diff --git a/tests/ggml-qnn/test-qnn-ops.cpp b/tests/ggml-qnn/ggml-qnn-ut.cpp similarity index 100% rename from tests/ggml-qnn/test-qnn-ops.cpp rename to 
tests/ggml-qnn/ggml-qnn-ut.cpp diff --git a/tests/ggml-qnn/run-ggml-qnn.sh b/tests/ggml-qnn/run-ggml-qnn.sh deleted file mode 100755 index a4c1f22ad70cd..0000000000000 --- a/tests/ggml-qnn/run-ggml-qnn.sh +++ /dev/null @@ -1,108 +0,0 @@ -#!/bin/bash - -#https://qpm.qualcomm.com/#/main/tools/details/qualcomm_ai_engine_direct -#https://developer.qualcomm.com/software/hexagon-dsp-sdk/tools -QNN_SDK_PATH=/opt/qcom/aistack/qnn/2.20.0.240223/ -GGML_QNN_TEST=ggml-qnn-test -REMOTE_PATH=/data/local/tmp/ - - -function check_qnn_sdk() -{ - if [ ! -d ${QNN_SDK_PATH} ]; then - echo -e "QNN_SDK_PATH ${QNN_SDK_PATH} not exist, pls check or download it from https://qpm.qualcomm.com/#/main/tools/details/qualcomm_ai_engine_direct...\n" - exit 1 - fi -} - - -function check_qnn_libs() -{ - #reuse the cached qnn libs in Android phone - adb shell ls ${REMOTE_PATH}/libQnnCpu.so - if [ $? -eq 0 ]; then - printf "QNN libs already exist on Android phone\n" - else - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnSystem.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnCpu.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnGpu.so ${REMOTE_PATH}/ - - #the QNN NPU(aka HTP/DSP) backend only verified on Xiaomi14(Qualcomm SM8650-AB Snapdragon 8 Gen 3) successfully - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtp.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpNetRunExtensions.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpPrepare.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpV75Stub.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/hexagon-v75/unsigned/libQnnHtpV75Skel.so ${REMOTE_PATH}/ - fi -} - - -function show_usage() -{ - echo "Usage:" - echo " $0 GGML_OP_ADD 0/1/2" - echo " $0 GGML_OP_MUL 0/1/2" - echo " $0 GGML_OP_MUL_MAT 0/1/2" - echo -e "\n\n\n" -} - - -function main() -{ - check_qnn_libs - - #upload the latest ggml_qnn_test - adb push ${GGML_QNN_TEST} ${REMOTE_PATH} - adb shell chmod +x ${REMOTE_PATH}/${GGML_QNN_TEST} - - case "$ggmlop" in - GGML_OP_ADD) - echo "adb shell ${REMOTE_PATH}/${GGML_QNN_TEST} -t GGML_OP_ADD -b $qnnbackend" - adb shell ${REMOTE_PATH}/${GGML_QNN_TEST} -t GGML_OP_ADD -b $qnnbackend - ;; - - GGML_OP_MUL) - adb shell ${REMOTE_PATH}/${GGML_QNN_TEST} -t GGML_OP_MUL -b $qnnbackend - ;; - - GGML_OP_MUL_MAT) - adb shell ${REMOTE_PATH}/${GGML_QNN_TEST} -t GGML_OP_MUL_MAT -b $qnnbackend - ;; - - *) - printf " \n$arg not supported currently\n" - show_usage - exit 1 - ;; - esac -} - - -check_qnn_sdk - -unset ggmlop -unset qnnbackend -if [ $# == 0 ]; then - show_usage - exit 1 -elif [ $# == 1 ]; then - if [ "$1" == "-h" ]; then - #avoid upload command line program to Android phone in this scenario - show_usage - exit 1 - elif [ "$1" == "help" ]; then - #avoid upload command line program to Android phone in this scenario - show_usage - exit 1 - else - ggmlop=$1 - qnnbackend=0 - fi -elif [ $# == 2 ]; then - ggmlop=$1 - qnnbackend=$2 -else - show_usage - exit 1 -fi -main $arg From 926a8661f31c85499314c3b15f47c0709041ee07 Mon Sep 17 00:00:00 2001 From: "zhou.weiguo" Date: Wed, 5 Jun 2024 21:10:59 +0800 Subject: [PATCH 004/143] review: replace external declaration with NDK header file --- ggml-qnn.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp index 9319db227795d..15c6538d1870d 100644 --- a/ggml-qnn.cpp +++ b/ggml-qnn.cpp @@ -50,6 +50,9 @@ #include "ggml-backend-impl.h" +#if (defined __ANDROID__) || (defined 
ANDROID) +#include +#endif // ================================================================================================= // @@ -58,11 +61,6 @@ // ================================================================================================= class qnn_instance; - -#if (defined __ANDROID__) || (defined ANDROID) -extern "C" int __android_log_print(int prio, const char * tag, const char * fmt, ...) -__attribute__((__format__(printf, 3, 4))); -#endif static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...); From dd29834c115f5c644b34fb7e60c0175b9890da29 Mon Sep 17 00:00:00 2001 From: "zhou.weiguo" Date: Thu, 6 Jun 2024 17:12:28 +0800 Subject: [PATCH 005/143] add supportive of quantize data type Q8_0 --- ggml-qnn.cpp | 176 +++++++++------ ggml-qnn.h | 5 +- tests/ggml-qnn/ggml-qnn-ut-build-run.sh | 37 ++-- tests/ggml-qnn/ggml-qnn-ut.cpp | 274 ++++++++++++++++-------- 4 files changed, 321 insertions(+), 171 deletions(-) diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp index 15c6538d1870d..d0927f22e514a 100644 --- a/ggml-qnn.cpp +++ b/ggml-qnn.cpp @@ -72,8 +72,6 @@ static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const #define RPCMEM_DEFAULT_FLAGS 1 #define RPCMEM_HEAP_ID_SYSTEM 25 -#define GGML_DUMP_TENSOR(tensor) ggml_tensor_dump(tensor, #tensor) - #define GGML_QNN_LOGBUF_LEN 4096 #define GGML_QNN_DEBUG 1 //for troubleshooting QNN backend @@ -195,8 +193,17 @@ static ggml_backend_t g_qnn_backend = nullptr; static int g_current_device = QNN_BACKEND_GGML; - -//QNN cDSP and HTA backend would not be used currently, just focus on QNN CPU/GPU/NPU(aka HTP/DSP) backend currently +//according to the QNN SDK Reference Guide, +//CPU - Choose a non-quantized model. Quantized models are currently incompatible with the CPU backend +//GPU - Choose a non-quantized model. Quantized models are currently incompatible with the GPU backend +//HTP - Choose a quantized model. Quantized models are required when running on the HTP backend +//DSP - Choose a quantized model. Quantized models are required when running on the DSP backend +//HTA - Choose a quantized model. 
Quantized models are required when running on the HTA backend +// +//only focus on Qualcomm CPU/GPU/NPU backend in this implementation of QNN backend for ggml currently +//Qualcomm CPU: Qualcomm Kryo CPU +//Qualcomm GPU: Qualcomm Adreno GPU +//Qualcomm NPU: aka HTP(Hexagon Tensor Processor), ~= cDSP(Compute DSP) + HMX(Hexagon Matrix eXtensions)/HTA(Hexagon Tensor Accelerator) static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { [QNN_BACKEND_CPU] = {.device = 0, .threads = 1, .name = "qnn-cpu", .lib = "libQnnCpu.so", .instance = nullptr, .backend = nullptr, .raw_interface = {}, .raw_system_interface = {}}, [QNN_BACKEND_GPU] = {.device = 1, .threads = 1, .name = "qnn-gpu", .lib = "libQnnGpu.so", .instance = nullptr, .backend = nullptr, .raw_interface = {}, .raw_system_interface = {}}, @@ -849,6 +856,10 @@ static Qnn_DataType_t qnn_datatype_from_ggml_datatype(enum ggml_type ggmltype) { return QNN_DATATYPE_FLOAT_16; case GGML_TYPE_F32: return QNN_DATATYPE_FLOAT_32; + case GGML_TYPE_I8: + return QNN_DATATYPE_INT_8; + case GGML_TYPE_Q8_0: + return QNN_DATATYPE_SFIXED_POINT_8; default: break; @@ -903,14 +914,8 @@ static const char * get_qnn_backend_name(int n_backend_type) { case 2: return "QNN-NPU"; case 3: - return "ggml"; //the default GGML backend, used to compare performance between QNN backend and the default GGML backend + return "ggml"; //"fake" QNN backend, used for compare performance between QNN backend and original GGML -#if 0 //QNN cDSP and HTA backend would not be used currently, focus on QNN CPU/GPU/NPU(aka HTP/DSP) backend currently - case 3: - return "QNN-cDSP"; - case 4: - return "QNN-HTA"; -#endif default: return "unknown"; } @@ -1720,7 +1725,7 @@ static void ggml_qnn_logcallback(const char * fmt, double ms = (double) timestamp / 1000000.0; - { + if (0) { std::lock_guard lock(log_mutex); memset(s_ggml_qnn_logbuf, 0, GGML_QNN_LOGBUF_LEN); @@ -1770,7 +1775,7 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { _qnn_raw_interface.logCreate(ggml_qnn_logcallback, _qnn_log_level, &_qnn_log_handle); #endif if (nullptr == _qnn_log_handle) { - QNN_LOG_WARN("why failed to initialize qnn log\n"); //DSP backend not work on Qualcomm SoC based low-end phone + QNN_LOG_WARN("why failed to initialize qnn log\n"); //NPU backend not work on Qualcomm SoC based low-end phone return 4; } else { QNN_LOG_DEBUG("initialize qnn log successfully\n"); @@ -2010,14 +2015,14 @@ static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor, bool b_dum const struct ggml_tensor * src0 = tensor->src[0]; const struct ggml_tensor * src1 = tensor->src[1]; - const int64_t ne00 = tensor->src[0]->ne[0]; - const int64_t ne01 = tensor->src[0]->ne[1]; + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; - const int64_t ne10 = tensor->src[1]->ne[0]; - const int64_t ne11 = tensor->src[1]->ne[1]; + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; - const int64_t ne0 = tensor->ne[0]; - const int64_t ne1 = tensor->ne[1]; + const int64_t ne0 = tensor->ne[0]; + const int64_t ne1 = tensor->ne[1]; GGML_UNUSED(ne0); GGML_UNUSED(ne1); @@ -2057,30 +2062,15 @@ static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor, bool b_dum return false; } - if (tensor->op == GGML_OP_ADD) { - //TODO: this is limitation - return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) - && (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16) - && (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_F16); - + // GPU/NPU inference 
will slower then CPU inference when tensor->ne[1] < min batch size + if (tensor->ne[1] < 32) { + return false; } - if (tensor->op == GGML_OP_MUL_MAT) { - //TODO: this is limitation - return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) - && (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16) - && (src0->type == src1->type) && (src0->type == tensor->type); + int qtype = src0->type; + return (qtype == GGML_TYPE_F32 || qtype == GGML_TYPE_F16 || qtype == GGML_TYPE_Q8_0) + && (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); - if (tensor->ne[1] < 32) { // GPU/NPU inference will slower then CPU inference when tensor->ne[1] < min batch size - return false; - } - - } - - //TODO: this is limitation - return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) - && (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16) - && (src0->type == src1->type) && (src0->type == tensor->type); } @@ -2129,7 +2119,7 @@ static void ggml_qnn_add(const ggml_tensor * src0, const ggml_tensor * src1, ggm QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; n_begin_time = ggml_time_us(); -#if 1 + QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", src0->name, @@ -2147,17 +2137,23 @@ static void ggml_qnn_add(const ggml_tensor * src0, const ggml_tensor * src1, ggm QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(tensor_0)); QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(tensor_1)); QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(tensor_2)); -#endif QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; - src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type); - src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); - dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type); + src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type); + src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); + dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type); - std::string map_entry = std::string(ggml_op_name(ggmlop)); + uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], + (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; + uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], + (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; + uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], + (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; + + std::string map_entry = std::string(ggml_op_name(ggmlop)); if (instance->_qnn_graph_map.find(map_entry) != instance->_qnn_graph_map.end()) { graph_initialized = true; auto & graph_item = instance->_qnn_graph_map[map_entry]; @@ -2197,6 +2193,16 @@ static void ggml_qnn_add(const ggml_tensor * src0, const ggml_tensor * src1, ggm QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; + QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); + QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; + QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; + QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); + QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; + QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; + QNN_VER_PTR(*tensor_2)->rank = 
ggml_get_tensor_rank(dst); + QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; + Qnn_Tensor_t tensor_inputs[] = { *tensor_0, *tensor_1 @@ -2245,6 +2251,11 @@ static void ggml_qnn_add(const ggml_tensor * src0, const ggml_tensor * src1, ggm (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; + + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; @@ -2255,10 +2266,6 @@ static void ggml_qnn_add(const ggml_tensor * src0, const ggml_tensor * src1, ggm QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; - QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; - QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; - QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; - Qnn_Tensor_t tensor_inputs[] = { *tensor_0, *tensor_1 @@ -2337,7 +2344,6 @@ static void ggml_qnn_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; n_begin_time = ggml_time_us(); -#if 1 QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", src0->name, @@ -2355,17 +2361,23 @@ static void ggml_qnn_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(tensor_0)); QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(tensor_1)); QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(tensor_2)); -#endif QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; - src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type); - src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); - dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type); + src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type); + src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); + dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type); - std::string map_entry = std::string(ggml_op_name(ggmlop)); + uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], + (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; + uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], + (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; + uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], + (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; + + std::string map_entry = std::string(ggml_op_name(ggmlop)); if (instance->_qnn_graph_map.find(map_entry) != instance->_qnn_graph_map.end()) { graph_initialized = true; auto & graph_item = instance->_qnn_graph_map[map_entry]; @@ -2401,6 +2413,16 @@ static void ggml_qnn_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + 
QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; + QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); + QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; + QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; + QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); + QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; + QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; + QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); + QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; + Qnn_Tensor_t tensor_inputs[] = { *tensor_0, *tensor_1 @@ -2543,7 +2565,7 @@ static void ggml_qnn_hanlde_op(const enum ggml_op ggmlop, const ggml_tensor * sr } n_begin_time = ggml_time_us(); -#if 1 + QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", src0->name, @@ -2561,11 +2583,17 @@ static void ggml_qnn_hanlde_op(const enum ggml_op ggmlop, const ggml_tensor * sr QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(tensor_0)); QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(tensor_1)); QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(tensor_2)); -#endif QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; + uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], + (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; + uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], + (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; + uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], + (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; + std::string map_entry = std::string(ggml_op_name(ggmlop)); if (instance->_qnn_graph_map.find(map_entry) != instance->_qnn_graph_map.end()) { @@ -2606,6 +2634,16 @@ static void ggml_qnn_hanlde_op(const enum ggml_op ggmlop, const ggml_tensor * sr QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; + QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); + QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; + QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; + QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); + QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; + QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; + QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); + QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; + Qnn_Tensor_t tensor_inputs[] = { *tensor_0, *tensor_1 @@ -3125,10 +3163,9 @@ GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t char tensor_name[GGML_MAX_NAME] = { 0 }; snprintf(tensor_name, GGML_MAX_NAME, "tensor_%2d", idx++); - uint32_t dimensions[] = {(uint32_t) tensor->ne[0], (uint32_t) tensor->ne[1], (uint32_t) tensor->ne[2], (uint32_t) tensor->ne[3]}; - //TODO:only support FP32 & FP16 - Qnn_DataType_t qnn_data_type = QNN_DATATYPE_FLOAT_32; - Qnn_TensorType_t qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; + uint32_t dimensions[] = {(uint32_t) tensor->ne[0], (uint32_t) tensor->ne[1], (uint32_t) tensor->ne[2], (uint32_t) tensor->ne[3]}; + Qnn_DataType_t qnn_data_type = qnn_datatype_from_ggml_datatype(tensor->type); + Qnn_TensorType_t qnn_tensor_type= QNN_TENSOR_TYPE_APP_WRITE; if (tensor->flags & GGML_TENSOR_FLAG_INPUT) { 
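        // in QNN terms, QNN_TENSOR_TYPE_APP_WRITE marks a tensor the application writes into
        // (a graph input) and QNN_TENSOR_TYPE_APP_READ marks one it reads back (a graph output)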
qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; @@ -3365,7 +3402,7 @@ GGML_CALL static bool ggml_backend_qnn_supports_op(ggml_backend_t backend, const //note: this function be used with proposal/refined ggml backend subsystem in this PR: // https://github.com/ggerganov/llama.cpp/pull/7641 -// new ggml backend(only using system memory: ggml_backend_xxx_buffer_is_host return true) +// any ggml backend(only using system memory: ggml_backend_xxx_buffer_is_host return true) // can following this style for mixed inference between CPU&GPU / CPU&NPU very easily GGML_CALL static bool ggml_backend_qnn_offload_op(ggml_backend_t backend, const ggml_tensor * tensor) { GGML_UNUSED(backend); @@ -3481,7 +3518,7 @@ ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t device_index) { /** * - * @param device 0: QNN_BACKEND_CPU 1: QNN_BACKEND_GPU 2: QNN_BACKEND_NPU(aka HTP/DSP) + * @param device 0: QNN_BACKEND_CPU 1: QNN_BACKEND_GPU 2: QNN_BACKEND_NPU * @param qnn_lib_path qnn library path, such as "/data/local/tmp/" on Android or specified in JNI layer * @return */ @@ -3516,22 +3553,21 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { (path + ":/vendor/dsp/cdsp:/vendor/lib64:/vendor/dsp/dsp:/vendor/dsp/images").c_str(), 1)) { - QNN_LOG_INFO("QNN DSP backend setenv successfully"); + QNN_LOG_INFO("QNN NPU backend setenv successfully"); } else { - QNN_LOG_ERROR("QNN DSP backend setenv failure"); + QNN_LOG_ERROR("QNN NPU backend setenv failure"); } if (0 == setenv("ADSP_LIBRARY_PATH", (path + ";/vendor/dsp/cdsp;/vendor/lib/rfsa/adsp;/system/lib/rfsa/adsp;/vendor/dsp/dsp;/vendor/dsp/images;/dsp").c_str(), 1)) { - QNN_LOG_INFO("QNN DSP backend setenv successfully"); + QNN_LOG_INFO("QNN NPU backend setenv successfully"); } else { - QNN_LOG_ERROR("QNN DSP backend setenv failure"); + QNN_LOG_ERROR("QNN NPU backend setenv failure"); } } else { if (0 == setenv("LD_LIBRARY_PATH", - (path + - ":/vendor/dsp/cdsp:/vendor/lib64:/vendor/dsp/dsp:/vendor/dsp/images").c_str(), + path.c_str(), 1)) { QNN_LOG_INFO("%s backend setenv successfully\n", get_qnn_backend_name(device)); } else { diff --git a/ggml-qnn.h b/ggml-qnn.h index c61ebd25d9ba6..9ea3dcda62c64 100644 --- a/ggml-qnn.h +++ b/ggml-qnn.h @@ -10,19 +10,18 @@ extern "C" { #define GGML_QNN_MAX_DEVICES 3 -//QNN cDSP and HTA backend would not be used currently, just focus on QNN CPU/GPU/NPU(aka HTP/DSP) backend currently enum QNNBackend { QNN_BACKEND_CPU, QNN_BACKEND_GPU, QNN_BACKEND_NPU, - QNN_BACKEND_GGML, //"fake" QNN backend just for compare performance between QNN and original GGML + QNN_BACKEND_GGML, //"fake" QNN backend, used for compare performance between QNN and original GGML }; GGML_API int ggml_backend_qnn_reg_devices(void); /** * - * @param device 0: QNN_BACKEND_CPU 1: QNN_BACKEND_GPU 2: QNN_BACKEND_NPU(aka HTP/DSP) + * @param device 0: QNN_BACKEND_CPU 1: QNN_BACKEND_GPU 2: QNN_BACKEND_NPU * @param qnn_lib_path qnn library path, such as "/data/local/tmp/" on Android or specified in JNI layer * @return */ diff --git a/tests/ggml-qnn/ggml-qnn-ut-build-run.sh b/tests/ggml-qnn/ggml-qnn-ut-build-run.sh index c7bff2ee9c20e..192f2f4bda2f5 100755 --- a/tests/ggml-qnn/ggml-qnn-ut-build-run.sh +++ b/tests/ggml-qnn/ggml-qnn-ut-build-run.sh @@ -4,7 +4,8 @@ set -e #https://qpm.qualcomm.com/#/main/tools/details/qualcomm_ai_engine_direct #https://developer.qualcomm.com/software/hexagon-dsp-sdk/tools -QNN_SDK_PATH=/opt/qcom/aistack/qnn/2.20.0.240223/ +#QNN SDK released on 20240531 +QNN_SDK_PATH=/opt/qcom/aistack/qairt/2.23.0.240531/ 
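#the path above assumes a default local install; a possible tweak (not part of this patch)
#would be to honor an environment override instead of hardcoding it, e.g.
#  QNN_SDK_PATH=${QNN_SDK_PATH:-/opt/qcom/aistack/qairt/2.23.0.240531/}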
ANDROID_NDK=`pwd`/android-ndk-r26c ANDROID_PLATFORM=android-34 @@ -89,6 +90,23 @@ function remove_temp_dir() } +function update_qnn_libs() +{ + check_qnn_sdk + + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnSystem.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnCpu.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnGpu.so ${REMOTE_PATH}/ + + #the QNN NPU(aka HTP/DSP) backend only verified on Xiaomi14(Qualcomm SM8650-AB Snapdragon 8 Gen 3) successfully + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtp.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpNetRunExtensions.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpPrepare.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpV75Stub.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/hexagon-v75/unsigned/libQnnHtpV75Skel.so ${REMOTE_PATH}/ +} + + function check_qnn_libs() { #reuse the cached qnn libs in Android phone @@ -96,16 +114,7 @@ function check_qnn_libs() if [ $? -eq 0 ]; then printf "QNN libs already exist on Android phone\n" else - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnSystem.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnCpu.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnGpu.so ${REMOTE_PATH}/ - - #the QNN NPU(aka HTP/DSP) backend only verified on Xiaomi14(Qualcomm SM8650-AB Snapdragon 8 Gen 3) successfully - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtp.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpNetRunExtensions.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpPrepare.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpV75Stub.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/hexagon-v75/unsigned/libQnnHtpV75Skel.so ${REMOTE_PATH}/ + update_qnn_libs fi } @@ -155,7 +164,8 @@ function run_ggml_qnn_ut() function show_usage() { echo "Usage:" - echo " $0 build" + echo " $0 build (build Android command line UT program)" + echo " $0 updateqnnlibs (upload the latest QNN libs to Android phone)" echo " $0 GGML_OP_ADD 0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU)" echo " $0 GGML_OP_MUL 0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU)" echo " $0 GGML_OP_MUL_MAT 0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU)" @@ -183,6 +193,9 @@ elif [ $# == 1 ]; then elif [ "$1" == "build" ]; then build_ggml_qnn_ut exit 0 + elif [ "$1" == "updateqnnlibs" ]; then + update_qnn_libs + exit 0 else ggmlop=$1 qnnbackend=0 diff --git a/tests/ggml-qnn/ggml-qnn-ut.cpp b/tests/ggml-qnn/ggml-qnn-ut.cpp index 27967270bdcd4..1041252f3770f 100644 --- a/tests/ggml-qnn/ggml-qnn-ut.cpp +++ b/tests/ggml-qnn/ggml-qnn-ut.cpp @@ -87,7 +87,7 @@ static const char * get_qnn_backend_name(int n_backend_type) { case 1: return "QNN-GPU"; case 2: - return "QNN-NPU(HTP/DSP)"; + return "QNN-NPU"; case 3: return "ggml"; default: @@ -131,9 +131,54 @@ static bool ggml_graph_compute_helper( } -static void tensor_dump_elements(const ggml_tensor * tensor) { +#define QK8_0 32 +typedef struct { + uint16_t d; // delta + int8_t qs[QK8_0]; // quants +} block_q8_0; + + +static inline float ggml_compute_fp16_to_fp32(uint16_t h) { + __fp16 tmp; + memcpy(&tmp, &h, sizeof(uint16_t)); + return (float)tmp; +} +#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) + +static void tensor_dump(const ggml_tensor * tensor, const char * name) { + QNN_LOG_DEBUG("dump ggml tensor %s(%s): type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = 
(%5zi, %5zi, %5zi)\n", + name, tensor->name, + tensor->type, ggml_type_name(tensor->type), + tensor->ne[0], tensor->ne[1], tensor->ne[2], + tensor->nb[0], tensor->nb[1], tensor->nb[2]); + float value = 0; std::ostringstream tmposs; + if (nullptr == tensor) { + QNN_LOG_WARN("tensor is null"); + return; + } + if (tensor->type == GGML_TYPE_I8) { + for (int h = 0; h < tensor->ne[3]; h++) { + for (int i = 0; i < tensor->ne[2]; i++) { + for (int j = 0; j < tensor->ne[1]; j++) { + for (int k = 0; k < tensor->ne[0]; k++) { + value = ((int8_t *) tensor->data)[h * tensor->ne[2] + i * tensor->ne[1] + + j * tensor->ne[0] + k]; + tmposs << std::setw(8) << std::fixed << std::setprecision(2) << value + << " "; + } + tmposs << "\n"; + } + } + } + if (strlen(tmposs.str().c_str()) <= (GGML_QNN_LOGBUF_LEN - 96)) { + QNN_LOG_DEBUG("\n%s\n", tmposs.str().c_str()); + tmposs.clear(); + tmposs.str(""); + } + } + if (tensor->type == GGML_TYPE_F32) { for (int h = 0; h < tensor->ne[3]; h++) { for (int i = 0; i < tensor->ne[2]; i++) { @@ -144,31 +189,59 @@ static void tensor_dump_elements(const ggml_tensor * tensor) { tmposs << std::setw(8) << std::fixed << std::setprecision(2) << value << " "; } - if (strlen(tmposs.str().c_str()) <= (GGML_QNN_LOGBUF_LEN - 96)) { - QNN_LOG_DEBUG("%s", tmposs.str().c_str()); - } - tmposs.clear(); - tmposs.str(""); - //QNN_LOG_DEBUG("\n"); + tmposs << "\n"; } } } + if (strlen(tmposs.str().c_str()) <= (GGML_QNN_LOGBUF_LEN - 96)) { + QNN_LOG_DEBUG("\n%s\n", tmposs.str().c_str()); + tmposs.clear(); + tmposs.str(""); + } } - //QNN_LOG_DEBUG("\n"); -} - - -static void tensor_dump(const ggml_tensor * tensor, const char * name) { - QNN_LOG_DEBUG("dump ggml tensor %s(%s)", name, tensor->name); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)", - name, - tensor->type, ggml_type_name(tensor->type), - tensor->ne[0], tensor->ne[1], tensor->ne[2], - tensor->nb[0], tensor->nb[1], tensor->nb[2]); - tensor_dump_elements(tensor); + if (tensor->type == GGML_TYPE_F16) { + for (int h = 0; h < tensor->ne[3]; h++) { + for (int i = 0; i < tensor->ne[2]; i++) { + for (int j = 0; j < tensor->ne[1]; j++) { + for (int k = 0; k < tensor->ne[0]; k++) { + unsigned short tmpvalue = ((unsigned short *) tensor->data)[h * tensor->ne[2] + i * tensor->ne[1] + + j * tensor->ne[0] + k]; + value = GGML_FP16_TO_FP32(tmpvalue); + tmposs << std::setw(8) << std::fixed << std::setprecision(2) << value + << " "; + } + tmposs << "\n"; + } + } + } + if (strlen(tmposs.str().c_str()) <= (GGML_QNN_LOGBUF_LEN - 96)) { + QNN_LOG_DEBUG("\n%s\n", tmposs.str().c_str()); + tmposs.clear(); + tmposs.str(""); + } + } - QNN_LOG_DEBUG("\n"); + if (tensor->type == GGML_TYPE_Q8_0) { + block_q8_0 * tmp = ((block_q8_0 *)tensor->data); + for (int j = 0; j < tensor->ne[1]; j++) { + int n = tensor->ne[0] / QK8_0; //blocks per row + for (int z = 0; z < n; z++) { + const float d = GGML_FP16_TO_FP32(tmp[ j * n + z ].d); + for (int k = 0; k < QK8_0; k++) { + value = tmp[j * n + z].qs[k] * d; + tmposs << std::setw(8) << std::fixed << std::setprecision(2) << value + << " "; + } + } + tmposs << "\n"; + } + if (strlen(tmposs.str().c_str()) <= (GGML_QNN_LOGBUF_LEN - 96)) { + QNN_LOG_DEBUG("\n%s\n", tmposs.str().c_str()); + tmposs.clear(); + tmposs.str(""); + } + } } @@ -231,7 +304,8 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m t.join(); } if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_I32) { - ggml_backend_tensor_set(tensor, data.data(), 0, size 
* sizeof(float)); + //ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float)); + memcpy((char*)tensor->data, data.data(), size * sizeof(float)); } else if (ggml_is_quantized(tensor->type) || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_BF16) { GGML_ASSERT(size % ggml_blck_size(tensor->type) == 0); std::vector dataq(ggml_row_size(tensor->type, size)); @@ -246,10 +320,12 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m } ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size/tensor->ne[0], tensor->ne[0], im); GGML_ASSERT(ggml_validate_row_data(tensor->type, dataq.data(), dataq.size())); - ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size()); + //ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size()); + memcpy((char*)tensor->data, dataq.data(), dataq.size()); } else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) { // This is going to create some weird integers though. - ggml_backend_tensor_set(tensor, data.data(), 0, ggml_nbytes(tensor)); + //ggml_backend_tensor_set(tensor, data.data(), 0, ggml_nbytes(tensor)); + memcpy((char*)tensor->data, data.data(), ggml_nbytes(tensor)); } else { GGML_ASSERT(false); } @@ -276,16 +352,13 @@ static void show_usage() { } -int main(int argc, char * argv[]) { +static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) { int64_t n_begin_time = 0LL; int64_t n_end_time = 0LL; int64_t n_duration = 0LL; size_t ctx_size = 0; int sizey = 4; int sizex = 4; - int num_threads = 4; - int n_backend_type = QNN_BACKEND_CPU; - int n_ggml_op_type = GGML_OP_ADD; struct ggml_context * ctx = nullptr; struct ggml_cgraph * gf = nullptr; @@ -294,50 +367,23 @@ int main(int argc, char * argv[]) { struct ggml_tensor * dst = nullptr; ggml_backend_t backend = nullptr; ggml_backend_buffer_t buffer= nullptr; - ggml_type qtype = GGML_TYPE_F32; - std::vector work_buffer; - for (int i = 1; i < argc; i++) { - if (0 == strcmp(argv[i], "-t")) { - if (i + 1 < argc) { - if (0 == memcmp(argv[i + 1], "GGML_OP_ADD", 11)) { - n_ggml_op_type = GGML_OP_ADD; - } else if (0 == memcmp(argv[i + 1], "GGML_OP_MUL_MAT", 15)) { - n_ggml_op_type = GGML_OP_MUL_MAT; - } else if (0 == memcmp(argv[i + 1], "GGML_OP_MUL", 11)) { - n_ggml_op_type = GGML_OP_MUL; - } else { - show_usage(); - return 1; - } - i++; - } - } else if (0 == strcmp(argv[i], "-b")) { - if (i + 1 < argc) { - int backend = atoi(argv[i + 1]); - if (backend <= QNN_BACKEND_NPU) - n_backend_type = backend; - else { - show_usage(); - return 1; - } - i++; - } - } else { - show_usage(); - return 1; - } - } + ggml_type qtype = GGML_TYPE_I8; + qtype = GGML_TYPE_F32; + qtype = GGML_TYPE_F16; + qtype = GGML_TYPE_Q8_0; + std::vector work_buffer; QNN_LOG_DEBUG("enter qnn_ggml_op\n"); - QNN_LOG_DEBUG("ggml op:%d(%s)", n_ggml_op_type, ggml_op_name((enum ggml_op) n_ggml_op_type)); + QNN_LOG_DEBUG("ggml op:%d(%s)\n", n_ggml_op_type, ggml_op_name((enum ggml_op) n_ggml_op_type)); + n_begin_time = ggml_time_us(); srand(time(NULL)); ctx_size += 1024 * 1024 * 32; QNN_LOG_DEBUG("Allocating Memory of size %zi bytes, %zi MB\n", ctx_size, - (ctx_size / 1024 / 1024)); + (ctx_size / 1024 / 1024)); struct ggml_init_params params = { /*.mem_size =*/ ctx_size, @@ -349,7 +395,7 @@ int main(int argc, char * argv[]) { params.no_alloc = true; backend = ggml_backend_qnn_init(n_backend_type, "/data/local/tmp/"); if (nullptr == backend) { - QNN_LOG_ERROR("create qnn backend %d(%s) failed", n_backend_type, 
get_qnn_backend_name(n_backend_type)); + QNN_LOG_ERROR("create qnn backend %d(%s) failed\n", n_backend_type, get_qnn_backend_name(n_backend_type)); return 1; } } @@ -361,15 +407,25 @@ int main(int argc, char * argv[]) { } QNN_LOG_DEBUG("creating new tensors\n"); - QNN_LOG_DEBUG("ggml_blck_size(%s) %d", ggml_type_name(qtype), ggml_blck_size(qtype)); - QNN_LOG_DEBUG("ggml_type_size(%s) %d", ggml_type_name(qtype), ggml_type_size(qtype)); - if (qtype != GGML_TYPE_F32) { + QNN_LOG_DEBUG("ggml_blck_size(%s) %d\n", ggml_type_name(qtype), ggml_blck_size(qtype)); + QNN_LOG_DEBUG("ggml_type_size(%s) %d\n", ggml_type_name(qtype), ggml_type_size(qtype)); + if (ggml_is_quantized(qtype)) { sizex = ggml_blck_size(qtype); + + if (n_ggml_op_type == GGML_OP_MUL_MAT) { + sizex = ggml_blck_size(qtype) * 2; + } } + QNN_LOG_DEBUG("sizex %d\n", sizex); - src0 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey); + if (n_ggml_op_type == GGML_OP_MUL) { + src0 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey); + src1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey); + } else { + src0 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey); + src1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey); + } ggml_set_input(src0); - src1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey); ggml_set_input(src1); switch (n_ggml_op_type) { @@ -384,7 +440,7 @@ int main(int argc, char * argv[]) { break; default: QNN_LOG_WARN("ggml op %d(%s) not supported", n_ggml_op_type, - ggml_op_name((enum ggml_op) n_ggml_op_type)); + ggml_op_name((enum ggml_op) n_ggml_op_type)); ggml_free(ctx); ggml_backend_free(backend); return 3; @@ -407,17 +463,20 @@ int main(int argc, char * argv[]) { gf = ggml_new_graph(ctx); ggml_build_forward_expand(gf, dst); -#if 0 - ggml_set_f32(src0, (rand() % 100 + 1)); - ggml_set_f32(src1, (rand() % 100 + 1)); - ggml_set_f32(dst, 0.0f); -#else if (n_backend_type != QNN_BACKEND_GGML) { initialize_tensors(ctx); + } else { + if (qtype == GGML_TYPE_F32) { + ggml_set_f32(src0, (rand() % 100 + 1)); + } else { + initialize_tensors(ctx); + } + ggml_set_f32(src1, (rand() % 100 + 1)); + //ggml_set_f32(dst, 0.0f); } -#endif ggml_graph_compute_helper(backend, gf, work_buffer, num_threads, nullptr, nullptr); + if (get_tensor_data_size(dst) < (32 * 32)) { QNN_LOG_DEBUG("dump tensors:\n"); TENSOR_DUMP(src0); @@ -425,26 +484,69 @@ int main(int argc, char * argv[]) { TENSOR_DUMP(dst); } else { QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src0->name, - src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], - src0->nb[0], src0->nb[1], src0->nb[2]); + src0->name, + src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], + src0->nb[0], src0->nb[1], src0->nb[2]); QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src1->name, - src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], - src1->nb[0], src1->nb[1], src1->nb[2]); + src1->name, + src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], + src1->nb[0], src1->nb[1], src1->nb[2]); QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - dst->name, - dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], - dst->nb[1], dst->nb[2]); + dst->name, + dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], + dst->nb[1], dst->nb[2]); } 
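    // note: the duration reported below covers backend init, tensor setup and graph compute
    // together, so it should be read as end-to-end wall time rather than per-op latency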
ggml_free(ctx); ggml_backend_buffer_free(buffer); ggml_backend_free(backend); - n_end_time = ggml_time_us(); n_duration = (n_end_time - n_begin_time) / 1000; QNN_LOG_DEBUG("duration of ut GGML_OP_%s using QNN backend %s: %lld milliseconds\n", ggml_op_name((enum ggml_op)n_ggml_op_type), get_qnn_backend_name(n_backend_type), n_duration); + return 0; +} + + +int main(int argc, char * argv[]) { + int num_threads = 4; + int n_backend_type = QNN_BACKEND_CPU; + int n_ggml_op_type = GGML_OP_ADD; + + for (int i = 1; i < argc; i++) { + if (0 == strcmp(argv[i], "-t")) { + if (i + 1 < argc) { + if (0 == memcmp(argv[i + 1], "GGML_OP_ADD", 11)) { + n_ggml_op_type = GGML_OP_ADD; + } else if (0 == memcmp(argv[i + 1], "GGML_OP_MUL_MAT", 15)) { + n_ggml_op_type = GGML_OP_MUL_MAT; + } else if (0 == memcmp(argv[i + 1], "GGML_OP_MUL", 11)) { + n_ggml_op_type = GGML_OP_MUL; + } else { + show_usage(); + return 1; + } + i++; + } + } else if (0 == strcmp(argv[i], "-b")) { + if (i + 1 < argc) { + int backend = atoi(argv[i + 1]); + if (backend <= QNN_BACKEND_NPU) + n_backend_type = backend; + else { + show_usage(); + return 1; + } + i++; + } + } else { + show_usage(); + return 1; + } + } + + QNN_LOG_DEBUG("enter qnn_ggml_op\n"); + QNN_LOG_DEBUG("backend %d, ggml op:%d(%s)", n_backend_type, n_ggml_op_type, ggml_op_name((enum ggml_op) n_ggml_op_type)); + qnn_op_ut(num_threads, n_backend_type, n_ggml_op_type); return 0; } From f4c53037abff299f20a1d40e1247e29d2d7b82dc Mon Sep 17 00:00:00 2001 From: "zhou.weiguo" Date: Thu, 6 Jun 2024 20:24:03 +0800 Subject: [PATCH 006/143] review: remove unused QNN helper functions --- ggml-qnn.cpp | 404 +-------------------------------------------------- 1 file changed, 8 insertions(+), 396 deletions(-) diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp index d0927f22e514a..e81704305e988 100644 --- a/ggml-qnn.cpp +++ b/ggml-qnn.cpp @@ -54,6 +54,7 @@ #include #endif + // ================================================================================================= // // forward/external/helper declaration @@ -61,6 +62,7 @@ // ================================================================================================= class qnn_instance; + static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...); @@ -74,7 +76,7 @@ static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const #define GGML_QNN_LOGBUF_LEN 4096 -#define GGML_QNN_DEBUG 1 //for troubleshooting QNN backend +#define GGML_QNN_DEBUG 0 //for troubleshooting QNN backend #define QNN_LOG_ERROR(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) #define QNN_LOG_WARN(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) @@ -86,6 +88,8 @@ static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const #define QNN_LOG_DEBUG(...) 
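// with GGML_QNN_DEBUG set to 0 above, QNN_LOG_DEBUG expands to nothing and the per-op
// debug dumps are compiled out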
#endif +#define QNN_VER_PTR(x) (&((x).v1)) + #define VALIDATE(value, status) \ do { \ @@ -98,34 +102,6 @@ static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const #define VALIDATE_TENSOR_VERSION(tensor, err) VALIDATE(validate_tensor_version(tensor), err) -#define VALIDATE_OP_CONFIG_VERSION(op, err) VALIDATE(validate_op_config_version(op), err) - -#define QNN_VER_PTR(x) (&((x).v1)) -#define QNN_OP_CFG_VALID(op_config) ((op_config).version == QNN_OPCONFIG_VERSION_1) - -#define QNN_OP_CFG_GET_NAME(op_config) get_qnn_oponfig_name(op_config) -#define QNN_OP_CFG_GET_PACKAGE_NAME(op_config) get_qnn_op_config_packagename(op_config) -#define QNN_OP_CFG_GET_TYPE_NAME(op_config) get_qnn_op_config_typename(op_config) -#define QNN_OP_CFG_GET_NUM_PARAMS(op_config) get_qnn_op_config_numparams(op_config) -#define QNN_OP_CFG_GET_PARAMS(op_config) get_qnn_op_config_params(op_config) -#define QNN_OP_CFG_GET_NUM_INPUTS(op_config) get_qnn_op_config_numinputs(op_config) -#define QNN_OP_CFG_GET_INPUTS(op_config) get_qnn_op_config_inputs(op_config) -#define QNN_OP_CFG_GET_NUM_OUTPUTS(op_config) get_qnn_op_config_numoutputs(op_config) -#define QNN_OP_CFG_GET_OUTPUTS(op_config) get_qnn_op_config_outputs(op_config) - -#define QNN_OP_CFG_SET_NAME(op_config, value) set_qnn_op_config_name(op_config, value) -#define QNN_OP_CFG_SET_PACKAGE_NAME(op_config, value) set_qnn_op_config_packagename(op_config, value) -#define QNN_OP_CFG_SET_TYPE_NAME(op_config, value) set_qnn_op_config_typename(op_config, value) - -#define QNN_OP_CFG_SET_PARAMS(op_config, num_of_params, params) \ - set_qnn_op_config_params(op_config, num_of_params, params) - -#define QNN_OP_CFG_SET_INPUTS(op_config, num_of_inputs, inputTensors) \ - set_qnn_op_config_inputs(op_config, num_of_inputs, inputTensors) - -#define QNN_OP_CFG_SET_OUTPUTS(op_config, num_of_outputs, output_tensors) \ - set_qnn_op_config_outputs(op_config, num_of_outputs, output_tensors) - #define QNN_TENSOR_GET_ID(tensor) get_qnn_tensorid(tensor) #define QNN_TENSOR_GET_NAME(tensor) get_qnn_tensorname(tensor) #define QNN_TENSOR_GET_TYPE(tensor) get_qnn_tensortype(tensor) @@ -135,8 +111,6 @@ static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const #define QNN_TENSOR_GET_RANK(tensor) get_qnn_tensor_rank(tensor) #define QNN_TENSOR_GET_DIMENSIONS(tensor) get_qnn_tensor_dimensions(tensor) #define QNN_TENSOR_GET_MEM_TYPE(tensor) get_qnn_tensor_memtype(tensor) -#define QNN_TENSOR_GET_CLIENT_BUF(tensor) get_qnn_tensor_clientbuf(tensor) -#define QNN_TENSOR_GET_MEM_HANDLE(tensor) get_qnn_tensor_memhandle(tensor) #define QNN_TENSOR_SET_ID(tensor, value) set_qnn_tensor_id(tensor, value) #define QNN_TENSOR_SET_NAME(tensor, value) set_qnn_tensor_name(tensor, value) @@ -150,7 +124,6 @@ static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const #define QNN_TENSOR_SET_CLIENT_BUF(tensor, value) set_qnn_tensor_clientbuf(tensor, value) #define QNN_TENSOR_SET_MEM_HANDLE(tensor, value) set_qnn_tensor_memhandle(tensor, value) - using pfn_rpc_mem_init = void (*)(void); using pfn_rpc_mem_deinit = void (*)(void); using pfn_rpc_mem_alloc = void *(*)(int, uint32_t, int); @@ -204,6 +177,7 @@ static int g_current_device = QNN_BACKEND_GGML; //Qualcomm CPU: Qualcomm Kryo CPU //Qualcomm GPU: Qualcomm Adreno GPU //Qualcomm NPU: aka HTP(Hexagon Tensor Processor), ~= cDSP(Compute DSP) + HMX(Hexagon Matrix eXtensions)/HTA(Hexagon Tensor Accelerator) + static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { [QNN_BACKEND_CPU] = {.device = 0, 
.threads = 1, .name = "qnn-cpu", .lib = "libQnnCpu.so", .instance = nullptr, .backend = nullptr, .raw_interface = {}, .raw_system_interface = {}}, [QNN_BACKEND_GPU] = {.device = 1, .threads = 1, .name = "qnn-gpu", .lib = "libQnnGpu.so", .instance = nullptr, .backend = nullptr, .raw_interface = {}, .raw_system_interface = {}}, @@ -227,221 +201,6 @@ static inline int validate_tensor_version(Qnn_Tensor_t tensor) { } -[[maybe_unused]] static inline int validate_op_config_version(Qnn_OpConfig_t op_config) { - if (op_config.version != QNN_OPCONFIG_VERSION_1) { - QNN_LOG_WARN("validate_op_config_version() op %s, got unsupported version %d\n", - op_config.v1.name, - op_config.version); - return 1; - } - return 0; -} - - -static inline const char * get_qnn_oponfig_name(const Qnn_OpConfig_t & op_config) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - return op_config.v1.name; - } - return nullptr; -} - - -[[maybe_unused]] static inline const char * get_qnn_oponfig_name(const Qnn_OpConfig_t * op_config) { - return get_qnn_oponfig_name(*op_config); -} - - -static inline const char * get_qnn_op_config_packagename(const Qnn_OpConfig_t & op_config) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - return op_config.v1.packageName; - } - return nullptr; -} - - -[[maybe_unused]] static inline const char * get_qnn_op_config_packagename(const Qnn_OpConfig_t * op_config) { - return get_qnn_op_config_packagename(*op_config); -} - - -static inline const char * get_qnn_op_config_typename(const Qnn_OpConfig_t & op_config) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - return op_config.v1.typeName; - } - return nullptr; -} - - -[[maybe_unused]] static inline const char * get_qnn_op_config_typename(const Qnn_OpConfig_t * op_config) { - return get_qnn_op_config_typename(*op_config); -} - - -static inline uint32_t get_qnn_op_config_numparams(const Qnn_OpConfig_t & op_config) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - return op_config.v1.numOfParams; - } - return 0u; -} - - -[[maybe_unused]] static inline uint32_t get_qnn_op_config_numparams(const Qnn_OpConfig_t * op_config) { - return get_qnn_op_config_numparams(*op_config); -} - - -static inline const Qnn_Param_t * get_qnn_op_config_params(const Qnn_OpConfig_t & op_config) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - return op_config.v1.params; - } - return nullptr; -} - - -[[maybe_unused]] static inline const Qnn_Param_t * get_qnn_op_config_params(const Qnn_OpConfig_t * op_config) { - return get_qnn_op_config_params(*op_config); -} - - -static inline uint32_t get_qnn_op_config_numinputs(const Qnn_OpConfig_t & op_config) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - return op_config.v1.numOfInputs; - } - return 0u; -} - - -[[maybe_unused]] static inline uint32_t get_qnn_op_config_numinputs(const Qnn_OpConfig_t * op_config) { - return get_qnn_op_config_numinputs(*op_config); -} - - -static inline const Qnn_Tensor_t * get_qnn_op_config_inputs(const Qnn_OpConfig_t & op_config) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - return op_config.v1.inputTensors; - } - return nullptr; -} - - -[[maybe_unused]] static inline const Qnn_Tensor_t * get_qnn_op_config_inputs(const Qnn_OpConfig_t * op_config) { - return get_qnn_op_config_inputs(*op_config); -} - - -static inline uint32_t get_qnn_op_config_numoutputs(const Qnn_OpConfig_t & op_config) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - return op_config.v1.numOfOutputs; - } - return 0u; -} - - -[[maybe_unused]] static inline uint32_t 
get_qnn_op_config_numoutputs(const Qnn_OpConfig_t * op_config) { - return get_qnn_op_config_numoutputs(*op_config); -} - - -static inline const Qnn_Tensor_t * get_qnn_op_config_outputs(const Qnn_OpConfig_t & op_config) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - return op_config.v1.outputTensors; - } - return nullptr; -} - - -[[maybe_unused]] static inline const Qnn_Tensor_t * get_qnn_op_config_outputs(const Qnn_OpConfig_t * op_config) { - return get_qnn_op_config_outputs(*op_config); -} - - -static inline void set_qnn_op_config_name(Qnn_OpConfig_t & op_config, const char * name) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - op_config.v1.name = name; - } -} - - -[[maybe_unused]] static inline void set_qnn_op_config_name(Qnn_OpConfig_t * op_config, const char * name) { - set_qnn_op_config_name(*op_config, name); -} - - -static inline void set_qnn_op_config_packagename(Qnn_OpConfig_t & op_config, const char * package_name) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - op_config.v1.packageName = package_name; - } -} - - -[[maybe_unused]] static inline void set_qnn_op_config_packagename(Qnn_OpConfig_t * op_config, const char * package_name) { - set_qnn_op_config_packagename(*op_config, package_name); -} - - -static inline void set_qnn_op_config_typename(Qnn_OpConfig_t & op_config, const char * type_name) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - op_config.v1.typeName = type_name; - } -} - - -[[maybe_unused]] static inline void set_qnn_op_config_typename(Qnn_OpConfig_t * op_config, const char * type_name) { - set_qnn_op_config_typename(*op_config, type_name); -} - - -static inline void set_qnn_op_config_params(Qnn_OpConfig_t & op_config, - uint32_t num_of_params, - Qnn_Param_t * params) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - op_config.v1.numOfParams = num_of_params; - op_config.v1.params = params; - } -} - - -[[maybe_unused]] static inline void set_qnn_op_config_params(Qnn_OpConfig_t * op_config, - uint32_t num_of_params, - Qnn_Param_t * params) { - set_qnn_op_config_params(*op_config, num_of_params, params); -} - - -static inline void set_qnn_op_config_inputs(Qnn_OpConfig_t & op_config, - uint32_t num_of_inputs, - Qnn_Tensor_t * input_tensors) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - op_config.v1.numOfInputs = num_of_inputs; - op_config.v1.inputTensors = input_tensors; - } -} - - -[[maybe_unused]] static inline void set_qnn_op_config_inputs(Qnn_OpConfig_t * op_config, - uint32_t num_of_inputs, - Qnn_Tensor_t * input_tensors) { - set_qnn_op_config_inputs(*op_config, num_of_inputs, input_tensors); -} - - -static inline void set_qnn_op_config_outputs(Qnn_OpConfig_t & op_config, - uint32_t num_of_outputs, - Qnn_Tensor_t * output_tensors) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - op_config.v1.numOfOutputs = num_of_outputs; - op_config.v1.outputTensors = output_tensors; - } -} - - -[[maybe_unused]] static inline void set_qnn_op_config_outputs(Qnn_OpConfig_t * op_config, - uint32_t num_of_outputs, - Qnn_Tensor_t * output_tensors) { - set_qnn_op_config_outputs(*op_config, num_of_outputs, output_tensors); -} - - static inline uint32_t get_qnn_tensorid(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.id; @@ -451,11 +210,6 @@ static inline uint32_t get_qnn_tensorid(const Qnn_Tensor_t & tensor) { } -[[maybe_unused]] static inline uint32_t get_qnn_tensorid(const Qnn_Tensor_t * tensor) { - return get_qnn_tensorid(*tensor); -} - - static inline const char * 
get_qnn_tensorname(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.name; @@ -464,10 +218,6 @@ static inline const char * get_qnn_tensorname(const Qnn_Tensor_t & tensor) { } -static inline const char * get_qnn_tensorname(const Qnn_Tensor_t * tensor) { - return get_qnn_tensorname(*tensor); -} - static inline Qnn_TensorType_t get_qnn_tensortype(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { @@ -477,11 +227,6 @@ static inline Qnn_TensorType_t get_qnn_tensortype(const Qnn_Tensor_t & tensor) { } -[[maybe_unused]] static inline Qnn_TensorType_t get_qnn_tensortype(const Qnn_Tensor_t * tensor) { - return get_qnn_tensortype(*tensor); -} - - static inline Qnn_TensorDataFormat_t get_qnn_tensor_dataformat(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.dataFormat; @@ -490,11 +235,6 @@ static inline Qnn_TensorDataFormat_t get_qnn_tensor_dataformat(const Qnn_Tensor_ } -[[maybe_unused]] static inline Qnn_TensorDataFormat_t get_qnn_tensor_dataformat(const Qnn_Tensor_t * tensor) { - return get_qnn_tensor_dataformat(*tensor); -} - - static inline Qnn_DataType_t get_qnn_tensor_datatype(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.dataType; @@ -503,11 +243,6 @@ static inline Qnn_DataType_t get_qnn_tensor_datatype(const Qnn_Tensor_t & tensor } -[[maybe_unused]] static inline Qnn_DataType_t get_qnn_tensor_datatype(const Qnn_Tensor_t * tensor) { - return get_qnn_tensor_datatype(*tensor); -} - - static inline Qnn_QuantizeParams_t get_qnn_tensor_quantparams(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.quantizeParams; @@ -516,11 +251,6 @@ static inline Qnn_QuantizeParams_t get_qnn_tensor_quantparams(const Qnn_Tensor_t } -[[maybe_unused]] static inline Qnn_QuantizeParams_t get_qnn_tensor_quantparams(const Qnn_Tensor_t * tensor) { - return get_qnn_tensor_quantparams(*tensor); -} - - static inline uint32_t get_qnn_tensor_rank(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.rank; @@ -529,11 +259,6 @@ static inline uint32_t get_qnn_tensor_rank(const Qnn_Tensor_t & tensor) { } -[[maybe_unused]] static inline uint32_t get_qnn_tensor_rank(const Qnn_Tensor_t * tensor) { - return get_qnn_tensor_rank(*tensor); -} - - static inline uint32_t * get_qnn_tensor_dimensions(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.dimensions; @@ -542,11 +267,6 @@ static inline uint32_t * get_qnn_tensor_dimensions(const Qnn_Tensor_t & tensor) } -[[maybe_unused]] static inline uint32_t * get_qnn_tensor_dimensions(const Qnn_Tensor_t * tensor) { - return get_qnn_tensor_dimensions(*tensor); -} - - static inline Qnn_TensorMemType_t get_qnn_tensor_memtype(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.memType; @@ -555,37 +275,6 @@ static inline Qnn_TensorMemType_t get_qnn_tensor_memtype(const Qnn_Tensor_t & te } -[[maybe_unused]] static inline Qnn_TensorMemType_t get_qnn_tensor_memtype(const Qnn_Tensor_t * tensor) { - return get_qnn_tensor_memtype(*tensor); -} - - -static inline Qnn_ClientBuffer_t get_qnn_tensor_clientbuf(const Qnn_Tensor_t & tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.clientBuf; - } - return QNN_CLIENT_BUFFER_INIT; -} - - -[[maybe_unused]] static inline Qnn_ClientBuffer_t get_qnn_tensor_clientbuf(const Qnn_Tensor_t * tensor) { - return 
get_qnn_tensor_clientbuf(*tensor); -} - - -static inline Qnn_MemHandle_t get_qnn_tensor_memhandle(const Qnn_Tensor_t & tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.memHandle; - } - return nullptr; -} - - -[[maybe_unused]] static inline Qnn_MemHandle_t get_qnn_tensor_memhandle(const Qnn_Tensor_t * tensor) { - return get_qnn_tensor_memhandle(*tensor); -} - - static inline void set_qnn_tensor_id(Qnn_Tensor_t & tensor, uint32_t id) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.id = id; @@ -593,11 +282,6 @@ static inline void set_qnn_tensor_id(Qnn_Tensor_t & tensor, uint32_t id) { } -[[maybe_unused]] static inline void set_qnn_tensor_id(Qnn_Tensor_t * tensor, uint32_t id) { - set_qnn_tensor_id(*tensor, id); -} - - static inline void set_qnn_tensor_name(Qnn_Tensor_t & tensor, const char * name) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.name = name; @@ -605,11 +289,6 @@ static inline void set_qnn_tensor_name(Qnn_Tensor_t & tensor, const char * name) } -[[maybe_unused]] static inline void set_qnn_tensor_name(Qnn_Tensor_t * tensor, const char * name) { - set_qnn_tensor_name(*tensor, name); -} - - static inline void set_qnn_tensor_type(Qnn_Tensor_t & tensor, Qnn_TensorType_t type) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.type = type; @@ -617,11 +296,6 @@ static inline void set_qnn_tensor_type(Qnn_Tensor_t & tensor, Qnn_TensorType_t t } -[[maybe_unused]] static inline void set_qnn_tensor_type(Qnn_Tensor_t * tensor, Qnn_TensorType_t type) { - set_qnn_tensor_type(*tensor, type); -} - - static inline void set_qnn_tensor_dataformat(Qnn_Tensor_t & tensor, Qnn_TensorDataFormat_t format) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.dataFormat = format; @@ -629,11 +303,6 @@ static inline void set_qnn_tensor_dataformat(Qnn_Tensor_t & tensor, Qnn_TensorDa } -[[maybe_unused]] static inline void set_qnn_tensor_dataformat(Qnn_Tensor_t * tensor, Qnn_TensorDataFormat_t format) { - set_qnn_tensor_dataformat(*tensor, format); -} - - static inline void set_qnn_tensor_datatype(Qnn_Tensor_t & tensor, Qnn_DataType_t dataType) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.dataType = dataType; @@ -641,11 +310,6 @@ static inline void set_qnn_tensor_datatype(Qnn_Tensor_t & tensor, Qnn_DataType_t } -[[maybe_unused]] static inline void set_qnn_tensor_datatype(Qnn_Tensor_t * tensor, Qnn_DataType_t dataType) { - set_qnn_tensor_datatype(*tensor, dataType); -} - - static inline void set_qnn_tensor_quantparams(Qnn_Tensor_t & tensor, Qnn_QuantizeParams_t params) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.quantizeParams = params; @@ -653,11 +317,6 @@ static inline void set_qnn_tensor_quantparams(Qnn_Tensor_t & tensor, Qnn_Quantiz } -[[maybe_unused]] static inline void set_qnn_tensor_quantparams(Qnn_Tensor_t * tensor, Qnn_QuantizeParams_t params) { - set_qnn_tensor_quantparams(*tensor, params); -} - - static inline void set_qnn_tensor_rank(Qnn_Tensor_t & tensor, uint32_t rank) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.rank = rank; @@ -665,11 +324,6 @@ static inline void set_qnn_tensor_rank(Qnn_Tensor_t & tensor, uint32_t rank) { } -[[maybe_unused]] static inline void set_qnn_tensor_rank(Qnn_Tensor_t * tensor, uint32_t rank) { - set_qnn_tensor_rank(*tensor, rank); -} - - static inline void set_qnn_tensor_dimensions(Qnn_Tensor_t & tensor, uint32_t * dims) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.dimensions = dims; @@ -677,11 +331,6 @@ static inline void 
set_qnn_tensor_dimensions(Qnn_Tensor_t & tensor, uint32_t * d } -[[maybe_unused]] static inline void set_qnn_tensor_dimensions(Qnn_Tensor_t * tensor, uint32_t * dims) { - set_qnn_tensor_dimensions(*tensor, dims); -} - - static inline void set_qnn_tensor_memtype(Qnn_Tensor_t & tensor, Qnn_TensorMemType_t memType) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.memType = memType; @@ -689,11 +338,6 @@ static inline void set_qnn_tensor_memtype(Qnn_Tensor_t & tensor, Qnn_TensorMemTy } -[[maybe_unused]] static inline void set_qnn_tensor_memtype(Qnn_Tensor_t * tensor, Qnn_TensorMemType_t memType) { - set_qnn_tensor_memtype(*tensor, memType); -} - - static inline void set_qnn_tensor_clientbuf(Qnn_Tensor_t & tensor, Qnn_ClientBuffer_t clientBuf) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.clientBuf = clientBuf; @@ -701,11 +345,6 @@ static inline void set_qnn_tensor_clientbuf(Qnn_Tensor_t & tensor, Qnn_ClientBuf } -[[maybe_unused]] static inline void set_qnn_tensor_clientbuf(Qnn_Tensor_t * tensor, Qnn_ClientBuffer_t clientBuf) { - set_qnn_tensor_clientbuf(*tensor, clientBuf); -} - - static inline void set_qnn_tensor_memhandle(Qnn_Tensor_t & tensor, Qnn_MemHandle_t handle) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.memHandle = handle; @@ -713,11 +352,6 @@ static inline void set_qnn_tensor_memhandle(Qnn_Tensor_t & tensor, Qnn_MemHandle } -[[maybe_unused]] static inline void set_qnn_tensor_memhandle(Qnn_Tensor_t * tensor, Qnn_MemHandle_t handle) { - set_qnn_tensor_memhandle(*tensor, handle); -} - - static size_t memscpy(void * dst, size_t dstSize, const void * src, size_t copySize) { if (!dst || !src || !dstSize || !copySize) return 0; @@ -824,19 +458,6 @@ static int free_qnn_tensor(Qnn_Tensor_t & tensor) { } -[[maybe_unused]] static int free_qnn_tensors(Qnn_Tensor_t *& tensors, uint32_t num_tensors) { - int err = 0; - - // free all pointer allocations in struct - for (size_t i = 0; i < num_tensors; i++) { - free_qnn_tensor(tensors[i]); - } - free(tensors); - - return err; -} - - static uint32_t ggml_get_tensor_rank(const ggml_tensor * tensor) { uint32_t rank = 0; for (int i = 0; i < GGML_MAX_DIMS; i++) { @@ -3137,7 +2758,7 @@ static const char * ggml_backend_qnn_buffer_get_name(ggml_backend_buffer_t buffe } -[[maybe_unused]] GGML_CALL static bool ggml_backend_buffer_is_qnn(ggml_backend_buffer_t buffer) { +GGML_CALL static bool ggml_backend_buffer_is_qnn(ggml_backend_buffer_t buffer) { return buffer->iface.get_name == ggml_backend_qnn_buffer_get_name; } @@ -3236,15 +2857,6 @@ GGML_CALL static void ggml_backend_qnn_buffer_clear(ggml_backend_buffer_t buffer } -[[maybe_unused]] GGML_CALL static void ggml_backend_qnn_buffer_reset(ggml_backend_buffer_t buffer) { - ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *) buffer->context; - for (auto * sub_buffer : ctx->sub_buffers) { - free(sub_buffer); - } - ctx->sub_buffers.clear(); -} - - static ggml_backend_buffer_i ggml_backend_qnn_buffer_interface = { /* .get_name = */ ggml_backend_qnn_buffer_get_name, /* .free_buffer = */ ggml_backend_qnn_buffer_free_buffer, @@ -3402,7 +3014,7 @@ GGML_CALL static bool ggml_backend_qnn_supports_op(ggml_backend_t backend, const //note: this function be used with proposal/refined ggml backend subsystem in this PR: // https://github.com/ggerganov/llama.cpp/pull/7641 -// any ggml backend(only using system memory: ggml_backend_xxx_buffer_is_host return true) +// new ggml backend(only using system memory: ggml_backend_xxx_buffer_is_host return true) // can 
following this style for mixed inference between CPU&GPU / CPU&NPU very easily GGML_CALL static bool ggml_backend_qnn_offload_op(ggml_backend_t backend, const ggml_tensor * tensor) { GGML_UNUSED(backend); From 2fab33d8250db70e872a12af7ffd41af04592acc Mon Sep 17 00:00:00 2001 From: "zhou.weiguo" Date: Fri, 7 Jun 2024 12:51:04 +0800 Subject: [PATCH 007/143] ggml-qnn: remove static global vars to support multi-instance simultaneously --- ggml-qnn.cpp | 250 +++++++++++++++------------------ tests/ggml-qnn/ggml-qnn-ut.cpp | 3 +- 2 files changed, 113 insertions(+), 140 deletions(-) diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp index e81704305e988..867f01625ad7f 100644 --- a/ggml-qnn.cpp +++ b/ggml-qnn.cpp @@ -76,7 +76,7 @@ static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const #define GGML_QNN_LOGBUF_LEN 4096 -#define GGML_QNN_DEBUG 0 //for troubleshooting QNN backend +#define GGML_QNN_DEBUG 1 //for troubleshooting QNN backend #define QNN_LOG_ERROR(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) #define QNN_LOG_WARN(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) @@ -89,7 +89,7 @@ static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const #endif #define QNN_VER_PTR(x) (&((x).v1)) - +#define GGML_QNN_NAME "qnn" #define VALIDATE(value, status) \ do { \ @@ -135,8 +135,6 @@ using _pfn_QnnInterface_getProviders = decltype(QnnInterface_ using _pfn_QnnSystemInterface_getProviders = decltype(QnnSystemInterface_getProviders); -typedef void (* ggml_qnn_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); -typedef void (* ggml_qnn_func_common_t)(const ggml_op ggml_op, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); enum class ggml_qnn_profile_level { profile_off = 0, @@ -144,7 +142,6 @@ enum class ggml_qnn_profile_level { profile_detail = 2 }; - struct ggml_backend_qnn_context { int device; int threads; @@ -156,15 +153,16 @@ struct ggml_backend_qnn_context { QNN_SYSTEM_INTERFACE_VER_TYPE raw_system_interface; } ; +typedef void (* ggml_qnn_func_t)(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); + +typedef void (* ggml_qnn_func_common_t)(ggml_backend_qnn_context * ctx, const ggml_op ggml_op, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); // ================================================================================================= // // static global variables // // ================================================================================================= -static ggml_backend_t g_qnn_backend = nullptr; - -static int g_current_device = QNN_BACKEND_GGML; +//static ggml_backend_t g_qnn_backend = nullptr; //according to the QNN SDK Reference Guide, //CPU - Choose a non-quantized model. 
Quantized models are currently incompatible with the CPU backend @@ -184,7 +182,6 @@ static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { [QNN_BACKEND_NPU] = {.device = 2, .threads = 1, .name = "qnn-npu", .lib = "libQnnHtp.so", .instance = nullptr, .backend = nullptr, .raw_interface = {}, .raw_system_interface = {}}, }; - // ================================================================================================= // // QNN helper functions and other internal helper functions @@ -1010,7 +1007,7 @@ void qnn_instance::free_rpcmem(void * buf) { } -int32_t qnn_instance::rpcmem_to_fd(void *buf) { +int32_t qnn_instance::rpcmem_to_fd(void * buf) { int32_t mem_fd = -1; if (!is_rpcmem_initialized()) { QNN_LOG_WARN("rpc memory not initialized\n"); @@ -1168,33 +1165,6 @@ int qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t * _loaded_lib_handle[backend_id] = lib_handle; _backend_id = backend_id; -#if 0 //comment it for purpose of reduce size of APK - QnnSaver_Config_t outputdir_cfg; - outputdir_cfg.option = QNN_SAVER_CONFIG_OPTION_OUTPUT_DIRECTORY; - outputdir_cfg.outputDirectory = "/data/local/tmp/"; - - QnnSaver_Config_t backendid_cfg; - backendid_cfg.option = QNN_SAVER_CONFIG_OPTION_BACKEND_ID; - backendid_cfg.backendId = _backend_id; - const QnnSaver_Config_t *saverCfg[] = {&outputdir_cfg, &backendid_cfg, nullptr}; - if (0 == QnnSaver_initialize(saverCfg)) { - QNN_LOG_INFO("QnnSaver_initialize successfully"); - } else { - QNN_LOG_WARN("QnnSaver_initialize failure"); - } -#endif - auto saver_initialize = load_qnn_functionpointers<_pfn_QnnSaver_initialize *>( - _loaded_lib_handle[backend_id], "QnnSaver_initialize"); - if (nullptr != saver_initialize) { - error = saver_initialize(saver_config); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to saver_initialize,error %d", QNN_GET_ERROR_CODE(error)); - return 7; - } - } else { - QNN_LOG_WARN("saver_initialize is null\n"); - } - return 0; } @@ -1345,14 +1315,15 @@ static void ggml_qnn_logcallback(const char * fmt, } double ms = (double) timestamp / 1000000.0; - - if (0) { +#if GGML_QNN_DEBUG + { std::lock_guard lock(log_mutex); memset(s_ggml_qnn_logbuf, 0, GGML_QNN_LOGBUF_LEN); vsnprintf(reinterpret_cast(s_ggml_qnn_logbuf), GGML_QNN_LOGBUF_LEN, fmt, argp); QNN_LOG_DEBUG("%8.1fms [%-7s] %s\n", ms, log_level_desc, s_ggml_qnn_logbuf); } +#endif } @@ -1390,11 +1361,7 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { _qnn_interface.set_qnn_interface(_loaded_backend[backend_id]); -#if 1 _qnn_interface.qnn_log_create(ggml_qnn_logcallback, _qnn_log_level, &_qnn_log_handle); -#else - _qnn_raw_interface.logCreate(ggml_qnn_logcallback, _qnn_log_level, &_qnn_log_handle); -#endif if (nullptr == _qnn_log_handle) { QNN_LOG_WARN("why failed to initialize qnn log\n"); //NPU backend not work on Qualcomm SoC based low-end phone return 4; @@ -1437,7 +1404,7 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { if (QNN_PROFILE_NO_ERROR != _qnn_raw_interface.profileCreate( _qnn_backend_handle, QNN_PROFILE_LEVEL_BASIC, &_qnn_profile_handle)) { QNN_LOG_WARN("unable to create profile handle in the backend\n"); - return 7; + return 6; } else { QNN_LOG_DEBUG("initialize qnn profile successfully\n"); } @@ -1456,7 +1423,7 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { _rpc_lib_handle = dlopen("libcdsprpc.so", RTLD_NOW | RTLD_LOCAL); if (nullptr == _rpc_lib_handle) { QNN_LOG_WARN("failed to load qualcomm's rpc lib, error:%s\n", dlerror()); - return 9; + 
return 8; } else { QNN_LOG_DEBUG("load rpcmem lib successfully\n"); set_rpcmem_initialized(true); @@ -1470,7 +1437,7 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { || nullptr == _pfn_rpc_mem_to_fd) { QNN_LOG_WARN("unable to access symbols in QNN RPC lib. dlerror(): %s", dlerror()); dlclose(_rpc_lib_handle); - return 10; + return 9; } if (nullptr != _pfn_rpc_mem_init) // make Qualcomm's SoC based low-end phone happy @@ -1483,7 +1450,7 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { &_qnn_context_handle); if (nullptr == _qnn_context_handle) { QNN_LOG_WARN("why failed to initialize qnn context\n"); - return 8; + return 10; } else { QNN_LOG_DEBUG("initialize qnn context successfully\n"); } @@ -1695,7 +1662,7 @@ static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor, bool b_dum } -static void ggml_qnn_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { Qnn_ErrorHandle_t error = QNN_SUCCESS; bool graph_initialized = false; int64_t n_begin_time = 0LL; @@ -1703,7 +1670,6 @@ static void ggml_qnn_add(const ggml_tensor * src0, const ggml_tensor * src1, ggm int64_t n_duration = 0LL; qnn_instance * instance = nullptr; - struct ggml_backend_qnn_context * ctx = nullptr; std::string graph_name = "ggml_op_qnn_add"; Qnn_GraphHandle_t graph_handle = nullptr; @@ -1727,7 +1693,6 @@ static void ggml_qnn_add(const ggml_tensor * src0, const ggml_tensor * src1, ggm QNN_LOG_WARN("pls check why QNN tensor is null"); return; } - ctx = (struct ggml_backend_qnn_context *)g_qnn_backend->context; if (nullptr == ctx) { QNN_LOG_WARN("pls check why backend ctx is null"); return; @@ -1755,9 +1720,9 @@ static void ggml_qnn_add(const ggml_tensor * src0, const ggml_tensor * src1, ggm dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], dst->nb[1], dst->nb[2]); QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); - QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(tensor_0)); - QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(tensor_1)); - QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(tensor_2)); + QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(*tensor_0)); + QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(*tensor_1)); + QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(*tensor_2)); QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; @@ -1918,7 +1883,7 @@ static void ggml_qnn_add(const ggml_tensor * src0, const ggml_tensor * src1, ggm * mul_mat_f16_f32: src0 is F16 and src1 is F32. * mul_mat_q_f32: src0 is quantized (Q4_0, Q4_1, ...), and src1 is F32. 
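 *
 * a minimal sketch of how these three cases could be dispatched (illustrative
 * only, not part of this patch; it assumes the existing ggml_is_quantized()
 * helper from ggml.h and the usual src0/src1 tensor fields):
 *
 *     if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
 *         // mul_mat_f16_f32: F16 weights multiplied against F32 activations
 *     } else if (ggml_is_quantized(src0->type)) {
 *         // mul_mat_q_f32: quantized src0 (Q4_0, Q4_1, ...) combined with F32 src1,
 *         // e.g. by dequantizing src0 blocks before the matmul
 *     } else {
 *         // mul_mat_f32: plain F32 x F32 matrix multiplication
 *     }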
*/ -static void ggml_qnn_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { Qnn_ErrorHandle_t error = QNN_SUCCESS; bool graph_initialized = false; int64_t n_begin_time = 0LL; @@ -1926,7 +1891,6 @@ static void ggml_qnn_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, int64_t n_duration = 0LL; qnn_instance * instance = nullptr; - struct ggml_backend_qnn_context * ctx = nullptr; std::string graph_name = "ggml_op_qnn_mul_mat"; Qnn_GraphHandle_t graph_handle = nullptr; @@ -1952,7 +1916,6 @@ static void ggml_qnn_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, QNN_LOG_WARN("pls check why QNN tensor is null"); return; } - ctx = (struct ggml_backend_qnn_context *)g_qnn_backend->context; if (nullptr == ctx) { QNN_LOG_WARN("pls check why backend ctx is null"); return; @@ -1979,9 +1942,9 @@ static void ggml_qnn_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], dst->nb[1], dst->nb[2]); QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); - QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(tensor_0)); - QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(tensor_1)); - QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(tensor_2)); + QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(*tensor_0)); + QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(*tensor_1)); + QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(*tensor_2)); QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; @@ -2129,7 +2092,7 @@ static void ggml_qnn_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, //common function for GGML OPs using QNN API -static void ggml_qnn_hanlde_op(const enum ggml_op ggmlop, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx, const enum ggml_op ggmlop, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { Qnn_ErrorHandle_t error = QNN_SUCCESS; bool graph_initialized = false; int64_t n_begin_time = 0LL; @@ -2137,7 +2100,6 @@ static void ggml_qnn_hanlde_op(const enum ggml_op ggmlop, const ggml_tensor * sr int64_t n_duration = 0LL; qnn_instance * instance = nullptr; - struct ggml_backend_qnn_context * ctx = nullptr; std::string qnn_graph_name = "ggml_qnn_graph"; std::string qnn_op_config_name = "ggml_qnn_op_config"; @@ -2164,7 +2126,6 @@ static void ggml_qnn_hanlde_op(const enum ggml_op ggmlop, const ggml_tensor * sr QNN_LOG_WARN("pls check why QNN tensor is null"); return; } - ctx = (struct ggml_backend_qnn_context *)g_qnn_backend->context; if (nullptr == ctx) { QNN_LOG_WARN("pls check why backend ctx is null"); return; @@ -2201,9 +2162,9 @@ static void ggml_qnn_hanlde_op(const enum ggml_op ggmlop, const ggml_tensor * sr dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], dst->nb[1], dst->nb[2]); QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); - QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(tensor_0)); - QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(tensor_1)); - QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(tensor_2)); + QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(*tensor_0)); + QNN_LOG_DEBUG("tensor1 name %s", 
QNN_TENSOR_GET_NAME(*tensor_1)); + QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(*tensor_2)); QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; @@ -2349,153 +2310,154 @@ static void ggml_qnn_hanlde_op(const enum ggml_op ggmlop, const ggml_tensor * sr } -static void ggml_qnn_repeat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_repeat(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_get_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_get_rows(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_acc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_acc(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_div(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_div(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_gelu(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_silu(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_gelu_quick(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_gelu_quick(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_tanh(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_tanh(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_relu(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_hardsigmoid(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_hardsigmoid(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_hardswish(const 
ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_hardswish(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_leaky_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_leaky_relu(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_sqr(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_sqr(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_norm(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_group_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_group_norm(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_concat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_concat(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_upscale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_upscale(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_pad(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_pad(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_rms_norm(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_cpy(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_qnn_cpy(src0, dst, nullptr); +static void ggml_qnn_dup(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_qnn_cpy(ctx, src0, dst, nullptr); (void) src1; } -static void ggml_qnn_mul_mat_id(const ggml_tensor * src0, +static void 
ggml_qnn_mul_mat_id(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); @@ -2504,35 +2466,35 @@ static void ggml_qnn_mul_mat_id(const ggml_tensor * src0, } -static void ggml_qnn_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_scale(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_clamp(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_clamp(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_diag_mask_inf(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_soft_max(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_rope(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(ggml_is_contiguous(src0)); QNN_LOG_DEBUG("call %s\n", __func__); @@ -2541,21 +2503,21 @@ static void ggml_qnn_rope(const ggml_tensor * src0, const ggml_tensor * src1, gg } -static void ggml_qnn_pool2d(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_pool2d(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_im2col(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_im2col(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_sum_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_sum_rows(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(ggml_is_contiguous(src0)); QNN_LOG_DEBUG("call %s\n", __func__); @@ -2563,7 +2525,7 @@ static void ggml_qnn_sum_rows(const ggml_tensor * src0, const ggml_tensor * src1 } -static void ggml_qnn_argsort(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_argsort(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(ggml_is_contiguous(src0)); QNN_LOG_DEBUG("call %s\n", __func__); @@ -2571,7 +2533,7 @@ static void ggml_qnn_argsort(const ggml_tensor * src0, const ggml_tensor * src1, } -static void ggml_qnn_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void 
ggml_qnn_nop(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { (void) src0; (void) src1; (void) dst; @@ -2581,7 +2543,7 @@ static void ggml_qnn_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggm } -bool ggml_qnn_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) { +bool ggml_qnn_compute_forward(ggml_backend_qnn_context * ctx, struct ggml_compute_params * params, struct ggml_tensor * tensor) { ggml_qnn_func_t func = nullptr; ggml_qnn_func_common_t func_common = nullptr; @@ -2715,16 +2677,21 @@ bool ggml_qnn_compute_forward(struct ggml_compute_params * params, struct ggml_t } if (nullptr != func) - func(tensor->src[0], tensor->src[1], tensor); + func(ctx, tensor->src[0], tensor->src[1], tensor); if (nullptr != func_common) - func_common(tensor->op, tensor->src[0], tensor->src[1], tensor); + func_common(ctx, tensor->op, tensor->src[0], tensor->src[1], tensor); return true; } struct ggml_backend_qnn_buffer_context { + ggml_backend_qnn_buffer_context(size_t device) : + device(device), + name(GGML_QNN_NAME + std::to_string(device)) { + } + ~ggml_backend_qnn_buffer_context() { if (buffer) { free(buffer); @@ -2749,6 +2716,14 @@ struct ggml_backend_qnn_buffer_context { size_t buffer_size = 0; std::vector sub_buffers; std::vector qnn_tensors; + size_t device; + std::string name; +}; + + +struct ggml_backend_qnn_buffer_type_context { + size_t device; + std::string name; }; @@ -2782,7 +2757,7 @@ GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t static int idx = 0; char tensor_name[GGML_MAX_NAME] = { 0 }; - snprintf(tensor_name, GGML_MAX_NAME, "tensor_%2d", idx++); + snprintf(tensor_name, GGML_MAX_NAME, "tensor_%04d", idx++); uint32_t dimensions[] = {(uint32_t) tensor->ne[0], (uint32_t) tensor->ne[1], (uint32_t) tensor->ne[2], (uint32_t) tensor->ne[3]}; Qnn_DataType_t qnn_data_type = qnn_datatype_from_ggml_datatype(tensor->type); @@ -2888,7 +2863,8 @@ static void * ggml_qnn_host_malloc(size_t n) { GGML_CALL static ggml_backend_buffer_t ggml_backend_qnn_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { - ggml_backend_qnn_buffer_context * ctx = new ggml_backend_qnn_buffer_context; + ggml_backend_qnn_buffer_type_context * buft_ctx = (ggml_backend_qnn_buffer_type_context *)buft->context; + ggml_backend_qnn_buffer_context * ctx = new ggml_backend_qnn_buffer_context(buft_ctx->device); const size_t size_page = sysconf(_SC_PAGESIZE); @@ -2901,7 +2877,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_qnn_buffer_type_alloc_buffer ctx->buffer = ggml_qnn_host_malloc(size_aligned); ctx->buffer_size = size_aligned; - ctx->backend_ctx = &g_qnn_mgr[g_current_device]; + ctx->backend_ctx = &g_qnn_mgr[buft_ctx->device]; if (nullptr == ctx->buffer) { QNN_LOG_WARN("%s: failed to allocate %.2f MiB\n", __func__, size / (1 << 20)); @@ -2968,7 +2944,6 @@ GGML_CALL static void ggml_backend_qnn_free(ggml_backend_t backend) { if (g_qnn_mgr[ctx->device].backend != nullptr) { delete backend; - g_qnn_backend = nullptr; g_qnn_mgr[ctx->device].backend = nullptr; } QNN_LOG_INFO("leave %s", __func__ ); @@ -2995,7 +2970,7 @@ GGML_CALL static ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backe if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { continue; } - bool ok = ggml_qnn_compute_forward(¶ms, node); + bool ok = ggml_qnn_compute_forward(ctx, ¶ms, 
node); if (!ok) { QNN_LOG_DEBUG("%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); } @@ -3017,9 +2992,9 @@ GGML_CALL static bool ggml_backend_qnn_supports_op(ggml_backend_t backend, const // new ggml backend(only using system memory: ggml_backend_xxx_buffer_is_host return true) // can following this style for mixed inference between CPU&GPU / CPU&NPU very easily GGML_CALL static bool ggml_backend_qnn_offload_op(ggml_backend_t backend, const ggml_tensor * tensor) { - GGML_UNUSED(backend); + ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; - return ggml_qnn_compute_forward(nullptr, (ggml_tensor*)tensor); + return ggml_qnn_compute_forward(ctx, nullptr, (ggml_tensor*)tensor); } @@ -3104,27 +3079,36 @@ void ggml_backend_qnn_get_device_description(size_t dev_num, char * description, } -ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t device_index) { - if (device_index >= GGML_QNN_MAX_DEVICES) { +ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t device) { + if (device >= GGML_QNN_MAX_DEVICES) { QNN_LOG_DEBUG("ggml_backend_qnn_buffer_type error: device_index:%d is out of range [0, %d]\n", - device_index, GGML_QNN_MAX_DEVICES - 1); + device, GGML_QNN_MAX_DEVICES - 1); return nullptr; } - static struct ggml_backend_buffer_type ggml_backend_buffer_type_qnn = { - /* .iface = */ { - /* .get_name = */ ggml_backend_qnn_buffer_type_name, - /* .alloc_buffer = */ ggml_backend_qnn_buffer_type_alloc_buffer, - /* .get_alignment = */ ggml_backend_qnn_buffer_type_get_alignment, - /* .get_max_size = */ ggml_backend_qnn_buffer_type_get_max_size, - /* .get_alloc_size = */ nullptr,// defaults to ggml_nbytes - /* .supports_backend = */ ggml_backend_qnn_buffer_type_supports_backend, - /* .is_host = */ ggml_backend_qnn_buffer_is_host - }, - /* .context = */ nullptr, - }; + static ggml_backend_buffer_type ggml_backend_qnn_buffer_types[GGML_QNN_MAX_DEVICES]; + + static bool ggml_backend_qnn_buffer_type_initialized = false; + + if (!ggml_backend_qnn_buffer_type_initialized) { + for (int i = 0; i < GGML_QNN_MAX_DEVICES; i++) { + ggml_backend_qnn_buffer_types[i] = { + /* .iface = */ { + /* .get_name = */ ggml_backend_qnn_buffer_type_name, + /* .alloc_buffer = */ ggml_backend_qnn_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_qnn_buffer_type_get_alignment, + /* .get_max_size = */ ggml_backend_qnn_buffer_type_get_max_size, + /* .get_alloc_size = */ nullptr,// defaults to ggml_nbytes + /* .supports_backend = */ ggml_backend_qnn_buffer_type_supports_backend, + /* .is_host = */ ggml_backend_qnn_buffer_is_host + }, + /* .context = */ new ggml_backend_qnn_buffer_type_context { device, GGML_QNN_NAME + std::to_string(device) }, + }; + } + ggml_backend_qnn_buffer_type_initialized = true; + } - return &ggml_backend_buffer_type_qnn; + return &ggml_backend_qnn_buffer_types[device]; } @@ -3137,8 +3121,10 @@ ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t device_index) { ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { int result = 0; - if (nullptr == qnn_lib_path) + if (nullptr == qnn_lib_path) { + QNN_LOG_ERROR("invalid qnn lib path\n"); return nullptr; + } QNN_LOG_DEBUG("device %d", device); QNN_LOG_DEBUG("qnn_lib_path %s", qnn_lib_path); @@ -3147,18 +3133,6 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { return nullptr; } - if (nullptr != g_qnn_mgr[device].backend) { - QNN_LOG_ERROR("qnn backend %d(%s) already loaded", device, 
get_qnn_backend_name(device)); - if (device == g_current_device) { - g_qnn_backend = g_qnn_mgr[device].backend; - QNN_LOG_INFO("re-use cached backend %d(%s)", device, get_qnn_backend_name(device)); - return g_qnn_mgr[device].backend; - } else { - QNN_LOG_INFO("delete previous backend %d(%s)", device, get_qnn_backend_name(device)); - ggml_backend_qnn_free(g_qnn_backend); - } - } - std::string path = qnn_lib_path; if (QNN_BACKEND_NPU == device) { if (0 == setenv("LD_LIBRARY_PATH", @@ -3215,8 +3189,6 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { /* .context = */ &g_qnn_mgr[device] }; g_qnn_mgr[device].backend = qnn_backend; - g_qnn_backend = g_qnn_mgr[device].backend; - g_current_device = device; return qnn_backend; } diff --git a/tests/ggml-qnn/ggml-qnn-ut.cpp b/tests/ggml-qnn/ggml-qnn-ut.cpp index 1041252f3770f..eb072beae6bd4 100644 --- a/tests/ggml-qnn/ggml-qnn-ut.cpp +++ b/tests/ggml-qnn/ggml-qnn-ut.cpp @@ -133,7 +133,7 @@ static bool ggml_graph_compute_helper( #define QK8_0 32 typedef struct { - uint16_t d; // delta + uint16_t d; // delta int8_t qs[QK8_0]; // quants } block_q8_0; @@ -158,6 +158,7 @@ static void tensor_dump(const ggml_tensor * tensor, const char * name) { QNN_LOG_WARN("tensor is null"); return; } + if (tensor->type == GGML_TYPE_I8) { for (int h = 0; h < tensor->ne[3]; h++) { for (int i = 0; i < tensor->ne[2]; i++) { From 94ee77505832bdaf5fa72fd72c2fd4031c57eefc Mon Sep 17 00:00:00 2001 From: "zhou.weiguo" Date: Fri, 7 Jun 2024 14:56:07 +0800 Subject: [PATCH 008/143] review: remove static global vars to support multi-instance simultaneously and thread safe --- ggml-qnn.cpp | 40 ++++++++++++++-------------------------- 1 file changed, 14 insertions(+), 26 deletions(-) diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp index 867f01625ad7f..f45a6449ccae3 100644 --- a/ggml-qnn.cpp +++ b/ggml-qnn.cpp @@ -697,9 +697,9 @@ class qnn_interface { } private: - const QnnInterface_t *_qnn_interface = nullptr; + const QnnInterface_t * _qnn_interface = nullptr; - const QnnSystemInterface_t *_qnn_sys_interface = nullptr; + const QnnSystemInterface_t * _qnn_sys_interface = nullptr; }; @@ -848,7 +848,7 @@ class qnn_instance { return 0; } - std::string &get_qnn_graph_name() { return _graph_name; } + std::string & get_qnn_graph_name() { return _graph_name; } bool is_rpcmem_initialized() { return _rpcmem_initialized; @@ -911,7 +911,7 @@ class qnn_instance { qnn_interface _qnn_interface; - void *_system_lib_handle = nullptr; + void * _system_lib_handle = nullptr; Qnn_GraphHandle_t _qnn_graph_handle = nullptr; @@ -927,7 +927,7 @@ class qnn_instance { QnnSystemContext_Handle_t _qnn_system_handle = nullptr; - QnnHtpDevice_PerfInfrastructure_t *_qnn_htp_perfinfra = nullptr; + QnnHtpDevice_PerfInfrastructure_t * _qnn_htp_perfinfra = nullptr; uint32_t _qnn_power_configid = 1; uint32_t _qnn_rpc_pollingtime = 9999; // 0-10000 us for high performing @@ -936,12 +936,12 @@ class qnn_instance { std::unordered_set _qnn_mem_set; - static std::mutex _init_mutex; - static std::unordered_map _loaded_lib_handle; - static std::unordered_map _lib_path_to_backend_id; - static std::unordered_map _loaded_backend; + std::mutex _init_mutex; + std::unordered_map _loaded_lib_handle; + std::unordered_map _lib_path_to_backend_id; + std::unordered_map _loaded_backend; - void *_rpc_lib_handle = nullptr; + void * _rpc_lib_handle = nullptr; std::atomic_bool _rpcmem_initialized{false}; pfn_rpc_mem_alloc _pfn_rpc_mem_alloc; pfn_rpc_mem_free _pfn_rpc_mem_free; @@ -950,26 +950,15 @@ class qnn_instance { 
pfn_rpc_mem_deinit _pfn_rpc_mem_deinit; std::unordered_map _rpcmem_store_map; - std::string _graph_name; }; - // ================================================================================================= // // implementation of wrapper class // // ================================================================================================= -std::mutex qnn_instance::_init_mutex; - -std::unordered_map qnn_instance::_loaded_lib_handle; - -std::unordered_map qnn_instance::_lib_path_to_backend_id; - -std::unordered_map qnn_instance::_loaded_backend; - - void * qnn_instance::alloc_rpcmem(size_t bytes, size_t alignment) { if (!_rpcmem_initialized) { QNN_LOG_WARN("rpc memory not initialized\n"); @@ -977,14 +966,13 @@ void * qnn_instance::alloc_rpcmem(size_t bytes, size_t alignment) { } auto allocate_bytes = static_cast(bytes + alignment); - void *buf = _pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, allocate_bytes); + void * buf = _pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, allocate_bytes); if (buf == nullptr) { QNN_LOG_WARN("failed to allocate rpc memory\n"); return nullptr; } - auto aligned_buf = reinterpret_cast(align_to(alignment, - reinterpret_cast(buf))); + auto aligned_buf = reinterpret_cast(align_to(alignment,reinterpret_cast(buf))); bool status = _rpcmem_store_map.insert(std::pair(aligned_buf, buf)).second; if (!status) { QNN_LOG_WARN("failed to allocate rpc memory\n"); @@ -1097,7 +1085,7 @@ int qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t * Qnn_ErrorHandle_t error = QNN_SUCCESS; QNN_LOG_DEBUG("lib_path:%s\n", lib_path.c_str()); - void *lib_handle = dlopen(lib_path.c_str(), RTLD_NOW | RTLD_GLOBAL); + void * lib_handle = dlopen(lib_path.c_str(), RTLD_NOW | RTLD_GLOBAL); if (nullptr == lib_handle) { QNN_LOG_WARN("can not open QNN library %s, with error: %s", lib_path.c_str(), dlerror()); return 1; @@ -1113,7 +1101,7 @@ int qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t * // get QnnInterface Providers std::uint32_t num_providers = 0; - const QnnInterface_t **provider_list = nullptr; + const QnnInterface_t ** provider_list = nullptr; error = get_providers(&provider_list, &num_providers); if (error != QNN_SUCCESS) { QNN_LOG_WARN("failed to get providers, error %d", QNN_GET_ERROR_CODE(error)); From 5d691c6cd05b4ff51f181272b8cb4df0dcb0e0ba Mon Sep 17 00:00:00 2001 From: "zhou.weiguo" Date: Sat, 8 Jun 2024 09:22:39 +0800 Subject: [PATCH 009/143] review: put qnn's internal log inside preprocessor diretive --- ggml-qnn.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp index f45a6449ccae3..072003e1d76b8 100644 --- a/ggml-qnn.cpp +++ b/ggml-qnn.cpp @@ -1277,6 +1277,7 @@ static void ggml_qnn_logcallback(const char * fmt, uint64_t timestamp, va_list argp) { +#if GGML_QNN_DEBUG static std::mutex log_mutex; static unsigned char s_ggml_qnn_logbuf[GGML_QNN_LOGBUF_LEN]; @@ -1303,7 +1304,6 @@ static void ggml_qnn_logcallback(const char * fmt, } double ms = (double) timestamp / 1000000.0; -#if GGML_QNN_DEBUG { std::lock_guard lock(log_mutex); From fdf0272dfb29cd640de92d6e54dce448c48a156e Mon Sep 17 00:00:00 2001 From: "zhou.weiguo" Date: Sat, 8 Jun 2024 17:56:32 +0800 Subject: [PATCH 010/143] review: code format using clang-format + manually modification according to review comments --- ggml-qnn.cpp | 2793 +++++++++++++++++++++++++------------------------- 1 file changed, 1414 insertions(+), 1379 deletions(-) diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp index 
072003e1d76b8..3c5ff332a1df2 100644 --- a/ggml-qnn.cpp +++ b/ggml-qnn.cpp @@ -54,132 +54,166 @@ #include #endif - // ================================================================================================= // -// forward/external/helper declaration +// forward declaration // // ================================================================================================= class qnn_instance; - -static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...); - +static void ggml_qnn_log_internal(ggml_log_level level, const char * file, + const char * func, int line, + const char * format, ...); // ================================================================================================= // // self-defined macro / data structure // // ================================================================================================= -#define RPCMEM_DEFAULT_FLAGS 1 -#define RPCMEM_HEAP_ID_SYSTEM 25 +#define RPCMEM_DEFAULT_FLAGS 1 +#define RPCMEM_HEAP_ID_SYSTEM 25 -#define GGML_QNN_LOGBUF_LEN 4096 +#define GGML_QNN_DEBUG 1 // for troubleshooting QNN backend +#define ENABLE_QNN_LOG 0 // enable/disable QNN internal log +#define GGML_QNN_LOGBUF_LEN 4096 +#define QNN_VER_PTR(x) (&((x).v1)) +#define GGML_QNN_NAME "qnn" -#define GGML_QNN_DEBUG 1 //for troubleshooting QNN backend +#define QNN_LOG_ERROR(...) \ + ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) -#define QNN_LOG_ERROR(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) -#define QNN_LOG_WARN(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) -#define QNN_LOG_INFO(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#define QNN_LOG_WARN(...) \ + ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) + +#define QNN_LOG_INFO(...) \ + ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) #if GGML_QNN_DEBUG -#define QNN_LOG_DEBUG(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#define QNN_LOG_DEBUG(...) \ + ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) #else #define QNN_LOG_DEBUG(...) 
#endif -#define QNN_VER_PTR(x) (&((x).v1)) -#define GGML_QNN_NAME "qnn" - -#define VALIDATE(value, status) \ - do { \ - status = value; \ - if (status != QNN_SUCCESS) { \ - QNN_LOG_WARN("%s expected QNN_SUCCESS\n", #value); \ - return status; \ - } \ - } while (0) - -#define VALIDATE_TENSOR_VERSION(tensor, err) VALIDATE(validate_tensor_version(tensor), err) - -#define QNN_TENSOR_GET_ID(tensor) get_qnn_tensorid(tensor) -#define QNN_TENSOR_GET_NAME(tensor) get_qnn_tensorname(tensor) -#define QNN_TENSOR_GET_TYPE(tensor) get_qnn_tensortype(tensor) -#define QNN_TENSOR_GET_DATA_FORMAT(tensor) get_qnn_tensor_dataformat(tensor) -#define QNN_TENSOR_GET_DATA_TYPE(tensor) get_qnn_tensor_datatype(tensor) -#define QNN_TENSOR_GET_QUANT_PARAMS(tensor) get_qnn_tensor_quantparams(tensor) -#define QNN_TENSOR_GET_RANK(tensor) get_qnn_tensor_rank(tensor) -#define QNN_TENSOR_GET_DIMENSIONS(tensor) get_qnn_tensor_dimensions(tensor) -#define QNN_TENSOR_GET_MEM_TYPE(tensor) get_qnn_tensor_memtype(tensor) - -#define QNN_TENSOR_SET_ID(tensor, value) set_qnn_tensor_id(tensor, value) -#define QNN_TENSOR_SET_NAME(tensor, value) set_qnn_tensor_name(tensor, value) -#define QNN_TENSOR_SET_TYPE(tensor, value) set_qnn_tensor_type(tensor, value) -#define QNN_TENSOR_SET_DATA_FORMAT(tensor, value) set_qnn_tensor_dataformat(tensor, value) -#define QNN_TENSOR_SET_DATA_TYPE(tensor, value) set_qnn_tensor_datatype(tensor, value) -#define QNN_TENSOR_SET_QUANT_PARAMS(tensor, value) set_qnn_tensor_quantparams(tensor, value) -#define QNN_TENSOR_SET_RANK(tensor, value) set_qnn_tensor_rank(tensor, value) -#define QNN_TENSOR_SET_DIMENSIONS(tensor, value) set_qnn_tensor_dimensions(tensor, value) -#define QNN_TENSOR_SET_MEM_TYPE(tensor, value) set_qnn_tensor_memtype(tensor, value) -#define QNN_TENSOR_SET_CLIENT_BUF(tensor, value) set_qnn_tensor_clientbuf(tensor, value) -#define QNN_TENSOR_SET_MEM_HANDLE(tensor, value) set_qnn_tensor_memhandle(tensor, value) - -using pfn_rpc_mem_init = void (*)(void); -using pfn_rpc_mem_deinit = void (*)(void); -using pfn_rpc_mem_alloc = void *(*)(int, uint32_t, int); -using pfn_rpc_mem_free = void (*)(void *); -using pfn_rpc_mem_to_fd = int (*)(void *); - -using _pfn_QnnSaver_initialize = decltype(QnnSaver_initialize); -using _pfn_QnnInterface_getProviders = decltype(QnnInterface_getProviders); -using _pfn_QnnSystemInterface_getProviders = decltype(QnnSystemInterface_getProviders); - - +#define VALIDATE(value, status) \ + do { \ + status = value; \ + if (status != QNN_SUCCESS) { \ + QNN_LOG_WARN("%s expected QNN_SUCCESS\n", #value); \ + return status; \ + } \ + } while (0) + +#define VALIDATE_TENSOR_VERSION(tensor, err) \ + VALIDATE(validate_tensor_version(tensor), err) + +#define QNN_TENSOR_GET_ID(tensor) get_qnn_tensorid(tensor) +#define QNN_TENSOR_GET_NAME(tensor) get_qnn_tensorname(tensor) +#define QNN_TENSOR_GET_TYPE(tensor) get_qnn_tensortype(tensor) +#define QNN_TENSOR_GET_DATA_FORMAT(tensor) get_qnn_tensor_dataformat(tensor) +#define QNN_TENSOR_GET_DATA_TYPE(tensor) get_qnn_tensor_datatype(tensor) +#define QNN_TENSOR_GET_QUANT_PARAMS(tensor) get_qnn_tensor_quantparams(tensor) +#define QNN_TENSOR_GET_RANK(tensor) get_qnn_tensor_rank(tensor) +#define QNN_TENSOR_GET_DIMENSIONS(tensor) get_qnn_tensor_dimensions(tensor) +#define QNN_TENSOR_GET_MEM_TYPE(tensor) get_qnn_tensor_memtype(tensor) + +#define QNN_TENSOR_SET_ID(tensor, value) set_qnn_tensor_id(tensor, value) +#define QNN_TENSOR_SET_NAME(tensor, value) set_qnn_tensor_name(tensor, value) +#define QNN_TENSOR_SET_TYPE(tensor, value) 
set_qnn_tensor_type(tensor, value) +#define QNN_TENSOR_SET_DATA_FORMAT(tensor, value) \ + set_qnn_tensor_dataformat(tensor, value) +#define QNN_TENSOR_SET_DATA_TYPE(tensor, value) \ + set_qnn_tensor_datatype(tensor, value) +#define QNN_TENSOR_SET_QUANT_PARAMS(tensor, value) \ + set_qnn_tensor_quantparams(tensor, value) +#define QNN_TENSOR_SET_RANK(tensor, value) set_qnn_tensor_rank(tensor, value) +#define QNN_TENSOR_SET_DIMENSIONS(tensor, value) \ + set_qnn_tensor_dimensions(tensor, value) +#define QNN_TENSOR_SET_MEM_TYPE(tensor, value) \ + set_qnn_tensor_memtype(tensor, value) +#define QNN_TENSOR_SET_CLIENT_BUF(tensor, value) \ + set_qnn_tensor_clientbuf(tensor, value) +#define QNN_TENSOR_SET_MEM_HANDLE(tensor, value) \ + set_qnn_tensor_memhandle(tensor, value) + +using pfn_rpc_mem_init = void (*)(void); +using pfn_rpc_mem_deinit = void (*)(void); +using pfn_rpc_mem_alloc = void *(*) (int, uint32_t, int); +using pfn_rpc_mem_free = void (*)(void *); +using pfn_rpc_mem_to_fd = int (*)(void *); + +using _pfn_QnnSaver_initialize = decltype(QnnSaver_initialize); +using _pfn_QnnInterface_getProviders = decltype(QnnInterface_getProviders); +using _pfn_QnnSystemInterface_getProviders = decltype(QnnSystemInterface_getProviders); enum class ggml_qnn_profile_level { - profile_off = 0, - profile_basic = 1, - profile_detail = 2 + profile_off = 0, + profile_basic = 1, + profile_detail = 2 }; struct ggml_backend_qnn_context { - int device; - int threads; - char name[GGML_MAX_NAME]; - char lib[GGML_MAX_NAME]; - qnn_instance * instance; - struct ggml_backend * backend; - QNN_INTERFACE_VER_TYPE raw_interface; + int device; + int threads; + char name[GGML_MAX_NAME]; + char lib[GGML_MAX_NAME]; + qnn_instance * instance; + struct ggml_backend * backend; + QNN_INTERFACE_VER_TYPE raw_interface; QNN_SYSTEM_INTERFACE_VER_TYPE raw_system_interface; -} ; - -typedef void (* ggml_qnn_func_t)(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); - -typedef void (* ggml_qnn_func_common_t)(ggml_backend_qnn_context * ctx, const ggml_op ggml_op, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); +}; -// ================================================================================================= -// -// static global variables -// -// ================================================================================================= -//static ggml_backend_t g_qnn_backend = nullptr; - -//according to the QNN SDK Reference Guide, -//CPU - Choose a non-quantized model. Quantized models are currently incompatible with the CPU backend -//GPU - Choose a non-quantized model. Quantized models are currently incompatible with the GPU backend -//HTP - Choose a quantized model. Quantized models are required when running on the HTP backend -//DSP - Choose a quantized model. Quantized models are required when running on the DSP backend -//HTA - Choose a quantized model. 
Quantized models are required when running on the HTA backend +typedef void (*ggml_qnn_func_t)(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, + const ggml_tensor * src1, + ggml_tensor * dst); + +typedef void (*ggml_qnn_func_common_t)(ggml_backend_qnn_context * ctx, + const ggml_op ggml_op, + const ggml_tensor * src0, + const ggml_tensor * src1, + ggml_tensor * dst); + +// according to the QNN SDK Reference Guide, +// CPU - Choose a non-quantized model.Quantized models are currently incompatible with the CPU backend +// GPU - Choose a non-quantized model.Quantized models are currently incompatible with the GPU backend +// HTP - Choose a quantized model. Quantized models are required when running on the HTP backend +// DSP - Choose a quantized model. Quantized models are required when running on the DSP backend +// HTA - Choose a quantized model. Quantized models are required when running on the HTA backend // -//only focus on Qualcomm CPU/GPU/NPU backend in this implementation of QNN backend for ggml currently -//Qualcomm CPU: Qualcomm Kryo CPU -//Qualcomm GPU: Qualcomm Adreno GPU -//Qualcomm NPU: aka HTP(Hexagon Tensor Processor), ~= cDSP(Compute DSP) + HMX(Hexagon Matrix eXtensions)/HTA(Hexagon Tensor Accelerator) +// only focus on Qualcomm CPU/GPU/NPU backend in this implementation of QNN backend for ggml currently, +// CPU: Qualcomm Kryo CPU +// GPU: Qualcomm Adreno GPU +// NPU: Qualcomm NPU: aka HTP(Hexagon Tensor Processor), ~= cDSP(Compute DSP) + +// HMX(Hexagon Matrix eXtensions)/HTA(Hexagon Tensor Accelerator) static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { - [QNN_BACKEND_CPU] = {.device = 0, .threads = 1, .name = "qnn-cpu", .lib = "libQnnCpu.so", .instance = nullptr, .backend = nullptr, .raw_interface = {}, .raw_system_interface = {}}, - [QNN_BACKEND_GPU] = {.device = 1, .threads = 1, .name = "qnn-gpu", .lib = "libQnnGpu.so", .instance = nullptr, .backend = nullptr, .raw_interface = {}, .raw_system_interface = {}}, - [QNN_BACKEND_NPU] = {.device = 2, .threads = 1, .name = "qnn-npu", .lib = "libQnnHtp.so", .instance = nullptr, .backend = nullptr, .raw_interface = {}, .raw_system_interface = {}}, + [QNN_BACKEND_CPU] = {.device = 0, + .threads = 1, + .name = "qnn-cpu", + .lib = "libQnnCpu.so", + .instance = nullptr, + .backend = nullptr, + .raw_interface = {}, + .raw_system_interface = {}}, + + [QNN_BACKEND_GPU] = {.device = 1, + .threads = 1, + .name = "qnn-gpu", + .lib = "libQnnGpu.so", + .instance = nullptr, + .backend = nullptr, + .raw_interface = {}, + .raw_system_interface = {}}, + + [QNN_BACKEND_NPU] = {.device = 2, + .threads = 1, + .name = "qnn-npu", + .lib = "libQnnHtp.so", + .instance = nullptr, + .backend = nullptr, + .raw_interface = {}, + .raw_system_interface = {}}, }; // ================================================================================================= @@ -189,15 +223,14 @@ static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { // ================================================================================================= static inline int validate_tensor_version(Qnn_Tensor_t tensor) { if (tensor.version != QNN_TENSOR_VERSION_1) { - QNN_LOG_WARN("validate_tensor_version() tensor %s, got unsupported version %d\n", - tensor.v1.name, - tensor.version); + QNN_LOG_WARN( + "validate_tensor_version() tensor %s, got unsupported version %d\n", + tensor.v1.name, tensor.version); return 1; } return 0; } - static inline uint32_t get_qnn_tensorid(const Qnn_Tensor_t & tensor) { if (tensor.version == 
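// [editorial aside, illustrative only, not part of the patch] The g_qnn_mgr table above
// maps each QNN device index to the backend library that has to be loaded at runtime.
// A minimal standalone sketch of that lookup pattern, using stand-in names
// (backend_desc, pick_backend) rather than the real ggml_backend_qnn_context:
#include <cstdio>

struct backend_desc { int device; const char * name; const char * lib; };

static const backend_desc k_backends[] = {
    {0, "qnn-cpu", "libQnnCpu.so"},
    {1, "qnn-gpu", "libQnnGpu.so"},
    {2, "qnn-npu", "libQnnHtp.so"},
};

static const backend_desc * pick_backend(int device) {
    for (const backend_desc & b : k_backends) {
        if (b.device == device) {
            return &b;
        }
    }
    return nullptr; // unknown device index
}

int main() {
    if (const backend_desc * b = pick_backend(2)) {
        // in the real backend the .lib field is what gets passed to dlopen()
        printf("device %d -> %s (%s)\n", b->device, b->name, b->lib);
    }
    return 0;
}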
QNN_TENSOR_VERSION_1) { return tensor.v1.id; @@ -206,7 +239,6 @@ static inline uint32_t get_qnn_tensorid(const Qnn_Tensor_t & tensor) { return 0u; } - static inline const char * get_qnn_tensorname(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.name; @@ -214,8 +246,6 @@ static inline const char * get_qnn_tensorname(const Qnn_Tensor_t & tensor) { return nullptr; } - - static inline Qnn_TensorType_t get_qnn_tensortype(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.type; @@ -223,31 +253,30 @@ static inline Qnn_TensorType_t get_qnn_tensortype(const Qnn_Tensor_t & tensor) { return QNN_TENSOR_TYPE_UNDEFINED; } - -static inline Qnn_TensorDataFormat_t get_qnn_tensor_dataformat(const Qnn_Tensor_t & tensor) { +static inline Qnn_TensorDataFormat_t + get_qnn_tensor_dataformat(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.dataFormat; } return QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER; } - -static inline Qnn_DataType_t get_qnn_tensor_datatype(const Qnn_Tensor_t & tensor) { +static inline Qnn_DataType_t + get_qnn_tensor_datatype(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.dataType; } return QNN_DATATYPE_UNDEFINED; } - -static inline Qnn_QuantizeParams_t get_qnn_tensor_quantparams(const Qnn_Tensor_t & tensor) { +static inline Qnn_QuantizeParams_t + get_qnn_tensor_quantparams(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.quantizeParams; } return QNN_QUANTIZE_PARAMS_INIT; } - static inline uint32_t get_qnn_tensor_rank(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.rank; @@ -255,7 +284,6 @@ static inline uint32_t get_qnn_tensor_rank(const Qnn_Tensor_t & tensor) { return 0u; } - static inline uint32_t * get_qnn_tensor_dimensions(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.dimensions; @@ -263,7 +291,6 @@ static inline uint32_t * get_qnn_tensor_dimensions(const Qnn_Tensor_t & tensor) return nullptr; } - static inline Qnn_TensorMemType_t get_qnn_tensor_memtype(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.memType; @@ -271,109 +298,95 @@ static inline Qnn_TensorMemType_t get_qnn_tensor_memtype(const Qnn_Tensor_t & te return QNN_TENSORMEMTYPE_UNDEFINED; } - static inline void set_qnn_tensor_id(Qnn_Tensor_t & tensor, uint32_t id) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.id = id; } } - static inline void set_qnn_tensor_name(Qnn_Tensor_t & tensor, const char * name) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.name = name; } } - static inline void set_qnn_tensor_type(Qnn_Tensor_t & tensor, Qnn_TensorType_t type) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.type = type; } } - static inline void set_qnn_tensor_dataformat(Qnn_Tensor_t & tensor, Qnn_TensorDataFormat_t format) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.dataFormat = format; } } - static inline void set_qnn_tensor_datatype(Qnn_Tensor_t & tensor, Qnn_DataType_t dataType) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.dataType = dataType; } } - static inline void set_qnn_tensor_quantparams(Qnn_Tensor_t & tensor, Qnn_QuantizeParams_t params) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.quantizeParams = params; } } - static inline void set_qnn_tensor_rank(Qnn_Tensor_t & tensor, uint32_t rank) { if 
(tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.rank = rank; } } - static inline void set_qnn_tensor_dimensions(Qnn_Tensor_t & tensor, uint32_t * dims) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.dimensions = dims; } } - -static inline void set_qnn_tensor_memtype(Qnn_Tensor_t & tensor, Qnn_TensorMemType_t memType) { +static inline void set_qnn_tensor_memtype(Qnn_Tensor_t & tensor, Qnn_TensorMemType_t mem_type) { if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.memType = memType; + tensor.v1.memType = mem_type; } } - -static inline void set_qnn_tensor_clientbuf(Qnn_Tensor_t & tensor, Qnn_ClientBuffer_t clientBuf) { +static inline void set_qnn_tensor_clientbuf(Qnn_Tensor_t & tensor, Qnn_ClientBuffer_t client_buf) { if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.clientBuf = clientBuf; + tensor.v1.clientBuf = client_buf; } } - static inline void set_qnn_tensor_memhandle(Qnn_Tensor_t & tensor, Qnn_MemHandle_t handle) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.memHandle = handle; } } +static size_t memscpy(void * dst, size_t dst_size, const void * src, size_t copy_size) { + if (!dst || !src || !dst_size || !copy_size) return 0; -static size_t memscpy(void * dst, size_t dstSize, const void * src, size_t copySize) { - if (!dst || !src || !dstSize || !copySize) - return 0; - - size_t minSize = dstSize < copySize ? dstSize : copySize; + size_t min_size = dst_size < copy_size ? dst_size : copy_size; - memcpy(dst, src, minSize); + memcpy(dst, src, min_size); - return minSize; + return min_size; } - static char * ggml_qnn_strndup(const char * source, size_t maxlen) { return ::strndup(source, maxlen); } - static int deep_copy_qnn_tensors(Qnn_Tensor_t & src, Qnn_Tensor_t & dst) { int err = 0; VALIDATE_TENSOR_VERSION(src, err); dst.version = src.version; QNN_TENSOR_SET_NAME( - dst, ggml_qnn_strndup(QNN_TENSOR_GET_NAME(src), std::string(QNN_TENSOR_GET_NAME(src)).size())); - if (QNN_TENSOR_GET_NAME(dst) == nullptr) { + dst, ggml_qnn_strndup(QNN_TENSOR_GET_NAME(src), + std::string(QNN_TENSOR_GET_NAME(src)).size())); + if (nullptr == QNN_TENSOR_GET_NAME(dst)) { return 1; } QNN_TENSOR_SET_ID(dst, QNN_TENSOR_GET_ID(src)); @@ -382,8 +395,6 @@ static int deep_copy_qnn_tensors(Qnn_Tensor_t & src, Qnn_Tensor_t & dst) { QNN_TENSOR_SET_DATA_TYPE(dst, QNN_TENSOR_GET_DATA_TYPE(src)); QNN_TENSOR_SET_MEM_TYPE(dst, QNN_TENSOR_GET_MEM_TYPE(src)); - // Only metadata (i.e. non-static data) is copied from source to destination. 
The union still - // must be initialized so that the clientBuf/memHandle do not contain garbage data if (QNN_TENSOR_GET_MEM_TYPE(src) == QNN_TENSORMEMTYPE_RAW) { Qnn_ClientBuffer_t client_buf = {nullptr, 0}; QNN_TENSOR_SET_CLIENT_BUF(dst, client_buf); @@ -393,48 +404,47 @@ static int deep_copy_qnn_tensors(Qnn_Tensor_t & src, Qnn_Tensor_t & dst) { return 1; } - Qnn_QuantizeParams_t src_qparam = QNN_TENSOR_GET_QUANT_PARAMS(src); - Qnn_QuantizationEncoding_t encoding = src_qparam.quantizationEncoding; + Qnn_QuantizeParams_t src_qparam = QNN_TENSOR_GET_QUANT_PARAMS(src); + Qnn_QuantizationEncoding_t encoding = src_qparam.quantizationEncoding; if (encoding == QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET) { - // need to allocate and copy memory for scaleOffset as it is a pointer array - Qnn_QuantizeParams_t src_qparam_cpy = src_qparam; - Qnn_AxisScaleOffset_t &axis_scale_offset = src_qparam_cpy.axisScaleOffsetEncoding; - Qnn_ScaleOffset_t **scaleOffset = &axis_scale_offset.scaleOffset; - size_t scaleOffsetSize = axis_scale_offset.numScaleOffsets * sizeof(Qnn_ScaleOffset_t); - *scaleOffset = (Qnn_ScaleOffset_t *)malloc(scaleOffsetSize); - memscpy(*scaleOffset, - scaleOffsetSize, + Qnn_QuantizeParams_t src_qparam_cpy = src_qparam; + Qnn_AxisScaleOffset_t & axis_scale_offset = src_qparam_cpy.axisScaleOffsetEncoding; + Qnn_ScaleOffset_t **scaleOffset = & axis_scale_offset.scaleOffset; + size_t scaleOffsetSize = axis_scale_offset.numScaleOffsets * sizeof(Qnn_ScaleOffset_t); + *scaleOffset = (Qnn_ScaleOffset_t *) malloc(scaleOffsetSize); + memscpy(*scaleOffset, scaleOffsetSize, src_qparam.axisScaleOffsetEncoding.scaleOffset, scaleOffsetSize); QNN_TENSOR_SET_QUANT_PARAMS(dst, src_qparam_cpy); } else if (encoding == QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET) { - // need to allocate and copy memory for scaleOffset as it is a pointer array - Qnn_QuantizeParams_t src_qparam_cpy = src_qparam; - Qnn_BwAxisScaleOffset_t &bwaxis_scale_offset = src_qparam_cpy.bwAxisScaleOffsetEncoding; - size_t scaleSize = bwaxis_scale_offset.numElements * sizeof(float); - float **scales = &bwaxis_scale_offset.scales; - int32_t **offsets = &bwaxis_scale_offset.offsets; - *scales = (float *)malloc(scaleSize); - memscpy(*scales, scaleSize, src_qparam.bwAxisScaleOffsetEncoding.scales, scaleSize); - - // only copy offsets if present, nullptr implies all offsets are 0 + Qnn_QuantizeParams_t src_qparam_cpy = src_qparam; + Qnn_BwAxisScaleOffset_t & bwaxis_scale_offset = src_qparam_cpy.bwAxisScaleOffsetEncoding; + size_t scaleSize = bwaxis_scale_offset.numElements * sizeof(float); + float ** scales = &bwaxis_scale_offset.scales; + int32_t ** offsets = &bwaxis_scale_offset.offsets; + *scales = (float *) malloc(scaleSize); + memscpy(*scales, scaleSize, src_qparam.bwAxisScaleOffsetEncoding.scales, + scaleSize); + if (bwaxis_scale_offset.offsets != nullptr) { size_t offsetSize = bwaxis_scale_offset.numElements * sizeof(int32_t); - *offsets = (int32_t *)malloc(offsetSize); - memscpy(*offsets, offsetSize, src_qparam.bwAxisScaleOffsetEncoding.offsets, offsetSize); + *offsets = (int32_t *) malloc(offsetSize); + memscpy(*offsets, offsetSize, + src_qparam.bwAxisScaleOffsetEncoding.offsets, offsetSize); } QNN_TENSOR_SET_QUANT_PARAMS(dst, src_qparam_cpy); } else { QNN_TENSOR_SET_QUANT_PARAMS(dst, src_qparam); } - // allocate and copy memory for all the pointer members uint32_t rank = QNN_TENSOR_GET_RANK(src); QNN_TENSOR_SET_RANK(dst, rank); - size_t dim_size = rank * sizeof(uint32_t); - uint32_t * dimensions = (uint32_t *)malloc(dim_size); + 
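// [editorial aside, illustrative only, not part of the patch] The axis scale-offset branch
// above must deep-copy a heap-allocated array because the quantize params only carry a
// pointer. A standalone sketch of that malloc + bounded-copy pattern, with a stand-in
// struct instead of the real Qnn_ScaleOffset_t:
#include <cstdio>
#include <cstdlib>
#include <cstring>

struct scale_offset { float scale; int offset; }; // stand-in for Qnn_ScaleOffset_t

// same idea as memscpy() above: copy at most min(dst_size, copy_size) bytes
static size_t bounded_copy(void * dst, size_t dst_size, const void * src, size_t copy_size) {
    size_t n = dst_size < copy_size ? dst_size : copy_size;
    memcpy(dst, src, n);
    return n;
}

int main() {
    scale_offset src[3] = {{0.5f, 1}, {0.25f, 2}, {0.125f, 3}};
    size_t bytes = sizeof(src);
    scale_offset * dst = (scale_offset *) malloc(bytes); // the copy owns its own buffer
    if (dst == nullptr) {
        return 1;
    }
    bounded_copy(dst, bytes, src, bytes);
    printf("dst[1] = {%.3f, %d}\n", dst[1].scale, dst[1].offset);
    free(dst);
    return 0;
}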
size_t dim_size = rank * sizeof(uint32_t); + uint32_t *dimensions = (uint32_t *) malloc(dim_size); if (dimensions == nullptr) { - QNN_LOG_WARN("deep_copy_qnn_tensors() allocation error while copying tensor %s\n", QNN_TENSOR_GET_NAME(src)); + QNN_LOG_WARN("deep_copy_qnn_tensors() allocation error while copying " + "tensor %s\n", + QNN_TENSOR_GET_NAME(src)); return 1; } memscpy(dimensions, dim_size, QNN_TENSOR_GET_DIMENSIONS(src), dim_size); @@ -443,7 +453,6 @@ static int deep_copy_qnn_tensors(Qnn_Tensor_t & src, Qnn_Tensor_t & dst) { return err; } - static int free_qnn_tensor(Qnn_Tensor_t & tensor) { int err = 0; VALIDATE_TENSOR_VERSION(tensor, err); @@ -454,7 +463,6 @@ static int free_qnn_tensor(Qnn_Tensor_t & tensor) { return err; } - static uint32_t ggml_get_tensor_rank(const ggml_tensor * tensor) { uint32_t rank = 0; for (int i = 0; i < GGML_MAX_DIMS; i++) { @@ -465,44 +473,40 @@ static uint32_t ggml_get_tensor_rank(const ggml_tensor * tensor) { return rank; } - -//TODO: mapping more ggml data type to QNN data type -//ref:explanation of k-quants, https://github.com/ggerganov/llama.cpp/pull/1684 +// TODO: mapping more ggml data type to QNN data type +// ref:explanation of k-quants, https://github.com/ggerganov/llama.cpp/pull/1684 static Qnn_DataType_t qnn_datatype_from_ggml_datatype(enum ggml_type ggmltype) { switch (ggmltype) { - case GGML_TYPE_F16: - return QNN_DATATYPE_FLOAT_16; - case GGML_TYPE_F32: - return QNN_DATATYPE_FLOAT_32; - case GGML_TYPE_I8: - return QNN_DATATYPE_INT_8; - case GGML_TYPE_Q8_0: - return QNN_DATATYPE_SFIXED_POINT_8; - default: - break; - + case GGML_TYPE_F16: + return QNN_DATATYPE_FLOAT_16; + case GGML_TYPE_F32: + return QNN_DATATYPE_FLOAT_32; + case GGML_TYPE_I8: + return QNN_DATATYPE_INT_8; + case GGML_TYPE_Q8_0: + return QNN_DATATYPE_SFIXED_POINT_8; + default: + break; } return QNN_DATATYPE_UNDEFINED; } - -//TODO: only support GGML_OP_ADD/GGML_OP_MUL/GGML_OP_MUL_MAT +// TODO: only support GGML_OP_ADD/GGML_OP_MUL/GGML_OP_MUL_MAT static const char * qnn_opname_from_ggmlop(enum ggml_op ggmlop) { switch (ggmlop) { - case GGML_OP_ADD: - return QNN_OP_ELEMENT_WISE_ADD; - case GGML_OP_MUL: - return QNN_OP_ELEMENT_WISE_MULTIPLY; - case GGML_OP_MUL_MAT: - return QNN_OP_MAT_MUL; - default: - break; + case GGML_OP_ADD: + return QNN_OP_ELEMENT_WISE_ADD; + case GGML_OP_MUL: + return QNN_OP_ELEMENT_WISE_MULTIPLY; + case GGML_OP_MUL_MAT: + return QNN_OP_MAT_MUL; + default: + break; } return nullptr; } - static uint32_t ggml_get_tensor_data_size(const ggml_tensor * tensor) { /* size_t data_size = ggml_row_size(tensor->type, tensor->ne[0]); @@ -516,86 +520,85 @@ static uint32_t ggml_get_tensor_data_size(const ggml_tensor * tensor) { return ggml_nbytes(tensor); } - -template +template Fn load_qnn_functionpointers(void * handle, const char * function_name) { return reinterpret_cast(dlsym(handle, function_name)); } - static const char * get_qnn_backend_name(int n_backend_type) { switch (n_backend_type) { - case 0: - return "QNN-CPU"; - case 1: - return "QNN-GPU"; - case 2: - return "QNN-NPU"; - case 3: - return "ggml"; //"fake" QNN backend, used for compare performance between QNN backend and original GGML - - default: - return "unknown"; + case 0: + return "QNN-CPU"; + case 1: + return "QNN-GPU"; + case 2: + return "QNN-NPU"; + case 3: + return "ggml"; //"fake" QNN backend, used for compare performance between QNN backend and original GGML + default: + return "unknown"; } } - static intptr_t align_to(size_t alignment, intptr_t offset) { - return offset % alignment == 0 
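// [editorial aside, illustrative only, not part of the patch] The two mapping helpers above
// only cover F32/F16/I8/Q8_0 and ADD/MUL/MUL_MAT; everything else falls through to an
// "undefined"/nullptr result, which is how unsupported nodes get rejected later on.
// The fall-through shape in isolation, with stand-in enums instead of ggml_type:
#include <cstdio>

enum class tensor_type { f32, f16, i8, q8_0, q4_k };
enum class qnn_type    { float32, float16, int8, sfixed8, undefined };

static qnn_type map_type(tensor_type t) {
    switch (t) {
        case tensor_type::f32:  return qnn_type::float32;
        case tensor_type::f16:  return qnn_type::float16;
        case tensor_type::i8:   return qnn_type::int8;
        case tensor_type::q8_0: return qnn_type::sfixed8;
        default:                return qnn_type::undefined; // unsupported: caller must skip the op
    }
}

int main() {
    bool supported = map_type(tensor_type::q4_k) != qnn_type::undefined;
    printf("q4_k supported: %s\n", supported ? "yes" : "no");
    return 0;
}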
? offset - : offset + - (static_cast(alignment) - - offset % static_cast(alignment)); + return offset % alignment == 0 + ? offset + : offset + (static_cast(alignment) - + offset % static_cast(alignment)); } - -static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...) { +static void ggml_qnn_log_internal(ggml_log_level level, const char * file, + const char * func, int line, + const char * format, ...) { static std::mutex ggml_qnn_log_internal_mutex; - static char s_ggml_qnn_log_internal_buf[GGML_QNN_LOGBUF_LEN]; + static char s_ggml_qnn_log_internal_buf[GGML_QNN_LOGBUF_LEN]; { std::lock_guard lock(ggml_qnn_log_internal_mutex); - va_list args; + va_list args; + va_start(args, format); - int len_prefix = snprintf(s_ggml_qnn_log_internal_buf, GGML_QNN_LOGBUF_LEN, "[%s, %d]: ", func, line); - int len = vsnprintf(s_ggml_qnn_log_internal_buf + len_prefix, GGML_QNN_LOGBUF_LEN - len_prefix, format, args); + int len_prefix = + snprintf(s_ggml_qnn_log_internal_buf, GGML_QNN_LOGBUF_LEN, + "[%s, %d]: ", func, line); + int len = vsnprintf(s_ggml_qnn_log_internal_buf + len_prefix, + GGML_QNN_LOGBUF_LEN - len_prefix, format, args); if (len < (GGML_QNN_LOGBUF_LEN - len_prefix)) { #if (defined __ANDROID__) || (defined ANDROID) - //for Android APK + // for Android APK __android_log_print(level, "ggml-qnn", "%s\n", s_ggml_qnn_log_internal_buf); #endif - //for Android command line application or WoA + // for Android command line application or WoA printf("%s\n", s_ggml_qnn_log_internal_buf); } va_end(args); } } - // ================================================================================================= // -// wrapper class of Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK +// wrapper class of Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI +// Engine Direct) SDK // // ================================================================================================= class qnn_interface { -#define DEFINE_SHIM_FUNCTION_INTERFACE(F, pointer_name) \ - template \ - inline auto qnn_##F(Args... args) const { \ - return (_qnn_interface->QNN_INTERFACE_VER_NAME.pointer_name)( \ - std::forward(args)...); \ - } - +#define DEFINE_SHIM_FUNCTION_INTERFACE(F, pointer_name) \ + template inline auto qnn_##F(Args... args) const { \ + return (_qnn_interface->QNN_INTERFACE_VER_NAME.pointer_name)( \ + std::forward(args)...); \ + } -#define DEFINE_SHIM_FUNCTION_SYS_INTERFACE(F, pointer_name) \ - template \ - inline auto qnn_##F(Args... args) const { \ - return (_qnn_sys_interface->QNN_SYSTEM_INTERFACE_VER_NAME.pointer_name)( \ - std::forward(args)...); \ - } +#define DEFINE_SHIM_FUNCTION_SYS_INTERFACE(F, pointer_name) \ + template inline auto qnn_##F(Args... 
args) const { \ + return ( \ + _qnn_sys_interface->QNN_SYSTEM_INTERFACE_VER_NAME.pointer_name)( \ + std::forward(args)...); \ + } friend class qnn_instance; -public: + public: qnn_interface() = default; // QnnBackend @@ -603,31 +606,38 @@ class qnn_interface { DEFINE_SHIM_FUNCTION_INTERFACE(backend_free, backendFree); - DEFINE_SHIM_FUNCTION_INTERFACE(backend_register_op_package, backendRegisterOpPackage); + DEFINE_SHIM_FUNCTION_INTERFACE(backend_register_op_package, + backendRegisterOpPackage); - DEFINE_SHIM_FUNCTION_INTERFACE(backend_validate_op_config, backendValidateOpConfig); + DEFINE_SHIM_FUNCTION_INTERFACE(backend_validate_op_config, + backendValidateOpConfig); - DEFINE_SHIM_FUNCTION_INTERFACE(backend_get_api_version, backendGetApiVersion); + DEFINE_SHIM_FUNCTION_INTERFACE(backend_get_api_version, + backendGetApiVersion); // QnnDevice DEFINE_SHIM_FUNCTION_INTERFACE(device_create, deviceCreate); DEFINE_SHIM_FUNCTION_INTERFACE(device_free, deviceFree); - DEFINE_SHIM_FUNCTION_INTERFACE(device_get_infrastructure, deviceGetInfrastructure); + DEFINE_SHIM_FUNCTION_INTERFACE(device_get_infrastructure, + deviceGetInfrastructure); - DEFINE_SHIM_FUNCTION_INTERFACE(device_get_platform_info, deviceGetPlatformInfo); + DEFINE_SHIM_FUNCTION_INTERFACE(device_get_platform_info, + deviceGetPlatformInfo); DEFINE_SHIM_FUNCTION_INTERFACE(device_get_info, deviceGetInfo); // QnnContext DEFINE_SHIM_FUNCTION_INTERFACE(context_create, contextCreate); - DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary_size, contextGetBinarySize); + DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary_size, + contextGetBinarySize); DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary, contextGetBinary); - DEFINE_SHIM_FUNCTION_INTERFACE(context_create_from_binary, contextCreateFromBinary); + DEFINE_SHIM_FUNCTION_INTERFACE(context_create_from_binary, + contextCreateFromBinary); DEFINE_SHIM_FUNCTION_INTERFACE(context_free, contextFree); @@ -666,17 +676,22 @@ class qnn_interface { DEFINE_SHIM_FUNCTION_INTERFACE(mem_de_register, memDeRegister); // QnnProperty - DEFINE_SHIM_FUNCTION_INTERFACE(property_has_capability, propertyHasCapability); + DEFINE_SHIM_FUNCTION_INTERFACE(property_has_capability, + propertyHasCapability); // QnnTensor - DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_context_tensor, tensorCreateContextTensor); + DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_context_tensor, + tensorCreateContextTensor); - DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_graph_tensor, tensorCreateGraphTensor); + DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_graph_tensor, + tensorCreateGraphTensor); // QnnSystem - DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_create, systemContextCreate); + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_create, + systemContextCreate); - DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_get_binary_info, systemContextGetBinaryInfo); + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_get_binary_info, + systemContextGetBinaryInfo); DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_free, systemContextFree); @@ -684,67 +699,60 @@ class qnn_interface { _qnn_interface = qnn_interface; } - void set_qnn_system_interface(const QnnSystemInterface_t * qnn_sys_interface) { + void set_qnn_system_interface( + const QnnSystemInterface_t * qnn_sys_interface) { _qnn_sys_interface = qnn_sys_interface; } - uint32_t get_backend_id() const { - return _qnn_interface->backendId; - } + uint32_t get_backend_id() const { return _qnn_interface->backendId; } bool is_loaded() const { return ((_qnn_sys_interface != nullptr) && 
(_qnn_interface != nullptr)); } -private: + private: const QnnInterface_t * _qnn_interface = nullptr; const QnnSystemInterface_t * _qnn_sys_interface = nullptr; }; - - // ================================================================================================= // // wrapper class of Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK // -// and -// -// resource management of QNN resources for GGML's QNN backend // ================================================================================================= class qnn_instance { -public: + public: using BackendIdType = decltype(QnnInterface_t{}.backendId); - explicit qnn_instance(const std::string & lib_path, const std::string & backend_name, - const std::string & model_name) : - _lib_path(std::move(lib_path)), - _backend_name(std::move(backend_name)), - _model_name(std::move(model_name)) {}; + explicit qnn_instance(const std::string & lib_path, + const std::string & backend_name, + const std::string & model_name) + : _lib_path(std::move(lib_path)) + , _backend_name(std::move(backend_name)) + , _model_name(std::move(model_name)){}; - ~qnn_instance() { - } + ~qnn_instance() {} int qnn_init(const QnnSaver_Config_t ** saver_config); int qnn_finalize(); - const qnn_interface &get_qnn_interface() { + const qnn_interface & get_qnn_interface() { if (!_qnn_interface.is_loaded()) { QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); } return _qnn_interface; } - - const QNN_INTERFACE_VER_TYPE &get_qnn_raw_interface() { + const QNN_INTERFACE_VER_TYPE & get_qnn_raw_interface() { if (!_qnn_interface.is_loaded()) { QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); } return _qnn_raw_interface; } - const QNN_SYSTEM_INTERFACE_VER_TYPE &get_qnn_raw_system_interface() { + const QNN_SYSTEM_INTERFACE_VER_TYPE & get_qnn_raw_system_interface() { if (!_qnn_interface.is_loaded()) { QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); } @@ -753,24 +761,31 @@ class qnn_instance { const Qnn_LogHandle_t get_qnn_log_handle() { return _qnn_log_handle; } - const Qnn_ProfileHandle_t get_qnn_profile_handle() { return _qnn_profile_handle; } + const Qnn_ProfileHandle_t get_qnn_profile_handle() { + return _qnn_profile_handle; + } - const Qnn_DeviceHandle_t get_qnn_device_handle() { return _qnn_device_handle; } + const Qnn_DeviceHandle_t get_qnn_device_handle() { + return _qnn_device_handle; + } - const Qnn_BackendHandle_t get_qnn_backend_handle() { return _qnn_backend_handle; } + const Qnn_BackendHandle_t get_qnn_backend_handle() { + return _qnn_backend_handle; + } - const Qnn_ContextHandle_t get_qnn_context_handle() { return _qnn_context_handle; } + const Qnn_ContextHandle_t get_qnn_context_handle() { + return _qnn_context_handle; + } - const QnnSystemContext_Handle_t get_qnn_system_handle() { return _qnn_system_handle; } + const QnnSystemContext_Handle_t get_qnn_system_handle() { + return _qnn_system_handle; + } const Qnn_GraphHandle_t get_qnn_graph_handle() { return _qnn_graph_handle; } - - int init_qnn_graph(const char * graph_name, - bool debug, - uint8_t do_node_validation = 1, - const QnnGraph_Config_t ** graph_configs = nullptr - ); + int init_qnn_graph(const char * graph_name, bool debug, + uint8_t do_node_validation = 1, + const QnnGraph_Config_t ** graph_configs = nullptr); int finalize_qnn_graph(); @@ -782,35 +797,35 @@ class qnn_instance { return 1; } - QnnHtpDevice_Infrastructure_t *htp_infra = static_cast(device_infra); - QnnHtpDevice_PerfInfrastructure_t *htp_perfinfra = &htp_infra->perfInfra; + 
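// [editorial aside, illustrative only, not part of the patch] DEFINE_SHIM_FUNCTION_INTERFACE
// above generates one forwarding member per QNN API entry, so every call goes through the
// versioned function table without repeating the lookup. The forwarding trick reduced to a
// standalone sketch (api_table/shim are stand-ins, not the real QNN_INTERFACE_VER_TYPE):
#include <cstdio>
#include <utility>

struct api_table { int (*add)(int, int); }; // stand-in for the QNN function table

static int real_add(int a, int b) { return a + b; }

class shim {
  public:
    explicit shim(const api_table * t) : _table(t) {}

    // same shape as the generated qnn_* wrappers: forward the arguments to the table entry
    template <typename... Args> auto call_add(Args... args) const {
        return (_table->add)(std::forward<Args>(args)...);
    }

  private:
    const api_table * _table = nullptr;
};

int main() {
    api_table t = {real_add};
    shim s(&t);
    printf("2 + 3 = %d\n", s.call_add(2, 3));
    return 0;
}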
QnnHtpDevice_Infrastructure_t * htp_infra = static_cast(device_infra); + QnnHtpDevice_PerfInfrastructure_t * htp_perfinfra = &htp_infra->perfInfra; uint32_t power_configid = 1; - uint32_t device_id = 0; - uint32_t core_id = 0; + uint32_t device_id = 0; + uint32_t core_id = 0; htp_perfinfra->createPowerConfigId(device_id, core_id, &power_configid); - _qnn_htp_perfinfra = htp_perfinfra; + _qnn_htp_perfinfra = htp_perfinfra; _qnn_power_configid = power_configid; return 0; } - int set_rpc_polling() { if (_qnn_rpc_pollingtime > 0) { QnnHtpPerfInfrastructure_PowerConfig_t rpc_pollingTime; memset(&rpc_pollingTime, 0, sizeof(rpc_pollingTime)); rpc_pollingTime.option = - QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME; + QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME; rpc_pollingTime.rpcPollingTimeConfig = _qnn_rpc_pollingtime; - const QnnHtpPerfInfrastructure_PowerConfig_t *powerConfigs[] = {&rpc_pollingTime, nullptr}; + const QnnHtpPerfInfrastructure_PowerConfig_t * powerConfigs[] = { + &rpc_pollingTime, nullptr}; if (_qnn_htp_perfinfra) { - _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, powerConfigs); + _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, + powerConfigs); } } return 0; } - int set_high_performance_mode() { if (nullptr == _qnn_htp_perfinfra) { QNN_LOG_DEBUG("perf intra is null\n"); @@ -820,39 +835,49 @@ class qnn_instance { QnnHtpPerfInfrastructure_PowerConfig_t powerConfig; memset(&powerConfig, 0, sizeof(powerConfig)); powerConfig.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_V3; - powerConfig.dcvsV3Config.dcvsEnable = 0; + powerConfig.dcvsV3Config.dcvsEnable = 0; powerConfig.dcvsV3Config.setDcvsEnable = 1; - powerConfig.dcvsV3Config.contextId = _qnn_power_configid; + powerConfig.dcvsV3Config.contextId = _qnn_power_configid; powerConfig.dcvsV3Config.powerMode = QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_PERFORMANCE_MODE; - powerConfig.dcvsV3Config.setSleepLatency = 1; // True to consider Latency parameter otherwise False - powerConfig.dcvsV3Config.setBusParams = 1; // True to consider Bus parameter otherwise False - powerConfig.dcvsV3Config.setCoreParams = 1; // True to consider Core parameter otherwise False - powerConfig.dcvsV3Config.sleepDisable = 0; // True to consider sleep/LPM modes, False to enable - powerConfig.dcvsV3Config.setSleepDisable = 0; // True to consider sleep disable/enable parameter otherwise False - // set Sleep latency parameter + powerConfig.dcvsV3Config.setSleepLatency = + 1; // true to consider Latency parameter otherwise False + powerConfig.dcvsV3Config.setBusParams = + 1; // true to consider Bus parameter otherwise False + powerConfig.dcvsV3Config.setCoreParams = + 1; // true to consider Core parameter otherwise False + powerConfig.dcvsV3Config.sleepDisable = + 0; // true to consider sleep/LPM modes, False to enable + powerConfig.dcvsV3Config.setSleepDisable = + 0; // true to consider sleep disable/enable parameter otherwise False set sleep latency parameter uint32_t latencyValue = 40; - powerConfig.dcvsV3Config.sleepLatency = latencyValue; // range 40-2000 micro sec - // set Bus Clock Parameters (refer QnnHtpPerfInfrastructure_VoltageCorner_t enum) - powerConfig.dcvsV3Config.busVoltageCornerMin = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - powerConfig.dcvsV3Config.busVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - powerConfig.dcvsV3Config.busVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - // set Core Clock Parameters (refer QnnHtpPerfInfrastructure_VoltageCorner_t 
enum) - powerConfig.dcvsV3Config.coreVoltageCornerMin = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - powerConfig.dcvsV3Config.coreVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - powerConfig.dcvsV3Config.coreVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + powerConfig.dcvsV3Config.sleepLatency = + latencyValue; // range 40-2000 micro sec + // set Bus Clock Parameters + powerConfig.dcvsV3Config.busVoltageCornerMin = + DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + powerConfig.dcvsV3Config.busVoltageCornerTarget = + DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + powerConfig.dcvsV3Config.busVoltageCornerMax = + DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + // set Core Clock Parameters + powerConfig.dcvsV3Config.coreVoltageCornerMin = + DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + powerConfig.dcvsV3Config.coreVoltageCornerTarget = + DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + powerConfig.dcvsV3Config.coreVoltageCornerMax = + DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; // set power config with different performance parameters - const QnnHtpPerfInfrastructure_PowerConfig_t *powerConfigs[] = {&powerConfig, nullptr}; + const QnnHtpPerfInfrastructure_PowerConfig_t *powerConfigs[] = { + &powerConfig, nullptr}; _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, powerConfigs); return 0; } - std::string & get_qnn_graph_name() { return _graph_name; } + std::string &get_qnn_graph_name() { return _graph_name; } - bool is_rpcmem_initialized() { - return _rpcmem_initialized; - } + bool is_rpcmem_initialized() { return _rpcmem_initialized; } void set_rpcmem_initialized(bool initialized) { _rpcmem_initialized = initialized; @@ -864,7 +889,7 @@ class qnn_instance { void unregister_rpcmem(); - void *alloc_rpcmem(size_t bytes, size_t alignment); + void * alloc_rpcmem(size_t bytes, size_t alignment); void free_rpcmem(void * buf); @@ -874,15 +899,17 @@ class qnn_instance { return _qnn_mem_set.count(handle) != 0U; } -public: - std::map> _qnn_graph_map; + public: + std::map> + _qnn_graph_map; -private: + private: int load_system(); int unload_system(); - int load_backend(std::string &lib_path, const QnnSaver_Config_t ** saver_config); + int load_backend(std::string & lib_path, const QnnSaver_Config_t ** saver_config); int unload_backend(); @@ -890,24 +917,25 @@ class qnn_instance { _qnn_raw_interface = raw_interface; } - void set_qnn_raw_system_interface(QNN_SYSTEM_INTERFACE_VER_TYPE &raw_interface) { + void set_qnn_raw_system_interface(QNN_SYSTEM_INTERFACE_VER_TYPE & raw_interface) { _qnn_raw_system_interface = raw_interface; } -private: + private: static constexpr const int _required_num_providers = 1; -private: - std::string _lib_path; - std::string _backend_name; - std::string _model_name; // prebuilt QNN model name, not used in currently + private: + std::string _lib_path; + std::string _backend_name; + std::string _model_name; // prebuilt QNN model name, not used currently BackendIdType _backend_id; - bool _debug_tensor = false; // flag to indicate if requested graph is to be run in debug mode - bool _do_node_validations = true; // flag to indicate whether all add_node calls need to be validated - QnnLog_Level_t _qnn_log_level = QNN_LOG_LEVEL_DEBUG; + bool _debug_tensor = false; // flag to indicate if requested graph is to be run in debug mode + bool _do_node_validations = true; // flag to indicate whether all add_node + // calls need to be validated + QnnLog_Level_t _qnn_log_level = QNN_LOG_LEVEL_DEBUG; - ggml_qnn_profile_level _profile_level = ggml_qnn_profile_level::profile_detail; + 
ggml_qnn_profile_level _profile_level = ggml_qnn_profile_level::profile_detail; qnn_interface _qnn_interface; @@ -927,36 +955,35 @@ class qnn_instance { QnnSystemContext_Handle_t _qnn_system_handle = nullptr; - QnnHtpDevice_PerfInfrastructure_t * _qnn_htp_perfinfra = nullptr; - uint32_t _qnn_power_configid = 1; + QnnHtpDevice_PerfInfrastructure_t * _qnn_htp_perfinfra = nullptr; + uint32_t _qnn_power_configid = 1; uint32_t _qnn_rpc_pollingtime = 9999; // 0-10000 us for high performing - QNN_INTERFACE_VER_TYPE _qnn_raw_interface; + QNN_INTERFACE_VER_TYPE _qnn_raw_interface; QNN_SYSTEM_INTERFACE_VER_TYPE _qnn_raw_system_interface; std::unordered_set _qnn_mem_set; - std::mutex _init_mutex; - std::unordered_map _loaded_lib_handle; + std::mutex _init_mutex; + std::unordered_map _loaded_lib_handle; std::unordered_map _lib_path_to_backend_id; std::unordered_map _loaded_backend; - void * _rpc_lib_handle = nullptr; - std::atomic_bool _rpcmem_initialized{false}; - pfn_rpc_mem_alloc _pfn_rpc_mem_alloc; - pfn_rpc_mem_free _pfn_rpc_mem_free; - pfn_rpc_mem_to_fd _pfn_rpc_mem_to_fd; - pfn_rpc_mem_init _pfn_rpc_mem_init; - pfn_rpc_mem_deinit _pfn_rpc_mem_deinit; + void * _rpc_lib_handle = nullptr; + std::atomic_bool _rpcmem_initialized{false}; + pfn_rpc_mem_alloc _pfn_rpc_mem_alloc; + pfn_rpc_mem_free _pfn_rpc_mem_free; + pfn_rpc_mem_to_fd _pfn_rpc_mem_to_fd; + pfn_rpc_mem_init _pfn_rpc_mem_init; + pfn_rpc_mem_deinit _pfn_rpc_mem_deinit; std::unordered_map _rpcmem_store_map; std::string _graph_name; }; - // ================================================================================================= // -// implementation of wrapper class +// implementation of QNN wrapper class // // ================================================================================================= void * qnn_instance::alloc_rpcmem(size_t bytes, size_t alignment) { @@ -965,15 +992,18 @@ void * qnn_instance::alloc_rpcmem(size_t bytes, size_t alignment) { return nullptr; } - auto allocate_bytes = static_cast(bytes + alignment); - void * buf = _pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, allocate_bytes); + auto allocate_bytes = static_cast(bytes + alignment); + void * buf = _pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, + allocate_bytes); if (buf == nullptr) { QNN_LOG_WARN("failed to allocate rpc memory\n"); return nullptr; } - auto aligned_buf = reinterpret_cast(align_to(alignment,reinterpret_cast(buf))); - bool status = _rpcmem_store_map.insert(std::pair(aligned_buf, buf)).second; + auto aligned_buf = reinterpret_cast( + align_to(alignment, reinterpret_cast(buf))); + bool status = + _rpcmem_store_map.insert(std::pair(aligned_buf, buf)).second; if (!status) { QNN_LOG_WARN("failed to allocate rpc memory\n"); _pfn_rpc_mem_free(buf); @@ -982,7 +1012,6 @@ void * qnn_instance::alloc_rpcmem(size_t bytes, size_t alignment) { return aligned_buf; } - void qnn_instance::free_rpcmem(void * buf) { if (!_rpcmem_initialized) { QNN_LOG_WARN("rpc memory not initialized\n"); @@ -994,7 +1023,6 @@ void qnn_instance::free_rpcmem(void * buf) { } } - int32_t qnn_instance::rpcmem_to_fd(void * buf) { int32_t mem_fd = -1; if (!is_rpcmem_initialized()) { @@ -1006,7 +1034,6 @@ int32_t qnn_instance::rpcmem_to_fd(void * buf) { return mem_fd; } - int qnn_instance::register_rpcmem(void * p_data, Qnn_Tensor_t * p_tensor) { if (nullptr == p_data || (nullptr == p_tensor)) { QNN_LOG_WARN("invalid param\n"); @@ -1020,10 +1047,11 @@ int qnn_instance::register_rpcmem(void * p_data, Qnn_Tensor_t * p_tensor) { if 
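// [editorial aside, illustrative only, not part of the patch] alloc_rpcmem() above
// over-allocates by `alignment` bytes, returns an aligned pointer, and records the raw
// pointer in _rpcmem_store_map so free_rpcmem() can release the original allocation.
// The same bookkeeping as a standalone sketch, with plain malloc standing in for
// _pfn_rpc_mem_alloc:
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <unordered_map>

static std::unordered_map<void *, void *> g_store; // aligned ptr -> raw ptr

static intptr_t align_up(size_t alignment, intptr_t offset) {
    return offset % (intptr_t) alignment == 0
               ? offset
               : offset + (intptr_t) alignment - offset % (intptr_t) alignment;
}

static void * alloc_aligned(size_t bytes, size_t alignment) {
    void * raw = malloc(bytes + alignment); // over-allocate so the pointer can slide forward
    if (raw == nullptr) {
        return nullptr;
    }
    void * aligned = (void *) align_up(alignment, (intptr_t) raw);
    g_store[aligned] = raw;                 // remember what to free later
    return aligned;
}

static void free_aligned(void * aligned) {
    auto it = g_store.find(aligned);
    if (it != g_store.end()) {
        free(it->second);
        g_store.erase(it);
    }
}

int main() {
    void * p = alloc_aligned(100, 64);
    printf("aligned to 64: %s\n", ((uintptr_t) p % 64 == 0) ? "yes" : "no");
    free_aligned(p);
    return 0;
}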
(is_rpcmem_allocated(p_data)) { QNN_LOG_WARN("rpc memory already allocated\n"); - //return 3; + // return 3; } if (is_rpcmem_registered((QNN_VER_PTR(*p_tensor)->memHandle))) { - QNN_LOG_WARN("tensor %s has been registered shared memory\n", (QNN_VER_PTR(*p_tensor)->name)); + QNN_LOG_WARN("tensor %s has been registered shared memory\n", + (QNN_VER_PTR(*p_tensor)->name)); return 4; } @@ -1033,24 +1061,23 @@ int qnn_instance::register_rpcmem(void * p_data, Qnn_Tensor_t * p_tensor) { return 5; } QNN_LOG_DEBUG("mem_fd %d\n", mem_fd); - Qnn_MemDescriptor_t descriptor = { - {QNN_VER_PTR(*p_tensor)->rank, QNN_VER_PTR(*p_tensor)->dimensions, nullptr}, - QNN_VER_PTR(*p_tensor)->dataType, - QNN_MEM_TYPE_ION, - {{mem_fd}}}; - Qnn_MemHandle_t handle = nullptr; - int error = QNN_SUCCESS; - error = _qnn_interface.qnn_mem_register( - _qnn_context_handle, - &descriptor, - /*numDescriptors=*/1, - &handle); + Qnn_MemDescriptor_t descriptor = {{QNN_VER_PTR(*p_tensor)->rank, + QNN_VER_PTR(*p_tensor)->dimensions, + nullptr}, + QNN_VER_PTR(*p_tensor)->dataType, + QNN_MEM_TYPE_ION, + {{mem_fd}}}; + Qnn_MemHandle_t handle = nullptr; + int error = QNN_SUCCESS; + error = _qnn_interface.qnn_mem_register(_qnn_context_handle, &descriptor, + /*numDescriptors=*/1, &handle); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to register shared memory, error %d, %s\n", QNN_GET_ERROR_CODE(error), - strerror(error)); + QNN_LOG_WARN("failed to register shared memory, error %d, %s\n", + QNN_GET_ERROR_CODE(error), strerror(error)); return 6; } else { - QNN_LOG_INFO("tensor %s successfully register shared memory\n", (QNN_VER_PTR(*p_tensor)->name)); + QNN_LOG_INFO("tensor %s successfully register shared memory\n", + (QNN_VER_PTR(*p_tensor)->name)); } QNN_VER_PTR(*p_tensor)->memHandle = handle; _qnn_mem_set.insert(handle); @@ -1058,7 +1085,6 @@ int qnn_instance::register_rpcmem(void * p_data, Qnn_Tensor_t * p_tensor) { return 0; } - void qnn_instance::unregister_rpcmem() { Qnn_ErrorHandle_t error = QNN_SUCCESS; @@ -1069,47 +1095,49 @@ void qnn_instance::unregister_rpcmem() { for (auto &mem_handle : _qnn_mem_set) { error = _qnn_interface.qnn_mem_de_register(&mem_handle, 1); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to unregister shared memory, error %d\n", QNN_GET_ERROR_CODE(error)); + QNN_LOG_WARN("failed to unregister shared memory, error %d\n", + QNN_GET_ERROR_CODE(error)); } } _qnn_mem_set.clear(); } - bool qnn_instance::is_rpcmem_allocated(void * buf) { return _rpcmem_store_map.count(buf) != 0U; } - -int qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t ** saver_config) { +int qnn_instance::load_backend(std::string & lib_path, + const QnnSaver_Config_t ** saver_config) { Qnn_ErrorHandle_t error = QNN_SUCCESS; QNN_LOG_DEBUG("lib_path:%s\n", lib_path.c_str()); void * lib_handle = dlopen(lib_path.c_str(), RTLD_NOW | RTLD_GLOBAL); if (nullptr == lib_handle) { - QNN_LOG_WARN("can not open QNN library %s, with error: %s", lib_path.c_str(), dlerror()); + QNN_LOG_WARN("can not open QNN library %s, with error: %s", + lib_path.c_str(), dlerror()); return 1; } - // load get_provider function - auto get_providers = load_qnn_functionpointers<_pfn_QnnInterface_getProviders *>(lib_handle, - "QnnInterface_getProviders"); + auto get_providers = load_qnn_functionpointers<_pfn_QnnInterface_getProviders *>( + lib_handle, "QnnInterface_getProviders"); if (nullptr == get_providers) { - QNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s", dlerror()); + QNN_LOG_WARN("can not load symbol 
QnnInterface_getProviders : %s", + dlerror()); return 2; } - // get QnnInterface Providers std::uint32_t num_providers = 0; const QnnInterface_t ** provider_list = nullptr; error = get_providers(&provider_list, &num_providers); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to get providers, error %d", QNN_GET_ERROR_CODE(error)); + QNN_LOG_WARN("failed to get providers, error %d", + QNN_GET_ERROR_CODE(error)); return 3; } QNN_LOG_DEBUG("num_providers=%d\n", num_providers); if (num_providers != _required_num_providers) { - QNN_LOG_WARN("providers is %d instead of required %d", num_providers, _required_num_providers); + QNN_LOG_WARN("providers is %d instead of required %d", num_providers, + _required_num_providers); return 4; } @@ -1120,10 +1148,12 @@ int qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t * bool found_valid_interface = false; QNN_INTERFACE_VER_TYPE qnn_interface; for (size_t idx = 0; idx < num_providers; idx++) { - if (QNN_API_VERSION_MAJOR == provider_list[idx]->apiVersion.coreApiVersion.major && - QNN_API_VERSION_MINOR <= provider_list[idx]->apiVersion.coreApiVersion.minor) { + if (QNN_API_VERSION_MAJOR == + provider_list[idx]->apiVersion.coreApiVersion.major && + QNN_API_VERSION_MINOR <= + provider_list[idx]->apiVersion.coreApiVersion.minor) { found_valid_interface = true; - qnn_interface = provider_list[idx]->QNN_INTERFACE_VER_NAME; + qnn_interface = provider_list[idx]->QNN_INTERFACE_VER_NAME; break; } } @@ -1136,33 +1166,34 @@ int qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t * } set_qnn_raw_interface(qnn_interface); - BackendIdType backend_id = provider_list[0]->backendId; + BackendIdType backend_id = provider_list[0]->backendId; _lib_path_to_backend_id[lib_path] = backend_id; if (_loaded_backend.count(backend_id) > 0) { QNN_LOG_WARN("lib_path %s is loaded, but backend %d already exists\n", - lib_path.c_str(), backend_id); + lib_path.c_str(), backend_id); } _loaded_backend[backend_id] = provider_list[0]; if (_loaded_lib_handle.count(backend_id) > 0) { QNN_LOG_WARN("closing %p\n", _loaded_lib_handle[backend_id]); int dlclose_error = dlclose(_loaded_lib_handle[backend_id]); if (dlclose_error != 0) { - QNN_LOG_WARN("fail to close %p with error %s\n", _loaded_lib_handle[backend_id], dlerror()); + QNN_LOG_WARN("fail to close %p with error %s\n", + _loaded_lib_handle[backend_id], dlerror()); } } _loaded_lib_handle[backend_id] = lib_handle; - _backend_id = backend_id; + _backend_id = backend_id; return 0; } - int qnn_instance::unload_backend() { int dlclose_error = 0; - for (auto &it : _loaded_lib_handle) { + for (auto & it : _loaded_lib_handle) { dlclose_error = dlclose(it.second); if (dlclose_error != 0) { - QNN_LOG_WARN("failed to close QNN backend %d, error %s\n", it.first, dlerror()); + QNN_LOG_WARN("failed to close QNN backend %d, error %s\n", it.first, + dlerror()); } } @@ -1173,7 +1204,6 @@ int qnn_instance::unload_backend() { return 0; } - int qnn_instance::load_system() { Qnn_ErrorHandle_t error = QNN_SUCCESS; @@ -1182,14 +1212,18 @@ int qnn_instance::load_system() { _system_lib_handle = dlopen(system_lib_path.c_str(), RTLD_NOW | RTLD_LOCAL); if (nullptr == _system_lib_handle) { - QNN_LOG_WARN("can not open QNN library %s, error: %s\n", system_lib_path.c_str(), dlerror()); + QNN_LOG_WARN("can not open QNN library %s, error: %s\n", + system_lib_path.c_str(), dlerror()); return 1; } - auto * get_providers = reinterpret_cast<_pfn_QnnSystemInterface_getProviders *>(dlsym( - _system_lib_handle, 
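// [editorial aside, illustrative only, not part of the patch] Both load_backend() and
// load_system() walk the provider list with the same compatibility rule: the major API
// version must match what the code was built against, while an equal or newer minor
// version is accepted. The check in isolation, with stand-in constants instead of the
// real QNN_API_VERSION_MAJOR/MINOR macros:
#include <cstdio>

struct api_version { unsigned major_ver; unsigned minor_ver; };

static const unsigned k_required_major = 2;  // hypothetical build-time version
static const unsigned k_required_minor = 14;

static bool provider_is_compatible(api_version v) {
    return v.major_ver == k_required_major && // breaking changes: exact major match required
           v.minor_ver >= k_required_minor;   // newer minor stays backward compatible
}

int main() {
    api_version provider = {2, 19};
    printf("compatible: %s\n", provider_is_compatible(provider) ? "yes" : "no");
    return 0;
}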
"QnnSystemInterface_getProviders")); + auto * get_providers = + reinterpret_cast<_pfn_QnnSystemInterface_getProviders *>( + dlsym(_system_lib_handle, "QnnSystemInterface_getProviders")); if (nullptr == get_providers) { - QNN_LOG_WARN("can not load QNN symbol QnnSystemInterface_getProviders: %s\n", dlerror()); + QNN_LOG_WARN( + "can not load QNN symbol QnnSystemInterface_getProviders: %s\n", + dlerror()); return 2; } @@ -1197,12 +1231,14 @@ int qnn_instance::load_system() { const QnnSystemInterface_t ** provider_list = nullptr; error = get_providers(&provider_list, &num_providers); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to get providers, error %d\n", QNN_GET_ERROR_CODE(error)); + QNN_LOG_WARN("failed to get providers, error %d\n", + QNN_GET_ERROR_CODE(error)); return 3; } if (num_providers != _required_num_providers) { - QNN_LOG_WARN("providers is %d instead of required %d\n", num_providers, _required_num_providers); + QNN_LOG_WARN("providers is %d instead of required %d\n", num_providers, + _required_num_providers); return 4; } @@ -1215,11 +1251,12 @@ int qnn_instance::load_system() { bool found_valid_system_interface = false; for (size_t idx = 0; idx < num_providers; idx++) { if (QNN_SYSTEM_API_VERSION_MAJOR == - provider_list[idx]->systemApiVersion.major && + provider_list[idx]->systemApiVersion.major && QNN_SYSTEM_API_VERSION_MINOR <= - provider_list[idx]->systemApiVersion.minor) { + provider_list[idx]->systemApiVersion.minor) { found_valid_system_interface = true; - qnn_system_interface = provider_list[idx]->QNN_SYSTEM_INTERFACE_VER_NAME; + qnn_system_interface = + provider_list[idx]->QNN_SYSTEM_INTERFACE_VER_NAME; break; } } @@ -1243,7 +1280,6 @@ int qnn_instance::load_system() { return 0; } - int qnn_instance::unload_system() { int result = 0; @@ -1262,7 +1298,8 @@ int qnn_instance::unload_system() { int dlclose_error = dlclose(_system_lib_handle); if (dlclose_error != 0) { - QNN_LOG_WARN("failed to close QnnSystem library, error %s\n", dlerror()); + QNN_LOG_WARN("failed to close QnnSystem library, error %s\n", + dlerror()); return 2; } @@ -1271,36 +1308,33 @@ int qnn_instance::unload_system() { return result; } +static void ggml_qnn_logcallback(const char * fmt, QnnLog_Level_t level, + uint64_t timestamp, va_list argp) { -static void ggml_qnn_logcallback(const char * fmt, - QnnLog_Level_t level, - uint64_t timestamp, - va_list argp) { - -#if GGML_QNN_DEBUG - static std::mutex log_mutex; +#if ENABLE_QNN_LOG + static std::mutex log_mutex; static unsigned char s_ggml_qnn_logbuf[GGML_QNN_LOGBUF_LEN]; const char * log_level_desc = ""; switch (level) { - case QNN_LOG_LEVEL_ERROR: - log_level_desc = " ERROR "; - break; - case QNN_LOG_LEVEL_WARN: - log_level_desc = "WARNING"; - break; - case QNN_LOG_LEVEL_INFO: - log_level_desc = " INFO "; - break; - case QNN_LOG_LEVEL_DEBUG: - log_level_desc = " DEBUG "; - break; - case QNN_LOG_LEVEL_VERBOSE: - log_level_desc = "VERBOSE"; - break; - case QNN_LOG_LEVEL_MAX: - log_level_desc = "UNKNOWN"; - break; + case QNN_LOG_LEVEL_ERROR: + log_level_desc = "ERROR"; + break; + case QNN_LOG_LEVEL_WARN: + log_level_desc = "WARNING"; + break; + case QNN_LOG_LEVEL_INFO: + log_level_desc = "INFO"; + break; + case QNN_LOG_LEVEL_DEBUG: + log_level_desc = "DEBUG"; + break; + case QNN_LOG_LEVEL_VERBOSE: + log_level_desc = "VERBOSE"; + break; + case QNN_LOG_LEVEL_MAX: + log_level_desc = "UNKNOWN"; + break; } double ms = (double) timestamp / 1000000.0; @@ -1314,12 +1348,11 @@ static void ggml_qnn_logcallback(const char * fmt, #endif } - int 
qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { BackendIdType backend_id = QNN_BACKEND_ID_NULL; QNN_LOG_DEBUG("enter qni_init\n"); - const std::lock_guard lock(_init_mutex); + std::lock_guard lock(_init_mutex); if (0 != load_system()) { QNN_LOG_WARN("can not load QNN system lib, pls check why?\n"); @@ -1328,39 +1361,43 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { QNN_LOG_DEBUG("load QNN system lib successfully\n"); } - std::string bakend_lib_path = _lib_path + _backend_name; - if (0 == _lib_path_to_backend_id.count(bakend_lib_path)) { - int is_load_ok = load_backend(bakend_lib_path, saver_config); + std::string backend_lib_path = _lib_path + _backend_name; + if (0 == _lib_path_to_backend_id.count(backend_lib_path)) { + int is_load_ok = load_backend(backend_lib_path, saver_config); if (0 != is_load_ok) { QNN_LOG_WARN("failed to load QNN backend\n"); return 2; } } - backend_id = _lib_path_to_backend_id[bakend_lib_path]; + backend_id = _lib_path_to_backend_id[backend_lib_path]; if (0 == _loaded_backend.count(backend_id) || 0 == _loaded_lib_handle.count(backend_id)) { - QNN_LOG_WARN("library %s is loaded but loaded backend count=%zu, loaded lib_handle count=%zu\n", - bakend_lib_path.c_str(), - _loaded_backend.count(backend_id), - _loaded_lib_handle.count(backend_id)); + QNN_LOG_WARN("library %s is loaded but loaded backend count=%zu, " + "loaded lib_handle count=%zu\n", + backend_lib_path.c_str(), _loaded_backend.count(backend_id), + _loaded_lib_handle.count(backend_id)); return 3; } _qnn_interface.set_qnn_interface(_loaded_backend[backend_id]); - _qnn_interface.qnn_log_create(ggml_qnn_logcallback, _qnn_log_level, &_qnn_log_handle); + _qnn_interface.qnn_log_create(ggml_qnn_logcallback, _qnn_log_level, + &_qnn_log_handle); if (nullptr == _qnn_log_handle) { - QNN_LOG_WARN("why failed to initialize qnn log\n"); //NPU backend not work on Qualcomm SoC based low-end phone + QNN_LOG_WARN( + "why failed to initialize qnn log\n"); // NPU backend not work on + // Qualcomm SoC equipped low-end phone return 4; } else { QNN_LOG_DEBUG("initialize qnn log successfully\n"); } std::vector temp_backend_config; - _qnn_interface.qnn_backend_create(_qnn_log_handle, temp_backend_config.empty() ? nullptr - : temp_backend_config.data(), - &_qnn_backend_handle); + _qnn_interface.qnn_backend_create( + _qnn_log_handle, + temp_backend_config.empty() ? 
nullptr : temp_backend_config.data(), + &_qnn_backend_handle); if (nullptr == _qnn_backend_handle) { QNN_LOG_WARN("why failed to initialize qnn backend\n"); return 5; @@ -1369,7 +1406,8 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { } if (nullptr != _qnn_raw_interface.propertyHasCapability) { - auto qnnStatus = _qnn_raw_interface.propertyHasCapability(QNN_PROPERTY_GROUP_DEVICE); + auto qnnStatus = + _qnn_raw_interface.propertyHasCapability(QNN_PROPERTY_GROUP_DEVICE); if (QNN_PROPERTY_NOT_SUPPORTED == qnnStatus) { QNN_LOG_WARN("device property is not supported\n"); } @@ -1378,8 +1416,10 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { } } - auto qnnStatus = _qnn_raw_interface.deviceCreate(_qnn_log_handle, nullptr, &_qnn_device_handle); - if (QNN_SUCCESS != qnnStatus && QNN_DEVICE_ERROR_UNSUPPORTED_FEATURE != qnnStatus) { + Qnn_ErrorHandle_t qnn_status = _qnn_raw_interface.deviceCreate(_qnn_log_handle, nullptr, + &_qnn_device_handle); + if (QNN_SUCCESS != qnn_status && + QNN_DEVICE_ERROR_UNSUPPORTED_FEATURE != qnn_status) { QNN_LOG_WARN("failed to create QNN device\n"); } else { QNN_LOG_INFO("create device successfully\n"); @@ -1389,8 +1429,10 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { QNN_LOG_INFO("profiling turned on; level = %d", _profile_level); if (ggml_qnn_profile_level::profile_basic == _profile_level) { QNN_LOG_INFO("basic profiling requested. creating Qnn Profile object\n"); - if (QNN_PROFILE_NO_ERROR != _qnn_raw_interface.profileCreate( - _qnn_backend_handle, QNN_PROFILE_LEVEL_BASIC, &_qnn_profile_handle)) { + if (QNN_PROFILE_NO_ERROR != + _qnn_raw_interface.profileCreate(_qnn_backend_handle, + QNN_PROFILE_LEVEL_BASIC, + &_qnn_profile_handle)) { QNN_LOG_WARN("unable to create profile handle in the backend\n"); return 6; } else { @@ -1398,8 +1440,10 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { } } else if (ggml_qnn_profile_level::profile_detail == _profile_level) { QNN_LOG_INFO("detailed profiling requested. 
Creating Qnn Profile object\n"); - if (QNN_PROFILE_NO_ERROR != _qnn_raw_interface.profileCreate( - _qnn_backend_handle, QNN_PROFILE_LEVEL_DETAILED, &_qnn_profile_handle)) { + if (QNN_PROFILE_NO_ERROR != + _qnn_raw_interface.profileCreate(_qnn_backend_handle, + QNN_PROFILE_LEVEL_DETAILED, + &_qnn_profile_handle)) { QNN_LOG_WARN("unable to create profile handle in the backend\n"); return 7; } else { @@ -1416,26 +1460,32 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { QNN_LOG_DEBUG("load rpcmem lib successfully\n"); set_rpcmem_initialized(true); } - _pfn_rpc_mem_init = reinterpret_cast(dlsym(_rpc_lib_handle, "rpcmem_init")); - _pfn_rpc_mem_deinit = reinterpret_cast(dlsym(_rpc_lib_handle, "rpcmem_deinit")); - _pfn_rpc_mem_alloc = reinterpret_cast(dlsym(_rpc_lib_handle,"rpcmem_alloc")); - _pfn_rpc_mem_free = reinterpret_cast(dlsym(_rpc_lib_handle, "rpcmem_free")); - _pfn_rpc_mem_to_fd = reinterpret_cast(dlsym(_rpc_lib_handle,"rpcmem_to_fd")); - if (nullptr == _pfn_rpc_mem_alloc || nullptr == _pfn_rpc_mem_free - || nullptr == _pfn_rpc_mem_to_fd) { + _pfn_rpc_mem_init = reinterpret_cast( + dlsym(_rpc_lib_handle, "rpcmem_init")); + _pfn_rpc_mem_deinit = reinterpret_cast( + dlsym(_rpc_lib_handle, "rpcmem_deinit")); + _pfn_rpc_mem_alloc = reinterpret_cast( + dlsym(_rpc_lib_handle, "rpcmem_alloc")); + _pfn_rpc_mem_free = reinterpret_cast( + dlsym(_rpc_lib_handle, "rpcmem_free")); + _pfn_rpc_mem_to_fd = reinterpret_cast( + dlsym(_rpc_lib_handle, "rpcmem_to_fd")); + if (nullptr == _pfn_rpc_mem_alloc || nullptr == _pfn_rpc_mem_free || + nullptr == _pfn_rpc_mem_to_fd) { QNN_LOG_WARN("unable to access symbols in QNN RPC lib. dlerror(): %s", dlerror()); dlclose(_rpc_lib_handle); return 9; } - if (nullptr != _pfn_rpc_mem_init) // make Qualcomm's SoC based low-end phone happy + if (nullptr != + _pfn_rpc_mem_init) // make Qualcomm's SoC equipped low-end phone happy _pfn_rpc_mem_init(); std::vector temp_context_config; - _qnn_interface.qnn_context_create(_qnn_backend_handle, _qnn_device_handle, - temp_context_config.empty() ? nullptr - : temp_context_config.data(), - &_qnn_context_handle); + _qnn_interface.qnn_context_create( + _qnn_backend_handle, _qnn_device_handle, + temp_context_config.empty() ? 
nullptr : temp_context_config.data(), + &_qnn_context_handle); if (nullptr == _qnn_context_handle) { QNN_LOG_WARN("why failed to initialize qnn context\n"); return 10; @@ -1448,12 +1498,12 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { return 0; } - int qnn_instance::qnn_finalize() { - int ret_status = 0; - Qnn_ErrorHandle_t error = QNN_SUCCESS; + int ret_status = 0; + Qnn_ErrorHandle_t error = QNN_SUCCESS; - if (nullptr != _pfn_rpc_mem_deinit) // make Qualcomm's SoC based low-end phone happy + if (nullptr != + _pfn_rpc_mem_deinit) // make Qualcomm's SoC equipped low-end phone happy _pfn_rpc_mem_deinit(); if (dlclose(_rpc_lib_handle) != 0) { @@ -1463,11 +1513,12 @@ int qnn_instance::qnn_finalize() { } if (nullptr != _qnn_context_handle) { - error = _qnn_interface.qnn_context_free(_qnn_context_handle, _qnn_profile_handle); + error = _qnn_interface.qnn_context_free(_qnn_context_handle, + _qnn_profile_handle); if (error != QNN_SUCCESS) { QNN_LOG_WARN("failed to free QNN context_handle: ID %u, error %d\n", - _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error)); - + _qnn_interface.get_backend_id(), + QNN_GET_ERROR_CODE(error)); } _qnn_context_handle = nullptr; } @@ -1476,8 +1527,8 @@ int qnn_instance::qnn_finalize() { error = _qnn_interface.qnn_profile_free(_qnn_profile_handle); if (error != QNN_SUCCESS) { QNN_LOG_WARN("failed to free QNN profile_handle: ID %u, error %d\n", - _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error)); - + _qnn_interface.get_backend_id(), + QNN_GET_ERROR_CODE(error)); } _qnn_profile_handle = nullptr; } @@ -1486,8 +1537,8 @@ int qnn_instance::qnn_finalize() { error = _qnn_interface.qnn_device_free(_qnn_device_handle); if (error != QNN_SUCCESS) { QNN_LOG_WARN("failed to free QNN device_handle: ID %u, error %d\n", - _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error)); - + _qnn_interface.get_backend_id(), + QNN_GET_ERROR_CODE(error)); } _qnn_device_handle = nullptr; } @@ -1496,17 +1547,18 @@ int qnn_instance::qnn_finalize() { error = _qnn_interface.qnn_backend_free(_qnn_backend_handle); if (error != QNN_SUCCESS) { QNN_LOG_WARN("failed to free QNN backend_handle: ID %u, error %d\n", - _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error)); + _qnn_interface.get_backend_id(), + QNN_GET_ERROR_CODE(error)); } _qnn_backend_handle = nullptr; - } if (nullptr != _qnn_log_handle) { error = _qnn_interface.qnn_log_free(_qnn_log_handle); if (error != QNN_SUCCESS) { QNN_LOG_WARN("failed to free QNN log_handle: ID %u, error %d\n", - _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error)); + _qnn_interface.get_backend_id(), + QNN_GET_ERROR_CODE(error)); } _qnn_log_handle = nullptr; } @@ -1518,9 +1570,9 @@ int qnn_instance::qnn_finalize() { return ret_status; } - -int qnn_instance::init_qnn_graph(const char * graph_name, bool debug, uint8_t do_node_validation, - const QnnGraph_Config_t ** graph_configs) { +int qnn_instance::init_qnn_graph(const char * graph_name, bool debug, + uint8_t do_node_validation, + const QnnGraph_Config_t ** graph_configs) { int result = 0; if (nullptr == graph_name) { @@ -1534,15 +1586,16 @@ int qnn_instance::init_qnn_graph(const char * graph_name, bool debug, uint8_t do } if (!do_node_validation) { - QNN_LOG_WARN("node validation disabled, backend will not perform op validation prior to adding node\n"); + QNN_LOG_WARN("node validation disabled, backend will not perform op " + "validation prior to adding node\n"); } - _graph_name = graph_name; - _debug_tensor = debug; + _graph_name = graph_name; + 
_debug_tensor = debug; _do_node_validations = do_node_validation; - result = _qnn_raw_interface.graphCreate(_qnn_context_handle, graph_name, graph_configs, - &_qnn_graph_handle); + result = _qnn_raw_interface.graphCreate(_qnn_context_handle, graph_name, + graph_configs, &_qnn_graph_handle); if (result != QNN_GRAPH_NO_ERROR || nullptr == _qnn_graph_handle) { QNN_LOG_WARN("failed to create graph in qnn context\n"); return 3; @@ -1553,13 +1606,12 @@ int qnn_instance::init_qnn_graph(const char * graph_name, bool debug, uint8_t do } return 0; } - int qnn_instance::finalize_qnn_graph() { if (nullptr != _qnn_graph_handle) { - if (_qnn_raw_interface.graphFinalize(_qnn_graph_handle, _qnn_profile_handle, nullptr) != - QNN_GRAPH_NO_ERROR) { + if (_qnn_raw_interface.graphFinalize(_qnn_graph_handle, + _qnn_profile_handle, + nullptr) != QNN_GRAPH_NO_ERROR) { QNN_LOG_WARN("finalizing graph failure\n"); - //return 1; } } else { QNN_LOG_DEBUG("qnn graph handle is null\n"); @@ -1568,26 +1620,28 @@ int qnn_instance::finalize_qnn_graph() { return 0; } - - // ================================================================================================= // // implementation of GGML's QNN backend // // ================================================================================================= -static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor, bool b_dump_tensor_info) { - if (nullptr == tensor) - return false; - if (b_dump_tensor_info) { - QNN_LOG_DEBUG("op name:%s, tensor type:%s", ggml_op_name(tensor->op), - ggml_type_name(tensor->type)); - } - //only support the following 3 OPs currently and ensure tensor->src[0] and tensor->src[1] is not nullptr - bool supported_op = ((tensor->op == GGML_OP_ADD) || (tensor->op == GGML_OP_MUL) || (tensor->op == GGML_OP_MUL_MAT)); +static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context *ctx, + const struct ggml_tensor *tensor, + bool b_dump_tensor_info) { + // only the following 3 OPs are supported currently + // a GENERAL approach that could fix this limitation is proposed in a standalone PR which refines the ggml backend + // subsystem so that mixed inference between CPU&GPU / CPU&NPU works easily for ANY ggml backend + // whose ggml_backend_xxx_buffer_is_host returns true. + // that approach can be found at: + // https://github.com/ggerganov/llama.cpp/pull/7641 + // + // ensure tensor->src[0] and tensor->src[1] are not nullptr.
+ bool supported_op = + ((tensor->op == GGML_OP_ADD) || (tensor->op == GGML_OP_MUL) || + (tensor->op == GGML_OP_MUL_MAT)); if (!supported_op) { return false; } - const struct ggml_tensor * src0 = tensor->src[0]; const struct ggml_tensor * src1 = tensor->src[1]; @@ -1597,87 +1651,114 @@ static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor, bool b_dum const int64_t ne10 = src1->ne[0]; const int64_t ne11 = src1->ne[1]; - const int64_t ne0 = tensor->ne[0]; - const int64_t ne1 = tensor->ne[1]; + const int64_t ne20 = tensor->ne[0]; + const int64_t ne21 = tensor->ne[1]; - GGML_UNUSED(ne0); - GGML_UNUSED(ne1); + //TODO: support other quatinized data type + if (ggml_is_quantized(src0->type) && (src0->type != GGML_TYPE_Q8_0)) { + return false; + } if (b_dump_tensor_info) { - QNN_LOG_DEBUG("src0 type:%s", ggml_type_name(tensor->src[0]->type)); - QNN_LOG_DEBUG("src1 type:%s", ggml_type_name(tensor->src[1]->type)); - if (tensor->op == GGML_OP_MUL_MAT) { - QNN_LOG_DEBUG("GGML_OP_MUL_MAT"); - QNN_LOG_DEBUG( - "src0 %15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src0->name, - src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], - src0->nb[0], src0->nb[1], src0->nb[2]); - QNN_LOG_DEBUG( - "src1 %15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src1->name, - src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], - src1->nb[0], src1->nb[1], src1->nb[2]); - QNN_LOG_DEBUG( - " %15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - tensor->name, - tensor->type, ggml_type_name(tensor->type), tensor->ne[0], tensor->ne[1], tensor->ne[2], - tensor->nb[0], - tensor->nb[1], tensor->nb[2]); - - } - } - - if (ggml_is_empty(tensor) || tensor->op == GGML_OP_RESHAPE || tensor->op == GGML_OP_TRANSPOSE || tensor->op == GGML_OP_VIEW || tensor->op == GGML_OP_PERMUTE || tensor->op == GGML_OP_NONE) { + QNN_LOG_DEBUG("GGML_OP_MUL_MAT"); + QNN_LOG_DEBUG("op name:%s, tensor type:%s", + ggml_op_name(tensor->op), + ggml_type_name(tensor->type)); + QNN_LOG_DEBUG("src0 type:%s", ggml_type_name(tensor->src[0]->type)); + QNN_LOG_DEBUG("src1 type:%s", ggml_type_name(tensor->src[1]->type)); + QNN_LOG_DEBUG("src0 %15s: type = %i (%5s) ne = %5" PRIi64 + " x %5" PRIi64 " x %5" PRIi64 + ", nb = (%5zi, %5zi, %5zi)\n", + src0->name, src0->type, ggml_type_name(src0->type), + src0->ne[0], src0->ne[1], src0->ne[2], src0->nb[0], + src0->nb[1], src0->nb[2]); + QNN_LOG_DEBUG("src1 %15s: type = %i (%5s) ne = %5" PRIi64 + " x %5" PRIi64 " x %5" PRIi64 + ", nb = (%5zi, %5zi, %5zi)\n", + src1->name, src1->type, ggml_type_name(src1->type), + src1->ne[0], src1->ne[1], src1->ne[2], src1->nb[0], + src1->nb[1], src1->nb[2]); + QNN_LOG_DEBUG( + " %15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + tensor->name, tensor->type, ggml_type_name(tensor->type), + tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->nb[0], + tensor->nb[1], tensor->nb[2]); + } + } + + if (ggml_is_empty(tensor) || tensor->op == GGML_OP_RESHAPE || + tensor->op == GGML_OP_TRANSPOSE || tensor->op == GGML_OP_VIEW || + tensor->op == GGML_OP_PERMUTE || tensor->op == GGML_OP_NONE) { return false; } - //make ggml_get_tensor_rank and QNN SDK happy + // make ggml_get_tensor_rank and QNN SDK happy if ((ne00 <= 1 || ne01 <= 1 || ne10 <= 1 || ne11 <= 1)) { return false; } - // GPU/NPU inference will slower then CPU inference when 
tensor->ne[1] < min batch size - if (tensor->ne[1] < 32) { + if ((ne20 < 32) || (ne21 < 32) || (ne10 < 32)) { return false; } int qtype = src0->type; - return (qtype == GGML_TYPE_F32 || qtype == GGML_TYPE_F16 || qtype == GGML_TYPE_Q8_0) - && (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); - -} + if (tensor->op == GGML_OP_ADD) { + return (qtype == GGML_TYPE_F32 || qtype == GGML_TYPE_F16 || + qtype == GGML_TYPE_Q8_0) && + (src1->type == GGML_TYPE_F32); + } + if (tensor->op == GGML_OP_MUL) { + return (qtype == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32); + } -static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - bool graph_initialized = false; - int64_t n_begin_time = 0LL; - int64_t n_end_time = 0LL; - int64_t n_duration = 0LL; + if (tensor->op == GGML_OP_MUL_MAT) { + if (ctx->device == QNN_BACKEND_GGML) { + return (ne00 == ne10) && (src1->ne[2] % src0->ne[2] == 0) && + (src1->ne[3] % src0->ne[3] == 0); + } + if ((ctx->device == QNN_BACKEND_NPU) && (qtype == GGML_TYPE_Q8_0) && + (src1->type == GGML_TYPE_F32) && (tensor->type == GGML_TYPE_F32)) { + return true; + } + if (ctx->device == QNN_BACKEND_CPU || ctx->device == QNN_BACKEND_GPU) { + return (ne00 == ne10) && (ne00 == ne01); + } + return false; + } +} - qnn_instance * instance = nullptr; +static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, + const ggml_tensor * src1, ggml_tensor * dst) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + bool graph_initialized = false; + int64_t n_begin_time = 0LL; + int64_t n_end_time = 0LL; + int64_t n_duration = 0LL; - std::string graph_name = "ggml_op_qnn_add"; - Qnn_GraphHandle_t graph_handle = nullptr; - Qnn_Tensor_t * tensor_0 = nullptr; - Qnn_Tensor_t * tensor_1 = nullptr; - Qnn_Tensor_t * tensor_2 = nullptr; - Qnn_Param_t qnn_params[] = {}; - enum ggml_op ggmlop = GGML_OP_ADD; - Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32; - Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; - Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32; + qnn_instance * instance = nullptr; + std::string graph_name = "ggml_op_qnn_add"; + Qnn_GraphHandle_t graph_handle = nullptr; + Qnn_Tensor_t * tensor_0 = nullptr; + Qnn_Tensor_t * tensor_1 = nullptr; + Qnn_Tensor_t * tensor_2 = nullptr; + Qnn_Param_t qnn_params[] = {}; + enum ggml_op ggmlop = GGML_OP_ADD; + Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32; if ((nullptr == src0) || (nullptr == src1) || (nullptr == dst)) { QNN_LOG_WARN("pls check why GGML tensor is null"); return; } - tensor_0 = (Qnn_Tensor_t *)src0->extra; - tensor_1 = (Qnn_Tensor_t *)src1->extra; - tensor_2 = (Qnn_Tensor_t *)dst->extra; - if ((nullptr == tensor_0) || (nullptr == tensor_1) || (nullptr == tensor_2)) { + tensor_0 = (Qnn_Tensor_t *) src0->extra; + tensor_1 = (Qnn_Tensor_t *) src1->extra; + tensor_2 = (Qnn_Tensor_t *) dst->extra; + if ((nullptr == tensor_0) || (nullptr == tensor_1) || + (nullptr == tensor_2)) { QNN_LOG_WARN("pls check why QNN tensor is null"); return; } @@ -1685,53 +1766,63 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src QNN_LOG_WARN("pls check why backend ctx is null"); return; } - instance = ctx->instance; + instance = ctx->instance; if (nullptr == instance) { QNN_LOG_WARN("pls check why qnn instance is null"); return; } - QNN_INTERFACE_VER_TYPE 
qnn_raw_interface = ctx->raw_interface; - - n_begin_time = ggml_time_us(); - - QNN_LOG_DEBUG("call %s\n", __func__); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src0->name, - src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], - src0->nb[0], src0->nb[1], src0->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src1->name, - src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], - src1->nb[0], src1->nb[1], src1->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - dst->name, - dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], - dst->nb[1], dst->nb[2]); - QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); - QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(*tensor_0)); - QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(*tensor_1)); - QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(*tensor_2)); + QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; + + n_begin_time = ggml_time_us(); + + if (0) { + QNN_LOG_DEBUG("call %s\n", __func__); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src0->name, src0->type, ggml_type_name(src0->type), + src0->ne[0], src0->ne[1], src0->ne[2], src0->nb[0], + src0->nb[1], src0->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src1->name, src1->type, ggml_type_name(src1->type), + src1->ne[0], src1->ne[1], src1->ne[2], src1->nb[0], + src1->nb[1], src1->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + dst->name, dst->type, ggml_type_name(dst->type), + dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], + dst->nb[1], dst->nb[2]); + QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], + src0->ne[3]); + QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(*tensor_0)); + QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(*tensor_1)); + QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(*tensor_2)); + } QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; - src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type); - src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); - dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type); - - uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], - (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; - uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], - (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; - uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], - (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; - - std::string map_entry = std::string(ggml_op_name(ggmlop)); - if (instance->_qnn_graph_map.find(map_entry) != instance->_qnn_graph_map.end()) { + src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type); + src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); + dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type); + + uint32_t dimensions_input_0[] = { + (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], 
(uint32_t) src0->ne[2], + (uint32_t) src0->ne[3]}; + uint32_t dimensions_input_1[] = { + (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], (uint32_t) src1->ne[2], + (uint32_t) src1->ne[3]}; + uint32_t dimensions_output[] = { + (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], + (uint32_t) dst->ne[3]}; + + std::string map_entry = std::string(ggml_op_name(ggmlop)); + if (instance->_qnn_graph_map.find(map_entry) != + instance->_qnn_graph_map.end()) { graph_initialized = true; auto & graph_item = instance->_qnn_graph_map[map_entry]; - graph_handle = std::get<0>(graph_item); + graph_handle = std::get<0>(graph_item); } uint32_t * tensor_0_dimensions = QNN_VER_PTR(*tensor_0)->dimensions; @@ -1739,15 +1830,16 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src uint32_t * tensor_2_dimensions = QNN_VER_PTR(*tensor_2)->dimensions; if (!graph_initialized) { - graph_name = graph_name + "_" + std::to_string(ctx->threads) + src0->name + "_" + src1->name; + graph_name = graph_name + "_" + std::to_string(ctx->threads) + + src0->name + "_" + src1->name; QNN_LOG_DEBUG("graph name %s", graph_name.c_str()); - //QnnGraph_Config_t graph_config; - //graph_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - //graph_config.customConfig = strdup(graph_name.c_str()); - //const QnnGraph_Config_t * p_graph_config = &graph_config; - error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), graph_name.c_str(), nullptr, &graph_handle); + error = qnn_raw_interface.graphCreate( + instance->get_qnn_context_handle(), graph_name.c_str(), nullptr, + &graph_handle); if (QNN_SUCCESS != error) { - QNN_LOG_INFO("can't create qnn graph handle with graph name %s, error = %d\n", graph_name.c_str(), error); + QNN_LOG_INFO("can't create qnn graph handle with graph name %s, " + "error = %d\n", + graph_name.c_str(), error); return; } error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_0); @@ -1763,40 +1855,31 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src QNN_LOG_INFO("error = %d\n", error); } - QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; - QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; - QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, + ggml_get_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, + ggml_get_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, + ggml_get_tensor_data_size(dst)}; QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; - QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); - QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; + QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); + QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; - QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); - QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; + QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); + QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; - QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); - QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; - - Qnn_Tensor_t tensor_inputs[] = { - *tensor_0, - *tensor_1 - }; - Qnn_Tensor_t tensor_outputs[] = { - *tensor_2 - }; - Qnn_OpConfig_t op_config = { - (Qnn_OpConfigVersion_t) 1, .v1 = { - "ggml_op_add", - 
QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_ELEMENT_WISE_ADD, - 0, - qnn_params, - 2, - tensor_inputs, - 1, - tensor_outputs - } - }; + QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); + QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; + + Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1}; + Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; + Qnn_OpConfig_t op_config = { + (Qnn_OpConfigVersion_t) 1, + .v1 = {"ggml_op_add", QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_ELEMENT_WISE_ADD, 0, qnn_params, + 2, tensor_inputs, 1, + tensor_outputs}}; error = qnn_raw_interface.graphAddNode(graph_handle, op_config); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); @@ -1805,49 +1888,57 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); } - error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); + error = + qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, + tensor_outputs, 1, + nullptr, nullptr); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); } - auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2); + auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2); instance->_qnn_graph_map[map_entry] = graph_item; } else { auto & graph_item = instance->_qnn_graph_map[map_entry]; - graph_handle = std::get<0>(graph_item); - tensor_0 = std::get<1>(graph_item); - tensor_1 = std::get<2>(graph_item); - tensor_2 = std::get<3>(graph_item); - - //QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); - uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], - (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; - uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], - (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; - uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], - (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; - - QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; - QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; - QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + graph_handle = std::get<0>(graph_item); + tensor_0 = std::get<1>(graph_item); + tensor_1 = std::get<2>(graph_item); + tensor_2 = std::get<3>(graph_item); + + // QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], + // src0->ne[2], src0->ne[3]); + uint32_t dimensions_input_0[] = { + (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], + (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; + uint32_t dimensions_input_1[] = { + (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], + (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; + uint32_t dimensions_output[] = { + (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], + (uint32_t) dst->ne[3]}; + + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, + ggml_get_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, + ggml_get_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, + ggml_get_tensor_data_size(dst)}; QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; - QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); - QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; + QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); + QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; 
QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; - QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); - QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; + QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); + QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; - QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); - QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; - - Qnn_Tensor_t tensor_inputs[] = { - *tensor_0, - *tensor_1 - }; - Qnn_Tensor_t tensor_outputs[] = { - *tensor_2 - }; - error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); + QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); + QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; + + Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1}; + Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; + error = + qnn_raw_interface.graphExecute(graph_handle, tensor_inputs,2, + tensor_outputs,1, + nullptr, nullptr); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); } @@ -1855,52 +1946,54 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src QNN_VER_PTR(*tensor_0)->dimensions = tensor_0_dimensions; QNN_VER_PTR(*tensor_1)->dimensions = tensor_1_dimensions; QNN_VER_PTR(*tensor_2)->dimensions = tensor_2_dimensions; - n_end_time = ggml_time_us(); - n_duration = (n_end_time - n_begin_time) / 1000; + n_end_time = ggml_time_us(); + n_duration = (n_end_time - n_begin_time) / 1000; QNN_LOG_DEBUG("duration of ggml_qnn_add : %lld milliseconds\n", n_duration); } - - /* * ggml_qnn_mul_mat was re-added as a standalone function because * the following comments came from https://github.com/ggerganov/llama.cpp/pull/1632 - * MUL_MAT take most of the compute time (about 95%). So to speed up llama, we have to focus on MUL_MAT. + * MUL_MAT take most of the compute time (about 95%). + * So to speed up llama, we have to focus on MUL_MAT. + * * We have three kinds of MUL_MAT to compute: - * mul_mat_f32: both src0 and src1 are F32. + * mul_mat_f32: both src0 and src1 are F32. * mul_mat_f16_f32: src0 is F16 and src1 is F32. - * mul_mat_q_f32: src0 is quantized (Q4_0, Q4_1, ...), and src1 is F32. -*/ -static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - bool graph_initialized = false; - int64_t n_begin_time = 0LL; - int64_t n_end_time = 0LL; - int64_t n_duration = 0LL; - - qnn_instance * instance = nullptr; - - std::string graph_name = "ggml_op_qnn_mul_mat"; - Qnn_GraphHandle_t graph_handle = nullptr; - Qnn_Tensor_t * tensor_0 = nullptr; - Qnn_Tensor_t * tensor_1 = nullptr; - Qnn_Tensor_t * tensor_2 = nullptr; - - Qnn_Param_t qnn_params[] = {}; - - enum ggml_op ggmlop = GGML_OP_ADD; - Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32; - Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; - Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32; + * mul_mat_q_f32: src0 is quantized (Q4_0, Q4_1, ...), and src1 is F32. 
+ */ +static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, const ggml_tensor * src1, + ggml_tensor * dst) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + bool graph_initialized = false; + int64_t n_begin_time = 0LL; + int64_t n_end_time = 0LL; + int64_t n_duration = 0LL; + + qnn_instance * instance = nullptr; + std::string graph_name = "ggml_op_qnn_mul_mat"; + Qnn_GraphHandle_t graph_handle = nullptr; + Qnn_Tensor_t * tensor_0 = nullptr; + Qnn_Tensor_t * tensor_1 = nullptr; + Qnn_Tensor_t * tensor_2 = nullptr; + + Qnn_Param_t qnn_params[] = {}; + + enum ggml_op ggmlop = GGML_OP_MUL_MAT; + Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32; if ((nullptr == src0) || (nullptr == src1) || (nullptr == dst)) { QNN_LOG_WARN("pls check why GGML tensor is null"); return; } - tensor_0 = (Qnn_Tensor_t *)src0->extra; - tensor_1 = (Qnn_Tensor_t *)src1->extra; - tensor_2 = (Qnn_Tensor_t *)dst->extra; - if ((nullptr == tensor_0) || (nullptr == tensor_1) || (nullptr == tensor_2)) { + tensor_0 = (Qnn_Tensor_t *) src0->extra; + tensor_1 = (Qnn_Tensor_t *) src1->extra; + tensor_2 = (Qnn_Tensor_t *) dst->extra; + if ((nullptr == tensor_0) || (nullptr == tensor_1) || + (nullptr == tensor_2)) { QNN_LOG_WARN("pls check why QNN tensor is null"); return; } @@ -1908,28 +2001,31 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, const ggml_tensor * QNN_LOG_WARN("pls check why backend ctx is null"); return; } - instance = ctx->instance; + instance = ctx->instance; if (nullptr == instance) { QNN_LOG_WARN("pls check why qnn instance is null"); return; } - QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; + QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; - n_begin_time = ggml_time_us(); + n_begin_time = ggml_time_us(); QNN_LOG_DEBUG("call %s\n", __func__); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src0->name, - src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], - src0->nb[0], src0->nb[1], src0->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src1->name, - src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], - src1->nb[0], src1->nb[1], src1->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - dst->name, - dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], - dst->nb[1], dst->nb[2]); - QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src0->name, src0->type, ggml_type_name(src0->type), + src0->ne[0], src0->ne[1], src0->ne[2], src0->nb[0], + src0->nb[1], src0->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src1->name, src1->type, ggml_type_name(src1->type), + src1->ne[0], src1->ne[1], src1->ne[2], src1->nb[0], + src1->nb[1], src1->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + dst->name, dst->type, ggml_type_name(dst->type), dst->ne[0], + dst->ne[1], dst->ne[2], dst->nb[0], dst->nb[1], dst->nb[2]); + QNN_LOG_DEBUG("%d, 
%d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], + src0->ne[3]); QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(*tensor_0)); QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(*tensor_1)); QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(*tensor_2)); @@ -1938,22 +2034,26 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, const ggml_tensor * QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; - src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type); - src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); - dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type); - - uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], - (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; - uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], - (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; - uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], - (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; - - std::string map_entry = std::string(ggml_op_name(ggmlop)); - if (instance->_qnn_graph_map.find(map_entry) != instance->_qnn_graph_map.end()) { + src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type); + src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); + dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type); + + uint32_t dimensions_input_0[] = { + (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], (uint32_t) src0->ne[2], + (uint32_t) src0->ne[3]}; + uint32_t dimensions_input_1[] = { + (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], (uint32_t) src1->ne[2], + (uint32_t) src1->ne[3]}; + uint32_t dimensions_output[] = { + (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], + (uint32_t) dst->ne[3]}; + + std::string map_entry = std::string(ggml_op_name(ggmlop)); + if (instance->_qnn_graph_map.find(map_entry) != + instance->_qnn_graph_map.end()) { graph_initialized = true; auto & graph_item = instance->_qnn_graph_map[map_entry]; - graph_handle = std::get<0>(graph_item); + graph_handle = std::get<0>(graph_item); } uint32_t * tensor_0_dimensions = QNN_VER_PTR(*tensor_0)->dimensions; @@ -1961,11 +2061,16 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, const ggml_tensor * uint32_t * tensor_2_dimensions = QNN_VER_PTR(*tensor_2)->dimensions; if (!graph_initialized) { - graph_name = graph_name + "_" + std::to_string(ctx->threads) + src0->name + "_" + src1->name; + graph_name = graph_name + "_" + std::to_string(ctx->threads) + + src0->name + "_" + src1->name; QNN_LOG_DEBUG("graph name %s", graph_name.c_str()); - error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), graph_name.c_str(), nullptr, &graph_handle); + error = qnn_raw_interface.graphCreate( + instance->get_qnn_context_handle(), graph_name.c_str(), nullptr, + &graph_handle); if (QNN_SUCCESS != error) { - QNN_LOG_INFO("can't create qnn graph handle with graph name %s, error = %d\n", graph_name.c_str(), error); + QNN_LOG_INFO("can't create qnn graph handle with graph name %s, " + "error = %d\n", + graph_name.c_str(), error); return; } error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_0); @@ -1981,40 +2086,30 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, const ggml_tensor * QNN_LOG_INFO("error = %d\n", error); } - QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; - QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; - 
QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, + ggml_get_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, + ggml_get_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, + ggml_get_tensor_data_size(dst)}; QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; - QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); - QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; + QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); + QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; - QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); - QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; + QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); + QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; - QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); - QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; - - Qnn_Tensor_t tensor_inputs[] = { - *tensor_0, - *tensor_1 - }; - Qnn_Tensor_t tensor_outputs[] = { - *tensor_2 - }; - Qnn_OpConfig_t op_config = { - (Qnn_OpConfigVersion_t) 1, .v1 = { - "ggml_op_mul_mat", - QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_MAT_MUL, - 0, - qnn_params, - 2, - tensor_inputs, - 1, - tensor_outputs - } - }; + QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); + QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; + + Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1}; + Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; + Qnn_OpConfig_t op_config = {(Qnn_OpConfigVersion_t) 1, + .v1 = {"ggml_op_mul_mat", + QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_MAT_MUL, 0, qnn_params, 2, + tensor_inputs, 1, tensor_outputs}}; error = qnn_raw_interface.graphAddNode(graph_handle, op_config); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); @@ -2023,48 +2118,56 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, const ggml_tensor * if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); } - error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); + error = qnn_raw_interface.graphExecute(graph_handle, + tensor_inputs, 2, + tensor_outputs, 1, + nullptr, nullptr); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); } - auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2); + auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2); instance->_qnn_graph_map[map_entry] = graph_item; } else { - auto & graph_item = instance->_qnn_graph_map[map_entry]; - graph_handle = std::get<0>(graph_item); - tensor_0 = std::get<1>(graph_item); - tensor_1 = std::get<2>(graph_item); - tensor_2 = std::get<3>(graph_item); + auto & graph_item= instance->_qnn_graph_map[map_entry]; + graph_handle = std::get<0>(graph_item); + tensor_0 = std::get<1>(graph_item); + tensor_1 = std::get<2>(graph_item); + tensor_2 = std::get<3>(graph_item); QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); - uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], - (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; - uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], - (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; - uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], - (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; + uint32_t 
dimensions_input_0[] = { + (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], + (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; + uint32_t dimensions_input_1[] = { + (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], + (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; + uint32_t dimensions_output[] = { + (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], + (uint32_t) dst->ne[3]}; QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; - QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); - QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; + QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); + QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; - QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); - QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; + QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); + QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; - QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); - QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; - - QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; - QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; - QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; - - Qnn_Tensor_t tensor_inputs[] = { - *tensor_0, - *tensor_1 - }; - Qnn_Tensor_t tensor_outputs[] = { - *tensor_2 - }; - error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); + QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); + QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; + + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, + ggml_get_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, + ggml_get_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, + ggml_get_tensor_data_size(dst)}; + + Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1}; + Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; + error = + qnn_raw_interface.graphExecute(graph_handle, + tensor_inputs, 2, + tensor_outputs, 1, + nullptr, nullptr); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); } @@ -2072,45 +2175,48 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, const ggml_tensor * QNN_VER_PTR(*tensor_0)->dimensions = tensor_0_dimensions; QNN_VER_PTR(*tensor_1)->dimensions = tensor_1_dimensions; QNN_VER_PTR(*tensor_2)->dimensions = tensor_2_dimensions; - n_end_time = ggml_time_us(); - n_duration = (n_end_time - n_begin_time) / 1000; - QNN_LOG_DEBUG("duration of ggml_qnn_mul_mat : %lld milliseconds\n", n_duration); + n_end_time = ggml_time_us(); + n_duration = (n_end_time - n_begin_time) / 1000; + QNN_LOG_DEBUG("duration of ggml_qnn_mul_mat : %lld milliseconds\n", + n_duration); QNN_LOG_DEBUG("call %s done\n", __func__); } - -//common function for GGML OPs using QNN API -static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx, const enum ggml_op ggmlop, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - bool graph_initialized = false; - int64_t n_begin_time = 0LL; - int64_t n_end_time = 0LL; - int64_t n_duration = 0LL; - - qnn_instance * instance = nullptr; - - std::string qnn_graph_name = "ggml_qnn_graph"; - std::string qnn_op_config_name = "ggml_qnn_op_config"; - const char * qnn_op_name = nullptr; - Qnn_GraphHandle_t graph_handle = nullptr; - Qnn_Tensor_t * 
tensor_0 = nullptr; - Qnn_Tensor_t * tensor_1 = nullptr; - Qnn_Tensor_t * tensor_2 = nullptr; - - Qnn_Param_t qnn_params[] = {}; - - Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32; - Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; - Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32; +// common function for GGML OPs using QNN API +static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx, + const enum ggml_op ggmlop, + const ggml_tensor * src0, const ggml_tensor * src1, + ggml_tensor * dst) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + bool graph_initialized = false; + int64_t n_begin_time = 0LL; + int64_t n_end_time = 0LL; + int64_t n_duration = 0LL; + + qnn_instance * instance = nullptr; + std::string qnn_graph_name = "ggml_qnn_graph"; + std::string qnn_op_config_name = "ggml_qnn_op_config"; + const char * qnn_op_name = nullptr; + Qnn_GraphHandle_t graph_handle = nullptr; + Qnn_Tensor_t * tensor_0 = nullptr; + Qnn_Tensor_t * tensor_1 = nullptr; + Qnn_Tensor_t * tensor_2 = nullptr; + + Qnn_Param_t qnn_params[] = {}; + + Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32; if ((nullptr == src0) || (nullptr == src1) || (nullptr == dst)) { QNN_LOG_WARN("pls check why GGML tensor is null"); return; } - tensor_0 = (Qnn_Tensor_t *)src0->extra; - tensor_1 = (Qnn_Tensor_t *)src1->extra; - tensor_2 = (Qnn_Tensor_t *)dst->extra; - if ((nullptr == tensor_0) || (nullptr == tensor_1) || (nullptr == tensor_2)) { + tensor_0 = (Qnn_Tensor_t *) src0->extra; + tensor_1 = (Qnn_Tensor_t *) src1->extra; + tensor_2 = (Qnn_Tensor_t *) dst->extra; + if ((nullptr == tensor_0) || (nullptr == tensor_1) || + (nullptr == tensor_2)) { QNN_LOG_WARN("pls check why QNN tensor is null"); return; } @@ -2118,58 +2224,66 @@ static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx, const enum ggml_o QNN_LOG_WARN("pls check why backend ctx is null"); return; } - instance = ctx->instance; + instance = ctx->instance; if (nullptr == instance) { QNN_LOG_WARN("pls check why qnn instance is null"); return; } - QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; + QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; - src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type); - src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); - dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type); - qnn_op_name = qnn_opname_from_ggmlop(ggmlop); + src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type); + src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); + dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type); + qnn_op_name = qnn_opname_from_ggmlop(ggmlop); if (nullptr == qnn_op_name) { - QNN_LOG_WARN("pls check why can not get QNN OP name with ggml op %d(%s)", ggmlop, ggml_op_name(ggmlop)); + QNN_LOG_WARN( + "pls check why can not get QNN OP name with ggml op %d(%s)", ggmlop, + ggml_op_name(ggmlop)); return; } - n_begin_time = ggml_time_us(); + n_begin_time = ggml_time_us(); QNN_LOG_DEBUG("call %s\n", __func__); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src0->name, - src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], - src0->nb[0], src0->nb[1], src0->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src1->name, - src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], 
src1->ne[2], - src1->nb[0], src1->nb[1], src1->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - dst->name, - dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], - dst->nb[1], dst->nb[2]); - QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src0->name, src0->type, ggml_type_name(src0->type), + src0->ne[0], src0->ne[1], src0->ne[2], src0->nb[0], + src0->nb[1], src0->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src1->name, src1->type, ggml_type_name(src1->type), + src1->ne[0], src1->ne[1], src1->ne[2], src1->nb[0], + src1->nb[1], src1->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + dst->name, dst->type, ggml_type_name(dst->type), dst->ne[0], + dst->ne[1], dst->ne[2], dst->nb[0], dst->nb[1], dst->nb[2]); + QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], + src0->ne[3]); QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(*tensor_0)); QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(*tensor_1)); QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(*tensor_2)); - QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; - QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; - QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; - uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], - (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; - uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], - (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; - uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], - (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; - - - std::string map_entry = std::string(ggml_op_name(ggmlop)); - if (instance->_qnn_graph_map.find(map_entry) != instance->_qnn_graph_map.end()) { + QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; + QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; + QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; + uint32_t dimensions_input_0[] = { + (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], (uint32_t) src0->ne[2], + (uint32_t) src0->ne[3]}; + uint32_t dimensions_input_1[] = { + (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], (uint32_t) src1->ne[2], + (uint32_t) src1->ne[3]}; + uint32_t dimensions_output[] = { + (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], + (uint32_t) dst->ne[3]}; + + std::string map_entry = std::string(ggml_op_name(ggmlop)); + if (instance->_qnn_graph_map.find(map_entry) != + instance->_qnn_graph_map.end()) { graph_initialized = true; auto & graph_item = instance->_qnn_graph_map[map_entry]; - graph_handle = std::get<0>(graph_item); + graph_handle = std::get<0>(graph_item); } uint32_t * tensor_0_dimensions = QNN_VER_PTR(*tensor_0)->dimensions; @@ -2177,13 +2291,21 @@ static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx, const enum ggml_o uint32_t * tensor_2_dimensions = QNN_VER_PTR(*tensor_2)->dimensions; if (!graph_initialized) { - qnn_graph_name = qnn_graph_name + "_" + ggml_op_name(ggmlop) + std::to_string(ctx->threads) + src0->name + "_" + src1->name; - qnn_op_config_name = qnn_op_config_name + "_" + ggml_op_name(ggmlop) + std::to_string(ctx->threads) + 
src0->name + "_" + src1->name; + qnn_graph_name = qnn_graph_name + "_" + ggml_op_name(ggmlop) + + std::to_string(ctx->threads) + src0->name + "_" + + src1->name; + qnn_op_config_name = qnn_op_config_name + "_" + ggml_op_name(ggmlop) + + std::to_string(ctx->threads) + src0->name + "_" + + src1->name; QNN_LOG_DEBUG("qnn graph name %s", qnn_graph_name.c_str()); QNN_LOG_DEBUG("qnn op_config name %s", qnn_op_config_name.c_str()); - error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), qnn_graph_name.c_str(), nullptr, &graph_handle); + error = qnn_raw_interface.graphCreate( + instance->get_qnn_context_handle(), qnn_graph_name.c_str(), nullptr, + &graph_handle); if (QNN_SUCCESS != error) { - QNN_LOG_INFO("can't create qnn graph handle with ggml op %s, graph name %s, error = %d\n", ggml_op_name(ggmlop), qnn_graph_name.c_str(), error); + QNN_LOG_INFO("can't create qnn graph handle with ggml op %s, graph " + "name %s, error = %d\n", + ggml_op_name(ggmlop), qnn_graph_name.c_str(), error); return; } @@ -2200,40 +2322,30 @@ static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx, const enum ggml_o QNN_LOG_INFO("error = %d\n", error); } - QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; - QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; - QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, + ggml_get_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, + ggml_get_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, + ggml_get_tensor_data_size(dst)}; QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; - QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); - QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; + QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); + QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; - QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); - QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; + QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); + QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; - QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); - QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; - - Qnn_Tensor_t tensor_inputs[] = { - *tensor_0, - *tensor_1 - }; - Qnn_Tensor_t tensor_outputs[] = { - *tensor_2 - }; - Qnn_OpConfig_t op_config = { - (Qnn_OpConfigVersion_t) 1, .v1 = { - qnn_op_config_name.c_str(), - QNN_OP_PACKAGE_NAME_QTI_AISW, - qnn_op_name, - 0, - qnn_params, - 2, - tensor_inputs, - 1, - tensor_outputs - } - }; + QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); + QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; + + Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1}; + Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; + Qnn_OpConfig_t op_config = {(Qnn_OpConfigVersion_t) 1, + .v1 = {qnn_op_config_name.c_str(), + QNN_OP_PACKAGE_NAME_QTI_AISW, + qnn_op_name, 0, qnn_params, 2, + tensor_inputs, 1, tensor_outputs}}; error = qnn_raw_interface.graphAddNode(graph_handle, op_config); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); @@ -2242,48 +2354,56 @@ static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx, const enum ggml_o if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); } - error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 
1, nullptr, nullptr); + error = qnn_raw_interface.graphExecute(graph_handle, + tensor_inputs, 2, + tensor_outputs, 1, + nullptr, nullptr); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); } - auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2); + auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2); instance->_qnn_graph_map[map_entry] = graph_item; } else { auto & graph_item = instance->_qnn_graph_map[map_entry]; - graph_handle = std::get<0>(graph_item); - tensor_0 = std::get<1>(graph_item); - tensor_1 = std::get<2>(graph_item); - tensor_2 = std::get<3>(graph_item); + graph_handle = std::get<0>(graph_item); + tensor_0 = std::get<1>(graph_item); + tensor_1 = std::get<2>(graph_item); + tensor_2 = std::get<3>(graph_item); QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); - uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], - (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; - uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], - (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; - uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], - (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; + uint32_t dimensions_input_0[] = { + (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], + (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; + uint32_t dimensions_input_1[] = { + (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], + (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; + uint32_t dimensions_output[] = { + (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], + (uint32_t) dst->ne[3]}; QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; - QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); - QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; + QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); + QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; - QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); - QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; + QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); + QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; - QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); - QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; - - QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; - QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; - QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; - - Qnn_Tensor_t tensor_inputs[] = { - *tensor_0, - *tensor_1 - }; - Qnn_Tensor_t tensor_outputs[] = { - *tensor_2 - }; - error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); + QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); + QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; + + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, + ggml_get_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, + ggml_get_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, + ggml_get_tensor_data_size(dst)}; + + Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1}; + Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; + error = + qnn_raw_interface.graphExecute(graph_handle, + tensor_inputs, 2, + tensor_outputs, 1, + nullptr, nullptr); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = 
%d\n", error); } @@ -2291,381 +2411,310 @@ static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx, const enum ggml_o QNN_VER_PTR(*tensor_0)->dimensions = tensor_0_dimensions; QNN_VER_PTR(*tensor_1)->dimensions = tensor_1_dimensions; QNN_VER_PTR(*tensor_2)->dimensions = tensor_2_dimensions; - n_end_time = ggml_time_us(); - n_duration = (n_end_time - n_begin_time) / 1000; - QNN_LOG_DEBUG("duration of ggml_qnn_%s : %lld milliseconds\n", ggml_op_name(ggmlop), n_duration); + n_end_time = ggml_time_us(); + n_duration = (n_end_time - n_begin_time) / 1000; + QNN_LOG_DEBUG("duration of ggml_qnn_%s : %lld milliseconds\n", + ggml_op_name(ggmlop), n_duration); QNN_LOG_DEBUG("call %s done\n", __func__); } - -static void ggml_qnn_repeat(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_repeat(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, const ggml_tensor * src1, + ggml_tensor * dst) { } - -static void ggml_qnn_get_rows(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_get_rows(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, const ggml_tensor * src1, + ggml_tensor * dst) { } - -static void ggml_qnn_acc(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_acc(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, + const ggml_tensor * src1, ggml_tensor * dst) { } - -static void ggml_qnn_div(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_div(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, + const ggml_tensor * src1, ggml_tensor * dst) { } - -static void ggml_qnn_gelu(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_gelu(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, const ggml_tensor * src1, + ggml_tensor * dst) { } - -static void ggml_qnn_silu(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_silu(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, const ggml_tensor * src1, + ggml_tensor * dst) { } - -static void ggml_qnn_gelu_quick(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_gelu_quick(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, + const ggml_tensor * src1, ggml_tensor * dst) { } - -static void ggml_qnn_tanh(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_tanh(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, const 
ggml_tensor * src1, + ggml_tensor * dst) { } - -static void ggml_qnn_relu(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_relu(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, const ggml_tensor * src1, + ggml_tensor * dst) { } - -static void ggml_qnn_hardsigmoid(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_hardsigmoid(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, + const ggml_tensor * src1, ggml_tensor * dst) { } - -static void ggml_qnn_hardswish(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_hardswish(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, const ggml_tensor * src1, + ggml_tensor * dst) { } - -static void ggml_qnn_leaky_relu(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_leaky_relu(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, + const ggml_tensor * src1, ggml_tensor * dst) { } - -static void ggml_qnn_sqr(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_sqr(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, + const ggml_tensor * src1, ggml_tensor * dst) { } - -static void ggml_qnn_norm(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_norm(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, const ggml_tensor * src1, + ggml_tensor * dst) { } - -static void ggml_qnn_group_norm(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_group_norm(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, + const ggml_tensor * src1, ggml_tensor * dst) { } - -static void ggml_qnn_concat(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_concat(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, const ggml_tensor * src1, + ggml_tensor * dst) { } - -static void ggml_qnn_upscale(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_upscale(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, const ggml_tensor * src1, + ggml_tensor * dst) { } - -static void ggml_qnn_pad(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); 
+static void ggml_qnn_pad(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, + const ggml_tensor * src1, ggml_tensor * dst) { } - -static void ggml_qnn_rms_norm(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_rms_norm(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, const ggml_tensor * src1, + ggml_tensor * dst) { } - -static void ggml_qnn_cpy(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_cpy(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, + const ggml_tensor * src1, ggml_tensor * dst) { } - -static void ggml_qnn_dup(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_dup(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, + const ggml_tensor * src1, ggml_tensor * dst) { ggml_qnn_cpy(ctx, src0, dst, nullptr); (void) src1; } - static void ggml_qnn_mul_mat_id(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, - const ggml_tensor * src1, - ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); + const ggml_tensor * src1, ggml_tensor * dst) { } - -static void ggml_qnn_scale(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_scale(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, const ggml_tensor * src1, + ggml_tensor * dst) { } - -static void ggml_qnn_clamp(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_clamp(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, const ggml_tensor * src1, + ggml_tensor * dst) { } - -static void ggml_qnn_diag_mask_inf(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_diag_mask_inf(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, + const ggml_tensor * src1, ggml_tensor * dst) { } - -static void ggml_qnn_soft_max(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_soft_max(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, const ggml_tensor * src1, + ggml_tensor * dst) { } - -static void ggml_qnn_rope(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_rope(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, const ggml_tensor * src1, + ggml_tensor * dst) { GGML_ASSERT(ggml_is_contiguous(src0)); - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); - } - -static void ggml_qnn_pool2d(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static 
void ggml_qnn_pool2d(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, const ggml_tensor * src1, + ggml_tensor * dst) { } - -static void ggml_qnn_im2col(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_im2col(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, const ggml_tensor * src1, + ggml_tensor * dst) { } - -static void ggml_qnn_sum_rows(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_sum_rows(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, const ggml_tensor * src1, + ggml_tensor * dst) { GGML_ASSERT(ggml_is_contiguous(src0)); - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); } - -static void ggml_qnn_argsort(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_argsort(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, const ggml_tensor * src1, + ggml_tensor * dst) { GGML_ASSERT(ggml_is_contiguous(src0)); - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); } - -static void ggml_qnn_nop(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - (void) src0; - (void) src1; - (void) dst; - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_nop(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, + const ggml_tensor * src1, ggml_tensor * dst) { + (void)src0; + (void)src1; + (void)dst; } - -bool ggml_qnn_compute_forward(ggml_backend_qnn_context * ctx, struct ggml_compute_params * params, struct ggml_tensor * tensor) { - ggml_qnn_func_t func = nullptr; - ggml_qnn_func_common_t func_common = nullptr; +bool ggml_qnn_compute_forward(ggml_backend_qnn_context * ctx, + struct ggml_compute_params * params, + struct ggml_tensor * tensor) { + ggml_qnn_func_t func = nullptr; + ggml_qnn_func_common_t func_common = nullptr; switch (tensor->op) { - case GGML_OP_ADD: - func = ggml_qnn_add; + case GGML_OP_ADD: + func = ggml_qnn_add; + break; + + case GGML_OP_MUL: + func_common = ggml_qnn_hanlde_op; + break; + + case GGML_OP_MUL_MAT: + func = ggml_qnn_mul_mat; + break; + + case GGML_OP_REPEAT: + func = ggml_qnn_repeat; + break; + case GGML_OP_GET_ROWS: + func = ggml_qnn_get_rows; + break; + case GGML_OP_DUP: + func = ggml_qnn_dup; + break; + + case GGML_OP_ACC: + func = ggml_qnn_acc; + break; + + case GGML_OP_DIV: + func = ggml_qnn_div; + break; + + case GGML_OP_UNARY: + switch (ggml_get_unary_op(tensor)) { + case GGML_UNARY_OP_GELU: + func = ggml_qnn_gelu; break; - - case GGML_OP_MUL: - func_common = ggml_qnn_hanlde_op; + case GGML_UNARY_OP_SILU: + func = ggml_qnn_silu; break; - - case GGML_OP_MUL_MAT: - func = ggml_qnn_mul_mat; + case GGML_UNARY_OP_GELU_QUICK: + func = ggml_qnn_gelu_quick; break; - - case GGML_OP_REPEAT: - func = ggml_qnn_repeat; + case GGML_UNARY_OP_TANH: + func = ggml_qnn_tanh; break; - case GGML_OP_GET_ROWS: - func = ggml_qnn_get_rows; + case GGML_UNARY_OP_RELU: + func = ggml_qnn_relu; break; - case GGML_OP_DUP: - func = ggml_qnn_dup; + case GGML_UNARY_OP_HARDSIGMOID: + func = ggml_qnn_hardsigmoid; break; - - case GGML_OP_ACC: - func = ggml_qnn_acc; - break; - - case GGML_OP_DIV: - func = ggml_qnn_div; - break; - - case GGML_OP_UNARY: - switch 
(ggml_get_unary_op(tensor)) { - case GGML_UNARY_OP_GELU: - func = ggml_qnn_gelu; - break; - case GGML_UNARY_OP_SILU: - func = ggml_qnn_silu; - break; - case GGML_UNARY_OP_GELU_QUICK: - func = ggml_qnn_gelu_quick; - break; - case GGML_UNARY_OP_TANH: - func = ggml_qnn_tanh; - break; - case GGML_UNARY_OP_RELU: - func = ggml_qnn_relu; - break; - case GGML_UNARY_OP_HARDSIGMOID: - func = ggml_qnn_hardsigmoid; - break; - case GGML_UNARY_OP_HARDSWISH: - func = ggml_qnn_hardswish; - break; - default: - return false; - } - break; - case GGML_OP_NORM: - func = ggml_qnn_norm; - break; - case GGML_OP_GROUP_NORM: - func = ggml_qnn_group_norm; - break; - case GGML_OP_CONCAT: - func = ggml_qnn_concat; - break; - case GGML_OP_UPSCALE: - func = ggml_qnn_upscale; - break; - case GGML_OP_PAD: - func = ggml_qnn_pad; - break; - case GGML_OP_LEAKY_RELU: - func = ggml_qnn_leaky_relu; - break; - case GGML_OP_RMS_NORM: - func = ggml_qnn_rms_norm; - break; - case GGML_OP_MUL_MAT_ID: - func = ggml_qnn_mul_mat_id; - break; - case GGML_OP_SCALE: - func = ggml_qnn_scale; - break; - case GGML_OP_SQR: - func = ggml_qnn_sqr; - break; - case GGML_OP_CLAMP: - func = ggml_qnn_clamp; - break; - case GGML_OP_CPY: - func = ggml_qnn_cpy; - break; - case GGML_OP_CONT: - func = ggml_qnn_dup; - break; - case GGML_OP_NONE: - case GGML_OP_RESHAPE: - case GGML_OP_VIEW: - case GGML_OP_PERMUTE: - case GGML_OP_TRANSPOSE: - func = ggml_qnn_nop; - break; - case GGML_OP_DIAG_MASK_INF: - func = ggml_qnn_diag_mask_inf; - break; - case GGML_OP_SOFT_MAX: - func = ggml_qnn_soft_max; - break; - case GGML_OP_ROPE: - func = ggml_qnn_rope; - break; - case GGML_OP_IM2COL: - func = ggml_qnn_im2col; - break; - case GGML_OP_POOL_2D: - func = ggml_qnn_pool2d; - break; - case GGML_OP_SUM_ROWS: - func = ggml_qnn_sum_rows; - break; - case GGML_OP_ARGSORT: - func = ggml_qnn_argsort; + case GGML_UNARY_OP_HARDSWISH: + func = ggml_qnn_hardswish; break; default: return false; + } + break; + case GGML_OP_NORM: + func = ggml_qnn_norm; + break; + case GGML_OP_GROUP_NORM: + func = ggml_qnn_group_norm; + break; + case GGML_OP_CONCAT: + func = ggml_qnn_concat; + break; + case GGML_OP_UPSCALE: + func = ggml_qnn_upscale; + break; + case GGML_OP_PAD: + func = ggml_qnn_pad; + break; + case GGML_OP_LEAKY_RELU: + func = ggml_qnn_leaky_relu; + break; + case GGML_OP_RMS_NORM: + func = ggml_qnn_rms_norm; + break; + case GGML_OP_MUL_MAT_ID: + func = ggml_qnn_mul_mat_id; + break; + case GGML_OP_SCALE: + func = ggml_qnn_scale; + break; + case GGML_OP_SQR: + func = ggml_qnn_sqr; + break; + case GGML_OP_CLAMP: + func = ggml_qnn_clamp; + break; + case GGML_OP_CPY: + func = ggml_qnn_cpy; + break; + case GGML_OP_CONT: + func = ggml_qnn_dup; + break; + case GGML_OP_NONE: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + func = ggml_qnn_nop; + break; + case GGML_OP_DIAG_MASK_INF: + func = ggml_qnn_diag_mask_inf; + break; + case GGML_OP_SOFT_MAX: + func = ggml_qnn_soft_max; + break; + case GGML_OP_ROPE: + func = ggml_qnn_rope; + break; + case GGML_OP_IM2COL: + func = ggml_qnn_im2col; + break; + case GGML_OP_POOL_2D: + func = ggml_qnn_pool2d; + break; + case GGML_OP_SUM_ROWS: + func = ggml_qnn_sum_rows; + break; + case GGML_OP_ARGSORT: + func = ggml_qnn_argsort; + break; + default: + return false; } - if (nullptr != func) - func(ctx, tensor->src[0], tensor->src[1], tensor); + if (nullptr != func) func(ctx, tensor->src[0], tensor->src[1], tensor); if (nullptr != func_common) func_common(ctx, tensor->op, tensor->src[0], tensor->src[1], tensor); 
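// --------------------------------------------------------------------------------------------
// Note on the dispatch above: ops with a dedicated kernel are routed through ggml_qnn_func_t
// (src0, src1, dst), while ops handled by the shared graph builder go through
// ggml_qnn_func_common_t, which additionally receives the ggml_op (currently only GGML_OP_MUL
// takes this path via ggml_qnn_hanlde_op).
//
// A minimal usage sketch of the backend from application code, assuming only the public entry
// points declared in ggml-qnn.h / ggml-backend.h; the device choice, thread count and library
// path below are illustrative placeholders, not values required by this patch:
//
//   #include "ggml-backend.h"
//   #include "ggml-qnn.h"
//
//   // QNN_BACKEND_NPU corresponds to device index 2 (0: CPU, 1: GPU, 2: NPU)
//   ggml_backend_t backend = ggml_backend_qnn_init(QNN_BACKEND_NPU, "/data/local/tmp/");
//   if (backend != nullptr) {
//       ggml_backend_qnn_set_n_threads(backend, 4);
//       // build a ggml_cgraph as usual, then:
//       // ggml_backend_graph_compute(backend, cgraph);   // lands in ggml_qnn_compute_forward()
//       ggml_backend_free(backend);
//   }
// --------------------------------------------------------------------------------------------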
@@ -2673,12 +2722,10 @@ bool ggml_qnn_compute_forward(ggml_backend_qnn_context * ctx, struct ggml_comput return true; } - struct ggml_backend_qnn_buffer_context { - ggml_backend_qnn_buffer_context(size_t device) : - device(device), - name(GGML_QNN_NAME + std::to_string(device)) { - } + ggml_backend_qnn_buffer_context(size_t device) + : device(device) + , name(GGML_QNN_NAME + std::to_string(device)) {} ~ggml_backend_qnn_buffer_context() { if (buffer) { @@ -2697,83 +2744,82 @@ struct ggml_backend_qnn_buffer_context { sub_buffers.clear(); qnn_tensors.clear(); } - void * buffer = nullptr; + void * buffer = nullptr; struct ggml_backend_qnn_context * backend_ctx = nullptr; - size_t buffer_size = 0; - std::vector sub_buffers; + size_t buffer_size = 0; + std::vector sub_buffers; std::vector qnn_tensors; - size_t device; - std::string name; + size_t device; + std::string name; }; - struct ggml_backend_qnn_buffer_type_context { - size_t device; + size_t device; std::string name; }; - static const char * ggml_backend_qnn_buffer_get_name(ggml_backend_buffer_t buffer) { GGML_UNUSED(buffer); return "QNN"; } - GGML_CALL static bool ggml_backend_buffer_is_qnn(ggml_backend_buffer_t buffer) { return buffer->iface.get_name == ggml_backend_qnn_buffer_get_name; } - GGML_CALL static void ggml_backend_qnn_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *) buffer->context; delete ctx; } - GGML_CALL static void * ggml_backend_qnn_buffer_get_base(ggml_backend_buffer_t buffer) { ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *) buffer->context; return ctx->buffer; } +GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t buffer, + ggml_tensor * tensor) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + ggml_backend_qnn_buffer_context * ctx = + (ggml_backend_qnn_buffer_context *) buffer->context; -GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *)buffer->context; - - static int idx = 0; - char tensor_name[GGML_MAX_NAME] = { 0 }; + static int idx = 0; + char tensor_name[GGML_MAX_NAME] = {0}; snprintf(tensor_name, GGML_MAX_NAME, "tensor_%04d", idx++); - uint32_t dimensions[] = {(uint32_t) tensor->ne[0], (uint32_t) tensor->ne[1], (uint32_t) tensor->ne[2], (uint32_t) tensor->ne[3]}; - Qnn_DataType_t qnn_data_type = qnn_datatype_from_ggml_datatype(tensor->type); - Qnn_TensorType_t qnn_tensor_type= QNN_TENSOR_TYPE_APP_WRITE; + uint32_t dimensions[] = {(uint32_t) tensor->ne[0], (uint32_t) tensor->ne[1], + (uint32_t) tensor->ne[2], + (uint32_t) tensor->ne[3]}; + Qnn_DataType_t qnn_data_type = + qnn_datatype_from_ggml_datatype(tensor->type); + Qnn_TensorType_t qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; if (tensor->flags & GGML_TENSOR_FLAG_INPUT) { qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; } else if (tensor->flags & GGML_TENSOR_FLAG_OUTPUT) { qnn_tensor_type = QNN_TENSOR_TYPE_APP_READ; } - Qnn_Tensor_t qnn_tensor = { - .version= QNN_TENSOR_VERSION_1, - {.v1= { - .id = 0, - .name = tensor_name, - .type = qnn_tensor_type, - .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, - .dataType = qnn_data_type, - .quantizeParams = {QNN_DEFINITION_UNDEFINED, - QNN_QUANTIZATION_ENCODING_UNDEFINED, - {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, .offset = 0}}}, - .rank = ggml_get_tensor_rank(tensor), - .dimensions = dimensions, - 
.memType = QNN_TENSORMEMTYPE_RAW, - {.clientBuf = {.data = nullptr, - .dataSize = 0}}}} - }; - Qnn_Tensor_t * p_qnn_tensor = (Qnn_Tensor_t *)calloc(1, sizeof(Qnn_Tensor_t)); + Qnn_Tensor_t qnn_tensor = { + .version = QNN_TENSOR_VERSION_1, + {.v1 = {.id = 0, + .name = tensor_name, + .type = qnn_tensor_type, + .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, + .dataType = qnn_data_type, + .quantizeParams = + {QNN_DEFINITION_UNDEFINED, + QNN_QUANTIZATION_ENCODING_UNDEFINED, + {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, + .offset = 0}}}, + .rank = ggml_get_tensor_rank(tensor), + .dimensions = dimensions, + .memType = QNN_TENSORMEMTYPE_RAW, + {.clientBuf = {.data = nullptr, .dataSize = 0}}}}}; + Qnn_Tensor_t * p_qnn_tensor = + (Qnn_Tensor_t *)calloc(1, sizeof(Qnn_Tensor_t)); if (nullptr == p_qnn_tensor) { QNN_LOG_WARN("calloc failed"); return; @@ -2788,21 +2834,24 @@ GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t ctx->qnn_tensors.push_back(p_qnn_tensor); } - -GGML_CALL static void ggml_backend_qnn_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { +GGML_CALL static void ggml_backend_qnn_buffer_set_tensor(ggml_backend_buffer_t buffer, + ggml_tensor * tensor, const void * data, + size_t offset, size_t size) { GGML_UNUSED(buffer); - memcpy((char *)tensor->data + offset, data, size); + memcpy((char *) tensor->data + offset, data, size); } - -GGML_CALL static void ggml_backend_qnn_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { +GGML_CALL static void ggml_backend_qnn_buffer_get_tensor(ggml_backend_buffer_t buffer, + const ggml_tensor * tensor, void * data, + size_t offset, size_t size) { GGML_UNUSED(buffer); - memcpy(data, (const char *)tensor->data + offset, size); + memcpy(data, (const char *) tensor->data + offset, size); } - -GGML_CALL static bool ggml_backend_qnn_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) { +GGML_CALL static bool ggml_backend_qnn_buffer_cpy_tensor(ggml_backend_buffer_t buffer, + const struct ggml_tensor * src, + struct ggml_tensor * dst) { GGML_UNUSED(buffer); if (ggml_backend_buffer_is_host(src->buffer)) { memcpy(dst->data, src->data, ggml_nbytes(src)); @@ -2812,35 +2861,31 @@ GGML_CALL static bool ggml_backend_qnn_buffer_cpy_tensor(ggml_backend_buffer_t b return false; } - GGML_CALL static void ggml_backend_qnn_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *) buffer->context; memset(ctx->buffer, value, ctx->buffer_size); } - static ggml_backend_buffer_i ggml_backend_qnn_buffer_interface = { - /* .get_name = */ ggml_backend_qnn_buffer_get_name, - /* .free_buffer = */ ggml_backend_qnn_buffer_free_buffer, - /* .get_base = */ ggml_backend_qnn_buffer_get_base, - /* .init_tensor = */ ggml_backend_qnn_buffer_init_tensor, - /* .set_tensor = */ ggml_backend_qnn_buffer_set_tensor, - /* .get_tensor = */ ggml_backend_qnn_buffer_get_tensor, - /* .cpy_tensor = */ ggml_backend_qnn_buffer_cpy_tensor, - /* .clear = */ ggml_backend_qnn_buffer_clear, - /* .reset = */ nullptr, + /* .get_name = */ ggml_backend_qnn_buffer_get_name, + /* .free_buffer = */ ggml_backend_qnn_buffer_free_buffer, + /* .get_base = */ ggml_backend_qnn_buffer_get_base, + /* .init_tensor = */ ggml_backend_qnn_buffer_init_tensor, + /* .set_tensor = */ ggml_backend_qnn_buffer_set_tensor, + /* 
.get_tensor = */ ggml_backend_qnn_buffer_get_tensor, + /* .cpy_tensor = */ ggml_backend_qnn_buffer_cpy_tensor, + /* .clear = */ ggml_backend_qnn_buffer_clear, + /* .reset = */ nullptr, }; - GGML_CALL static const char * ggml_backend_qnn_buffer_type_name(ggml_backend_buffer_type_t buft) { return "QNN"; } - static void * ggml_qnn_host_malloc(size_t n) { void * data = nullptr; - const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n); + int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n); if (result != 0) { QNN_LOG_WARN("%s: error: posix_memalign failed\n", __func__); return nullptr; @@ -2849,20 +2894,20 @@ static void * ggml_qnn_host_malloc(size_t n) { return data; } - -GGML_CALL static ggml_backend_buffer_t ggml_backend_qnn_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { +GGML_CALL static ggml_backend_buffer_t ggml_backend_qnn_buffer_type_alloc_buffer( + ggml_backend_buffer_type_t buft, size_t size) { ggml_backend_qnn_buffer_type_context * buft_ctx = (ggml_backend_qnn_buffer_type_context *)buft->context; ggml_backend_qnn_buffer_context * ctx = new ggml_backend_qnn_buffer_context(buft_ctx->device); - const size_t size_page = sysconf(_SC_PAGESIZE); + size_t size_page = sysconf(_SC_PAGESIZE); size_t size_aligned = size; if ((size_aligned % size_page) != 0) { size_aligned += (size_page - (size_aligned % size_page)); } - //TODO:use pre-allocated buffer in internal memory pool - ctx->buffer = ggml_qnn_host_malloc(size_aligned); + // TODO:use pre-allocated buffer in internal memory pool + ctx->buffer = ggml_qnn_host_malloc(size_aligned); ctx->buffer_size = size_aligned; ctx->backend_ctx = &g_qnn_mgr[buft_ctx->device]; @@ -2872,53 +2917,51 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_qnn_buffer_type_alloc_buffer return nullptr; } - return ggml_backend_buffer_init(buft, ggml_backend_qnn_buffer_interface, ctx, size); + return ggml_backend_buffer_init(buft, ggml_backend_qnn_buffer_interface,ctx, size); } - -GGML_CALL static size_t ggml_backend_qnn_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { +GGML_CALL static size_t ggml_backend_qnn_buffer_type_get_alignment( + ggml_backend_buffer_type_t buft) { GGML_UNUSED(buft); return 32; } - -//TODO: this value is an experimental value, works fine with whisper/llm/minicpm-v inference on Android +// TODO: this value is an experimental value, works fine with whisper/llm/minicpm-v inference on Android GGML_CALL static size_t ggml_backend_qnn_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { GGML_UNUSED(buft); return (96 * 1024 * 1024); } - -GGML_CALL static bool ggml_backend_qnn_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, - ggml_backend_t backend) { +GGML_CALL static bool ggml_backend_qnn_buffer_type_supports_backend( + ggml_backend_buffer_type_t buft, ggml_backend_t backend) { GGML_UNUSED(buft); return ggml_backend_is_qnn(backend) || ggml_backend_is_cpu(backend); } - GGML_CALL static bool ggml_backend_qnn_buffer_is_host(ggml_backend_buffer_type_t buft) { GGML_UNUSED(buft); return true; } - GGML_CALL static const char * ggml_backend_qnn_name(ggml_backend_t backend) { return "QNN"; } - GGML_CALL static void ggml_backend_qnn_free(ggml_backend_t backend) { - QNN_LOG_INFO("enter %s", __func__ ); + QNN_LOG_INFO("enter %s", __func__); ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; QNN_LOG_DEBUG("idx %d, name:%s", ctx->device, g_qnn_mgr[ctx->device].name); - qnn_instance * instance = 
(qnn_instance*)g_qnn_mgr[ctx->device].instance; + qnn_instance * instance = (qnn_instance *)g_qnn_mgr[ctx->device].instance; if (instance != nullptr) { - std::map>::iterator graph_it; - for (graph_it = instance->_qnn_graph_map.begin(); graph_it != instance->_qnn_graph_map.end(); graph_it++) { - auto & graph_item = graph_it->second; + std::map>::iterator graph_it; + for (graph_it = instance->_qnn_graph_map.begin(); + graph_it != instance->_qnn_graph_map.end(); graph_it++) { + auto & graph_item = graph_it->second; Qnn_GraphHandle_t & graph_handle = std::get<0>(graph_item); GGML_UNUSED(graph_handle); QNN_LOG_DEBUG("graph type:%s", graph_it->first.c_str()); @@ -2930,96 +2973,90 @@ GGML_CALL static void ggml_backend_qnn_free(ggml_backend_t backend) { g_qnn_mgr[ctx->device].instance = nullptr; } - if (g_qnn_mgr[ctx->device].backend != nullptr) { + if (g_qnn_mgr[ctx->device].backend != nullptr) { delete backend; g_qnn_mgr[ctx->device].backend = nullptr; } - QNN_LOG_INFO("leave %s", __func__ ); + QNN_LOG_INFO("leave %s", __func__); } - GGML_CALL static ggml_backend_buffer_type_t ggml_backend_qnn_get_default_buffer_type(ggml_backend_t backend) { ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; return ggml_backend_qnn_buffer_type(ctx->device); } - GGML_CALL static ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { - enum ggml_status result = GGML_STATUS_SUCCESS; - ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; + enum ggml_status result = GGML_STATUS_SUCCESS; + ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; GGML_UNUSED(ctx); ggml_compute_params params = {}; - params.type = GGML_TASK_TYPE_COMPUTE; - params.ith = 0; + params.type = GGML_TASK_TYPE_COMPUTE; + params.ith = 0; for (int i = 0; i < cgraph->n_nodes; i++) { - ggml_tensor * node = cgraph->nodes[i]; - if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { + ggml_tensor *node = cgraph->nodes[i]; + if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || + node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || + node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { continue; } bool ok = ggml_qnn_compute_forward(ctx, ¶ms, node); if (!ok) { - QNN_LOG_DEBUG("%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); + QNN_LOG_DEBUG("error: op not supported %s (%s)\n", node->name, ggml_op_name(node->op)); } } return result; } +GGML_CALL static bool ggml_backend_qnn_supports_op(ggml_backend_t backend, + const ggml_tensor * op) { + ggml_backend_qnn_context *ctx = (ggml_backend_qnn_context *) backend->context; -GGML_CALL static bool ggml_backend_qnn_supports_op(ggml_backend_t backend, const ggml_tensor * op) { - GGML_UNUSED(backend); - - return (ggml_qnn_can_handle_op(op, true)); + return (ggml_qnn_can_handle_op(ctx, op, true)); } +GGML_CALL static bool ggml_backend_qnn_offload_op(ggml_backend_t backend,const ggml_tensor * tensor) { + ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; -//note: this function be used with proposal/refined ggml backend subsystem in this PR: -// https://github.com/ggerganov/llama.cpp/pull/7641 -// new ggml backend(only using system memory: ggml_backend_xxx_buffer_is_host return true) -// can following this style for mixed inference between CPU&GPU / CPU&NPU very easily -GGML_CALL static bool 
ggml_backend_qnn_offload_op(ggml_backend_t backend, const ggml_tensor * tensor) { - ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; - - return ggml_qnn_compute_forward(ctx, nullptr, (ggml_tensor*)tensor); + return ggml_qnn_compute_forward(ctx, nullptr, (ggml_tensor *) tensor); } - static ggml_backend_i ggml_backend_qnn_interface = { - /* .get_name = */ ggml_backend_qnn_name, - /* .free = */ ggml_backend_qnn_free, - /* .get_default_buffer_type = */ ggml_backend_qnn_get_default_buffer_type, - /* .set_tensor_async = */ nullptr, - /* .get_tensor_async = */ nullptr, - /* .cpy_tensor_async = */ nullptr, - /* .synchronize = */ nullptr, - /* .graph_plan_create = */ nullptr, - /* .graph_plan_free = */ nullptr, - /* .graph_plan_compute = */ nullptr, - /* .graph_compute = */ ggml_backend_qnn_graph_compute, - /* .supports_op = */ ggml_backend_qnn_supports_op, - /* .offload_op = */ ggml_backend_qnn_offload_op, - /* .event_new = */ nullptr, - /* .event_free = */ nullptr, - /* .event_record = */ nullptr, - /* .event_wait = */ nullptr, - /* .event_synchronize = */ nullptr, + /* .get_name = */ ggml_backend_qnn_name, + /* .free = */ ggml_backend_qnn_free, + /* .get_default_buffer_type = */ ggml_backend_qnn_get_default_buffer_type, + /* .set_tensor_async = */ nullptr, + /* .get_tensor_async = */ nullptr, + /* .cpy_tensor_async = */ nullptr, + /* .synchronize = */ nullptr, + /* .graph_plan_create = */ nullptr, + /* .graph_plan_free = */ nullptr, + /* .graph_plan_compute = */ nullptr, + /* .graph_compute = */ ggml_backend_qnn_graph_compute, + /* .supports_op = */ ggml_backend_qnn_supports_op, + /* .offload_op = */ ggml_backend_qnn_offload_op, + /* .event_new = */ nullptr, + /* .event_free = */ nullptr, + /* .event_record = */ nullptr, + /* .event_wait = */ nullptr, + /* .event_synchronize = */ nullptr, }; - static ggml_guid_t ggml_backend_qnn_guid() { - static ggml_guid guid = {0x1a, 0x2b, 0x3c, 0x4d, 0x5e, 0x6f, 0x70, 0x81, 0x92, 0xa3, 0xb4, 0xc5, - 0xd6, 0xe7, 0xf8, 0x09}; + static ggml_guid guid = { + 0x1a, 0x2b, 0x3c, 0x4d, 0x5e, 0x6f, 0x70, 0x81, + 0x92, 0xa3, 0xb4, 0xc5, 0xd6, 0xe7, 0xf8, 0x09 + }; return &guid; } - static ggml_backend_t ggml_backend_qnn_reg_init(const char * params, void * user_data) { if (nullptr == params) { - //QNN library path - //can be hardcoded to "/data/local/tmp/" for Android command line application - //or specified in JNI layer for Android APK + // QNN library path + // can be hardcoded to "/data/local/tmp/" for Android command line application + // or specified in JNI layer for Android APK params = "/data/local/tmp/"; } ggml_backend_t qnn_backend = ggml_backend_qnn_init((int) (intptr_t) user_data, params); @@ -3027,30 +3064,25 @@ static ggml_backend_t ggml_backend_qnn_reg_init(const char * params, void * user return qnn_backend; } - bool ggml_backend_is_qnn(ggml_backend_t backend) { return backend != nullptr && ggml_guid_matches(backend->guid, ggml_backend_qnn_guid()); } - void ggml_backend_qnn_set_n_threads(ggml_backend_t backend, int n_threads) { GGML_ASSERT(ggml_backend_is_qnn(backend)); - struct ggml_backend_qnn_context * ctx = (struct ggml_backend_qnn_context *)backend->context; + struct ggml_backend_qnn_context * ctx = (struct ggml_backend_qnn_context *) backend->context; ctx->threads = n_threads; } - const char * ggml_backend_qnn_get_name(ggml_backend_t backend) { return backend->iface.get_name(backend); } - int ggml_backend_qnn_get_device_count() { return GGML_QNN_MAX_DEVICES; } - void ggml_backend_qnn_get_device_description(size_t 
dev_num, char * description, size_t description_size) { if (nullptr == description || 0 == description_size) { QNN_LOG_WARN("invalid param"); @@ -3063,14 +3095,13 @@ void ggml_backend_qnn_get_device_description(size_t dev_num, char * description, } snprintf(description, description_size, "%s", g_qnn_mgr[dev_num].name); - QNN_LOG_DEBUG("description:%s", description); } - ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t device) { if (device >= GGML_QNN_MAX_DEVICES) { - QNN_LOG_DEBUG("ggml_backend_qnn_buffer_type error: device_index:%d is out of range [0, %d]\n", - device, GGML_QNN_MAX_DEVICES - 1); + QNN_LOG_DEBUG("ggml_backend_qnn_buffer_type error: device_index:%d is " + "out of range [0, %d]\n", + device, GGML_QNN_MAX_DEVICES - 1); return nullptr; } @@ -3086,11 +3117,12 @@ ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t device) { /* .alloc_buffer = */ ggml_backend_qnn_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_qnn_buffer_type_get_alignment, /* .get_max_size = */ ggml_backend_qnn_buffer_type_get_max_size, - /* .get_alloc_size = */ nullptr,// defaults to ggml_nbytes + /* .get_alloc_size = */ nullptr, // defaults to ggml_nbytes /* .supports_backend = */ ggml_backend_qnn_buffer_type_supports_backend, /* .is_host = */ ggml_backend_qnn_buffer_is_host - }, - /* .context = */ new ggml_backend_qnn_buffer_type_context { device, GGML_QNN_NAME + std::to_string(device) }, + }, + /* .context = */ new ggml_backend_qnn_buffer_type_context { device, + GGML_QNN_NAME + std::to_string(device)}, }; } ggml_backend_qnn_buffer_type_initialized = true; @@ -3099,7 +3131,6 @@ ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t device) { return &ggml_backend_qnn_buffer_types[device]; } - /** * * @param device 0: QNN_BACKEND_CPU 1: QNN_BACKEND_GPU 2: QNN_BACKEND_NPU @@ -3124,8 +3155,9 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { std::string path = qnn_lib_path; if (QNN_BACKEND_NPU == device) { if (0 == setenv("LD_LIBRARY_PATH", - (path + - ":/vendor/dsp/cdsp:/vendor/lib64:/vendor/dsp/dsp:/vendor/dsp/images").c_str(), + (path + ":/vendor/dsp/cdsp:/vendor/lib64:/vendor/dsp/" + "dsp:/vendor/dsp/images") + .c_str(), 1)) { QNN_LOG_INFO("QNN NPU backend setenv successfully"); } else { @@ -3133,31 +3165,35 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { } if (0 == setenv("ADSP_LIBRARY_PATH", (path + - ";/vendor/dsp/cdsp;/vendor/lib/rfsa/adsp;/system/lib/rfsa/adsp;/vendor/dsp/dsp;/vendor/dsp/images;/dsp").c_str(), + ";/vendor/dsp/cdsp;/vendor/lib/rfsa/adsp;/system/lib/" + "rfsa/adsp;/vendor/dsp/dsp;/vendor/dsp/images;/dsp") + .c_str(), 1)) { QNN_LOG_INFO("QNN NPU backend setenv successfully"); } else { QNN_LOG_ERROR("QNN NPU backend setenv failure"); } } else { - if (0 == setenv("LD_LIBRARY_PATH", - path.c_str(), - 1)) { - QNN_LOG_INFO("%s backend setenv successfully\n", get_qnn_backend_name(device)); + if (0 == setenv("LD_LIBRARY_PATH", path.c_str(), 1)) { + QNN_LOG_INFO("%s backend setenv successfully\n", + get_qnn_backend_name(device)); } else { - QNN_LOG_ERROR("%s backend setenv failure\n", get_qnn_backend_name(device)); + QNN_LOG_ERROR("%s backend setenv failure\n", + get_qnn_backend_name(device)); } } qnn_instance * instance = nullptr; instance = new qnn_instance(qnn_lib_path, g_qnn_mgr[device].lib, ""); - result = instance->qnn_init(nullptr); + result = instance->qnn_init(nullptr); if (0 != result) { - QNN_LOG_WARN("init qnn subsystem failed with qnn backend %s, pls check why\n", 
get_qnn_backend_name(device)); + QNN_LOG_WARN( + "init qnn subsystem failed with qnn backend %s, pls check why\n", + get_qnn_backend_name(device)); delete instance; return nullptr; } - qnn_interface qnn_interface = instance->get_qnn_interface(); + qnn_interface qnn_interface = instance->get_qnn_interface(); if (!qnn_interface.is_loaded()) { QNN_LOG_WARN("qnn subsystem failure\n"); delete instance; @@ -3167,29 +3203,28 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { std::string device_name = get_qnn_backend_name(device); QNN_LOG_INFO("qnn device name %s", device_name.c_str()); instance->init_qnn_graph(device_name.c_str(), false); - g_qnn_mgr[device].instance = instance; - g_qnn_mgr[device].raw_interface = instance->get_qnn_raw_interface(); - g_qnn_mgr[device].raw_system_interface = instance->get_qnn_raw_system_interface(); + g_qnn_mgr[device].instance = instance; + g_qnn_mgr[device].raw_interface = instance->get_qnn_raw_interface(); + g_qnn_mgr[device].raw_system_interface = instance->get_qnn_raw_system_interface(); - ggml_backend_t qnn_backend = new ggml_backend{ - /* .guid = */ ggml_backend_qnn_guid(), - /* .iface = */ ggml_backend_qnn_interface, - /* .context = */ &g_qnn_mgr[device] - }; - g_qnn_mgr[device].backend = qnn_backend; + ggml_backend_t qnn_backend = + new ggml_backend{/* .guid = */ ggml_backend_qnn_guid(), + /* .iface = */ ggml_backend_qnn_interface, + /* .context = */ &g_qnn_mgr[device]}; + g_qnn_mgr[device].backend = qnn_backend; return qnn_backend; } - extern "C" GGML_CALL int ggml_backend_qnn_reg_devices(void); GGML_CALL int ggml_backend_qnn_reg_devices() { for (size_t idx = 0; idx < GGML_QNN_MAX_DEVICES; idx++) { char name[GGML_MAX_NAME]; ggml_backend_qnn_get_device_description(idx, name, GGML_MAX_NAME); - ggml_backend_register(name, ggml_backend_qnn_reg_init, ggml_backend_qnn_buffer_type(idx), - (void *) (intptr_t)idx); + ggml_backend_register(name, ggml_backend_qnn_reg_init, + ggml_backend_qnn_buffer_type(idx), + (void *) (intptr_t) idx); } return GGML_QNN_MAX_DEVICES; From 3e8b61f9702a702bfe14478bdc4eb466038643dd Mon Sep 17 00:00:00 2001 From: "zhou.weiguo" Date: Sun, 9 Jun 2024 09:06:44 +0800 Subject: [PATCH 011/143] review: fix a memory leak introduced by review modification which explained in https://github.com/zhouwg/llama.cpp/pull/1 --- ggml-qnn.cpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp index 3c5ff332a1df2..d1d69afe2eef5 100644 --- a/ggml-qnn.cpp +++ b/ggml-qnn.cpp @@ -2771,6 +2771,7 @@ GGML_CALL static bool ggml_backend_buffer_is_qnn(ggml_backend_buffer_t buffer) { GGML_CALL static void ggml_backend_qnn_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *) buffer->context; + delete ctx; } @@ -3105,12 +3106,14 @@ ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t device) { return nullptr; } + //ref:https://github.com/zhouwg/llama.cpp/pull/1 + static ggml_backend_qnn_buffer_type_context ggml_backend_qnn_buffer_type_contexts[GGML_QNN_MAX_DEVICES]; static ggml_backend_buffer_type ggml_backend_qnn_buffer_types[GGML_QNN_MAX_DEVICES]; - static bool ggml_backend_qnn_buffer_type_initialized = false; - if (!ggml_backend_qnn_buffer_type_initialized) { - for (int i = 0; i < GGML_QNN_MAX_DEVICES; i++) { + for (size_t i = 0; i < GGML_QNN_MAX_DEVICES; i++) { + auto & context = ggml_backend_qnn_buffer_type_contexts[i]; + context = { i, std::string(GGML_QNN_NAME) + std::to_string(i) }; 
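// Note on the change above (per the commit message for this patch): the per-device
// ggml_backend_qnn_buffer_type_context objects now live in a function-local static array and
// are filled in place, instead of being heap-allocated with `new` as in the previous revision.
// Because the buffer types returned by ggml_backend_qnn_buffer_type() are process-lifetime
// singletons that ggml never tears down, the heap-allocated contexts were never freed; using
// static storage removes that leak without changing the function's observable behaviour.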
ggml_backend_qnn_buffer_types[i] = { /* .iface = */ { /* .get_name = */ ggml_backend_qnn_buffer_type_name, @@ -3121,8 +3124,7 @@ ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t device) { /* .supports_backend = */ ggml_backend_qnn_buffer_type_supports_backend, /* .is_host = */ ggml_backend_qnn_buffer_is_host }, - /* .context = */ new ggml_backend_qnn_buffer_type_context { device, - GGML_QNN_NAME + std::to_string(device)}, + /* .context = */ & context, }; } ggml_backend_qnn_buffer_type_initialized = true; From d38d4a67d17570d3b3003397a50f873f5e143603 Mon Sep 17 00:00:00 2001 From: "zhou.weiguo" Date: Sun, 9 Jun 2024 23:49:54 +0800 Subject: [PATCH 012/143] npu: probe htp info and capacity of rpc ion memory --- ggml-qnn.cpp | 123 +++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 115 insertions(+), 8 deletions(-) diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp index d1d69afe2eef5..3248e244a31c2 100644 --- a/ggml-qnn.cpp +++ b/ggml-qnn.cpp @@ -152,6 +152,28 @@ enum class ggml_qnn_profile_level { profile_detail = 2 }; +enum qcom_htp_arch { + NONE = 0, + V68 = 68, + V69 = 69, + V73 = 73, + V75 = 75, +}; + +enum qcom_chipset { + UNKNOWN_SM = 0, + SM8450 = 36, // v69 + SM8475 = 42, // v69 + SM8550 = 43, // v73 + SM8650 = 57, // v75 +}; + +struct qcom_socinfo { + int soc_model; + int htp_arch; + int vtcm_size_in_mb; +}; + struct ggml_backend_qnn_context { int device; int threads; @@ -216,6 +238,29 @@ static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { .raw_system_interface = {}}, }; +static struct qcom_socinfo g_qnn_soc_info_table[] = { + /* Qualcomm SnapDragon 8 Gen 1 */ + [SM8450] = {.soc_model = SM8450, + .htp_arch = V69, + .vtcm_size_in_mb = 8}, + + /* Qualcomm SnapDragon 8 Gen 1+ */ + [SM8475] = {.soc_model = SM8475, + .htp_arch = V69, + .vtcm_size_in_mb = 8}, + + /* Qualcomm SnapDragon 8 Gen 2 */ + [SM8550] = {.soc_model = SM8550, + .htp_arch = V73, + .vtcm_size_in_mb = 8}, + + /* Qualcomm SnapDragon 8 Gen 3 */ + [SM8650] = {.soc_model = SM8650, + .htp_arch = V75, + .vtcm_size_in_mb = 8}, + +}; + // ================================================================================================= // // QNN helper functions and other internal helper functions @@ -485,6 +530,8 @@ static Qnn_DataType_t qnn_datatype_from_ggml_datatype(enum ggml_type ggmltype) { return QNN_DATATYPE_INT_8; case GGML_TYPE_Q8_0: return QNN_DATATYPE_SFIXED_POINT_8; + case GGML_TYPE_Q4_0: + return QNN_DATATYPE_SFIXED_POINT_4; default: break; } @@ -527,19 +574,34 @@ Fn load_qnn_functionpointers(void * handle, const char * function_name) { static const char * get_qnn_backend_name(int n_backend_type) { switch (n_backend_type) { - case 0: + case QNN_BACKEND_CPU: return "QNN-CPU"; - case 1: + case QNN_BACKEND_GPU: return "QNN-GPU"; - case 2: + case QNN_BACKEND_NPU: return "QNN-NPU"; - case 3: + case QNN_BACKEND_GGML: return "ggml"; //"fake" QNN backend, used for compare performance between QNN backend and original GGML default: return "unknown"; } } +static const char * qnn_get_chipset_desc(uint32_t chipset_id) { + switch (chipset_id) { + case SM8450: + return "SM8450"; + case SM8475: + return "SM8475"; + case SM8550: + return "SM8550"; + case SM8650: + return "SM8650"; + default: + return "unknown"; + } +} + static intptr_t align_to(size_t alignment, intptr_t offset) { return offset % alignment == 0 ? 
offset @@ -875,7 +937,7 @@ class qnn_instance { return 0; } - std::string &get_qnn_graph_name() { return _graph_name; } + std::string & get_qnn_graph_name() { return _graph_name; } bool is_rpcmem_initialized() { return _rpcmem_initialized; } @@ -893,6 +955,8 @@ class qnn_instance { void free_rpcmem(void * buf); + size_t get_rpcmem_capacity() { return _rpcmem_capacity; } + bool is_rpcmem_allocated(void * buf); bool is_rpcmem_registered(Qnn_MemHandle_t handle) { @@ -977,6 +1041,7 @@ class qnn_instance { pfn_rpc_mem_init _pfn_rpc_mem_init; pfn_rpc_mem_deinit _pfn_rpc_mem_deinit; std::unordered_map _rpcmem_store_map; + size_t _rpcmem_capacity = 512; std::string _graph_name; }; @@ -1493,6 +1558,46 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { QNN_LOG_DEBUG("initialize qnn context successfully\n"); } + if (_backend_name.find("Htp") != std::variant_npos) { + const QnnDevice_PlatformInfo_t * p_info = nullptr; + _qnn_raw_interface.deviceGetPlatformInfo(nullptr, &p_info); + QNN_LOG_INFO("device counts %d", p_info->v1.numHwDevices); + QnnDevice_HardwareDeviceInfo_t * infos = p_info->v1.hwDevices; + for (int i = 0; i < p_info->v1.numHwDevices; i++) { + QNN_LOG_INFO("deviceID:%d, deviceType:%d, numCores %d", infos[i].v1.deviceId, + infos[i].v1.deviceType, infos[i].v1.numCores); + QnnDevice_DeviceInfoExtension_t devinfo = infos[i].v1.deviceInfoExtension; + QnnHtpDevice_OnChipDeviceInfoExtension_t chipinfo = devinfo->onChipDevice; + QnnHtpDevice_Arch_t chiparch = chipinfo.arch; + QNN_LOG_INFO("htp_type:%d(%s)", devinfo->devType, (devinfo->devType == QNN_HTP_DEVICE_TYPE_ON_CHIP) ? "ON_CHIP" : ""); + QNN_LOG_INFO("qualcomm soc_model:%d(%s), htp_arch:%d, vtcm_size_in_mb:%d MB", chipinfo.socModel, + qnn_get_chipset_desc(chipinfo.socModel), chiparch, chipinfo.vtcmSize); + } + _qnn_raw_interface.deviceFreePlatformInfo(nullptr, p_info); + + + //TODO: faster approach to probe the accurate capacity of rpc ion memory + size_t candidate_size = 0; + uint8_t * rpc_buffer = nullptr; + const int SIZE_IN_MB = (1 << 20); + size_t probe_slots[] = {1024, 1536, 2048 - 48, 2048}; + size_t probe_counts = sizeof(probe_slots) / sizeof(size_t); + for (size_t idx = 0; idx < probe_counts; idx++) { + rpc_buffer = static_cast(alloc_rpcmem(probe_slots[idx] * SIZE_IN_MB, 4)); + if (nullptr == rpc_buffer) { + QNN_LOG_INFO("alloc rpcmem %d (MB) failure, %s\n", probe_slots[idx], strerror(errno)); + break; + } else { + candidate_size = probe_slots[idx]; + free_rpcmem(rpc_buffer); + rpc_buffer = nullptr; + } + } + if (candidate_size > _rpcmem_capacity) + _rpcmem_capacity = candidate_size; + QNN_LOG_INFO("capacity of rpc ion memory %d MB\n", _rpcmem_capacity); + } + QNN_LOG_DEBUG("leave qni_init\n"); return 0; @@ -1654,9 +1759,11 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context *ctx, const int64_t ne20 = tensor->ne[0]; const int64_t ne21 = tensor->ne[1]; - //TODO: support other quatinized data type - if (ggml_is_quantized(src0->type) && (src0->type != GGML_TYPE_Q8_0)) { - return false; + //TODO: support other quantized data type + if (ggml_is_quantized(src0->type)) { + if ((src0->type != GGML_TYPE_Q8_0) && (src0->type != GGML_TYPE_Q4_0)) { + return false; + } } if (b_dump_tensor_info) { From 5f8cfe4a1eecab1504dea1451f7d4b4e7983d7b9 Mon Sep 17 00:00:00 2001 From: "zhou.weiguo" Date: Mon, 10 Jun 2024 20:07:26 +0800 Subject: [PATCH 013/143] ggml-qnn: refine source code of ggml-qnn.cpp to make reviewer more happy --- ggml-qnn.cpp | 2654 +++++++++++++++++++++++++------------------------- 1 file changed, 
1327 insertions(+), 1327 deletions(-) diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp index 3248e244a31c2..43a8fcd3ea8cb 100644 --- a/ggml-qnn.cpp +++ b/ggml-qnn.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -32,8 +33,17 @@ #include #include #include -#include +#if (defined __ANDROID__) || (defined ANDROID) +#include +#endif + +#include "ggml-qnn.h" + +#include "ggml-backend-impl.h" + +// header file of Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK +// https://qpm.qualcomm.com/#/main/tools/details/qualcomm_ai_engine_direct #include "QnnTypes.h" #include "QnnCommon.h" #include "QnnContext.h" @@ -46,14 +56,6 @@ #include "System/QnnSystemInterface.h" #include "HTP/QnnHtpDevice.h" -#include "ggml-qnn.h" - -#include "ggml-backend-impl.h" - -#if (defined __ANDROID__) || (defined ANDROID) -#include -#endif - // ================================================================================================= // // forward declaration @@ -61,96 +63,31 @@ // ================================================================================================= class qnn_instance; -static void ggml_qnn_log_internal(ggml_log_level level, const char * file, - const char * func, int line, - const char * format, ...); +struct ggml_backend_qnn_context; + +static int free_qnn_tensor(Qnn_Tensor_t & tensor); // ================================================================================================= // // self-defined macro / data structure // // ================================================================================================= -#define RPCMEM_DEFAULT_FLAGS 1 -#define RPCMEM_HEAP_ID_SYSTEM 25 - -#define GGML_QNN_DEBUG 1 // for troubleshooting QNN backend -#define ENABLE_QNN_LOG 0 // enable/disable QNN internal log -#define GGML_QNN_LOGBUF_LEN 4096 -#define QNN_VER_PTR(x) (&((x).v1)) -#define GGML_QNN_NAME "qnn" - -#define QNN_LOG_ERROR(...) \ - ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) - -#define QNN_LOG_WARN(...) \ - ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#define ENABLE_QNNBACKEND_DEBUG 1 // for troubleshooting QNN backend +#define ENABLE_QNNSDK_LOG 0 // enable/disable QNN SDK's internal log +#define ENABLE_QNNBACKEND_PERF 0 // enable/disable op's perf info +#define QNN_LOGBUF_LEN 4096 +#define QNN_BACKEND_NAME "qnn" -#define QNN_LOG_INFO(...) \ - ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) - -#if GGML_QNN_DEBUG -#define QNN_LOG_DEBUG(...) \ - ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) -#else -#define QNN_LOG_DEBUG(...) 
-#endif - -#define VALIDATE(value, status) \ - do { \ - status = value; \ - if (status != QNN_SUCCESS) { \ - QNN_LOG_WARN("%s expected QNN_SUCCESS\n", #value); \ - return status; \ - } \ - } while (0) +typedef void (*ggml_qnn_func_t)(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, + const ggml_tensor * src1, + ggml_tensor * dst); -#define VALIDATE_TENSOR_VERSION(tensor, err) \ - VALIDATE(validate_tensor_version(tensor), err) - -#define QNN_TENSOR_GET_ID(tensor) get_qnn_tensorid(tensor) -#define QNN_TENSOR_GET_NAME(tensor) get_qnn_tensorname(tensor) -#define QNN_TENSOR_GET_TYPE(tensor) get_qnn_tensortype(tensor) -#define QNN_TENSOR_GET_DATA_FORMAT(tensor) get_qnn_tensor_dataformat(tensor) -#define QNN_TENSOR_GET_DATA_TYPE(tensor) get_qnn_tensor_datatype(tensor) -#define QNN_TENSOR_GET_QUANT_PARAMS(tensor) get_qnn_tensor_quantparams(tensor) -#define QNN_TENSOR_GET_RANK(tensor) get_qnn_tensor_rank(tensor) -#define QNN_TENSOR_GET_DIMENSIONS(tensor) get_qnn_tensor_dimensions(tensor) -#define QNN_TENSOR_GET_MEM_TYPE(tensor) get_qnn_tensor_memtype(tensor) - -#define QNN_TENSOR_SET_ID(tensor, value) set_qnn_tensor_id(tensor, value) -#define QNN_TENSOR_SET_NAME(tensor, value) set_qnn_tensor_name(tensor, value) -#define QNN_TENSOR_SET_TYPE(tensor, value) set_qnn_tensor_type(tensor, value) -#define QNN_TENSOR_SET_DATA_FORMAT(tensor, value) \ - set_qnn_tensor_dataformat(tensor, value) -#define QNN_TENSOR_SET_DATA_TYPE(tensor, value) \ - set_qnn_tensor_datatype(tensor, value) -#define QNN_TENSOR_SET_QUANT_PARAMS(tensor, value) \ - set_qnn_tensor_quantparams(tensor, value) -#define QNN_TENSOR_SET_RANK(tensor, value) set_qnn_tensor_rank(tensor, value) -#define QNN_TENSOR_SET_DIMENSIONS(tensor, value) \ - set_qnn_tensor_dimensions(tensor, value) -#define QNN_TENSOR_SET_MEM_TYPE(tensor, value) \ - set_qnn_tensor_memtype(tensor, value) -#define QNN_TENSOR_SET_CLIENT_BUF(tensor, value) \ - set_qnn_tensor_clientbuf(tensor, value) -#define QNN_TENSOR_SET_MEM_HANDLE(tensor, value) \ - set_qnn_tensor_memhandle(tensor, value) - -using pfn_rpc_mem_init = void (*)(void); -using pfn_rpc_mem_deinit = void (*)(void); -using pfn_rpc_mem_alloc = void *(*) (int, uint32_t, int); -using pfn_rpc_mem_free = void (*)(void *); -using pfn_rpc_mem_to_fd = int (*)(void *); - -using _pfn_QnnSaver_initialize = decltype(QnnSaver_initialize); -using _pfn_QnnInterface_getProviders = decltype(QnnInterface_getProviders); -using _pfn_QnnSystemInterface_getProviders = decltype(QnnSystemInterface_getProviders); - -enum class ggml_qnn_profile_level { - profile_off = 0, - profile_basic = 1, - profile_detail = 2 -}; +typedef void (*ggml_qnn_func_common_t)(ggml_backend_qnn_context * ctx, + const ggml_op ggml_op, + const ggml_tensor * src0, + const ggml_tensor * src1, + ggml_tensor * dst); enum qcom_htp_arch { NONE = 0, @@ -169,9 +106,36 @@ enum qcom_chipset { }; struct qcom_socinfo { - int soc_model; - int htp_arch; - int vtcm_size_in_mb; + uint32_t soc_model; + size_t htp_arch; + size_t vtcm_size_in_mb; +}; + +static struct qcom_socinfo g_qnn_soc_info_table[] = { + /* Qualcomm SnapDragon 8 Gen 1 */ + [SM8450] = { + .soc_model = SM8450, + .htp_arch = V69, + .vtcm_size_in_mb = 8}, + + /* Qualcomm SnapDragon 8 Gen 1+ */ + [SM8475] = { + .soc_model = SM8475, + .htp_arch = V69, + .vtcm_size_in_mb = 8}, + + /* Qualcomm SnapDragon 8 Gen 2 */ + [SM8550] = { + .soc_model = SM8550, + .htp_arch = V73, + .vtcm_size_in_mb = 8}, + + /* Qualcomm SnapDragon 8 Gen 3 */ + [SM8650] = { + .soc_model = SM8650, + .htp_arch = V75, + .vtcm_size_in_mb = 
8}, + }; struct ggml_backend_qnn_context { @@ -183,19 +147,9 @@ struct ggml_backend_qnn_context { struct ggml_backend * backend; QNN_INTERFACE_VER_TYPE raw_interface; QNN_SYSTEM_INTERFACE_VER_TYPE raw_system_interface; + struct qcom_socinfo socinfo; }; -typedef void (*ggml_qnn_func_t)(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, - const ggml_tensor * src1, - ggml_tensor * dst); - -typedef void (*ggml_qnn_func_common_t)(ggml_backend_qnn_context * ctx, - const ggml_op ggml_op, - const ggml_tensor * src0, - const ggml_tensor * src1, - ggml_tensor * dst); - // according to the QNN SDK Reference Guide, // CPU - Choose a non-quantized model.Quantized models are currently incompatible with the CPU backend // GPU - Choose a non-quantized model.Quantized models are currently incompatible with the GPU backend @@ -217,7 +171,8 @@ static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { .instance = nullptr, .backend = nullptr, .raw_interface = {}, - .raw_system_interface = {}}, + .raw_system_interface = {}, + .socinfo = {}}, [QNN_BACKEND_GPU] = {.device = 1, .threads = 1, @@ -226,7 +181,8 @@ static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { .instance = nullptr, .backend = nullptr, .raw_interface = {}, - .raw_system_interface = {}}, + .raw_system_interface = {}, + .socinfo = {}}, [QNN_BACKEND_NPU] = {.device = 2, .threads = 1, @@ -235,128 +191,425 @@ static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { .instance = nullptr, .backend = nullptr, .raw_interface = {}, - .raw_system_interface = {}}, + .raw_system_interface = {}, + .socinfo = {}}, }; -static struct qcom_socinfo g_qnn_soc_info_table[] = { - /* Qualcomm SnapDragon 8 Gen 1 */ - [SM8450] = {.soc_model = SM8450, - .htp_arch = V69, - .vtcm_size_in_mb = 8}, +struct ggml_backend_qnn_buffer_context { + ggml_backend_qnn_buffer_context(size_t device) + : device(device) + , name(QNN_BACKEND_NAME + std::to_string(device)) {} - /* Qualcomm SnapDragon 8 Gen 1+ */ - [SM8475] = {.soc_model = SM8475, - .htp_arch = V69, - .vtcm_size_in_mb = 8}, + ~ggml_backend_qnn_buffer_context() { + if (buffer) { + free(buffer); + } - /* Qualcomm SnapDragon 8 Gen 2 */ - [SM8550] = {.soc_model = SM8550, - .htp_arch = V73, - .vtcm_size_in_mb = 8}, + for (auto * sub_buffer : sub_buffers) { + free(sub_buffer); + } - /* Qualcomm SnapDragon 8 Gen 3 */ - [SM8650] = {.soc_model = SM8650, - .htp_arch = V75, - .vtcm_size_in_mb = 8}, + for (auto * qnn_tensor : qnn_tensors) { + free_qnn_tensor(*qnn_tensor); + free(qnn_tensor); + } + + sub_buffers.clear(); + qnn_tensors.clear(); + } + void * buffer = nullptr; + + struct ggml_backend_qnn_context * backend_ctx = nullptr; + + size_t buffer_size = 0; + std::vector sub_buffers; + std::vector qnn_tensors; + size_t device; + std::string name; +}; +struct ggml_backend_qnn_buffer_type_context { + size_t device; + std::string name; }; // ================================================================================================= // -// QNN helper functions and other internal helper functions +// QNN backend internal log function // // ================================================================================================= -static inline int validate_tensor_version(Qnn_Tensor_t tensor) { - if (tensor.version != QNN_TENSOR_VERSION_1) { - QNN_LOG_WARN( - "validate_tensor_version() tensor %s, got unsupported version %d\n", - tensor.v1.name, tensor.version); - return 1; +static void qnn_internal_log(ggml_log_level level, const char * file, + const char * func, 
int line, + const char * format, ...); +#define QNN_LOG_ERROR(...) \ + qnn_internal_log(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) + +#define QNN_LOG_WARN(...) \ + qnn_internal_log(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) + +#define QNN_LOG_INFO(...) \ + qnn_internal_log(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) + +#if ENABLE_QNNBACKEND_DEBUG +#define QNN_LOG_DEBUG(...) \ + qnn_internal_log(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#else +#define QNN_LOG_DEBUG(...) +#endif + +// ================================================================================================= +// +// QNN backend internal helper functions +// +// ================================================================================================= +static uint32_t qnn_get_ggml_tensor_rank(const ggml_tensor * tensor) { + uint32_t rank = 0; + for (int i = 0; i < GGML_MAX_DIMS; i++) { + if ((0 != tensor->ne[i]) && (1 != tensor->ne[i])) { + rank++; + } } - return 0; + return rank; } -static inline uint32_t get_qnn_tensorid(const Qnn_Tensor_t & tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.id; +// TODO: mapping more ggml data type to QNN data type +// ref:explanation of k-quants, https://github.com/ggerganov/llama.cpp/pull/1684 +static Qnn_DataType_t qnn_datatype_from_ggml_datatype(enum ggml_type ggmltype) { + switch (ggmltype) { + case GGML_TYPE_F16: + return QNN_DATATYPE_FLOAT_16; + case GGML_TYPE_F32: + return QNN_DATATYPE_FLOAT_32; + case GGML_TYPE_I8: + return QNN_DATATYPE_INT_8; + case GGML_TYPE_Q8_0: + return QNN_DATATYPE_SFIXED_POINT_8; + case GGML_TYPE_Q4_0: + return QNN_DATATYPE_SFIXED_POINT_4; + default: + break; } - - return 0u; + return QNN_DATATYPE_UNDEFINED; } -static inline const char * get_qnn_tensorname(const Qnn_Tensor_t & tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.name; +// TODO: only support GGML_OP_ADD/GGML_OP_MUL/GGML_OP_MUL_MAT +static const char * qnn_opname_from_ggmlop(enum ggml_op ggmlop) { + switch (ggmlop) { + case GGML_OP_ADD: + return QNN_OP_ELEMENT_WISE_ADD; + case GGML_OP_MUL: + return QNN_OP_ELEMENT_WISE_MULTIPLY; + case GGML_OP_MUL_MAT: + return QNN_OP_MAT_MUL; + default: + break; } return nullptr; } -static inline Qnn_TensorType_t get_qnn_tensortype(const Qnn_Tensor_t & tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.type; +static uint32_t qnn_get_ggml_tensor_data_size(const ggml_tensor * tensor) { + /* + size_t data_size = ggml_row_size(tensor->type, tensor->ne[0]); + size_t n_dims = qnn_get_ggml_tensor_rank(tensor); + for (int i = 1; i < n_dims; i++) { + data_size *= tensor->ne[i]; } - return QNN_TENSOR_TYPE_UNDEFINED; + + return data_size; + */ + return ggml_nbytes(tensor); } -static inline Qnn_TensorDataFormat_t - get_qnn_tensor_dataformat(const Qnn_Tensor_t & tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.dataFormat; +static const char * qnn_get_backend_name(int n_backend_type) { + switch (n_backend_type) { + case QNN_BACKEND_CPU: + return "QNN-CPU"; + case QNN_BACKEND_GPU: + return "QNN-GPU"; + case QNN_BACKEND_NPU: + return "QNN-NPU"; + case QNN_BACKEND_GGML: + return "ggml"; //"fake" QNN backend, used for compare performance between QNN backend and original GGML + default: + return "unknown"; } - return QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER; } -static inline Qnn_DataType_t - get_qnn_tensor_datatype(const Qnn_Tensor_t & tensor) { - if 
(tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.dataType; +static const char * qnn_get_chipset_desc(uint32_t chipset_id) { + switch (chipset_id) { + case SM8450: + return "SM8450"; + case SM8475: + return "SM8475"; + case SM8550: + return "SM8550"; + case SM8650: + return "SM8650"; + default: + return "unknown"; } - return QNN_DATATYPE_UNDEFINED; } -static inline Qnn_QuantizeParams_t - get_qnn_tensor_quantparams(const Qnn_Tensor_t & tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.quantizeParams; +static const char * qnn_get_htparch_desc(size_t htp_arch) { + switch (htp_arch) { + case V68: + return "QCOM_HTP_V68"; + case V69: + return "QCOM_HTP_V69"; + case V73: + return "QCOM_HTP_V73"; + case V75: + return "QCOM_HTP_V75"; + default: + return "unknown"; } - return QNN_QUANTIZE_PARAMS_INIT; } -static inline uint32_t get_qnn_tensor_rank(const Qnn_Tensor_t & tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.rank; +static void qnn_internal_log(ggml_log_level level, const char * file, + const char * func, int line, + const char * format, ...) { + static std::mutex qnn_internal_log_mutex; + static char s_qnn_internal_log_buf[QNN_LOGBUF_LEN]; + + { + std::lock_guard lock(qnn_internal_log_mutex); + va_list args; + + va_start(args, format); + int len_prefix = + snprintf(s_qnn_internal_log_buf, QNN_LOGBUF_LEN, + "[%s, %d]: ", func, line); + int len = vsnprintf(s_qnn_internal_log_buf + len_prefix, + QNN_LOGBUF_LEN - len_prefix, format, args); + if (len < (QNN_LOGBUF_LEN - len_prefix)) { +#if (defined __ANDROID__) || (defined ANDROID) + // for Android APK + __android_log_print(level, "ggml-qnn", "%s\n", s_qnn_internal_log_buf); +#endif + // for Android command line application or WoA(Windows on ARM) + printf("%s\n", s_qnn_internal_log_buf); + } + va_end(args); } - return 0u; } -static inline uint32_t * get_qnn_tensor_dimensions(const Qnn_Tensor_t & tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.dimensions; + +static bool qnn_is_valid_params(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, + const ggml_tensor * src1, ggml_tensor * dst) { + if ((nullptr == ctx) || (nullptr == src0) || (nullptr == src1) || (nullptr == dst)) { + QNN_LOG_WARN("invalid params\n"); + return false; } - return nullptr; -} -static inline Qnn_TensorMemType_t get_qnn_tensor_memtype(const Qnn_Tensor_t & tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.memType; + qnn_instance * instance = nullptr; + Qnn_Tensor_t * tensor_0 = nullptr; + Qnn_Tensor_t * tensor_1 = nullptr; + Qnn_Tensor_t * tensor_2 = nullptr; + tensor_0 = (Qnn_Tensor_t *) src0->extra; + tensor_1 = (Qnn_Tensor_t *) src1->extra; + tensor_2 = (Qnn_Tensor_t *) dst->extra; + instance = ctx->instance; + if ((nullptr == instance) || (nullptr == tensor_0) || (nullptr == tensor_1) || (nullptr == tensor_2)) { + QNN_LOG_WARN("invalid params\n"); + return false; } - return QNN_TENSORMEMTYPE_UNDEFINED; + + return true; } -static inline void set_qnn_tensor_id(Qnn_Tensor_t & tensor, uint32_t id) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.id = id; +#define CHECK_PARAMS(ctx, src0, src1, dst) \ + do { \ + if (!qnn_is_valid_params((ctx), (src0), (src1), (dst))) { \ + return; \ + } \ + } while (0) + +#if ENABLE_QNNBACKEND_PERF +class qnn_perf { +public: + qnn_perf(const std::string & perf_name) : _perf_name(std::move(perf_name)) {}; + qnn_perf() = delete; + qnn_perf(const qnn_perf & ) = delete; + qnn_perf & operator= (const qnn_perf & 
) = delete; + + void start() { + _begin_time = ggml_time_us(); } -} -static inline void set_qnn_tensor_name(Qnn_Tensor_t & tensor, const char * name) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.name = name; + void info() { + _end_time = ggml_time_us(); + _duration = (_end_time - _begin_time) / 1000; + QNN_LOG_DEBUG("duration of %s : %lld milliseconds\n", _perf_name.c_str(), _duration); } -} -static inline void set_qnn_tensor_type(Qnn_Tensor_t & tensor, Qnn_TensorType_t type) { - if (tensor.version == QNN_TENSOR_VERSION_1) { +private: + int64_t _begin_time = 0LL; + int64_t _end_time = 0LL; + int64_t _duration = 0LL; + std::string _perf_name; +}; +#else +class qnn_perf { +public: + qnn_perf(const std::string & perf_name) {} + qnn_perf() = delete; + qnn_perf(const qnn_perf & ) = delete; + qnn_perf & operator= (const qnn_perf & ) = delete; + + void start() {} + void info() {} +}; +#endif + +// ================================================================================================= +// +// helper data type / data structure / macros / functions of +// Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK +// ref:https://github.com/pytorch/executorch/tree/main/backends/qualcomm +// ================================================================================================= +enum qnn_sdk_profile_level { + profile_off = 0, + profile_basic = 1, + profile_detail = 2 +}; + +using _pfn_rpc_mem_init = void (*)(void); +using _pfn_rpc_mem_deinit = void (*)(void); +using _pfn_rpc_mem_alloc = void *(*) (int, uint32_t, int); +using _pfn_rpc_mem_free = void (*)(void *); +using _pfn_rpc_mem_to_fd = int (*)(void *); + +using _pfn_qnnsaver_initialize = decltype(QnnSaver_initialize); +using _pfn_qnninterface_getproviders = decltype(QnnInterface_getProviders); +using _pfn_qnnsysteminterface_getproviders = decltype(QnnSystemInterface_getProviders); + +#define QNN_VER_PTR(x) (&((x).v1)) +#define RPCMEM_DEFAULT_FLAGS 1 +#define RPCMEM_HEAP_ID_SYSTEM 25 + +#define VALIDATE(value, status) \ + do { \ + status = value; \ + if (status != QNN_SUCCESS) { \ + QNN_LOG_WARN("%s expected QNN_SUCCESS\n", #value); \ + return status; \ + } \ + } while (0) + +#define QNN_TENSOR_GET_ID(tensor) get_qnn_tensorid(tensor) +#define QNN_TENSOR_GET_NAME(tensor) get_qnn_tensorname(tensor) +#define QNN_TENSOR_GET_TYPE(tensor) get_qnn_tensortype(tensor) +#define QNN_TENSOR_GET_DATA_FORMAT(tensor) get_qnn_tensor_dataformat(tensor) +#define QNN_TENSOR_GET_DATA_TYPE(tensor) get_qnn_tensor_datatype(tensor) +#define QNN_TENSOR_GET_QUANT_PARAMS(tensor) get_qnn_tensor_quantparams(tensor) +#define QNN_TENSOR_GET_RANK(tensor) get_qnn_tensor_rank(tensor) +#define QNN_TENSOR_GET_DIMENSIONS(tensor) get_qnn_tensor_dimensions(tensor) +#define QNN_TENSOR_GET_MEM_TYPE(tensor) get_qnn_tensor_memtype(tensor) + +#define QNN_TENSOR_SET_ID(tensor, value) set_qnn_tensor_id(tensor, value) +#define QNN_TENSOR_SET_NAME(tensor, value) set_qnn_tensor_name(tensor, value) +#define QNN_TENSOR_SET_TYPE(tensor, value) set_qnn_tensor_type(tensor, value) +#define QNN_TENSOR_SET_DATA_FORMAT(tensor, value) set_qnn_tensor_dataformat(tensor, value) +#define QNN_TENSOR_SET_DATA_TYPE(tensor, value) set_qnn_tensor_datatype(tensor, value) +#define QNN_TENSOR_SET_QUANT_PARAMS(tensor, value) set_qnn_tensor_quantparams(tensor, value) +#define QNN_TENSOR_SET_RANK(tensor, value) set_qnn_tensor_rank(tensor, value) +#define QNN_TENSOR_SET_DIMENSIONS(tensor, value) set_qnn_tensor_dimensions(tensor, value) +#define 
QNN_TENSOR_SET_MEM_TYPE(tensor, value) set_qnn_tensor_memtype(tensor, value) +#define QNN_TENSOR_SET_CLIENT_BUF(tensor, value) set_qnn_tensor_clientbuf(tensor, value) +#define QNN_TENSOR_SET_MEM_HANDLE(tensor, value) set_qnn_tensor_memhandle(tensor, value) +#define VALIDATE_TENSOR_VERSION(tensor, err) VALIDATE(validate_tensor_version(tensor), err) + +static inline int validate_tensor_version(Qnn_Tensor_t tensor) { + if (tensor.version != QNN_TENSOR_VERSION_1) { + QNN_LOG_WARN( + "validate_tensor_version() tensor %s, got unsupported version %d\n", + tensor.v1.name, tensor.version); + return 1; + } + return 0; +} + +static inline uint32_t get_qnn_tensorid(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.id; + } + + return 0u; +} + +static inline const char * get_qnn_tensorname(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.name; + } + return nullptr; +} + +static inline Qnn_TensorType_t get_qnn_tensortype(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.type; + } + return QNN_TENSOR_TYPE_UNDEFINED; +} + +static inline Qnn_TensorDataFormat_t + get_qnn_tensor_dataformat(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.dataFormat; + } + return QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER; +} + +static inline Qnn_DataType_t + get_qnn_tensor_datatype(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.dataType; + } + return QNN_DATATYPE_UNDEFINED; +} + +static inline Qnn_QuantizeParams_t + get_qnn_tensor_quantparams(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.quantizeParams; + } + return QNN_QUANTIZE_PARAMS_INIT; +} + +static inline uint32_t get_qnn_tensor_rank(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.rank; + } + return 0u; +} + +static inline uint32_t * get_qnn_tensor_dimensions(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.dimensions; + } + return nullptr; +} + +static inline Qnn_TensorMemType_t get_qnn_tensor_memtype(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.memType; + } + return QNN_TENSORMEMTYPE_UNDEFINED; +} + +static inline void set_qnn_tensor_id(Qnn_Tensor_t & tensor, uint32_t id) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.id = id; + } +} + +static inline void set_qnn_tensor_name(Qnn_Tensor_t & tensor, const char * name) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.name = name; + } +} + +static inline void set_qnn_tensor_type(Qnn_Tensor_t & tensor, Qnn_TensorType_t type) { + if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.type = type; } } @@ -419,18 +672,13 @@ static size_t memscpy(void * dst, size_t dst_size, const void * src, size_t copy return min_size; } -static char * ggml_qnn_strndup(const char * source, size_t maxlen) { - return ::strndup(source, maxlen); -} - static int deep_copy_qnn_tensors(Qnn_Tensor_t & src, Qnn_Tensor_t & dst) { int err = 0; VALIDATE_TENSOR_VERSION(src, err); dst.version = src.version; QNN_TENSOR_SET_NAME( - dst, ggml_qnn_strndup(QNN_TENSOR_GET_NAME(src), - std::string(QNN_TENSOR_GET_NAME(src)).size())); + dst, ::strndup(QNN_TENSOR_GET_NAME(src),std::string(QNN_TENSOR_GET_NAME(src)).size())); if (nullptr == QNN_TENSOR_GET_NAME(dst)) { return 1; } @@ -508,140 +756,61 @@ 
static int free_qnn_tensor(Qnn_Tensor_t & tensor) { return err; } -static uint32_t ggml_get_tensor_rank(const ggml_tensor * tensor) { - uint32_t rank = 0; - for (int i = 0; i < GGML_MAX_DIMS; i++) { - if ((0 != tensor->ne[i]) && (1 != tensor->ne[i])) { - rank++; - } - } - return rank; -} - -// TODO: mapping more ggml data type to QNN data type -// ref:explanation of k-quants, https://github.com/ggerganov/llama.cpp/pull/1684 -static Qnn_DataType_t qnn_datatype_from_ggml_datatype(enum ggml_type ggmltype) { - switch (ggmltype) { - case GGML_TYPE_F16: - return QNN_DATATYPE_FLOAT_16; - case GGML_TYPE_F32: - return QNN_DATATYPE_FLOAT_32; - case GGML_TYPE_I8: - return QNN_DATATYPE_INT_8; - case GGML_TYPE_Q8_0: - return QNN_DATATYPE_SFIXED_POINT_8; - case GGML_TYPE_Q4_0: - return QNN_DATATYPE_SFIXED_POINT_4; - default: - break; - } - return QNN_DATATYPE_UNDEFINED; -} - -// TODO: only support GGML_OP_ADD/GGML_OP_MUL/GGML_OP_MUL_MAT -static const char * qnn_opname_from_ggmlop(enum ggml_op ggmlop) { - switch (ggmlop) { - case GGML_OP_ADD: - return QNN_OP_ELEMENT_WISE_ADD; - case GGML_OP_MUL: - return QNN_OP_ELEMENT_WISE_MULTIPLY; - case GGML_OP_MUL_MAT: - return QNN_OP_MAT_MUL; - default: - break; - } - - return nullptr; -} - -static uint32_t ggml_get_tensor_data_size(const ggml_tensor * tensor) { - /* - size_t data_size = ggml_row_size(tensor->type, tensor->ne[0]); - size_t n_dims = ggml_get_tensor_rank(tensor); - for (int i = 1; i < n_dims; i++) { - data_size *= tensor->ne[i]; - } - - return data_size; - */ - return ggml_nbytes(tensor); -} - -template -Fn load_qnn_functionpointers(void * handle, const char * function_name) { +template Fn load_qnn_functionpointers(void * handle, const char * function_name) { return reinterpret_cast(dlsym(handle, function_name)); } -static const char * get_qnn_backend_name(int n_backend_type) { - switch (n_backend_type) { - case QNN_BACKEND_CPU: - return "QNN-CPU"; - case QNN_BACKEND_GPU: - return "QNN-GPU"; - case QNN_BACKEND_NPU: - return "QNN-NPU"; - case QNN_BACKEND_GGML: - return "ggml"; //"fake" QNN backend, used for compare performance between QNN backend and original GGML - default: - return "unknown"; - } -} - -static const char * qnn_get_chipset_desc(uint32_t chipset_id) { - switch (chipset_id) { - case SM8450: - return "SM8450"; - case SM8475: - return "SM8475"; - case SM8550: - return "SM8550"; - case SM8650: - return "SM8650"; - default: - return "unknown"; - } -} - static intptr_t align_to(size_t alignment, intptr_t offset) { return offset % alignment == 0 - ? offset - : offset + (static_cast(alignment) - - offset % static_cast(alignment)); + ? offset + : offset + (static_cast(alignment) - + offset % static_cast(alignment)); } -static void ggml_qnn_log_internal(ggml_log_level level, const char * file, - const char * func, int line, - const char * format, ...) 
{ - static std::mutex ggml_qnn_log_internal_mutex; - static char s_ggml_qnn_log_internal_buf[GGML_QNN_LOGBUF_LEN]; +static void qnn_sdk_logcallback(const char * fmt, QnnLog_Level_t level, + uint64_t timestamp, va_list argp) { + +#if ENABLE_QNNSDK_LOG + static std::mutex log_mutex; + static unsigned char s_ggml_qnn_logbuf[QNN_LOGBUF_LEN]; + + const char * log_level_desc = ""; + switch (level) { + case QNN_LOG_LEVEL_ERROR: + log_level_desc = "ERROR"; + break; + case QNN_LOG_LEVEL_WARN: + log_level_desc = "WARNING"; + break; + case QNN_LOG_LEVEL_INFO: + log_level_desc = "INFO"; + break; + case QNN_LOG_LEVEL_DEBUG: + log_level_desc = "DEBUG"; + break; + case QNN_LOG_LEVEL_VERBOSE: + log_level_desc = "VERBOSE"; + break; + case QNN_LOG_LEVEL_MAX: + log_level_desc = "UNKNOWN"; + break; + } + double ms = (double) timestamp / 1000000.0; { - std::lock_guard lock(ggml_qnn_log_internal_mutex); - va_list args; + std::lock_guard lock(log_mutex); - va_start(args, format); - int len_prefix = - snprintf(s_ggml_qnn_log_internal_buf, GGML_QNN_LOGBUF_LEN, - "[%s, %d]: ", func, line); - int len = vsnprintf(s_ggml_qnn_log_internal_buf + len_prefix, - GGML_QNN_LOGBUF_LEN - len_prefix, format, args); - if (len < (GGML_QNN_LOGBUF_LEN - len_prefix)) { -#if (defined __ANDROID__) || (defined ANDROID) - // for Android APK - __android_log_print(level, "ggml-qnn", "%s\n", s_ggml_qnn_log_internal_buf); -#endif - // for Android command line application or WoA - printf("%s\n", s_ggml_qnn_log_internal_buf); - } - va_end(args); + memset(s_ggml_qnn_logbuf, 0, QNN_LOGBUF_LEN); + vsnprintf(reinterpret_cast(s_ggml_qnn_logbuf), QNN_LOGBUF_LEN, fmt, argp); + QNN_LOG_DEBUG("%8.1fms [%-7s] %s\n", ms, log_level_desc, s_ggml_qnn_logbuf); } +#endif } // ================================================================================================= // -// wrapper class of Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI -// Engine Direct) SDK -// +// wrapper class of Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK +// ref:https://github.com/pytorch/executorch/tree/main/backends/qualcomm // ================================================================================================= class qnn_interface { @@ -778,11 +947,6 @@ class qnn_interface { const QnnSystemInterface_t * _qnn_sys_interface = nullptr; }; -// ================================================================================================= -// -// wrapper class of Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK -// -// ================================================================================================= class qnn_instance { public: using BackendIdType = decltype(QnnInterface_t{}.backendId); @@ -796,44 +960,354 @@ class qnn_instance { ~qnn_instance() {} - int qnn_init(const QnnSaver_Config_t ** saver_config); + int qnn_init(const QnnSaver_Config_t ** saver_config) { + BackendIdType backend_id = QNN_BACKEND_ID_NULL; + QNN_LOG_DEBUG("enter qni_init\n"); - int qnn_finalize(); + std::lock_guard lock(_init_mutex); - const qnn_interface & get_qnn_interface() { - if (!_qnn_interface.is_loaded()) { - QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + if (0 != load_system()) { + QNN_LOG_WARN("can not load QNN system lib, pls check why?\n"); + return 1; + } else { + QNN_LOG_DEBUG("load QNN system lib successfully\n"); } - return _qnn_interface; - } - const QNN_INTERFACE_VER_TYPE & get_qnn_raw_interface() { - if (!_qnn_interface.is_loaded()) { - QNN_LOG_WARN("pls check why _qnn_interface is not 
loaded\n"); + std::string backend_lib_path = _lib_path + _backend_name; + if (0 == _lib_path_to_backend_id.count(backend_lib_path)) { + int is_load_ok = load_backend(backend_lib_path, saver_config); + if (0 != is_load_ok) { + QNN_LOG_WARN("failed to load QNN backend\n"); + return 2; + } } - return _qnn_raw_interface; - } - const QNN_SYSTEM_INTERFACE_VER_TYPE & get_qnn_raw_system_interface() { - if (!_qnn_interface.is_loaded()) { - QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + backend_id = _lib_path_to_backend_id[backend_lib_path]; + if (0 == _loaded_backend.count(backend_id) || + 0 == _loaded_lib_handle.count(backend_id)) { + QNN_LOG_WARN("library %s is loaded but loaded backend count=%zu, " + "loaded lib_handle count=%zu\n", + backend_lib_path.c_str(), _loaded_backend.count(backend_id), + _loaded_lib_handle.count(backend_id)); + return 3; } - return _qnn_raw_system_interface; - } - const Qnn_LogHandle_t get_qnn_log_handle() { return _qnn_log_handle; } + _qnn_interface.set_qnn_interface(_loaded_backend[backend_id]); - const Qnn_ProfileHandle_t get_qnn_profile_handle() { - return _qnn_profile_handle; - } + _qnn_interface.qnn_log_create(qnn_sdk_logcallback, _qnn_log_level, + &_qnn_log_handle); + if (nullptr == _qnn_log_handle) { + QNN_LOG_WARN( + "why failed to initialize qnn log\n"); // NPU backend not work on + // Qualcomm SoC equipped low-end phone + return 4; + } else { + QNN_LOG_DEBUG("initialize qnn log successfully\n"); + } - const Qnn_DeviceHandle_t get_qnn_device_handle() { - return _qnn_device_handle; - } + std::vector temp_backend_config; + _qnn_interface.qnn_backend_create( + _qnn_log_handle, + temp_backend_config.empty() ? nullptr : temp_backend_config.data(), + &_qnn_backend_handle); + if (nullptr == _qnn_backend_handle) { + QNN_LOG_WARN("why failed to initialize qnn backend\n"); + return 5; + } else { + QNN_LOG_DEBUG("initialize qnn backend successfully\n"); + } - const Qnn_BackendHandle_t get_qnn_backend_handle() { - return _qnn_backend_handle; - } + if (nullptr != _qnn_raw_interface.propertyHasCapability) { + auto qnnStatus = + _qnn_raw_interface.propertyHasCapability(QNN_PROPERTY_GROUP_DEVICE); + if (QNN_PROPERTY_NOT_SUPPORTED == qnnStatus) { + QNN_LOG_WARN("device property is not supported\n"); + } + if (QNN_PROPERTY_ERROR_UNKNOWN_KEY == qnnStatus) { + QNN_LOG_WARN("device property is not known to backend\n"); + } + } + + Qnn_ErrorHandle_t qnn_status = _qnn_raw_interface.deviceCreate(_qnn_log_handle, nullptr, + &_qnn_device_handle); + if (QNN_SUCCESS != qnn_status && + QNN_DEVICE_ERROR_UNSUPPORTED_FEATURE != qnn_status) { + QNN_LOG_WARN("failed to create QNN device\n"); + } else { + QNN_LOG_INFO("create device successfully\n"); + } + + if (qnn_sdk_profile_level::profile_off != _profile_level) { + QNN_LOG_INFO("profiling turned on; level = %d", _profile_level); + if (qnn_sdk_profile_level::profile_basic == _profile_level) { + QNN_LOG_INFO("basic profiling requested. creating Qnn Profile object\n"); + if (QNN_PROFILE_NO_ERROR != + _qnn_raw_interface.profileCreate(_qnn_backend_handle, + QNN_PROFILE_LEVEL_BASIC, + &_qnn_profile_handle)) { + QNN_LOG_WARN("unable to create profile handle in the backend\n"); + return 6; + } else { + QNN_LOG_DEBUG("initialize qnn profile successfully\n"); + } + } else if (qnn_sdk_profile_level::profile_detail == _profile_level) { + QNN_LOG_INFO("detailed profiling requested. 
Creating Qnn Profile object\n"); + if (QNN_PROFILE_NO_ERROR != + _qnn_raw_interface.profileCreate(_qnn_backend_handle, + QNN_PROFILE_LEVEL_DETAILED, + &_qnn_profile_handle)) { + QNN_LOG_WARN("unable to create profile handle in the backend\n"); + return 7; + } else { + QNN_LOG_DEBUG("initialize qnn profile successfully\n"); + } + } + } + + _rpc_lib_handle = dlopen("libcdsprpc.so", RTLD_NOW | RTLD_LOCAL); + if (nullptr == _rpc_lib_handle) { + QNN_LOG_WARN("failed to load qualcomm's rpc lib, error:%s\n", dlerror()); + return 8; + } else { + QNN_LOG_DEBUG("load rpcmem lib successfully\n"); + set_rpcmem_initialized(true); + } + __pfn_rpc_mem_init = reinterpret_cast<_pfn_rpc_mem_init>( + dlsym(_rpc_lib_handle, "rpcmem_init")); + __pfn_rpc_mem_deinit = reinterpret_cast<_pfn_rpc_mem_deinit>( + dlsym(_rpc_lib_handle, "rpcmem_deinit")); + __pfn_rpc_mem_alloc = reinterpret_cast<_pfn_rpc_mem_alloc>( + dlsym(_rpc_lib_handle, "rpcmem_alloc")); + __pfn_rpc_mem_free = reinterpret_cast<_pfn_rpc_mem_free>( + dlsym(_rpc_lib_handle, "rpcmem_free")); + __pfn_rpc_mem_to_fd = reinterpret_cast<_pfn_rpc_mem_to_fd>( + dlsym(_rpc_lib_handle, "rpcmem_to_fd")); + if (nullptr == __pfn_rpc_mem_alloc || nullptr == __pfn_rpc_mem_free || + nullptr == __pfn_rpc_mem_to_fd) { + QNN_LOG_WARN("unable to access symbols in QNN RPC lib. dlerror(): %s", dlerror()); + dlclose(_rpc_lib_handle); + return 9; + } + + if (nullptr != + __pfn_rpc_mem_init) // make Qualcomm's SoC equipped low-end phone happy + __pfn_rpc_mem_init(); + + std::vector temp_context_config; + _qnn_interface.qnn_context_create( + _qnn_backend_handle, _qnn_device_handle, + temp_context_config.empty() ? nullptr : temp_context_config.data(), + &_qnn_context_handle); + if (nullptr == _qnn_context_handle) { + QNN_LOG_WARN("why failed to initialize qnn context\n"); + return 10; + } else { + QNN_LOG_DEBUG("initialize qnn context successfully\n"); + } + + if (_backend_name.find("Htp") != std::variant_npos) { + const QnnDevice_PlatformInfo_t * p_info = nullptr; + _qnn_raw_interface.deviceGetPlatformInfo(nullptr, &p_info); + QNN_LOG_INFO("device counts %d", p_info->v1.numHwDevices); + QnnDevice_HardwareDeviceInfo_t * infos = p_info->v1.hwDevices; + for (int i = 0; i < p_info->v1.numHwDevices; i++) { + QNN_LOG_INFO("deviceID:%d, deviceType:%d, numCores %d", infos[i].v1.deviceId, + infos[i].v1.deviceType, infos[i].v1.numCores); + QnnDevice_DeviceInfoExtension_t devinfo = infos[i].v1.deviceInfoExtension; + QnnHtpDevice_OnChipDeviceInfoExtension_t chipinfo = devinfo->onChipDevice; + QnnHtpDevice_Arch_t htp_arch = chipinfo.arch; + QNN_LOG_INFO("htp_type:%d(%s)", devinfo->devType, (devinfo->devType == QNN_HTP_DEVICE_TYPE_ON_CHIP) ? 
"ON_CHIP" : ""); + QNN_LOG_INFO("qualcomm soc_model:%d(%s), htp_arch:%d(%s), vtcm_size:%d MB", \ + chipinfo.socModel, qnn_get_chipset_desc(chipinfo.socModel), \ + htp_arch, qnn_get_htparch_desc(htp_arch), chipinfo.vtcmSize); + g_qnn_mgr[QNN_BACKEND_NPU].socinfo = { chipinfo.socModel, htp_arch, chipinfo.vtcmSize }; + } + _qnn_raw_interface.deviceFreePlatformInfo(nullptr, p_info); + + + //TODO: faster approach to probe the accurate capacity of rpc ion memory + size_t candidate_size = 0; + uint8_t * rpc_buffer = nullptr; + const int SIZE_IN_MB = (1 << 20); + size_t probe_slots[] = {1024, 1536, 2048 - 48, 2048}; + size_t probe_counts = sizeof(probe_slots) / sizeof(size_t); + for (size_t idx = 0; idx < probe_counts; idx++) { + rpc_buffer = static_cast(alloc_rpcmem(probe_slots[idx] * SIZE_IN_MB, 4)); + if (nullptr == rpc_buffer) { + QNN_LOG_INFO("alloc rpcmem %d (MB) failure, %s\n", probe_slots[idx], strerror(errno)); + break; + } else { + candidate_size = probe_slots[idx]; + free_rpcmem(rpc_buffer); + rpc_buffer = nullptr; + } + } + if (candidate_size > _rpcmem_capacity) + _rpcmem_capacity = candidate_size; + QNN_LOG_INFO("capacity of rpc ion memory %d MB\n", _rpcmem_capacity); + } + + QNN_LOG_DEBUG("leave qni_init\n"); + + return 0; + } + + int qnn_finalize() { + int ret_status = 0; + Qnn_ErrorHandle_t error = QNN_SUCCESS; + + if (nullptr != + __pfn_rpc_mem_deinit) // make Qualcomm's SoC equipped low-end phone happy + __pfn_rpc_mem_deinit(); + + if (dlclose(_rpc_lib_handle) != 0) { + QNN_LOG_WARN("failed to unload qualcomm's rpc lib, error:%s\n", dlerror()); + } else { + QNN_LOG_DEBUG("succeed to close rpcmem lib\n"); + } + + if (nullptr != _qnn_context_handle) { + error = _qnn_interface.qnn_context_free(_qnn_context_handle, + _qnn_profile_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN context_handle: ID %u, error %d\n", + _qnn_interface.get_backend_id(), + QNN_GET_ERROR_CODE(error)); + } + _qnn_context_handle = nullptr; + } + + if (nullptr != _qnn_profile_handle) { + error = _qnn_interface.qnn_profile_free(_qnn_profile_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN profile_handle: ID %u, error %d\n", + _qnn_interface.get_backend_id(), + QNN_GET_ERROR_CODE(error)); + } + _qnn_profile_handle = nullptr; + } + + if (nullptr != _qnn_device_handle) { + error = _qnn_interface.qnn_device_free(_qnn_device_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN device_handle: ID %u, error %d\n", + _qnn_interface.get_backend_id(), + QNN_GET_ERROR_CODE(error)); + } + _qnn_device_handle = nullptr; + } + + if (nullptr != _qnn_backend_handle) { + error = _qnn_interface.qnn_backend_free(_qnn_backend_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN backend_handle: ID %u, error %d\n", + _qnn_interface.get_backend_id(), + QNN_GET_ERROR_CODE(error)); + } + _qnn_backend_handle = nullptr; + } + + if (nullptr != _qnn_log_handle) { + error = _qnn_interface.qnn_log_free(_qnn_log_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN log_handle: ID %u, error %d\n", + _qnn_interface.get_backend_id(), + QNN_GET_ERROR_CODE(error)); + } + _qnn_log_handle = nullptr; + } + + unload_backend(); + + unload_system(); + + return ret_status; + } + + int init_qnn_graph(const char * graph_name, bool debug, + uint8_t do_node_validation = true, + const QnnGraph_Config_t ** graph_configs = nullptr) { + int result = 0; + + if (nullptr == graph_name) { + QNN_LOG_WARN("graph name is null\n"); + return 1; + } + + if 
(!_graph_name.empty()) { + QNN_LOG_WARN("qnn model for graph %s already initialized\n", graph_name); + return 2; + } + + if (!do_node_validation) { + QNN_LOG_WARN("node validation disabled, backend will not perform op " + "validation prior to adding node\n"); + } + + _graph_name = graph_name; + _debug_tensor = debug; + _do_node_validations = do_node_validation; + + result = _qnn_raw_interface.graphCreate(_qnn_context_handle, graph_name, + graph_configs, &_qnn_graph_handle); + if (result != QNN_GRAPH_NO_ERROR || nullptr == _qnn_graph_handle) { + QNN_LOG_WARN("failed to create graph in qnn context\n"); + return 3; + } else { + QNN_LOG_INFO("succeed to create graph %s, %p\n", graph_name, _qnn_graph_handle); + } + + return 0; + } + + int finalize_qnn_graph() { + if (nullptr != _qnn_graph_handle) { + if (_qnn_raw_interface.graphFinalize(_qnn_graph_handle, + _qnn_profile_handle, + nullptr) != QNN_GRAPH_NO_ERROR) { + QNN_LOG_WARN("finalizing graph failure\n"); + } + } else { + QNN_LOG_DEBUG("qnn graph handle is null\n"); + } + + return 0; + } + + const qnn_interface & get_qnn_interface() { + if (!_qnn_interface.is_loaded()) { + QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + } + return _qnn_interface; + } + + const QNN_INTERFACE_VER_TYPE & get_qnn_raw_interface() { + if (!_qnn_interface.is_loaded()) { + QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + } + return _qnn_raw_interface; + } + + const QNN_SYSTEM_INTERFACE_VER_TYPE & get_qnn_raw_system_interface() { + if (!_qnn_interface.is_loaded()) { + QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + } + return _qnn_raw_system_interface; + } + + const Qnn_LogHandle_t get_qnn_log_handle() { return _qnn_log_handle; } + + const Qnn_ProfileHandle_t get_qnn_profile_handle() { + return _qnn_profile_handle; + } + + const Qnn_DeviceHandle_t get_qnn_device_handle() { + return _qnn_device_handle; + } + + const Qnn_BackendHandle_t get_qnn_backend_handle() { + return _qnn_backend_handle; + } const Qnn_ContextHandle_t get_qnn_context_handle() { return _qnn_context_handle; @@ -845,12 +1319,6 @@ class qnn_instance { const Qnn_GraphHandle_t get_qnn_graph_handle() { return _qnn_graph_handle; } - int init_qnn_graph(const char * graph_name, bool debug, - uint8_t do_node_validation = 1, - const QnnGraph_Config_t ** graph_configs = nullptr); - - int finalize_qnn_graph(); - int init_htp_perfinfra() { QnnDevice_Infrastructure_t device_infra = nullptr; int error = _qnn_raw_interface.deviceGetInfrastructure(&device_infra); @@ -945,793 +1413,416 @@ class qnn_instance { _rpcmem_initialized = initialized; } - int32_t rpcmem_to_fd(void * buf); - - int register_rpcmem(void * p_data, Qnn_Tensor_t * p_tensor); - - void unregister_rpcmem(); - - void * alloc_rpcmem(size_t bytes, size_t alignment); - - void free_rpcmem(void * buf); - size_t get_rpcmem_capacity() { return _rpcmem_capacity; } - bool is_rpcmem_allocated(void * buf); - bool is_rpcmem_registered(Qnn_MemHandle_t handle) { return _qnn_mem_set.count(handle) != 0U; } - public: - std::map> - _qnn_graph_map; - - private: - int load_system(); - - int unload_system(); - - int load_backend(std::string & lib_path, const QnnSaver_Config_t ** saver_config); - - int unload_backend(); - - void set_qnn_raw_interface(QNN_INTERFACE_VER_TYPE & raw_interface) { - _qnn_raw_interface = raw_interface; - } - - void set_qnn_raw_system_interface(QNN_SYSTEM_INTERFACE_VER_TYPE & raw_interface) { - _qnn_raw_system_interface = raw_interface; - } - - private: - static constexpr const int 
_required_num_providers = 1; - - private: - std::string _lib_path; - std::string _backend_name; - std::string _model_name; // prebuilt QNN model name, not used currently - BackendIdType _backend_id; - - bool _debug_tensor = false; // flag to indicate if requested graph is to be run in debug mode - bool _do_node_validations = true; // flag to indicate whether all add_node - // calls need to be validated - QnnLog_Level_t _qnn_log_level = QNN_LOG_LEVEL_DEBUG; - - ggml_qnn_profile_level _profile_level = ggml_qnn_profile_level::profile_detail; - - qnn_interface _qnn_interface; - - void * _system_lib_handle = nullptr; - - Qnn_GraphHandle_t _qnn_graph_handle = nullptr; - - Qnn_LogHandle_t _qnn_log_handle = nullptr; - - Qnn_ProfileHandle_t _qnn_profile_handle = nullptr; - - Qnn_DeviceHandle_t _qnn_device_handle = nullptr; - - Qnn_BackendHandle_t _qnn_backend_handle = nullptr; - - Qnn_ContextHandle_t _qnn_context_handle = nullptr; - - QnnSystemContext_Handle_t _qnn_system_handle = nullptr; - - QnnHtpDevice_PerfInfrastructure_t * _qnn_htp_perfinfra = nullptr; - uint32_t _qnn_power_configid = 1; - uint32_t _qnn_rpc_pollingtime = 9999; // 0-10000 us for high performing - - QNN_INTERFACE_VER_TYPE _qnn_raw_interface; - QNN_SYSTEM_INTERFACE_VER_TYPE _qnn_raw_system_interface; - - std::unordered_set _qnn_mem_set; - - std::mutex _init_mutex; - std::unordered_map _loaded_lib_handle; - std::unordered_map _lib_path_to_backend_id; - std::unordered_map _loaded_backend; - - void * _rpc_lib_handle = nullptr; - std::atomic_bool _rpcmem_initialized{false}; - pfn_rpc_mem_alloc _pfn_rpc_mem_alloc; - pfn_rpc_mem_free _pfn_rpc_mem_free; - pfn_rpc_mem_to_fd _pfn_rpc_mem_to_fd; - pfn_rpc_mem_init _pfn_rpc_mem_init; - pfn_rpc_mem_deinit _pfn_rpc_mem_deinit; - std::unordered_map _rpcmem_store_map; - size_t _rpcmem_capacity = 512; - - std::string _graph_name; -}; - -// ================================================================================================= -// -// implementation of QNN wrapper class -// -// ================================================================================================= -void * qnn_instance::alloc_rpcmem(size_t bytes, size_t alignment) { - if (!_rpcmem_initialized) { - QNN_LOG_WARN("rpc memory not initialized\n"); - return nullptr; - } - - auto allocate_bytes = static_cast(bytes + alignment); - void * buf = _pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, - allocate_bytes); - if (buf == nullptr) { - QNN_LOG_WARN("failed to allocate rpc memory\n"); - return nullptr; - } - - auto aligned_buf = reinterpret_cast( - align_to(alignment, reinterpret_cast(buf))); - bool status = - _rpcmem_store_map.insert(std::pair(aligned_buf, buf)).second; - if (!status) { - QNN_LOG_WARN("failed to allocate rpc memory\n"); - _pfn_rpc_mem_free(buf); - } - - return aligned_buf; -} - -void qnn_instance::free_rpcmem(void * buf) { - if (!_rpcmem_initialized) { - QNN_LOG_WARN("rpc memory not initialized\n"); - } else if (0 == _rpcmem_store_map.count(buf)) { - QNN_LOG_WARN("no allocated tensor\n"); - } else { - _pfn_rpc_mem_free(_rpcmem_store_map[buf]); - _rpcmem_store_map.erase(buf); - } -} + void * alloc_rpcmem(size_t bytes, size_t alignment) { + if (!_rpcmem_initialized) { + QNN_LOG_WARN("rpc memory not initialized\n"); + return nullptr; + } -int32_t qnn_instance::rpcmem_to_fd(void * buf) { - int32_t mem_fd = -1; - if (!is_rpcmem_initialized()) { - QNN_LOG_WARN("rpc memory not initialized\n"); - } else { - mem_fd = _pfn_rpc_mem_to_fd(buf); - } + auto allocate_bytes = static_cast(bytes + 
alignment); + void * buf = __pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, + allocate_bytes); + if (buf == nullptr) { + QNN_LOG_WARN("failed to allocate rpc memory\n"); + return nullptr; + } - return mem_fd; -} + auto aligned_buf = reinterpret_cast( + align_to(alignment, reinterpret_cast(buf))); + bool status = + _rpcmem_store_map.insert(std::pair(aligned_buf, buf)).second; + if (!status) { + QNN_LOG_WARN("failed to allocate rpc memory\n"); + __pfn_rpc_mem_free(buf); + } -int qnn_instance::register_rpcmem(void * p_data, Qnn_Tensor_t * p_tensor) { - if (nullptr == p_data || (nullptr == p_tensor)) { - QNN_LOG_WARN("invalid param\n"); - return 1; + return aligned_buf; } - if (!is_rpcmem_initialized()) { - QNN_LOG_WARN("rpc memory not initialized\n"); - return 2; + void free_rpcmem(void * buf) { + if (!_rpcmem_initialized) { + QNN_LOG_WARN("rpc memory not initialized\n"); + } else if (0 == _rpcmem_store_map.count(buf)) { + QNN_LOG_WARN("no allocated tensor\n"); + } else { + __pfn_rpc_mem_free(_rpcmem_store_map[buf]); + _rpcmem_store_map.erase(buf); + } } - if (is_rpcmem_allocated(p_data)) { - QNN_LOG_WARN("rpc memory already allocated\n"); - // return 3; - } - if (is_rpcmem_registered((QNN_VER_PTR(*p_tensor)->memHandle))) { - QNN_LOG_WARN("tensor %s has been registered shared memory\n", - (QNN_VER_PTR(*p_tensor)->name)); - return 4; - } + int32_t rpcmem_to_fd(void * buf) { + int32_t mem_fd = -1; + if (!is_rpcmem_initialized()) { + QNN_LOG_WARN("rpc memory not initialized\n"); + } else { + mem_fd = __pfn_rpc_mem_to_fd(buf); + } - int32_t mem_fd = rpcmem_to_fd(p_data); - if (-1 == mem_fd) { - QNN_LOG_WARN("failed to get file descriptor\n"); - return 5; + return mem_fd; } - QNN_LOG_DEBUG("mem_fd %d\n", mem_fd); - Qnn_MemDescriptor_t descriptor = {{QNN_VER_PTR(*p_tensor)->rank, - QNN_VER_PTR(*p_tensor)->dimensions, - nullptr}, - QNN_VER_PTR(*p_tensor)->dataType, - QNN_MEM_TYPE_ION, - {{mem_fd}}}; - Qnn_MemHandle_t handle = nullptr; - int error = QNN_SUCCESS; - error = _qnn_interface.qnn_mem_register(_qnn_context_handle, &descriptor, - /*numDescriptors=*/1, &handle); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to register shared memory, error %d, %s\n", - QNN_GET_ERROR_CODE(error), strerror(error)); - return 6; - } else { - QNN_LOG_INFO("tensor %s successfully register shared memory\n", - (QNN_VER_PTR(*p_tensor)->name)); - } - QNN_VER_PTR(*p_tensor)->memHandle = handle; - _qnn_mem_set.insert(handle); - - return 0; -} -void qnn_instance::unregister_rpcmem() { - Qnn_ErrorHandle_t error = QNN_SUCCESS; + int register_rpcmem(void * p_data, Qnn_Tensor_t * p_tensor) { + if (nullptr == p_data || (nullptr == p_tensor)) { + QNN_LOG_WARN("invalid param\n"); + return 1; + } - if (_qnn_mem_set.empty()) { - QNN_LOG_WARN("no rpcmem registered\n"); - } + if (!is_rpcmem_initialized()) { + QNN_LOG_WARN("rpc memory not initialized\n"); + return 2; + } - for (auto &mem_handle : _qnn_mem_set) { - error = _qnn_interface.qnn_mem_de_register(&mem_handle, 1); + if (is_rpcmem_allocated(p_data)) { + QNN_LOG_WARN("rpc memory already allocated\n"); + // return 3; + } + if (is_rpcmem_registered((QNN_VER_PTR(*p_tensor)->memHandle))) { + QNN_LOG_WARN("tensor %s has been registered shared memory\n", + (QNN_VER_PTR(*p_tensor)->name)); + return 4; + } + + int32_t mem_fd = rpcmem_to_fd(p_data); + if (-1 == mem_fd) { + QNN_LOG_WARN("failed to get file descriptor\n"); + return 5; + } + QNN_LOG_DEBUG("mem_fd %d\n", mem_fd); + Qnn_MemDescriptor_t descriptor = {{QNN_VER_PTR(*p_tensor)->rank, + 
QNN_VER_PTR(*p_tensor)->dimensions, + nullptr}, + QNN_VER_PTR(*p_tensor)->dataType, + QNN_MEM_TYPE_ION, + {{mem_fd}}}; + Qnn_MemHandle_t handle = nullptr; + int error = QNN_SUCCESS; + error = _qnn_interface.qnn_mem_register(_qnn_context_handle, &descriptor, + /*numDescriptors=*/1, &handle); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to unregister shared memory, error %d\n", - QNN_GET_ERROR_CODE(error)); + QNN_LOG_WARN("failed to register shared memory, error %d, %s\n", + QNN_GET_ERROR_CODE(error), strerror(error)); + return 6; + } else { + QNN_LOG_INFO("tensor %s successfully register shared memory\n", + (QNN_VER_PTR(*p_tensor)->name)); } - } - _qnn_mem_set.clear(); -} - -bool qnn_instance::is_rpcmem_allocated(void * buf) { - return _rpcmem_store_map.count(buf) != 0U; -} - -int qnn_instance::load_backend(std::string & lib_path, - const QnnSaver_Config_t ** saver_config) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - QNN_LOG_DEBUG("lib_path:%s\n", lib_path.c_str()); + QNN_VER_PTR(*p_tensor)->memHandle = handle; + _qnn_mem_set.insert(handle); - void * lib_handle = dlopen(lib_path.c_str(), RTLD_NOW | RTLD_GLOBAL); - if (nullptr == lib_handle) { - QNN_LOG_WARN("can not open QNN library %s, with error: %s", - lib_path.c_str(), dlerror()); - return 1; - } - - auto get_providers = load_qnn_functionpointers<_pfn_QnnInterface_getProviders *>( - lib_handle, "QnnInterface_getProviders"); - if (nullptr == get_providers) { - QNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s", - dlerror()); - return 2; + return 0; } - std::uint32_t num_providers = 0; - const QnnInterface_t ** provider_list = nullptr; - error = get_providers(&provider_list, &num_providers); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to get providers, error %d", - QNN_GET_ERROR_CODE(error)); - return 3; - } - QNN_LOG_DEBUG("num_providers=%d\n", num_providers); - if (num_providers != _required_num_providers) { - QNN_LOG_WARN("providers is %d instead of required %d", num_providers, - _required_num_providers); - return 4; - } + void unregister_rpcmem() { + Qnn_ErrorHandle_t error = QNN_SUCCESS; - if (nullptr == provider_list) { - QNN_LOG_WARN("failed to get qnn interface providers\n"); - return 5; - } - bool found_valid_interface = false; - QNN_INTERFACE_VER_TYPE qnn_interface; - for (size_t idx = 0; idx < num_providers; idx++) { - if (QNN_API_VERSION_MAJOR == - provider_list[idx]->apiVersion.coreApiVersion.major && - QNN_API_VERSION_MINOR <= - provider_list[idx]->apiVersion.coreApiVersion.minor) { - found_valid_interface = true; - qnn_interface = provider_list[idx]->QNN_INTERFACE_VER_NAME; - break; + if (_qnn_mem_set.empty()) { + QNN_LOG_WARN("no rpcmem registered\n"); } - } - if (!found_valid_interface) { - QNN_LOG_WARN("unable to find a valid qnn interface\n"); - return 6; - } else { - QNN_LOG_INFO("find a valid qnn interface\n"); - } - set_qnn_raw_interface(qnn_interface); - - BackendIdType backend_id = provider_list[0]->backendId; - _lib_path_to_backend_id[lib_path] = backend_id; - if (_loaded_backend.count(backend_id) > 0) { - QNN_LOG_WARN("lib_path %s is loaded, but backend %d already exists\n", - lib_path.c_str(), backend_id); - } - _loaded_backend[backend_id] = provider_list[0]; - if (_loaded_lib_handle.count(backend_id) > 0) { - QNN_LOG_WARN("closing %p\n", _loaded_lib_handle[backend_id]); - int dlclose_error = dlclose(_loaded_lib_handle[backend_id]); - if (dlclose_error != 0) { - QNN_LOG_WARN("fail to close %p with error %s\n", - _loaded_lib_handle[backend_id], dlerror()); + for (auto & mem_handle : 
_qnn_mem_set) { + error = _qnn_interface.qnn_mem_de_register(&mem_handle, 1); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to unregister shared memory, error %d\n", + QNN_GET_ERROR_CODE(error)); + } } + _qnn_mem_set.clear(); } - _loaded_lib_handle[backend_id] = lib_handle; - _backend_id = backend_id; - return 0; -} - -int qnn_instance::unload_backend() { - int dlclose_error = 0; - for (auto & it : _loaded_lib_handle) { - dlclose_error = dlclose(it.second); - if (dlclose_error != 0) { - QNN_LOG_WARN("failed to close QNN backend %d, error %s\n", it.first, - dlerror()); - } + bool is_rpcmem_allocated(void * buf) { + return _rpcmem_store_map.count(buf) != 0U; } - _loaded_lib_handle.clear(); - _lib_path_to_backend_id.clear(); - _loaded_backend.clear(); - return 0; -} + public: + std::map> + _qnn_graph_map; -int qnn_instance::load_system() { - Qnn_ErrorHandle_t error = QNN_SUCCESS; + private: + int load_system() { + Qnn_ErrorHandle_t error = QNN_SUCCESS; - std::string system_lib_path = _lib_path + "libQnnSystem.so"; - QNN_LOG_DEBUG("system_lib_path:%s\n", system_lib_path.c_str()); + std::string system_lib_path = _lib_path + "libQnnSystem.so"; + QNN_LOG_DEBUG("system_lib_path:%s\n", system_lib_path.c_str()); - _system_lib_handle = dlopen(system_lib_path.c_str(), RTLD_NOW | RTLD_LOCAL); - if (nullptr == _system_lib_handle) { - QNN_LOG_WARN("can not open QNN library %s, error: %s\n", - system_lib_path.c_str(), dlerror()); - return 1; - } + _system_lib_handle = dlopen(system_lib_path.c_str(), RTLD_NOW | RTLD_LOCAL); + if (nullptr == _system_lib_handle) { + QNN_LOG_WARN("can not open QNN library %s, error: %s\n", + system_lib_path.c_str(), dlerror()); + return 1; + } - auto * get_providers = - reinterpret_cast<_pfn_QnnSystemInterface_getProviders *>( - dlsym(_system_lib_handle, "QnnSystemInterface_getProviders")); - if (nullptr == get_providers) { - QNN_LOG_WARN( - "can not load QNN symbol QnnSystemInterface_getProviders: %s\n", - dlerror()); - return 2; - } + auto * get_providers = + reinterpret_cast<_pfn_qnnsysteminterface_getproviders *>( + dlsym(_system_lib_handle, "QnnSystemInterface_getProviders")); + if (nullptr == get_providers) { + QNN_LOG_WARN( + "can not load QNN symbol QnnSystemInterface_getProviders: %s\n", + dlerror()); + return 2; + } - uint32_t num_providers = 0; - const QnnSystemInterface_t ** provider_list = nullptr; - error = get_providers(&provider_list, &num_providers); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to get providers, error %d\n", - QNN_GET_ERROR_CODE(error)); - return 3; - } + uint32_t num_providers = 0; + const QnnSystemInterface_t ** provider_list = nullptr; + error = get_providers(&provider_list, &num_providers); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to get providers, error %d\n", + QNN_GET_ERROR_CODE(error)); + return 3; + } - if (num_providers != _required_num_providers) { - QNN_LOG_WARN("providers is %d instead of required %d\n", num_providers, - _required_num_providers); - return 4; - } + if (num_providers != _required_num_providers) { + QNN_LOG_WARN("providers is %d instead of required %d\n", num_providers, + _required_num_providers); + return 4; + } - if (nullptr == provider_list) { - QNN_LOG_WARN("can not get providers\n"); - return 5; - } + if (nullptr == provider_list) { + QNN_LOG_WARN("can not get providers\n"); + return 5; + } - QNN_SYSTEM_INTERFACE_VER_TYPE qnn_system_interface; - bool found_valid_system_interface = false; - for (size_t idx = 0; idx < num_providers; idx++) { - if (QNN_SYSTEM_API_VERSION_MAJOR == + 
QNN_SYSTEM_INTERFACE_VER_TYPE qnn_system_interface; + bool found_valid_system_interface = false; + for (size_t idx = 0; idx < num_providers; idx++) { + if (QNN_SYSTEM_API_VERSION_MAJOR == provider_list[idx]->systemApiVersion.major && - QNN_SYSTEM_API_VERSION_MINOR <= + QNN_SYSTEM_API_VERSION_MINOR <= provider_list[idx]->systemApiVersion.minor) { - found_valid_system_interface = true; - qnn_system_interface = - provider_list[idx]->QNN_SYSTEM_INTERFACE_VER_NAME; - break; + found_valid_system_interface = true; + qnn_system_interface = + provider_list[idx]->QNN_SYSTEM_INTERFACE_VER_NAME; + break; + } } - } - if (!found_valid_system_interface) { - QNN_LOG_WARN("unable to find a valid qnn system interface\n"); - return 6; - } else { - QNN_LOG_INFO("find a valid qnn system interface\n"); - } - set_qnn_raw_system_interface(qnn_system_interface); - - _qnn_interface.set_qnn_system_interface(provider_list[0]); - - _qnn_interface.qnn_system_context_create(&_qnn_system_handle); - if (nullptr == _qnn_system_handle) { - QNN_LOG_WARN("can not create QNN system contenxt\n"); - } else { - QNN_LOG_INFO("initialize qnn system successfully\n"); - } - - return 0; -} - -int qnn_instance::unload_system() { - int result = 0; + if (!found_valid_system_interface) { + QNN_LOG_WARN("unable to find a valid qnn system interface\n"); + return 6; + } else { + QNN_LOG_INFO("find a valid qnn system interface\n"); + } + set_qnn_raw_system_interface(qnn_system_interface); - if (nullptr == _system_lib_handle) { - QNN_LOG_DEBUG("system lib handle is null\n"); - return 1; - } + _qnn_interface.set_qnn_system_interface(provider_list[0]); - if (nullptr != _qnn_system_handle) { - result = _qnn_interface.qnn_system_context_free(_qnn_system_handle); - if (result != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN system context\n"); + _qnn_interface.qnn_system_context_create(&_qnn_system_handle); + if (nullptr == _qnn_system_handle) { + QNN_LOG_WARN("can not create QNN system contenxt\n"); + } else { + QNN_LOG_INFO("initialize qnn system successfully\n"); } - _qnn_system_handle = nullptr; - } - int dlclose_error = dlclose(_system_lib_handle); - if (dlclose_error != 0) { - QNN_LOG_WARN("failed to close QnnSystem library, error %s\n", - dlerror()); - return 2; + return 0; } - _system_lib_handle = nullptr; - - return result; -} + int unload_system() { + int result = 0; -static void ggml_qnn_logcallback(const char * fmt, QnnLog_Level_t level, - uint64_t timestamp, va_list argp) { + if (nullptr == _system_lib_handle) { + QNN_LOG_DEBUG("system lib handle is null\n"); + return 1; + } -#if ENABLE_QNN_LOG - static std::mutex log_mutex; - static unsigned char s_ggml_qnn_logbuf[GGML_QNN_LOGBUF_LEN]; + if (nullptr != _qnn_system_handle) { + result = _qnn_interface.qnn_system_context_free(_qnn_system_handle); + if (result != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN system context\n"); + } + _qnn_system_handle = nullptr; + } - const char * log_level_desc = ""; - switch (level) { - case QNN_LOG_LEVEL_ERROR: - log_level_desc = "ERROR"; - break; - case QNN_LOG_LEVEL_WARN: - log_level_desc = "WARNING"; - break; - case QNN_LOG_LEVEL_INFO: - log_level_desc = "INFO"; - break; - case QNN_LOG_LEVEL_DEBUG: - log_level_desc = "DEBUG"; - break; - case QNN_LOG_LEVEL_VERBOSE: - log_level_desc = "VERBOSE"; - break; - case QNN_LOG_LEVEL_MAX: - log_level_desc = "UNKNOWN"; - break; - } + int dlclose_error = dlclose(_system_lib_handle); + if (dlclose_error != 0) { + QNN_LOG_WARN("failed to close QnnSystem library, error %s\n", + dlerror()); + return 2; + } 
- double ms = (double) timestamp / 1000000.0; - { - std::lock_guard lock(log_mutex); + _system_lib_handle = nullptr; - memset(s_ggml_qnn_logbuf, 0, GGML_QNN_LOGBUF_LEN); - vsnprintf(reinterpret_cast(s_ggml_qnn_logbuf), GGML_QNN_LOGBUF_LEN, fmt, argp); - QNN_LOG_DEBUG("%8.1fms [%-7s] %s\n", ms, log_level_desc, s_ggml_qnn_logbuf); + return result; } -#endif -} -int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { - BackendIdType backend_id = QNN_BACKEND_ID_NULL; - QNN_LOG_DEBUG("enter qni_init\n"); + int load_backend(std::string & lib_path, const QnnSaver_Config_t ** saver_config) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + QNN_LOG_DEBUG("lib_path:%s\n", lib_path.c_str()); - std::lock_guard lock(_init_mutex); - - if (0 != load_system()) { - QNN_LOG_WARN("can not load QNN system lib, pls check why?\n"); - return 1; - } else { - QNN_LOG_DEBUG("load QNN system lib successfully\n"); - } + void * lib_handle = dlopen(lib_path.c_str(), RTLD_NOW | RTLD_GLOBAL); + if (nullptr == lib_handle) { + QNN_LOG_WARN("can not open QNN library %s, with error: %s", + lib_path.c_str(), dlerror()); + return 1; + } - std::string backend_lib_path = _lib_path + _backend_name; - if (0 == _lib_path_to_backend_id.count(backend_lib_path)) { - int is_load_ok = load_backend(backend_lib_path, saver_config); - if (0 != is_load_ok) { - QNN_LOG_WARN("failed to load QNN backend\n"); + auto get_providers = load_qnn_functionpointers<_pfn_qnninterface_getproviders *>( + lib_handle, "QnnInterface_getProviders"); + if (nullptr == get_providers) { + QNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s", + dlerror()); return 2; } - } - - backend_id = _lib_path_to_backend_id[backend_lib_path]; - if (0 == _loaded_backend.count(backend_id) || - 0 == _loaded_lib_handle.count(backend_id)) { - QNN_LOG_WARN("library %s is loaded but loaded backend count=%zu, " - "loaded lib_handle count=%zu\n", - backend_lib_path.c_str(), _loaded_backend.count(backend_id), - _loaded_lib_handle.count(backend_id)); - return 3; - } - - _qnn_interface.set_qnn_interface(_loaded_backend[backend_id]); - - _qnn_interface.qnn_log_create(ggml_qnn_logcallback, _qnn_log_level, - &_qnn_log_handle); - if (nullptr == _qnn_log_handle) { - QNN_LOG_WARN( - "why failed to initialize qnn log\n"); // NPU backend not work on - // Qualcomm SoC equipped low-end phone - return 4; - } else { - QNN_LOG_DEBUG("initialize qnn log successfully\n"); - } - std::vector temp_backend_config; - _qnn_interface.qnn_backend_create( - _qnn_log_handle, - temp_backend_config.empty() ? 
nullptr : temp_backend_config.data(), - &_qnn_backend_handle); - if (nullptr == _qnn_backend_handle) { - QNN_LOG_WARN("why failed to initialize qnn backend\n"); - return 5; - } else { - QNN_LOG_DEBUG("initialize qnn backend successfully\n"); - } - - if (nullptr != _qnn_raw_interface.propertyHasCapability) { - auto qnnStatus = - _qnn_raw_interface.propertyHasCapability(QNN_PROPERTY_GROUP_DEVICE); - if (QNN_PROPERTY_NOT_SUPPORTED == qnnStatus) { - QNN_LOG_WARN("device property is not supported\n"); + std::uint32_t num_providers = 0; + const QnnInterface_t ** provider_list = nullptr; + error = get_providers(&provider_list, &num_providers); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to get providers, error %d", + QNN_GET_ERROR_CODE(error)); + return 3; } - if (QNN_PROPERTY_ERROR_UNKNOWN_KEY == qnnStatus) { - QNN_LOG_WARN("device property is not known to backend\n"); + QNN_LOG_DEBUG("num_providers=%d\n", num_providers); + if (num_providers != _required_num_providers) { + QNN_LOG_WARN("providers is %d instead of required %d", num_providers, + _required_num_providers); + return 4; } - } - Qnn_ErrorHandle_t qnn_status = _qnn_raw_interface.deviceCreate(_qnn_log_handle, nullptr, - &_qnn_device_handle); - if (QNN_SUCCESS != qnn_status && - QNN_DEVICE_ERROR_UNSUPPORTED_FEATURE != qnn_status) { - QNN_LOG_WARN("failed to create QNN device\n"); - } else { - QNN_LOG_INFO("create device successfully\n"); - } - - if (ggml_qnn_profile_level::profile_off != _profile_level) { - QNN_LOG_INFO("profiling turned on; level = %d", _profile_level); - if (ggml_qnn_profile_level::profile_basic == _profile_level) { - QNN_LOG_INFO("basic profiling requested. creating Qnn Profile object\n"); - if (QNN_PROFILE_NO_ERROR != - _qnn_raw_interface.profileCreate(_qnn_backend_handle, - QNN_PROFILE_LEVEL_BASIC, - &_qnn_profile_handle)) { - QNN_LOG_WARN("unable to create profile handle in the backend\n"); - return 6; - } else { - QNN_LOG_DEBUG("initialize qnn profile successfully\n"); - } - } else if (ggml_qnn_profile_level::profile_detail == _profile_level) { - QNN_LOG_INFO("detailed profiling requested. Creating Qnn Profile object\n"); - if (QNN_PROFILE_NO_ERROR != - _qnn_raw_interface.profileCreate(_qnn_backend_handle, - QNN_PROFILE_LEVEL_DETAILED, - &_qnn_profile_handle)) { - QNN_LOG_WARN("unable to create profile handle in the backend\n"); - return 7; - } else { - QNN_LOG_DEBUG("initialize qnn profile successfully\n"); - } + if (nullptr == provider_list) { + QNN_LOG_WARN("failed to get qnn interface providers\n"); + return 5; } - } - - _rpc_lib_handle = dlopen("libcdsprpc.so", RTLD_NOW | RTLD_LOCAL); - if (nullptr == _rpc_lib_handle) { - QNN_LOG_WARN("failed to load qualcomm's rpc lib, error:%s\n", dlerror()); - return 8; - } else { - QNN_LOG_DEBUG("load rpcmem lib successfully\n"); - set_rpcmem_initialized(true); - } - _pfn_rpc_mem_init = reinterpret_cast( - dlsym(_rpc_lib_handle, "rpcmem_init")); - _pfn_rpc_mem_deinit = reinterpret_cast( - dlsym(_rpc_lib_handle, "rpcmem_deinit")); - _pfn_rpc_mem_alloc = reinterpret_cast( - dlsym(_rpc_lib_handle, "rpcmem_alloc")); - _pfn_rpc_mem_free = reinterpret_cast( - dlsym(_rpc_lib_handle, "rpcmem_free")); - _pfn_rpc_mem_to_fd = reinterpret_cast( - dlsym(_rpc_lib_handle, "rpcmem_to_fd")); - if (nullptr == _pfn_rpc_mem_alloc || nullptr == _pfn_rpc_mem_free || - nullptr == _pfn_rpc_mem_to_fd) { - QNN_LOG_WARN("unable to access symbols in QNN RPC lib. 
dlerror(): %s", dlerror()); - dlclose(_rpc_lib_handle); - return 9; - } - - if (nullptr != - _pfn_rpc_mem_init) // make Qualcomm's SoC equipped low-end phone happy - _pfn_rpc_mem_init(); - - std::vector temp_context_config; - _qnn_interface.qnn_context_create( - _qnn_backend_handle, _qnn_device_handle, - temp_context_config.empty() ? nullptr : temp_context_config.data(), - &_qnn_context_handle); - if (nullptr == _qnn_context_handle) { - QNN_LOG_WARN("why failed to initialize qnn context\n"); - return 10; - } else { - QNN_LOG_DEBUG("initialize qnn context successfully\n"); - } - - if (_backend_name.find("Htp") != std::variant_npos) { - const QnnDevice_PlatformInfo_t * p_info = nullptr; - _qnn_raw_interface.deviceGetPlatformInfo(nullptr, &p_info); - QNN_LOG_INFO("device counts %d", p_info->v1.numHwDevices); - QnnDevice_HardwareDeviceInfo_t * infos = p_info->v1.hwDevices; - for (int i = 0; i < p_info->v1.numHwDevices; i++) { - QNN_LOG_INFO("deviceID:%d, deviceType:%d, numCores %d", infos[i].v1.deviceId, - infos[i].v1.deviceType, infos[i].v1.numCores); - QnnDevice_DeviceInfoExtension_t devinfo = infos[i].v1.deviceInfoExtension; - QnnHtpDevice_OnChipDeviceInfoExtension_t chipinfo = devinfo->onChipDevice; - QnnHtpDevice_Arch_t chiparch = chipinfo.arch; - QNN_LOG_INFO("htp_type:%d(%s)", devinfo->devType, (devinfo->devType == QNN_HTP_DEVICE_TYPE_ON_CHIP) ? "ON_CHIP" : ""); - QNN_LOG_INFO("qualcomm soc_model:%d(%s), htp_arch:%d, vtcm_size_in_mb:%d MB", chipinfo.socModel, - qnn_get_chipset_desc(chipinfo.socModel), chiparch, chipinfo.vtcmSize); - } - _qnn_raw_interface.deviceFreePlatformInfo(nullptr, p_info); - - - //TODO: faster approach to probe the accurate capacity of rpc ion memory - size_t candidate_size = 0; - uint8_t * rpc_buffer = nullptr; - const int SIZE_IN_MB = (1 << 20); - size_t probe_slots[] = {1024, 1536, 2048 - 48, 2048}; - size_t probe_counts = sizeof(probe_slots) / sizeof(size_t); - for (size_t idx = 0; idx < probe_counts; idx++) { - rpc_buffer = static_cast(alloc_rpcmem(probe_slots[idx] * SIZE_IN_MB, 4)); - if (nullptr == rpc_buffer) { - QNN_LOG_INFO("alloc rpcmem %d (MB) failure, %s\n", probe_slots[idx], strerror(errno)); + bool found_valid_interface = false; + QNN_INTERFACE_VER_TYPE qnn_interface; + for (size_t idx = 0; idx < num_providers; idx++) { + if (QNN_API_VERSION_MAJOR == + provider_list[idx]->apiVersion.coreApiVersion.major && + QNN_API_VERSION_MINOR <= + provider_list[idx]->apiVersion.coreApiVersion.minor) { + found_valid_interface = true; + qnn_interface = provider_list[idx]->QNN_INTERFACE_VER_NAME; break; - } else { - candidate_size = probe_slots[idx]; - free_rpcmem(rpc_buffer); - rpc_buffer = nullptr; } } - if (candidate_size > _rpcmem_capacity) - _rpcmem_capacity = candidate_size; - QNN_LOG_INFO("capacity of rpc ion memory %d MB\n", _rpcmem_capacity); - } - - QNN_LOG_DEBUG("leave qni_init\n"); - - return 0; -} - -int qnn_instance::qnn_finalize() { - int ret_status = 0; - Qnn_ErrorHandle_t error = QNN_SUCCESS; - - if (nullptr != - _pfn_rpc_mem_deinit) // make Qualcomm's SoC equipped low-end phone happy - _pfn_rpc_mem_deinit(); - - if (dlclose(_rpc_lib_handle) != 0) { - QNN_LOG_WARN("failed to unload qualcomm's rpc lib, error:%s\n", dlerror()); - } else { - QNN_LOG_DEBUG("succeed to close rpcmem lib\n"); - } - - if (nullptr != _qnn_context_handle) { - error = _qnn_interface.qnn_context_free(_qnn_context_handle, - _qnn_profile_handle); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN context_handle: ID %u, error %d\n", - 
_qnn_interface.get_backend_id(), - QNN_GET_ERROR_CODE(error)); - } - _qnn_context_handle = nullptr; - } - if (nullptr != _qnn_profile_handle) { - error = _qnn_interface.qnn_profile_free(_qnn_profile_handle); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN profile_handle: ID %u, error %d\n", - _qnn_interface.get_backend_id(), - QNN_GET_ERROR_CODE(error)); + if (!found_valid_interface) { + QNN_LOG_WARN("unable to find a valid qnn interface\n"); + return 6; + } else { + QNN_LOG_INFO("find a valid qnn interface\n"); + } + set_qnn_raw_interface(qnn_interface); + + BackendIdType backend_id = provider_list[0]->backendId; + _lib_path_to_backend_id[lib_path] = backend_id; + if (_loaded_backend.count(backend_id) > 0) { + QNN_LOG_WARN("lib_path %s is loaded, but backend %d already exists\n", + lib_path.c_str(), backend_id); + } + _loaded_backend[backend_id] = provider_list[0]; + if (_loaded_lib_handle.count(backend_id) > 0) { + QNN_LOG_WARN("closing %p\n", _loaded_lib_handle[backend_id]); + int dlclose_error = dlclose(_loaded_lib_handle[backend_id]); + if (dlclose_error != 0) { + QNN_LOG_WARN("fail to close %p with error %s\n", + _loaded_lib_handle[backend_id], dlerror()); + } } - _qnn_profile_handle = nullptr; + _loaded_lib_handle[backend_id] = lib_handle; + _backend_id = backend_id; + + return 0; } - if (nullptr != _qnn_device_handle) { - error = _qnn_interface.qnn_device_free(_qnn_device_handle); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN device_handle: ID %u, error %d\n", - _qnn_interface.get_backend_id(), - QNN_GET_ERROR_CODE(error)); + int unload_backend() { + int dlclose_error = 0; + for (auto & it : _loaded_lib_handle) { + dlclose_error = dlclose(it.second); + if (dlclose_error != 0) { + QNN_LOG_WARN("failed to close QNN backend %d, error %s\n", it.first, + dlerror()); + } } - _qnn_device_handle = nullptr; + + _loaded_lib_handle.clear(); + _lib_path_to_backend_id.clear(); + _loaded_backend.clear(); + + return 0; } - if (nullptr != _qnn_backend_handle) { - error = _qnn_interface.qnn_backend_free(_qnn_backend_handle); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN backend_handle: ID %u, error %d\n", - _qnn_interface.get_backend_id(), - QNN_GET_ERROR_CODE(error)); - } - _qnn_backend_handle = nullptr; + void set_qnn_raw_interface(QNN_INTERFACE_VER_TYPE & raw_interface) { + _qnn_raw_interface = raw_interface; } - if (nullptr != _qnn_log_handle) { - error = _qnn_interface.qnn_log_free(_qnn_log_handle); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN log_handle: ID %u, error %d\n", - _qnn_interface.get_backend_id(), - QNN_GET_ERROR_CODE(error)); - } - _qnn_log_handle = nullptr; + void set_qnn_raw_system_interface(QNN_SYSTEM_INTERFACE_VER_TYPE & raw_interface) { + _qnn_raw_system_interface = raw_interface; } - unload_backend(); + private: + static constexpr const int _required_num_providers = 1; - unload_system(); + private: + std::string _lib_path; + std::string _backend_name; + std::string _model_name; // prebuilt QNN model name, not used currently + BackendIdType _backend_id; - return ret_status; -} + bool _debug_tensor = false; + bool _do_node_validations = true; -int qnn_instance::init_qnn_graph(const char * graph_name, bool debug, - uint8_t do_node_validation, - const QnnGraph_Config_t ** graph_configs) { - int result = 0; + QnnLog_Level_t _qnn_log_level = QNN_LOG_LEVEL_DEBUG; - if (nullptr == graph_name) { - QNN_LOG_WARN("graph name is null\n"); - return 1; - } + qnn_sdk_profile_level _profile_level = 
qnn_sdk_profile_level::profile_detail; - if (!_graph_name.empty()) { - QNN_LOG_WARN("qnn model for graph %s already initialized\n", graph_name); - return 2; - } + qnn_interface _qnn_interface; - if (!do_node_validation) { - QNN_LOG_WARN("node validation disabled, backend will not perform op " - "validation prior to adding node\n"); - } + void * _system_lib_handle = nullptr; - _graph_name = graph_name; - _debug_tensor = debug; - _do_node_validations = do_node_validation; + Qnn_GraphHandle_t _qnn_graph_handle = nullptr; - result = _qnn_raw_interface.graphCreate(_qnn_context_handle, graph_name, - graph_configs, &_qnn_graph_handle); - if (result != QNN_GRAPH_NO_ERROR || nullptr == _qnn_graph_handle) { - QNN_LOG_WARN("failed to create graph in qnn context\n"); - return 3; - } else { - QNN_LOG_INFO("succeed to create graph %s, %p\n", graph_name, _qnn_graph_handle); - } + Qnn_LogHandle_t _qnn_log_handle = nullptr; - return 0; -} + Qnn_ProfileHandle_t _qnn_profile_handle = nullptr; -int qnn_instance::finalize_qnn_graph() { - if (nullptr != _qnn_graph_handle) { - if (_qnn_raw_interface.graphFinalize(_qnn_graph_handle, - _qnn_profile_handle, - nullptr) != QNN_GRAPH_NO_ERROR) { - QNN_LOG_WARN("finalizing graph failure\n"); - } - } else { - QNN_LOG_DEBUG("qnn graph handle is null\n"); - } + Qnn_DeviceHandle_t _qnn_device_handle = nullptr; - return 0; -} + Qnn_BackendHandle_t _qnn_backend_handle = nullptr; + + Qnn_ContextHandle_t _qnn_context_handle = nullptr; + + QnnSystemContext_Handle_t _qnn_system_handle = nullptr; + + QnnHtpDevice_PerfInfrastructure_t * _qnn_htp_perfinfra = nullptr; + uint32_t _qnn_power_configid = 1; + uint32_t _qnn_rpc_pollingtime = 9999; // 0-10000 us for high performing + + QNN_INTERFACE_VER_TYPE _qnn_raw_interface; + QNN_SYSTEM_INTERFACE_VER_TYPE _qnn_raw_system_interface; + + std::unordered_set _qnn_mem_set; + + std::mutex _init_mutex; + std::unordered_map _loaded_lib_handle; + std::unordered_map _lib_path_to_backend_id; + std::unordered_map _loaded_backend; + + void * _rpc_lib_handle = nullptr; + std::atomic_bool _rpcmem_initialized{false}; + _pfn_rpc_mem_alloc __pfn_rpc_mem_alloc; + _pfn_rpc_mem_free __pfn_rpc_mem_free; + _pfn_rpc_mem_to_fd __pfn_rpc_mem_to_fd; + _pfn_rpc_mem_init __pfn_rpc_mem_init; + _pfn_rpc_mem_deinit __pfn_rpc_mem_deinit; + std::unordered_map _rpcmem_store_map; + size_t _rpcmem_capacity = 512; + + std::string _graph_name; +}; // ================================================================================================= // -// implementation of GGML's QNN backend +// implementation of QNN backend for GGML // // ================================================================================================= -static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context *ctx, - const struct ggml_tensor *tensor, +static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context * ctx, + const struct ggml_tensor * tensor, bool b_dump_tensor_info) { // only support the following 3 OPs currently // provide a GENERAL approach could fix this problem in a standalone PR of refine ggml backend @@ -1739,23 +1830,18 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context *ctx, // which the backend's ggml_backend_xxx_buffer_is_host return true. // this approach could be found: // https://github.com/ggerganov/llama.cpp/pull/7641 - // - // ensure tensor->src[0] and tensor->src[1] is not nullptr. 
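The load_backend() routine above follows the usual dlopen-based plugin pattern: open the backend library, resolve QnnInterface_getProviders, then pick a provider whose API version is compatible. Below is a minimal, self-contained sketch of that control flow only; example_provider and example_load_backend are stand-ins invented for illustration, since the real provider table (QnnInterface_t) and its version fields come from the QNN SDK headers, and only the dlfcn calls are meant literally.

    #include <dlfcn.h>
    #include <cstdio>

    // stand-in for the provider table the real QnnInterface_getProviders returns;
    // only the version fields needed for the compatibility check are modelled here
    struct example_provider {
        unsigned major;
        unsigned minor;
    };
    typedef int (* example_get_providers_fn)(const example_provider ***, unsigned *);

    // sketch of the load_backend() flow: dlopen the backend library, resolve the
    // provider-enumeration symbol, then scan the list for a compatible API version
    static void * example_load_backend(const char * lib_path,
                                       unsigned want_major, unsigned want_minor) {
        void * handle = dlopen(lib_path, RTLD_NOW | RTLD_GLOBAL);
        if (handle == nullptr) {
            fprintf(stderr, "dlopen(%s) failed: %s\n", lib_path, dlerror());
            return nullptr;
        }
        auto get_providers = reinterpret_cast<example_get_providers_fn>(
            dlsym(handle, "QnnInterface_getProviders"));
        if (get_providers == nullptr) {
            fprintf(stderr, "dlsym failed: %s\n", dlerror());
            dlclose(handle);
            return nullptr;
        }
        const example_provider ** providers = nullptr;
        unsigned num_providers = 0;
        if (get_providers(&providers, &num_providers) != 0 || providers == nullptr) {
            dlclose(handle);
            return nullptr;
        }
        for (unsigned i = 0; i < num_providers; i++) {
            // same compatibility rule as above: equal major version, minor at least as new
            if (providers[i]->major == want_major && providers[i]->minor >= want_minor) {
                return handle; // the real code also caches the interface table and backendId
            }
        }
        dlclose(handle);
        return nullptr;
    }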
- bool supported_op = - ((tensor->op == GGML_OP_ADD) || (tensor->op == GGML_OP_MUL) || - (tensor->op == GGML_OP_MUL_MAT)); + bool supported_op = ((tensor->op == GGML_OP_ADD) || (tensor->op == GGML_OP_MUL) + || (tensor->op == GGML_OP_MUL_MAT)); if (!supported_op) { return false; } + const struct ggml_tensor * src0 = tensor->src[0]; const struct ggml_tensor * src1 = tensor->src[1]; - const int64_t ne00 = src0->ne[0]; const int64_t ne01 = src0->ne[1]; - const int64_t ne10 = src1->ne[0]; const int64_t ne11 = src1->ne[1]; - const int64_t ne20 = tensor->ne[0]; const int64_t ne21 = tensor->ne[1]; @@ -1801,15 +1887,11 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context *ctx, return false; } - // make ggml_get_tensor_rank and QNN SDK happy + // make qnn_get_ggml_tensor_rank and QNN SDK happy if ((ne00 <= 1 || ne01 <= 1 || ne10 <= 1 || ne11 <= 1)) { return false; } - if ((ne20 < 32) || (ne21 < 32) || (ne10 < 32)) { - return false; - } - int qtype = src0->type; if (tensor->op == GGML_OP_ADD) { return (qtype == GGML_TYPE_F32 || qtype == GGML_TYPE_F16 || @@ -1837,75 +1919,32 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context *ctx, } } + static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { Qnn_ErrorHandle_t error = QNN_SUCCESS; bool graph_initialized = false; - int64_t n_begin_time = 0LL; - int64_t n_end_time = 0LL; - int64_t n_duration = 0LL; - - qnn_instance * instance = nullptr; - std::string graph_name = "ggml_op_qnn_add"; - Qnn_GraphHandle_t graph_handle = nullptr; - Qnn_Tensor_t * tensor_0 = nullptr; - Qnn_Tensor_t * tensor_1 = nullptr; - Qnn_Tensor_t * tensor_2 = nullptr; - Qnn_Param_t qnn_params[] = {}; - enum ggml_op ggmlop = GGML_OP_ADD; - Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32; - Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; - Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32; - - if ((nullptr == src0) || (nullptr == src1) || (nullptr == dst)) { - QNN_LOG_WARN("pls check why GGML tensor is null"); - return; - } + qnn_instance * instance = nullptr; + std::string graph_name = "ggml_op_qnn_add"; + Qnn_GraphHandle_t graph_handle = nullptr; + Qnn_Tensor_t * tensor_0 = nullptr; + Qnn_Tensor_t * tensor_1 = nullptr; + Qnn_Tensor_t * tensor_2 = nullptr; + Qnn_Param_t qnn_params[] = {}; + enum ggml_op ggmlop = GGML_OP_ADD; + Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32; + + CHECK_PARAMS(ctx, src0, src1, dst); tensor_0 = (Qnn_Tensor_t *) src0->extra; tensor_1 = (Qnn_Tensor_t *) src1->extra; tensor_2 = (Qnn_Tensor_t *) dst->extra; - if ((nullptr == tensor_0) || (nullptr == tensor_1) || - (nullptr == tensor_2)) { - QNN_LOG_WARN("pls check why QNN tensor is null"); - return; - } - if (nullptr == ctx) { - QNN_LOG_WARN("pls check why backend ctx is null"); - return; - } instance = ctx->instance; - if (nullptr == instance) { - QNN_LOG_WARN("pls check why qnn instance is null"); - return; - } - QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; - - n_begin_time = ggml_time_us(); - - if (0) { - QNN_LOG_DEBUG("call %s\n", __func__); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 - " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src0->name, src0->type, ggml_type_name(src0->type), - src0->ne[0], src0->ne[1], src0->ne[2], src0->nb[0], - src0->nb[1], src0->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 - " x %5" 
PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src1->name, src1->type, ggml_type_name(src1->type), - src1->ne[0], src1->ne[1], src1->ne[2], src1->nb[0], - src1->nb[1], src1->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 - " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - dst->name, dst->type, ggml_type_name(dst->type), - dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], - dst->nb[1], dst->nb[2]); - QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], - src0->ne[3]); - QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(*tensor_0)); - QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(*tensor_1)); - QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(*tensor_2)); - } + qnn_perf perf("ggml_qnn_add"); + perf.start(); + QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; @@ -1947,36 +1986,39 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src QNN_LOG_INFO("can't create qnn graph handle with graph name %s, " "error = %d\n", graph_name.c_str(), error); - return; + goto failure; } error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_0); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); + goto failure; } error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_1); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); + goto failure; } error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_2); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); + goto failure; } QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, - ggml_get_tensor_data_size(src0)}; + qnn_get_ggml_tensor_data_size(src0)}; QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, - ggml_get_tensor_data_size(src1)}; + qnn_get_ggml_tensor_data_size(src1)}; QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, - ggml_get_tensor_data_size(dst)}; + qnn_get_ggml_tensor_data_size(dst)}; QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; - QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); + QNN_VER_PTR(*tensor_0)->rank = qnn_get_ggml_tensor_rank(src0); QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; - QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); + QNN_VER_PTR(*tensor_1)->rank = qnn_get_ggml_tensor_rank(src1); QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; - QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); + QNN_VER_PTR(*tensor_2)->rank = qnn_get_ggml_tensor_rank(dst); QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1}; @@ -1990,17 +2032,19 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src error = qnn_raw_interface.graphAddNode(graph_handle, op_config); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); + goto failure; } error = qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); + goto failure; } - error = - qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, + error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); + goto failure; } auto graph_item = 
std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2); instance->_qnn_graph_map[map_entry] = graph_item; @@ -2011,8 +2055,6 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src tensor_1 = std::get<2>(graph_item); tensor_2 = std::get<3>(graph_item); - // QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], - // src0->ne[2], src0->ne[3]); uint32_t dimensions_input_0[] = { (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; @@ -2024,38 +2066,61 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src (uint32_t) dst->ne[3]}; QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, - ggml_get_tensor_data_size(src0)}; + qnn_get_ggml_tensor_data_size(src0)}; QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, - ggml_get_tensor_data_size(src1)}; + qnn_get_ggml_tensor_data_size(src1)}; QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, - ggml_get_tensor_data_size(dst)}; + qnn_get_ggml_tensor_data_size(dst)}; QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; - QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); + QNN_VER_PTR(*tensor_0)->rank = qnn_get_ggml_tensor_rank(src0); QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; - QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); + QNN_VER_PTR(*tensor_1)->rank = qnn_get_ggml_tensor_rank(src1); QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; - QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); + QNN_VER_PTR(*tensor_2)->rank = qnn_get_ggml_tensor_rank(dst); QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1}; Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; - error = - qnn_raw_interface.graphExecute(graph_handle, tensor_inputs,2, + error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs,2, tensor_outputs,1, nullptr, nullptr); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); + goto failure; } } +failure: + if (QNN_SUCCESS != error) { + QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(*tensor_0)); + QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(*tensor_1)); + QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(*tensor_2)); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src0->name, src0->type, ggml_type_name(src0->type), + src0->ne[0], src0->ne[1], src0->ne[2], src0->nb[0], + src0->nb[1], src0->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src1->name, src1->type, ggml_type_name(src1->type), + src1->ne[0], src1->ne[1], src1->ne[2], src1->nb[0], + src1->nb[1], src1->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + dst->name, dst->type, ggml_type_name(dst->type), + dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], + dst->nb[1], dst->nb[2]); + QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], + src0->ne[3]); + } + QNN_VER_PTR(*tensor_0)->dimensions = tensor_0_dimensions; QNN_VER_PTR(*tensor_1)->dimensions = tensor_1_dimensions; QNN_VER_PTR(*tensor_2)->dimensions = tensor_2_dimensions; - n_end_time = ggml_time_us(); - n_duration = (n_end_time - n_begin_time) / 1000; - QNN_LOG_DEBUG("duration of ggml_qnn_add : %lld milliseconds\n", n_duration); + + perf.info(); } /* @@ -2074,69 +2139,32 @@ 
static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { Qnn_ErrorHandle_t error = QNN_SUCCESS; bool graph_initialized = false; - int64_t n_begin_time = 0LL; - int64_t n_end_time = 0LL; - int64_t n_duration = 0LL; - - qnn_instance * instance = nullptr; - std::string graph_name = "ggml_op_qnn_mul_mat"; - Qnn_GraphHandle_t graph_handle = nullptr; - Qnn_Tensor_t * tensor_0 = nullptr; - Qnn_Tensor_t * tensor_1 = nullptr; - Qnn_Tensor_t * tensor_2 = nullptr; - - Qnn_Param_t qnn_params[] = {}; - - enum ggml_op ggmlop = GGML_OP_MUL_MAT; - Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32; - Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; - Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32; - - if ((nullptr == src0) || (nullptr == src1) || (nullptr == dst)) { - QNN_LOG_WARN("pls check why GGML tensor is null"); - return; - } + qnn_instance * instance = nullptr; + std::string graph_name = "ggml_op_qnn_mul_mat"; + Qnn_GraphHandle_t graph_handle = nullptr; + Qnn_Tensor_t * tensor_0 = nullptr; + Qnn_Tensor_t * tensor_1 = nullptr; + Qnn_Tensor_t * tensor_2 = nullptr; + Qnn_Param_t qnn_params[] = {}; + enum ggml_op ggmlop = GGML_OP_MUL_MAT; + Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32; + + CHECK_PARAMS(ctx, src0, src1, dst); tensor_0 = (Qnn_Tensor_t *) src0->extra; tensor_1 = (Qnn_Tensor_t *) src1->extra; tensor_2 = (Qnn_Tensor_t *) dst->extra; - if ((nullptr == tensor_0) || (nullptr == tensor_1) || - (nullptr == tensor_2)) { - QNN_LOG_WARN("pls check why QNN tensor is null"); - return; - } - if (nullptr == ctx) { - QNN_LOG_WARN("pls check why backend ctx is null"); - return; - } instance = ctx->instance; - if (nullptr == instance) { - QNN_LOG_WARN("pls check why qnn instance is null"); - return; - } - QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; + qnn_perf perf("ggml_qnn_mul_mat"); + perf.start(); - n_begin_time = ggml_time_us(); - QNN_LOG_DEBUG("call %s\n", __func__); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 - " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src0->name, src0->type, ggml_type_name(src0->type), - src0->ne[0], src0->ne[1], src0->ne[2], src0->nb[0], - src0->nb[1], src0->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 - " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src1->name, src1->type, ggml_type_name(src1->type), - src1->ne[0], src1->ne[1], src1->ne[2], src1->nb[0], - src1->nb[1], src1->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 - " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - dst->name, dst->type, ggml_type_name(dst->type), dst->ne[0], - dst->ne[1], dst->ne[2], dst->nb[0], dst->nb[1], dst->nb[2]); - QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], - src0->ne[3]); - QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(*tensor_0)); - QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(*tensor_1)); - QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(*tensor_2)); + tensor_0 = (Qnn_Tensor_t *) src0->extra; + tensor_1 = (Qnn_Tensor_t *) src1->extra; + tensor_2 = (Qnn_Tensor_t *) dst->extra; + instance = ctx->instance; + QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; @@ -2178,36 +2206,39 @@ static void 
ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, QNN_LOG_INFO("can't create qnn graph handle with graph name %s, " "error = %d\n", graph_name.c_str(), error); - return; + goto failure; } error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_0); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); + goto failure; } error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_1); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); + goto failure; } error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_2); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); + goto failure; } QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, - ggml_get_tensor_data_size(src0)}; + qnn_get_ggml_tensor_data_size(src0)}; QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, - ggml_get_tensor_data_size(src1)}; + qnn_get_ggml_tensor_data_size(src1)}; QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, - ggml_get_tensor_data_size(dst)}; + qnn_get_ggml_tensor_data_size(dst)}; QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; - QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); + QNN_VER_PTR(*tensor_0)->rank = qnn_get_ggml_tensor_rank(src0); QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; - QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); + QNN_VER_PTR(*tensor_1)->rank = qnn_get_ggml_tensor_rank(src1); QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; - QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); + QNN_VER_PTR(*tensor_2)->rank = qnn_get_ggml_tensor_rank(dst); QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1}; @@ -2220,10 +2251,12 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, error = qnn_raw_interface.graphAddNode(graph_handle, op_config); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); + goto failure; } error = qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); + goto failure; } error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, @@ -2231,6 +2264,7 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, nullptr, nullptr); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); + goto failure; } auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2); instance->_qnn_graph_map[map_entry] = graph_item; @@ -2241,7 +2275,6 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, tensor_1 = std::get<2>(graph_item); tensor_2 = std::get<3>(graph_item); - QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); uint32_t dimensions_input_0[] = { (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; @@ -2252,41 +2285,60 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; - QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); + QNN_VER_PTR(*tensor_0)->rank = qnn_get_ggml_tensor_rank(src0); QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; - QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); + QNN_VER_PTR(*tensor_1)->rank = qnn_get_ggml_tensor_rank(src1); 
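Both the add and mul_mat handlers rebuild the QNN dimension arrays and rank from the ggml ne[] fields on every invocation. The snippet below isolates that mapping; example_tensor_rank is an assumption about what qnn_get_ggml_tensor_rank() computes (the count of dimensions larger than one) rather than a copy of it, and example_fill_qnn_dims simply narrows the int64_t extents to the uint32_t array a Qnn_Tensor_t expects.

    #include <cstdint>

    // hypothetical helper mirroring qnn_get_ggml_tensor_rank(): count the
    // dimensions larger than 1 (assumption, the real helper may differ)
    static uint32_t example_tensor_rank(const int64_t ne[4]) {
        uint32_t rank = 0;
        for (int i = 0; i < 4; i++) {
            if (ne[i] > 1) {
                rank++;
            }
        }
        return rank;
    }

    // ggml stores extents as int64_t ne[4]; a QNN tensor takes a uint32_t array plus a
    // rank, which is why the hunks above rebuild dimensions_input_* / dimensions_output
    static void example_fill_qnn_dims(const int64_t ne[4], uint32_t out[4]) {
        for (int i = 0; i < 4; i++) {
            out[i] = static_cast<uint32_t>(ne[i]);
        }
    }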
QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; - QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); + QNN_VER_PTR(*tensor_2)->rank = qnn_get_ggml_tensor_rank(dst); QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, - ggml_get_tensor_data_size(src0)}; + qnn_get_ggml_tensor_data_size(src0)}; QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, - ggml_get_tensor_data_size(src1)}; + qnn_get_ggml_tensor_data_size(src1)}; QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, - ggml_get_tensor_data_size(dst)}; + qnn_get_ggml_tensor_data_size(dst)}; Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1}; Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; - error = - qnn_raw_interface.graphExecute(graph_handle, + error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); + goto failure; } } +failure: + if (QNN_SUCCESS != error) { + QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(*tensor_0)); + QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(*tensor_1)); + QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(*tensor_2)); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src0->name, src0->type, ggml_type_name(src0->type), + src0->ne[0], src0->ne[1], src0->ne[2], src0->nb[0], + src0->nb[1], src0->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src1->name, src1->type, ggml_type_name(src1->type), + src1->ne[0], src1->ne[1], src1->ne[2], src1->nb[0], + src1->nb[1], src1->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + dst->name, dst->type, ggml_type_name(dst->type), dst->ne[0], + dst->ne[1], dst->ne[2], dst->nb[0], dst->nb[1], dst->nb[2]); + QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], + src0->ne[3]); + } + QNN_VER_PTR(*tensor_0)->dimensions = tensor_0_dimensions; QNN_VER_PTR(*tensor_1)->dimensions = tensor_1_dimensions; QNN_VER_PTR(*tensor_2)->dimensions = tensor_2_dimensions; - n_end_time = ggml_time_us(); - n_duration = (n_end_time - n_begin_time) / 1000; - QNN_LOG_DEBUG("duration of ggml_qnn_mul_mat : %lld milliseconds\n", - n_duration); - QNN_LOG_DEBUG("call %s done\n", __func__); + perf.info(); } // common function for GGML OPs using QNN API @@ -2296,10 +2348,6 @@ static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { Qnn_ErrorHandle_t error = QNN_SUCCESS; bool graph_initialized = false; - int64_t n_begin_time = 0LL; - int64_t n_end_time = 0LL; - int64_t n_duration = 0LL; - qnn_instance * instance = nullptr; std::string qnn_graph_name = "ggml_qnn_graph"; std::string qnn_op_config_name = "ggml_qnn_op_config"; @@ -2308,73 +2356,39 @@ static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx, Qnn_Tensor_t * tensor_0 = nullptr; Qnn_Tensor_t * tensor_1 = nullptr; Qnn_Tensor_t * tensor_2 = nullptr; - Qnn_Param_t qnn_params[] = {}; - Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32; Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32; - if ((nullptr == src0) || (nullptr == src1) || (nullptr == dst)) { - QNN_LOG_WARN("pls check why GGML tensor is null"); - return; - } + CHECK_PARAMS(ctx, src0, src1, dst); tensor_0 = (Qnn_Tensor_t *) src0->extra; 
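Each op handler keeps one finalized QNN graph per key in instance->_qnn_graph_map (the key in this patch is built from the graph name, thread count and tensor names), so graphCreate/graphFinalize run only on the first call and later calls just re-bind buffers and execute. Below is a minimal sketch of that lookup-or-build skeleton, with void * placeholders standing in for the real QNN handle and tensor types.

    #include <string>
    #include <tuple>
    #include <unordered_map>

    // illustrative stand-in for the per-op cache: one graph handle plus the three
    // tensor pointers, keyed by the graph name string
    using example_graph_item = std::tuple<void *, void *, void *, void *>;

    static std::unordered_map<std::string, example_graph_item> example_graph_map;

    // first call for a key builds and finalizes the graph; later calls reuse it,
    // which is what the graph_initialized branch in the handlers above implements
    static example_graph_item & example_get_or_create(const std::string & key, bool & created) {
        auto it = example_graph_map.find(key);
        created = (it == example_graph_map.end());
        if (created) {
            // ... graphCreate / tensorCreateGraphTensor / graphAddNode / graphFinalize ...
            it = example_graph_map.emplace(key,
                     example_graph_item{nullptr, nullptr, nullptr, nullptr}).first;
        }
        return it->second;
    }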
tensor_1 = (Qnn_Tensor_t *) src1->extra; tensor_2 = (Qnn_Tensor_t *) dst->extra; - if ((nullptr == tensor_0) || (nullptr == tensor_1) || - (nullptr == tensor_2)) { - QNN_LOG_WARN("pls check why QNN tensor is null"); - return; - } - if (nullptr == ctx) { - QNN_LOG_WARN("pls check why backend ctx is null"); - return; - } instance = ctx->instance; - if (nullptr == instance) { - QNN_LOG_WARN("pls check why qnn instance is null"); + qnn_perf perf(ggml_op_name(ggmlop)); + perf.start(); + + qnn_op_name = qnn_opname_from_ggmlop(ggmlop); + if (nullptr == qnn_op_name) { + QNN_LOG_WARN("ggml op %d(%s) not supported currently", ggmlop, ggml_op_name(ggmlop)); return; } + + tensor_0 = (Qnn_Tensor_t *) src0->extra; + tensor_1 = (Qnn_Tensor_t *) src1->extra; + tensor_2 = (Qnn_Tensor_t *) dst->extra; + instance = ctx->instance; QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type); src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type); - qnn_op_name = qnn_opname_from_ggmlop(ggmlop); - if (nullptr == qnn_op_name) { - QNN_LOG_WARN( - "pls check why can not get QNN OP name with ggml op %d(%s)", ggmlop, - ggml_op_name(ggmlop)); - return; - } - - n_begin_time = ggml_time_us(); - - QNN_LOG_DEBUG("call %s\n", __func__); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 - " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src0->name, src0->type, ggml_type_name(src0->type), - src0->ne[0], src0->ne[1], src0->ne[2], src0->nb[0], - src0->nb[1], src0->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 - " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src1->name, src1->type, ggml_type_name(src1->type), - src1->ne[0], src1->ne[1], src1->ne[2], src1->nb[0], - src1->nb[1], src1->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 - " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - dst->name, dst->type, ggml_type_name(dst->type), dst->ne[0], - dst->ne[1], dst->ne[2], dst->nb[0], dst->nb[1], dst->nb[2]); - QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], - src0->ne[3]); - QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(*tensor_0)); - QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(*tensor_1)); - QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(*tensor_2)); QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; + uint32_t dimensions_input_0[] = { (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; @@ -2413,37 +2427,40 @@ static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx, QNN_LOG_INFO("can't create qnn graph handle with ggml op %s, graph " "name %s, error = %d\n", ggml_op_name(ggmlop), qnn_graph_name.c_str(), error); - return; + goto failure; } error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_0); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); + goto failure; } error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_1); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); + goto failure; } error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_2); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); + goto failure; } QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, - ggml_get_tensor_data_size(src0)}; + 
qnn_get_ggml_tensor_data_size(src0)}; QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, - ggml_get_tensor_data_size(src1)}; + qnn_get_ggml_tensor_data_size(src1)}; QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, - ggml_get_tensor_data_size(dst)}; + qnn_get_ggml_tensor_data_size(dst)}; QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; - QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); + QNN_VER_PTR(*tensor_0)->rank = qnn_get_ggml_tensor_rank(src0); QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; - QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); + QNN_VER_PTR(*tensor_1)->rank = qnn_get_ggml_tensor_rank(src1); QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; - QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); + QNN_VER_PTR(*tensor_2)->rank = qnn_get_ggml_tensor_rank(dst); QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1}; @@ -2456,10 +2473,12 @@ static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx, error = qnn_raw_interface.graphAddNode(graph_handle, op_config); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); + goto failure; } error = qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); + goto failure; } error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, @@ -2467,6 +2486,7 @@ static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx, nullptr, nullptr); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); + goto failure; } auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2); instance->_qnn_graph_map[map_entry] = graph_item; @@ -2477,7 +2497,6 @@ static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx, tensor_1 = std::get<2>(graph_item); tensor_2 = std::get<3>(graph_item); - QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); uint32_t dimensions_input_0[] = { (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; @@ -2488,21 +2507,21 @@ static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; - QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); + QNN_VER_PTR(*tensor_0)->rank = qnn_get_ggml_tensor_rank(src0); QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; - QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); + QNN_VER_PTR(*tensor_1)->rank = qnn_get_ggml_tensor_rank(src1); QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; - QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); + QNN_VER_PTR(*tensor_2)->rank = qnn_get_ggml_tensor_rank(dst); QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, - ggml_get_tensor_data_size(src0)}; + qnn_get_ggml_tensor_data_size(src0)}; QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, - ggml_get_tensor_data_size(src1)}; + qnn_get_ggml_tensor_data_size(src1)}; QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, - ggml_get_tensor_data_size(dst)}; + qnn_get_ggml_tensor_data_size(dst)}; Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1}; Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; @@ -2513,16 
+2532,36 @@ static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx, nullptr, nullptr); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); + goto failure; } } +failure: + if (QNN_SUCCESS != error) { + QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(*tensor_0)); + QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(*tensor_1)); + QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(*tensor_2)); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src0->name, src0->type, ggml_type_name(src0->type), + src0->ne[0], src0->ne[1], src0->ne[2], src0->nb[0], + src0->nb[1], src0->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src1->name, src1->type, ggml_type_name(src1->type), + src1->ne[0], src1->ne[1], src1->ne[2], src1->nb[0], + src1->nb[1], src1->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + dst->name, dst->type, ggml_type_name(dst->type), dst->ne[0], + dst->ne[1], dst->ne[2], dst->nb[0], dst->nb[1], dst->nb[2]); + QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], + src0->ne[3]); + } + QNN_VER_PTR(*tensor_0)->dimensions = tensor_0_dimensions; QNN_VER_PTR(*tensor_1)->dimensions = tensor_1_dimensions; QNN_VER_PTR(*tensor_2)->dimensions = tensor_2_dimensions; - n_end_time = ggml_time_us(); - n_duration = (n_end_time - n_begin_time) / 1000; - QNN_LOG_DEBUG("duration of ggml_qnn_%s : %lld milliseconds\n", - ggml_op_name(ggmlop), n_duration); - QNN_LOG_DEBUG("call %s done\n", __func__); + perf.info(); } static void ggml_qnn_repeat(ggml_backend_qnn_context * ctx, @@ -2829,44 +2868,6 @@ bool ggml_qnn_compute_forward(ggml_backend_qnn_context * ctx, return true; } -struct ggml_backend_qnn_buffer_context { - ggml_backend_qnn_buffer_context(size_t device) - : device(device) - , name(GGML_QNN_NAME + std::to_string(device)) {} - - ~ggml_backend_qnn_buffer_context() { - if (buffer) { - free(buffer); - } - - for (auto * sub_buffer : sub_buffers) { - free(sub_buffer); - } - - for (auto * qnn_tensor : qnn_tensors) { - free_qnn_tensor(*qnn_tensor); - free(qnn_tensor); - } - - sub_buffers.clear(); - qnn_tensors.clear(); - } - void * buffer = nullptr; - - struct ggml_backend_qnn_context * backend_ctx = nullptr; - - size_t buffer_size = 0; - std::vector sub_buffers; - std::vector qnn_tensors; - size_t device; - std::string name; -}; - -struct ggml_backend_qnn_buffer_type_context { - size_t device; - std::string name; -}; - static const char * ggml_backend_qnn_buffer_get_name(ggml_backend_buffer_t buffer) { GGML_UNUSED(buffer); return "QNN"; @@ -2922,7 +2923,7 @@ GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t QNN_QUANTIZATION_ENCODING_UNDEFINED, {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, .offset = 0}}}, - .rank = ggml_get_tensor_rank(tensor), + .rank = qnn_get_ggml_tensor_rank(tensor), .dimensions = dimensions, .memType = QNN_TENSORMEMTYPE_RAW, {.clientBuf = {.data = nullptr, .dataSize = 0}}}}}; @@ -3122,7 +3123,7 @@ GGML_CALL static bool ggml_backend_qnn_supports_op(ggml_backend_t backend, const ggml_tensor * op) { ggml_backend_qnn_context *ctx = (ggml_backend_qnn_context *) backend->context; - return (ggml_qnn_can_handle_op(ctx, op, true)); + return (ggml_qnn_can_handle_op(ctx, op, false)); } GGML_CALL static bool ggml_backend_qnn_offload_op(ggml_backend_t backend,const ggml_tensor * tensor) { @@ 
-3213,14 +3214,13 @@ ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t device) { return nullptr; } - //ref:https://github.com/zhouwg/llama.cpp/pull/1 static ggml_backend_qnn_buffer_type_context ggml_backend_qnn_buffer_type_contexts[GGML_QNN_MAX_DEVICES]; static ggml_backend_buffer_type ggml_backend_qnn_buffer_types[GGML_QNN_MAX_DEVICES]; static bool ggml_backend_qnn_buffer_type_initialized = false; if (!ggml_backend_qnn_buffer_type_initialized) { for (size_t i = 0; i < GGML_QNN_MAX_DEVICES; i++) { auto & context = ggml_backend_qnn_buffer_type_contexts[i]; - context = { i, std::string(GGML_QNN_NAME) + std::to_string(i) }; + context = { i, std::string(QNN_BACKEND_NAME) + std::to_string(i) }; ggml_backend_qnn_buffer_types[i] = { /* .iface = */ { /* .get_name = */ ggml_backend_qnn_buffer_type_name, @@ -3285,10 +3285,10 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { } else { if (0 == setenv("LD_LIBRARY_PATH", path.c_str(), 1)) { QNN_LOG_INFO("%s backend setenv successfully\n", - get_qnn_backend_name(device)); + qnn_get_backend_name(device)); } else { QNN_LOG_ERROR("%s backend setenv failure\n", - get_qnn_backend_name(device)); + qnn_get_backend_name(device)); } } @@ -3298,7 +3298,7 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { if (0 != result) { QNN_LOG_WARN( "init qnn subsystem failed with qnn backend %s, pls check why\n", - get_qnn_backend_name(device)); + qnn_get_backend_name(device)); delete instance; return nullptr; } @@ -3309,7 +3309,7 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { return nullptr; } - std::string device_name = get_qnn_backend_name(device); + std::string device_name = qnn_get_backend_name(device); QNN_LOG_INFO("qnn device name %s", device_name.c_str()); instance->init_qnn_graph(device_name.c_str(), false); g_qnn_mgr[device].instance = instance; From 5269e082aa479de382fefde7518a84036c1b6b7f Mon Sep 17 00:00:00 2001 From: "zhou.weiguo" Date: Tue, 11 Jun 2024 23:05:00 +0800 Subject: [PATCH 014/143] ggml-qnn: refine ggml inference using QNN NPU --- ggml-qnn.cpp | 250 ++++++++++++------------ tests/ggml-qnn/ggml-qnn-ut-build-run.sh | 10 +- tests/ggml-qnn/ggml-qnn-ut.cpp | 42 ++-- 3 files changed, 149 insertions(+), 153 deletions(-) diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp index 43a8fcd3ea8cb..4700e145112d6 100644 --- a/ggml-qnn.cpp +++ b/ggml-qnn.cpp @@ -55,6 +55,7 @@ #include "Saver/QnnSaver.h" #include "System/QnnSystemInterface.h" #include "HTP/QnnHtpDevice.h" +#include // ================================================================================================= // @@ -72,9 +73,16 @@ static int free_qnn_tensor(Qnn_Tensor_t & tensor); // self-defined macro / data structure // // ================================================================================================= -#define ENABLE_QNNBACKEND_DEBUG 1 // for troubleshooting QNN backend +#ifdef NDEBUG +#define ENABLE_QNNBACKEND_DEBUG 0 // for troubleshooting QNN backend #define ENABLE_QNNSDK_LOG 0 // enable/disable QNN SDK's internal log #define ENABLE_QNNBACKEND_PERF 0 // enable/disable op's perf info +#else +#define ENABLE_QNNBACKEND_DEBUG 1 // for troubleshooting QNN backend +#define ENABLE_QNNSDK_LOG 1 // enable/disable QNN SDK's internal log +#define ENABLE_QNNBACKEND_PERF 1 // enable/disable op's perf info +#endif + #define QNN_LOGBUF_LEN 4096 #define QNN_BACKEND_NAME "qnn" @@ -393,7 +401,6 @@ static void qnn_internal_log(ggml_log_level level, const char * file, } } - static bool 
qnn_is_valid_params(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { if ((nullptr == ctx) || (nullptr == src0) || (nullptr == src1) || (nullptr == dst)) { @@ -438,8 +445,8 @@ class qnn_perf { void info() { _end_time = ggml_time_us(); - _duration = (_end_time - _begin_time) / 1000; - QNN_LOG_DEBUG("duration of %s : %lld milliseconds\n", _perf_name.c_str(), _duration); + _duration = (_end_time - _begin_time); + QNN_LOG_DEBUG("duration of %s : %lld microseconds\n", _perf_name.c_str(), _duration); } private: @@ -473,15 +480,15 @@ enum qnn_sdk_profile_level { profile_detail = 2 }; -using _pfn_rpc_mem_init = void (*)(void); -using _pfn_rpc_mem_deinit = void (*)(void); -using _pfn_rpc_mem_alloc = void *(*) (int, uint32_t, int); -using _pfn_rpc_mem_free = void (*)(void *); -using _pfn_rpc_mem_to_fd = int (*)(void *); +using pfn_rpc_mem_init = void (*)(void); +using pfn_rpc_mem_deinit = void (*)(void); +using pfn_rpc_mem_alloc = void *(*) (int, uint32_t, int); +using pfn_rpc_mem_free = void (*)(void *); +using pfn_rpc_mem_to_fd = int (*)(void *); -using _pfn_qnnsaver_initialize = decltype(QnnSaver_initialize); -using _pfn_qnninterface_getproviders = decltype(QnnInterface_getProviders); -using _pfn_qnnsysteminterface_getproviders = decltype(QnnSystemInterface_getProviders); +using pfn_qnnsaver_initialize = decltype(QnnSaver_initialize); +using pfn_qnninterface_getproviders = decltype(QnnInterface_getProviders); +using pfn_qnnsysteminterface_getproviders = decltype(QnnSystemInterface_getProviders); #define QNN_VER_PTR(x) (&((x).v1)) #define RPCMEM_DEFAULT_FLAGS 1 @@ -702,7 +709,7 @@ static int deep_copy_qnn_tensors(Qnn_Tensor_t & src, Qnn_Tensor_t & dst) { if (encoding == QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET) { Qnn_QuantizeParams_t src_qparam_cpy = src_qparam; Qnn_AxisScaleOffset_t & axis_scale_offset = src_qparam_cpy.axisScaleOffsetEncoding; - Qnn_ScaleOffset_t **scaleOffset = & axis_scale_offset.scaleOffset; + Qnn_ScaleOffset_t ** scaleOffset = & axis_scale_offset.scaleOffset; size_t scaleOffsetSize = axis_scale_offset.numScaleOffsets * sizeof(Qnn_ScaleOffset_t); *scaleOffset = (Qnn_ScaleOffset_t *) malloc(scaleOffsetSize); memscpy(*scaleOffset, scaleOffsetSize, @@ -732,8 +739,8 @@ static int deep_copy_qnn_tensors(Qnn_Tensor_t & src, Qnn_Tensor_t & dst) { uint32_t rank = QNN_TENSOR_GET_RANK(src); QNN_TENSOR_SET_RANK(dst, rank); - size_t dim_size = rank * sizeof(uint32_t); - uint32_t *dimensions = (uint32_t *) malloc(dim_size); + size_t dim_size = rank * sizeof(uint32_t); + uint32_t * dimensions = (uint32_t *) malloc(dim_size); if (dimensions == nullptr) { QNN_LOG_WARN("deep_copy_qnn_tensors() allocation error while copying " "tensor %s\n", @@ -1072,26 +1079,26 @@ class qnn_instance { QNN_LOG_DEBUG("load rpcmem lib successfully\n"); set_rpcmem_initialized(true); } - __pfn_rpc_mem_init = reinterpret_cast<_pfn_rpc_mem_init>( + _pfn_rpc_mem_init = reinterpret_cast( dlsym(_rpc_lib_handle, "rpcmem_init")); - __pfn_rpc_mem_deinit = reinterpret_cast<_pfn_rpc_mem_deinit>( + _pfn_rpc_mem_deinit = reinterpret_cast( dlsym(_rpc_lib_handle, "rpcmem_deinit")); - __pfn_rpc_mem_alloc = reinterpret_cast<_pfn_rpc_mem_alloc>( + _pfn_rpc_mem_alloc = reinterpret_cast( dlsym(_rpc_lib_handle, "rpcmem_alloc")); - __pfn_rpc_mem_free = reinterpret_cast<_pfn_rpc_mem_free>( + _pfn_rpc_mem_free = reinterpret_cast( dlsym(_rpc_lib_handle, "rpcmem_free")); - __pfn_rpc_mem_to_fd = reinterpret_cast<_pfn_rpc_mem_to_fd>( + _pfn_rpc_mem_to_fd = reinterpret_cast( 
dlsym(_rpc_lib_handle, "rpcmem_to_fd")); - if (nullptr == __pfn_rpc_mem_alloc || nullptr == __pfn_rpc_mem_free || - nullptr == __pfn_rpc_mem_to_fd) { + if (nullptr == _pfn_rpc_mem_alloc || nullptr == _pfn_rpc_mem_free || + nullptr == _pfn_rpc_mem_to_fd) { QNN_LOG_WARN("unable to access symbols in QNN RPC lib. dlerror(): %s", dlerror()); dlclose(_rpc_lib_handle); return 9; } if (nullptr != - __pfn_rpc_mem_init) // make Qualcomm's SoC equipped low-end phone happy - __pfn_rpc_mem_init(); + _pfn_rpc_mem_init) // make Qualcomm's SoC equipped low-end phone happy + _pfn_rpc_mem_init(); std::vector temp_context_config; _qnn_interface.qnn_context_create( @@ -1124,7 +1131,6 @@ class qnn_instance { } _qnn_raw_interface.deviceFreePlatformInfo(nullptr, p_info); - //TODO: faster approach to probe the accurate capacity of rpc ion memory size_t candidate_size = 0; uint8_t * rpc_buffer = nullptr; @@ -1145,6 +1151,16 @@ class qnn_instance { if (candidate_size > _rpcmem_capacity) _rpcmem_capacity = candidate_size; QNN_LOG_INFO("capacity of rpc ion memory %d MB\n", _rpcmem_capacity); + + if (0 != init_htp_perfinfra()) { + QNN_LOG_WARN("initialize HTP performance failure"); + } + if (0 != set_rpc_polling()) { + QNN_LOG_WARN("set RPC polling failure"); + } + if (0 != set_high_performance_mode()) { + QNN_LOG_WARN("set HTP high performance mode failure"); + } } QNN_LOG_DEBUG("leave qni_init\n"); @@ -1156,9 +1172,8 @@ class qnn_instance { int ret_status = 0; Qnn_ErrorHandle_t error = QNN_SUCCESS; - if (nullptr != - __pfn_rpc_mem_deinit) // make Qualcomm's SoC equipped low-end phone happy - __pfn_rpc_mem_deinit(); + if (nullptr != _pfn_rpc_mem_deinit) // make Qualcomm's SoC equipped low-end phone happy + _pfn_rpc_mem_deinit(); if (dlclose(_rpc_lib_handle) != 0) { QNN_LOG_WARN("failed to unload qualcomm's rpc lib, error:%s\n", dlerror()); @@ -1325,6 +1340,8 @@ class qnn_instance { if (error != QNN_SUCCESS) { QNN_LOG_WARN("failed to get qnn device infra\n"); return 1; + } else { + QNN_LOG_INFO("HTP backend perf_infrastructure creation ok\n"); } QnnHtpDevice_Infrastructure_t * htp_infra = static_cast(device_infra); @@ -1333,6 +1350,11 @@ class qnn_instance { uint32_t device_id = 0; uint32_t core_id = 0; htp_perfinfra->createPowerConfigId(device_id, core_id, &power_configid); + if (htp_infra->infraType != QNN_HTP_DEVICE_INFRASTRUCTURE_TYPE_PERF) { + QNN_LOG_INFO("HTP infra type = %d, which is not perf infra type", htp_infra->infraType); + } else { + QNN_LOG_INFO("HTP infra type = %d, which is perf infra type\n", htp_infra->infraType); + } _qnn_htp_perfinfra = htp_perfinfra; _qnn_power_configid = power_configid; @@ -1343,14 +1365,17 @@ class qnn_instance { if (_qnn_rpc_pollingtime > 0) { QnnHtpPerfInfrastructure_PowerConfig_t rpc_pollingTime; memset(&rpc_pollingTime, 0, sizeof(rpc_pollingTime)); - rpc_pollingTime.option = - QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME; + rpc_pollingTime.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME; rpc_pollingTime.rpcPollingTimeConfig = _qnn_rpc_pollingtime; - const QnnHtpPerfInfrastructure_PowerConfig_t * powerConfigs[] = { - &rpc_pollingTime, nullptr}; + + QnnHtpPerfInfrastructure_PowerConfig_t rpc_ControlLatency; + memset(&rpc_ControlLatency, 0, sizeof(rpc_ControlLatency)); + rpc_ControlLatency.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_CONTROL_LATENCY; + rpc_ControlLatency.rpcControlLatencyConfig = 40; + + const QnnHtpPerfInfrastructure_PowerConfig_t * powerConfigs[] = {&rpc_pollingTime, &rpc_ControlLatency, nullptr}; 
if (_qnn_htp_perfinfra) { - _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, - powerConfigs); + _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, powerConfigs); } } return 0; @@ -1426,7 +1451,7 @@ class qnn_instance { } auto allocate_bytes = static_cast(bytes + alignment); - void * buf = __pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, + void * buf = _pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, allocate_bytes); if (buf == nullptr) { QNN_LOG_WARN("failed to allocate rpc memory\n"); @@ -1439,7 +1464,7 @@ class qnn_instance { _rpcmem_store_map.insert(std::pair(aligned_buf, buf)).second; if (!status) { QNN_LOG_WARN("failed to allocate rpc memory\n"); - __pfn_rpc_mem_free(buf); + _pfn_rpc_mem_free(buf); } return aligned_buf; @@ -1451,7 +1476,7 @@ class qnn_instance { } else if (0 == _rpcmem_store_map.count(buf)) { QNN_LOG_WARN("no allocated tensor\n"); } else { - __pfn_rpc_mem_free(_rpcmem_store_map[buf]); + _pfn_rpc_mem_free(_rpcmem_store_map[buf]); _rpcmem_store_map.erase(buf); } } @@ -1461,7 +1486,7 @@ class qnn_instance { if (!is_rpcmem_initialized()) { QNN_LOG_WARN("rpc memory not initialized\n"); } else { - mem_fd = __pfn_rpc_mem_to_fd(buf); + mem_fd = _pfn_rpc_mem_to_fd(buf); } return mem_fd; @@ -1560,7 +1585,7 @@ class qnn_instance { } auto * get_providers = - reinterpret_cast<_pfn_qnnsysteminterface_getproviders *>( + reinterpret_cast( dlsym(_system_lib_handle, "QnnSystemInterface_getProviders")); if (nullptr == get_providers) { QNN_LOG_WARN( @@ -1661,7 +1686,7 @@ class qnn_instance { return 1; } - auto get_providers = load_qnn_functionpointers<_pfn_qnninterface_getproviders *>( + auto get_providers = load_qnn_functionpointers( lib_handle, "QnnInterface_getProviders"); if (nullptr == get_providers) { QNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s", @@ -1805,11 +1830,11 @@ class qnn_instance { void * _rpc_lib_handle = nullptr; std::atomic_bool _rpcmem_initialized{false}; - _pfn_rpc_mem_alloc __pfn_rpc_mem_alloc; - _pfn_rpc_mem_free __pfn_rpc_mem_free; - _pfn_rpc_mem_to_fd __pfn_rpc_mem_to_fd; - _pfn_rpc_mem_init __pfn_rpc_mem_init; - _pfn_rpc_mem_deinit __pfn_rpc_mem_deinit; + pfn_rpc_mem_alloc _pfn_rpc_mem_alloc; + pfn_rpc_mem_free _pfn_rpc_mem_free; + pfn_rpc_mem_to_fd _pfn_rpc_mem_to_fd; + pfn_rpc_mem_init _pfn_rpc_mem_init; + pfn_rpc_mem_deinit _pfn_rpc_mem_deinit; std::unordered_map _rpcmem_store_map; size_t _rpcmem_capacity = 512; @@ -1824,101 +1849,63 @@ class qnn_instance { static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context * ctx, const struct ggml_tensor * tensor, bool b_dump_tensor_info) { - // only support the following 3 OPs currently - // provide a GENERAL approach could fix this problem in a standalone PR of refine ggml backend - // subsystem for mixed inference between CPU&GPU / CPU&NPU easily for ANY ggml backends - // which the backend's ggml_backend_xxx_buffer_is_host return true. 
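The alloc_rpcmem()/free_rpcmem() hunks above use the classic over-allocate-then-align trick: request bytes + alignment from rpcmem, round the returned pointer up, and keep a map from the aligned address back to the raw one so it can be released later. The sketch below shows the same bookkeeping with malloc/free standing in for _pfn_rpc_mem_alloc/_pfn_rpc_mem_free, and it assumes alignment is a power of two.

    #include <cstdint>
    #include <cstdlib>
    #include <unordered_map>

    // map from the aligned pointer handed to callers back to the raw allocation,
    // mirroring the role of _rpcmem_store_map in the patch
    static std::unordered_map<void *, void *> example_aligned_to_raw;

    static void * example_alloc_aligned(size_t bytes, size_t alignment) {
        void * raw = malloc(bytes + alignment);   // the real code calls rpcmem_alloc here
        if (raw == nullptr) {
            return nullptr;
        }
        uintptr_t addr    = reinterpret_cast<uintptr_t>(raw);
        uintptr_t mask    = ~(static_cast<uintptr_t>(alignment) - 1);
        void *    aligned = reinterpret_cast<void *>((addr + alignment - 1) & mask);
        example_aligned_to_raw[aligned] = raw;
        return aligned;
    }

    static void example_free_aligned(void * aligned) {
        auto it = example_aligned_to_raw.find(aligned);
        if (it != example_aligned_to_raw.end()) {
            free(it->second);                     // the real code calls rpcmem_free here
            example_aligned_to_raw.erase(it);
        }
    }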
- // this approach could be found: - // https://github.com/ggerganov/llama.cpp/pull/7641 - bool supported_op = ((tensor->op == GGML_OP_ADD) || (tensor->op == GGML_OP_MUL) - || (tensor->op == GGML_OP_MUL_MAT)); - if (!supported_op) { + if (ggml_is_empty(tensor) || tensor->op == GGML_OP_RESHAPE || + tensor->op == GGML_OP_TRANSPOSE || tensor->op == GGML_OP_VIEW || + tensor->op == GGML_OP_PERMUTE || tensor->op == GGML_OP_NONE) { return false; } const struct ggml_tensor * src0 = tensor->src[0]; const struct ggml_tensor * src1 = tensor->src[1]; + if (nullptr == src0 || nullptr == src1) { + return false; + } + const int64_t ne00 = src0->ne[0]; const int64_t ne01 = src0->ne[1]; const int64_t ne10 = src1->ne[0]; const int64_t ne11 = src1->ne[1]; - const int64_t ne20 = tensor->ne[0]; - const int64_t ne21 = tensor->ne[1]; - - //TODO: support other quantized data type - if (ggml_is_quantized(src0->type)) { - if ((src0->type != GGML_TYPE_Q8_0) && (src0->type != GGML_TYPE_Q4_0)) { - return false; - } - } - - if (b_dump_tensor_info) { - if (tensor->op == GGML_OP_MUL_MAT) { - QNN_LOG_DEBUG("GGML_OP_MUL_MAT"); - QNN_LOG_DEBUG("op name:%s, tensor type:%s", - ggml_op_name(tensor->op), - ggml_type_name(tensor->type)); - QNN_LOG_DEBUG("src0 type:%s", ggml_type_name(tensor->src[0]->type)); - QNN_LOG_DEBUG("src1 type:%s", ggml_type_name(tensor->src[1]->type)); - QNN_LOG_DEBUG("src0 %15s: type = %i (%5s) ne = %5" PRIi64 - " x %5" PRIi64 " x %5" PRIi64 - ", nb = (%5zi, %5zi, %5zi)\n", - src0->name, src0->type, ggml_type_name(src0->type), - src0->ne[0], src0->ne[1], src0->ne[2], src0->nb[0], - src0->nb[1], src0->nb[2]); - QNN_LOG_DEBUG("src1 %15s: type = %i (%5s) ne = %5" PRIi64 - " x %5" PRIi64 " x %5" PRIi64 - ", nb = (%5zi, %5zi, %5zi)\n", - src1->name, src1->type, ggml_type_name(src1->type), - src1->ne[0], src1->ne[1], src1->ne[2], src1->nb[0], - src1->nb[1], src1->nb[2]); - QNN_LOG_DEBUG( - " %15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 - " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - tensor->name, tensor->type, ggml_type_name(tensor->type), - tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->nb[0], - tensor->nb[1], tensor->nb[2]); - } - } - if (ggml_is_empty(tensor) || tensor->op == GGML_OP_RESHAPE || - tensor->op == GGML_OP_TRANSPOSE || tensor->op == GGML_OP_VIEW || - tensor->op == GGML_OP_PERMUTE || tensor->op == GGML_OP_NONE) { + // make qnn_get_ggml_tensor_rank and QNN SDK happy + if (ne00 <= 1 || ne01 <= 1 || ne10 <= 1 || ne11 <= 1) { return false; } - // make qnn_get_ggml_tensor_rank and QNN SDK happy - if ((ne00 <= 1 || ne01 <= 1 || ne10 <= 1 || ne11 <= 1)) { + // TODO: support other GGML OPs using QNN API + // a GENERAL approach could fix this problem in a standalone PR of refine ggml backend + // subsystem for mixed inference between CPU&GPU / CPU&NPU easily for ANY ggml backends + // which the backend's ggml_backend_xxx_buffer_is_host return true. 
+ // this approach could be found: + // https://github.com/ggerganov/llama.cpp/pull/7641 + bool supported_op = false; + supported_op = (tensor->op == GGML_OP_ADD); + supported_op = ((tensor->op == GGML_OP_ADD) || (tensor->op == GGML_OP_MUL) || (tensor->op == GGML_OP_MUL_MAT)); + if (!supported_op) { return false; } - int qtype = src0->type; - if (tensor->op == GGML_OP_ADD) { - return (qtype == GGML_TYPE_F32 || qtype == GGML_TYPE_F16 || - qtype == GGML_TYPE_Q8_0) && - (src1->type == GGML_TYPE_F32); + //TODO: support other quantized data type + if (ggml_is_quantized(src0->type)) { + if (src0->type != GGML_TYPE_Q8_0 && src0->type != GGML_TYPE_Q4_0) { + return false; + } } + int qtype = src0->type; if (tensor->op == GGML_OP_MUL) { return (qtype == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32); } if (tensor->op == GGML_OP_MUL_MAT) { - if (ctx->device == QNN_BACKEND_GGML) { - return (ne00 == ne10) && (src1->ne[2] % src0->ne[2] == 0) && - (src1->ne[3] % src0->ne[3] == 0); - } - if ((ctx->device == QNN_BACKEND_NPU) && (qtype == GGML_TYPE_Q8_0) && - (src1->type == GGML_TYPE_F32) && (tensor->type == GGML_TYPE_F32)) { + if (ne00 <= 32 || ne01 <= 32 || ne10 <= 32 || ne11 <= 32) { + return false; + } else { return true; } - if (ctx->device == QNN_BACKEND_CPU || ctx->device == QNN_BACKEND_GPU) { - return (ne00 == ne10) && (ne00 == ne01); - } - return false; } -} + return true; +} static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -1978,10 +1965,25 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src if (!graph_initialized) { graph_name = graph_name + "_" + std::to_string(ctx->threads) + src0->name + "_" + src1->name; - QNN_LOG_DEBUG("graph name %s", graph_name.c_str()); - error = qnn_raw_interface.graphCreate( - instance->get_qnn_context_handle(), graph_name.c_str(), nullptr, - &graph_handle); + QNN_LOG_INFO("graph name %s", graph_name.c_str()); + if (ctx->device == QNN_BACKEND_NPU) { + QnnHtpGraph_CustomConfig_t custom_config; + custom_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS; + custom_config.numHvxThreads = 8; + + QnnGraph_Config_t graph_config; + graph_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_config.customConfig = &custom_config; + const QnnGraph_Config_t * p_graphconfig[] = {&graph_config, NULL}; + error = qnn_raw_interface.graphCreate( + instance->get_qnn_context_handle(), graph_name.c_str(), p_graphconfig, + &graph_handle); + } else { + error = qnn_raw_interface.graphCreate( + instance->get_qnn_context_handle(), graph_name.c_str(), nullptr, + &graph_handle); + } + if (QNN_SUCCESS != error) { QNN_LOG_INFO("can't create qnn graph handle with graph name %s, " "error = %d\n", @@ -2112,8 +2114,6 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src dst->name, dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], dst->nb[1], dst->nb[2]); - QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], - src0->ne[3]); } QNN_VER_PTR(*tensor_0)->dimensions = tensor_0_dimensions; @@ -2198,7 +2198,7 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, if (!graph_initialized) { graph_name = graph_name + "_" + std::to_string(ctx->threads) + src0->name + "_" + src1->name; - QNN_LOG_DEBUG("graph name %s", graph_name.c_str()); + QNN_LOG_INFO("graph name %s", graph_name.c_str()); error = qnn_raw_interface.graphCreate( instance->get_qnn_context_handle(), graph_name.c_str(), nullptr, 
&graph_handle); @@ -2331,8 +2331,6 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", dst->name, dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], dst->nb[1], dst->nb[2]); - QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], - src0->ne[3]); } QNN_VER_PTR(*tensor_0)->dimensions = tensor_0_dimensions; @@ -2894,7 +2892,6 @@ GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t Qnn_ErrorHandle_t error = QNN_SUCCESS; ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *) buffer->context; - static int idx = 0; char tensor_name[GGML_MAX_NAME] = {0}; snprintf(tensor_name, GGML_MAX_NAME, "tensor_%04d", idx++); @@ -3061,7 +3058,7 @@ GGML_CALL static const char * ggml_backend_qnn_name(ggml_backend_t backend) { GGML_CALL static void ggml_backend_qnn_free(ggml_backend_t backend) { QNN_LOG_INFO("enter %s", __func__); ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; - QNN_LOG_DEBUG("idx %d, name:%s", ctx->device, g_qnn_mgr[ctx->device].name); + QNN_LOG_INFO("idx %d, name:%s", ctx->device, g_qnn_mgr[ctx->device].name); qnn_instance * instance = (qnn_instance *)g_qnn_mgr[ctx->device].instance; if (instance != nullptr) { @@ -3073,7 +3070,7 @@ GGML_CALL static void ggml_backend_qnn_free(ggml_backend_t backend) { auto & graph_item = graph_it->second; Qnn_GraphHandle_t & graph_handle = std::get<0>(graph_item); GGML_UNUSED(graph_handle); - QNN_LOG_DEBUG("graph type:%s", graph_it->first.c_str()); + QNN_LOG_INFO("graph type:%s", graph_it->first.c_str()); } instance->_qnn_graph_map.clear(); @@ -3104,7 +3101,7 @@ GGML_CALL static ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backe params.type = GGML_TASK_TYPE_COMPUTE; params.ith = 0; for (int i = 0; i < cgraph->n_nodes; i++) { - ggml_tensor *node = cgraph->nodes[i]; + ggml_tensor * node = cgraph->nodes[i]; if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { @@ -3213,7 +3210,6 @@ ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t device) { device, GGML_QNN_MAX_DEVICES - 1); return nullptr; } - static ggml_backend_qnn_buffer_type_context ggml_backend_qnn_buffer_type_contexts[GGML_QNN_MAX_DEVICES]; static ggml_backend_buffer_type ggml_backend_qnn_buffer_types[GGML_QNN_MAX_DEVICES]; static bool ggml_backend_qnn_buffer_type_initialized = false; diff --git a/tests/ggml-qnn/ggml-qnn-ut-build-run.sh b/tests/ggml-qnn/ggml-qnn-ut-build-run.sh index 192f2f4bda2f5..4c21be5a41fa2 100755 --- a/tests/ggml-qnn/ggml-qnn-ut-build-run.sh +++ b/tests/ggml-qnn/ggml-qnn-ut-build-run.sh @@ -12,6 +12,8 @@ ANDROID_PLATFORM=android-34 GGML_QNN_UT=ggml-qnn-ut REMOTE_PATH=/data/local/tmp/ +BUILDTYPE=Debug +BUILDTYPE=Release function dump_vars() @@ -70,7 +72,7 @@ function check_and_download_ndk() function build_arm64 { - cmake -H. -B./out/arm64-v8a -DTARGET_NAME=${GGML_QNN_UT} -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=${ANDROID_PLATFORM} -DANDROID_NDK=${ANDROID_NDK} -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake -DQNN_SDK_PATH=${QNN_SDK_PATH} + cmake -H. 
-B./out/arm64-v8a -DTARGET_NAME=${GGML_QNN_UT} -DCMAKE_BUILD_TYPE=${BUILDTYPE} -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=${ANDROID_PLATFORM} -DANDROID_NDK=${ANDROID_NDK} -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake -DQNN_SDK_PATH=${QNN_SDK_PATH} cd ./out/arm64-v8a make @@ -166,9 +168,9 @@ function show_usage() echo "Usage:" echo " $0 build (build Android command line UT program)" echo " $0 updateqnnlibs (upload the latest QNN libs to Android phone)" - echo " $0 GGML_OP_ADD 0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU)" - echo " $0 GGML_OP_MUL 0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU)" - echo " $0 GGML_OP_MUL_MAT 0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU)" + echo " $0 GGML_OP_ADD 0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU) / 3(ggml)" + echo " $0 GGML_OP_MUL 0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU) / 3(ggml)" + echo " $0 GGML_OP_MUL_MAT 0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU) / 3(ggml)" echo -e "\n\n\n" } diff --git a/tests/ggml-qnn/ggml-qnn-ut.cpp b/tests/ggml-qnn/ggml-qnn-ut.cpp index eb072beae6bd4..9af433ceb6690 100644 --- a/tests/ggml-qnn/ggml-qnn-ut.cpp +++ b/tests/ggml-qnn/ggml-qnn-ut.cpp @@ -72,14 +72,12 @@ static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const int len_prefix = snprintf(s_ggml_qnn_log_internal_buf, GGML_QNN_LOGBUF_LEN, "[%s, %d]: ", func, line); int len = vsnprintf(s_ggml_qnn_log_internal_buf + len_prefix, GGML_QNN_LOGBUF_LEN - len_prefix, format, args); if (len < (GGML_QNN_LOGBUF_LEN - len_prefix)) { - //for Android command line application or WoA printf("%s\n", s_ggml_qnn_log_internal_buf); } va_end(args); } } - static const char * get_qnn_backend_name(int n_backend_type) { switch (n_backend_type) { case 0: @@ -95,7 +93,6 @@ static const char * get_qnn_backend_name(int n_backend_type) { } } - static bool ggml_graph_compute_helper( struct ggml_backend * backend, struct ggml_cgraph * graph, @@ -123,26 +120,25 @@ static bool ggml_graph_compute_helper( } #endif - //a new approch of mixed inference if (nullptr != backend) return ggml_backend_graph_compute(backend, graph) == GGML_STATUS_SUCCESS; else return ggml_graph_compute(graph, &plan); } - #define QK8_0 32 + typedef struct { uint16_t d; // delta int8_t qs[QK8_0]; // quants } block_q8_0; - static inline float ggml_compute_fp16_to_fp32(uint16_t h) { __fp16 tmp; memcpy(&tmp, &h, sizeof(uint16_t)); return (float)tmp; } + #define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) static void tensor_dump(const ggml_tensor * tensor, const char * name) { @@ -245,7 +241,6 @@ static void tensor_dump(const ggml_tensor * tensor, const char * name) { } } - static uint32_t get_tensor_rank(const ggml_tensor * tensor) { uint32_t rank = 0; for (int i = 0; i < GGML_MAX_DIMS; i++) { @@ -256,7 +251,6 @@ static uint32_t get_tensor_rank(const ggml_tensor * tensor) { return rank; } - static uint32_t get_tensor_data_size(const ggml_tensor * tensor) { size_t data_size = ggml_row_size(tensor->type, tensor->ne[0]); size_t n_dims = get_tensor_rank(tensor); @@ -270,7 +264,6 @@ static uint32_t get_tensor_data_size(const ggml_tensor * tensor) { return ggml_nbytes(tensor); } - //ref: https://github.com/ggerganov/llama.cpp/blob/master/tests/test-backend-ops.cpp#L20 static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float max = 1.0f) { // static RNG initialization (revisit if n_threads stops being constant) @@ -305,8 +298,11 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m t.join(); } if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_I32) { - 
//ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float)); +#ifdef GGML_USE_QNN memcpy((char*)tensor->data, data.data(), size * sizeof(float)); +#else + ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float)); +#endif } else if (ggml_is_quantized(tensor->type) || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_BF16) { GGML_ASSERT(size % ggml_blck_size(tensor->type) == 0); std::vector dataq(ggml_row_size(tensor->type, size)); @@ -321,18 +317,23 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m } ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size/tensor->ne[0], tensor->ne[0], im); GGML_ASSERT(ggml_validate_row_data(tensor->type, dataq.data(), dataq.size())); - //ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size()); +#ifdef GGML_USE_QNN memcpy((char*)tensor->data, dataq.data(), dataq.size()); +#else + ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size()); +#endif } else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) { // This is going to create some weird integers though. - //ggml_backend_tensor_set(tensor, data.data(), 0, ggml_nbytes(tensor)); +#ifdef GGML_USE_QNN memcpy((char*)tensor->data, data.data(), ggml_nbytes(tensor)); +#else + ggml_backend_tensor_set(tensor, data.data(), 0, ggml_nbytes(tensor)); +#endif } else { GGML_ASSERT(false); } } - //ref: https://github.com/ggerganov/llama.cpp/blob/master/tests/test-backend-ops.cpp#L310 static void initialize_tensors(ggml_context * ctx) { for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { @@ -340,19 +341,17 @@ static void initialize_tensors(ggml_context * ctx) { } } - static void show_usage() { printf(" " \ "\nUsage: test_qnn_ops [options]\n" \ "\n" \ "Options:\n" \ " -t GGML_OP_ADD / GGML_OP_MUL / GGML_OP_MULMAT\n" \ - " -b 0(QNN_CPU) 1(QNN_GPU) 2(QNN_NPU)\n" \ + " -b 0(QNN_CPU) 1(QNN_GPU) 2(QNN_NPU) 3(ggml)\n" \ " ?/h print usage infomation\n\n" ); } - static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) { int64_t n_begin_time = 0LL; int64_t n_end_time = 0LL; @@ -369,16 +368,15 @@ static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) { ggml_backend_t backend = nullptr; ggml_backend_buffer_t buffer= nullptr; - ggml_type qtype = GGML_TYPE_I8; - qtype = GGML_TYPE_F32; + ggml_type qtype = GGML_TYPE_I8; qtype = GGML_TYPE_F16; qtype = GGML_TYPE_Q8_0; + qtype = GGML_TYPE_F32; std::vector work_buffer; QNN_LOG_DEBUG("enter qnn_ggml_op\n"); QNN_LOG_DEBUG("ggml op:%d(%s)\n", n_ggml_op_type, ggml_op_name((enum ggml_op) n_ggml_op_type)); - n_begin_time = ggml_time_us(); srand(time(NULL)); @@ -473,7 +471,6 @@ static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) { initialize_tensors(ctx); } ggml_set_f32(src1, (rand() % 100 + 1)); - //ggml_set_f32(dst, 0.0f); } ggml_graph_compute_helper(backend, gf, work_buffer, num_threads, nullptr, nullptr); @@ -501,13 +498,13 @@ static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) { ggml_free(ctx); ggml_backend_buffer_free(buffer); ggml_backend_free(backend); + n_end_time = ggml_time_us(); n_duration = (n_end_time - n_begin_time) / 1000; QNN_LOG_DEBUG("duration of ut GGML_OP_%s using QNN backend %s: %lld milliseconds\n", ggml_op_name((enum ggml_op)n_ggml_op_type), get_qnn_backend_name(n_backend_type), n_duration); return 0; } - int main(int argc, char * argv[]) { int num_threads = 4; int n_backend_type = 
QNN_BACKEND_CPU; @@ -531,7 +528,7 @@ int main(int argc, char * argv[]) { } else if (0 == strcmp(argv[i], "-b")) { if (i + 1 < argc) { int backend = atoi(argv[i + 1]); - if (backend <= QNN_BACKEND_NPU) + if (backend <= QNN_BACKEND_GGML) n_backend_type = backend; else { show_usage(); @@ -549,5 +546,6 @@ int main(int argc, char * argv[]) { QNN_LOG_DEBUG("backend %d, ggml op:%d(%s)", n_backend_type, n_ggml_op_type, ggml_op_name((enum ggml_op) n_ggml_op_type)); qnn_op_ut(num_threads, n_backend_type, n_ggml_op_type); + return 0; } From faaa86b7e4925c0ea38480cc1b88e1a52097e221 Mon Sep 17 00:00:00 2001 From: "zhou.weiguo" Date: Wed, 12 Jun 2024 16:30:50 +0800 Subject: [PATCH 015/143] ggml-qnn: refine ggml inference using QNN NPU --- ggml-qnn.cpp | 668 ++++++++++++++++++++++++--------- tests/ggml-qnn/CMakeLists.txt | 8 +- tests/ggml-qnn/ggml-qnn-ut.cpp | 3 +- 3 files changed, 507 insertions(+), 172 deletions(-) diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp index 4700e145112d6..f59c54fcacd97 100644 --- a/ggml-qnn.cpp +++ b/ggml-qnn.cpp @@ -1001,12 +1001,10 @@ class qnn_instance { _qnn_interface.set_qnn_interface(_loaded_backend[backend_id]); - _qnn_interface.qnn_log_create(qnn_sdk_logcallback, _qnn_log_level, - &_qnn_log_handle); + _qnn_interface.qnn_log_create(qnn_sdk_logcallback, _qnn_log_level, &_qnn_log_handle); if (nullptr == _qnn_log_handle) { - QNN_LOG_WARN( - "why failed to initialize qnn log\n"); // NPU backend not work on - // Qualcomm SoC equipped low-end phone + // NPU backend not work on Qualcomm SoC equipped low-end phone + QNN_LOG_WARN("why failed to initialize qnn log\n"); return 4; } else { QNN_LOG_DEBUG("initialize qnn log successfully\n"); @@ -1025,23 +1023,62 @@ class qnn_instance { } if (nullptr != _qnn_raw_interface.propertyHasCapability) { - auto qnnStatus = + Qnn_ErrorHandle_t qnn_status = _qnn_raw_interface.propertyHasCapability(QNN_PROPERTY_GROUP_DEVICE); - if (QNN_PROPERTY_NOT_SUPPORTED == qnnStatus) { + if (QNN_PROPERTY_NOT_SUPPORTED == qnn_status) { QNN_LOG_WARN("device property is not supported\n"); } - if (QNN_PROPERTY_ERROR_UNKNOWN_KEY == qnnStatus) { + if (QNN_PROPERTY_ERROR_UNKNOWN_KEY == qnn_status) { QNN_LOG_WARN("device property is not known to backend\n"); } } - Qnn_ErrorHandle_t qnn_status = _qnn_raw_interface.deviceCreate(_qnn_log_handle, nullptr, - &_qnn_device_handle); + Qnn_ErrorHandle_t qnn_status = QNN_SUCCESS; + if (_backend_name.find("Htp") != std::variant_npos) { + const QnnDevice_PlatformInfo_t * p_info = nullptr; + _qnn_raw_interface.deviceGetPlatformInfo(nullptr, &p_info); + QNN_LOG_INFO("device counts %d", p_info->v1.numHwDevices); + QnnDevice_HardwareDeviceInfo_t * infos = p_info->v1.hwDevices; + QnnHtpDevice_OnChipDeviceInfoExtension_t chipinfo = { }; + for (int i = 0; i < p_info->v1.numHwDevices; i++) { + QNN_LOG_INFO("deviceID:%d, deviceType:%d, numCores %d", infos[i].v1.deviceId, + infos[i].v1.deviceType, infos[i].v1.numCores); + QnnDevice_DeviceInfoExtension_t devinfo = infos[i].v1.deviceInfoExtension; + chipinfo = devinfo->onChipDevice; + QnnHtpDevice_Arch_t htp_arch = chipinfo.arch; + QNN_LOG_INFO("htp_type:%d(%s)", devinfo->devType, (devinfo->devType == QNN_HTP_DEVICE_TYPE_ON_CHIP) ? 
"ON_CHIP" : ""); + QNN_LOG_INFO("qualcomm soc_model:%d(%s), htp_arch:%d(%s), vtcm_size:%d MB", \ + chipinfo.socModel, qnn_get_chipset_desc(chipinfo.socModel), \ + htp_arch, qnn_get_htparch_desc(htp_arch), chipinfo.vtcmSize); + g_qnn_mgr[QNN_BACKEND_NPU].socinfo = { chipinfo.socModel, htp_arch, chipinfo.vtcmSize }; + } + _qnn_raw_interface.deviceFreePlatformInfo(nullptr, p_info); + + QnnHtpDevice_CustomConfig_t soc_customconfig; + soc_customconfig.option = QNN_HTP_DEVICE_CONFIG_OPTION_SOC; + soc_customconfig.socModel = chipinfo.socModel; + QnnDevice_Config_t soc_devconfig; + soc_devconfig.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; + soc_devconfig.customConfig = &soc_customconfig; + + QnnHtpDevice_CustomConfig_t arch_customconfig; + arch_customconfig.option = QNN_HTP_DEVICE_CONFIG_OPTION_ARCH; + arch_customconfig.arch.arch = chipinfo.arch; + arch_customconfig.arch.deviceId = 0; // Id of device to be used. If single device is used by default 0. + QnnDevice_Config_t arch_devconfig; + arch_devconfig.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; + arch_devconfig.customConfig = &arch_customconfig; + + const QnnDevice_Config_t * p_deviceconfig[] = {&soc_devconfig, &arch_devconfig, NULL}; + qnn_status = _qnn_raw_interface.deviceCreate(_qnn_log_handle, p_deviceconfig, &_qnn_device_handle); + } else { + qnn_status = _qnn_raw_interface.deviceCreate(_qnn_log_handle, nullptr, &_qnn_device_handle); + } if (QNN_SUCCESS != qnn_status && QNN_DEVICE_ERROR_UNSUPPORTED_FEATURE != qnn_status) { QNN_LOG_WARN("failed to create QNN device\n"); } else { - QNN_LOG_INFO("create device successfully\n"); + QNN_LOG_INFO("create QNN device successfully\n"); } if (qnn_sdk_profile_level::profile_off != _profile_level) { @@ -1096,9 +1133,9 @@ class qnn_instance { return 9; } - if (nullptr != - _pfn_rpc_mem_init) // make Qualcomm's SoC equipped low-end phone happy + if (nullptr != _pfn_rpc_mem_init) { // make Qualcomm's SoC equipped low-end phone happy _pfn_rpc_mem_init(); + } std::vector temp_context_config; _qnn_interface.qnn_context_create( @@ -1113,32 +1150,14 @@ class qnn_instance { } if (_backend_name.find("Htp") != std::variant_npos) { - const QnnDevice_PlatformInfo_t * p_info = nullptr; - _qnn_raw_interface.deviceGetPlatformInfo(nullptr, &p_info); - QNN_LOG_INFO("device counts %d", p_info->v1.numHwDevices); - QnnDevice_HardwareDeviceInfo_t * infos = p_info->v1.hwDevices; - for (int i = 0; i < p_info->v1.numHwDevices; i++) { - QNN_LOG_INFO("deviceID:%d, deviceType:%d, numCores %d", infos[i].v1.deviceId, - infos[i].v1.deviceType, infos[i].v1.numCores); - QnnDevice_DeviceInfoExtension_t devinfo = infos[i].v1.deviceInfoExtension; - QnnHtpDevice_OnChipDeviceInfoExtension_t chipinfo = devinfo->onChipDevice; - QnnHtpDevice_Arch_t htp_arch = chipinfo.arch; - QNN_LOG_INFO("htp_type:%d(%s)", devinfo->devType, (devinfo->devType == QNN_HTP_DEVICE_TYPE_ON_CHIP) ? 
"ON_CHIP" : ""); - QNN_LOG_INFO("qualcomm soc_model:%d(%s), htp_arch:%d(%s), vtcm_size:%d MB", \ - chipinfo.socModel, qnn_get_chipset_desc(chipinfo.socModel), \ - htp_arch, qnn_get_htparch_desc(htp_arch), chipinfo.vtcmSize); - g_qnn_mgr[QNN_BACKEND_NPU].socinfo = { chipinfo.socModel, htp_arch, chipinfo.vtcmSize }; - } - _qnn_raw_interface.deviceFreePlatformInfo(nullptr, p_info); - //TODO: faster approach to probe the accurate capacity of rpc ion memory size_t candidate_size = 0; uint8_t * rpc_buffer = nullptr; - const int SIZE_IN_MB = (1 << 20); + const int size_in_mb = (1 << 20); size_t probe_slots[] = {1024, 1536, 2048 - 48, 2048}; size_t probe_counts = sizeof(probe_slots) / sizeof(size_t); for (size_t idx = 0; idx < probe_counts; idx++) { - rpc_buffer = static_cast(alloc_rpcmem(probe_slots[idx] * SIZE_IN_MB, 4)); + rpc_buffer = static_cast(alloc_rpcmem(probe_slots[idx] * size_in_mb, 4)); if (nullptr == rpc_buffer) { QNN_LOG_INFO("alloc rpcmem %d (MB) failure, %s\n", probe_slots[idx], strerror(errno)); break; @@ -1150,7 +1169,7 @@ class qnn_instance { } if (candidate_size > _rpcmem_capacity) _rpcmem_capacity = candidate_size; - QNN_LOG_INFO("capacity of rpc ion memory %d MB\n", _rpcmem_capacity); + QNN_LOG_INFO("capacity of QNN rpc ion memory is about %d MB\n", _rpcmem_capacity); if (0 != init_htp_perfinfra()) { QNN_LOG_WARN("initialize HTP performance failure"); @@ -1181,6 +1200,10 @@ class qnn_instance { QNN_LOG_DEBUG("succeed to close rpcmem lib\n"); } + if (_backend_name.find("Htp") != std::variant_npos) { + _qnn_htp_perfinfra->destroyPowerConfigId(_qnn_power_configid); + } + if (nullptr != _qnn_context_handle) { error = _qnn_interface.qnn_context_free(_qnn_context_handle, _qnn_profile_handle); @@ -1239,6 +1262,9 @@ class qnn_instance { return ret_status; } + //keep it for further usage of offload the entire cgraph to a single QNN DAG directly + //which was used in Qualcomm's dedicated AI technology +#if 0 int init_qnn_graph(const char * graph_name, bool debug, uint8_t do_node_validation = true, const QnnGraph_Config_t ** graph_configs = nullptr) { @@ -1288,6 +1314,7 @@ class qnn_instance { return 0; } +#endif const qnn_interface & get_qnn_interface() { if (!_qnn_interface.is_loaded()) { @@ -1362,70 +1389,86 @@ class qnn_instance { } int set_rpc_polling() { - if (_qnn_rpc_pollingtime > 0) { - QnnHtpPerfInfrastructure_PowerConfig_t rpc_pollingTime; - memset(&rpc_pollingTime, 0, sizeof(rpc_pollingTime)); - rpc_pollingTime.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME; - rpc_pollingTime.rpcPollingTimeConfig = _qnn_rpc_pollingtime; - - QnnHtpPerfInfrastructure_PowerConfig_t rpc_ControlLatency; - memset(&rpc_ControlLatency, 0, sizeof(rpc_ControlLatency)); - rpc_ControlLatency.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_CONTROL_LATENCY; - rpc_ControlLatency.rpcControlLatencyConfig = 40; - - const QnnHtpPerfInfrastructure_PowerConfig_t * powerConfigs[] = {&rpc_pollingTime, &rpc_ControlLatency, nullptr}; - if (_qnn_htp_perfinfra) { - _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, powerConfigs); + if (_qnn_htp_perfinfra) { + QnnHtpPerfInfrastructure_PowerConfig_t rpc_polling_time; + memset(&rpc_polling_time, 0, sizeof(rpc_polling_time)); + rpc_polling_time.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME; + //use rpc polling time recommended 0-10000 us + rpc_polling_time.rpcPollingTimeConfig = 9999; + + QnnHtpPerfInfrastructure_PowerConfig_t rpc_control_latency; + memset(&rpc_control_latency, 0, 
sizeof(rpc_control_latency)); + rpc_control_latency.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_CONTROL_LATENCY; + //use rpc control latency recommended 100 us, refer hexagon sdk + rpc_control_latency.rpcControlLatencyConfig = 100; + + const QnnHtpPerfInfrastructure_PowerConfig_t * power_configs[] = { + &rpc_polling_time, + &rpc_control_latency, + nullptr}; + Qnn_ErrorHandle_t qnn_status = _qnn_htp_perfinfra->setPowerConfig( + _qnn_power_configid, + power_configs); + if (qnn_status != QNN_SUCCESS) { + QNN_LOG_WARN("set htp perf failed\n"); + } else { + QNN_LOG_INFO("set htp perf ok\n"); } + } else { + QNN_LOG_WARN("can't set htp perf\n"); } + return 0; } int set_high_performance_mode() { if (nullptr == _qnn_htp_perfinfra) { - QNN_LOG_DEBUG("perf intra is null\n"); + QNN_LOG_WARN("perf intra is null\n"); return 1; } - QnnHtpPerfInfrastructure_PowerConfig_t powerConfig; - memset(&powerConfig, 0, sizeof(powerConfig)); - powerConfig.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_V3; - powerConfig.dcvsV3Config.dcvsEnable = 0; - powerConfig.dcvsV3Config.setDcvsEnable = 1; - powerConfig.dcvsV3Config.contextId = _qnn_power_configid; - powerConfig.dcvsV3Config.powerMode = QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_PERFORMANCE_MODE; - powerConfig.dcvsV3Config.setSleepLatency = - 1; // true to consider Latency parameter otherwise False - powerConfig.dcvsV3Config.setBusParams = - 1; // true to consider Bus parameter otherwise False - powerConfig.dcvsV3Config.setCoreParams = - 1; // true to consider Core parameter otherwise False - powerConfig.dcvsV3Config.sleepDisable = - 0; // true to consider sleep/LPM modes, False to enable - powerConfig.dcvsV3Config.setSleepDisable = - 0; // true to consider sleep disable/enable parameter otherwise False set sleep latency parameter - uint32_t latencyValue = 40; - powerConfig.dcvsV3Config.sleepLatency = - latencyValue; // range 40-2000 micro sec + QnnHtpPerfInfrastructure_PowerConfig_t power_config; + memset(&power_config, 0, sizeof(power_config)); + power_config.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_V3; + power_config.dcvsV3Config.dcvsEnable = 0; + power_config.dcvsV3Config.setDcvsEnable = 1; + power_config.dcvsV3Config.contextId = _qnn_power_configid; + power_config.dcvsV3Config.powerMode = QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_PERFORMANCE_MODE; + power_config.dcvsV3Config.setSleepLatency = + 1; // true to consider Latency parameter otherwise false + power_config.dcvsV3Config.sleepLatency = 10; + power_config.dcvsV3Config.setBusParams = + 1; // true to consider Bus parameter otherwise false + power_config.dcvsV3Config.setCoreParams = + 1; // true to consider Core parameter otherwise false + power_config.dcvsV3Config.sleepDisable = + 1; // true to consider sleep/LPM modes, false to enable + power_config.dcvsV3Config.setSleepDisable = + 1; // true to consider sleep disable/enable parameter otherwise false set sleep latency parameter // set Bus Clock Parameters - powerConfig.dcvsV3Config.busVoltageCornerMin = + power_config.dcvsV3Config.busVoltageCornerMin = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - powerConfig.dcvsV3Config.busVoltageCornerTarget = + power_config.dcvsV3Config.busVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - powerConfig.dcvsV3Config.busVoltageCornerMax = + power_config.dcvsV3Config.busVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; // set Core Clock Parameters - powerConfig.dcvsV3Config.coreVoltageCornerMin = + power_config.dcvsV3Config.coreVoltageCornerMin = 
DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - powerConfig.dcvsV3Config.coreVoltageCornerTarget = + power_config.dcvsV3Config.coreVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - powerConfig.dcvsV3Config.coreVoltageCornerMax = + power_config.dcvsV3Config.coreVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; // set power config with different performance parameters - const QnnHtpPerfInfrastructure_PowerConfig_t *powerConfigs[] = { - &powerConfig, nullptr}; - - _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, powerConfigs); + const QnnHtpPerfInfrastructure_PowerConfig_t * power_configs[] = { + &power_config, nullptr}; + Qnn_ErrorHandle_t qnn_status = QNN_SUCCESS; + qnn_status = _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, power_configs); + if (qnn_status != QNN_SUCCESS) { + QNN_LOG_WARN("set htp high performance mode failed\n"); + } else { + QNN_LOG_INFO("set htp high performance mode ok\n"); + } return 0; } @@ -1505,7 +1548,7 @@ class qnn_instance { if (is_rpcmem_allocated(p_data)) { QNN_LOG_WARN("rpc memory already allocated\n"); - // return 3; + return 3; } if (is_rpcmem_registered((QNN_VER_PTR(*p_tensor)->memHandle))) { QNN_LOG_WARN("tensor %s has been registered shared memory\n", @@ -1518,7 +1561,7 @@ class qnn_instance { QNN_LOG_WARN("failed to get file descriptor\n"); return 5; } - QNN_LOG_DEBUG("mem_fd %d\n", mem_fd); + QNN_LOG_INFO("mem_fd %d\n", mem_fd); Qnn_MemDescriptor_t descriptor = {{QNN_VER_PTR(*p_tensor)->rank, QNN_VER_PTR(*p_tensor)->dimensions, nullptr}, @@ -1538,11 +1581,24 @@ class qnn_instance { (QNN_VER_PTR(*p_tensor)->name)); } QNN_VER_PTR(*p_tensor)->memHandle = handle; - _qnn_mem_set.insert(handle); + _qnn_mem_set.insert((std::pair(p_data, handle))); return 0; } + void * get_rpcmem_from_memhandle(Qnn_MemHandle_t mem_handle) { + for (std::unordered_map::iterator it = _qnn_mem_set.begin(); + it != _qnn_mem_set.end(); + it++) { + Qnn_MemHandle_t mem_handle = it->second; + if (it->second == mem_handle) { + return it->first; + } + } + QNN_LOG_WARN("can't find rpcmem from qnn mem handle %p", mem_handle); + return nullptr; + } + void unregister_rpcmem() { Qnn_ErrorHandle_t error = QNN_SUCCESS; @@ -1550,7 +1606,10 @@ class qnn_instance { QNN_LOG_WARN("no rpcmem registered\n"); } - for (auto & mem_handle : _qnn_mem_set) { + for (std::unordered_map::iterator it = _qnn_mem_set.begin(); + it != _qnn_mem_set.end(); + it++) { + Qnn_MemHandle_t mem_handle = it->second; error = _qnn_interface.qnn_mem_de_register(&mem_handle, 1); if (error != QNN_SUCCESS) { QNN_LOG_WARN("failed to unregister shared memory, error %d\n", @@ -1561,7 +1620,7 @@ class qnn_instance { } bool is_rpcmem_allocated(void * buf) { - return _rpcmem_store_map.count(buf) != 0U; + return _qnn_mem_set.count(buf) != 0U; } @@ -1686,8 +1745,9 @@ class qnn_instance { return 1; } - auto get_providers = load_qnn_functionpointers( - lib_handle, "QnnInterface_getProviders"); + auto get_providers = + load_qnn_functionpointers( + lib_handle, "QnnInterface_getProviders"); if (nullptr == get_providers) { QNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s", dlerror()); @@ -1786,7 +1846,7 @@ class qnn_instance { private: std::string _lib_path; std::string _backend_name; - std::string _model_name; // prebuilt QNN model name, not used currently + std::string _model_name; // Qualcomm's dedicated prebuilt model name, keep it for further usage BackendIdType _backend_id; bool _debug_tensor = false; @@ -1816,12 +1876,11 @@ class qnn_instance { QnnHtpDevice_PerfInfrastructure_t * 
_qnn_htp_perfinfra = nullptr; uint32_t _qnn_power_configid = 1; - uint32_t _qnn_rpc_pollingtime = 9999; // 0-10000 us for high performing QNN_INTERFACE_VER_TYPE _qnn_raw_interface; QNN_SYSTEM_INTERFACE_VER_TYPE _qnn_raw_system_interface; - std::unordered_set _qnn_mem_set; + std::unordered_map _qnn_mem_set; std::mutex _init_mutex; std::unordered_map _loaded_lib_handle; @@ -1898,9 +1957,8 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context * ctx, if (tensor->op == GGML_OP_MUL_MAT) { if (ne00 <= 32 || ne01 <= 32 || ne10 <= 32 || ne11 <= 32) { - return false; - } else { - return true; + //make mul_mat with QNN RPC happy + //return false; } } @@ -1964,17 +2022,29 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src if (!graph_initialized) { graph_name = graph_name + "_" + std::to_string(ctx->threads) + - src0->name + "_" + src1->name; + "_" + src0->name + "_" + src1->name; QNN_LOG_INFO("graph name %s", graph_name.c_str()); if (ctx->device == QNN_BACKEND_NPU) { - QnnHtpGraph_CustomConfig_t custom_config; - custom_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS; - custom_config.numHvxThreads = 8; - - QnnGraph_Config_t graph_config; - graph_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_config.customConfig = &custom_config; - const QnnGraph_Config_t * p_graphconfig[] = {&graph_config, NULL}; + QnnHtpGraph_CustomConfig_t hvx_config; + hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS; + hvx_config.numHvxThreads = 8; + QnnGraph_Config_t graph_hvx_config; + graph_hvx_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_hvx_config.customConfig = &hvx_config; + + QnnHtpGraph_CustomConfig_t dlbc_config; + dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; + /* + dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC; + dlbc_config.optimizationOption.floatValue = 1; // set to 0 to turn off DLBC + */ + dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; + + QnnGraph_Config_t graph_dlbc_config; + graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_dlbc_config.customConfig = &dlbc_config; + + const QnnGraph_Config_t * p_graphconfig[] = {&graph_hvx_config, &graph_dlbc_config, NULL}; error = qnn_raw_interface.graphCreate( instance->get_qnn_context_handle(), graph_name.c_str(), p_graphconfig, &graph_handle); @@ -1989,7 +2059,21 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src "error = %d\n", graph_name.c_str(), error); goto failure; + } else { + QNN_LOG_INFO("create qnn graph handle with graph name %s ok\n", graph_name.c_str()); } + + if (ctx->device == QNN_BACKEND_NPU) { + QNN_VER_PTR(*tensor_0)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; + QNN_VER_PTR(*tensor_0)->clientBuf= {.data=nullptr, .dataSize=0}; + + QNN_VER_PTR(*tensor_1)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; + QNN_VER_PTR(*tensor_1)->clientBuf= {.data=nullptr, .dataSize=0}; + + QNN_VER_PTR(*tensor_2)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; + QNN_VER_PTR(*tensor_2)->clientBuf= {.data=nullptr, .dataSize=0}; + } + error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_0); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); @@ -2006,13 +2090,6 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src goto failure; } - QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, - qnn_get_ggml_tensor_data_size(src0)}; - QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, - 
qnn_get_ggml_tensor_data_size(src1)}; - QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, - qnn_get_ggml_tensor_data_size(dst)}; - QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; QNN_VER_PTR(*tensor_0)->rank = qnn_get_ggml_tensor_rank(src0); QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; @@ -2023,6 +2100,46 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src QNN_VER_PTR(*tensor_2)->rank = qnn_get_ggml_tensor_rank(dst); QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; + if (ctx->device != QNN_BACKEND_NPU) { + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, + qnn_get_ggml_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, + qnn_get_ggml_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, + qnn_get_ggml_tensor_data_size(dst)}; + } else { + uint8_t * qnn_buffer_0 = nullptr; + uint8_t * qnn_buffer_1 = nullptr; + uint8_t * qnn_buffer_2 = nullptr; + qnn_instance * instance = ctx->instance; + + qnn_buffer_0 = static_cast(instance->alloc_rpcmem(ggml_nbytes(src0), 4)); + if (nullptr == qnn_buffer_0) { + QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); + } else { + QNN_LOG_INFO("alloc rpcmem successfully\n"); + } + instance->register_rpcmem(qnn_buffer_0, tensor_0); + memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0)); + + qnn_buffer_1 = static_cast(instance->alloc_rpcmem(ggml_nbytes(src1), 4)); + if (nullptr == qnn_buffer_1) { + QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); + } else { + QNN_LOG_INFO("alloc rpcmem successfully\n"); + } + instance->register_rpcmem(qnn_buffer_1, tensor_1); + memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1)); + + qnn_buffer_2 = static_cast(instance->alloc_rpcmem(ggml_nbytes(dst), 4)); + if (nullptr == qnn_buffer_2) { + QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); + } else { + QNN_LOG_INFO("alloc rpcmem successfully\n"); + } + instance->register_rpcmem(qnn_buffer_2, tensor_2); + } + Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1}; Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; Qnn_OpConfig_t op_config = { @@ -2048,6 +2165,12 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src QNN_LOG_INFO("error = %d\n", error); goto failure; } + + if (ctx->device == QNN_BACKEND_NPU) { + uint8_t * qnn_buffer_2 = static_cast(ctx->instance->get_rpcmem_from_memhandle( + QNN_VER_PTR(*tensor_2)->memHandle)); + memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst)); + } auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2); instance->_qnn_graph_map[map_entry] = graph_item; } else { @@ -2067,13 +2190,6 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; - QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, - qnn_get_ggml_tensor_data_size(src0)}; - QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, - qnn_get_ggml_tensor_data_size(src1)}; - QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, - qnn_get_ggml_tensor_data_size(dst)}; - QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; QNN_VER_PTR(*tensor_0)->rank = qnn_get_ggml_tensor_rank(src0); QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; @@ -2084,6 +2200,25 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src QNN_VER_PTR(*tensor_2)->rank = qnn_get_ggml_tensor_rank(dst); QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; + if (ctx->device != QNN_BACKEND_NPU) { + QNN_VER_PTR(*tensor_0)->clientBuf = 
{src0->data, + qnn_get_ggml_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, + qnn_get_ggml_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, + qnn_get_ggml_tensor_data_size(dst)}; + } else { + uint8_t * qnn_buffer_0 = static_cast(ctx->instance->get_rpcmem_from_memhandle( + QNN_VER_PTR(*tensor_0)->memHandle)); + if (nullptr != qnn_buffer_0) + memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0)); + + uint8_t * qnn_buffer_1 = static_cast(ctx->instance->get_rpcmem_from_memhandle( + QNN_VER_PTR(*tensor_1)->memHandle)); + if (nullptr != qnn_buffer_1) + memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1)); + } + Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1}; Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs,2, @@ -2093,7 +2228,15 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src QNN_LOG_INFO("error = %d\n", error); goto failure; } + + if (ctx->device == QNN_BACKEND_NPU) { + uint8_t * qnn_buffer_2 = static_cast(ctx->instance->get_rpcmem_from_memhandle( + QNN_VER_PTR(*tensor_2)->memHandle)); + if (nullptr != qnn_buffer_2) + memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst)); + } } + failure: if (QNN_SUCCESS != error) { QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(*tensor_0)); @@ -2197,17 +2340,55 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, if (!graph_initialized) { graph_name = graph_name + "_" + std::to_string(ctx->threads) + - src0->name + "_" + src1->name; + "_" + src0->name + "_" + src1->name; QNN_LOG_INFO("graph name %s", graph_name.c_str()); - error = qnn_raw_interface.graphCreate( - instance->get_qnn_context_handle(), graph_name.c_str(), nullptr, - &graph_handle); + if (ctx->device == QNN_BACKEND_NPU) { + QnnHtpGraph_CustomConfig_t hvx_config; + hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS; + hvx_config.numHvxThreads = 8; + QnnGraph_Config_t graph_hvx_config; + graph_hvx_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_hvx_config.customConfig = &hvx_config; + + QnnHtpGraph_CustomConfig_t dlbc_config; + dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; + /* + dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC; + dlbc_config.optimizationOption.floatValue = 1; // set to 0 to turn off DLBC + */ + dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; + + QnnGraph_Config_t graph_dlbc_config; + graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_dlbc_config.customConfig = &dlbc_config; + + const QnnGraph_Config_t * p_graphconfig[] = {&graph_hvx_config, &graph_dlbc_config, NULL}; + error = qnn_raw_interface.graphCreate( + instance->get_qnn_context_handle(), graph_name.c_str(), p_graphconfig, + &graph_handle); + } else { + error = qnn_raw_interface.graphCreate( + instance->get_qnn_context_handle(), graph_name.c_str(), nullptr, + &graph_handle); + } if (QNN_SUCCESS != error) { QNN_LOG_INFO("can't create qnn graph handle with graph name %s, " "error = %d\n", graph_name.c_str(), error); goto failure; } + + if (ctx->device == QNN_BACKEND_NPU) { + QNN_VER_PTR(*tensor_0)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; + QNN_VER_PTR(*tensor_0)->clientBuf= {.data=nullptr, .dataSize=0}; + + QNN_VER_PTR(*tensor_1)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; + QNN_VER_PTR(*tensor_1)->clientBuf= {.data=nullptr, .dataSize=0}; + + QNN_VER_PTR(*tensor_2)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; + 
QNN_VER_PTR(*tensor_2)->clientBuf= {.data=nullptr, .dataSize=0}; + } + error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_0); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); @@ -2224,13 +2405,6 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, goto failure; } - QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, - qnn_get_ggml_tensor_data_size(src0)}; - QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, - qnn_get_ggml_tensor_data_size(src1)}; - QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, - qnn_get_ggml_tensor_data_size(dst)}; - QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; QNN_VER_PTR(*tensor_0)->rank = qnn_get_ggml_tensor_rank(src0); QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; @@ -2241,6 +2415,46 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, QNN_VER_PTR(*tensor_2)->rank = qnn_get_ggml_tensor_rank(dst); QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; + if (ctx->device != QNN_BACKEND_NPU) { + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, + qnn_get_ggml_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, + qnn_get_ggml_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, + qnn_get_ggml_tensor_data_size(dst)}; + } else { + uint8_t * qnn_buffer_0 = nullptr; + uint8_t * qnn_buffer_1 = nullptr; + uint8_t * qnn_buffer_2 = nullptr; + qnn_instance * instance = ctx->instance; + + qnn_buffer_0 = static_cast(instance->alloc_rpcmem(ggml_nbytes(src0), 4)); + if (nullptr == qnn_buffer_0) { + QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); + } else { + QNN_LOG_INFO("alloc rpcmem successfully\n"); + } + instance->register_rpcmem(qnn_buffer_0, tensor_0); + memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0)); + + qnn_buffer_1 = static_cast(instance->alloc_rpcmem(ggml_nbytes(src1), 4)); + if (nullptr == qnn_buffer_1) { + QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); + } else { + QNN_LOG_INFO("alloc rpcmem successfully\n"); + } + instance->register_rpcmem(qnn_buffer_1, tensor_1); + memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1)); + + qnn_buffer_2 = static_cast(instance->alloc_rpcmem(ggml_nbytes(dst), 4)); + if (nullptr == qnn_buffer_2) { + QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); + } else { + QNN_LOG_INFO("alloc rpcmem successfully\n"); + } + instance->register_rpcmem(qnn_buffer_2, tensor_2); + } + Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1}; Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; Qnn_OpConfig_t op_config = {(Qnn_OpConfigVersion_t) 1, @@ -2266,6 +2480,13 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, QNN_LOG_INFO("error = %d\n", error); goto failure; } + + if (ctx->device == QNN_BACKEND_NPU) { + uint8_t * qnn_buffer_2 = static_cast(ctx->instance->get_rpcmem_from_memhandle( + QNN_VER_PTR(*tensor_2)->memHandle)); + memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst)); + } + auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2); instance->_qnn_graph_map[map_entry] = graph_item; } else { @@ -2294,12 +2515,24 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, QNN_VER_PTR(*tensor_2)->rank = qnn_get_ggml_tensor_rank(dst); QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; - QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, - qnn_get_ggml_tensor_data_size(src0)}; - QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, - qnn_get_ggml_tensor_data_size(src1)}; - QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, - qnn_get_ggml_tensor_data_size(dst)}; + if (ctx->device != 
QNN_BACKEND_NPU) { + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, + qnn_get_ggml_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, + qnn_get_ggml_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, + qnn_get_ggml_tensor_data_size(dst)}; + } else { + uint8_t * qnn_buffer_0 = static_cast(ctx->instance->get_rpcmem_from_memhandle( + QNN_VER_PTR(*tensor_0)->memHandle)); + if (nullptr != qnn_buffer_0) + memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0)); + + uint8_t * qnn_buffer_1 = static_cast(ctx->instance->get_rpcmem_from_memhandle( + QNN_VER_PTR(*tensor_1)->memHandle)); + if (nullptr != qnn_buffer_1) + memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1)); + } Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1}; Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; @@ -2311,7 +2544,15 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, QNN_LOG_INFO("error = %d\n", error); goto failure; } + + if (ctx->device == QNN_BACKEND_NPU) { + uint8_t * qnn_buffer_2 = static_cast(ctx->instance->get_rpcmem_from_memhandle( + QNN_VER_PTR(*tensor_2)->memHandle)); + if (nullptr != qnn_buffer_2) + memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst)); + } } + failure: if (QNN_SUCCESS != error) { QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(*tensor_0)); @@ -2428,6 +2669,17 @@ static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx, goto failure; } + if (ctx->device == QNN_BACKEND_NPU) { + QNN_VER_PTR(*tensor_0)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; + QNN_VER_PTR(*tensor_0)->clientBuf= {.data=nullptr, .dataSize=0}; + + QNN_VER_PTR(*tensor_1)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; + QNN_VER_PTR(*tensor_1)->clientBuf= {.data=nullptr, .dataSize=0}; + + QNN_VER_PTR(*tensor_2)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; + QNN_VER_PTR(*tensor_2)->clientBuf= {.data=nullptr, .dataSize=0}; + } + error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_0); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); @@ -2444,13 +2696,6 @@ static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx, goto failure; } - QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, - qnn_get_ggml_tensor_data_size(src0)}; - QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, - qnn_get_ggml_tensor_data_size(src1)}; - QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, - qnn_get_ggml_tensor_data_size(dst)}; - QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; QNN_VER_PTR(*tensor_0)->rank = qnn_get_ggml_tensor_rank(src0); QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; @@ -2461,6 +2706,46 @@ static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx, QNN_VER_PTR(*tensor_2)->rank = qnn_get_ggml_tensor_rank(dst); QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; + if (ctx->device != QNN_BACKEND_NPU) { + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, + qnn_get_ggml_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, + qnn_get_ggml_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, + qnn_get_ggml_tensor_data_size(dst)}; + } else { + uint8_t * qnn_buffer_0 = nullptr; + uint8_t * qnn_buffer_1 = nullptr; + uint8_t * qnn_buffer_2 = nullptr; + qnn_instance * instance = ctx->instance; + + qnn_buffer_0 = static_cast(instance->alloc_rpcmem(ggml_nbytes(src0), 4)); + if (nullptr == qnn_buffer_0) { + QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); + } else { + QNN_LOG_INFO("alloc rpcmem successfully\n"); + } + instance->register_rpcmem(qnn_buffer_0, tensor_0); + memcpy(qnn_buffer_0, src0->data, 
ggml_nbytes(src0)); + + qnn_buffer_1 = static_cast(instance->alloc_rpcmem(ggml_nbytes(src1), 4)); + if (nullptr == qnn_buffer_1) { + QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); + } else { + QNN_LOG_INFO("alloc rpcmem successfully\n"); + } + instance->register_rpcmem(qnn_buffer_1, tensor_1); + memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1)); + + qnn_buffer_2 = static_cast(instance->alloc_rpcmem(ggml_nbytes(dst), 4)); + if (nullptr == qnn_buffer_2) { + QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); + } else { + QNN_LOG_INFO("alloc rpcmem successfully\n"); + } + instance->register_rpcmem(qnn_buffer_2, tensor_2); + } + Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1}; Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; Qnn_OpConfig_t op_config = {(Qnn_OpConfigVersion_t) 1, @@ -2486,6 +2771,13 @@ static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx, QNN_LOG_INFO("error = %d\n", error); goto failure; } + + if (ctx->device == QNN_BACKEND_NPU) { + uint8_t * qnn_buffer_2 = static_cast(ctx->instance->get_rpcmem_from_memhandle( + QNN_VER_PTR(*tensor_2)->memHandle)); + memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst)); + } + auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2); instance->_qnn_graph_map[map_entry] = graph_item; } else { @@ -2514,17 +2806,28 @@ static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx, QNN_VER_PTR(*tensor_2)->rank = qnn_get_ggml_tensor_rank(dst); QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; - QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, - qnn_get_ggml_tensor_data_size(src0)}; - QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, - qnn_get_ggml_tensor_data_size(src1)}; - QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, - qnn_get_ggml_tensor_data_size(dst)}; + if (ctx->device != QNN_BACKEND_NPU) { + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, + qnn_get_ggml_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, + qnn_get_ggml_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, + qnn_get_ggml_tensor_data_size(dst)}; + } else { + uint8_t * qnn_buffer_0 = static_cast(ctx->instance->get_rpcmem_from_memhandle( + QNN_VER_PTR(*tensor_0)->memHandle)); + if (nullptr != qnn_buffer_0) + memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0)); + + uint8_t * qnn_buffer_1 = static_cast(ctx->instance->get_rpcmem_from_memhandle( + QNN_VER_PTR(*tensor_1)->memHandle)); + if (nullptr != qnn_buffer_1) + memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1)); + } Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1}; Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; - error = - qnn_raw_interface.graphExecute(graph_handle, + error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); @@ -2532,7 +2835,15 @@ static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx, QNN_LOG_INFO("error = %d\n", error); goto failure; } + + if (ctx->device == QNN_BACKEND_NPU) { + uint8_t * qnn_buffer_2 = static_cast(ctx->instance->get_rpcmem_from_memhandle( + QNN_VER_PTR(*tensor_2)->memHandle)); + if (nullptr != qnn_buffer_2) + memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst)); + } } + failure: if (QNN_SUCCESS != error) { QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(*tensor_0)); @@ -2889,9 +3200,9 @@ GGML_CALL static void * ggml_backend_qnn_buffer_get_base(ggml_backend_buffer_t b GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - 
ggml_backend_qnn_buffer_context * ctx = - (ggml_backend_qnn_buffer_context *) buffer->context; + Qnn_ErrorHandle_t error = QNN_SUCCESS; + ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *) buffer->context; + static int idx = 0; char tensor_name[GGML_MAX_NAME] = {0}; snprintf(tensor_name, GGML_MAX_NAME, "tensor_%04d", idx++); @@ -2908,22 +3219,43 @@ GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t } else if (tensor->flags & GGML_TENSOR_FLAG_OUTPUT) { qnn_tensor_type = QNN_TENSOR_TYPE_APP_READ; } - Qnn_Tensor_t qnn_tensor = { - .version = QNN_TENSOR_VERSION_1, - {.v1 = {.id = 0, - .name = tensor_name, - .type = qnn_tensor_type, - .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, - .dataType = qnn_data_type, - .quantizeParams = - {QNN_DEFINITION_UNDEFINED, - QNN_QUANTIZATION_ENCODING_UNDEFINED, - {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, - .offset = 0}}}, - .rank = qnn_get_ggml_tensor_rank(tensor), - .dimensions = dimensions, - .memType = QNN_TENSORMEMTYPE_RAW, - {.clientBuf = {.data = nullptr, .dataSize = 0}}}}}; + Qnn_Tensor_t qnn_tensor = QNN_TENSOR_INIT; + + if (ctx->device != QNN_BACKEND_GPU) { + qnn_tensor = { + .version = QNN_TENSOR_VERSION_1, + {.v1 = {.id = 0, + .name = tensor_name, + .type = qnn_tensor_type, + .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, + .dataType = qnn_data_type, + .quantizeParams = + {QNN_DEFINITION_UNDEFINED, + QNN_QUANTIZATION_ENCODING_UNDEFINED, + {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, + .offset = 0}}}, + .rank = qnn_get_ggml_tensor_rank(tensor), + .dimensions = dimensions, + .memType = QNN_TENSORMEMTYPE_RAW, + {.clientBuf = {.data = nullptr, .dataSize = 0}}}}}; + } else { + qnn_tensor = { + .version = QNN_TENSOR_VERSION_1, + {.v1 = {.id = 0, + .name = tensor_name, + .type = qnn_tensor_type, + .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, + .dataType = qnn_data_type, + .quantizeParams = + {QNN_DEFINITION_UNDEFINED, + QNN_QUANTIZATION_ENCODING_UNDEFINED, + {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, + .offset = 0}}}, + .rank = qnn_get_ggml_tensor_rank(tensor), + .dimensions = dimensions, + .memType = QNN_TENSORMEMTYPE_MEMHANDLE, + {.clientBuf = {.data = nullptr, .dataSize = 0}}}}}; + } Qnn_Tensor_t * p_qnn_tensor = (Qnn_Tensor_t *)calloc(1, sizeof(Qnn_Tensor_t)); if (nullptr == p_qnn_tensor) { @@ -2933,7 +3265,7 @@ GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t error = deep_copy_qnn_tensors(qnn_tensor, *p_qnn_tensor); if (error != QNN_SUCCESS) { free(p_qnn_tensor); - QNN_LOG_DEBUG("init tensor failed"); + QNN_LOG_WARN("init tensor failed"); return; } tensor->extra = p_qnn_tensor; @@ -3210,6 +3542,7 @@ ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t device) { device, GGML_QNN_MAX_DEVICES - 1); return nullptr; } + static ggml_backend_qnn_buffer_type_context ggml_backend_qnn_buffer_type_contexts[GGML_QNN_MAX_DEVICES]; static ggml_backend_buffer_type ggml_backend_qnn_buffer_types[GGML_QNN_MAX_DEVICES]; static bool ggml_backend_qnn_buffer_type_initialized = false; @@ -3307,7 +3640,6 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { std::string device_name = qnn_get_backend_name(device); QNN_LOG_INFO("qnn device name %s", device_name.c_str()); - instance->init_qnn_graph(device_name.c_str(), false); g_qnn_mgr[device].instance = instance; g_qnn_mgr[device].raw_interface = instance->get_qnn_raw_interface(); g_qnn_mgr[device].raw_system_interface = 
instance->get_qnn_raw_system_interface(); diff --git a/tests/ggml-qnn/CMakeLists.txt b/tests/ggml-qnn/CMakeLists.txt index a78bdaeaf8009..bf061e6c7c3a1 100644 --- a/tests/ggml-qnn/CMakeLists.txt +++ b/tests/ggml-qnn/CMakeLists.txt @@ -6,8 +6,8 @@ set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_POSITION_INDEPENDENT_CODE ON) -#set to ON if target Android phone is based on Qualcomm Snapdragon 8 Gen 3 -set(TARGET_SNAPDRAGON_8_GEN3 OFF) +#set to OFF if target Android phone is not equipped with Qualcomm Snapdragon 8 Gen 3 +set(TARGET_SNAPDRAGON_8_GEN3 ON) set(QNN_INC_PATH ${QNN_SDK_PATH}/include/QNN) set(QNN_LIB_PATH ${QNN_SDK_PATH}/lib/aarch64-android) @@ -35,6 +35,8 @@ add_definitions(-DGGML_USE_QNN) if(CMAKE_BUILD_TYPE STREQUAL "Release") add_definitions(-DNDEBUG) add_definitions(-O3) +else() +add_definitions(-O3) endif() if (TARGET_SNAPDRAGON_8_GEN3) @@ -44,7 +46,7 @@ add_definitions(-mcpu=cortex-x1) add_definitions(-mtune=cortex-x1) else() -# the below build optimization might be works well on ALL mainstream Android phone based on Qualcomm mobile SoC +# the below build optimization might be works well on ALL Android phone equipped with Qualcomm mainstream mobile SoC add_definitions(-mcpu=cortex-a72) endif() diff --git a/tests/ggml-qnn/ggml-qnn-ut.cpp b/tests/ggml-qnn/ggml-qnn-ut.cpp index 9af433ceb6690..0abfc62073f08 100644 --- a/tests/ggml-qnn/ggml-qnn-ut.cpp +++ b/tests/ggml-qnn/ggml-qnn-ut.cpp @@ -415,7 +415,8 @@ static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) { sizex = ggml_blck_size(qtype) * 2; } } - QNN_LOG_DEBUG("sizex %d\n", sizex); + QNN_LOG_DEBUG("sizex: %d\n", sizex); + QNN_LOG_DEBUG("sizey: %d\n", sizey); if (n_ggml_op_type == GGML_OP_MUL) { src0 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey); From 5598fbd15dfd7e0483ca544c4c8a86aca6c79ea2 Mon Sep 17 00:00:00 2001 From: "zhou.weiguo" Date: Thu, 13 Jun 2024 15:41:53 +0800 Subject: [PATCH 016/143] review: make a MVP(Minimum Viable PR) style PR in upstream --- ggml-qnn.cpp | 597 +++++++----------------- tests/ggml-qnn/ggml-qnn-ut-build-run.sh | 10 +- tests/ggml-qnn/ggml-qnn-ut.cpp | 17 +- 3 files changed, 183 insertions(+), 441 deletions(-) diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp index f59c54fcacd97..f268c7f0e825a 100644 --- a/ggml-qnn.cpp +++ b/ggml-qnn.cpp @@ -55,7 +55,7 @@ #include "Saver/QnnSaver.h" #include "System/QnnSystemInterface.h" #include "HTP/QnnHtpDevice.h" -#include +#include "HTP/QnnHtpGraph.h" // ================================================================================================= // @@ -91,12 +91,6 @@ typedef void (*ggml_qnn_func_t)(ggml_backend_qnn_context * ctx, const ggml_tensor * src1, ggml_tensor * dst); -typedef void (*ggml_qnn_func_common_t)(ggml_backend_qnn_context * ctx, - const ggml_op ggml_op, - const ggml_tensor * src0, - const ggml_tensor * src1, - ggml_tensor * dst); - enum qcom_htp_arch { NONE = 0, V68 = 68, @@ -424,6 +418,7 @@ static bool qnn_is_valid_params(ggml_backend_qnn_context * ctx, const ggml_tenso return true; } +#ifndef NDEBUG #define CHECK_PARAMS(ctx, src0, src1, dst) \ do { \ if (!qnn_is_valid_params((ctx), (src0), (src1), (dst))) { \ @@ -431,6 +426,10 @@ static bool qnn_is_valid_params(ggml_backend_qnn_context * ctx, const ggml_tenso } \ } while (0) +#else +#define CHECK_PARAMS(ctx, src0, src1, dst) +#endif + #if ENABLE_QNNBACKEND_PERF class qnn_perf { public: @@ -446,7 +445,7 @@ class qnn_perf { void info() { _end_time = ggml_time_us(); _duration = (_end_time - _begin_time); - QNN_LOG_DEBUG("duration of %s : 
%lld microseconds\n", _perf_name.c_str(), _duration); + QNN_LOG_INFO("duration of %s : %lld microseconds\n", _perf_name.c_str(), _duration); } private: @@ -809,7 +808,7 @@ static void qnn_sdk_logcallback(const char * fmt, QnnLog_Level_t level, memset(s_ggml_qnn_logbuf, 0, QNN_LOGBUF_LEN); vsnprintf(reinterpret_cast(s_ggml_qnn_logbuf), QNN_LOGBUF_LEN, fmt, argp); - QNN_LOG_DEBUG("%8.1fms [%-7s] %s\n", ms, log_level_desc, s_ggml_qnn_logbuf); + QNN_LOG_INFO("%8.1fms [%-7s] %s\n", ms, log_level_desc, s_ggml_qnn_logbuf); } #endif } @@ -1069,7 +1068,7 @@ class qnn_instance { arch_devconfig.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; arch_devconfig.customConfig = &arch_customconfig; - const QnnDevice_Config_t * p_deviceconfig[] = {&soc_devconfig, &arch_devconfig, NULL}; + const QnnDevice_Config_t * p_deviceconfig[] = {&soc_devconfig, &arch_devconfig, nullptr}; qnn_status = _qnn_raw_interface.deviceCreate(_qnn_log_handle, p_deviceconfig, &_qnn_device_handle); } else { qnn_status = _qnn_raw_interface.deviceCreate(_qnn_log_handle, nullptr, &_qnn_device_handle); @@ -1137,10 +1136,14 @@ class qnn_instance { _pfn_rpc_mem_init(); } - std::vector temp_context_config; + /* TODO: not used, keep it for further usage + QnnContext_Config_t qnn_context_config = QNN_CONTEXT_CONFIG_INIT; + qnn_context_config.priority = QNN_PRIORITY_DEFAULT; + const QnnContext_Config_t * context_configs[] = {&qnn_context_config, nullptr}; + */ _qnn_interface.qnn_context_create( _qnn_backend_handle, _qnn_device_handle, - temp_context_config.empty() ? nullptr : temp_context_config.data(), + nullptr, &_qnn_context_handle); if (nullptr == _qnn_context_handle) { QNN_LOG_WARN("why failed to initialize qnn context\n"); @@ -1157,9 +1160,11 @@ class qnn_instance { size_t probe_slots[] = {1024, 1536, 2048 - 48, 2048}; size_t probe_counts = sizeof(probe_slots) / sizeof(size_t); for (size_t idx = 0; idx < probe_counts; idx++) { - rpc_buffer = static_cast(alloc_rpcmem(probe_slots[idx] * size_in_mb, 4)); + rpc_buffer = static_cast(alloc_rpcmem( + probe_slots[idx] * size_in_mb, 4)); if (nullptr == rpc_buffer) { - QNN_LOG_INFO("alloc rpcmem %d (MB) failure, %s\n", probe_slots[idx], strerror(errno)); + QNN_LOG_INFO("alloc rpcmem %d (MB) failure, %s\n", + probe_slots[idx], strerror(errno)); break; } else { candidate_size = probe_slots[idx]; @@ -1262,8 +1267,8 @@ class qnn_instance { return ret_status; } - //keep it for further usage of offload the entire cgraph to a single QNN DAG directly - //which was used in Qualcomm's dedicated AI technology + //TODO:keep it for further usage of offload the entire cgraph to a single QNN DAG directly + // which was used in Qualcomm's dedicated AI technology #if 0 int init_qnn_graph(const char * graph_name, bool debug, uint8_t do_node_validation = true, @@ -1430,13 +1435,14 @@ class qnn_instance { QnnHtpPerfInfrastructure_PowerConfig_t power_config; memset(&power_config, 0, sizeof(power_config)); power_config.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_V3; - power_config.dcvsV3Config.dcvsEnable = 0; + power_config.dcvsV3Config.setDcvsEnable = 1; + power_config.dcvsV3Config.dcvsEnable = 0; power_config.dcvsV3Config.contextId = _qnn_power_configid; power_config.dcvsV3Config.powerMode = QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_PERFORMANCE_MODE; power_config.dcvsV3Config.setSleepLatency = 1; // true to consider Latency parameter otherwise false - power_config.dcvsV3Config.sleepLatency = 10; + power_config.dcvsV3Config.sleepLatency = 40; power_config.dcvsV3Config.setBusParams = 1; // true to consider Bus 
parameter otherwise false power_config.dcvsV3Config.setCoreParams = @@ -1459,6 +1465,7 @@ class qnn_instance { DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; power_config.dcvsV3Config.coreVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + // set power config with different performance parameters const QnnHtpPerfInfrastructure_PowerConfig_t * power_configs[] = { &power_config, nullptr}; @@ -1550,6 +1557,7 @@ class qnn_instance { QNN_LOG_WARN("rpc memory already allocated\n"); return 3; } + if (is_rpcmem_registered((QNN_VER_PTR(*p_tensor)->memHandle))) { QNN_LOG_WARN("tensor %s has been registered shared memory\n", (QNN_VER_PTR(*p_tensor)->name)); @@ -1710,7 +1718,7 @@ class qnn_instance { int result = 0; if (nullptr == _system_lib_handle) { - QNN_LOG_DEBUG("system lib handle is null\n"); + QNN_LOG_WARN("system lib handle is null\n"); return 1; } @@ -1724,8 +1732,7 @@ class qnn_instance { int dlclose_error = dlclose(_system_lib_handle); if (dlclose_error != 0) { - QNN_LOG_WARN("failed to close QnnSystem library, error %s\n", - dlerror()); + QNN_LOG_WARN("failed to close QnnSystem library, error %s\n", dlerror()); return 2; } @@ -1740,8 +1747,7 @@ class qnn_instance { void * lib_handle = dlopen(lib_path.c_str(), RTLD_NOW | RTLD_GLOBAL); if (nullptr == lib_handle) { - QNN_LOG_WARN("can not open QNN library %s, with error: %s", - lib_path.c_str(), dlerror()); + QNN_LOG_WARN("can not open QNN library %s, with error: %s", lib_path.c_str(), dlerror()); return 1; } @@ -1749,8 +1755,7 @@ class qnn_instance { load_qnn_functionpointers( lib_handle, "QnnInterface_getProviders"); if (nullptr == get_providers) { - QNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s", - dlerror()); + QNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s", dlerror()); return 2; } @@ -1758,14 +1763,12 @@ class qnn_instance { const QnnInterface_t ** provider_list = nullptr; error = get_providers(&provider_list, &num_providers); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to get providers, error %d", - QNN_GET_ERROR_CODE(error)); + QNN_LOG_WARN("failed to get providers, error %d", QNN_GET_ERROR_CODE(error)); return 3; } QNN_LOG_DEBUG("num_providers=%d\n", num_providers); if (num_providers != _required_num_providers) { - QNN_LOG_WARN("providers is %d instead of required %d", num_providers, - _required_num_providers); + QNN_LOG_WARN("providers is %d instead of required %d", num_providers, _required_num_providers); return 4; } @@ -1797,16 +1800,14 @@ class qnn_instance { BackendIdType backend_id = provider_list[0]->backendId; _lib_path_to_backend_id[lib_path] = backend_id; if (_loaded_backend.count(backend_id) > 0) { - QNN_LOG_WARN("lib_path %s is loaded, but backend %d already exists\n", - lib_path.c_str(), backend_id); + QNN_LOG_WARN("lib_path %s is loaded, but backend %d already exists\n", lib_path.c_str(), backend_id); } _loaded_backend[backend_id] = provider_list[0]; if (_loaded_lib_handle.count(backend_id) > 0) { QNN_LOG_WARN("closing %p\n", _loaded_lib_handle[backend_id]); int dlclose_error = dlclose(_loaded_lib_handle[backend_id]); if (dlclose_error != 0) { - QNN_LOG_WARN("fail to close %p with error %s\n", - _loaded_lib_handle[backend_id], dlerror()); + QNN_LOG_WARN("fail to close %p with error %s\n", _loaded_lib_handle[backend_id], dlerror()); } } _loaded_lib_handle[backend_id] = lib_handle; @@ -1820,8 +1821,7 @@ class qnn_instance { for (auto & it : _loaded_lib_handle) { dlclose_error = dlclose(it.second); if (dlclose_error != 0) { - QNN_LOG_WARN("failed to close QNN backend %d, 
error %s\n", it.first, - dlerror()); + QNN_LOG_WARN("failed to close QNN backend %d, error %s\n", it.first, dlerror()); } } @@ -1924,7 +1924,6 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context * ctx, const int64_t ne01 = src0->ne[1]; const int64_t ne10 = src1->ne[0]; const int64_t ne11 = src1->ne[1]; - // make qnn_get_ggml_tensor_rank and QNN SDK happy if (ne00 <= 1 || ne01 <= 1 || ne10 <= 1 || ne11 <= 1) { return false; @@ -1932,13 +1931,13 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context * ctx, // TODO: support other GGML OPs using QNN API // a GENERAL approach could fix this problem in a standalone PR of refine ggml backend - // subsystem for mixed inference between CPU&GPU / CPU&NPU easily for ANY ggml backends - // which the backend's ggml_backend_xxx_buffer_is_host return true. - // this approach could be found: + // subsystem for hybrid inference between CPU&GPU / CPU&NPU easily(less the 100 LoC and no + // side-effect to the existing codes) for ANY ggml backends which the backend's + // ggml_backend_xxx_buffer_is_host return true. this approach could be found at: // https://github.com/ggerganov/llama.cpp/pull/7641 bool supported_op = false; supported_op = (tensor->op == GGML_OP_ADD); - supported_op = ((tensor->op == GGML_OP_ADD) || (tensor->op == GGML_OP_MUL) || (tensor->op == GGML_OP_MUL_MAT)); + supported_op = ((tensor->op == GGML_OP_ADD) || (tensor->op == GGML_OP_MUL_MAT)); if (!supported_op) { return false; } @@ -1950,14 +1949,9 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context * ctx, } } - int qtype = src0->type; - if (tensor->op == GGML_OP_MUL) { - return (qtype == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32); - } - if (tensor->op == GGML_OP_MUL_MAT) { if (ne00 <= 32 || ne01 <= 32 || ne10 <= 32 || ne11 <= 32) { - //make mul_mat with QNN RPC happy + //comment it for make UT of mul_mat with QNN RPC happy //return false; } } @@ -1965,6 +1959,8 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context * ctx, return true; } +//TODO: this function can be removed later because there are duplicated codes with ggml_qnn_mul_mat +// keep it for illustrate how to implement a specified GGMPL OP using QNN API + QNN RPC static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { Qnn_ErrorHandle_t error = QNN_SUCCESS; @@ -1986,10 +1982,11 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src tensor_1 = (Qnn_Tensor_t *) src1->extra; tensor_2 = (Qnn_Tensor_t *) dst->extra; instance = ctx->instance; + QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; + qnn_perf perf("ggml_qnn_add"); perf.start(); - QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; @@ -2034,17 +2031,31 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src QnnHtpGraph_CustomConfig_t dlbc_config; dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; - /* dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC; - dlbc_config.optimizationOption.floatValue = 1; // set to 0 to turn off DLBC - */ - dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; - + dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC QnnGraph_Config_t graph_dlbc_config; 
graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; graph_dlbc_config.customConfig = &dlbc_config; - const QnnGraph_Config_t * p_graphconfig[] = {&graph_hvx_config, &graph_dlbc_config, NULL}; + QnnHtpGraph_CustomConfig_t opt_config; + opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; + opt_config.optimizationOption.floatValue = 1; // 1 / 3 + QnnGraph_Config_t graph_opt_config; + graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_opt_config.customConfig = &opt_config; + + QnnHtpGraph_CustomConfig_t vtcm_config; + vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE; + vtcm_config.vtcmSizeInMB = ctx->socinfo.vtcm_size_in_mb; + QnnGraph_Config_t graph_vtcm_config; + graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_vtcm_config.customConfig = &vtcm_config; + + const QnnGraph_Config_t * p_graphconfig[] = {&graph_hvx_config, + &graph_dlbc_config, + &graph_vtcm_config, + &graph_opt_config, + NULL}; error = qnn_raw_interface.graphCreate( instance->get_qnn_context_handle(), graph_name.c_str(), p_graphconfig, &graph_handle); @@ -2113,27 +2124,33 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src uint8_t * qnn_buffer_2 = nullptr; qnn_instance * instance = ctx->instance; - qnn_buffer_0 = static_cast(instance->alloc_rpcmem(ggml_nbytes(src0), 4)); + qnn_buffer_0 = static_cast(instance->alloc_rpcmem( + ggml_nbytes(src0), 4)); if (nullptr == qnn_buffer_0) { QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); + goto failure; } else { QNN_LOG_INFO("alloc rpcmem successfully\n"); } instance->register_rpcmem(qnn_buffer_0, tensor_0); memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0)); - qnn_buffer_1 = static_cast(instance->alloc_rpcmem(ggml_nbytes(src1), 4)); + qnn_buffer_1 = static_cast(instance->alloc_rpcmem( + ggml_nbytes(src1), 4)); if (nullptr == qnn_buffer_1) { QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); + goto failure; } else { QNN_LOG_INFO("alloc rpcmem successfully\n"); } instance->register_rpcmem(qnn_buffer_1, tensor_1); memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1)); - qnn_buffer_2 = static_cast(instance->alloc_rpcmem(ggml_nbytes(dst), 4)); + qnn_buffer_2 = static_cast(instance->alloc_rpcmem( + ggml_nbytes(dst), 4)); if (nullptr == qnn_buffer_2) { QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); + goto failure; } else { QNN_LOG_INFO("alloc rpcmem successfully\n"); } @@ -2144,23 +2161,33 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; Qnn_OpConfig_t op_config = { (Qnn_OpConfigVersion_t) 1, - .v1 = {"ggml_op_add", QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_ELEMENT_WISE_ADD, 0, qnn_params, - 2, tensor_inputs, 1, - tensor_outputs}}; + .v1 = {"ggml_op_add", + QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_ELEMENT_WISE_ADD, + 0, qnn_params, + 2, tensor_inputs, + 1,tensor_outputs} + }; error = qnn_raw_interface.graphAddNode(graph_handle, op_config); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); goto failure; } - error = qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr); + error = qnn_raw_interface.graphFinalize(graph_handle, + nullptr, nullptr); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); goto failure; } - error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, + error = qnn_raw_interface.graphExecute(graph_handle, + tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); + if (ctx->device == 
QNN_BACKEND_NPU) { + if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) { + QNN_LOG_WARN("NPU crashed. SSR detected. Caused QNN graph execute error\n"); + } + } if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); goto failure; @@ -2221,9 +2248,15 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1}; Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; - error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs,2, + error = qnn_raw_interface.graphExecute(graph_handle, + tensor_inputs,2, tensor_outputs,1, nullptr, nullptr); + if (ctx->device == QNN_BACKEND_NPU) { + if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) { + QNN_LOG_WARN("NPU crashed. SSR detected. Caused QNN graph execute error\n"); + } + } if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); goto failure; @@ -2299,6 +2332,8 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, tensor_1 = (Qnn_Tensor_t *) src1->extra; tensor_2 = (Qnn_Tensor_t *) dst->extra; instance = ctx->instance; + QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; + qnn_perf perf("ggml_qnn_mul_mat"); perf.start(); @@ -2307,7 +2342,6 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, tensor_2 = (Qnn_Tensor_t *) dst->extra; instance = ctx->instance; - QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; @@ -2338,6 +2372,11 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, uint32_t * tensor_1_dimensions = QNN_VER_PTR(*tensor_1)->dimensions; uint32_t * tensor_2_dimensions = QNN_VER_PTR(*tensor_2)->dimensions; + //TODO: for scenarios of quantized data in src0 + // pass-1: dequantize src0 to FP32 + // pass-2: dq-src0 * src1 + // the performance gains is worth although there is performance loss in pass-1 + if (!graph_initialized) { graph_name = graph_name + "_" + std::to_string(ctx->threads) + "_" + src0->name + "_" + src1->name; @@ -2352,17 +2391,31 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, QnnHtpGraph_CustomConfig_t dlbc_config; dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; - /* dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC; - dlbc_config.optimizationOption.floatValue = 1; // set to 0 to turn off DLBC - */ - dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; - + dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC QnnGraph_Config_t graph_dlbc_config; graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; graph_dlbc_config.customConfig = &dlbc_config; - const QnnGraph_Config_t * p_graphconfig[] = {&graph_hvx_config, &graph_dlbc_config, NULL}; + QnnHtpGraph_CustomConfig_t opt_config; + opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; + opt_config.optimizationOption.floatValue = 1; //1 / 3 + QnnGraph_Config_t graph_opt_config; + graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_opt_config.customConfig = &opt_config; + + QnnHtpGraph_CustomConfig_t vtcm_config; + vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE; + vtcm_config.vtcmSizeInMB = ctx->socinfo.vtcm_size_in_mb; + QnnGraph_Config_t graph_vtcm_config; + graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_vtcm_config.customConfig = 
&vtcm_config; + + const QnnGraph_Config_t * p_graphconfig[] = {&graph_hvx_config, + &graph_dlbc_config, + &graph_vtcm_config, + &graph_opt_config, + NULL}; error = qnn_raw_interface.graphCreate( instance->get_qnn_context_handle(), graph_name.c_str(), p_graphconfig, &graph_handle); @@ -2428,27 +2481,33 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, uint8_t * qnn_buffer_2 = nullptr; qnn_instance * instance = ctx->instance; - qnn_buffer_0 = static_cast(instance->alloc_rpcmem(ggml_nbytes(src0), 4)); + qnn_buffer_0 = static_cast(instance->alloc_rpcmem( + ggml_nbytes(src0), 4)); if (nullptr == qnn_buffer_0) { QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); + goto failure; } else { QNN_LOG_INFO("alloc rpcmem successfully\n"); } instance->register_rpcmem(qnn_buffer_0, tensor_0); memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0)); - qnn_buffer_1 = static_cast(instance->alloc_rpcmem(ggml_nbytes(src1), 4)); + qnn_buffer_1 = static_cast(instance->alloc_rpcmem( + ggml_nbytes(src1), 4)); if (nullptr == qnn_buffer_1) { QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); + goto failure; } else { QNN_LOG_INFO("alloc rpcmem successfully\n"); } instance->register_rpcmem(qnn_buffer_1, tensor_1); memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1)); - qnn_buffer_2 = static_cast(instance->alloc_rpcmem(ggml_nbytes(dst), 4)); + qnn_buffer_2 = static_cast(instance->alloc_rpcmem( + ggml_nbytes(dst), 4)); if (nullptr == qnn_buffer_2) { QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); + goto failure; } else { QNN_LOG_INFO("alloc rpcmem successfully\n"); } @@ -2457,25 +2516,35 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1}; Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; - Qnn_OpConfig_t op_config = {(Qnn_OpConfigVersion_t) 1, - .v1 = {"ggml_op_mul_mat", - QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_MAT_MUL, 0, qnn_params, 2, - tensor_inputs, 1, tensor_outputs}}; + Qnn_OpConfig_t op_config = { + (Qnn_OpConfigVersion_t) 1, + .v1 = {"ggml_op_mul_mat", + QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_MAT_MUL, + 0, qnn_params, + 2, tensor_inputs, + 1, tensor_outputs} + }; error = qnn_raw_interface.graphAddNode(graph_handle, op_config); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); goto failure; } - error = qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr); + error = qnn_raw_interface.graphFinalize(graph_handle, + nullptr, nullptr); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); goto failure; } error = qnn_raw_interface.graphExecute(graph_handle, - tensor_inputs, 2, + tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); + if (ctx->device == QNN_BACKEND_NPU) { + if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) { + QNN_LOG_WARN("NPU crashed. SSR detected. 
Caused QNN graph execute error\n"); + } + } if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); goto failure; @@ -2537,300 +2606,14 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1}; Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; error = qnn_raw_interface.graphExecute(graph_handle, - tensor_inputs, 2, + tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); - if (QNN_SUCCESS != error) { - QNN_LOG_INFO("error = %d\n", error); - goto failure; - } - if (ctx->device == QNN_BACKEND_NPU) { - uint8_t * qnn_buffer_2 = static_cast(ctx->instance->get_rpcmem_from_memhandle( - QNN_VER_PTR(*tensor_2)->memHandle)); - if (nullptr != qnn_buffer_2) - memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst)); - } - } - -failure: - if (QNN_SUCCESS != error) { - QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(*tensor_0)); - QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(*tensor_1)); - QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(*tensor_2)); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 - " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src0->name, src0->type, ggml_type_name(src0->type), - src0->ne[0], src0->ne[1], src0->ne[2], src0->nb[0], - src0->nb[1], src0->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 - " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src1->name, src1->type, ggml_type_name(src1->type), - src1->ne[0], src1->ne[1], src1->ne[2], src1->nb[0], - src1->nb[1], src1->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 - " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - dst->name, dst->type, ggml_type_name(dst->type), dst->ne[0], - dst->ne[1], dst->ne[2], dst->nb[0], dst->nb[1], dst->nb[2]); - } - - QNN_VER_PTR(*tensor_0)->dimensions = tensor_0_dimensions; - QNN_VER_PTR(*tensor_1)->dimensions = tensor_1_dimensions; - QNN_VER_PTR(*tensor_2)->dimensions = tensor_2_dimensions; - perf.info(); -} - -// common function for GGML OPs using QNN API -static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx, - const enum ggml_op ggmlop, - const ggml_tensor * src0, const ggml_tensor * src1, - ggml_tensor * dst) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - bool graph_initialized = false; - qnn_instance * instance = nullptr; - std::string qnn_graph_name = "ggml_qnn_graph"; - std::string qnn_op_config_name = "ggml_qnn_op_config"; - const char * qnn_op_name = nullptr; - Qnn_GraphHandle_t graph_handle = nullptr; - Qnn_Tensor_t * tensor_0 = nullptr; - Qnn_Tensor_t * tensor_1 = nullptr; - Qnn_Tensor_t * tensor_2 = nullptr; - Qnn_Param_t qnn_params[] = {}; - Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32; - Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; - Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32; - - CHECK_PARAMS(ctx, src0, src1, dst); - tensor_0 = (Qnn_Tensor_t *) src0->extra; - tensor_1 = (Qnn_Tensor_t *) src1->extra; - tensor_2 = (Qnn_Tensor_t *) dst->extra; - instance = ctx->instance; - qnn_perf perf(ggml_op_name(ggmlop)); - perf.start(); - - qnn_op_name = qnn_opname_from_ggmlop(ggmlop); - if (nullptr == qnn_op_name) { - QNN_LOG_WARN("ggml op %d(%s) not supported currently", ggmlop, ggml_op_name(ggmlop)); - return; - } - - tensor_0 = (Qnn_Tensor_t *) src0->extra; - tensor_1 = (Qnn_Tensor_t *) src1->extra; - tensor_2 = (Qnn_Tensor_t *) dst->extra; - instance = ctx->instance; - QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; - - src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type); - src1_qnn_type 
= qnn_datatype_from_ggml_datatype(src1->type); - dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type); - - QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; - QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; - QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; - - uint32_t dimensions_input_0[] = { - (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], (uint32_t) src0->ne[2], - (uint32_t) src0->ne[3]}; - uint32_t dimensions_input_1[] = { - (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], (uint32_t) src1->ne[2], - (uint32_t) src1->ne[3]}; - uint32_t dimensions_output[] = { - (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], - (uint32_t) dst->ne[3]}; - - std::string map_entry = std::string(ggml_op_name(ggmlop)); - if (instance->_qnn_graph_map.find(map_entry) != - instance->_qnn_graph_map.end()) { - graph_initialized = true; - auto & graph_item = instance->_qnn_graph_map[map_entry]; - graph_handle = std::get<0>(graph_item); - } - - uint32_t * tensor_0_dimensions = QNN_VER_PTR(*tensor_0)->dimensions; - uint32_t * tensor_1_dimensions = QNN_VER_PTR(*tensor_1)->dimensions; - uint32_t * tensor_2_dimensions = QNN_VER_PTR(*tensor_2)->dimensions; - - if (!graph_initialized) { - qnn_graph_name = qnn_graph_name + "_" + ggml_op_name(ggmlop) + - std::to_string(ctx->threads) + src0->name + "_" + - src1->name; - qnn_op_config_name = qnn_op_config_name + "_" + ggml_op_name(ggmlop) + - std::to_string(ctx->threads) + src0->name + "_" + - src1->name; - QNN_LOG_DEBUG("qnn graph name %s", qnn_graph_name.c_str()); - QNN_LOG_DEBUG("qnn op_config name %s", qnn_op_config_name.c_str()); - error = qnn_raw_interface.graphCreate( - instance->get_qnn_context_handle(), qnn_graph_name.c_str(), nullptr, - &graph_handle); - if (QNN_SUCCESS != error) { - QNN_LOG_INFO("can't create qnn graph handle with ggml op %s, graph " - "name %s, error = %d\n", - ggml_op_name(ggmlop), qnn_graph_name.c_str(), error); - goto failure; - } - - if (ctx->device == QNN_BACKEND_NPU) { - QNN_VER_PTR(*tensor_0)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; - QNN_VER_PTR(*tensor_0)->clientBuf= {.data=nullptr, .dataSize=0}; - - QNN_VER_PTR(*tensor_1)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; - QNN_VER_PTR(*tensor_1)->clientBuf= {.data=nullptr, .dataSize=0}; - - QNN_VER_PTR(*tensor_2)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; - QNN_VER_PTR(*tensor_2)->clientBuf= {.data=nullptr, .dataSize=0}; - } - - error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_0); - if (QNN_SUCCESS != error) { - QNN_LOG_INFO("error = %d\n", error); - goto failure; - } - error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_1); - if (QNN_SUCCESS != error) { - QNN_LOG_INFO("error = %d\n", error); - goto failure; - } - error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_2); - if (QNN_SUCCESS != error) { - QNN_LOG_INFO("error = %d\n", error); - goto failure; - } - - QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; - QNN_VER_PTR(*tensor_0)->rank = qnn_get_ggml_tensor_rank(src0); - QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; - QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; - QNN_VER_PTR(*tensor_1)->rank = qnn_get_ggml_tensor_rank(src1); - QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; - QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; - QNN_VER_PTR(*tensor_2)->rank = qnn_get_ggml_tensor_rank(dst); - QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; - - if (ctx->device != QNN_BACKEND_NPU) { - QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, - 
qnn_get_ggml_tensor_data_size(src0)}; - QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, - qnn_get_ggml_tensor_data_size(src1)}; - QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, - qnn_get_ggml_tensor_data_size(dst)}; - } else { - uint8_t * qnn_buffer_0 = nullptr; - uint8_t * qnn_buffer_1 = nullptr; - uint8_t * qnn_buffer_2 = nullptr; - qnn_instance * instance = ctx->instance; - - qnn_buffer_0 = static_cast(instance->alloc_rpcmem(ggml_nbytes(src0), 4)); - if (nullptr == qnn_buffer_0) { - QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); - } else { - QNN_LOG_INFO("alloc rpcmem successfully\n"); + if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) { + QNN_LOG_WARN("NPU crashed. SSR detected. Caused QNN graph execute error\n"); } - instance->register_rpcmem(qnn_buffer_0, tensor_0); - memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0)); - - qnn_buffer_1 = static_cast(instance->alloc_rpcmem(ggml_nbytes(src1), 4)); - if (nullptr == qnn_buffer_1) { - QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); - } else { - QNN_LOG_INFO("alloc rpcmem successfully\n"); - } - instance->register_rpcmem(qnn_buffer_1, tensor_1); - memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1)); - - qnn_buffer_2 = static_cast(instance->alloc_rpcmem(ggml_nbytes(dst), 4)); - if (nullptr == qnn_buffer_2) { - QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); - } else { - QNN_LOG_INFO("alloc rpcmem successfully\n"); - } - instance->register_rpcmem(qnn_buffer_2, tensor_2); - } - - Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1}; - Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; - Qnn_OpConfig_t op_config = {(Qnn_OpConfigVersion_t) 1, - .v1 = {qnn_op_config_name.c_str(), - QNN_OP_PACKAGE_NAME_QTI_AISW, - qnn_op_name, 0, qnn_params, 2, - tensor_inputs, 1, tensor_outputs}}; - error = qnn_raw_interface.graphAddNode(graph_handle, op_config); - if (QNN_SUCCESS != error) { - QNN_LOG_INFO("error = %d\n", error); - goto failure; - } - error = qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr); - if (QNN_SUCCESS != error) { - QNN_LOG_INFO("error = %d\n", error); - goto failure; - } - error = qnn_raw_interface.graphExecute(graph_handle, - tensor_inputs, 2, - tensor_outputs, 1, - nullptr, nullptr); - if (QNN_SUCCESS != error) { - QNN_LOG_INFO("error = %d\n", error); - goto failure; } - - if (ctx->device == QNN_BACKEND_NPU) { - uint8_t * qnn_buffer_2 = static_cast(ctx->instance->get_rpcmem_from_memhandle( - QNN_VER_PTR(*tensor_2)->memHandle)); - memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst)); - } - - auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2); - instance->_qnn_graph_map[map_entry] = graph_item; - } else { - auto & graph_item = instance->_qnn_graph_map[map_entry]; - graph_handle = std::get<0>(graph_item); - tensor_0 = std::get<1>(graph_item); - tensor_1 = std::get<2>(graph_item); - tensor_2 = std::get<3>(graph_item); - - uint32_t dimensions_input_0[] = { - (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], - (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; - uint32_t dimensions_input_1[] = { - (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], - (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; - uint32_t dimensions_output[] = { - (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], - (uint32_t) dst->ne[3]}; - QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; - QNN_VER_PTR(*tensor_0)->rank = qnn_get_ggml_tensor_rank(src0); - QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; - QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; - 
QNN_VER_PTR(*tensor_1)->rank = qnn_get_ggml_tensor_rank(src1); - QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; - QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; - QNN_VER_PTR(*tensor_2)->rank = qnn_get_ggml_tensor_rank(dst); - QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; - - if (ctx->device != QNN_BACKEND_NPU) { - QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, - qnn_get_ggml_tensor_data_size(src0)}; - QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, - qnn_get_ggml_tensor_data_size(src1)}; - QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, - qnn_get_ggml_tensor_data_size(dst)}; - } else { - uint8_t * qnn_buffer_0 = static_cast(ctx->instance->get_rpcmem_from_memhandle( - QNN_VER_PTR(*tensor_0)->memHandle)); - if (nullptr != qnn_buffer_0) - memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0)); - - uint8_t * qnn_buffer_1 = static_cast(ctx->instance->get_rpcmem_from_memhandle( - QNN_VER_PTR(*tensor_1)->memHandle)); - if (nullptr != qnn_buffer_1) - memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1)); - } - - Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1}; - Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; - error = qnn_raw_interface.graphExecute(graph_handle, - tensor_inputs, 2, - tensor_outputs, 1, - nullptr, nullptr); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); goto failure; @@ -2863,8 +2646,6 @@ static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx, " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", dst->name, dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], dst->nb[1], dst->nb[2]); - QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], - src0->ne[3]); } QNN_VER_PTR(*tensor_0)->dimensions = tensor_0_dimensions; @@ -3038,21 +2819,14 @@ bool ggml_qnn_compute_forward(ggml_backend_qnn_context * ctx, struct ggml_compute_params * params, struct ggml_tensor * tensor) { ggml_qnn_func_t func = nullptr; - ggml_qnn_func_common_t func_common = nullptr; switch (tensor->op) { case GGML_OP_ADD: func = ggml_qnn_add; break; - - case GGML_OP_MUL: - func_common = ggml_qnn_hanlde_op; - break; - case GGML_OP_MUL_MAT: func = ggml_qnn_mul_mat; break; - case GGML_OP_REPEAT: func = ggml_qnn_repeat; break; @@ -3062,15 +2836,12 @@ bool ggml_qnn_compute_forward(ggml_backend_qnn_context * ctx, case GGML_OP_DUP: func = ggml_qnn_dup; break; - case GGML_OP_ACC: func = ggml_qnn_acc; break; - case GGML_OP_DIV: func = ggml_qnn_div; break; - case GGML_OP_UNARY: switch (ggml_get_unary_op(tensor)) { case GGML_UNARY_OP_GELU: @@ -3169,10 +2940,9 @@ bool ggml_qnn_compute_forward(ggml_backend_qnn_context * ctx, return false; } - if (nullptr != func) func(ctx, tensor->src[0], tensor->src[1], tensor); - - if (nullptr != func_common) - func_common(ctx, tensor->op, tensor->src[0], tensor->src[1], tensor); + if (nullptr != func) { + func(ctx, tensor->src[0], tensor->src[1], tensor); + } return true; } @@ -3221,41 +2991,28 @@ GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t } Qnn_Tensor_t qnn_tensor = QNN_TENSOR_INIT; - if (ctx->device != QNN_BACKEND_GPU) { - qnn_tensor = { - .version = QNN_TENSOR_VERSION_1, - {.v1 = {.id = 0, - .name = tensor_name, - .type = qnn_tensor_type, - .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, - .dataType = qnn_data_type, - .quantizeParams = - {QNN_DEFINITION_UNDEFINED, - QNN_QUANTIZATION_ENCODING_UNDEFINED, - {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, - .offset = 0}}}, - .rank = qnn_get_ggml_tensor_rank(tensor), - .dimensions = dimensions, - .memType = 
QNN_TENSORMEMTYPE_RAW, - {.clientBuf = {.data = nullptr, .dataSize = 0}}}}}; - } else { - qnn_tensor = { - .version = QNN_TENSOR_VERSION_1, - {.v1 = {.id = 0, - .name = tensor_name, - .type = qnn_tensor_type, - .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, - .dataType = qnn_data_type, - .quantizeParams = - {QNN_DEFINITION_UNDEFINED, - QNN_QUANTIZATION_ENCODING_UNDEFINED, - {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, - .offset = 0}}}, - .rank = qnn_get_ggml_tensor_rank(tensor), - .dimensions = dimensions, - .memType = QNN_TENSORMEMTYPE_MEMHANDLE, - {.clientBuf = {.data = nullptr, .dataSize = 0}}}}}; - } + Qnn_TensorMemType_t qnn_mem_type = QNN_TENSORMEMTYPE_RAW; + if (ctx->device == QNN_BACKEND_GPU) { + qnn_mem_type = QNN_TENSORMEMTYPE_MEMHANDLE; + } + + qnn_tensor = { + .version = QNN_TENSOR_VERSION_1, + {.v1 = {.id = 0, + .name = tensor_name, + .type = qnn_tensor_type, + .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, + .dataType = qnn_data_type, + .quantizeParams = + {QNN_DEFINITION_UNDEFINED, + QNN_QUANTIZATION_ENCODING_UNDEFINED, + {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, + .offset = 0}}}, + .rank = qnn_get_ggml_tensor_rank(tensor), + .dimensions = dimensions, + .memType = qnn_mem_type, + {.clientBuf = {.data = nullptr, .dataSize = 0}}}}}; + Qnn_Tensor_t * p_qnn_tensor = (Qnn_Tensor_t *)calloc(1, sizeof(Qnn_Tensor_t)); if (nullptr == p_qnn_tensor) { diff --git a/tests/ggml-qnn/ggml-qnn-ut-build-run.sh b/tests/ggml-qnn/ggml-qnn-ut-build-run.sh index 4c21be5a41fa2..e12b987b8d69d 100755 --- a/tests/ggml-qnn/ggml-qnn-ut-build-run.sh +++ b/tests/ggml-qnn/ggml-qnn-ut-build-run.sh @@ -12,8 +12,8 @@ ANDROID_PLATFORM=android-34 GGML_QNN_UT=ggml-qnn-ut REMOTE_PATH=/data/local/tmp/ -BUILDTYPE=Debug BUILDTYPE=Release +BUILDTYPE=Debug function dump_vars() @@ -100,7 +100,7 @@ function update_qnn_libs() adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnCpu.so ${REMOTE_PATH}/ adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnGpu.so ${REMOTE_PATH}/ - #the QNN NPU(aka HTP/DSP) backend only verified on Xiaomi14(Qualcomm SM8650-AB Snapdragon 8 Gen 3) successfully + #the QNN NPU(aka HTP) backend only verified on Qualcomm Snapdragon 8 Gen 3 equipped Android phone adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtp.so ${REMOTE_PATH}/ adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpNetRunExtensions.so ${REMOTE_PATH}/ adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpPrepare.so ${REMOTE_PATH}/ @@ -142,14 +142,9 @@ function run_ggml_qnn_ut() case "$ggmlop" in GGML_OP_ADD) - echo "adb shell ${REMOTE_PATH}/${GGML_QNN_UT} -t GGML_OP_ADD -b $qnnbackend" adb shell ${REMOTE_PATH}/${GGML_QNN_UT} -t GGML_OP_ADD -b $qnnbackend ;; - GGML_OP_MUL) - adb shell ${REMOTE_PATH}/${GGML_QNN_UT} -t GGML_OP_MUL -b $qnnbackend - ;; - GGML_OP_MUL_MAT) adb shell ${REMOTE_PATH}/${GGML_QNN_UT} -t GGML_OP_MUL_MAT -b $qnnbackend ;; @@ -169,7 +164,6 @@ function show_usage() echo " $0 build (build Android command line UT program)" echo " $0 updateqnnlibs (upload the latest QNN libs to Android phone)" echo " $0 GGML_OP_ADD 0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU) / 3(ggml)" - echo " $0 GGML_OP_MUL 0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU) / 3(ggml)" echo " $0 GGML_OP_MUL_MAT 0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU) / 3(ggml)" echo -e "\n\n\n" } diff --git a/tests/ggml-qnn/ggml-qnn-ut.cpp b/tests/ggml-qnn/ggml-qnn-ut.cpp index 0abfc62073f08..fa0883af8993e 100644 --- a/tests/ggml-qnn/ggml-qnn-ut.cpp +++ b/tests/ggml-qnn/ggml-qnn-ut.cpp @@ -346,7 +346,7 @@ static void show_usage() { "\nUsage: test_qnn_ops 
[options]\n" \ "\n" \ "Options:\n" \ - " -t GGML_OP_ADD / GGML_OP_MUL / GGML_OP_MULMAT\n" \ + " -t GGML_OP_ADD / GGML_OP_MULMAT\n" \ " -b 0(QNN_CPU) 1(QNN_GPU) 2(QNN_NPU) 3(ggml)\n" \ " ?/h print usage infomation\n\n" ); @@ -418,13 +418,9 @@ static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) { QNN_LOG_DEBUG("sizex: %d\n", sizex); QNN_LOG_DEBUG("sizey: %d\n", sizey); - if (n_ggml_op_type == GGML_OP_MUL) { - src0 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey); - src1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey); - } else { - src0 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey); - src1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey); - } + src0 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey); + src1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey); + ggml_set_input(src0); ggml_set_input(src1); @@ -432,9 +428,6 @@ static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) { case GGML_OP_ADD: dst = ggml_add(ctx, src0, src1); break; - case GGML_OP_MUL: - dst = ggml_mul(ctx, src0, src1); - break; case GGML_OP_MUL_MAT: dst = ggml_mul_mat(ctx, src0, src1); break; @@ -518,8 +511,6 @@ int main(int argc, char * argv[]) { n_ggml_op_type = GGML_OP_ADD; } else if (0 == memcmp(argv[i + 1], "GGML_OP_MUL_MAT", 15)) { n_ggml_op_type = GGML_OP_MUL_MAT; - } else if (0 == memcmp(argv[i + 1], "GGML_OP_MUL", 11)) { - n_ggml_op_type = GGML_OP_MUL; } else { show_usage(); return 1; From 5e18cdc2689523ea28b829e8ed09db262453023c Mon Sep 17 00:00:00 2001 From: hongruichen Date: Sat, 15 Jun 2024 12:55:06 +0800 Subject: [PATCH 017/143] init the test array with const values --- tests/ggml-qnn/ggml-qnn-ut.cpp | 36 +++++----------------------------- 1 file changed, 5 insertions(+), 31 deletions(-) diff --git a/tests/ggml-qnn/ggml-qnn-ut.cpp b/tests/ggml-qnn/ggml-qnn-ut.cpp index fa0883af8993e..ff01e62f983c7 100644 --- a/tests/ggml-qnn/ggml-qnn-ut.cpp +++ b/tests/ggml-qnn/ggml-qnn-ut.cpp @@ -266,37 +266,12 @@ static uint32_t get_tensor_data_size(const ggml_tensor * tensor) { //ref: https://github.com/ggerganov/llama.cpp/blob/master/tests/test-backend-ops.cpp#L20 static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float max = 1.0f) { - // static RNG initialization (revisit if n_threads stops being constant) - static const size_t n_threads = std::thread::hardware_concurrency(); - static std::vector generators = []() { - std::random_device rd; - std::vector vec; - vec.reserve(n_threads); - //for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(1234 + i); } // fixed seed - for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(rd()); } - return vec; - }(); - size_t size = ggml_nelements(tensor); std::vector data(size); - - auto init_thread = [&](size_t ith, size_t start, size_t end) { - std::uniform_real_distribution distribution(min, max); - for (size_t i = start; i < end; i++) { - data[i] = distribution(generators[ith]); - } - }; - - std::vector threads; - threads.reserve(n_threads); - for (size_t i = 0; i < n_threads; i++) { - size_t start = i*size/n_threads; - size_t end = (i+1)*size/n_threads; - threads.emplace_back(init_thread, i, start, end); - } - for (auto & t : threads) { - t.join(); + for (size_t i = 0; i < size; i++) { + data[i] = i + 1; } + if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_I32) { #ifdef GGML_USE_QNN memcpy((char*)tensor->data, data.data(), size * sizeof(float)); @@ -378,7 +353,6 @@ static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) { QNN_LOG_DEBUG("ggml 
op:%d(%s)\n", n_ggml_op_type, ggml_op_name((enum ggml_op) n_ggml_op_type)); n_begin_time = ggml_time_us(); - srand(time(NULL)); ctx_size += 1024 * 1024 * 32; QNN_LOG_DEBUG("Allocating Memory of size %zi bytes, %zi MB\n", ctx_size, @@ -460,11 +434,11 @@ static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) { initialize_tensors(ctx); } else { if (qtype == GGML_TYPE_F32) { - ggml_set_f32(src0, (rand() % 100 + 1)); + ggml_set_f32(src0, 2.f); } else { initialize_tensors(ctx); } - ggml_set_f32(src1, (rand() % 100 + 1)); + ggml_set_f32(src1, 3.f); } ggml_graph_compute_helper(backend, gf, work_buffer, num_threads, nullptr, nullptr); From 6c68adc1d942a5a0173b537237656a4220e7487b Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 14 Jun 2024 18:52:54 +0800 Subject: [PATCH 018/143] add ggml_qnn_tensor_binder --- ggml-qnn.cpp | 110 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 110 insertions(+) diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp index f268c7f0e825a..62fee4281d1f0 100644 --- a/ggml-qnn.cpp +++ b/ggml-qnn.cpp @@ -1959,6 +1959,116 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context * ctx, return true; } +template +class ggml_qnn_tensor_binder +{ +public: + ggml_qnn_tensor_binder(const ggml_tensor *tensor, ggml_backend_qnn_context * ctx, Qnn_GraphHandle_t graph_handle) + : _tensor(tensor) + , _qnn_tensor(reinterpret_cast(tensor->extra)) + , _context(ctx) { + _old_dimensions = QNN_VER_PTR(*_qnn_tensor)->dimensions; + const auto qnn_data_type = qnn_datatype_from_ggml_datatype(tensor->type); + const bool is_npu = ctx->device == QNN_BACKEND_NPU; + QNN_VER_PTR(*_qnn_tensor)->type = _tensorType; + if (is_npu) { + QNN_VER_PTR(*_qnn_tensor)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; + QNN_VER_PTR(*_qnn_tensor)->clientBuf= {.data=nullptr, .dataSize=0}; + } + + auto err = ctx->raw_interface.tensorCreateGraphTensor(graph_handle, _qnn_tensor); + if (err != QNN_SUCCESS) { + QNN_LOG_INFO("error = %d\n", err); + _context = nullptr; + return; + } + + _dimensions[0] = (uint32_t)tensor->ne[0]; + _dimensions[1] = (uint32_t)tensor->ne[1]; + _dimensions[2] = (uint32_t)tensor->ne[2]; + _dimensions[3] = (uint32_t)tensor->ne[3]; + QNN_VER_PTR(*_qnn_tensor)->dimensions = _dimensions; + QNN_VER_PTR(*_qnn_tensor)->rank = qnn_get_ggml_tensor_rank(tensor); + QNN_VER_PTR(*_qnn_tensor)->dataType = qnn_data_type; + + + if (is_npu) { + qnn_instance * instance = ctx->instance; + uint8_t *qnn_buffer = static_cast(instance->alloc_rpcmem( + ggml_nbytes(tensor), 4)); // TODO: should we get the align param from device here? 
+ if (!qnn_buffer) { + QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); + _context = nullptr; + return; + } else { + QNN_LOG_INFO("alloc rpcmem successfully\n"); + } + + instance->register_rpcmem(qnn_buffer, _qnn_tensor); + if (_tensorType == QNN_TENSOR_TYPE_APP_WRITE || _tensorType == QNN_TENSOR_TYPE_APP_READWRITE) { + memcpy(qnn_buffer, tensor->data, ggml_nbytes(tensor)); + } + } else { + QNN_VER_PTR(*_qnn_tensor)->clientBuf = {tensor->data, + qnn_get_ggml_tensor_data_size(tensor)}; + } + } + + ggml_qnn_tensor_binder(const ggml_tensor *tensor, Qnn_Tensor_t *qnn_tensor, ggml_backend_qnn_context * ctx) + : _tensor(tensor) + , _qnn_tensor(qnn_tensor) + , _context(ctx) { + _old_dimensions = QNN_VER_PTR(*_qnn_tensor)->dimensions; + const auto qnn_data_type = qnn_datatype_from_ggml_datatype(tensor->type); + const bool is_npu = ctx->device == QNN_BACKEND_NPU; + + _dimensions[0] = (uint32_t)tensor->ne[0]; + _dimensions[1] = (uint32_t)tensor->ne[1]; + _dimensions[2] = (uint32_t)tensor->ne[2]; + _dimensions[3] = (uint32_t)tensor->ne[3]; + QNN_VER_PTR(*_qnn_tensor)->dimensions = _dimensions; + QNN_VER_PTR(*_qnn_tensor)->rank = qnn_get_ggml_tensor_rank(tensor); + QNN_VER_PTR(*_qnn_tensor)->dataType = qnn_data_type; + + + if (is_npu) { + uint8_t * qnn_buffer = static_cast(ctx->instance->get_rpcmem_from_memhandle( + QNN_VER_PTR(*_qnn_tensor)->memHandle)); + if (qnn_buffer) { + memcpy(qnn_buffer, tensor->data, ggml_nbytes(tensor)); + } else { + QNN_LOG_WARN("can't find rpcmem from qnn mem handle\n"); + } + } else { + QNN_VER_PTR(*_qnn_tensor)->clientBuf = {tensor->data, + qnn_get_ggml_tensor_data_size(tensor)}; + } + } + + ~ggml_qnn_tensor_binder() { + if (_context && _context->device == QNN_BACKEND_NPU && + (_tensorType == QNN_TENSOR_TYPE_APP_READWRITE || _tensorType == QNN_TENSOR_TYPE_APP_READ)) { + uint8_t * qnn_buffer = static_cast(_context->instance->get_rpcmem_from_memhandle( + QNN_VER_PTR(*_qnn_tensor)->memHandle)); + memcpy(_tensor->data, qnn_buffer, ggml_nbytes(_tensor)); + } + + QNN_VER_PTR(*_qnn_tensor)->dimensions = _old_dimensions; + } + +private: + const ggml_tensor *_tensor; + Qnn_Tensor_t *_qnn_tensor; + ggml_backend_qnn_context *_context; + uint32_t *_old_dimensions; + uint32_t _dimensions[4] = {}; + + ggml_qnn_tensor_binder(const ggml_qnn_tensor_binder&) = delete; + ggml_qnn_tensor_binder(ggml_qnn_tensor_binder&&) = delete; + void operator=(const ggml_qnn_tensor_binder&) = delete; + void operator=(ggml_qnn_tensor_binder&&) = delete; +}; + //TODO: this function can be removed later because there are duplicated codes with ggml_qnn_mul_mat // keep it for illustrate how to implement a specified GGMPL OP using QNN API + QNN RPC static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, From 37bb9263dd1687601c7dad0f3fc0332b82f3901c Mon Sep 17 00:00:00 2001 From: hongruichen Date: Sat, 15 Jun 2024 11:13:30 +0800 Subject: [PATCH 019/143] use tensor wrapper in add --- ggml-qnn.cpp | 86 ++++++++++++++++------------------------------------ 1 file changed, 26 insertions(+), 60 deletions(-) diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp index 62fee4281d1f0..ab28a2daec725 100644 --- a/ggml-qnn.cpp +++ b/ggml-qnn.cpp @@ -1960,10 +1960,10 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context * ctx, } template -class ggml_qnn_tensor_binder +class ggml_qnn_tensor_readwrite { public: - ggml_qnn_tensor_binder(const ggml_tensor *tensor, ggml_backend_qnn_context * ctx, Qnn_GraphHandle_t graph_handle) + ggml_qnn_tensor_readwrite(const ggml_tensor *tensor, 
Qnn_GraphHandle_t graph_handle, ggml_backend_qnn_context * ctx) : _tensor(tensor) , _qnn_tensor(reinterpret_cast(tensor->extra)) , _context(ctx) { @@ -1979,6 +1979,7 @@ class ggml_qnn_tensor_binder auto err = ctx->raw_interface.tensorCreateGraphTensor(graph_handle, _qnn_tensor); if (err != QNN_SUCCESS) { QNN_LOG_INFO("error = %d\n", err); + QNN_LOG_DEBUG("tensor%p name %s", _qnn_tensor, QNN_TENSOR_GET_NAME(*_qnn_tensor)); _context = nullptr; return; } @@ -1998,7 +1999,9 @@ class ggml_qnn_tensor_binder ggml_nbytes(tensor), 4)); // TODO: should we get the align param from device here? if (!qnn_buffer) { QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); + QNN_LOG_DEBUG("tensor%p name %s", _qnn_tensor, QNN_TENSOR_GET_NAME(*_qnn_tensor)); _context = nullptr; + // TODO: should we free the tensor here? return; } else { QNN_LOG_INFO("alloc rpcmem successfully\n"); @@ -2014,7 +2017,7 @@ class ggml_qnn_tensor_binder } } - ggml_qnn_tensor_binder(const ggml_tensor *tensor, Qnn_Tensor_t *qnn_tensor, ggml_backend_qnn_context * ctx) + ggml_qnn_tensor_readwrite(const ggml_tensor *tensor, Qnn_Tensor_t *qnn_tensor, ggml_backend_qnn_context * ctx) : _tensor(tensor) , _qnn_tensor(qnn_tensor) , _context(ctx) { @@ -2038,6 +2041,9 @@ class ggml_qnn_tensor_binder memcpy(qnn_buffer, tensor->data, ggml_nbytes(tensor)); } else { QNN_LOG_WARN("can't find rpcmem from qnn mem handle\n"); + QNN_LOG_DEBUG("tensor%p name %s", _qnn_tensor, QNN_TENSOR_GET_NAME(*_qnn_tensor)); + _context = nullptr; + return; } } else { QNN_VER_PTR(*_qnn_tensor)->clientBuf = {tensor->data, @@ -2045,7 +2051,7 @@ class ggml_qnn_tensor_binder } } - ~ggml_qnn_tensor_binder() { + ~ggml_qnn_tensor_readwrite() { if (_context && _context->device == QNN_BACKEND_NPU && (_tensorType == QNN_TENSOR_TYPE_APP_READWRITE || _tensorType == QNN_TENSOR_TYPE_APP_READ)) { uint8_t * qnn_buffer = static_cast(_context->instance->get_rpcmem_from_memhandle( @@ -2056,6 +2062,9 @@ class ggml_qnn_tensor_binder QNN_VER_PTR(*_qnn_tensor)->dimensions = _old_dimensions; } + bool is_valid() const { return _context; } + Qnn_Tensor_t * get_qnn_tensor() const { return _qnn_tensor; } + private: const ggml_tensor *_tensor; Qnn_Tensor_t *_qnn_tensor; @@ -2063,12 +2072,15 @@ class ggml_qnn_tensor_binder uint32_t *_old_dimensions; uint32_t _dimensions[4] = {}; - ggml_qnn_tensor_binder(const ggml_qnn_tensor_binder&) = delete; - ggml_qnn_tensor_binder(ggml_qnn_tensor_binder&&) = delete; - void operator=(const ggml_qnn_tensor_binder&) = delete; - void operator=(ggml_qnn_tensor_binder&&) = delete; + ggml_qnn_tensor_readwrite(const ggml_qnn_tensor_readwrite&) = delete; + void operator=(const ggml_qnn_tensor_readwrite&) = delete; + ggml_qnn_tensor_readwrite(ggml_qnn_tensor_readwrite&&) = delete; + void operator=(ggml_qnn_tensor_readwrite&&) = delete; }; +using ggml_qnn_tensor_reader = ggml_qnn_tensor_readwrite; +using ggml_qnn_tensor_writer = ggml_qnn_tensor_readwrite; + //TODO: this function can be removed later because there are duplicated codes with ggml_qnn_mul_mat // keep it for illustrate how to implement a specified GGMPL OP using QNN API + QNN RPC static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, @@ -2078,17 +2090,14 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src qnn_instance * instance = nullptr; std::string graph_name = "ggml_op_qnn_add"; Qnn_GraphHandle_t graph_handle = nullptr; - Qnn_Tensor_t * tensor_0 = nullptr; Qnn_Tensor_t * tensor_1 = nullptr; Qnn_Tensor_t * tensor_2 = nullptr; Qnn_Param_t 
qnn_params[] = {}; enum ggml_op ggmlop = GGML_OP_ADD; - Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32; Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32; CHECK_PARAMS(ctx, src0, src1, dst); - tensor_0 = (Qnn_Tensor_t *) src0->extra; tensor_1 = (Qnn_Tensor_t *) src1->extra; tensor_2 = (Qnn_Tensor_t *) dst->extra; instance = ctx->instance; @@ -2097,17 +2106,12 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src qnn_perf perf("ggml_qnn_add"); perf.start(); - QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; - src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type); src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type); - uint32_t dimensions_input_0[] = { - (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], (uint32_t) src0->ne[2], - (uint32_t) src0->ne[3]}; uint32_t dimensions_input_1[] = { (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; @@ -2123,7 +2127,6 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src graph_handle = std::get<0>(graph_item); } - uint32_t * tensor_0_dimensions = QNN_VER_PTR(*tensor_0)->dimensions; uint32_t * tensor_1_dimensions = QNN_VER_PTR(*tensor_1)->dimensions; uint32_t * tensor_2_dimensions = QNN_VER_PTR(*tensor_2)->dimensions; @@ -2185,9 +2188,6 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src } if (ctx->device == QNN_BACKEND_NPU) { - QNN_VER_PTR(*tensor_0)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; - QNN_VER_PTR(*tensor_0)->clientBuf= {.data=nullptr, .dataSize=0}; - QNN_VER_PTR(*tensor_1)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; QNN_VER_PTR(*tensor_1)->clientBuf= {.data=nullptr, .dataSize=0}; @@ -2195,9 +2195,8 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src QNN_VER_PTR(*tensor_2)->clientBuf= {.data=nullptr, .dataSize=0}; } - error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_0); - if (QNN_SUCCESS != error) { - QNN_LOG_INFO("error = %d\n", error); + ggml_qnn_tensor_writer tensor_writer0(src0, graph_handle, ctx); + if (!tensor_writer0.is_valid()) { goto failure; } error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_1); @@ -2211,9 +2210,6 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src goto failure; } - QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; - QNN_VER_PTR(*tensor_0)->rank = qnn_get_ggml_tensor_rank(src0); - QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; QNN_VER_PTR(*tensor_1)->rank = qnn_get_ggml_tensor_rank(src1); QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; @@ -2222,29 +2218,15 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; if (ctx->device != QNN_BACKEND_NPU) { - QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, - qnn_get_ggml_tensor_data_size(src0)}; QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, qnn_get_ggml_tensor_data_size(src1)}; QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, qnn_get_ggml_tensor_data_size(dst)}; } else { - uint8_t * qnn_buffer_0 = nullptr; uint8_t * qnn_buffer_1 = nullptr; uint8_t * qnn_buffer_2 = nullptr; qnn_instance * instance = ctx->instance; - qnn_buffer_0 = 
static_cast(instance->alloc_rpcmem( - ggml_nbytes(src0), 4)); - if (nullptr == qnn_buffer_0) { - QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); - goto failure; - } else { - QNN_LOG_INFO("alloc rpcmem successfully\n"); - } - instance->register_rpcmem(qnn_buffer_0, tensor_0); - memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0)); - qnn_buffer_1 = static_cast(instance->alloc_rpcmem( ggml_nbytes(src1), 4)); if (nullptr == qnn_buffer_1) { @@ -2267,7 +2249,7 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src instance->register_rpcmem(qnn_buffer_2, tensor_2); } - Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1}; + Qnn_Tensor_t tensor_inputs[] = {*tensor_writer0.get_qnn_tensor(), *tensor_1}; Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; Qnn_OpConfig_t op_config = { (Qnn_OpConfigVersion_t) 1, @@ -2308,18 +2290,14 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src QNN_VER_PTR(*tensor_2)->memHandle)); memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst)); } - auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2); + auto graph_item = std::make_tuple(graph_handle, tensor_writer0.get_qnn_tensor(), tensor_1, tensor_2); instance->_qnn_graph_map[map_entry] = graph_item; } else { auto & graph_item = instance->_qnn_graph_map[map_entry]; - graph_handle = std::get<0>(graph_item); - tensor_0 = std::get<1>(graph_item); + ggml_qnn_tensor_writer tensor_writer0(src0, std::get<1>(graph_item), ctx); tensor_1 = std::get<2>(graph_item); tensor_2 = std::get<3>(graph_item); - uint32_t dimensions_input_0[] = { - (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], - (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; uint32_t dimensions_input_1[] = { (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; @@ -2327,9 +2305,6 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; - QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; - QNN_VER_PTR(*tensor_0)->rank = qnn_get_ggml_tensor_rank(src0); - QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; QNN_VER_PTR(*tensor_1)->rank = qnn_get_ggml_tensor_rank(src1); QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; @@ -2338,25 +2313,18 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; if (ctx->device != QNN_BACKEND_NPU) { - QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, - qnn_get_ggml_tensor_data_size(src0)}; QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, qnn_get_ggml_tensor_data_size(src1)}; QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, qnn_get_ggml_tensor_data_size(dst)}; } else { - uint8_t * qnn_buffer_0 = static_cast(ctx->instance->get_rpcmem_from_memhandle( - QNN_VER_PTR(*tensor_0)->memHandle)); - if (nullptr != qnn_buffer_0) - memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0)); - uint8_t * qnn_buffer_1 = static_cast(ctx->instance->get_rpcmem_from_memhandle( QNN_VER_PTR(*tensor_1)->memHandle)); if (nullptr != qnn_buffer_1) memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1)); } - Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1}; + Qnn_Tensor_t tensor_inputs[] = {*tensor_writer0.get_qnn_tensor(), *tensor_1}; Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs,2, @@ -2382,7 +2350,6 @@ static void 
ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src failure: if (QNN_SUCCESS != error) { - QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(*tensor_0)); QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(*tensor_1)); QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(*tensor_2)); QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 @@ -2402,7 +2369,6 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src dst->nb[1], dst->nb[2]); } - QNN_VER_PTR(*tensor_0)->dimensions = tensor_0_dimensions; QNN_VER_PTR(*tensor_1)->dimensions = tensor_1_dimensions; QNN_VER_PTR(*tensor_2)->dimensions = tensor_2_dimensions; From 36e41a1055a85eee98a72f0a29c2c636f476c150 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Sun, 16 Jun 2024 21:46:15 +0800 Subject: [PATCH 020/143] use tensor wrapper in matmul --- ggml-qnn.cpp | 59 ++++++---------------------------------------------- 1 file changed, 6 insertions(+), 53 deletions(-) diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp index ab28a2daec725..8d65b6a4e59ea 100644 --- a/ggml-qnn.cpp +++ b/ggml-qnn.cpp @@ -2394,17 +2394,14 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, qnn_instance * instance = nullptr; std::string graph_name = "ggml_op_qnn_mul_mat"; Qnn_GraphHandle_t graph_handle = nullptr; - Qnn_Tensor_t * tensor_0 = nullptr; Qnn_Tensor_t * tensor_1 = nullptr; Qnn_Tensor_t * tensor_2 = nullptr; Qnn_Param_t qnn_params[] = {}; enum ggml_op ggmlop = GGML_OP_MUL_MAT; - Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32; Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32; CHECK_PARAMS(ctx, src0, src1, dst); - tensor_0 = (Qnn_Tensor_t *) src0->extra; tensor_1 = (Qnn_Tensor_t *) src1->extra; tensor_2 = (Qnn_Tensor_t *) dst->extra; instance = ctx->instance; @@ -2413,22 +2410,16 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, qnn_perf perf("ggml_qnn_mul_mat"); perf.start(); - tensor_0 = (Qnn_Tensor_t *) src0->extra; tensor_1 = (Qnn_Tensor_t *) src1->extra; tensor_2 = (Qnn_Tensor_t *) dst->extra; instance = ctx->instance; - QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; - src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type); src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type); - uint32_t dimensions_input_0[] = { - (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], (uint32_t) src0->ne[2], - (uint32_t) src0->ne[3]}; uint32_t dimensions_input_1[] = { (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; @@ -2444,7 +2435,6 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, graph_handle = std::get<0>(graph_item); } - uint32_t * tensor_0_dimensions = QNN_VER_PTR(*tensor_0)->dimensions; uint32_t * tensor_1_dimensions = QNN_VER_PTR(*tensor_1)->dimensions; uint32_t * tensor_2_dimensions = QNN_VER_PTR(*tensor_2)->dimensions; @@ -2508,9 +2498,6 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, } if (ctx->device == QNN_BACKEND_NPU) { - QNN_VER_PTR(*tensor_0)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; - QNN_VER_PTR(*tensor_0)->clientBuf= {.data=nullptr, .dataSize=0}; - QNN_VER_PTR(*tensor_1)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; QNN_VER_PTR(*tensor_1)->clientBuf= {.data=nullptr, .dataSize=0}; @@ -2518,9 +2505,8 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, 
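The hunks above (and the matching mul_mat hunks that follow) swap the hand-rolled src0 handling, tensorCreateGraphTensor plus the NPU-only alloc_rpcmem / register_rpcmem / memcpy dance, for a single ggml_qnn_tensor_writer object. The wrapper's definition is not part of the hunks shown here, so the sketch below is only an illustration of what a writer-side binding has to cover; the class and member names (ggml_qnn_input_binding, _dims, _valid) are invented, while the helpers it calls (QNN_VER_PTR, qnn_get_ggml_tensor_rank, qnn_datatype_from_ggml_datatype, qnn_get_ggml_tensor_data_size, alloc_rpcmem, register_rpcmem, QNN_LOG_WARN) are the ones already present in ggml-qnn.cpp. It is written as if it lived inside that file, and it simplifies the ordering of tensor setup versus graph-tensor creation.

// Illustrative sketch only, not the ggml_qnn_tensor_writer added by this series.
// Assumes it is placed inside ggml-qnn.cpp, after the existing helpers.
class ggml_qnn_input_binding {
  public:
    ggml_qnn_input_binding(const ggml_tensor * tensor, Qnn_GraphHandle_t graph,
                           ggml_backend_qnn_context * ctx)
        : _qnn_tensor((Qnn_Tensor_t *) tensor->extra) {
        for (int i = 0; i < GGML_MAX_DIMS; ++i) {
            _dims[i] = (uint32_t) tensor->ne[i];
        }
        QNN_VER_PTR(*_qnn_tensor)->type       = QNN_TENSOR_TYPE_APP_WRITE;
        QNN_VER_PTR(*_qnn_tensor)->dimensions = _dims;
        QNN_VER_PTR(*_qnn_tensor)->rank       = qnn_get_ggml_tensor_rank(tensor);
        QNN_VER_PTR(*_qnn_tensor)->dataType   = qnn_datatype_from_ggml_datatype(tensor->type);

        if (ctx->device == QNN_BACKEND_NPU) {
            // NPU path: back the tensor with a registered rpcmem (ION) buffer and
            // hand QNN a memhandle instead of a client buffer.
            QNN_VER_PTR(*_qnn_tensor)->memType   = QNN_TENSORMEMTYPE_MEMHANDLE;
            QNN_VER_PTR(*_qnn_tensor)->clientBuf = {.data = nullptr, .dataSize = 0};
            uint8_t * rpc_buf = static_cast<uint8_t *>(
                ctx->instance->alloc_rpcmem(ggml_nbytes(tensor), alignof(void *)));
            if (nullptr == rpc_buf) {
                QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno));
                _valid = false;
                return;
            }
            ctx->instance->register_rpcmem(rpc_buf, _qnn_tensor);
            memcpy(rpc_buf, tensor->data, ggml_nbytes(tensor));
        } else {
            // CPU/GPU path: QNN reads straight from the ggml buffer.
            QNN_VER_PTR(*_qnn_tensor)->clientBuf = {tensor->data,
                                                    qnn_get_ggml_tensor_data_size(tensor)};
        }
        _valid = (ctx->raw_interface.tensorCreateGraphTensor(graph, _qnn_tensor) == QNN_SUCCESS);
    }

    bool           is_valid() const       { return _valid; }
    Qnn_Tensor_t * get_qnn_tensor() const { return _qnn_tensor; }

  private:
    Qnn_Tensor_t * _qnn_tensor;
    uint32_t       _dims[GGML_MAX_DIMS] = {};
    bool           _valid = true;
};

The payoff is visible in the diffs themselves: per-tensor failure handling at each call site collapses into a single is_valid() check.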
QNN_VER_PTR(*tensor_2)->clientBuf= {.data=nullptr, .dataSize=0}; } - error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_0); - if (QNN_SUCCESS != error) { - QNN_LOG_INFO("error = %d\n", error); + ggml_qnn_tensor_writer tensor_writer0(src0, graph_handle, ctx); + if (!tensor_writer0.is_valid()) { goto failure; } error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_1); @@ -2534,9 +2520,6 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, goto failure; } - QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; - QNN_VER_PTR(*tensor_0)->rank = qnn_get_ggml_tensor_rank(src0); - QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; QNN_VER_PTR(*tensor_1)->rank = qnn_get_ggml_tensor_rank(src1); QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; @@ -2545,29 +2528,15 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; if (ctx->device != QNN_BACKEND_NPU) { - QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, - qnn_get_ggml_tensor_data_size(src0)}; QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, qnn_get_ggml_tensor_data_size(src1)}; QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, qnn_get_ggml_tensor_data_size(dst)}; } else { - uint8_t * qnn_buffer_0 = nullptr; uint8_t * qnn_buffer_1 = nullptr; uint8_t * qnn_buffer_2 = nullptr; qnn_instance * instance = ctx->instance; - qnn_buffer_0 = static_cast(instance->alloc_rpcmem( - ggml_nbytes(src0), 4)); - if (nullptr == qnn_buffer_0) { - QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); - goto failure; - } else { - QNN_LOG_INFO("alloc rpcmem successfully\n"); - } - instance->register_rpcmem(qnn_buffer_0, tensor_0); - memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0)); - qnn_buffer_1 = static_cast(instance->alloc_rpcmem( ggml_nbytes(src1), 4)); if (nullptr == qnn_buffer_1) { @@ -2590,7 +2559,7 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, instance->register_rpcmem(qnn_buffer_2, tensor_2); } - Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1}; + Qnn_Tensor_t tensor_inputs[] = {*tensor_writer0.get_qnn_tensor(), *tensor_1}; Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; Qnn_OpConfig_t op_config = { (Qnn_OpConfigVersion_t) 1, @@ -2632,27 +2601,20 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst)); } - auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2); + auto graph_item = std::make_tuple(graph_handle, tensor_writer0.get_qnn_tensor(), tensor_1, tensor_2); instance->_qnn_graph_map[map_entry] = graph_item; } else { auto & graph_item= instance->_qnn_graph_map[map_entry]; - graph_handle = std::get<0>(graph_item); - tensor_0 = std::get<1>(graph_item); + ggml_qnn_tensor_writer tensor_writer0(src0, std::get<1>(graph_item), ctx); tensor_1 = std::get<2>(graph_item); tensor_2 = std::get<3>(graph_item); - uint32_t dimensions_input_0[] = { - (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], - (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; uint32_t dimensions_input_1[] = { (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; uint32_t dimensions_output[] = { (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; - QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; - QNN_VER_PTR(*tensor_0)->rank = qnn_get_ggml_tensor_rank(src0); - QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; QNN_VER_PTR(*tensor_1)->dimensions 
= dimensions_input_1; QNN_VER_PTR(*tensor_1)->rank = qnn_get_ggml_tensor_rank(src1); QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; @@ -2661,25 +2623,18 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; if (ctx->device != QNN_BACKEND_NPU) { - QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, - qnn_get_ggml_tensor_data_size(src0)}; QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, qnn_get_ggml_tensor_data_size(src1)}; QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, qnn_get_ggml_tensor_data_size(dst)}; } else { - uint8_t * qnn_buffer_0 = static_cast(ctx->instance->get_rpcmem_from_memhandle( - QNN_VER_PTR(*tensor_0)->memHandle)); - if (nullptr != qnn_buffer_0) - memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0)); - uint8_t * qnn_buffer_1 = static_cast(ctx->instance->get_rpcmem_from_memhandle( QNN_VER_PTR(*tensor_1)->memHandle)); if (nullptr != qnn_buffer_1) memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1)); } - Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1}; + Qnn_Tensor_t tensor_inputs[] = {*tensor_writer0.get_qnn_tensor(), *tensor_1}; Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, @@ -2705,7 +2660,6 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, failure: if (QNN_SUCCESS != error) { - QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(*tensor_0)); QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(*tensor_1)); QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(*tensor_2)); QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 @@ -2724,7 +2678,6 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, dst->ne[1], dst->ne[2], dst->nb[0], dst->nb[1], dst->nb[2]); } - QNN_VER_PTR(*tensor_0)->dimensions = tensor_0_dimensions; QNN_VER_PTR(*tensor_1)->dimensions = tensor_1_dimensions; QNN_VER_PTR(*tensor_2)->dimensions = tensor_2_dimensions; perf.info(); From a5679ddd8e6f1b0ebfe2b876e6720e5d793e9bb5 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Sun, 16 Jun 2024 22:01:14 +0800 Subject: [PATCH 021/143] use ggml_qnn_tensor_reader for output tensor --- ggml-qnn.cpp | 128 +++++---------------------------------------------- 1 file changed, 12 insertions(+), 116 deletions(-) diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp index 8d65b6a4e59ea..eda83597f53b1 100644 --- a/ggml-qnn.cpp +++ b/ggml-qnn.cpp @@ -2091,15 +2091,12 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src std::string graph_name = "ggml_op_qnn_add"; Qnn_GraphHandle_t graph_handle = nullptr; Qnn_Tensor_t * tensor_1 = nullptr; - Qnn_Tensor_t * tensor_2 = nullptr; Qnn_Param_t qnn_params[] = {}; enum ggml_op ggmlop = GGML_OP_ADD; Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; - Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32; CHECK_PARAMS(ctx, src0, src1, dst); tensor_1 = (Qnn_Tensor_t *) src1->extra; - tensor_2 = (Qnn_Tensor_t *) dst->extra; instance = ctx->instance; QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; @@ -2107,17 +2104,12 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src perf.start(); QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; - QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); - dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type); uint32_t dimensions_input_1[] = { (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; - uint32_t 
dimensions_output[] = { - (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], - (uint32_t) dst->ne[3]}; std::string map_entry = std::string(ggml_op_name(ggmlop)); if (instance->_qnn_graph_map.find(map_entry) != @@ -2128,7 +2120,6 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src } uint32_t * tensor_1_dimensions = QNN_VER_PTR(*tensor_1)->dimensions; - uint32_t * tensor_2_dimensions = QNN_VER_PTR(*tensor_2)->dimensions; if (!graph_initialized) { graph_name = graph_name + "_" + std::to_string(ctx->threads) + @@ -2190,9 +2181,6 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src if (ctx->device == QNN_BACKEND_NPU) { QNN_VER_PTR(*tensor_1)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; QNN_VER_PTR(*tensor_1)->clientBuf= {.data=nullptr, .dataSize=0}; - - QNN_VER_PTR(*tensor_2)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; - QNN_VER_PTR(*tensor_2)->clientBuf= {.data=nullptr, .dataSize=0}; } ggml_qnn_tensor_writer tensor_writer0(src0, graph_handle, ctx); @@ -2204,27 +2192,20 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src QNN_LOG_INFO("error = %d\n", error); goto failure; } - error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_2); - if (QNN_SUCCESS != error) { - QNN_LOG_INFO("error = %d\n", error); + ggml_qnn_tensor_reader tensor_reader(dst, graph_handle, ctx); + if (!tensor_writer0.is_valid()) { goto failure; } QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; QNN_VER_PTR(*tensor_1)->rank = qnn_get_ggml_tensor_rank(src1); QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; - QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; - QNN_VER_PTR(*tensor_2)->rank = qnn_get_ggml_tensor_rank(dst); - QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; if (ctx->device != QNN_BACKEND_NPU) { QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, qnn_get_ggml_tensor_data_size(src1)}; - QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, - qnn_get_ggml_tensor_data_size(dst)}; } else { uint8_t * qnn_buffer_1 = nullptr; - uint8_t * qnn_buffer_2 = nullptr; qnn_instance * instance = ctx->instance; qnn_buffer_1 = static_cast(instance->alloc_rpcmem( @@ -2237,20 +2218,10 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src } instance->register_rpcmem(qnn_buffer_1, tensor_1); memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1)); - - qnn_buffer_2 = static_cast(instance->alloc_rpcmem( - ggml_nbytes(dst), 4)); - if (nullptr == qnn_buffer_2) { - QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); - goto failure; - } else { - QNN_LOG_INFO("alloc rpcmem successfully\n"); - } - instance->register_rpcmem(qnn_buffer_2, tensor_2); } Qnn_Tensor_t tensor_inputs[] = {*tensor_writer0.get_qnn_tensor(), *tensor_1}; - Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; + Qnn_Tensor_t tensor_outputs[] = {*tensor_reader.get_qnn_tensor()}; Qnn_OpConfig_t op_config = { (Qnn_OpConfigVersion_t) 1, .v1 = {"ggml_op_add", @@ -2285,38 +2256,25 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src goto failure; } - if (ctx->device == QNN_BACKEND_NPU) { - uint8_t * qnn_buffer_2 = static_cast(ctx->instance->get_rpcmem_from_memhandle( - QNN_VER_PTR(*tensor_2)->memHandle)); - memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst)); - } - auto graph_item = std::make_tuple(graph_handle, tensor_writer0.get_qnn_tensor(), tensor_1, tensor_2); + auto graph_item = std::make_tuple(graph_handle, tensor_writer0.get_qnn_tensor(), tensor_1, tensor_reader.get_qnn_tensor()); 
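The graph_item tuple stored just above is the whole reuse mechanism: the first call for a given op builds and finalizes a QNN graph, then caches the graph handle plus the three tensor pointers in instance->_qnn_graph_map; later calls fetch the tuple and only rebind buffers before graphExecute. Below is a standalone sketch of that lookup-or-build pattern with plain void * placeholders instead of the QNN handle types, so it compiles without the SDK; get_or_build_graph and build_graph_for_op are made-up names for illustration.

#include <cstdio>
#include <functional>
#include <map>
#include <string>
#include <tuple>

// Placeholders standing in for Qnn_GraphHandle_t and Qnn_Tensor_t * in this sketch.
using graph_handle_t = void *;
using tensor_ptr_t   = void *;
using graph_item_t   = std::tuple<graph_handle_t, tensor_ptr_t, tensor_ptr_t, tensor_ptr_t>;

static std::map<std::string, graph_item_t> qnn_graph_map;

// Returns the cached graph for this op, building (and finalizing) it only once.
static graph_item_t get_or_build_graph(const std::string & op_name,
                                        const std::function<graph_item_t()> & build_graph_for_op) {
    auto it = qnn_graph_map.find(op_name);
    if (it != qnn_graph_map.end()) {
        return it->second;                        // reuse: only input/output buffers get rebound
    }
    graph_item_t item = build_graph_for_op();     // graphCreate + tensor setup + graphFinalize
    qnn_graph_map.emplace(op_name, item);
    return item;
}

int main() {
    int calls = 0;
    auto builder = [&]() { ++calls; return graph_item_t{nullptr, nullptr, nullptr, nullptr}; };
    get_or_build_graph("ADD", builder);
    get_or_build_graph("ADD", builder);           // second call hits the cache
    printf("graph built %d time(s)\n", calls);    // prints 1
    return 0;
}

Worth noting from the real code: the cache key is just ggml_op_name(ggmlop), so every shape that reaches a given op appears to reuse the same finalized graph.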
instance->_qnn_graph_map[map_entry] = graph_item; } else { auto & graph_item = instance->_qnn_graph_map[map_entry]; ggml_qnn_tensor_writer tensor_writer0(src0, std::get<1>(graph_item), ctx); tensor_1 = std::get<2>(graph_item); - tensor_2 = std::get<3>(graph_item); + ggml_qnn_tensor_reader tensor_reader(dst, std::get<3>(graph_item), ctx); uint32_t dimensions_input_1[] = { (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; - uint32_t dimensions_output[] = { - (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], - (uint32_t) dst->ne[3]}; QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; QNN_VER_PTR(*tensor_1)->rank = qnn_get_ggml_tensor_rank(src1); QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; - QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; - QNN_VER_PTR(*tensor_2)->rank = qnn_get_ggml_tensor_rank(dst); - QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; if (ctx->device != QNN_BACKEND_NPU) { QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, qnn_get_ggml_tensor_data_size(src1)}; - QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, - qnn_get_ggml_tensor_data_size(dst)}; } else { uint8_t * qnn_buffer_1 = static_cast(ctx->instance->get_rpcmem_from_memhandle( QNN_VER_PTR(*tensor_1)->memHandle)); @@ -2325,7 +2283,7 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src } Qnn_Tensor_t tensor_inputs[] = {*tensor_writer0.get_qnn_tensor(), *tensor_1}; - Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; + Qnn_Tensor_t tensor_outputs[] = {*tensor_reader.get_qnn_tensor()}; error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs,2, tensor_outputs,1, @@ -2339,19 +2297,11 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src QNN_LOG_INFO("error = %d\n", error); goto failure; } - - if (ctx->device == QNN_BACKEND_NPU) { - uint8_t * qnn_buffer_2 = static_cast(ctx->instance->get_rpcmem_from_memhandle( - QNN_VER_PTR(*tensor_2)->memHandle)); - if (nullptr != qnn_buffer_2) - memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst)); - } } failure: if (QNN_SUCCESS != error) { QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(*tensor_1)); - QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(*tensor_2)); QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", src0->name, src0->type, ggml_type_name(src0->type), @@ -2370,7 +2320,6 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src } QNN_VER_PTR(*tensor_1)->dimensions = tensor_1_dimensions; - QNN_VER_PTR(*tensor_2)->dimensions = tensor_2_dimensions; perf.info(); } @@ -2395,15 +2344,12 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, std::string graph_name = "ggml_op_qnn_mul_mat"; Qnn_GraphHandle_t graph_handle = nullptr; Qnn_Tensor_t * tensor_1 = nullptr; - Qnn_Tensor_t * tensor_2 = nullptr; Qnn_Param_t qnn_params[] = {}; enum ggml_op ggmlop = GGML_OP_MUL_MAT; Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; - Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32; CHECK_PARAMS(ctx, src0, src1, dst); tensor_1 = (Qnn_Tensor_t *) src1->extra; - tensor_2 = (Qnn_Tensor_t *) dst->extra; instance = ctx->instance; QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; @@ -2411,21 +2357,15 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, perf.start(); tensor_1 = (Qnn_Tensor_t *) src1->extra; - tensor_2 = (Qnn_Tensor_t *) dst->extra; instance = ctx->instance; QNN_VER_PTR(*tensor_1)->type = 
QNN_TENSOR_TYPE_APP_WRITE; - QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); - dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type); uint32_t dimensions_input_1[] = { (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; - uint32_t dimensions_output[] = { - (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], - (uint32_t) dst->ne[3]}; std::string map_entry = std::string(ggml_op_name(ggmlop)); if (instance->_qnn_graph_map.find(map_entry) != @@ -2436,7 +2376,6 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, } uint32_t * tensor_1_dimensions = QNN_VER_PTR(*tensor_1)->dimensions; - uint32_t * tensor_2_dimensions = QNN_VER_PTR(*tensor_2)->dimensions; //TODO: for scenarios of quantized data in src0 // pass-1: dequantize src0 to FP32 @@ -2500,9 +2439,6 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, if (ctx->device == QNN_BACKEND_NPU) { QNN_VER_PTR(*tensor_1)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; QNN_VER_PTR(*tensor_1)->clientBuf= {.data=nullptr, .dataSize=0}; - - QNN_VER_PTR(*tensor_2)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; - QNN_VER_PTR(*tensor_2)->clientBuf= {.data=nullptr, .dataSize=0}; } ggml_qnn_tensor_writer tensor_writer0(src0, graph_handle, ctx); @@ -2514,27 +2450,20 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, QNN_LOG_INFO("error = %d\n", error); goto failure; } - error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_2); - if (QNN_SUCCESS != error) { - QNN_LOG_INFO("error = %d\n", error); + ggml_qnn_tensor_reader tensor_reader(dst, graph_handle, ctx); + if (!tensor_writer0.is_valid()) { goto failure; } QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; QNN_VER_PTR(*tensor_1)->rank = qnn_get_ggml_tensor_rank(src1); QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; - QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; - QNN_VER_PTR(*tensor_2)->rank = qnn_get_ggml_tensor_rank(dst); - QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; if (ctx->device != QNN_BACKEND_NPU) { QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, qnn_get_ggml_tensor_data_size(src1)}; - QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, - qnn_get_ggml_tensor_data_size(dst)}; } else { uint8_t * qnn_buffer_1 = nullptr; - uint8_t * qnn_buffer_2 = nullptr; qnn_instance * instance = ctx->instance; qnn_buffer_1 = static_cast(instance->alloc_rpcmem( @@ -2547,20 +2476,10 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, } instance->register_rpcmem(qnn_buffer_1, tensor_1); memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1)); - - qnn_buffer_2 = static_cast(instance->alloc_rpcmem( - ggml_nbytes(dst), 4)); - if (nullptr == qnn_buffer_2) { - QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); - goto failure; - } else { - QNN_LOG_INFO("alloc rpcmem successfully\n"); - } - instance->register_rpcmem(qnn_buffer_2, tensor_2); } Qnn_Tensor_t tensor_inputs[] = {*tensor_writer0.get_qnn_tensor(), *tensor_1}; - Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; + Qnn_Tensor_t tensor_outputs[] = {*tensor_reader.get_qnn_tensor()}; Qnn_OpConfig_t op_config = { (Qnn_OpConfigVersion_t) 1, .v1 = {"ggml_op_mul_mat", @@ -2595,38 +2514,24 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, goto failure; } - if (ctx->device == QNN_BACKEND_NPU) { - uint8_t * qnn_buffer_2 = static_cast(ctx->instance->get_rpcmem_from_memhandle( - QNN_VER_PTR(*tensor_2)->memHandle)); - memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst)); - 
} - - auto graph_item = std::make_tuple(graph_handle, tensor_writer0.get_qnn_tensor(), tensor_1, tensor_2); + auto graph_item = std::make_tuple(graph_handle, tensor_writer0.get_qnn_tensor(), tensor_1, tensor_reader.get_qnn_tensor()); instance->_qnn_graph_map[map_entry] = graph_item; } else { auto & graph_item= instance->_qnn_graph_map[map_entry]; ggml_qnn_tensor_writer tensor_writer0(src0, std::get<1>(graph_item), ctx); tensor_1 = std::get<2>(graph_item); - tensor_2 = std::get<3>(graph_item); + ggml_qnn_tensor_reader tensor_reader(dst, std::get<3>(graph_item), ctx); uint32_t dimensions_input_1[] = { (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; - uint32_t dimensions_output[] = { - (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], - (uint32_t) dst->ne[3]}; QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; QNN_VER_PTR(*tensor_1)->rank = qnn_get_ggml_tensor_rank(src1); QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; - QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; - QNN_VER_PTR(*tensor_2)->rank = qnn_get_ggml_tensor_rank(dst); - QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; if (ctx->device != QNN_BACKEND_NPU) { QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, qnn_get_ggml_tensor_data_size(src1)}; - QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, - qnn_get_ggml_tensor_data_size(dst)}; } else { uint8_t * qnn_buffer_1 = static_cast(ctx->instance->get_rpcmem_from_memhandle( QNN_VER_PTR(*tensor_1)->memHandle)); @@ -2635,7 +2540,7 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, } Qnn_Tensor_t tensor_inputs[] = {*tensor_writer0.get_qnn_tensor(), *tensor_1}; - Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; + Qnn_Tensor_t tensor_outputs[] = {*tensor_reader.get_qnn_tensor()}; error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, @@ -2649,19 +2554,11 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, QNN_LOG_INFO("error = %d\n", error); goto failure; } - - if (ctx->device == QNN_BACKEND_NPU) { - uint8_t * qnn_buffer_2 = static_cast(ctx->instance->get_rpcmem_from_memhandle( - QNN_VER_PTR(*tensor_2)->memHandle)); - if (nullptr != qnn_buffer_2) - memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst)); - } } failure: if (QNN_SUCCESS != error) { QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(*tensor_1)); - QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(*tensor_2)); QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", src0->name, src0->type, ggml_type_name(src0->type), @@ -2679,7 +2576,6 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, } QNN_VER_PTR(*tensor_1)->dimensions = tensor_1_dimensions; - QNN_VER_PTR(*tensor_2)->dimensions = tensor_2_dimensions; perf.info(); } From 5fe7b87ba1b850ddf896ea3ce48acf5c892b56d0 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Sun, 16 Jun 2024 23:54:00 +0800 Subject: [PATCH 022/143] use ggml_qnn_tensor_writer for all parameters --- ggml-qnn.cpp | 161 +++++++-------------------------------------------- 1 file changed, 20 insertions(+), 141 deletions(-) diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp index eda83597f53b1..c23d67bb3affc 100644 --- a/ggml-qnn.cpp +++ b/ggml-qnn.cpp @@ -1991,7 +1991,6 @@ class ggml_qnn_tensor_readwrite QNN_VER_PTR(*_qnn_tensor)->dimensions = _dimensions; QNN_VER_PTR(*_qnn_tensor)->rank = qnn_get_ggml_tensor_rank(tensor); QNN_VER_PTR(*_qnn_tensor)->dataType = qnn_data_type; - if (is_npu) { qnn_instance * instance = 
ctx->instance; @@ -2090,27 +2089,16 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src qnn_instance * instance = nullptr; std::string graph_name = "ggml_op_qnn_add"; Qnn_GraphHandle_t graph_handle = nullptr; - Qnn_Tensor_t * tensor_1 = nullptr; Qnn_Param_t qnn_params[] = {}; enum ggml_op ggmlop = GGML_OP_ADD; - Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; CHECK_PARAMS(ctx, src0, src1, dst); - tensor_1 = (Qnn_Tensor_t *) src1->extra; instance = ctx->instance; QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; qnn_perf perf("ggml_qnn_add"); perf.start(); - QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; - - src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); - - uint32_t dimensions_input_1[] = { - (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], (uint32_t) src1->ne[2], - (uint32_t) src1->ne[3]}; - std::string map_entry = std::string(ggml_op_name(ggmlop)); if (instance->_qnn_graph_map.find(map_entry) != instance->_qnn_graph_map.end()) { @@ -2119,8 +2107,6 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src graph_handle = std::get<0>(graph_item); } - uint32_t * tensor_1_dimensions = QNN_VER_PTR(*tensor_1)->dimensions; - if (!graph_initialized) { graph_name = graph_name + "_" + std::to_string(ctx->threads) + "_" + src0->name + "_" + src1->name; @@ -2178,49 +2164,21 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src QNN_LOG_INFO("create qnn graph handle with graph name %s ok\n", graph_name.c_str()); } - if (ctx->device == QNN_BACKEND_NPU) { - QNN_VER_PTR(*tensor_1)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; - QNN_VER_PTR(*tensor_1)->clientBuf= {.data=nullptr, .dataSize=0}; - } - ggml_qnn_tensor_writer tensor_writer0(src0, graph_handle, ctx); if (!tensor_writer0.is_valid()) { goto failure; } - error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_1); - if (QNN_SUCCESS != error) { + ggml_qnn_tensor_writer tensor_writer1(src1, graph_handle, ctx); + if (!tensor_writer1.is_valid()) { QNN_LOG_INFO("error = %d\n", error); goto failure; } ggml_qnn_tensor_reader tensor_reader(dst, graph_handle, ctx); - if (!tensor_writer0.is_valid()) { + if (!tensor_reader.is_valid()) { goto failure; } - QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; - QNN_VER_PTR(*tensor_1)->rank = qnn_get_ggml_tensor_rank(src1); - QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; - - if (ctx->device != QNN_BACKEND_NPU) { - QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, - qnn_get_ggml_tensor_data_size(src1)}; - } else { - uint8_t * qnn_buffer_1 = nullptr; - qnn_instance * instance = ctx->instance; - - qnn_buffer_1 = static_cast(instance->alloc_rpcmem( - ggml_nbytes(src1), 4)); - if (nullptr == qnn_buffer_1) { - QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); - goto failure; - } else { - QNN_LOG_INFO("alloc rpcmem successfully\n"); - } - instance->register_rpcmem(qnn_buffer_1, tensor_1); - memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1)); - } - - Qnn_Tensor_t tensor_inputs[] = {*tensor_writer0.get_qnn_tensor(), *tensor_1}; + Qnn_Tensor_t tensor_inputs[] = {*tensor_writer0.get_qnn_tensor(), *tensor_writer1.get_qnn_tensor()}; Qnn_Tensor_t tensor_outputs[] = {*tensor_reader.get_qnn_tensor()}; Qnn_OpConfig_t op_config = { (Qnn_OpConfigVersion_t) 1, @@ -2256,33 +2214,18 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src goto failure; } - auto graph_item = std::make_tuple(graph_handle, tensor_writer0.get_qnn_tensor(), tensor_1, 
tensor_reader.get_qnn_tensor()); + auto graph_item = std::make_tuple(graph_handle, + tensor_writer0.get_qnn_tensor(), + tensor_writer1.get_qnn_tensor(), + tensor_reader.get_qnn_tensor()); instance->_qnn_graph_map[map_entry] = graph_item; } else { auto & graph_item = instance->_qnn_graph_map[map_entry]; ggml_qnn_tensor_writer tensor_writer0(src0, std::get<1>(graph_item), ctx); - tensor_1 = std::get<2>(graph_item); + ggml_qnn_tensor_writer tensor_writer1(src1, std::get<2>(graph_item), ctx); ggml_qnn_tensor_reader tensor_reader(dst, std::get<3>(graph_item), ctx); - uint32_t dimensions_input_1[] = { - (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], - (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; - - QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; - QNN_VER_PTR(*tensor_1)->rank = qnn_get_ggml_tensor_rank(src1); - QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; - - if (ctx->device != QNN_BACKEND_NPU) { - QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, - qnn_get_ggml_tensor_data_size(src1)}; - } else { - uint8_t * qnn_buffer_1 = static_cast(ctx->instance->get_rpcmem_from_memhandle( - QNN_VER_PTR(*tensor_1)->memHandle)); - if (nullptr != qnn_buffer_1) - memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1)); - } - - Qnn_Tensor_t tensor_inputs[] = {*tensor_writer0.get_qnn_tensor(), *tensor_1}; + Qnn_Tensor_t tensor_inputs[] = {*tensor_writer0.get_qnn_tensor(), *tensor_writer1.get_qnn_tensor()}; Qnn_Tensor_t tensor_outputs[] = {*tensor_reader.get_qnn_tensor()}; error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs,2, @@ -2301,7 +2244,6 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src failure: if (QNN_SUCCESS != error) { - QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(*tensor_1)); QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", src0->name, src0->type, ggml_type_name(src0->type), @@ -2319,8 +2261,6 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src dst->nb[1], dst->nb[2]); } - QNN_VER_PTR(*tensor_1)->dimensions = tensor_1_dimensions; - perf.info(); } @@ -2343,30 +2283,16 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, qnn_instance * instance = nullptr; std::string graph_name = "ggml_op_qnn_mul_mat"; Qnn_GraphHandle_t graph_handle = nullptr; - Qnn_Tensor_t * tensor_1 = nullptr; Qnn_Param_t qnn_params[] = {}; enum ggml_op ggmlop = GGML_OP_MUL_MAT; - Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; CHECK_PARAMS(ctx, src0, src1, dst); - tensor_1 = (Qnn_Tensor_t *) src1->extra; instance = ctx->instance; QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; qnn_perf perf("ggml_qnn_mul_mat"); perf.start(); - tensor_1 = (Qnn_Tensor_t *) src1->extra; - instance = ctx->instance; - - QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; - - src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); - - uint32_t dimensions_input_1[] = { - (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], (uint32_t) src1->ne[2], - (uint32_t) src1->ne[3]}; - std::string map_entry = std::string(ggml_op_name(ggmlop)); if (instance->_qnn_graph_map.find(map_entry) != instance->_qnn_graph_map.end()) { @@ -2375,8 +2301,6 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, graph_handle = std::get<0>(graph_item); } - uint32_t * tensor_1_dimensions = QNN_VER_PTR(*tensor_1)->dimensions; - //TODO: for scenarios of quantized data in src0 // pass-1: dequantize src0 to FP32 // pass-2: dq-src0 * src1 @@ -2436,49 +2360,20 @@ static void 
ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, goto failure; } - if (ctx->device == QNN_BACKEND_NPU) { - QNN_VER_PTR(*tensor_1)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; - QNN_VER_PTR(*tensor_1)->clientBuf= {.data=nullptr, .dataSize=0}; - } - ggml_qnn_tensor_writer tensor_writer0(src0, graph_handle, ctx); if (!tensor_writer0.is_valid()) { goto failure; } - error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_1); - if (QNN_SUCCESS != error) { - QNN_LOG_INFO("error = %d\n", error); + ggml_qnn_tensor_writer tensor_writer1(src1, graph_handle, ctx); + if (!tensor_writer1.is_valid()) { goto failure; } ggml_qnn_tensor_reader tensor_reader(dst, graph_handle, ctx); - if (!tensor_writer0.is_valid()) { + if (!tensor_reader.is_valid()) { goto failure; } - QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; - QNN_VER_PTR(*tensor_1)->rank = qnn_get_ggml_tensor_rank(src1); - QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; - - if (ctx->device != QNN_BACKEND_NPU) { - QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, - qnn_get_ggml_tensor_data_size(src1)}; - } else { - uint8_t * qnn_buffer_1 = nullptr; - qnn_instance * instance = ctx->instance; - - qnn_buffer_1 = static_cast(instance->alloc_rpcmem( - ggml_nbytes(src1), 4)); - if (nullptr == qnn_buffer_1) { - QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); - goto failure; - } else { - QNN_LOG_INFO("alloc rpcmem successfully\n"); - } - instance->register_rpcmem(qnn_buffer_1, tensor_1); - memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1)); - } - - Qnn_Tensor_t tensor_inputs[] = {*tensor_writer0.get_qnn_tensor(), *tensor_1}; + Qnn_Tensor_t tensor_inputs[] = {*tensor_writer0.get_qnn_tensor(), *tensor_writer1.get_qnn_tensor()}; Qnn_Tensor_t tensor_outputs[] = {*tensor_reader.get_qnn_tensor()}; Qnn_OpConfig_t op_config = { (Qnn_OpConfigVersion_t) 1, @@ -2514,32 +2409,18 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, goto failure; } - auto graph_item = std::make_tuple(graph_handle, tensor_writer0.get_qnn_tensor(), tensor_1, tensor_reader.get_qnn_tensor()); + auto graph_item = std::make_tuple(graph_handle, + tensor_writer0.get_qnn_tensor(), + tensor_writer1.get_qnn_tensor(), + tensor_reader.get_qnn_tensor()); instance->_qnn_graph_map[map_entry] = graph_item; } else { auto & graph_item= instance->_qnn_graph_map[map_entry]; ggml_qnn_tensor_writer tensor_writer0(src0, std::get<1>(graph_item), ctx); - tensor_1 = std::get<2>(graph_item); + ggml_qnn_tensor_writer tensor_writer1(src1, std::get<2>(graph_item), ctx); ggml_qnn_tensor_reader tensor_reader(dst, std::get<3>(graph_item), ctx); - uint32_t dimensions_input_1[] = { - (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], - (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; - QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; - QNN_VER_PTR(*tensor_1)->rank = qnn_get_ggml_tensor_rank(src1); - QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; - - if (ctx->device != QNN_BACKEND_NPU) { - QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, - qnn_get_ggml_tensor_data_size(src1)}; - } else { - uint8_t * qnn_buffer_1 = static_cast(ctx->instance->get_rpcmem_from_memhandle( - QNN_VER_PTR(*tensor_1)->memHandle)); - if (nullptr != qnn_buffer_1) - memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1)); - } - - Qnn_Tensor_t tensor_inputs[] = {*tensor_writer0.get_qnn_tensor(), *tensor_1}; + Qnn_Tensor_t tensor_inputs[] = {*tensor_writer0.get_qnn_tensor(), *tensor_writer1.get_qnn_tensor()}; Qnn_Tensor_t tensor_outputs[] = {*tensor_reader.get_qnn_tensor()}; error = 
qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, @@ -2558,7 +2439,6 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, failure: if (QNN_SUCCESS != error) { - QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(*tensor_1)); QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", src0->name, src0->type, ggml_type_name(src0->type), @@ -2575,7 +2455,6 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, dst->ne[1], dst->ne[2], dst->nb[0], dst->nb[1], dst->nb[2]); } - QNN_VER_PTR(*tensor_1)->dimensions = tensor_1_dimensions; perf.info(); } From 9456bba1210a6ec95f96adf92ff8b263d7786253 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Mon, 17 Jun 2024 18:44:19 +0800 Subject: [PATCH 023/143] rename --- ggml-qnn.cpp | 68 ++++++++++++++++++++++++++-------------------------- 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp index c23d67bb3affc..b97b202453fa0 100644 --- a/ggml-qnn.cpp +++ b/ggml-qnn.cpp @@ -2077,8 +2077,8 @@ class ggml_qnn_tensor_readwrite void operator=(ggml_qnn_tensor_readwrite&&) = delete; }; -using ggml_qnn_tensor_reader = ggml_qnn_tensor_readwrite; -using ggml_qnn_tensor_writer = ggml_qnn_tensor_readwrite; +using ggml_qnn_tensor_output = ggml_qnn_tensor_readwrite; +using ggml_qnn_tensor_input = ggml_qnn_tensor_readwrite; //TODO: this function can be removed later because there are duplicated codes with ggml_qnn_mul_mat // keep it for illustrate how to implement a specified GGMPL OP using QNN API + QNN RPC @@ -2164,22 +2164,22 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src QNN_LOG_INFO("create qnn graph handle with graph name %s ok\n", graph_name.c_str()); } - ggml_qnn_tensor_writer tensor_writer0(src0, graph_handle, ctx); - if (!tensor_writer0.is_valid()) { + ggml_qnn_tensor_input tensor_input0(src0, graph_handle, ctx); + if (!tensor_input0.is_valid()) { goto failure; } - ggml_qnn_tensor_writer tensor_writer1(src1, graph_handle, ctx); - if (!tensor_writer1.is_valid()) { + ggml_qnn_tensor_input tensor_input1(src1, graph_handle, ctx); + if (!tensor_input1.is_valid()) { QNN_LOG_INFO("error = %d\n", error); goto failure; } - ggml_qnn_tensor_reader tensor_reader(dst, graph_handle, ctx); - if (!tensor_reader.is_valid()) { + ggml_qnn_tensor_output tensor_output(dst, graph_handle, ctx); + if (!tensor_output.is_valid()) { goto failure; } - Qnn_Tensor_t tensor_inputs[] = {*tensor_writer0.get_qnn_tensor(), *tensor_writer1.get_qnn_tensor()}; - Qnn_Tensor_t tensor_outputs[] = {*tensor_reader.get_qnn_tensor()}; + Qnn_Tensor_t tensor_inputs[] = {*tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor()}; + Qnn_Tensor_t tensor_outputs[] = {*tensor_output.get_qnn_tensor()}; Qnn_OpConfig_t op_config = { (Qnn_OpConfigVersion_t) 1, .v1 = {"ggml_op_add", @@ -2215,18 +2215,18 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src } auto graph_item = std::make_tuple(graph_handle, - tensor_writer0.get_qnn_tensor(), - tensor_writer1.get_qnn_tensor(), - tensor_reader.get_qnn_tensor()); + tensor_input0.get_qnn_tensor(), + tensor_input1.get_qnn_tensor(), + tensor_output.get_qnn_tensor()); instance->_qnn_graph_map[map_entry] = graph_item; } else { auto & graph_item = instance->_qnn_graph_map[map_entry]; - ggml_qnn_tensor_writer tensor_writer0(src0, std::get<1>(graph_item), ctx); - ggml_qnn_tensor_writer tensor_writer1(src1, std::get<2>(graph_item), ctx); - ggml_qnn_tensor_reader tensor_reader(dst, 
std::get<3>(graph_item), ctx); + ggml_qnn_tensor_input tensor_input0(src0, std::get<1>(graph_item), ctx); + ggml_qnn_tensor_input tensor_input1(src1, std::get<2>(graph_item), ctx); + ggml_qnn_tensor_output tensor_output(dst, std::get<3>(graph_item), ctx); - Qnn_Tensor_t tensor_inputs[] = {*tensor_writer0.get_qnn_tensor(), *tensor_writer1.get_qnn_tensor()}; - Qnn_Tensor_t tensor_outputs[] = {*tensor_reader.get_qnn_tensor()}; + Qnn_Tensor_t tensor_inputs[] = {*tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor()}; + Qnn_Tensor_t tensor_outputs[] = {*tensor_output.get_qnn_tensor()}; error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs,2, tensor_outputs,1, @@ -2360,21 +2360,21 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, goto failure; } - ggml_qnn_tensor_writer tensor_writer0(src0, graph_handle, ctx); - if (!tensor_writer0.is_valid()) { + ggml_qnn_tensor_input tensor_input0(src0, graph_handle, ctx); + if (!tensor_input0.is_valid()) { goto failure; } - ggml_qnn_tensor_writer tensor_writer1(src1, graph_handle, ctx); - if (!tensor_writer1.is_valid()) { + ggml_qnn_tensor_input tensor_input1(src1, graph_handle, ctx); + if (!tensor_input1.is_valid()) { goto failure; } - ggml_qnn_tensor_reader tensor_reader(dst, graph_handle, ctx); - if (!tensor_reader.is_valid()) { + ggml_qnn_tensor_output tensor_output(dst, graph_handle, ctx); + if (!tensor_output.is_valid()) { goto failure; } - Qnn_Tensor_t tensor_inputs[] = {*tensor_writer0.get_qnn_tensor(), *tensor_writer1.get_qnn_tensor()}; - Qnn_Tensor_t tensor_outputs[] = {*tensor_reader.get_qnn_tensor()}; + Qnn_Tensor_t tensor_inputs[] = {*tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor()}; + Qnn_Tensor_t tensor_outputs[] = {*tensor_output.get_qnn_tensor()}; Qnn_OpConfig_t op_config = { (Qnn_OpConfigVersion_t) 1, .v1 = {"ggml_op_mul_mat", @@ -2410,18 +2410,18 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, } auto graph_item = std::make_tuple(graph_handle, - tensor_writer0.get_qnn_tensor(), - tensor_writer1.get_qnn_tensor(), - tensor_reader.get_qnn_tensor()); + tensor_input0.get_qnn_tensor(), + tensor_input1.get_qnn_tensor(), + tensor_output.get_qnn_tensor()); instance->_qnn_graph_map[map_entry] = graph_item; } else { auto & graph_item= instance->_qnn_graph_map[map_entry]; - ggml_qnn_tensor_writer tensor_writer0(src0, std::get<1>(graph_item), ctx); - ggml_qnn_tensor_writer tensor_writer1(src1, std::get<2>(graph_item), ctx); - ggml_qnn_tensor_reader tensor_reader(dst, std::get<3>(graph_item), ctx); + ggml_qnn_tensor_input tensor_input0(src0, std::get<1>(graph_item), ctx); + ggml_qnn_tensor_input tensor_input1(src1, std::get<2>(graph_item), ctx); + ggml_qnn_tensor_output tensor_output(dst, std::get<3>(graph_item), ctx); - Qnn_Tensor_t tensor_inputs[] = {*tensor_writer0.get_qnn_tensor(), *tensor_writer1.get_qnn_tensor()}; - Qnn_Tensor_t tensor_outputs[] = {*tensor_reader.get_qnn_tensor()}; + Qnn_Tensor_t tensor_inputs[] = {*tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor()}; + Qnn_Tensor_t tensor_outputs[] = {*tensor_output.get_qnn_tensor()}; error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, From 65a14d9e9a6977edd154e844b90108ef0d0725f0 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Tue, 18 Jun 2024 23:07:01 +0800 Subject: [PATCH 024/143] fix todo --- ggml-qnn.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp index b97b202453fa0..47810c933ab75 100644 --- a/ggml-qnn.cpp +++ b/ggml-qnn.cpp 
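After PATCH 022 and the rename in PATCH 023, ggml_qnn_add and ggml_qnn_mul_mat share the same control flow. The sketch below condenses that shared shape using the ggml_qnn_tensor_input / ggml_qnn_tensor_output aliases introduced above; it is not a drop-in replacement (the graph cache, Qnn_OpConfig_t setup, graphAddNode / graphFinalize and error logging are omitted) and the function name is invented. It assumes the wrapper classes and ggml_backend_qnn_context from this series and would live inside ggml-qnn.cpp.

// Sketch of the post-refactor handler shape; assumes the wrapper classes and
// helpers from this patch series.
static bool ggml_qnn_binary_op_sketch(ggml_backend_qnn_context * ctx,
                                      Qnn_GraphHandle_t graph_handle,
                                      const ggml_tensor * src0, const ggml_tensor * src1,
                                      ggml_tensor * dst) {
    // RAII wrappers bind the ggml buffers (or NPU rpcmem) to the QNN tensors.
    ggml_qnn_tensor_input  tensor_input0(src0, graph_handle, ctx);
    ggml_qnn_tensor_input  tensor_input1(src1, graph_handle, ctx);
    ggml_qnn_tensor_output tensor_output(dst, graph_handle, ctx);
    if (!tensor_input0.is_valid() || !tensor_input1.is_valid() || !tensor_output.is_valid()) {
        return false;
    }

    Qnn_Tensor_t tensor_inputs[]  = {*tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor()};
    Qnn_Tensor_t tensor_outputs[] = {*tensor_output.get_qnn_tensor()};

    Qnn_ErrorHandle_t error = ctx->raw_interface.graphExecute(graph_handle,
                                                              tensor_inputs, 2,
                                                              tensor_outputs, 1,
                                                              nullptr, nullptr);
    // On the NPU, tensor_output's destructor copies the result back into dst->data.
    return QNN_SUCCESS == error;
}

The design point is that all NPU-specific buffer traffic now lives in the wrappers: inputs are staged into rpcmem by the constructors, and the output destructor copies the result back into dst->data.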
@@ -1995,7 +1995,7 @@ class ggml_qnn_tensor_readwrite if (is_npu) { qnn_instance * instance = ctx->instance; uint8_t *qnn_buffer = static_cast(instance->alloc_rpcmem( - ggml_nbytes(tensor), 4)); // TODO: should we get the align param from device here? + ggml_nbytes(tensor), alignof(void*))); if (!qnn_buffer) { QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); QNN_LOG_DEBUG("tensor%p name %s", _qnn_tensor, QNN_TENSOR_GET_NAME(*_qnn_tensor)); From aeef0c68f498001495a18e251c16a4a3fcad2e88 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Wed, 19 Jun 2024 10:29:53 +0800 Subject: [PATCH 025/143] make the constant condition first --- ggml-qnn.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp index 47810c933ab75..5b4d665dcecba 100644 --- a/ggml-qnn.cpp +++ b/ggml-qnn.cpp @@ -2051,8 +2051,8 @@ class ggml_qnn_tensor_readwrite } ~ggml_qnn_tensor_readwrite() { - if (_context && _context->device == QNN_BACKEND_NPU && - (_tensorType == QNN_TENSOR_TYPE_APP_READWRITE || _tensorType == QNN_TENSOR_TYPE_APP_READ)) { + if ((_tensorType == QNN_TENSOR_TYPE_APP_READWRITE || _tensorType == QNN_TENSOR_TYPE_APP_READ) && + _context && _context->device == QNN_BACKEND_NPU) { uint8_t * qnn_buffer = static_cast(_context->instance->get_rpcmem_from_memhandle( QNN_VER_PTR(*_qnn_tensor)->memHandle)); memcpy(_tensor->data, qnn_buffer, ggml_nbytes(_tensor)); From dfe159ffffcc82484f00d701a2076859ac0f88be Mon Sep 17 00:00:00 2001 From: hongruichen Date: Wed, 19 Jun 2024 10:58:12 +0800 Subject: [PATCH 026/143] remove TODO --- ggml-qnn.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp index 5b4d665dcecba..f40efd72915df 100644 --- a/ggml-qnn.cpp +++ b/ggml-qnn.cpp @@ -2000,7 +2000,7 @@ class ggml_qnn_tensor_readwrite QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); QNN_LOG_DEBUG("tensor%p name %s", _qnn_tensor, QNN_TENSOR_GET_NAME(*_qnn_tensor)); _context = nullptr; - // TODO: should we free the tensor here? + // No free for _qnn_tensor, because it's not registered. 
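The three small commits above all harden the same wrapper code paths: request the buffer alignment explicitly, evaluate the cheap tensor-type test before dereferencing _context in the destructor, and leave _qnn_tensor alone because the wrapper never owned it. The toy program below (no QNN dependency; every name in it is invented for illustration) shows the copy-back-in-destructor idea those commits refine.

#include <cstdio>
#include <cstring>

struct backend_ctx { bool is_npu; };

// Toy stand-in for the NPU read-back: the staging buffer plays the role of the
// registered rpcmem block, and the destructor plays the role of the wrapper's.
class copy_back_guard {
  public:
    copy_back_guard(void * dst, const void * staging, unsigned size, const backend_ctx * ctx)
        : _dst(dst), _staging(staging), _size(size), _ctx(ctx) {}

    ~copy_back_guard() {
        // cheapest / most selective tests first, pointer dereference last
        if (_staging != nullptr && _ctx != nullptr && _ctx->is_npu) {
            memcpy(_dst, _staging, _size);
        }
        // deliberately no free here: the staging buffer belongs to whoever
        // allocated and registered it, in the spirit of the "not registered" note above
    }

  private:
    void *              _dst;
    const void *        _staging;
    unsigned            _size;
    const backend_ctx * _ctx;
};

int main() {
    backend_ctx ctx { true };
    char staging[16] = "npu result";
    char dst[16]     = {};
    {
        copy_back_guard guard(dst, staging, sizeof(staging), &ctx);
    }   // result becomes visible in dst when the guard goes out of scope
    printf("%s\n", dst);
    return 0;
}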
return; } else { QNN_LOG_INFO("alloc rpcmem successfully\n"); From 99320620b07a344c701e0c574922ed061c4257c9 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Wed, 19 Jun 2024 12:25:32 +0800 Subject: [PATCH 027/143] split logger function, tensors and backend from main qnn source --- ggml-qnn.cpp | 1511 +-------------------------------- ggml-qnn/backend.hpp | 24 + ggml-qnn/logger.cpp | 78 ++ ggml-qnn/logger.hpp | 49 ++ ggml-qnn/qnn-types.hpp | 46 + ggml-qnn/qnn.hpp | 1139 +++++++++++++++++++++++++ ggml-qnn/tensor.hpp | 145 ++++ ggml-qnn/utils.hpp | 99 +++ tests/ggml-qnn/CMakeLists.txt | 1 + 9 files changed, 1606 insertions(+), 1486 deletions(-) create mode 100644 ggml-qnn/backend.hpp create mode 100644 ggml-qnn/logger.cpp create mode 100644 ggml-qnn/logger.hpp create mode 100644 ggml-qnn/qnn-types.hpp create mode 100644 ggml-qnn/qnn.hpp create mode 100644 ggml-qnn/tensor.hpp create mode 100644 ggml-qnn/utils.hpp diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp index f40efd72915df..a552fd5ec935e 100644 --- a/ggml-qnn.cpp +++ b/ggml-qnn.cpp @@ -34,38 +34,20 @@ #include #include -#if (defined __ANDROID__) || (defined ANDROID) -#include -#endif - #include "ggml-qnn.h" #include "ggml-backend-impl.h" -// header file of Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK -// https://qpm.qualcomm.com/#/main/tools/details/qualcomm_ai_engine_direct -#include "QnnTypes.h" -#include "QnnCommon.h" -#include "QnnContext.h" -#include "QnnBackend.h" -#include "QnnGraph.h" -#include "QnnProperty.h" -#include "QnnTensor.h" -#include "QnnInterface.h" -#include "Saver/QnnSaver.h" -#include "System/QnnSystemInterface.h" -#include "HTP/QnnHtpDevice.h" -#include "HTP/QnnHtpGraph.h" +#include "ggml-qnn/logger.hpp" +#include "ggml-qnn/utils.hpp" +#include "ggml-qnn/backend.hpp" +#include "ggml-qnn/tensor.hpp" // ================================================================================================= // // forward declaration // // ================================================================================================= -class qnn_instance; - -struct ggml_backend_qnn_context; - static int free_qnn_tensor(Qnn_Tensor_t & tensor); // ================================================================================================= @@ -74,16 +56,11 @@ static int free_qnn_tensor(Qnn_Tensor_t & tensor); // // ================================================================================================= #ifdef NDEBUG -#define ENABLE_QNNBACKEND_DEBUG 0 // for troubleshooting QNN backend -#define ENABLE_QNNSDK_LOG 0 // enable/disable QNN SDK's internal log #define ENABLE_QNNBACKEND_PERF 0 // enable/disable op's perf info #else -#define ENABLE_QNNBACKEND_DEBUG 1 // for troubleshooting QNN backend -#define ENABLE_QNNSDK_LOG 1 // enable/disable QNN SDK's internal log #define ENABLE_QNNBACKEND_PERF 1 // enable/disable op's perf info #endif -#define QNN_LOGBUF_LEN 4096 #define QNN_BACKEND_NAME "qnn" typedef void (*ggml_qnn_func_t)(ggml_backend_qnn_context * ctx, @@ -91,29 +68,7 @@ typedef void (*ggml_qnn_func_t)(ggml_backend_qnn_context * ctx, const ggml_tensor * src1, ggml_tensor * dst); -enum qcom_htp_arch { - NONE = 0, - V68 = 68, - V69 = 69, - V73 = 73, - V75 = 75, -}; - -enum qcom_chipset { - UNKNOWN_SM = 0, - SM8450 = 36, // v69 - SM8475 = 42, // v69 - SM8550 = 43, // v73 - SM8650 = 57, // v75 -}; - -struct qcom_socinfo { - uint32_t soc_model; - size_t htp_arch; - size_t vtcm_size_in_mb; -}; - -static struct qcom_socinfo g_qnn_soc_info_table[] = { +static struct qnn::qcom_socinfo 
g_qnn_soc_info_table[] = { /* Qualcomm SnapDragon 8 Gen 1 */ [SM8450] = { .soc_model = SM8450, @@ -140,18 +95,6 @@ static struct qcom_socinfo g_qnn_soc_info_table[] = { }; -struct ggml_backend_qnn_context { - int device; - int threads; - char name[GGML_MAX_NAME]; - char lib[GGML_MAX_NAME]; - qnn_instance * instance; - struct ggml_backend * backend; - QNN_INTERFACE_VER_TYPE raw_interface; - QNN_SYSTEM_INTERFACE_VER_TYPE raw_system_interface; - struct qcom_socinfo socinfo; -}; - // according to the QNN SDK Reference Guide, // CPU - Choose a non-quantized model.Quantized models are currently incompatible with the CPU backend // GPU - Choose a non-quantized model.Quantized models are currently incompatible with the GPU backend @@ -235,65 +178,11 @@ struct ggml_backend_qnn_buffer_type_context { std::string name; }; -// ================================================================================================= -// -// QNN backend internal log function -// -// ================================================================================================= -static void qnn_internal_log(ggml_log_level level, const char * file, - const char * func, int line, - const char * format, ...); -#define QNN_LOG_ERROR(...) \ - qnn_internal_log(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) - -#define QNN_LOG_WARN(...) \ - qnn_internal_log(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) - -#define QNN_LOG_INFO(...) \ - qnn_internal_log(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) - -#if ENABLE_QNNBACKEND_DEBUG -#define QNN_LOG_DEBUG(...) \ - qnn_internal_log(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) -#else -#define QNN_LOG_DEBUG(...) -#endif - // ================================================================================================= // // QNN backend internal helper functions // // ================================================================================================= -static uint32_t qnn_get_ggml_tensor_rank(const ggml_tensor * tensor) { - uint32_t rank = 0; - for (int i = 0; i < GGML_MAX_DIMS; i++) { - if ((0 != tensor->ne[i]) && (1 != tensor->ne[i])) { - rank++; - } - } - return rank; -} - -// TODO: mapping more ggml data type to QNN data type -// ref:explanation of k-quants, https://github.com/ggerganov/llama.cpp/pull/1684 -static Qnn_DataType_t qnn_datatype_from_ggml_datatype(enum ggml_type ggmltype) { - switch (ggmltype) { - case GGML_TYPE_F16: - return QNN_DATATYPE_FLOAT_16; - case GGML_TYPE_F32: - return QNN_DATATYPE_FLOAT_32; - case GGML_TYPE_I8: - return QNN_DATATYPE_INT_8; - case GGML_TYPE_Q8_0: - return QNN_DATATYPE_SFIXED_POINT_8; - case GGML_TYPE_Q4_0: - return QNN_DATATYPE_SFIXED_POINT_4; - default: - break; - } - return QNN_DATATYPE_UNDEFINED; -} - // TODO: only support GGML_OP_ADD/GGML_OP_MUL/GGML_OP_MUL_MAT static const char * qnn_opname_from_ggmlop(enum ggml_op ggmlop) { switch (ggmlop) { @@ -322,79 +211,6 @@ static uint32_t qnn_get_ggml_tensor_data_size(const ggml_tensor * tensor) { return ggml_nbytes(tensor); } -static const char * qnn_get_backend_name(int n_backend_type) { - switch (n_backend_type) { - case QNN_BACKEND_CPU: - return "QNN-CPU"; - case QNN_BACKEND_GPU: - return "QNN-GPU"; - case QNN_BACKEND_NPU: - return "QNN-NPU"; - case QNN_BACKEND_GGML: - return "ggml"; //"fake" QNN backend, used for compare performance between QNN backend and original GGML - default: - return "unknown"; - } -} - -static const char * qnn_get_chipset_desc(uint32_t chipset_id) { - 
switch (chipset_id) { - case SM8450: - return "SM8450"; - case SM8475: - return "SM8475"; - case SM8550: - return "SM8550"; - case SM8650: - return "SM8650"; - default: - return "unknown"; - } -} - -static const char * qnn_get_htparch_desc(size_t htp_arch) { - switch (htp_arch) { - case V68: - return "QCOM_HTP_V68"; - case V69: - return "QCOM_HTP_V69"; - case V73: - return "QCOM_HTP_V73"; - case V75: - return "QCOM_HTP_V75"; - default: - return "unknown"; - } -} - -static void qnn_internal_log(ggml_log_level level, const char * file, - const char * func, int line, - const char * format, ...) { - static std::mutex qnn_internal_log_mutex; - static char s_qnn_internal_log_buf[QNN_LOGBUF_LEN]; - - { - std::lock_guard lock(qnn_internal_log_mutex); - va_list args; - - va_start(args, format); - int len_prefix = - snprintf(s_qnn_internal_log_buf, QNN_LOGBUF_LEN, - "[%s, %d]: ", func, line); - int len = vsnprintf(s_qnn_internal_log_buf + len_prefix, - QNN_LOGBUF_LEN - len_prefix, format, args); - if (len < (QNN_LOGBUF_LEN - len_prefix)) { -#if (defined __ANDROID__) || (defined ANDROID) - // for Android APK - __android_log_print(level, "ggml-qnn", "%s\n", s_qnn_internal_log_buf); -#endif - // for Android command line application or WoA(Windows on ARM) - printf("%s\n", s_qnn_internal_log_buf); - } - va_end(args); - } -} - static bool qnn_is_valid_params(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { if ((nullptr == ctx) || (nullptr == src0) || (nullptr == src1) || (nullptr == dst)) { @@ -467,29 +283,10 @@ class qnn_perf { }; #endif -// ================================================================================================= -// -// helper data type / data structure / macros / functions of -// Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK -// ref:https://github.com/pytorch/executorch/tree/main/backends/qualcomm -// ================================================================================================= -enum qnn_sdk_profile_level { - profile_off = 0, - profile_basic = 1, - profile_detail = 2 -}; - -using pfn_rpc_mem_init = void (*)(void); -using pfn_rpc_mem_deinit = void (*)(void); -using pfn_rpc_mem_alloc = void *(*) (int, uint32_t, int); -using pfn_rpc_mem_free = void (*)(void *); -using pfn_rpc_mem_to_fd = int (*)(void *); - using pfn_qnnsaver_initialize = decltype(QnnSaver_initialize); using pfn_qnninterface_getproviders = decltype(QnnInterface_getProviders); using pfn_qnnsysteminterface_getproviders = decltype(QnnSystemInterface_getProviders); -#define QNN_VER_PTR(x) (&((x).v1)) #define RPCMEM_DEFAULT_FLAGS 1 #define RPCMEM_HEAP_ID_SYSTEM 25 @@ -762,1144 +559,6 @@ static int free_qnn_tensor(Qnn_Tensor_t & tensor) { return err; } -template Fn load_qnn_functionpointers(void * handle, const char * function_name) { - return reinterpret_cast(dlsym(handle, function_name)); -} - -static intptr_t align_to(size_t alignment, intptr_t offset) { - return offset % alignment == 0 - ? 
offset - : offset + (static_cast(alignment) - - offset % static_cast(alignment)); -} - -static void qnn_sdk_logcallback(const char * fmt, QnnLog_Level_t level, - uint64_t timestamp, va_list argp) { - -#if ENABLE_QNNSDK_LOG - static std::mutex log_mutex; - static unsigned char s_ggml_qnn_logbuf[QNN_LOGBUF_LEN]; - - const char * log_level_desc = ""; - switch (level) { - case QNN_LOG_LEVEL_ERROR: - log_level_desc = "ERROR"; - break; - case QNN_LOG_LEVEL_WARN: - log_level_desc = "WARNING"; - break; - case QNN_LOG_LEVEL_INFO: - log_level_desc = "INFO"; - break; - case QNN_LOG_LEVEL_DEBUG: - log_level_desc = "DEBUG"; - break; - case QNN_LOG_LEVEL_VERBOSE: - log_level_desc = "VERBOSE"; - break; - case QNN_LOG_LEVEL_MAX: - log_level_desc = "UNKNOWN"; - break; - } - - double ms = (double) timestamp / 1000000.0; - { - std::lock_guard lock(log_mutex); - - memset(s_ggml_qnn_logbuf, 0, QNN_LOGBUF_LEN); - vsnprintf(reinterpret_cast(s_ggml_qnn_logbuf), QNN_LOGBUF_LEN, fmt, argp); - QNN_LOG_INFO("%8.1fms [%-7s] %s\n", ms, log_level_desc, s_ggml_qnn_logbuf); - } -#endif -} - -// ================================================================================================= -// -// wrapper class of Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK -// ref:https://github.com/pytorch/executorch/tree/main/backends/qualcomm -// ================================================================================================= -class qnn_interface { - -#define DEFINE_SHIM_FUNCTION_INTERFACE(F, pointer_name) \ - template inline auto qnn_##F(Args... args) const { \ - return (_qnn_interface->QNN_INTERFACE_VER_NAME.pointer_name)( \ - std::forward(args)...); \ - } - -#define DEFINE_SHIM_FUNCTION_SYS_INTERFACE(F, pointer_name) \ - template inline auto qnn_##F(Args... 
args) const { \ - return ( \ - _qnn_sys_interface->QNN_SYSTEM_INTERFACE_VER_NAME.pointer_name)( \ - std::forward(args)...); \ - } - - friend class qnn_instance; - - public: - qnn_interface() = default; - - // QnnBackend - DEFINE_SHIM_FUNCTION_INTERFACE(backend_create, backendCreate); - - DEFINE_SHIM_FUNCTION_INTERFACE(backend_free, backendFree); - - DEFINE_SHIM_FUNCTION_INTERFACE(backend_register_op_package, - backendRegisterOpPackage); - - DEFINE_SHIM_FUNCTION_INTERFACE(backend_validate_op_config, - backendValidateOpConfig); - - DEFINE_SHIM_FUNCTION_INTERFACE(backend_get_api_version, - backendGetApiVersion); - - // QnnDevice - DEFINE_SHIM_FUNCTION_INTERFACE(device_create, deviceCreate); - - DEFINE_SHIM_FUNCTION_INTERFACE(device_free, deviceFree); - - DEFINE_SHIM_FUNCTION_INTERFACE(device_get_infrastructure, - deviceGetInfrastructure); - - DEFINE_SHIM_FUNCTION_INTERFACE(device_get_platform_info, - deviceGetPlatformInfo); - - DEFINE_SHIM_FUNCTION_INTERFACE(device_get_info, deviceGetInfo); - - // QnnContext - DEFINE_SHIM_FUNCTION_INTERFACE(context_create, contextCreate); - - DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary_size, - contextGetBinarySize); - - DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary, contextGetBinary); - - DEFINE_SHIM_FUNCTION_INTERFACE(context_create_from_binary, - contextCreateFromBinary); - - DEFINE_SHIM_FUNCTION_INTERFACE(context_free, contextFree); - - // QnnGraph - DEFINE_SHIM_FUNCTION_INTERFACE(graph_create, graphCreate); - - DEFINE_SHIM_FUNCTION_INTERFACE(graph_add_node, graphAddNode); - - DEFINE_SHIM_FUNCTION_INTERFACE(graph_finalize, graphFinalize); - - DEFINE_SHIM_FUNCTION_INTERFACE(graph_execute, graphExecute); - - DEFINE_SHIM_FUNCTION_INTERFACE(graph_retrieve, graphRetrieve); - - // QnnLog - DEFINE_SHIM_FUNCTION_INTERFACE(log_create, logCreate); - - DEFINE_SHIM_FUNCTION_INTERFACE(log_free, logFree); - - DEFINE_SHIM_FUNCTION_INTERFACE(log_set_log_level, logSetLogLevel); - - // QnnProfile - DEFINE_SHIM_FUNCTION_INTERFACE(profile_create, profileCreate); - - DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_events, profileGetEvents); - - DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_sub_events, profileGetSubEvents); - - DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_event_data, profileGetEventData); - - DEFINE_SHIM_FUNCTION_INTERFACE(profile_free, profileFree); - - // QnnMem - DEFINE_SHIM_FUNCTION_INTERFACE(mem_register, memRegister); - - DEFINE_SHIM_FUNCTION_INTERFACE(mem_de_register, memDeRegister); - - // QnnProperty - DEFINE_SHIM_FUNCTION_INTERFACE(property_has_capability, - propertyHasCapability); - - // QnnTensor - DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_context_tensor, - tensorCreateContextTensor); - - DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_graph_tensor, - tensorCreateGraphTensor); - - // QnnSystem - DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_create, - systemContextCreate); - - DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_get_binary_info, - systemContextGetBinaryInfo); - - DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_free, systemContextFree); - - void set_qnn_interface(const QnnInterface_t * qnn_interface) { - _qnn_interface = qnn_interface; - } - - void set_qnn_system_interface( - const QnnSystemInterface_t * qnn_sys_interface) { - _qnn_sys_interface = qnn_sys_interface; - } - - uint32_t get_backend_id() const { return _qnn_interface->backendId; } - - bool is_loaded() const { - return ((_qnn_sys_interface != nullptr) && (_qnn_interface != nullptr)); - } - - private: - const QnnInterface_t * _qnn_interface = nullptr; - - const 
QnnSystemInterface_t * _qnn_sys_interface = nullptr; -}; - -class qnn_instance { - public: - using BackendIdType = decltype(QnnInterface_t{}.backendId); - - explicit qnn_instance(const std::string & lib_path, - const std::string & backend_name, - const std::string & model_name) - : _lib_path(std::move(lib_path)) - , _backend_name(std::move(backend_name)) - , _model_name(std::move(model_name)){}; - - ~qnn_instance() {} - - int qnn_init(const QnnSaver_Config_t ** saver_config) { - BackendIdType backend_id = QNN_BACKEND_ID_NULL; - QNN_LOG_DEBUG("enter qni_init\n"); - - std::lock_guard lock(_init_mutex); - - if (0 != load_system()) { - QNN_LOG_WARN("can not load QNN system lib, pls check why?\n"); - return 1; - } else { - QNN_LOG_DEBUG("load QNN system lib successfully\n"); - } - - std::string backend_lib_path = _lib_path + _backend_name; - if (0 == _lib_path_to_backend_id.count(backend_lib_path)) { - int is_load_ok = load_backend(backend_lib_path, saver_config); - if (0 != is_load_ok) { - QNN_LOG_WARN("failed to load QNN backend\n"); - return 2; - } - } - - backend_id = _lib_path_to_backend_id[backend_lib_path]; - if (0 == _loaded_backend.count(backend_id) || - 0 == _loaded_lib_handle.count(backend_id)) { - QNN_LOG_WARN("library %s is loaded but loaded backend count=%zu, " - "loaded lib_handle count=%zu\n", - backend_lib_path.c_str(), _loaded_backend.count(backend_id), - _loaded_lib_handle.count(backend_id)); - return 3; - } - - _qnn_interface.set_qnn_interface(_loaded_backend[backend_id]); - - _qnn_interface.qnn_log_create(qnn_sdk_logcallback, _qnn_log_level, &_qnn_log_handle); - if (nullptr == _qnn_log_handle) { - // NPU backend not work on Qualcomm SoC equipped low-end phone - QNN_LOG_WARN("why failed to initialize qnn log\n"); - return 4; - } else { - QNN_LOG_DEBUG("initialize qnn log successfully\n"); - } - - std::vector temp_backend_config; - _qnn_interface.qnn_backend_create( - _qnn_log_handle, - temp_backend_config.empty() ? nullptr : temp_backend_config.data(), - &_qnn_backend_handle); - if (nullptr == _qnn_backend_handle) { - QNN_LOG_WARN("why failed to initialize qnn backend\n"); - return 5; - } else { - QNN_LOG_DEBUG("initialize qnn backend successfully\n"); - } - - if (nullptr != _qnn_raw_interface.propertyHasCapability) { - Qnn_ErrorHandle_t qnn_status = - _qnn_raw_interface.propertyHasCapability(QNN_PROPERTY_GROUP_DEVICE); - if (QNN_PROPERTY_NOT_SUPPORTED == qnn_status) { - QNN_LOG_WARN("device property is not supported\n"); - } - if (QNN_PROPERTY_ERROR_UNKNOWN_KEY == qnn_status) { - QNN_LOG_WARN("device property is not known to backend\n"); - } - } - - Qnn_ErrorHandle_t qnn_status = QNN_SUCCESS; - if (_backend_name.find("Htp") != std::variant_npos) { - const QnnDevice_PlatformInfo_t * p_info = nullptr; - _qnn_raw_interface.deviceGetPlatformInfo(nullptr, &p_info); - QNN_LOG_INFO("device counts %d", p_info->v1.numHwDevices); - QnnDevice_HardwareDeviceInfo_t * infos = p_info->v1.hwDevices; - QnnHtpDevice_OnChipDeviceInfoExtension_t chipinfo = { }; - for (int i = 0; i < p_info->v1.numHwDevices; i++) { - QNN_LOG_INFO("deviceID:%d, deviceType:%d, numCores %d", infos[i].v1.deviceId, - infos[i].v1.deviceType, infos[i].v1.numCores); - QnnDevice_DeviceInfoExtension_t devinfo = infos[i].v1.deviceInfoExtension; - chipinfo = devinfo->onChipDevice; - QnnHtpDevice_Arch_t htp_arch = chipinfo.arch; - QNN_LOG_INFO("htp_type:%d(%s)", devinfo->devType, (devinfo->devType == QNN_HTP_DEVICE_TYPE_ON_CHIP) ? 
"ON_CHIP" : ""); - QNN_LOG_INFO("qualcomm soc_model:%d(%s), htp_arch:%d(%s), vtcm_size:%d MB", \ - chipinfo.socModel, qnn_get_chipset_desc(chipinfo.socModel), \ - htp_arch, qnn_get_htparch_desc(htp_arch), chipinfo.vtcmSize); - g_qnn_mgr[QNN_BACKEND_NPU].socinfo = { chipinfo.socModel, htp_arch, chipinfo.vtcmSize }; - } - _qnn_raw_interface.deviceFreePlatformInfo(nullptr, p_info); - - QnnHtpDevice_CustomConfig_t soc_customconfig; - soc_customconfig.option = QNN_HTP_DEVICE_CONFIG_OPTION_SOC; - soc_customconfig.socModel = chipinfo.socModel; - QnnDevice_Config_t soc_devconfig; - soc_devconfig.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; - soc_devconfig.customConfig = &soc_customconfig; - - QnnHtpDevice_CustomConfig_t arch_customconfig; - arch_customconfig.option = QNN_HTP_DEVICE_CONFIG_OPTION_ARCH; - arch_customconfig.arch.arch = chipinfo.arch; - arch_customconfig.arch.deviceId = 0; // Id of device to be used. If single device is used by default 0. - QnnDevice_Config_t arch_devconfig; - arch_devconfig.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; - arch_devconfig.customConfig = &arch_customconfig; - - const QnnDevice_Config_t * p_deviceconfig[] = {&soc_devconfig, &arch_devconfig, nullptr}; - qnn_status = _qnn_raw_interface.deviceCreate(_qnn_log_handle, p_deviceconfig, &_qnn_device_handle); - } else { - qnn_status = _qnn_raw_interface.deviceCreate(_qnn_log_handle, nullptr, &_qnn_device_handle); - } - if (QNN_SUCCESS != qnn_status && - QNN_DEVICE_ERROR_UNSUPPORTED_FEATURE != qnn_status) { - QNN_LOG_WARN("failed to create QNN device\n"); - } else { - QNN_LOG_INFO("create QNN device successfully\n"); - } - - if (qnn_sdk_profile_level::profile_off != _profile_level) { - QNN_LOG_INFO("profiling turned on; level = %d", _profile_level); - if (qnn_sdk_profile_level::profile_basic == _profile_level) { - QNN_LOG_INFO("basic profiling requested. creating Qnn Profile object\n"); - if (QNN_PROFILE_NO_ERROR != - _qnn_raw_interface.profileCreate(_qnn_backend_handle, - QNN_PROFILE_LEVEL_BASIC, - &_qnn_profile_handle)) { - QNN_LOG_WARN("unable to create profile handle in the backend\n"); - return 6; - } else { - QNN_LOG_DEBUG("initialize qnn profile successfully\n"); - } - } else if (qnn_sdk_profile_level::profile_detail == _profile_level) { - QNN_LOG_INFO("detailed profiling requested. Creating Qnn Profile object\n"); - if (QNN_PROFILE_NO_ERROR != - _qnn_raw_interface.profileCreate(_qnn_backend_handle, - QNN_PROFILE_LEVEL_DETAILED, - &_qnn_profile_handle)) { - QNN_LOG_WARN("unable to create profile handle in the backend\n"); - return 7; - } else { - QNN_LOG_DEBUG("initialize qnn profile successfully\n"); - } - } - } - - _rpc_lib_handle = dlopen("libcdsprpc.so", RTLD_NOW | RTLD_LOCAL); - if (nullptr == _rpc_lib_handle) { - QNN_LOG_WARN("failed to load qualcomm's rpc lib, error:%s\n", dlerror()); - return 8; - } else { - QNN_LOG_DEBUG("load rpcmem lib successfully\n"); - set_rpcmem_initialized(true); - } - _pfn_rpc_mem_init = reinterpret_cast( - dlsym(_rpc_lib_handle, "rpcmem_init")); - _pfn_rpc_mem_deinit = reinterpret_cast( - dlsym(_rpc_lib_handle, "rpcmem_deinit")); - _pfn_rpc_mem_alloc = reinterpret_cast( - dlsym(_rpc_lib_handle, "rpcmem_alloc")); - _pfn_rpc_mem_free = reinterpret_cast( - dlsym(_rpc_lib_handle, "rpcmem_free")); - _pfn_rpc_mem_to_fd = reinterpret_cast( - dlsym(_rpc_lib_handle, "rpcmem_to_fd")); - if (nullptr == _pfn_rpc_mem_alloc || nullptr == _pfn_rpc_mem_free || - nullptr == _pfn_rpc_mem_to_fd) { - QNN_LOG_WARN("unable to access symbols in QNN RPC lib. 
dlerror(): %s", dlerror()); - dlclose(_rpc_lib_handle); - return 9; - } - - if (nullptr != _pfn_rpc_mem_init) { // make Qualcomm's SoC equipped low-end phone happy - _pfn_rpc_mem_init(); - } - - /* TODO: not used, keep it for further usage - QnnContext_Config_t qnn_context_config = QNN_CONTEXT_CONFIG_INIT; - qnn_context_config.priority = QNN_PRIORITY_DEFAULT; - const QnnContext_Config_t * context_configs[] = {&qnn_context_config, nullptr}; - */ - _qnn_interface.qnn_context_create( - _qnn_backend_handle, _qnn_device_handle, - nullptr, - &_qnn_context_handle); - if (nullptr == _qnn_context_handle) { - QNN_LOG_WARN("why failed to initialize qnn context\n"); - return 10; - } else { - QNN_LOG_DEBUG("initialize qnn context successfully\n"); - } - - if (_backend_name.find("Htp") != std::variant_npos) { - //TODO: faster approach to probe the accurate capacity of rpc ion memory - size_t candidate_size = 0; - uint8_t * rpc_buffer = nullptr; - const int size_in_mb = (1 << 20); - size_t probe_slots[] = {1024, 1536, 2048 - 48, 2048}; - size_t probe_counts = sizeof(probe_slots) / sizeof(size_t); - for (size_t idx = 0; idx < probe_counts; idx++) { - rpc_buffer = static_cast(alloc_rpcmem( - probe_slots[idx] * size_in_mb, 4)); - if (nullptr == rpc_buffer) { - QNN_LOG_INFO("alloc rpcmem %d (MB) failure, %s\n", - probe_slots[idx], strerror(errno)); - break; - } else { - candidate_size = probe_slots[idx]; - free_rpcmem(rpc_buffer); - rpc_buffer = nullptr; - } - } - if (candidate_size > _rpcmem_capacity) - _rpcmem_capacity = candidate_size; - QNN_LOG_INFO("capacity of QNN rpc ion memory is about %d MB\n", _rpcmem_capacity); - - if (0 != init_htp_perfinfra()) { - QNN_LOG_WARN("initialize HTP performance failure"); - } - if (0 != set_rpc_polling()) { - QNN_LOG_WARN("set RPC polling failure"); - } - if (0 != set_high_performance_mode()) { - QNN_LOG_WARN("set HTP high performance mode failure"); - } - } - - QNN_LOG_DEBUG("leave qni_init\n"); - - return 0; - } - - int qnn_finalize() { - int ret_status = 0; - Qnn_ErrorHandle_t error = QNN_SUCCESS; - - if (nullptr != _pfn_rpc_mem_deinit) // make Qualcomm's SoC equipped low-end phone happy - _pfn_rpc_mem_deinit(); - - if (dlclose(_rpc_lib_handle) != 0) { - QNN_LOG_WARN("failed to unload qualcomm's rpc lib, error:%s\n", dlerror()); - } else { - QNN_LOG_DEBUG("succeed to close rpcmem lib\n"); - } - - if (_backend_name.find("Htp") != std::variant_npos) { - _qnn_htp_perfinfra->destroyPowerConfigId(_qnn_power_configid); - } - - if (nullptr != _qnn_context_handle) { - error = _qnn_interface.qnn_context_free(_qnn_context_handle, - _qnn_profile_handle); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN context_handle: ID %u, error %d\n", - _qnn_interface.get_backend_id(), - QNN_GET_ERROR_CODE(error)); - } - _qnn_context_handle = nullptr; - } - - if (nullptr != _qnn_profile_handle) { - error = _qnn_interface.qnn_profile_free(_qnn_profile_handle); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN profile_handle: ID %u, error %d\n", - _qnn_interface.get_backend_id(), - QNN_GET_ERROR_CODE(error)); - } - _qnn_profile_handle = nullptr; - } - - if (nullptr != _qnn_device_handle) { - error = _qnn_interface.qnn_device_free(_qnn_device_handle); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN device_handle: ID %u, error %d\n", - _qnn_interface.get_backend_id(), - QNN_GET_ERROR_CODE(error)); - } - _qnn_device_handle = nullptr; - } - - if (nullptr != _qnn_backend_handle) { - error = _qnn_interface.qnn_backend_free(_qnn_backend_handle); - if 
(error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN backend_handle: ID %u, error %d\n", - _qnn_interface.get_backend_id(), - QNN_GET_ERROR_CODE(error)); - } - _qnn_backend_handle = nullptr; - } - - if (nullptr != _qnn_log_handle) { - error = _qnn_interface.qnn_log_free(_qnn_log_handle); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN log_handle: ID %u, error %d\n", - _qnn_interface.get_backend_id(), - QNN_GET_ERROR_CODE(error)); - } - _qnn_log_handle = nullptr; - } - - unload_backend(); - - unload_system(); - - return ret_status; - } - - //TODO:keep it for further usage of offload the entire cgraph to a single QNN DAG directly - // which was used in Qualcomm's dedicated AI technology -#if 0 - int init_qnn_graph(const char * graph_name, bool debug, - uint8_t do_node_validation = true, - const QnnGraph_Config_t ** graph_configs = nullptr) { - int result = 0; - - if (nullptr == graph_name) { - QNN_LOG_WARN("graph name is null\n"); - return 1; - } - - if (!_graph_name.empty()) { - QNN_LOG_WARN("qnn model for graph %s already initialized\n", graph_name); - return 2; - } - - if (!do_node_validation) { - QNN_LOG_WARN("node validation disabled, backend will not perform op " - "validation prior to adding node\n"); - } - - _graph_name = graph_name; - _debug_tensor = debug; - _do_node_validations = do_node_validation; - - result = _qnn_raw_interface.graphCreate(_qnn_context_handle, graph_name, - graph_configs, &_qnn_graph_handle); - if (result != QNN_GRAPH_NO_ERROR || nullptr == _qnn_graph_handle) { - QNN_LOG_WARN("failed to create graph in qnn context\n"); - return 3; - } else { - QNN_LOG_INFO("succeed to create graph %s, %p\n", graph_name, _qnn_graph_handle); - } - - return 0; - } - - int finalize_qnn_graph() { - if (nullptr != _qnn_graph_handle) { - if (_qnn_raw_interface.graphFinalize(_qnn_graph_handle, - _qnn_profile_handle, - nullptr) != QNN_GRAPH_NO_ERROR) { - QNN_LOG_WARN("finalizing graph failure\n"); - } - } else { - QNN_LOG_DEBUG("qnn graph handle is null\n"); - } - - return 0; - } -#endif - - const qnn_interface & get_qnn_interface() { - if (!_qnn_interface.is_loaded()) { - QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); - } - return _qnn_interface; - } - - const QNN_INTERFACE_VER_TYPE & get_qnn_raw_interface() { - if (!_qnn_interface.is_loaded()) { - QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); - } - return _qnn_raw_interface; - } - - const QNN_SYSTEM_INTERFACE_VER_TYPE & get_qnn_raw_system_interface() { - if (!_qnn_interface.is_loaded()) { - QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); - } - return _qnn_raw_system_interface; - } - - const Qnn_LogHandle_t get_qnn_log_handle() { return _qnn_log_handle; } - - const Qnn_ProfileHandle_t get_qnn_profile_handle() { - return _qnn_profile_handle; - } - - const Qnn_DeviceHandle_t get_qnn_device_handle() { - return _qnn_device_handle; - } - - const Qnn_BackendHandle_t get_qnn_backend_handle() { - return _qnn_backend_handle; - } - - const Qnn_ContextHandle_t get_qnn_context_handle() { - return _qnn_context_handle; - } - - const QnnSystemContext_Handle_t get_qnn_system_handle() { - return _qnn_system_handle; - } - - const Qnn_GraphHandle_t get_qnn_graph_handle() { return _qnn_graph_handle; } - - int init_htp_perfinfra() { - QnnDevice_Infrastructure_t device_infra = nullptr; - int error = _qnn_raw_interface.deviceGetInfrastructure(&device_infra); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to get qnn device infra\n"); - return 1; - } else { - QNN_LOG_INFO("HTP backend 
perf_infrastructure creation ok\n"); - } - - QnnHtpDevice_Infrastructure_t * htp_infra = static_cast(device_infra); - QnnHtpDevice_PerfInfrastructure_t * htp_perfinfra = &htp_infra->perfInfra; - uint32_t power_configid = 1; - uint32_t device_id = 0; - uint32_t core_id = 0; - htp_perfinfra->createPowerConfigId(device_id, core_id, &power_configid); - if (htp_infra->infraType != QNN_HTP_DEVICE_INFRASTRUCTURE_TYPE_PERF) { - QNN_LOG_INFO("HTP infra type = %d, which is not perf infra type", htp_infra->infraType); - } else { - QNN_LOG_INFO("HTP infra type = %d, which is perf infra type\n", htp_infra->infraType); - } - _qnn_htp_perfinfra = htp_perfinfra; - _qnn_power_configid = power_configid; - - return 0; - } - - int set_rpc_polling() { - if (_qnn_htp_perfinfra) { - QnnHtpPerfInfrastructure_PowerConfig_t rpc_polling_time; - memset(&rpc_polling_time, 0, sizeof(rpc_polling_time)); - rpc_polling_time.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME; - //use rpc polling time recommended 0-10000 us - rpc_polling_time.rpcPollingTimeConfig = 9999; - - QnnHtpPerfInfrastructure_PowerConfig_t rpc_control_latency; - memset(&rpc_control_latency, 0, sizeof(rpc_control_latency)); - rpc_control_latency.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_CONTROL_LATENCY; - //use rpc control latency recommended 100 us, refer hexagon sdk - rpc_control_latency.rpcControlLatencyConfig = 100; - - const QnnHtpPerfInfrastructure_PowerConfig_t * power_configs[] = { - &rpc_polling_time, - &rpc_control_latency, - nullptr}; - Qnn_ErrorHandle_t qnn_status = _qnn_htp_perfinfra->setPowerConfig( - _qnn_power_configid, - power_configs); - if (qnn_status != QNN_SUCCESS) { - QNN_LOG_WARN("set htp perf failed\n"); - } else { - QNN_LOG_INFO("set htp perf ok\n"); - } - } else { - QNN_LOG_WARN("can't set htp perf\n"); - } - - return 0; - } - - int set_high_performance_mode() { - if (nullptr == _qnn_htp_perfinfra) { - QNN_LOG_WARN("perf intra is null\n"); - return 1; - } - - QnnHtpPerfInfrastructure_PowerConfig_t power_config; - memset(&power_config, 0, sizeof(power_config)); - power_config.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_V3; - - power_config.dcvsV3Config.setDcvsEnable = 1; - power_config.dcvsV3Config.dcvsEnable = 0; - power_config.dcvsV3Config.contextId = _qnn_power_configid; - power_config.dcvsV3Config.powerMode = QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_PERFORMANCE_MODE; - power_config.dcvsV3Config.setSleepLatency = - 1; // true to consider Latency parameter otherwise false - power_config.dcvsV3Config.sleepLatency = 40; - power_config.dcvsV3Config.setBusParams = - 1; // true to consider Bus parameter otherwise false - power_config.dcvsV3Config.setCoreParams = - 1; // true to consider Core parameter otherwise false - power_config.dcvsV3Config.sleepDisable = - 1; // true to consider sleep/LPM modes, false to enable - power_config.dcvsV3Config.setSleepDisable = - 1; // true to consider sleep disable/enable parameter otherwise false set sleep latency parameter - // set Bus Clock Parameters - power_config.dcvsV3Config.busVoltageCornerMin = - DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - power_config.dcvsV3Config.busVoltageCornerTarget = - DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - power_config.dcvsV3Config.busVoltageCornerMax = - DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - // set Core Clock Parameters - power_config.dcvsV3Config.coreVoltageCornerMin = - DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - power_config.dcvsV3Config.coreVoltageCornerTarget = - 
DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - power_config.dcvsV3Config.coreVoltageCornerMax = - DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - - // set power config with different performance parameters - const QnnHtpPerfInfrastructure_PowerConfig_t * power_configs[] = { - &power_config, nullptr}; - Qnn_ErrorHandle_t qnn_status = QNN_SUCCESS; - qnn_status = _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, power_configs); - if (qnn_status != QNN_SUCCESS) { - QNN_LOG_WARN("set htp high performance mode failed\n"); - } else { - QNN_LOG_INFO("set htp high performance mode ok\n"); - } - - return 0; - } - - std::string & get_qnn_graph_name() { return _graph_name; } - - bool is_rpcmem_initialized() { return _rpcmem_initialized; } - - void set_rpcmem_initialized(bool initialized) { - _rpcmem_initialized = initialized; - } - - size_t get_rpcmem_capacity() { return _rpcmem_capacity; } - - bool is_rpcmem_registered(Qnn_MemHandle_t handle) { - return _qnn_mem_set.count(handle) != 0U; - } - - void * alloc_rpcmem(size_t bytes, size_t alignment) { - if (!_rpcmem_initialized) { - QNN_LOG_WARN("rpc memory not initialized\n"); - return nullptr; - } - - auto allocate_bytes = static_cast(bytes + alignment); - void * buf = _pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, - allocate_bytes); - if (buf == nullptr) { - QNN_LOG_WARN("failed to allocate rpc memory\n"); - return nullptr; - } - - auto aligned_buf = reinterpret_cast( - align_to(alignment, reinterpret_cast(buf))); - bool status = - _rpcmem_store_map.insert(std::pair(aligned_buf, buf)).second; - if (!status) { - QNN_LOG_WARN("failed to allocate rpc memory\n"); - _pfn_rpc_mem_free(buf); - } - - return aligned_buf; - } - - void free_rpcmem(void * buf) { - if (!_rpcmem_initialized) { - QNN_LOG_WARN("rpc memory not initialized\n"); - } else if (0 == _rpcmem_store_map.count(buf)) { - QNN_LOG_WARN("no allocated tensor\n"); - } else { - _pfn_rpc_mem_free(_rpcmem_store_map[buf]); - _rpcmem_store_map.erase(buf); - } - } - - int32_t rpcmem_to_fd(void * buf) { - int32_t mem_fd = -1; - if (!is_rpcmem_initialized()) { - QNN_LOG_WARN("rpc memory not initialized\n"); - } else { - mem_fd = _pfn_rpc_mem_to_fd(buf); - } - - return mem_fd; - } - - int register_rpcmem(void * p_data, Qnn_Tensor_t * p_tensor) { - if (nullptr == p_data || (nullptr == p_tensor)) { - QNN_LOG_WARN("invalid param\n"); - return 1; - } - - if (!is_rpcmem_initialized()) { - QNN_LOG_WARN("rpc memory not initialized\n"); - return 2; - } - - if (is_rpcmem_allocated(p_data)) { - QNN_LOG_WARN("rpc memory already allocated\n"); - return 3; - } - - if (is_rpcmem_registered((QNN_VER_PTR(*p_tensor)->memHandle))) { - QNN_LOG_WARN("tensor %s has been registered shared memory\n", - (QNN_VER_PTR(*p_tensor)->name)); - return 4; - } - - int32_t mem_fd = rpcmem_to_fd(p_data); - if (-1 == mem_fd) { - QNN_LOG_WARN("failed to get file descriptor\n"); - return 5; - } - QNN_LOG_INFO("mem_fd %d\n", mem_fd); - Qnn_MemDescriptor_t descriptor = {{QNN_VER_PTR(*p_tensor)->rank, - QNN_VER_PTR(*p_tensor)->dimensions, - nullptr}, - QNN_VER_PTR(*p_tensor)->dataType, - QNN_MEM_TYPE_ION, - {{mem_fd}}}; - Qnn_MemHandle_t handle = nullptr; - int error = QNN_SUCCESS; - error = _qnn_interface.qnn_mem_register(_qnn_context_handle, &descriptor, - /*numDescriptors=*/1, &handle); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to register shared memory, error %d, %s\n", - QNN_GET_ERROR_CODE(error), strerror(error)); - return 6; - } else { - QNN_LOG_INFO("tensor %s successfully register shared memory\n", - 
(QNN_VER_PTR(*p_tensor)->name)); - } - QNN_VER_PTR(*p_tensor)->memHandle = handle; - _qnn_mem_set.insert((std::pair(p_data, handle))); - - return 0; - } - - void * get_rpcmem_from_memhandle(Qnn_MemHandle_t mem_handle) { - for (std::unordered_map::iterator it = _qnn_mem_set.begin(); - it != _qnn_mem_set.end(); - it++) { - Qnn_MemHandle_t mem_handle = it->second; - if (it->second == mem_handle) { - return it->first; - } - } - QNN_LOG_WARN("can't find rpcmem from qnn mem handle %p", mem_handle); - return nullptr; - } - - void unregister_rpcmem() { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - - if (_qnn_mem_set.empty()) { - QNN_LOG_WARN("no rpcmem registered\n"); - } - - for (std::unordered_map::iterator it = _qnn_mem_set.begin(); - it != _qnn_mem_set.end(); - it++) { - Qnn_MemHandle_t mem_handle = it->second; - error = _qnn_interface.qnn_mem_de_register(&mem_handle, 1); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to unregister shared memory, error %d\n", - QNN_GET_ERROR_CODE(error)); - } - } - _qnn_mem_set.clear(); - } - - bool is_rpcmem_allocated(void * buf) { - return _qnn_mem_set.count(buf) != 0U; - } - - - public: - std::map> - _qnn_graph_map; - - private: - int load_system() { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - - std::string system_lib_path = _lib_path + "libQnnSystem.so"; - QNN_LOG_DEBUG("system_lib_path:%s\n", system_lib_path.c_str()); - - _system_lib_handle = dlopen(system_lib_path.c_str(), RTLD_NOW | RTLD_LOCAL); - if (nullptr == _system_lib_handle) { - QNN_LOG_WARN("can not open QNN library %s, error: %s\n", - system_lib_path.c_str(), dlerror()); - return 1; - } - - auto * get_providers = - reinterpret_cast( - dlsym(_system_lib_handle, "QnnSystemInterface_getProviders")); - if (nullptr == get_providers) { - QNN_LOG_WARN( - "can not load QNN symbol QnnSystemInterface_getProviders: %s\n", - dlerror()); - return 2; - } - - uint32_t num_providers = 0; - const QnnSystemInterface_t ** provider_list = nullptr; - error = get_providers(&provider_list, &num_providers); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to get providers, error %d\n", - QNN_GET_ERROR_CODE(error)); - return 3; - } - - if (num_providers != _required_num_providers) { - QNN_LOG_WARN("providers is %d instead of required %d\n", num_providers, - _required_num_providers); - return 4; - } - - if (nullptr == provider_list) { - QNN_LOG_WARN("can not get providers\n"); - return 5; - } - - QNN_SYSTEM_INTERFACE_VER_TYPE qnn_system_interface; - bool found_valid_system_interface = false; - for (size_t idx = 0; idx < num_providers; idx++) { - if (QNN_SYSTEM_API_VERSION_MAJOR == - provider_list[idx]->systemApiVersion.major && - QNN_SYSTEM_API_VERSION_MINOR <= - provider_list[idx]->systemApiVersion.minor) { - found_valid_system_interface = true; - qnn_system_interface = - provider_list[idx]->QNN_SYSTEM_INTERFACE_VER_NAME; - break; - } - } - if (!found_valid_system_interface) { - QNN_LOG_WARN("unable to find a valid qnn system interface\n"); - return 6; - } else { - QNN_LOG_INFO("find a valid qnn system interface\n"); - } - set_qnn_raw_system_interface(qnn_system_interface); - - _qnn_interface.set_qnn_system_interface(provider_list[0]); - - _qnn_interface.qnn_system_context_create(&_qnn_system_handle); - if (nullptr == _qnn_system_handle) { - QNN_LOG_WARN("can not create QNN system contenxt\n"); - } else { - QNN_LOG_INFO("initialize qnn system successfully\n"); - } - - return 0; - } - - int unload_system() { - int result = 0; - - if (nullptr == _system_lib_handle) { - QNN_LOG_WARN("system lib handle is null\n"); - 
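For reference, the load_system()/load_backend() code above follows the usual dlopen/dlsym bootstrap: open the QNN library, resolve its getProviders symbol into a typed function pointer, and ask it for the provider table. The following standalone sketch is not part of the patch; the function-pointer type is written out to match how get_providers is called above, and is an inference since the original template arguments were lost in extraction.

#include <dlfcn.h>
#include <cstdint>
#include <cstdio>
#include "QnnCommon.h"
#include "System/QnnSystemInterface.h"

// signature inferred from the call site above: get_providers(&provider_list, &num_providers)
typedef Qnn_ErrorHandle_t (*pfn_get_system_providers)(const QnnSystemInterface_t *** providers,
                                                      uint32_t * num_providers);

static int load_system_providers_sketch(const char * lib_path) {
    void * handle = dlopen(lib_path, RTLD_NOW | RTLD_LOCAL);
    if (handle == nullptr) {
        fprintf(stderr, "dlopen(%s) failed: %s\n", lib_path, dlerror());
        return 1;
    }
    auto get_providers = reinterpret_cast<pfn_get_system_providers>(
        dlsym(handle, "QnnSystemInterface_getProviders"));
    if (get_providers == nullptr) {
        fprintf(stderr, "dlsym failed: %s\n", dlerror());
        dlclose(handle);
        return 2;
    }
    const QnnSystemInterface_t ** providers = nullptr;
    uint32_t num_providers = 0;
    if (get_providers(&providers, &num_providers) != QNN_SUCCESS || providers == nullptr) {
        dlclose(handle);
        return 3;
    }
    printf("found %u QNN system interface provider(s)\n", num_providers);
    return 0; // keep the handle open for as long as the interface is in use
}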
return 1; - } - - if (nullptr != _qnn_system_handle) { - result = _qnn_interface.qnn_system_context_free(_qnn_system_handle); - if (result != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN system context\n"); - } - _qnn_system_handle = nullptr; - } - - int dlclose_error = dlclose(_system_lib_handle); - if (dlclose_error != 0) { - QNN_LOG_WARN("failed to close QnnSystem library, error %s\n", dlerror()); - return 2; - } - - _system_lib_handle = nullptr; - - return result; - } - - int load_backend(std::string & lib_path, const QnnSaver_Config_t ** saver_config) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - QNN_LOG_DEBUG("lib_path:%s\n", lib_path.c_str()); - - void * lib_handle = dlopen(lib_path.c_str(), RTLD_NOW | RTLD_GLOBAL); - if (nullptr == lib_handle) { - QNN_LOG_WARN("can not open QNN library %s, with error: %s", lib_path.c_str(), dlerror()); - return 1; - } - - auto get_providers = - load_qnn_functionpointers( - lib_handle, "QnnInterface_getProviders"); - if (nullptr == get_providers) { - QNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s", dlerror()); - return 2; - } - - std::uint32_t num_providers = 0; - const QnnInterface_t ** provider_list = nullptr; - error = get_providers(&provider_list, &num_providers); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to get providers, error %d", QNN_GET_ERROR_CODE(error)); - return 3; - } - QNN_LOG_DEBUG("num_providers=%d\n", num_providers); - if (num_providers != _required_num_providers) { - QNN_LOG_WARN("providers is %d instead of required %d", num_providers, _required_num_providers); - return 4; - } - - if (nullptr == provider_list) { - QNN_LOG_WARN("failed to get qnn interface providers\n"); - return 5; - } - bool found_valid_interface = false; - QNN_INTERFACE_VER_TYPE qnn_interface; - for (size_t idx = 0; idx < num_providers; idx++) { - if (QNN_API_VERSION_MAJOR == - provider_list[idx]->apiVersion.coreApiVersion.major && - QNN_API_VERSION_MINOR <= - provider_list[idx]->apiVersion.coreApiVersion.minor) { - found_valid_interface = true; - qnn_interface = provider_list[idx]->QNN_INTERFACE_VER_NAME; - break; - } - } - - if (!found_valid_interface) { - QNN_LOG_WARN("unable to find a valid qnn interface\n"); - return 6; - } else { - QNN_LOG_INFO("find a valid qnn interface\n"); - } - set_qnn_raw_interface(qnn_interface); - - BackendIdType backend_id = provider_list[0]->backendId; - _lib_path_to_backend_id[lib_path] = backend_id; - if (_loaded_backend.count(backend_id) > 0) { - QNN_LOG_WARN("lib_path %s is loaded, but backend %d already exists\n", lib_path.c_str(), backend_id); - } - _loaded_backend[backend_id] = provider_list[0]; - if (_loaded_lib_handle.count(backend_id) > 0) { - QNN_LOG_WARN("closing %p\n", _loaded_lib_handle[backend_id]); - int dlclose_error = dlclose(_loaded_lib_handle[backend_id]); - if (dlclose_error != 0) { - QNN_LOG_WARN("fail to close %p with error %s\n", _loaded_lib_handle[backend_id], dlerror()); - } - } - _loaded_lib_handle[backend_id] = lib_handle; - _backend_id = backend_id; - - return 0; - } - - int unload_backend() { - int dlclose_error = 0; - for (auto & it : _loaded_lib_handle) { - dlclose_error = dlclose(it.second); - if (dlclose_error != 0) { - QNN_LOG_WARN("failed to close QNN backend %d, error %s\n", it.first, dlerror()); - } - } - - _loaded_lib_handle.clear(); - _lib_path_to_backend_id.clear(); - _loaded_backend.clear(); - - return 0; - } - - void set_qnn_raw_interface(QNN_INTERFACE_VER_TYPE & raw_interface) { - _qnn_raw_interface = raw_interface; - } - - void 
set_qnn_raw_system_interface(QNN_SYSTEM_INTERFACE_VER_TYPE & raw_interface) { - _qnn_raw_system_interface = raw_interface; - } - - private: - static constexpr const int _required_num_providers = 1; - - private: - std::string _lib_path; - std::string _backend_name; - std::string _model_name; // Qualcomm's dedicated prebuilt model name, keep it for further usage - BackendIdType _backend_id; - - bool _debug_tensor = false; - bool _do_node_validations = true; - - QnnLog_Level_t _qnn_log_level = QNN_LOG_LEVEL_DEBUG; - - qnn_sdk_profile_level _profile_level = qnn_sdk_profile_level::profile_detail; - - qnn_interface _qnn_interface; - - void * _system_lib_handle = nullptr; - - Qnn_GraphHandle_t _qnn_graph_handle = nullptr; - - Qnn_LogHandle_t _qnn_log_handle = nullptr; - - Qnn_ProfileHandle_t _qnn_profile_handle = nullptr; - - Qnn_DeviceHandle_t _qnn_device_handle = nullptr; - - Qnn_BackendHandle_t _qnn_backend_handle = nullptr; - - Qnn_ContextHandle_t _qnn_context_handle = nullptr; - - QnnSystemContext_Handle_t _qnn_system_handle = nullptr; - - QnnHtpDevice_PerfInfrastructure_t * _qnn_htp_perfinfra = nullptr; - uint32_t _qnn_power_configid = 1; - - QNN_INTERFACE_VER_TYPE _qnn_raw_interface; - QNN_SYSTEM_INTERFACE_VER_TYPE _qnn_raw_system_interface; - - std::unordered_map _qnn_mem_set; - - std::mutex _init_mutex; - std::unordered_map _loaded_lib_handle; - std::unordered_map _lib_path_to_backend_id; - std::unordered_map _loaded_backend; - - void * _rpc_lib_handle = nullptr; - std::atomic_bool _rpcmem_initialized{false}; - pfn_rpc_mem_alloc _pfn_rpc_mem_alloc; - pfn_rpc_mem_free _pfn_rpc_mem_free; - pfn_rpc_mem_to_fd _pfn_rpc_mem_to_fd; - pfn_rpc_mem_init _pfn_rpc_mem_init; - pfn_rpc_mem_deinit _pfn_rpc_mem_deinit; - std::unordered_map _rpcmem_store_map; - size_t _rpcmem_capacity = 512; - - std::string _graph_name; -}; - // ================================================================================================= // // implementation of QNN backend for GGML @@ -1959,126 +618,6 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context * ctx, return true; } -template -class ggml_qnn_tensor_readwrite -{ -public: - ggml_qnn_tensor_readwrite(const ggml_tensor *tensor, Qnn_GraphHandle_t graph_handle, ggml_backend_qnn_context * ctx) - : _tensor(tensor) - , _qnn_tensor(reinterpret_cast(tensor->extra)) - , _context(ctx) { - _old_dimensions = QNN_VER_PTR(*_qnn_tensor)->dimensions; - const auto qnn_data_type = qnn_datatype_from_ggml_datatype(tensor->type); - const bool is_npu = ctx->device == QNN_BACKEND_NPU; - QNN_VER_PTR(*_qnn_tensor)->type = _tensorType; - if (is_npu) { - QNN_VER_PTR(*_qnn_tensor)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; - QNN_VER_PTR(*_qnn_tensor)->clientBuf= {.data=nullptr, .dataSize=0}; - } - - auto err = ctx->raw_interface.tensorCreateGraphTensor(graph_handle, _qnn_tensor); - if (err != QNN_SUCCESS) { - QNN_LOG_INFO("error = %d\n", err); - QNN_LOG_DEBUG("tensor%p name %s", _qnn_tensor, QNN_TENSOR_GET_NAME(*_qnn_tensor)); - _context = nullptr; - return; - } - - _dimensions[0] = (uint32_t)tensor->ne[0]; - _dimensions[1] = (uint32_t)tensor->ne[1]; - _dimensions[2] = (uint32_t)tensor->ne[2]; - _dimensions[3] = (uint32_t)tensor->ne[3]; - QNN_VER_PTR(*_qnn_tensor)->dimensions = _dimensions; - QNN_VER_PTR(*_qnn_tensor)->rank = qnn_get_ggml_tensor_rank(tensor); - QNN_VER_PTR(*_qnn_tensor)->dataType = qnn_data_type; - - if (is_npu) { - qnn_instance * instance = ctx->instance; - uint8_t *qnn_buffer = static_cast(instance->alloc_rpcmem( - ggml_nbytes(tensor), alignof(void*))); - if 
(!qnn_buffer) { - QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); - QNN_LOG_DEBUG("tensor%p name %s", _qnn_tensor, QNN_TENSOR_GET_NAME(*_qnn_tensor)); - _context = nullptr; - // No free for _qnn_tensor, because it's not registered. - return; - } else { - QNN_LOG_INFO("alloc rpcmem successfully\n"); - } - - instance->register_rpcmem(qnn_buffer, _qnn_tensor); - if (_tensorType == QNN_TENSOR_TYPE_APP_WRITE || _tensorType == QNN_TENSOR_TYPE_APP_READWRITE) { - memcpy(qnn_buffer, tensor->data, ggml_nbytes(tensor)); - } - } else { - QNN_VER_PTR(*_qnn_tensor)->clientBuf = {tensor->data, - qnn_get_ggml_tensor_data_size(tensor)}; - } - } - - ggml_qnn_tensor_readwrite(const ggml_tensor *tensor, Qnn_Tensor_t *qnn_tensor, ggml_backend_qnn_context * ctx) - : _tensor(tensor) - , _qnn_tensor(qnn_tensor) - , _context(ctx) { - _old_dimensions = QNN_VER_PTR(*_qnn_tensor)->dimensions; - const auto qnn_data_type = qnn_datatype_from_ggml_datatype(tensor->type); - const bool is_npu = ctx->device == QNN_BACKEND_NPU; - - _dimensions[0] = (uint32_t)tensor->ne[0]; - _dimensions[1] = (uint32_t)tensor->ne[1]; - _dimensions[2] = (uint32_t)tensor->ne[2]; - _dimensions[3] = (uint32_t)tensor->ne[3]; - QNN_VER_PTR(*_qnn_tensor)->dimensions = _dimensions; - QNN_VER_PTR(*_qnn_tensor)->rank = qnn_get_ggml_tensor_rank(tensor); - QNN_VER_PTR(*_qnn_tensor)->dataType = qnn_data_type; - - - if (is_npu) { - uint8_t * qnn_buffer = static_cast(ctx->instance->get_rpcmem_from_memhandle( - QNN_VER_PTR(*_qnn_tensor)->memHandle)); - if (qnn_buffer) { - memcpy(qnn_buffer, tensor->data, ggml_nbytes(tensor)); - } else { - QNN_LOG_WARN("can't find rpcmem from qnn mem handle\n"); - QNN_LOG_DEBUG("tensor%p name %s", _qnn_tensor, QNN_TENSOR_GET_NAME(*_qnn_tensor)); - _context = nullptr; - return; - } - } else { - QNN_VER_PTR(*_qnn_tensor)->clientBuf = {tensor->data, - qnn_get_ggml_tensor_data_size(tensor)}; - } - } - - ~ggml_qnn_tensor_readwrite() { - if ((_tensorType == QNN_TENSOR_TYPE_APP_READWRITE || _tensorType == QNN_TENSOR_TYPE_APP_READ) && - _context && _context->device == QNN_BACKEND_NPU) { - uint8_t * qnn_buffer = static_cast(_context->instance->get_rpcmem_from_memhandle( - QNN_VER_PTR(*_qnn_tensor)->memHandle)); - memcpy(_tensor->data, qnn_buffer, ggml_nbytes(_tensor)); - } - - QNN_VER_PTR(*_qnn_tensor)->dimensions = _old_dimensions; - } - - bool is_valid() const { return _context; } - Qnn_Tensor_t * get_qnn_tensor() const { return _qnn_tensor; } - -private: - const ggml_tensor *_tensor; - Qnn_Tensor_t *_qnn_tensor; - ggml_backend_qnn_context *_context; - uint32_t *_old_dimensions; - uint32_t _dimensions[4] = {}; - - ggml_qnn_tensor_readwrite(const ggml_qnn_tensor_readwrite&) = delete; - void operator=(const ggml_qnn_tensor_readwrite&) = delete; - ggml_qnn_tensor_readwrite(ggml_qnn_tensor_readwrite&&) = delete; - void operator=(ggml_qnn_tensor_readwrite&&) = delete; -}; - -using ggml_qnn_tensor_output = ggml_qnn_tensor_readwrite; -using ggml_qnn_tensor_input = ggml_qnn_tensor_readwrite; //TODO: this function can be removed later because there are duplicated codes with ggml_qnn_mul_mat // keep it for illustrate how to implement a specified GGMPL OP using QNN API + QNN RPC @@ -2164,16 +703,16 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src QNN_LOG_INFO("create qnn graph handle with graph name %s ok\n", graph_name.c_str()); } - ggml_qnn_tensor_input tensor_input0(src0, graph_handle, ctx); + qnn::ggml_qnn_tensor_input tensor_input0(src0, graph_handle, ctx); if (!tensor_input0.is_valid()) { 
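One pattern in the add/mul_mat paths below is worth calling out: the first time an op with a given tensor-shape combination is executed, a QNN graph is created, finalized and stored in _qnn_graph_map; later calls only rebind the input/output tensors and re-execute the cached graph. The snippet below is a self-contained, deliberately simplified sketch of that build-once/reuse control flow, with a stand-in struct instead of real QNN handles; it is illustrative only and not part of the patch.

#include <cstdio>
#include <map>
#include <string>

struct fake_graph { int id; };                       // stand-in for a finalized QNN graph handle

static std::map<std::string, fake_graph> graph_cache;

static fake_graph & get_or_build_graph(const std::string & key) {
    auto it = graph_cache.find(key);
    if (it != graph_cache.end()) {
        return it->second;                           // reuse: only tensors are rebound before execute
    }
    printf("building graph for %s\n", key.c_str());  // graphCreate + graphFinalize would happen here
    fake_graph g = { (int) graph_cache.size() };
    return graph_cache[key] = g;
}

int main() {
    get_or_build_graph("ADD_4096x1x1x1_4096x1x1x1"); // first call: build + finalize
    get_or_build_graph("ADD_4096x1x1x1_4096x1x1x1"); // later calls: cache hit
}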
goto failure; } - ggml_qnn_tensor_input tensor_input1(src1, graph_handle, ctx); + qnn::ggml_qnn_tensor_input tensor_input1(src1, graph_handle, ctx); if (!tensor_input1.is_valid()) { QNN_LOG_INFO("error = %d\n", error); goto failure; } - ggml_qnn_tensor_output tensor_output(dst, graph_handle, ctx); + qnn::ggml_qnn_tensor_output tensor_output(dst, graph_handle, ctx); if (!tensor_output.is_valid()) { goto failure; } @@ -2221,9 +760,9 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src instance->_qnn_graph_map[map_entry] = graph_item; } else { auto & graph_item = instance->_qnn_graph_map[map_entry]; - ggml_qnn_tensor_input tensor_input0(src0, std::get<1>(graph_item), ctx); - ggml_qnn_tensor_input tensor_input1(src1, std::get<2>(graph_item), ctx); - ggml_qnn_tensor_output tensor_output(dst, std::get<3>(graph_item), ctx); + qnn::ggml_qnn_tensor_input tensor_input0(src0, std::get<1>(graph_item), ctx); + qnn::ggml_qnn_tensor_input tensor_input1(src1, std::get<2>(graph_item), ctx); + qnn::ggml_qnn_tensor_output tensor_output(dst, std::get<3>(graph_item), ctx); Qnn_Tensor_t tensor_inputs[] = {*tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor()}; Qnn_Tensor_t tensor_outputs[] = {*tensor_output.get_qnn_tensor()}; @@ -2360,15 +899,15 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, goto failure; } - ggml_qnn_tensor_input tensor_input0(src0, graph_handle, ctx); + qnn::ggml_qnn_tensor_input tensor_input0(src0, graph_handle, ctx); if (!tensor_input0.is_valid()) { goto failure; } - ggml_qnn_tensor_input tensor_input1(src1, graph_handle, ctx); + qnn::ggml_qnn_tensor_input tensor_input1(src1, graph_handle, ctx); if (!tensor_input1.is_valid()) { goto failure; } - ggml_qnn_tensor_output tensor_output(dst, graph_handle, ctx); + qnn::ggml_qnn_tensor_output tensor_output(dst, graph_handle, ctx); if (!tensor_output.is_valid()) { goto failure; } @@ -2416,9 +955,9 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, instance->_qnn_graph_map[map_entry] = graph_item; } else { auto & graph_item= instance->_qnn_graph_map[map_entry]; - ggml_qnn_tensor_input tensor_input0(src0, std::get<1>(graph_item), ctx); - ggml_qnn_tensor_input tensor_input1(src1, std::get<2>(graph_item), ctx); - ggml_qnn_tensor_output tensor_output(dst, std::get<3>(graph_item), ctx); + qnn::ggml_qnn_tensor_input tensor_input0(src0, std::get<1>(graph_item), ctx); + qnn::ggml_qnn_tensor_input tensor_input1(src1, std::get<2>(graph_item), ctx); + qnn::ggml_qnn_tensor_output tensor_output(dst, std::get<3>(graph_item), ctx); Qnn_Tensor_t tensor_inputs[] = {*tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor()}; Qnn_Tensor_t tensor_outputs[] = {*tensor_output.get_qnn_tensor()}; @@ -2785,7 +1324,7 @@ GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t (uint32_t) tensor->ne[2], (uint32_t) tensor->ne[3]}; Qnn_DataType_t qnn_data_type = - qnn_datatype_from_ggml_datatype(tensor->type); + qnn::datatype_from_ggml_datatype(tensor->type); Qnn_TensorType_t qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; if (tensor->flags & GGML_TENSOR_FLAG_INPUT) { @@ -2812,7 +1351,7 @@ GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t QNN_QUANTIZATION_ENCODING_UNDEFINED, {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, .offset = 0}}}, - .rank = qnn_get_ggml_tensor_rank(tensor), + .rank = qnn::get_ggml_tensor_rank(tensor), .dimensions = dimensions, .memType = qnn_mem_type, {.clientBuf = {.data = nullptr, .dataSize = 0}}}}}; @@ -3070,7 +1609,7 
@@ bool ggml_backend_is_qnn(ggml_backend_t backend) { void ggml_backend_qnn_set_n_threads(ggml_backend_t backend, int n_threads) { GGML_ASSERT(ggml_backend_is_qnn(backend)); - struct ggml_backend_qnn_context * ctx = (struct ggml_backend_qnn_context *) backend->context; + auto * ctx = (ggml_backend_qnn_context *) backend->context; ctx->threads = n_threads; } @@ -3175,10 +1714,10 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { } else { if (0 == setenv("LD_LIBRARY_PATH", path.c_str(), 1)) { QNN_LOG_INFO("%s backend setenv successfully\n", - qnn_get_backend_name(device)); + qnn::get_backend_name(device)); } else { QNN_LOG_ERROR("%s backend setenv failure\n", - qnn_get_backend_name(device)); + qnn::get_backend_name(device)); } } @@ -3188,18 +1727,18 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { if (0 != result) { QNN_LOG_WARN( "init qnn subsystem failed with qnn backend %s, pls check why\n", - qnn_get_backend_name(device)); + qnn::get_backend_name(device)); delete instance; return nullptr; } - qnn_interface qnn_interface = instance->get_qnn_interface(); + auto qnn_interface = instance->get_qnn_interface(); if (!qnn_interface.is_loaded()) { QNN_LOG_WARN("qnn subsystem failure\n"); delete instance; return nullptr; } - std::string device_name = qnn_get_backend_name(device); + std::string device_name = qnn::get_backend_name(device); QNN_LOG_INFO("qnn device name %s", device_name.c_str()); g_qnn_mgr[device].instance = instance; g_qnn_mgr[device].raw_interface = instance->get_qnn_raw_interface(); diff --git a/ggml-qnn/backend.hpp b/ggml-qnn/backend.hpp new file mode 100644 index 0000000000000..3a624eab050ac --- /dev/null +++ b/ggml-qnn/backend.hpp @@ -0,0 +1,24 @@ + +#pragma once + +#include "QnnTypes.h" +#include "QnnCommon.h" +#include "QnnContext.h" +#include "QnnBackend.h" + +#include "ggml.h" +#include "ggml-backend.h" + +#include "qnn.hpp" + +struct ggml_backend_qnn_context { + int device; + int threads; + char name[GGML_MAX_NAME]; + char lib[GGML_MAX_NAME]; + qnn_internal::qnn_instance* instance; + struct ggml_backend* backend; + QNN_INTERFACE_VER_TYPE raw_interface; + QNN_SYSTEM_INTERFACE_VER_TYPE raw_system_interface; + struct qcom_socinfo socinfo; +}; diff --git a/ggml-qnn/logger.cpp b/ggml-qnn/logger.cpp new file mode 100644 index 0000000000000..43856c9f48a9f --- /dev/null +++ b/ggml-qnn/logger.cpp @@ -0,0 +1,78 @@ + +#include "logger.hpp" + +#include +#include + +#if (defined __ANDROID__) || (defined ANDROID) +#include +#endif + +#define QNN_LOGBUF_LEN 4096 + +void qnn::internal_log(ggml_log_level level, const char* file, + const char* func, int line, + const char* format, ...) 
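As a usage reference for the public entry points touched earlier in this hunk, here is a minimal sketch of bringing the backend up from application code. The QNN_BACKEND_NPU constant is assumed to be exposed by ggml-qnn.h (only its use inside the backend is visible here), and the library path is an illustrative Android deployment choice, not something fixed by the patch.

#include "ggml-backend.h"
#include "ggml-qnn.h"

int init_qnn_backend_sketch() {
    // assumption: ggml-qnn.h exposes the QNN_BACKEND_NPU enum used above
    ggml_backend_t backend = ggml_backend_qnn_init(QNN_BACKEND_NPU, "/data/local/tmp/");
    if (backend == nullptr) {
        return 1; // QNN runtime libraries missing or device unsupported
    }
    ggml_backend_qnn_set_n_threads(backend, 4);
    // ... build a ggml cgraph and compute it with this backend ...
    ggml_backend_free(backend);
    return 0;
}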
{
+    static std::mutex qnn_internal_log_mutex;
+    static char s_qnn_internal_log_buf[QNN_LOGBUF_LEN];
+
+    {
+        std::lock_guard<std::mutex> lock(qnn_internal_log_mutex);
+        va_list args;
+
+        va_start(args, format);
+        int len_prefix =
+            snprintf(s_qnn_internal_log_buf, QNN_LOGBUF_LEN,
+                     "[%s, %d]: ", func, line);
+        int len = vsnprintf(s_qnn_internal_log_buf + len_prefix,
+                            QNN_LOGBUF_LEN - len_prefix, format, args);
+        if (len < (QNN_LOGBUF_LEN - len_prefix)) {
+#if (defined __ANDROID__) || (defined ANDROID)
+            // for Android APK
+            __android_log_print(level, "ggml-qnn", "%s\n", s_qnn_internal_log_buf);
+#endif
+            // for Android command line application or WoA(Windows on ARM)
+            printf("%s\n", s_qnn_internal_log_buf);
+        }
+        va_end(args);
+    }
+}
+
+void qnn::sdk_logcallback(const char* fmt, QnnLog_Level_t level,
+                          uint64_t timestamp, va_list argp) {
+#if ENABLE_QNNSDK_LOG
+    static std::mutex log_mutex;
+    static unsigned char s_ggml_qnn_logbuf[QNN_LOGBUF_LEN];
+
+    const char* log_level_desc = "";
+    switch (level) {
+    case QNN_LOG_LEVEL_ERROR:
+        log_level_desc = "ERROR";
+        break;
+    case QNN_LOG_LEVEL_WARN:
+        log_level_desc = "WARNING";
+        break;
+    case QNN_LOG_LEVEL_INFO:
+        log_level_desc = "INFO";
+        break;
+    case QNN_LOG_LEVEL_DEBUG:
+        log_level_desc = "DEBUG";
+        break;
+    case QNN_LOG_LEVEL_VERBOSE:
+        log_level_desc = "VERBOSE";
+        break;
+    case QNN_LOG_LEVEL_MAX:
+        log_level_desc = "UNKNOWN";
+        break;
+    }
+
+    double ms = (double)timestamp / 1000000.0;
+    {
+        std::lock_guard<std::mutex> lock(log_mutex);
+
+        memset(s_ggml_qnn_logbuf, 0, QNN_LOGBUF_LEN);
+        vsnprintf(reinterpret_cast<char *>(s_ggml_qnn_logbuf), QNN_LOGBUF_LEN, fmt, argp);
+        QNN_LOG_INFO("%8.1fms [%-7s] %s\n", ms, log_level_desc, s_ggml_qnn_logbuf);
+    }
+#endif
+}
diff --git a/ggml-qnn/logger.hpp b/ggml-qnn/logger.hpp
new file mode 100644
index 0000000000000..003436da10fae
--- /dev/null
+++ b/ggml-qnn/logger.hpp
@@ -0,0 +1,49 @@
+#pragma once
+
+#include
+
+#include "QnnTypes.h"
+#include "QnnCommon.h"
+#include "QnnInterface.h"
+#include "System/QnnSystemInterface.h"
+
+#include "ggml.h"
+
+namespace qnn {
+    void internal_log(ggml_log_level level, const char* file,
+                      const char* func, int line,
+                      const char* format, ...);
+
+
+    void sdk_logcallback(const char* fmt, QnnLog_Level_t level,
+                         uint64_t timestamp, va_list argp);
+}
+
+// =================================================================================================
+//
+//  QNN backend internal log function
+//
+// =================================================================================================
+#define QNN_LOG_ERROR(...) \
+    qnn::internal_log(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__)
+
+#define QNN_LOG_WARN(...) \
+    qnn::internal_log(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__)
+
+#define QNN_LOG_INFO(...) \
+    qnn::internal_log(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__)
+
+#ifdef NDEBUG
+#define ENABLE_QNNBACKEND_DEBUG 0  // for troubleshooting QNN backend
+#define ENABLE_QNNSDK_LOG 0        // enable/disable QNN SDK's internal log
+#else
+#define ENABLE_QNNBACKEND_DEBUG 1  // for troubleshooting QNN backend
+#define ENABLE_QNNSDK_LOG 1        // enable/disable QNN SDK's internal log
+#endif
+
+#if ENABLE_QNNBACKEND_DEBUG
+#define QNN_LOG_DEBUG(...) \
+    qnn::internal_log(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__)
+#else
+#define QNN_LOG_DEBUG(...)
+#endif diff --git a/ggml-qnn/qnn-types.hpp b/ggml-qnn/qnn-types.hpp new file mode 100644 index 0000000000000..33f468eb796d1 --- /dev/null +++ b/ggml-qnn/qnn-types.hpp @@ -0,0 +1,46 @@ + +#pragma once + +namespace qnn { + // ================================================================================================= + // + // helper data type / data structure / macros / functions of + // Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK + // ref:https://github.com/pytorch/executorch/tree/main/backends/qualcomm + // ================================================================================================= + enum sdk_profile_level { + profile_off = 0, + profile_basic = 1, + profile_detail = 2 + }; + + enum qcom_htp_arch { + NONE = 0, + V68 = 68, + V69 = 69, + V73 = 73, + V75 = 75, + }; + + enum qcom_chipset { + UNKNOWN_SM = 0, + SM8450 = 36, // v69 + SM8475 = 42, // v69 + SM8550 = 43, // v73 + SM8650 = 57, // v75 + }; + + using pfn_rpc_mem_init = void (*)(void); + using pfn_rpc_mem_deinit = void (*)(void); + using pfn_rpc_mem_alloc = void* (*) (int, uint32_t, int); + using pfn_rpc_mem_free = void (*)(void*); + using pfn_rpc_mem_to_fd = int (*)(void*); + + struct qcom_socinfo { + uint32_t soc_model; + size_t htp_arch; + size_t vtcm_size_in_mb; + }; +} + +#define QNN_VER_PTR(x) (&((x).v1)) // TODO: remove this macro after we have a separate header for QNN diff --git a/ggml-qnn/qnn.hpp b/ggml-qnn/qnn.hpp new file mode 100644 index 0000000000000..bd83a9f05e946 --- /dev/null +++ b/ggml-qnn/qnn.hpp @@ -0,0 +1,1139 @@ +#pragma once + +#include + +// header file of Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK +// https://qpm.qualcomm.com/#/main/tools/details/qualcomm_ai_engine_direct +#include "QnnTypes.h" +#include "QnnCommon.h" +#include "QnnContext.h" +#include "QnnBackend.h" +#include "QnnGraph.h" +#include "QnnProperty.h" +#include "QnnTensor.h" +#include "QnnInterface.h" +#include "Saver/QnnSaver.h" +#include "System/QnnSystemInterface.h" +#include "HTP/QnnHtpDevice.h" +#include "HTP/QnnHtpGraph.h" + +#include "utils.hpp" +#include "logger.hpp" + +namespace qnn_internal { + + // ================================================================================================= + // + // wrapper class of Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK + // ref:https://github.com/pytorch/executorch/tree/main/backends/qualcomm + // ================================================================================================= + class qnn_interface { + +#define DEFINE_SHIM_FUNCTION_INTERFACE(F, pointer_name) \ + template inline auto qnn_##F(Args... args) const { \ + return (_qnn_interface->QNN_INTERFACE_VER_NAME.pointer_name)( \ + std::forward(args)...); \ + } + +#define DEFINE_SHIM_FUNCTION_SYS_INTERFACE(F, pointer_name) \ + template inline auto qnn_##F(Args... 
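The qcom_chipset and qcom_htp_arch enums in qnn-types.hpp above are what qnn::get_chipset_desc() and qnn::get_htparch_desc() (defined in ggml-qnn/utils.*, which is not part of this hunk) translate into readable names. The helper below is a hypothetical stand-in showing the kind of mapping involved; the Snapdragon marketing names are added here only for orientation and are not taken from the patch.

#include <cstddef>
#include "qnn-types.hpp"   // assumes this header is on the include path

// illustrative stand-in only (the real helper lives in ggml-qnn/utils.*)
static const char * chipset_desc_sketch(std::size_t soc_model) {
    switch (soc_model) {
        case qnn::SM8450: return "SM8450 (Snapdragon 8 Gen 1, HTP v69)";
        case qnn::SM8475: return "SM8475 (Snapdragon 8+ Gen 1, HTP v69)";
        case qnn::SM8550: return "SM8550 (Snapdragon 8 Gen 2, HTP v73)";
        case qnn::SM8650: return "SM8650 (Snapdragon 8 Gen 3, HTP v75)";
        default:          return "unknown";
    }
}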
args) const { \ + return ( \ + _qnn_sys_interface->QNN_SYSTEM_INTERFACE_VER_NAME.pointer_name)( \ + std::forward(args)...); \ + } + + friend class qnn_instance; + + public: + qnn_interface() = default; + + // QnnBackend + DEFINE_SHIM_FUNCTION_INTERFACE(backend_create, backendCreate); + + DEFINE_SHIM_FUNCTION_INTERFACE(backend_free, backendFree); + + DEFINE_SHIM_FUNCTION_INTERFACE(backend_register_op_package, + backendRegisterOpPackage); + + DEFINE_SHIM_FUNCTION_INTERFACE(backend_validate_op_config, + backendValidateOpConfig); + + DEFINE_SHIM_FUNCTION_INTERFACE(backend_get_api_version, + backendGetApiVersion); + + // QnnDevice + DEFINE_SHIM_FUNCTION_INTERFACE(device_create, deviceCreate); + + DEFINE_SHIM_FUNCTION_INTERFACE(device_free, deviceFree); + + DEFINE_SHIM_FUNCTION_INTERFACE(device_get_infrastructure, + deviceGetInfrastructure); + + DEFINE_SHIM_FUNCTION_INTERFACE(device_get_platform_info, + deviceGetPlatformInfo); + + DEFINE_SHIM_FUNCTION_INTERFACE(device_get_info, deviceGetInfo); + + // QnnContext + DEFINE_SHIM_FUNCTION_INTERFACE(context_create, contextCreate); + + DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary_size, + contextGetBinarySize); + + DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary, contextGetBinary); + + DEFINE_SHIM_FUNCTION_INTERFACE(context_create_from_binary, + contextCreateFromBinary); + + DEFINE_SHIM_FUNCTION_INTERFACE(context_free, contextFree); + + // QnnGraph + DEFINE_SHIM_FUNCTION_INTERFACE(graph_create, graphCreate); + + DEFINE_SHIM_FUNCTION_INTERFACE(graph_add_node, graphAddNode); + + DEFINE_SHIM_FUNCTION_INTERFACE(graph_finalize, graphFinalize); + + DEFINE_SHIM_FUNCTION_INTERFACE(graph_execute, graphExecute); + + DEFINE_SHIM_FUNCTION_INTERFACE(graph_retrieve, graphRetrieve); + + // QnnLog + DEFINE_SHIM_FUNCTION_INTERFACE(log_create, logCreate); + + DEFINE_SHIM_FUNCTION_INTERFACE(log_free, logFree); + + DEFINE_SHIM_FUNCTION_INTERFACE(log_set_log_level, logSetLogLevel); + + // QnnProfile + DEFINE_SHIM_FUNCTION_INTERFACE(profile_create, profileCreate); + + DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_events, profileGetEvents); + + DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_sub_events, profileGetSubEvents); + + DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_event_data, profileGetEventData); + + DEFINE_SHIM_FUNCTION_INTERFACE(profile_free, profileFree); + + // QnnMem + DEFINE_SHIM_FUNCTION_INTERFACE(mem_register, memRegister); + + DEFINE_SHIM_FUNCTION_INTERFACE(mem_de_register, memDeRegister); + + // QnnProperty + DEFINE_SHIM_FUNCTION_INTERFACE(property_has_capability, + propertyHasCapability); + + // QnnTensor + DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_context_tensor, + tensorCreateContextTensor); + + DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_graph_tensor, + tensorCreateGraphTensor); + + // QnnSystem + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_create, + systemContextCreate); + + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_get_binary_info, + systemContextGetBinaryInfo); + + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_free, systemContextFree); + + void set_qnn_interface(const QnnInterface_t* qnn_interface) { + _qnn_interface = qnn_interface; + } + + void set_qnn_system_interface( + const QnnSystemInterface_t* qnn_sys_interface) { + _qnn_sys_interface = qnn_sys_interface; + } + + uint32_t get_backend_id() const { return _qnn_interface->backendId; } + + bool is_loaded() const { + return ((_qnn_sys_interface != nullptr) && (_qnn_interface != nullptr)); + } + + private: + const QnnInterface_t* _qnn_interface = nullptr; + + const 
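To make the shim macros above easier to read: each DEFINE_SHIM_FUNCTION_INTERFACE(F, pointer_name) line generates a forwarding member template that dispatches to the corresponding function pointer in the loaded provider table. Expanded by hand for the backend_create entry it is equivalent to roughly the following (the template parameter list is restored here, since the angle-bracket text was dropped during extraction):

// hand-expanded equivalent of DEFINE_SHIM_FUNCTION_INTERFACE(backend_create, backendCreate)
template <typename... Args>
inline auto qnn_backend_create(Args... args) const {
    return (_qnn_interface->QNN_INTERFACE_VER_NAME.backendCreate)(
        std::forward<Args>(args)...);
}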
QnnSystemInterface_t* _qnn_sys_interface = nullptr; + }; + + + class qnn_instance { + public: + using BackendIdType = decltype(QnnInterface_t{}.backendId); + + explicit qnn_instance(const std::string& lib_path, + const std::string& backend_name, + const std::string& model_name) + : _lib_path(std::move(lib_path)) + , _backend_name(std::move(backend_name)) + , _model_name(std::move(model_name)) {}; + + ~qnn_instance() {} + + int qnn_init(const QnnSaver_Config_t** saver_config) { + BackendIdType backend_id = QNN_BACKEND_ID_NULL; + QNN_LOG_DEBUG("enter qni_init\n"); + + std::lock_guard lock(_init_mutex); + + if (0 != load_system()) { + QNN_LOG_WARN("can not load QNN system lib, pls check why?\n"); + return 1; + } + else { + QNN_LOG_DEBUG("load QNN system lib successfully\n"); + } + + std::string backend_lib_path = _lib_path + _backend_name; + if (0 == _lib_path_to_backend_id.count(backend_lib_path)) { + int is_load_ok = load_backend(backend_lib_path, saver_config); + if (0 != is_load_ok) { + QNN_LOG_WARN("failed to load QNN backend\n"); + return 2; + } + } + + backend_id = _lib_path_to_backend_id[backend_lib_path]; + if (0 == _loaded_backend.count(backend_id) || + 0 == _loaded_lib_handle.count(backend_id)) { + QNN_LOG_WARN("library %s is loaded but loaded backend count=%zu, " + "loaded lib_handle count=%zu\n", + backend_lib_path.c_str(), _loaded_backend.count(backend_id), + _loaded_lib_handle.count(backend_id)); + return 3; + } + + _qnn_interface.set_qnn_interface(_loaded_backend[backend_id]); + + _qnn_interface.qnn_log_create(qnn::sdk_logcallback, _qnn_log_level, &_qnn_log_handle); + if (nullptr == _qnn_log_handle) { + // NPU backend not work on Qualcomm SoC equipped low-end phone + QNN_LOG_WARN("why failed to initialize qnn log\n"); + return 4; + } + else { + QNN_LOG_DEBUG("initialize qnn log successfully\n"); + } + + std::vector temp_backend_config; + _qnn_interface.qnn_backend_create( + _qnn_log_handle, + temp_backend_config.empty() ? nullptr : temp_backend_config.data(), + &_qnn_backend_handle); + if (nullptr == _qnn_backend_handle) { + QNN_LOG_WARN("why failed to initialize qnn backend\n"); + return 5; + } + else { + QNN_LOG_DEBUG("initialize qnn backend successfully\n"); + } + + if (nullptr != _qnn_raw_interface.propertyHasCapability) { + Qnn_ErrorHandle_t qnn_status = + _qnn_raw_interface.propertyHasCapability(QNN_PROPERTY_GROUP_DEVICE); + if (QNN_PROPERTY_NOT_SUPPORTED == qnn_status) { + QNN_LOG_WARN("device property is not supported\n"); + } + if (QNN_PROPERTY_ERROR_UNKNOWN_KEY == qnn_status) { + QNN_LOG_WARN("device property is not known to backend\n"); + } + } + + Qnn_ErrorHandle_t qnn_status = QNN_SUCCESS; + if (_backend_name.find("Htp") != std::variant_npos) { + const QnnDevice_PlatformInfo_t* p_info = nullptr; + _qnn_raw_interface.deviceGetPlatformInfo(nullptr, &p_info); + QNN_LOG_INFO("device counts %d", p_info->v1.numHwDevices); + QnnDevice_HardwareDeviceInfo_t* infos = p_info->v1.hwDevices; + QnnHtpDevice_OnChipDeviceInfoExtension_t chipinfo = { }; + for (int i = 0; i < p_info->v1.numHwDevices; i++) { + QNN_LOG_INFO("deviceID:%d, deviceType:%d, numCores %d", infos[i].v1.deviceId, + infos[i].v1.deviceType, infos[i].v1.numCores); + QnnDevice_DeviceInfoExtension_t devinfo = infos[i].v1.deviceInfoExtension; + chipinfo = devinfo->onChipDevice; + QnnHtpDevice_Arch_t htp_arch = chipinfo.arch; + QNN_LOG_INFO("htp_type:%d(%s)", devinfo->devType, (devinfo->devType == QNN_HTP_DEVICE_TYPE_ON_CHIP) ? 
"ON_CHIP" : ""); + QNN_LOG_INFO("qualcomm soc_model:%d(%s), htp_arch:%d(%s), vtcm_size:%d MB", + chipinfo.socModel, qnn::get_chipset_desc(chipinfo.socModel), + htp_arch, qnn::get_htparch_desc(htp_arch), chipinfo.vtcmSize); + g_qnn_mgr[QNN_BACKEND_NPU].socinfo = { chipinfo.socModel, htp_arch, chipinfo.vtcmSize }; + } + _qnn_raw_interface.deviceFreePlatformInfo(nullptr, p_info); + + QnnHtpDevice_CustomConfig_t soc_customconfig; + soc_customconfig.option = QNN_HTP_DEVICE_CONFIG_OPTION_SOC; + soc_customconfig.socModel = chipinfo.socModel; + QnnDevice_Config_t soc_devconfig; + soc_devconfig.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; + soc_devconfig.customConfig = &soc_customconfig; + + QnnHtpDevice_CustomConfig_t arch_customconfig; + arch_customconfig.option = QNN_HTP_DEVICE_CONFIG_OPTION_ARCH; + arch_customconfig.arch.arch = chipinfo.arch; + arch_customconfig.arch.deviceId = 0; // Id of device to be used. If single device is used by default 0. + QnnDevice_Config_t arch_devconfig; + arch_devconfig.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; + arch_devconfig.customConfig = &arch_customconfig; + + const QnnDevice_Config_t* p_deviceconfig[] = { &soc_devconfig, &arch_devconfig, nullptr }; + qnn_status = _qnn_raw_interface.deviceCreate(_qnn_log_handle, p_deviceconfig, &_qnn_device_handle); + } + else { + qnn_status = _qnn_raw_interface.deviceCreate(_qnn_log_handle, nullptr, &_qnn_device_handle); + } + if (QNN_SUCCESS != qnn_status && + QNN_DEVICE_ERROR_UNSUPPORTED_FEATURE != qnn_status) { + QNN_LOG_WARN("failed to create QNN device\n"); + } + else { + QNN_LOG_INFO("create QNN device successfully\n"); + } + + if (qnn::sdk_profile_level::profile_off != _profile_level) { + QNN_LOG_INFO("profiling turned on; level = %d", _profile_level); + if (qnn::sdk_profile_level::profile_basic == _profile_level) { + QNN_LOG_INFO("basic profiling requested. creating Qnn Profile object\n"); + if (QNN_PROFILE_NO_ERROR != + _qnn_raw_interface.profileCreate(_qnn_backend_handle, + QNN_PROFILE_LEVEL_BASIC, + &_qnn_profile_handle)) { + QNN_LOG_WARN("unable to create profile handle in the backend\n"); + return 6; + } + else { + QNN_LOG_DEBUG("initialize qnn profile successfully\n"); + } + } + else if (qnn::sdk_profile_level::profile_detail == _profile_level) { + QNN_LOG_INFO("detailed profiling requested. Creating Qnn Profile object\n"); + if (QNN_PROFILE_NO_ERROR != + _qnn_raw_interface.profileCreate(_qnn_backend_handle, + QNN_PROFILE_LEVEL_DETAILED, + &_qnn_profile_handle)) { + QNN_LOG_WARN("unable to create profile handle in the backend\n"); + return 7; + } + else { + QNN_LOG_DEBUG("initialize qnn profile successfully\n"); + } + } + } + + _rpc_lib_handle = dlopen("libcdsprpc.so", RTLD_NOW | RTLD_LOCAL); + if (nullptr == _rpc_lib_handle) { + QNN_LOG_WARN("failed to load qualcomm's rpc lib, error:%s\n", dlerror()); + return 8; + } + else { + QNN_LOG_DEBUG("load rpcmem lib successfully\n"); + set_rpcmem_initialized(true); + } + _pfn_rpc_mem_init = reinterpret_cast( + dlsym(_rpc_lib_handle, "rpcmem_init")); + _pfn_rpc_mem_deinit = reinterpret_cast( + dlsym(_rpc_lib_handle, "rpcmem_deinit")); + _pfn_rpc_mem_alloc = reinterpret_cast( + dlsym(_rpc_lib_handle, "rpcmem_alloc")); + _pfn_rpc_mem_free = reinterpret_cast( + dlsym(_rpc_lib_handle, "rpcmem_free")); + _pfn_rpc_mem_to_fd = reinterpret_cast( + dlsym(_rpc_lib_handle, "rpcmem_to_fd")); + if (nullptr == _pfn_rpc_mem_alloc || nullptr == _pfn_rpc_mem_free || + nullptr == _pfn_rpc_mem_to_fd) { + QNN_LOG_WARN("unable to access symbols in QNN RPC lib. 
dlerror(): %s", dlerror()); + dlclose(_rpc_lib_handle); + return 9; + } + + if (nullptr != _pfn_rpc_mem_init) { // make Qualcomm's SoC equipped low-end phone happy + _pfn_rpc_mem_init(); + } + + /* TODO: not used, keep it for further usage + QnnContext_Config_t qnn_context_config = QNN_CONTEXT_CONFIG_INIT; + qnn_context_config.priority = QNN_PRIORITY_DEFAULT; + const QnnContext_Config_t * context_configs[] = {&qnn_context_config, nullptr}; + */ + _qnn_interface.qnn_context_create( + _qnn_backend_handle, _qnn_device_handle, + nullptr, + &_qnn_context_handle); + if (nullptr == _qnn_context_handle) { + QNN_LOG_WARN("why failed to initialize qnn context\n"); + return 10; + } + else { + QNN_LOG_DEBUG("initialize qnn context successfully\n"); + } + + if (_backend_name.find("Htp") != std::variant_npos) { + //TODO: faster approach to probe the accurate capacity of rpc ion memory + size_t candidate_size = 0; + uint8_t* rpc_buffer = nullptr; + const int size_in_mb = (1 << 20); + size_t probe_slots[] = { 1024, 1536, 2048 - 48, 2048 }; + size_t probe_counts = sizeof(probe_slots) / sizeof(size_t); + for (size_t idx = 0; idx < probe_counts; idx++) { + rpc_buffer = static_cast(alloc_rpcmem( + probe_slots[idx] * size_in_mb, 4)); + if (nullptr == rpc_buffer) { + QNN_LOG_INFO("alloc rpcmem %d (MB) failure, %s\n", + probe_slots[idx], strerror(errno)); + break; + } + else { + candidate_size = probe_slots[idx]; + free_rpcmem(rpc_buffer); + rpc_buffer = nullptr; + } + } + if (candidate_size > _rpcmem_capacity) + _rpcmem_capacity = candidate_size; + QNN_LOG_INFO("capacity of QNN rpc ion memory is about %d MB\n", _rpcmem_capacity); + + if (0 != init_htp_perfinfra()) { + QNN_LOG_WARN("initialize HTP performance failure"); + } + if (0 != set_rpc_polling()) { + QNN_LOG_WARN("set RPC polling failure"); + } + if (0 != set_high_performance_mode()) { + QNN_LOG_WARN("set HTP high performance mode failure"); + } + } + + QNN_LOG_DEBUG("leave qni_init\n"); + + return 0; + } + + int qnn_finalize() { + int ret_status = 0; + Qnn_ErrorHandle_t error = QNN_SUCCESS; + + if (nullptr != _pfn_rpc_mem_deinit) // make Qualcomm's SoC equipped low-end phone happy + _pfn_rpc_mem_deinit(); + + if (dlclose(_rpc_lib_handle) != 0) { + QNN_LOG_WARN("failed to unload qualcomm's rpc lib, error:%s\n", dlerror()); + } + else { + QNN_LOG_DEBUG("succeed to close rpcmem lib\n"); + } + + if (_backend_name.find("Htp") != std::variant_npos) { + _qnn_htp_perfinfra->destroyPowerConfigId(_qnn_power_configid); + } + + if (nullptr != _qnn_context_handle) { + error = _qnn_interface.qnn_context_free(_qnn_context_handle, + _qnn_profile_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN context_handle: ID %u, error %d\n", + _qnn_interface.get_backend_id(), + QNN_GET_ERROR_CODE(error)); + } + _qnn_context_handle = nullptr; + } + + if (nullptr != _qnn_profile_handle) { + error = _qnn_interface.qnn_profile_free(_qnn_profile_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN profile_handle: ID %u, error %d\n", + _qnn_interface.get_backend_id(), + QNN_GET_ERROR_CODE(error)); + } + _qnn_profile_handle = nullptr; + } + + if (nullptr != _qnn_device_handle) { + error = _qnn_interface.qnn_device_free(_qnn_device_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN device_handle: ID %u, error %d\n", + _qnn_interface.get_backend_id(), + QNN_GET_ERROR_CODE(error)); + } + _qnn_device_handle = nullptr; + } + + if (nullptr != _qnn_backend_handle) { + error = _qnn_interface.qnn_backend_free(_qnn_backend_handle); 
+ if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN backend_handle: ID %u, error %d\n", + _qnn_interface.get_backend_id(), + QNN_GET_ERROR_CODE(error)); + } + _qnn_backend_handle = nullptr; + } + + if (nullptr != _qnn_log_handle) { + error = _qnn_interface.qnn_log_free(_qnn_log_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN log_handle: ID %u, error %d\n", + _qnn_interface.get_backend_id(), + QNN_GET_ERROR_CODE(error)); + } + _qnn_log_handle = nullptr; + } + + unload_backend(); + + unload_system(); + + return ret_status; + } + + //TODO:keep it for further usage of offload the entire cgraph to a single QNN DAG directly + // which was used in Qualcomm's dedicated AI technology +#if 0 + int init_qnn_graph(const char* graph_name, bool debug, + uint8_t do_node_validation = true, + const QnnGraph_Config_t** graph_configs = nullptr) { + int result = 0; + + if (nullptr == graph_name) { + QNN_LOG_WARN("graph name is null\n"); + return 1; + } + + if (!_graph_name.empty()) { + QNN_LOG_WARN("qnn model for graph %s already initialized\n", graph_name); + return 2; + } + + if (!do_node_validation) { + QNN_LOG_WARN("node validation disabled, backend will not perform op " + "validation prior to adding node\n"); + } + + _graph_name = graph_name; + _debug_tensor = debug; + _do_node_validations = do_node_validation; + + result = _qnn_raw_interface.graphCreate(_qnn_context_handle, graph_name, + graph_configs, &_qnn_graph_handle); + if (result != QNN_GRAPH_NO_ERROR || nullptr == _qnn_graph_handle) { + QNN_LOG_WARN("failed to create graph in qnn context\n"); + return 3; + } + else { + QNN_LOG_INFO("succeed to create graph %s, %p\n", graph_name, _qnn_graph_handle); + } + + return 0; + } + + int finalize_qnn_graph() { + if (nullptr != _qnn_graph_handle) { + if (_qnn_raw_interface.graphFinalize(_qnn_graph_handle, + _qnn_profile_handle, + nullptr) != QNN_GRAPH_NO_ERROR) { + QNN_LOG_WARN("finalizing graph failure\n"); + } + } + else { + QNN_LOG_DEBUG("qnn graph handle is null\n"); + } + + return 0; + } +#endif + + const qnn_interface& get_qnn_interface() { + if (!_qnn_interface.is_loaded()) { + QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + } + return _qnn_interface; + } + + const QNN_INTERFACE_VER_TYPE& get_qnn_raw_interface() { + if (!_qnn_interface.is_loaded()) { + QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + } + return _qnn_raw_interface; + } + + const QNN_SYSTEM_INTERFACE_VER_TYPE& get_qnn_raw_system_interface() { + if (!_qnn_interface.is_loaded()) { + QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + } + return _qnn_raw_system_interface; + } + + const Qnn_LogHandle_t get_qnn_log_handle() { return _qnn_log_handle; } + + const Qnn_ProfileHandle_t get_qnn_profile_handle() { + return _qnn_profile_handle; + } + + const Qnn_DeviceHandle_t get_qnn_device_handle() { + return _qnn_device_handle; + } + + const Qnn_BackendHandle_t get_qnn_backend_handle() { + return _qnn_backend_handle; + } + + const Qnn_ContextHandle_t get_qnn_context_handle() { + return _qnn_context_handle; + } + + const QnnSystemContext_Handle_t get_qnn_system_handle() { + return _qnn_system_handle; + } + + const Qnn_GraphHandle_t get_qnn_graph_handle() { return _qnn_graph_handle; } + + int init_htp_perfinfra() { + QnnDevice_Infrastructure_t device_infra = nullptr; + int error = _qnn_raw_interface.deviceGetInfrastructure(&device_infra); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to get qnn device infra\n"); + return 1; + } + else { + QNN_LOG_INFO("HTP 
backend perf_infrastructure creation ok\n"); + } + + QnnHtpDevice_Infrastructure_t* htp_infra = static_cast(device_infra); + QnnHtpDevice_PerfInfrastructure_t* htp_perfinfra = &htp_infra->perfInfra; + uint32_t power_configid = 1; + uint32_t device_id = 0; + uint32_t core_id = 0; + htp_perfinfra->createPowerConfigId(device_id, core_id, &power_configid); + if (htp_infra->infraType != QNN_HTP_DEVICE_INFRASTRUCTURE_TYPE_PERF) { + QNN_LOG_INFO("HTP infra type = %d, which is not perf infra type", htp_infra->infraType); + } + else { + QNN_LOG_INFO("HTP infra type = %d, which is perf infra type\n", htp_infra->infraType); + } + _qnn_htp_perfinfra = htp_perfinfra; + _qnn_power_configid = power_configid; + + return 0; + } + + int set_rpc_polling() { + if (_qnn_htp_perfinfra) { + QnnHtpPerfInfrastructure_PowerConfig_t rpc_polling_time; + memset(&rpc_polling_time, 0, sizeof(rpc_polling_time)); + rpc_polling_time.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME; + //use rpc polling time recommended 0-10000 us + rpc_polling_time.rpcPollingTimeConfig = 9999; + + QnnHtpPerfInfrastructure_PowerConfig_t rpc_control_latency; + memset(&rpc_control_latency, 0, sizeof(rpc_control_latency)); + rpc_control_latency.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_CONTROL_LATENCY; + //use rpc control latency recommended 100 us, refer hexagon sdk + rpc_control_latency.rpcControlLatencyConfig = 100; + + const QnnHtpPerfInfrastructure_PowerConfig_t* power_configs[] = { + &rpc_polling_time, + &rpc_control_latency, + nullptr }; + Qnn_ErrorHandle_t qnn_status = _qnn_htp_perfinfra->setPowerConfig( + _qnn_power_configid, + power_configs); + if (qnn_status != QNN_SUCCESS) { + QNN_LOG_WARN("set htp perf failed\n"); + } + else { + QNN_LOG_INFO("set htp perf ok\n"); + } + } + else { + QNN_LOG_WARN("can't set htp perf\n"); + } + + return 0; + } + + int set_high_performance_mode() { + if (nullptr == _qnn_htp_perfinfra) { + QNN_LOG_WARN("perf intra is null\n"); + return 1; + } + + QnnHtpPerfInfrastructure_PowerConfig_t power_config; + memset(&power_config, 0, sizeof(power_config)); + power_config.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_V3; + + power_config.dcvsV3Config.setDcvsEnable = 1; + power_config.dcvsV3Config.dcvsEnable = 0; + power_config.dcvsV3Config.contextId = _qnn_power_configid; + power_config.dcvsV3Config.powerMode = QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_PERFORMANCE_MODE; + power_config.dcvsV3Config.setSleepLatency = + 1; // true to consider Latency parameter otherwise false + power_config.dcvsV3Config.sleepLatency = 40; + power_config.dcvsV3Config.setBusParams = + 1; // true to consider Bus parameter otherwise false + power_config.dcvsV3Config.setCoreParams = + 1; // true to consider Core parameter otherwise false + power_config.dcvsV3Config.sleepDisable = + 1; // true to consider sleep/LPM modes, false to enable + power_config.dcvsV3Config.setSleepDisable = + 1; // true to consider sleep disable/enable parameter otherwise false set sleep latency parameter + // set Bus Clock Parameters + power_config.dcvsV3Config.busVoltageCornerMin = + DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + power_config.dcvsV3Config.busVoltageCornerTarget = + DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + power_config.dcvsV3Config.busVoltageCornerMax = + DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + // set Core Clock Parameters + power_config.dcvsV3Config.coreVoltageCornerMin = + DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + power_config.dcvsV3Config.coreVoltageCornerTarget = + 
DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + power_config.dcvsV3Config.coreVoltageCornerMax = + DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + + // set power config with different performance parameters + const QnnHtpPerfInfrastructure_PowerConfig_t* power_configs[] = { + &power_config, nullptr }; + Qnn_ErrorHandle_t qnn_status = QNN_SUCCESS; + qnn_status = _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, power_configs); + if (qnn_status != QNN_SUCCESS) { + QNN_LOG_WARN("set htp high performance mode failed\n"); + } + else { + QNN_LOG_INFO("set htp high performance mode ok\n"); + } + + return 0; + } + + std::string& get_qnn_graph_name() { return _graph_name; } + + bool is_rpcmem_initialized() { return _rpcmem_initialized; } + + void set_rpcmem_initialized(bool initialized) { + _rpcmem_initialized = initialized; + } + + size_t get_rpcmem_capacity() { return _rpcmem_capacity; } + + bool is_rpcmem_registered(Qnn_MemHandle_t handle) { + return _qnn_mem_set.count(handle) != 0U; + } + + void* alloc_rpcmem(size_t bytes, size_t alignment) { + if (!_rpcmem_initialized) { + QNN_LOG_WARN("rpc memory not initialized\n"); + return nullptr; + } + + auto allocate_bytes = static_cast(bytes + alignment); + void* buf = _pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, + allocate_bytes); + if (buf == nullptr) { + QNN_LOG_WARN("failed to allocate rpc memory\n"); + return nullptr; + } + + auto aligned_buf = reinterpret_cast( + qnn::align_to(alignment, reinterpret_cast(buf))); + bool status = + _rpcmem_store_map.insert(std::pair(aligned_buf, buf)).second; + if (!status) { + QNN_LOG_WARN("failed to allocate rpc memory\n"); + _pfn_rpc_mem_free(buf); + } + + return aligned_buf; + } + + void free_rpcmem(void* buf) { + if (!_rpcmem_initialized) { + QNN_LOG_WARN("rpc memory not initialized\n"); + } + else if (0 == _rpcmem_store_map.count(buf)) { + QNN_LOG_WARN("no allocated tensor\n"); + } + else { + _pfn_rpc_mem_free(_rpcmem_store_map[buf]); + _rpcmem_store_map.erase(buf); + } + } + + int32_t rpcmem_to_fd(void* buf) { + int32_t mem_fd = -1; + if (!is_rpcmem_initialized()) { + QNN_LOG_WARN("rpc memory not initialized\n"); + } + else { + mem_fd = _pfn_rpc_mem_to_fd(buf); + } + + return mem_fd; + } + + int register_rpcmem(void* p_data, Qnn_Tensor_t* p_tensor) { + if (nullptr == p_data || (nullptr == p_tensor)) { + QNN_LOG_WARN("invalid param\n"); + return 1; + } + + if (!is_rpcmem_initialized()) { + QNN_LOG_WARN("rpc memory not initialized\n"); + return 2; + } + + if (is_rpcmem_allocated(p_data)) { + QNN_LOG_WARN("rpc memory already allocated\n"); + return 3; + } + + if (is_rpcmem_registered((QNN_VER_PTR(*p_tensor)->memHandle))) { + QNN_LOG_WARN("tensor %s has been registered shared memory\n", + (QNN_VER_PTR(*p_tensor)->name)); + return 4; + } + + int32_t mem_fd = rpcmem_to_fd(p_data); + if (-1 == mem_fd) { + QNN_LOG_WARN("failed to get file descriptor\n"); + return 5; + } + QNN_LOG_INFO("mem_fd %d\n", mem_fd); + Qnn_MemDescriptor_t descriptor = { {QNN_VER_PTR(*p_tensor)->rank, + QNN_VER_PTR(*p_tensor)->dimensions, + nullptr}, + QNN_VER_PTR(*p_tensor)->dataType, + QNN_MEM_TYPE_ION, + {{mem_fd}} }; + Qnn_MemHandle_t handle = nullptr; + int error = QNN_SUCCESS; + error = _qnn_interface.qnn_mem_register(_qnn_context_handle, &descriptor, + /*numDescriptors=*/1, &handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to register shared memory, error %d, %s\n", + QNN_GET_ERROR_CODE(error), strerror(error)); + return 6; + } + else { + QNN_LOG_INFO("tensor %s successfully register shared memory\n", + 
(QNN_VER_PTR(*p_tensor)->name));
+        }
+        QNN_VER_PTR(*p_tensor)->memHandle = handle;
+        _qnn_mem_set.insert((std::pair(p_data, handle)));
+
+        return 0;
+    }
+
+    void* get_rpcmem_from_memhandle(Qnn_MemHandle_t mem_handle) {
+        for (auto it = _qnn_mem_set.begin(); it != _qnn_mem_set.end(); it++) {
+            if (it->second == mem_handle) {
+                return it->first;
+            }
+        }
+        QNN_LOG_WARN("can't find rpcmem from qnn mem handle %p", mem_handle);
+        return nullptr;
+    }
+
+    void unregister_rpcmem() {
+        Qnn_ErrorHandle_t error = QNN_SUCCESS;
+
+        if (_qnn_mem_set.empty()) {
+            QNN_LOG_WARN("no rpcmem registered\n");
+        }
+
+        for (auto it = _qnn_mem_set.begin(); it != _qnn_mem_set.end(); it++) {
+            Qnn_MemHandle_t mem_handle = it->second;
+            error = _qnn_interface.qnn_mem_de_register(&mem_handle, 1);
+            if (error != QNN_SUCCESS) {
+                QNN_LOG_WARN("failed to unregister shared memory, error %d\n",
+                             QNN_GET_ERROR_CODE(error));
+            }
+        }
+        _qnn_mem_set.clear();
+    }
+
+    bool is_rpcmem_allocated(void* buf) {
+        return _qnn_mem_set.count(buf) != 0U;
+    }
+
+
+  public:
+    std::map>
+        _qnn_graph_map;
+
+  private:
+    int load_system() {
+        Qnn_ErrorHandle_t error = QNN_SUCCESS;
+
+        std::string system_lib_path = _lib_path + "libQnnSystem.so";
+        QNN_LOG_DEBUG("system_lib_path:%s\n", system_lib_path.c_str());
+
+        _system_lib_handle = dlopen(system_lib_path.c_str(), RTLD_NOW | RTLD_LOCAL);
+        if (nullptr == _system_lib_handle) {
+            QNN_LOG_WARN("can not open QNN library %s, error: %s\n",
+                         system_lib_path.c_str(), dlerror());
+            return 1;
+        }
+
+        auto* get_providers =
+            reinterpret_cast(
+                dlsym(_system_lib_handle, "QnnSystemInterface_getProviders"));
+        if (nullptr == get_providers) {
+            QNN_LOG_WARN(
+                "can not load QNN symbol QnnSystemInterface_getProviders: %s\n",
+                dlerror());
+            return 2;
+        }
+
+        uint32_t num_providers = 0;
+        const QnnSystemInterface_t** provider_list = nullptr;
+        error = get_providers(&provider_list, &num_providers);
+        if (error != QNN_SUCCESS) {
+            QNN_LOG_WARN("failed to get providers, error %d\n",
+                         QNN_GET_ERROR_CODE(error));
+            return 3;
+        }
+
+        if (num_providers != _required_num_providers) {
+            QNN_LOG_WARN("providers is %d instead of required %d\n", num_providers,
+                         _required_num_providers);
+            return 4;
+        }
+
+        if (nullptr == provider_list) {
+            QNN_LOG_WARN("can not get providers\n");
+            return 5;
+        }
+
+        QNN_SYSTEM_INTERFACE_VER_TYPE qnn_system_interface;
+        bool found_valid_system_interface = false;
+        for (size_t idx = 0; idx < num_providers; idx++) {
+            if (QNN_SYSTEM_API_VERSION_MAJOR ==
+                provider_list[idx]->systemApiVersion.major &&
+                QNN_SYSTEM_API_VERSION_MINOR <=
+                provider_list[idx]->systemApiVersion.minor) {
+                found_valid_system_interface = true;
+                qnn_system_interface =
+                    provider_list[idx]->QNN_SYSTEM_INTERFACE_VER_NAME;
+                break;
+            }
+        }
+        if (!found_valid_system_interface) {
+            QNN_LOG_WARN("unable to find a valid qnn system interface\n");
+            return 6;
+        }
+        else {
+            QNN_LOG_INFO("found a valid qnn system interface\n");
+        }
+        set_qnn_raw_system_interface(qnn_system_interface);
+
+        _qnn_interface.set_qnn_system_interface(provider_list[0]);
+
+        _qnn_interface.qnn_system_context_create(&_qnn_system_handle);
+        if (nullptr == _qnn_system_handle) {
+            QNN_LOG_WARN("can not create QNN system context\n");
+        }
+        else {
+            QNN_LOG_INFO("initialize qnn system successfully\n");
+        }
+
+        return 0;
+    }
+
+    int unload_system() {
+        int result = 0;
+
+        if (nullptr == _system_lib_handle) {
+            QNN_LOG_WARN("system lib handle is null\n");
+
return 1; + } + + if (nullptr != _qnn_system_handle) { + result = _qnn_interface.qnn_system_context_free(_qnn_system_handle); + if (result != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN system context\n"); + } + _qnn_system_handle = nullptr; + } + + int dlclose_error = dlclose(_system_lib_handle); + if (dlclose_error != 0) { + QNN_LOG_WARN("failed to close QnnSystem library, error %s\n", dlerror()); + return 2; + } + + _system_lib_handle = nullptr; + + return result; + } + + int load_backend(std::string& lib_path, const QnnSaver_Config_t** saver_config) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + QNN_LOG_DEBUG("lib_path:%s\n", lib_path.c_str()); + + void* lib_handle = dlopen(lib_path.c_str(), RTLD_NOW | RTLD_GLOBAL); + if (nullptr == lib_handle) { + QNN_LOG_WARN("can not open QNN library %s, with error: %s", lib_path.c_str(), dlerror()); + return 1; + } + + auto get_providers = + load_qnn_functionpointers( + lib_handle, "QnnInterface_getProviders"); + if (nullptr == get_providers) { + QNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s", dlerror()); + return 2; + } + + std::uint32_t num_providers = 0; + const QnnInterface_t** provider_list = nullptr; + error = get_providers(&provider_list, &num_providers); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to get providers, error %d", QNN_GET_ERROR_CODE(error)); + return 3; + } + QNN_LOG_DEBUG("num_providers=%d\n", num_providers); + if (num_providers != _required_num_providers) { + QNN_LOG_WARN("providers is %d instead of required %d", num_providers, _required_num_providers); + return 4; + } + + if (nullptr == provider_list) { + QNN_LOG_WARN("failed to get qnn interface providers\n"); + return 5; + } + bool found_valid_interface = false; + QNN_INTERFACE_VER_TYPE qnn_interface; + for (size_t idx = 0; idx < num_providers; idx++) { + if (QNN_API_VERSION_MAJOR == + provider_list[idx]->apiVersion.coreApiVersion.major && + QNN_API_VERSION_MINOR <= + provider_list[idx]->apiVersion.coreApiVersion.minor) { + found_valid_interface = true; + qnn_interface = provider_list[idx]->QNN_INTERFACE_VER_NAME; + break; + } + } + + if (!found_valid_interface) { + QNN_LOG_WARN("unable to find a valid qnn interface\n"); + return 6; + } + else { + QNN_LOG_INFO("find a valid qnn interface\n"); + } + set_qnn_raw_interface(qnn_interface); + + BackendIdType backend_id = provider_list[0]->backendId; + _lib_path_to_backend_id[lib_path] = backend_id; + if (_loaded_backend.count(backend_id) > 0) { + QNN_LOG_WARN("lib_path %s is loaded, but backend %d already exists\n", lib_path.c_str(), backend_id); + } + _loaded_backend[backend_id] = provider_list[0]; + if (_loaded_lib_handle.count(backend_id) > 0) { + QNN_LOG_WARN("closing %p\n", _loaded_lib_handle[backend_id]); + int dlclose_error = dlclose(_loaded_lib_handle[backend_id]); + if (dlclose_error != 0) { + QNN_LOG_WARN("fail to close %p with error %s\n", _loaded_lib_handle[backend_id], dlerror()); + } + } + _loaded_lib_handle[backend_id] = lib_handle; + _backend_id = backend_id; + + return 0; + } + + int unload_backend() { + int dlclose_error = 0; + for (auto& it : _loaded_lib_handle) { + dlclose_error = dlclose(it.second); + if (dlclose_error != 0) { + QNN_LOG_WARN("failed to close QNN backend %d, error %s\n", it.first, dlerror()); + } + } + + _loaded_lib_handle.clear(); + _lib_path_to_backend_id.clear(); + _loaded_backend.clear(); + + return 0; + } + + void set_qnn_raw_interface(QNN_INTERFACE_VER_TYPE& raw_interface) { + _qnn_raw_interface = raw_interface; + } + + void 
set_qnn_raw_system_interface(QNN_SYSTEM_INTERFACE_VER_TYPE& raw_interface) { + _qnn_raw_system_interface = raw_interface; + } + + private: + static constexpr const int _required_num_providers = 1; + + private: + std::string _lib_path; + std::string _backend_name; + std::string _model_name; // Qualcomm's dedicated prebuilt model name, keep it for further usage + BackendIdType _backend_id; + + bool _debug_tensor = false; + bool _do_node_validations = true; + + QnnLog_Level_t _qnn_log_level = QNN_LOG_LEVEL_DEBUG; + + qnn::sdk_profile_level _profile_level = qnn::sdk_profile_level::profile_detail; + + qnn_interface _qnn_interface; + + void* _system_lib_handle = nullptr; + + Qnn_GraphHandle_t _qnn_graph_handle = nullptr; + + Qnn_LogHandle_t _qnn_log_handle = nullptr; + + Qnn_ProfileHandle_t _qnn_profile_handle = nullptr; + + Qnn_DeviceHandle_t _qnn_device_handle = nullptr; + + Qnn_BackendHandle_t _qnn_backend_handle = nullptr; + + Qnn_ContextHandle_t _qnn_context_handle = nullptr; + + QnnSystemContext_Handle_t _qnn_system_handle = nullptr; + + QnnHtpDevice_PerfInfrastructure_t* _qnn_htp_perfinfra = nullptr; + uint32_t _qnn_power_configid = 1; + + QNN_INTERFACE_VER_TYPE _qnn_raw_interface; + QNN_SYSTEM_INTERFACE_VER_TYPE _qnn_raw_system_interface; + + std::unordered_map _qnn_mem_set; + + std::mutex _init_mutex; + std::unordered_map _loaded_lib_handle; + std::unordered_map _lib_path_to_backend_id; + std::unordered_map _loaded_backend; + + void* _rpc_lib_handle = nullptr; + std::atomic_bool _rpcmem_initialized{ false }; + qnn::pfn_rpc_mem_alloc _pfn_rpc_mem_alloc; + qnn::pfn_rpc_mem_free _pfn_rpc_mem_free; + qnn::pfn_rpc_mem_to_fd _pfn_rpc_mem_to_fd; + qnn::pfn_rpc_mem_init _pfn_rpc_mem_init; + qnn::pfn_rpc_mem_deinit _pfn_rpc_mem_deinit; + std::unordered_map _rpcmem_store_map; + size_t _rpcmem_capacity = 512; + + std::string _graph_name; + }; + +} diff --git a/ggml-qnn/tensor.hpp b/ggml-qnn/tensor.hpp new file mode 100644 index 0000000000000..514061146e840 --- /dev/null +++ b/ggml-qnn/tensor.hpp @@ -0,0 +1,145 @@ + +#pragma once + +#include "QnnTensor.h" +#include "System/QnnSystemInterface.h" + +#include "backend.hpp" +#include "qnn.hpp" + +namespace qnn { + + template class ggml_qnn_tensor_readwrite { + public: + ggml_qnn_tensor_readwrite(const ggml_tensor* tensor, + Qnn_GraphHandle_t graph_handle, + ggml_backend_qnn_context* ctx) + : _tensor(tensor), + _qnn_tensor(reinterpret_cast(tensor->extra)), + _context(ctx) { + _old_dimensions = QNN_VER_PTR(*_qnn_tensor)->dimensions; + const auto qnn_data_type = datatype_from_ggml_datatype(tensor->type); + const bool is_npu = ctx->device == QNN_BACKEND_NPU; + QNN_VER_PTR(*_qnn_tensor)->type = _tensorType; + if (is_npu) { + QNN_VER_PTR(*_qnn_tensor)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; + QNN_VER_PTR(*_qnn_tensor)->clientBuf = { .data = nullptr, .dataSize = 0 }; + } + + auto err = + ctx->raw_interface.tensorCreateGraphTensor(graph_handle, _qnn_tensor); + if (err != QNN_SUCCESS) { + QNN_LOG_INFO("error = %d\n", err); + QNN_LOG_DEBUG("tensor%p name %s", _qnn_tensor, + QNN_TENSOR_GET_NAME(*_qnn_tensor)); + _context = nullptr; + return; + } + + _dimensions[0] = (uint32_t)tensor->ne[0]; + _dimensions[1] = (uint32_t)tensor->ne[1]; + _dimensions[2] = (uint32_t)tensor->ne[2]; + _dimensions[3] = (uint32_t)tensor->ne[3]; + QNN_VER_PTR(*_qnn_tensor)->dimensions = _dimensions; + QNN_VER_PTR(*_qnn_tensor)->rank = qnn::get_ggml_tensor_rank(tensor); + QNN_VER_PTR(*_qnn_tensor)->dataType = qnn_data_type; + + if (is_npu) { + qnn_instance* instance = ctx->instance; + 
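+                // NPU path: back this tensor with ION/rpcmem shared memory registered to QNN instead of pointing clientBuf at the ggml buffer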
uint8_t* qnn_buffer = static_cast( + instance->alloc_rpcmem(ggml_nbytes(tensor), alignof(void*))); + if (!qnn_buffer) { + QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); + QNN_LOG_DEBUG("tensor%p name %s", _qnn_tensor, + QNN_TENSOR_GET_NAME(*_qnn_tensor)); + _context = nullptr; + // No free for _qnn_tensor, because it's not registered. + return; + } + else { + QNN_LOG_INFO("alloc rpcmem successfully\n"); + } + + instance->register_rpcmem(qnn_buffer, _qnn_tensor); + if (_tensorType == QNN_TENSOR_TYPE_APP_WRITE || + _tensorType == QNN_TENSOR_TYPE_APP_READWRITE) { + memcpy(qnn_buffer, tensor->data, ggml_nbytes(tensor)); + } + } + else { + QNN_VER_PTR(*_qnn_tensor)->clientBuf = { + tensor->data, qnn_get_ggml_tensor_data_size(tensor) }; + } + } + + ggml_qnn_tensor_readwrite(const ggml_tensor* tensor, Qnn_Tensor_t* qnn_tensor, + ggml_backend_qnn_context* ctx) + : _tensor(tensor), _qnn_tensor(qnn_tensor), _context(ctx) { + _old_dimensions = QNN_VER_PTR(*_qnn_tensor)->dimensions; + const auto qnn_data_type = qnn_datatype_from_ggml_datatype(tensor->type); + const bool is_npu = ctx->device == QNN_BACKEND_NPU; + + _dimensions[0] = (uint32_t)tensor->ne[0]; + _dimensions[1] = (uint32_t)tensor->ne[1]; + _dimensions[2] = (uint32_t)tensor->ne[2]; + _dimensions[3] = (uint32_t)tensor->ne[3]; + QNN_VER_PTR(*_qnn_tensor)->dimensions = _dimensions; + QNN_VER_PTR(*_qnn_tensor)->rank = qnn_get_ggml_tensor_rank(tensor); + QNN_VER_PTR(*_qnn_tensor)->dataType = qnn_data_type; + + if (is_npu) { + uint8_t* qnn_buffer = + static_cast(ctx->instance->get_rpcmem_from_memhandle( + QNN_VER_PTR(*_qnn_tensor)->memHandle)); + if (qnn_buffer) { + memcpy(qnn_buffer, tensor->data, ggml_nbytes(tensor)); + } + else { + QNN_LOG_WARN("can't find rpcmem from qnn mem handle\n"); + QNN_LOG_DEBUG("tensor%p name %s", _qnn_tensor, + QNN_TENSOR_GET_NAME(*_qnn_tensor)); + _context = nullptr; + return; + } + } + else { + QNN_VER_PTR(*_qnn_tensor)->clientBuf = { + tensor->data, qnn_get_ggml_tensor_data_size(tensor) }; + } + } + + ~ggml_qnn_tensor_readwrite() { + if ((_tensorType == QNN_TENSOR_TYPE_APP_READWRITE || + _tensorType == QNN_TENSOR_TYPE_APP_READ) && + _context && _context->device == QNN_BACKEND_NPU) { + uint8_t* qnn_buffer = + static_cast(_context->instance->get_rpcmem_from_memhandle( + QNN_VER_PTR(*_qnn_tensor)->memHandle)); + memcpy(_tensor->data, qnn_buffer, ggml_nbytes(_tensor)); + } + + QNN_VER_PTR(*_qnn_tensor)->dimensions = _old_dimensions; + } + + bool is_valid() const { return _context; } + Qnn_Tensor_t* get_qnn_tensor() const { return _qnn_tensor; } + + private: + const ggml_tensor* _tensor; + Qnn_Tensor_t* _qnn_tensor; + ggml_backend_qnn_context* _context; + uint32_t* _old_dimensions; + uint32_t _dimensions[4] = {}; + + ggml_qnn_tensor_readwrite(const ggml_qnn_tensor_readwrite&) = delete; + void operator=(const ggml_qnn_tensor_readwrite&) = delete; + ggml_qnn_tensor_readwrite(ggml_qnn_tensor_readwrite&&) = delete; + void operator=(ggml_qnn_tensor_readwrite&&) = delete; + }; + + using ggml_qnn_tensor_output = + ggml_qnn_tensor_readwrite; + using ggml_qnn_tensor_input = + ggml_qnn_tensor_readwrite; + +} // namespace qnn diff --git a/ggml-qnn/utils.hpp b/ggml-qnn/utils.hpp new file mode 100644 index 0000000000000..4141c4e33c79d --- /dev/null +++ b/ggml-qnn/utils.hpp @@ -0,0 +1,99 @@ +#pragma once + +#include "QnnTypes.h" + +#include "ggml.h" + +#include "qnn-types.hpp" + +namespace qnn { + + // TODO: mapping more ggml data type to QNN data type + // ref:explanation of k-quants, 
https://github.com/ggerganov/llama.cpp/pull/1684
+    Qnn_DataType_t datatype_from_ggml_datatype(enum ggml_type ggmltype) {
+        switch (ggmltype) {
+            case GGML_TYPE_F16:
+                return QNN_DATATYPE_FLOAT_16;
+            case GGML_TYPE_F32:
+                return QNN_DATATYPE_FLOAT_32;
+            case GGML_TYPE_I8:
+                return QNN_DATATYPE_INT_8;
+            case GGML_TYPE_Q8_0:
+                return QNN_DATATYPE_SFIXED_POINT_8;
+            case GGML_TYPE_Q4_0:
+                return QNN_DATATYPE_SFIXED_POINT_4;
+            default:
+                break;
+        }
+        return QNN_DATATYPE_UNDEFINED;
+    }
+
+
+    uint32_t get_ggml_tensor_rank(const ggml_tensor* tensor) {
+        uint32_t rank = 0;
+        for (int i = 0; i < GGML_MAX_DIMS; i++) {
+            if ((0 != tensor->ne[i]) && (1 != tensor->ne[i])) {
+                rank++;
+            }
+        }
+        return rank;
+    }
+
+
+    const char* get_backend_name(int n_backend_type) {
+        switch (n_backend_type) {
+            case QNN_BACKEND_CPU:
+                return "QNN-CPU";
+            case QNN_BACKEND_GPU:
+                return "QNN-GPU";
+            case QNN_BACKEND_NPU:
+                return "QNN-NPU";
+            case QNN_BACKEND_GGML:
+                return "ggml"; // "fake" QNN backend, used to compare performance between the QNN backend and the original GGML
+            default:
+                return "unknown";
+        }
+    }
+
+    const char* get_chipset_desc(uint32_t chipset_id) {
+        switch (chipset_id) {
+            case SM8450:
+                return "SM8450";
+            case SM8475:
+                return "SM8475";
+            case SM8550:
+                return "SM8550";
+            case SM8650:
+                return "SM8650";
+            default:
+                return "unknown";
+        }
+    }
+
+    const char* get_htparch_desc(size_t htp_arch) {
+        switch (htp_arch) {
+            case V68:
+                return "QCOM_HTP_V68";
+            case V69:
+                return "QCOM_HTP_V69";
+            case V73:
+                return "QCOM_HTP_V73";
+            case V75:
+                return "QCOM_HTP_V75";
+            default:
+                return "unknown";
+        }
+    }
+
+    template <typename Fn> Fn load_qnn_functionpointers(void* handle, const char* function_name) {
+        return reinterpret_cast<Fn>(dlsym(handle, function_name));
+    }
+
+    intptr_t align_to(size_t alignment, intptr_t offset) {
+        return offset % alignment == 0
+                   ?
offset + : offset + (static_cast(alignment) - + offset % static_cast(alignment)); + } + +} diff --git a/tests/ggml-qnn/CMakeLists.txt b/tests/ggml-qnn/CMakeLists.txt index bf061e6c7c3a1..77a2059ed0f0c 100644 --- a/tests/ggml-qnn/CMakeLists.txt +++ b/tests/ggml-qnn/CMakeLists.txt @@ -20,6 +20,7 @@ set(SOURCE_FILES ../../ggml-alloc.c ../../ggml-backend.c ../../ggml-quants.c + ../../ggml-qnn/logger.cpp ../../ggml-qnn.cpp ggml-qnn-ut.cpp ) From 3c491a32634cafa011fed756783a8fe655d988cc Mon Sep 17 00:00:00 2001 From: hongruichen Date: Wed, 19 Jun 2024 14:43:22 +0800 Subject: [PATCH 028/143] remove reference of g_qnn_mgr in qnn_instance --- ggml-qnn.cpp | 1 + ggml-qnn/qnn.hpp | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp index a552fd5ec935e..b59126067595c 100644 --- a/ggml-qnn.cpp +++ b/ggml-qnn.cpp @@ -1743,6 +1743,7 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { g_qnn_mgr[device].instance = instance; g_qnn_mgr[device].raw_interface = instance->get_qnn_raw_interface(); g_qnn_mgr[device].raw_system_interface = instance->get_qnn_raw_system_interface(); + g_qnn_mgr[device].socinfo = instance->get_soc_info(); ggml_backend_t qnn_backend = new ggml_backend{/* .guid = */ ggml_backend_qnn_guid(), diff --git a/ggml-qnn/qnn.hpp b/ggml-qnn/qnn.hpp index bd83a9f05e946..15df7dcbbe300 100644 --- a/ggml-qnn/qnn.hpp +++ b/ggml-qnn/qnn.hpp @@ -262,7 +262,7 @@ namespace qnn_internal { QNN_LOG_INFO("qualcomm soc_model:%d(%s), htp_arch:%d(%s), vtcm_size:%d MB", chipinfo.socModel, qnn::get_chipset_desc(chipinfo.socModel), htp_arch, qnn::get_htparch_desc(htp_arch), chipinfo.vtcmSize); - g_qnn_mgr[QNN_BACKEND_NPU].socinfo = { chipinfo.socModel, htp_arch, chipinfo.vtcmSize }; + _soc_info = { chipinfo.socModel, htp_arch, chipinfo.vtcmSize }; } _qnn_raw_interface.deviceFreePlatformInfo(nullptr, p_info); @@ -864,6 +864,7 @@ namespace qnn_internal { return _qnn_mem_set.count(buf) != 0U; } + const qnn::qcom_socinfo &get_soc_info() { return _soc_info; } public: std::map Date: Wed, 19 Jun 2024 14:47:41 +0800 Subject: [PATCH 029/143] fix compiling error --- ggml-qnn.cpp | 84 ++++++++++++++++-------------------------- ggml-qnn/backend.hpp | 2 +- ggml-qnn/qnn-types.hpp | 23 +++++++++--- ggml-qnn/qnn.hpp | 9 ++--- ggml-qnn/tensor.hpp | 10 ++--- ggml-qnn/utils.hpp | 13 +++++++ 6 files changed, 72 insertions(+), 69 deletions(-) diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp index b59126067595c..fdbcbdafb6641 100644 --- a/ggml-qnn.cpp +++ b/ggml-qnn.cpp @@ -70,27 +70,27 @@ typedef void (*ggml_qnn_func_t)(ggml_backend_qnn_context * ctx, static struct qnn::qcom_socinfo g_qnn_soc_info_table[] = { /* Qualcomm SnapDragon 8 Gen 1 */ - [SM8450] = { - .soc_model = SM8450, - .htp_arch = V69, + [qnn::SM8450] = { + .soc_model = qnn::SM8450, + .htp_arch = qnn::V69, .vtcm_size_in_mb = 8}, /* Qualcomm SnapDragon 8 Gen 1+ */ - [SM8475] = { - .soc_model = SM8475, - .htp_arch = V69, + [qnn::SM8475] = { + .soc_model = qnn::SM8475, + .htp_arch = qnn::V69, .vtcm_size_in_mb = 8}, /* Qualcomm SnapDragon 8 Gen 2 */ - [SM8550] = { - .soc_model = SM8550, - .htp_arch = V73, + [qnn::SM8550] = { + .soc_model = qnn::SM8550, + .htp_arch = qnn::V73, .vtcm_size_in_mb = 8}, /* Qualcomm SnapDragon 8 Gen 3 */ - [SM8650] = { - .soc_model = SM8650, - .htp_arch = V75, + [qnn::SM8650] = { + .soc_model = qnn::SM8650, + .htp_arch = qnn::V75, .vtcm_size_in_mb = 8}, }; @@ -198,19 +198,6 @@ static const char * qnn_opname_from_ggmlop(enum ggml_op ggmlop) { return nullptr; } -static uint32_t 
qnn_get_ggml_tensor_data_size(const ggml_tensor * tensor) { - /* - size_t data_size = ggml_row_size(tensor->type, tensor->ne[0]); - size_t n_dims = qnn_get_ggml_tensor_rank(tensor); - for (int i = 1; i < n_dims; i++) { - data_size *= tensor->ne[i]; - } - - return data_size; - */ - return ggml_nbytes(tensor); -} - static bool qnn_is_valid_params(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { if ((nullptr == ctx) || (nullptr == src0) || (nullptr == src1) || (nullptr == dst)) { @@ -218,10 +205,10 @@ static bool qnn_is_valid_params(ggml_backend_qnn_context * ctx, const ggml_tenso return false; } - qnn_instance * instance = nullptr; - Qnn_Tensor_t * tensor_0 = nullptr; - Qnn_Tensor_t * tensor_1 = nullptr; - Qnn_Tensor_t * tensor_2 = nullptr; + qnn_internal::qnn_instance *instance = nullptr; + Qnn_Tensor_t *tensor_0 = nullptr; + Qnn_Tensor_t *tensor_1 = nullptr; + Qnn_Tensor_t *tensor_2 = nullptr; tensor_0 = (Qnn_Tensor_t *) src0->extra; tensor_1 = (Qnn_Tensor_t *) src1->extra; tensor_2 = (Qnn_Tensor_t *) dst->extra; @@ -283,13 +270,6 @@ class qnn_perf { }; #endif -using pfn_qnnsaver_initialize = decltype(QnnSaver_initialize); -using pfn_qnninterface_getproviders = decltype(QnnInterface_getProviders); -using pfn_qnnsysteminterface_getproviders = decltype(QnnSystemInterface_getProviders); - -#define RPCMEM_DEFAULT_FLAGS 1 -#define RPCMEM_HEAP_ID_SYSTEM 25 - #define VALIDATE(value, status) \ do { \ status = value; \ @@ -625,11 +605,11 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src const ggml_tensor * src1, ggml_tensor * dst) { Qnn_ErrorHandle_t error = QNN_SUCCESS; bool graph_initialized = false; - qnn_instance * instance = nullptr; - std::string graph_name = "ggml_op_qnn_add"; - Qnn_GraphHandle_t graph_handle = nullptr; - Qnn_Param_t qnn_params[] = {}; - enum ggml_op ggmlop = GGML_OP_ADD; + qnn_internal::qnn_instance *instance = nullptr; + std::string graph_name = "ggml_op_qnn_add"; + Qnn_GraphHandle_t graph_handle = nullptr; + Qnn_Param_t qnn_params[] = {}; + enum ggml_op ggmlop = GGML_OP_ADD; CHECK_PARAMS(ctx, src0, src1, dst); instance = ctx->instance; @@ -817,13 +797,13 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - bool graph_initialized = false; - qnn_instance * instance = nullptr; - std::string graph_name = "ggml_op_qnn_mul_mat"; - Qnn_GraphHandle_t graph_handle = nullptr; - Qnn_Param_t qnn_params[] = {}; - enum ggml_op ggmlop = GGML_OP_MUL_MAT; + Qnn_ErrorHandle_t error = QNN_SUCCESS; + bool graph_initialized = false; + qnn_internal::qnn_instance *instance = nullptr; + std::string graph_name = "ggml_op_qnn_mul_mat"; + Qnn_GraphHandle_t graph_handle = nullptr; + Qnn_Param_t qnn_params[] = {}; + enum ggml_op ggmlop = GGML_OP_MUL_MAT; CHECK_PARAMS(ctx, src0, src1, dst); instance = ctx->instance; @@ -1492,8 +1472,9 @@ GGML_CALL static void ggml_backend_qnn_free(ggml_backend_t backend) { ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; QNN_LOG_INFO("idx %d, name:%s", ctx->device, g_qnn_mgr[ctx->device].name); - qnn_instance * instance = (qnn_instance *)g_qnn_mgr[ctx->device].instance; + auto *instance = g_qnn_mgr[ctx->device].instance; if (instance != nullptr) { + // TODO: this should be done inside the destructor std::map>::iterator graph_it; @@ -1721,9 
+1702,8 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { } } - qnn_instance * instance = nullptr; - instance = new qnn_instance(qnn_lib_path, g_qnn_mgr[device].lib, ""); - result = instance->qnn_init(nullptr); + auto *instance = new qnn_internal::qnn_instance(qnn_lib_path, g_qnn_mgr[device].lib, ""); + result = instance->qnn_init(nullptr); if (0 != result) { QNN_LOG_WARN( "init qnn subsystem failed with qnn backend %s, pls check why\n", diff --git a/ggml-qnn/backend.hpp b/ggml-qnn/backend.hpp index 3a624eab050ac..fd40d8ad24066 100644 --- a/ggml-qnn/backend.hpp +++ b/ggml-qnn/backend.hpp @@ -20,5 +20,5 @@ struct ggml_backend_qnn_context { struct ggml_backend* backend; QNN_INTERFACE_VER_TYPE raw_interface; QNN_SYSTEM_INTERFACE_VER_TYPE raw_system_interface; - struct qcom_socinfo socinfo; + qnn::qcom_socinfo socinfo; }; diff --git a/ggml-qnn/qnn-types.hpp b/ggml-qnn/qnn-types.hpp index 33f468eb796d1..db1d592f08a20 100644 --- a/ggml-qnn/qnn-types.hpp +++ b/ggml-qnn/qnn-types.hpp @@ -1,6 +1,12 @@ #pragma once +#include "QnnTypes.h" +#include "QnnCommon.h" +#include "QnnInterface.h" +#include "Saver/QnnSaver.h" +#include "System/QnnSystemInterface.h" + namespace qnn { // ================================================================================================= // @@ -30,17 +36,24 @@ namespace qnn { SM8650 = 57, // v75 }; + struct qcom_socinfo { + uint32_t soc_model; + size_t htp_arch; + size_t vtcm_size_in_mb; + }; + using pfn_rpc_mem_init = void (*)(void); using pfn_rpc_mem_deinit = void (*)(void); using pfn_rpc_mem_alloc = void* (*) (int, uint32_t, int); using pfn_rpc_mem_free = void (*)(void*); using pfn_rpc_mem_to_fd = int (*)(void*); - struct qcom_socinfo { - uint32_t soc_model; - size_t htp_arch; - size_t vtcm_size_in_mb; - }; + using pfn_qnnsaver_initialize = decltype(QnnSaver_initialize); + using pfn_qnninterface_getproviders = decltype(QnnInterface_getProviders); + using pfn_qnnsysteminterface_getproviders = decltype(QnnSystemInterface_getProviders); } #define QNN_VER_PTR(x) (&((x).v1)) // TODO: remove this macro after we have a separate header for QNN + +#define RPCMEM_DEFAULT_FLAGS 1 +#define RPCMEM_HEAP_ID_SYSTEM 25 diff --git a/ggml-qnn/qnn.hpp b/ggml-qnn/qnn.hpp index 15df7dcbbe300..8d8ab72b446e3 100644 --- a/ggml-qnn/qnn.hpp +++ b/ggml-qnn/qnn.hpp @@ -11,9 +11,6 @@ #include "QnnGraph.h" #include "QnnProperty.h" #include "QnnTensor.h" -#include "QnnInterface.h" -#include "Saver/QnnSaver.h" -#include "System/QnnSystemInterface.h" #include "HTP/QnnHtpDevice.h" #include "HTP/QnnHtpGraph.h" @@ -864,7 +861,7 @@ namespace qnn_internal { return _qnn_mem_set.count(buf) != 0U; } - const qnn::qcom_socinfo &get_soc_info() { return _soc_info; } + const qnn::qcom_socinfo& get_soc_info() { return _soc_info; } public: std::map( + reinterpret_cast( dlsym(_system_lib_handle, "QnnSystemInterface_getProviders")); if (nullptr == get_providers) { QNN_LOG_WARN( @@ -988,7 +985,7 @@ namespace qnn_internal { } auto get_providers = - load_qnn_functionpointers( + qnn::load_qnn_functionpointers( lib_handle, "QnnInterface_getProviders"); if (nullptr == get_providers) { QNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s", dlerror()); diff --git a/ggml-qnn/tensor.hpp b/ggml-qnn/tensor.hpp index 514061146e840..687ebd8905ef4 100644 --- a/ggml-qnn/tensor.hpp +++ b/ggml-qnn/tensor.hpp @@ -45,7 +45,7 @@ namespace qnn { QNN_VER_PTR(*_qnn_tensor)->dataType = qnn_data_type; if (is_npu) { - qnn_instance* instance = ctx->instance; + auto* instance = ctx->instance; 
uint8_t* qnn_buffer = static_cast( instance->alloc_rpcmem(ggml_nbytes(tensor), alignof(void*))); if (!qnn_buffer) { @@ -68,7 +68,7 @@ namespace qnn { } else { QNN_VER_PTR(*_qnn_tensor)->clientBuf = { - tensor->data, qnn_get_ggml_tensor_data_size(tensor) }; + tensor->data, get_ggml_tensor_data_size(tensor) }; } } @@ -76,7 +76,7 @@ namespace qnn { ggml_backend_qnn_context* ctx) : _tensor(tensor), _qnn_tensor(qnn_tensor), _context(ctx) { _old_dimensions = QNN_VER_PTR(*_qnn_tensor)->dimensions; - const auto qnn_data_type = qnn_datatype_from_ggml_datatype(tensor->type); + const auto qnn_data_type = qnn::datatype_from_ggml_datatype(tensor->type); const bool is_npu = ctx->device == QNN_BACKEND_NPU; _dimensions[0] = (uint32_t)tensor->ne[0]; @@ -84,7 +84,7 @@ namespace qnn { _dimensions[2] = (uint32_t)tensor->ne[2]; _dimensions[3] = (uint32_t)tensor->ne[3]; QNN_VER_PTR(*_qnn_tensor)->dimensions = _dimensions; - QNN_VER_PTR(*_qnn_tensor)->rank = qnn_get_ggml_tensor_rank(tensor); + QNN_VER_PTR(*_qnn_tensor)->rank = get_ggml_tensor_rank(tensor); QNN_VER_PTR(*_qnn_tensor)->dataType = qnn_data_type; if (is_npu) { @@ -104,7 +104,7 @@ namespace qnn { } else { QNN_VER_PTR(*_qnn_tensor)->clientBuf = { - tensor->data, qnn_get_ggml_tensor_data_size(tensor) }; + tensor->data, get_ggml_tensor_data_size(tensor) }; } } diff --git a/ggml-qnn/utils.hpp b/ggml-qnn/utils.hpp index 4141c4e33c79d..c952e8bc298c6 100644 --- a/ggml-qnn/utils.hpp +++ b/ggml-qnn/utils.hpp @@ -96,4 +96,17 @@ namespace qnn { offset % static_cast(alignment)); } + uint32_t get_ggml_tensor_data_size(const ggml_tensor* tensor) { + /* + size_t data_size = ggml_row_size(tensor->type, tensor->ne[0]); + size_t n_dims = qnn_get_ggml_tensor_rank(tensor); + for (int i = 1; i < n_dims; i++) { + data_size *= tensor->ne[i]; + } + + return data_size; + */ + return ggml_nbytes(tensor); + } + } From 37a1585eade7b45b4a0ce01f50b5640ec22f0418 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Wed, 19 Jun 2024 17:36:50 +0800 Subject: [PATCH 030/143] rename --- ggml-qnn.cpp | 38 +++++++++++++++++++------------------- ggml-qnn/backend.hpp | 2 +- ggml-qnn/qnn.hpp | 2 +- 3 files changed, 21 insertions(+), 21 deletions(-) diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp index fdbcbdafb6641..b9599293ba177 100644 --- a/ggml-qnn.cpp +++ b/ggml-qnn.cpp @@ -205,10 +205,10 @@ static bool qnn_is_valid_params(ggml_backend_qnn_context * ctx, const ggml_tenso return false; } - qnn_internal::qnn_instance *instance = nullptr; - Qnn_Tensor_t *tensor_0 = nullptr; - Qnn_Tensor_t *tensor_1 = nullptr; - Qnn_Tensor_t *tensor_2 = nullptr; + qnn::qnn_instance *instance = nullptr; + Qnn_Tensor_t *tensor_0 = nullptr; + Qnn_Tensor_t *tensor_1 = nullptr; + Qnn_Tensor_t *tensor_2 = nullptr; tensor_0 = (Qnn_Tensor_t *) src0->extra; tensor_1 = (Qnn_Tensor_t *) src1->extra; tensor_2 = (Qnn_Tensor_t *) dst->extra; @@ -603,13 +603,13 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context * ctx, // keep it for illustrate how to implement a specified GGMPL OP using QNN API + QNN RPC static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - bool graph_initialized = false; - qnn_internal::qnn_instance *instance = nullptr; - std::string graph_name = "ggml_op_qnn_add"; - Qnn_GraphHandle_t graph_handle = nullptr; - Qnn_Param_t qnn_params[] = {}; - enum ggml_op ggmlop = GGML_OP_ADD; + Qnn_ErrorHandle_t error = QNN_SUCCESS; + bool graph_initialized = false; + qnn::qnn_instance *instance = 
nullptr; + std::string graph_name = "ggml_op_qnn_add"; + Qnn_GraphHandle_t graph_handle = nullptr; + Qnn_Param_t qnn_params[] = {}; + enum ggml_op ggmlop = GGML_OP_ADD; CHECK_PARAMS(ctx, src0, src1, dst); instance = ctx->instance; @@ -797,13 +797,13 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - bool graph_initialized = false; - qnn_internal::qnn_instance *instance = nullptr; - std::string graph_name = "ggml_op_qnn_mul_mat"; - Qnn_GraphHandle_t graph_handle = nullptr; - Qnn_Param_t qnn_params[] = {}; - enum ggml_op ggmlop = GGML_OP_MUL_MAT; + Qnn_ErrorHandle_t error = QNN_SUCCESS; + bool graph_initialized = false; + qnn::qnn_instance *instance = nullptr; + std::string graph_name = "ggml_op_qnn_mul_mat"; + Qnn_GraphHandle_t graph_handle = nullptr; + Qnn_Param_t qnn_params[] = {}; + enum ggml_op ggmlop = GGML_OP_MUL_MAT; CHECK_PARAMS(ctx, src0, src1, dst); instance = ctx->instance; @@ -1702,7 +1702,7 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { } } - auto *instance = new qnn_internal::qnn_instance(qnn_lib_path, g_qnn_mgr[device].lib, ""); + auto *instance = new qnn::qnn_instance(qnn_lib_path, g_qnn_mgr[device].lib, ""); result = instance->qnn_init(nullptr); if (0 != result) { QNN_LOG_WARN( diff --git a/ggml-qnn/backend.hpp b/ggml-qnn/backend.hpp index fd40d8ad24066..1f674103d29ac 100644 --- a/ggml-qnn/backend.hpp +++ b/ggml-qnn/backend.hpp @@ -16,7 +16,7 @@ struct ggml_backend_qnn_context { int threads; char name[GGML_MAX_NAME]; char lib[GGML_MAX_NAME]; - qnn_internal::qnn_instance* instance; + qnn::qnn_instance* instance; struct ggml_backend* backend; QNN_INTERFACE_VER_TYPE raw_interface; QNN_SYSTEM_INTERFACE_VER_TYPE raw_system_interface; diff --git a/ggml-qnn/qnn.hpp b/ggml-qnn/qnn.hpp index 8d8ab72b446e3..212b6f8521745 100644 --- a/ggml-qnn/qnn.hpp +++ b/ggml-qnn/qnn.hpp @@ -17,7 +17,7 @@ #include "utils.hpp" #include "logger.hpp" -namespace qnn_internal { +namespace qnn { // ================================================================================================= // From ff0359d6f4190c97668680f7ac27477c6d0a21af Mon Sep 17 00:00:00 2001 From: hongruichen Date: Wed, 19 Jun 2024 18:16:11 +0800 Subject: [PATCH 031/143] move qnn helper function into utility files --- ggml-qnn.cpp | 194 +----------------------------------------- ggml-qnn/backend.hpp | 14 ++-- ggml-qnn/utils.hpp | 196 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 205 insertions(+), 199 deletions(-) diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp index b9599293ba177..3a667a1970aba 100644 --- a/ggml-qnn.cpp +++ b/ggml-qnn.cpp @@ -183,21 +183,6 @@ struct ggml_backend_qnn_buffer_type_context { // QNN backend internal helper functions // // ================================================================================================= -// TODO: only support GGML_OP_ADD/GGML_OP_MUL/GGML_OP_MUL_MAT -static const char * qnn_opname_from_ggmlop(enum ggml_op ggmlop) { - switch (ggmlop) { - case GGML_OP_ADD: - return QNN_OP_ELEMENT_WISE_ADD; - case GGML_OP_MUL: - return QNN_OP_ELEMENT_WISE_MULTIPLY; - case GGML_OP_MUL_MAT: - return QNN_OP_MAT_MUL; - default: - break; - } - return nullptr; -} - static bool qnn_is_valid_params(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { if ((nullptr == ctx) || (nullptr == src0) || 
(nullptr == src1) || (nullptr == dst)) { @@ -270,181 +255,6 @@ class qnn_perf { }; #endif -#define VALIDATE(value, status) \ - do { \ - status = value; \ - if (status != QNN_SUCCESS) { \ - QNN_LOG_WARN("%s expected QNN_SUCCESS\n", #value); \ - return status; \ - } \ - } while (0) - -#define QNN_TENSOR_GET_ID(tensor) get_qnn_tensorid(tensor) -#define QNN_TENSOR_GET_NAME(tensor) get_qnn_tensorname(tensor) -#define QNN_TENSOR_GET_TYPE(tensor) get_qnn_tensortype(tensor) -#define QNN_TENSOR_GET_DATA_FORMAT(tensor) get_qnn_tensor_dataformat(tensor) -#define QNN_TENSOR_GET_DATA_TYPE(tensor) get_qnn_tensor_datatype(tensor) -#define QNN_TENSOR_GET_QUANT_PARAMS(tensor) get_qnn_tensor_quantparams(tensor) -#define QNN_TENSOR_GET_RANK(tensor) get_qnn_tensor_rank(tensor) -#define QNN_TENSOR_GET_DIMENSIONS(tensor) get_qnn_tensor_dimensions(tensor) -#define QNN_TENSOR_GET_MEM_TYPE(tensor) get_qnn_tensor_memtype(tensor) - -#define QNN_TENSOR_SET_ID(tensor, value) set_qnn_tensor_id(tensor, value) -#define QNN_TENSOR_SET_NAME(tensor, value) set_qnn_tensor_name(tensor, value) -#define QNN_TENSOR_SET_TYPE(tensor, value) set_qnn_tensor_type(tensor, value) -#define QNN_TENSOR_SET_DATA_FORMAT(tensor, value) set_qnn_tensor_dataformat(tensor, value) -#define QNN_TENSOR_SET_DATA_TYPE(tensor, value) set_qnn_tensor_datatype(tensor, value) -#define QNN_TENSOR_SET_QUANT_PARAMS(tensor, value) set_qnn_tensor_quantparams(tensor, value) -#define QNN_TENSOR_SET_RANK(tensor, value) set_qnn_tensor_rank(tensor, value) -#define QNN_TENSOR_SET_DIMENSIONS(tensor, value) set_qnn_tensor_dimensions(tensor, value) -#define QNN_TENSOR_SET_MEM_TYPE(tensor, value) set_qnn_tensor_memtype(tensor, value) -#define QNN_TENSOR_SET_CLIENT_BUF(tensor, value) set_qnn_tensor_clientbuf(tensor, value) -#define QNN_TENSOR_SET_MEM_HANDLE(tensor, value) set_qnn_tensor_memhandle(tensor, value) -#define VALIDATE_TENSOR_VERSION(tensor, err) VALIDATE(validate_tensor_version(tensor), err) - -static inline int validate_tensor_version(Qnn_Tensor_t tensor) { - if (tensor.version != QNN_TENSOR_VERSION_1) { - QNN_LOG_WARN( - "validate_tensor_version() tensor %s, got unsupported version %d\n", - tensor.v1.name, tensor.version); - return 1; - } - return 0; -} - -static inline uint32_t get_qnn_tensorid(const Qnn_Tensor_t & tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.id; - } - - return 0u; -} - -static inline const char * get_qnn_tensorname(const Qnn_Tensor_t & tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.name; - } - return nullptr; -} - -static inline Qnn_TensorType_t get_qnn_tensortype(const Qnn_Tensor_t & tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.type; - } - return QNN_TENSOR_TYPE_UNDEFINED; -} - -static inline Qnn_TensorDataFormat_t - get_qnn_tensor_dataformat(const Qnn_Tensor_t & tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.dataFormat; - } - return QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER; -} - -static inline Qnn_DataType_t - get_qnn_tensor_datatype(const Qnn_Tensor_t & tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.dataType; - } - return QNN_DATATYPE_UNDEFINED; -} - -static inline Qnn_QuantizeParams_t - get_qnn_tensor_quantparams(const Qnn_Tensor_t & tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.quantizeParams; - } - return QNN_QUANTIZE_PARAMS_INIT; -} - -static inline uint32_t get_qnn_tensor_rank(const Qnn_Tensor_t & tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - 
return tensor.v1.rank; - } - return 0u; -} - -static inline uint32_t * get_qnn_tensor_dimensions(const Qnn_Tensor_t & tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.dimensions; - } - return nullptr; -} - -static inline Qnn_TensorMemType_t get_qnn_tensor_memtype(const Qnn_Tensor_t & tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.memType; - } - return QNN_TENSORMEMTYPE_UNDEFINED; -} - -static inline void set_qnn_tensor_id(Qnn_Tensor_t & tensor, uint32_t id) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.id = id; - } -} - -static inline void set_qnn_tensor_name(Qnn_Tensor_t & tensor, const char * name) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.name = name; - } -} - -static inline void set_qnn_tensor_type(Qnn_Tensor_t & tensor, Qnn_TensorType_t type) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.type = type; - } -} - -static inline void set_qnn_tensor_dataformat(Qnn_Tensor_t & tensor, Qnn_TensorDataFormat_t format) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.dataFormat = format; - } -} - -static inline void set_qnn_tensor_datatype(Qnn_Tensor_t & tensor, Qnn_DataType_t dataType) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.dataType = dataType; - } -} - -static inline void set_qnn_tensor_quantparams(Qnn_Tensor_t & tensor, Qnn_QuantizeParams_t params) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.quantizeParams = params; - } -} - -static inline void set_qnn_tensor_rank(Qnn_Tensor_t & tensor, uint32_t rank) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.rank = rank; - } -} - -static inline void set_qnn_tensor_dimensions(Qnn_Tensor_t & tensor, uint32_t * dims) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.dimensions = dims; - } -} - -static inline void set_qnn_tensor_memtype(Qnn_Tensor_t & tensor, Qnn_TensorMemType_t mem_type) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.memType = mem_type; - } -} - -static inline void set_qnn_tensor_clientbuf(Qnn_Tensor_t & tensor, Qnn_ClientBuffer_t client_buf) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.clientBuf = client_buf; - } -} - -static inline void set_qnn_tensor_memhandle(Qnn_Tensor_t & tensor, Qnn_MemHandle_t handle) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.memHandle = handle; - } -} - static size_t memscpy(void * dst, size_t dst_size, const void * src, size_t copy_size) { if (!dst || !src || !dst_size || !copy_size) return 0; @@ -613,7 +423,7 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src CHECK_PARAMS(ctx, src0, src1, dst); instance = ctx->instance; - QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; + auto qnn_raw_interface = ctx->raw_interface; qnn_perf perf("ggml_qnn_add"); perf.start(); @@ -807,7 +617,7 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, CHECK_PARAMS(ctx, src0, src1, dst); instance = ctx->instance; - QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; + auto qnn_raw_interface = ctx->raw_interface; qnn_perf perf("ggml_qnn_mul_mat"); perf.start(); diff --git a/ggml-qnn/backend.hpp b/ggml-qnn/backend.hpp index 1f674103d29ac..b5aacf57c1aa0 100644 --- a/ggml-qnn/backend.hpp +++ b/ggml-qnn/backend.hpp @@ -12,13 +12,13 @@ #include "qnn.hpp" struct ggml_backend_qnn_context { - int device; - int threads; - char name[GGML_MAX_NAME]; - char lib[GGML_MAX_NAME]; + int device; + int threads; + char name[GGML_MAX_NAME]; + char 
lib[GGML_MAX_NAME]; qnn::qnn_instance* instance; - struct ggml_backend* backend; - QNN_INTERFACE_VER_TYPE raw_interface; + ggml_backend* backend; + QNN_INTERFACE_VER_TYPE raw_interface; QNN_SYSTEM_INTERFACE_VER_TYPE raw_system_interface; - qnn::qcom_socinfo socinfo; + qnn::qcom_socinfo socinfo; }; diff --git a/ggml-qnn/utils.hpp b/ggml-qnn/utils.hpp index c952e8bc298c6..2ec7c0f13f0ce 100644 --- a/ggml-qnn/utils.hpp +++ b/ggml-qnn/utils.hpp @@ -109,4 +109,200 @@ namespace qnn { return ggml_nbytes(tensor); } + + // ================================================================================================= + // + // QNN backend internal helper functions + // + // ================================================================================================= + // TODO: only support GGML_OP_ADD/GGML_OP_MUL/GGML_OP_MUL_MAT + const char* opname_from_ggmlop(enum ggml_op ggmlop) { + switch (ggmlop) { + case GGML_OP_ADD: + return QNN_OP_ELEMENT_WISE_ADD; + case GGML_OP_MUL: + return QNN_OP_ELEMENT_WISE_MULTIPLY; + case GGML_OP_MUL_MAT: + return QNN_OP_MAT_MUL; + default: + break; + } + return nullptr; + } + + inline int validate_tensor_version(Qnn_Tensor_t tensor) { + if (tensor.version != QNN_TENSOR_VERSION_1) { + QNN_LOG_WARN( + "validate_tensor_version() tensor %s, got unsupported version %d\n", + tensor.v1.name, tensor.version); + return 1; + } + return 0; + } + + inline uint32_t get_qnn_tensorid(const Qnn_Tensor_t& tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.id; + } + + return 0u; + } + + inline const char* get_qnn_tensorname(const Qnn_Tensor_t& tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.name; + } + return nullptr; + } + + inline Qnn_TensorType_t get_qnn_tensortype(const Qnn_Tensor_t& tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.type; + } + return QNN_TENSOR_TYPE_UNDEFINED; + } + + inline Qnn_TensorDataFormat_t + get_qnn_tensor_dataformat(const Qnn_Tensor_t& tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.dataFormat; + } + return QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER; + } + + inline Qnn_DataType_t + get_qnn_tensor_datatype(const Qnn_Tensor_t& tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.dataType; + } + return QNN_DATATYPE_UNDEFINED; + } + + inline Qnn_QuantizeParams_t + get_qnn_tensor_quantparams(const Qnn_Tensor_t& tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.quantizeParams; + } + return QNN_QUANTIZE_PARAMS_INIT; + } + + inline uint32_t get_qnn_tensor_rank(const Qnn_Tensor_t& tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.rank; + } + return 0u; + } + + inline uint32_t* get_qnn_tensor_dimensions(const Qnn_Tensor_t& tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.dimensions; + } + return nullptr; + } + + inline Qnn_TensorMemType_t get_qnn_tensor_memtype(const Qnn_Tensor_t& tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.memType; + } + return QNN_TENSORMEMTYPE_UNDEFINED; + } + + inline void set_qnn_tensor_id(Qnn_Tensor_t& tensor, uint32_t id) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.id = id; + } + } + + inline void set_qnn_tensor_name(Qnn_Tensor_t& tensor, const char* name) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.name = name; + } + } + + inline void set_qnn_tensor_type(Qnn_Tensor_t& tensor, Qnn_TensorType_t type) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + 
tensor.v1.type = type; + } + } + + inline void set_qnn_tensor_dataformat(Qnn_Tensor_t& tensor, Qnn_TensorDataFormat_t format) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.dataFormat = format; + } + } + + inline void set_qnn_tensor_datatype(Qnn_Tensor_t& tensor, Qnn_DataType_t dataType) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.dataType = dataType; + } + } + + inline void set_qnn_tensor_quantparams(Qnn_Tensor_t& tensor, Qnn_QuantizeParams_t params) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.quantizeParams = params; + } + } + + inline void set_qnn_tensor_rank(Qnn_Tensor_t& tensor, uint32_t rank) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.rank = rank; + } + } + + inline void set_qnn_tensor_dimensions(Qnn_Tensor_t& tensor, uint32_t* dims) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.dimensions = dims; + } + } + + inline void set_qnn_tensor_memtype(Qnn_Tensor_t& tensor, Qnn_TensorMemType_t mem_type) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.memType = mem_type; + } + } + + inline void set_qnn_tensor_clientbuf(Qnn_Tensor_t& tensor, Qnn_ClientBuffer_t client_buf) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.clientBuf = client_buf; + } + } + + inline void set_qnn_tensor_memhandle(Qnn_Tensor_t& tensor, Qnn_MemHandle_t handle) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.memHandle = handle; + } + } } + + +#define VALIDATE(value, status) \ + do { \ + status = value; \ + if (status != QNN_SUCCESS) { \ + QNN_LOG_WARN("%s expected QNN_SUCCESS\n", #value); \ + return status; \ + } \ + } while (0) + +#define QNN_TENSOR_GET_ID(tensor) qnn::get_qnn_tensorid(tensor) +#define QNN_TENSOR_GET_NAME(tensor) qnn::get_qnn_tensorname(tensor) +#define QNN_TENSOR_GET_TYPE(tensor) qnn::get_qnn_tensortype(tensor) +#define QNN_TENSOR_GET_DATA_FORMAT(tensor) qnn::get_qnn_tensor_dataformat(tensor) +#define QNN_TENSOR_GET_DATA_TYPE(tensor) qnn::get_qnn_tensor_datatype(tensor) +#define QNN_TENSOR_GET_QUANT_PARAMS(tensor) qnn::get_qnn_tensor_quantparams(tensor) +#define QNN_TENSOR_GET_RANK(tensor) qnn::get_qnn_tensor_rank(tensor) +#define QNN_TENSOR_GET_DIMENSIONS(tensor) qnn::get_qnn_tensor_dimensions(tensor) +#define QNN_TENSOR_GET_MEM_TYPE(tensor) qnn::get_qnn_tensor_memtype(tensor) + +#define QNN_TENSOR_SET_ID(tensor, value) qnn::set_qnn_tensor_id(tensor, value) +#define QNN_TENSOR_SET_NAME(tensor, value) qnn::set_qnn_tensor_name(tensor, value) +#define QNN_TENSOR_SET_TYPE(tensor, value) qnn::set_qnn_tensor_type(tensor, value) +#define QNN_TENSOR_SET_DATA_FORMAT(tensor, value) qnn::set_qnn_tensor_dataformat(tensor, value) +#define QNN_TENSOR_SET_DATA_TYPE(tensor, value) qnn::set_qnn_tensor_datatype(tensor, value) +#define QNN_TENSOR_SET_QUANT_PARAMS(tensor, value) qnn::set_qnn_tensor_quantparams(tensor, value) +#define QNN_TENSOR_SET_RANK(tensor, value) qnn::set_qnn_tensor_rank(tensor, value) +#define QNN_TENSOR_SET_DIMENSIONS(tensor, value) qnn::set_qnn_tensor_dimensions(tensor, value) +#define QNN_TENSOR_SET_MEM_TYPE(tensor, value) qnn::set_qnn_tensor_memtype(tensor, value) +#define QNN_TENSOR_SET_CLIENT_BUF(tensor, value) qnn::set_qnn_tensor_clientbuf(tensor, value) +#define QNN_TENSOR_SET_MEM_HANDLE(tensor, value) qnn::set_qnn_tensor_memhandle(tensor, value) +#define VALIDATE_TENSOR_VERSION(tensor, err) VALIDATE(qnn::validate_tensor_version(tensor), err) From e1056da1c083ecba7b10d4833963a7c429cee054 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Mon, 24 Jun 2024 12:06:42 
+0800 Subject: [PATCH 032/143] fix op handle checker --- ggml-qnn.cpp | 231 ++++++++++++++++++++++----------------------------- 1 file changed, 100 insertions(+), 131 deletions(-) diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp index 3a667a1970aba..ffa43718410ab 100644 --- a/ggml-qnn.cpp +++ b/ggml-qnn.cpp @@ -354,12 +354,100 @@ static int free_qnn_tensor(Qnn_Tensor_t & tensor) { // implementation of QNN backend for GGML // // ================================================================================================= +static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, + const ggml_tensor * src1, ggml_tensor * dst); +static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); + +static ggml_qnn_func_t s_op_table[GGML_OP_COUNT] = { + nullptr, // GGML_OP_NONE + nullptr, // GGML_OP_DUP + ggml_qnn_add, // GGML_OP_ADD + nullptr, // GGML_OP_ADD1 + nullptr, // GGML_OP_ACC + nullptr, // GGML_OP_SUB + nullptr, // GGML_OP_MUL + nullptr, // GGML_OP_DIV + nullptr, // GGML_OP_SQR + nullptr, // GGML_OP_SQRT + nullptr, // GGML_OP_LOG + nullptr, // GGML_OP_SUM + nullptr, // GGML_OP_SUM_ROWS + nullptr, // GGML_OP_MEAN + nullptr, // GGML_OP_ARGMAX + nullptr, // GGML_OP_REPEAT + nullptr, // GGML_OP_REPEAT_BACK + nullptr, // GGML_OP_CONCAT + nullptr, // GGML_OP_SILU_BACK + nullptr, // GGML_OP_NORM + nullptr, // GGML_OP_RMS_NORM + nullptr, // GGML_OP_RMS_NORM_BACK + nullptr, // GGML_OP_GROUP_NORM + + ggml_qnn_mul_mat, // GGML_OP_MUL_MAT + nullptr, // GGML_OP_MUL_MAT_ID + nullptr, // GGML_OP_OUT_PROD + + nullptr, // GGML_OP_SCALE + nullptr, // GGML_OP_SET + nullptr, // GGML_OP_CPY + nullptr, // GGML_OP_CONT + nullptr, // GGML_OP_RESHAPE + nullptr, // GGML_OP_VIEW + nullptr, // GGML_OP_PERMUTE + nullptr, // GGML_OP_TRANSPOSE + nullptr, // GGML_OP_GET_ROWS + nullptr, // GGML_OP_GET_ROWS_BACK + nullptr, // GGML_OP_DIAG + nullptr, // GGML_OP_DIAG_MASK_INF + nullptr, // GGML_OP_DIAG_MASK_ZERO + nullptr, // GGML_OP_SOFT_MAX + nullptr, // GGML_OP_SOFT_MAX_BACK + nullptr, // GGML_OP_ROPE + nullptr, // GGML_OP_ROPE_BACK + nullptr, // GGML_OP_CLAMP + nullptr, // GGML_OP_CONV_TRANSPOSE_1D + nullptr, // GGML_OP_IM2COL + nullptr, // GGML_OP_CONV_TRANSPOSE_2D + nullptr, // GGML_OP_POOL_1D + nullptr, // GGML_OP_POOL_2D + nullptr, // GGML_OP_UPSCALE + nullptr, // GGML_OP_PAD + nullptr, // GGML_OP_ARANGE + nullptr, // GGML_OP_TIMESTEP_EMBEDDING + nullptr, // GGML_OP_ARGSORT + nullptr, // GGML_OP_LEAKY_RELU + + nullptr, // GGML_OP_FLASH_ATTN_EXT + nullptr, // GGML_OP_FLASH_ATTN_BACK + nullptr, // GGML_OP_SSM_CONV + nullptr, // GGML_OP_SSM_SCAN + nullptr, // GGML_OP_WIN_PART + nullptr, // GGML_OP_WIN_UNPART + nullptr, // GGML_OP_GET_REL_POS + nullptr, // GGML_OP_ADD_REL_POS + + nullptr, // GGML_OP_UNARY + + nullptr, // GGML_OP_MAP_UNARY + nullptr, // GGML_OP_MAP_BINARY + + nullptr, // GGML_OP_MAP_CUSTOM1_F32 + nullptr, // GGML_OP_MAP_CUSTOM2_F32 + nullptr, // GGML_OP_MAP_CUSTOM3_F32 + + nullptr, // GGML_OP_MAP_CUSTOM1 + nullptr, // GGML_OP_MAP_CUSTOM2 + nullptr, // GGML_OP_MAP_CUSTOM3 + + nullptr, // GGML_OP_CROSS_ENTROPY_LOSS + nullptr, // GGML_OP_CROSS_ENTROPY_LOSS_BACK +}; + static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context * ctx, const struct ggml_tensor * tensor, bool b_dump_tensor_info) { - if (ggml_is_empty(tensor) || tensor->op == GGML_OP_RESHAPE || - tensor->op == GGML_OP_TRANSPOSE || tensor->op == GGML_OP_VIEW || - tensor->op == GGML_OP_PERMUTE || tensor->op == GGML_OP_NONE) { + if (ggml_is_empty(tensor) || 
!s_op_table[tensor->op]) { return false; } @@ -369,10 +457,10 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context * ctx, return false; } - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne10 = src1->ne[0]; - const int64_t ne11 = src1->ne[1]; + const auto ne00 = src0->ne[0]; + const auto ne01 = src0->ne[1]; + const auto ne10 = src1->ne[0]; + const auto ne11 = src1->ne[1]; // make qnn_get_ggml_tensor_rank and QNN SDK happy if (ne00 <= 1 || ne01 <= 1 || ne10 <= 1 || ne11 <= 1) { return false; @@ -951,132 +1039,13 @@ static void ggml_qnn_nop(ggml_backend_qnn_context * ctx, const ggml_tensor * src bool ggml_qnn_compute_forward(ggml_backend_qnn_context * ctx, struct ggml_compute_params * params, struct ggml_tensor * tensor) { - ggml_qnn_func_t func = nullptr; - - switch (tensor->op) { - case GGML_OP_ADD: - func = ggml_qnn_add; - break; - case GGML_OP_MUL_MAT: - func = ggml_qnn_mul_mat; - break; - case GGML_OP_REPEAT: - func = ggml_qnn_repeat; - break; - case GGML_OP_GET_ROWS: - func = ggml_qnn_get_rows; - break; - case GGML_OP_DUP: - func = ggml_qnn_dup; - break; - case GGML_OP_ACC: - func = ggml_qnn_acc; - break; - case GGML_OP_DIV: - func = ggml_qnn_div; - break; - case GGML_OP_UNARY: - switch (ggml_get_unary_op(tensor)) { - case GGML_UNARY_OP_GELU: - func = ggml_qnn_gelu; - break; - case GGML_UNARY_OP_SILU: - func = ggml_qnn_silu; - break; - case GGML_UNARY_OP_GELU_QUICK: - func = ggml_qnn_gelu_quick; - break; - case GGML_UNARY_OP_TANH: - func = ggml_qnn_tanh; - break; - case GGML_UNARY_OP_RELU: - func = ggml_qnn_relu; - break; - case GGML_UNARY_OP_HARDSIGMOID: - func = ggml_qnn_hardsigmoid; - break; - case GGML_UNARY_OP_HARDSWISH: - func = ggml_qnn_hardswish; - break; - default: - return false; - } - break; - case GGML_OP_NORM: - func = ggml_qnn_norm; - break; - case GGML_OP_GROUP_NORM: - func = ggml_qnn_group_norm; - break; - case GGML_OP_CONCAT: - func = ggml_qnn_concat; - break; - case GGML_OP_UPSCALE: - func = ggml_qnn_upscale; - break; - case GGML_OP_PAD: - func = ggml_qnn_pad; - break; - case GGML_OP_LEAKY_RELU: - func = ggml_qnn_leaky_relu; - break; - case GGML_OP_RMS_NORM: - func = ggml_qnn_rms_norm; - break; - case GGML_OP_MUL_MAT_ID: - func = ggml_qnn_mul_mat_id; - break; - case GGML_OP_SCALE: - func = ggml_qnn_scale; - break; - case GGML_OP_SQR: - func = ggml_qnn_sqr; - break; - case GGML_OP_CLAMP: - func = ggml_qnn_clamp; - break; - case GGML_OP_CPY: - func = ggml_qnn_cpy; - break; - case GGML_OP_CONT: - func = ggml_qnn_dup; - break; - case GGML_OP_NONE: - case GGML_OP_RESHAPE: - case GGML_OP_VIEW: - case GGML_OP_PERMUTE: - case GGML_OP_TRANSPOSE: - func = ggml_qnn_nop; - break; - case GGML_OP_DIAG_MASK_INF: - func = ggml_qnn_diag_mask_inf; - break; - case GGML_OP_SOFT_MAX: - func = ggml_qnn_soft_max; - break; - case GGML_OP_ROPE: - func = ggml_qnn_rope; - break; - case GGML_OP_IM2COL: - func = ggml_qnn_im2col; - break; - case GGML_OP_POOL_2D: - func = ggml_qnn_pool2d; - break; - case GGML_OP_SUM_ROWS: - func = ggml_qnn_sum_rows; - break; - case GGML_OP_ARGSORT: - func = ggml_qnn_argsort; - break; - default: + ggml_qnn_func_t func = s_op_table[tensor->op]; + if (!func) { + QNN_LOG_WARN("unsupported op %d", tensor->op); return false; } - if (nullptr != func) { - func(ctx, tensor->src[0], tensor->src[1], tensor); - } - + func(ctx, tensor->src[0], tensor->src[1], tensor); return true; } @@ -1349,7 +1318,7 @@ GGML_CALL static bool ggml_backend_qnn_supports_op(ggml_backend_t backend, GGML_CALL static bool 
ggml_backend_qnn_offload_op(ggml_backend_t backend,const ggml_tensor * tensor) { ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; - return ggml_qnn_compute_forward(ctx, nullptr, (ggml_tensor *) tensor); + return ggml_qnn_can_handle_op(ctx, tensor, false); } static ggml_backend_i ggml_backend_qnn_interface = { From c9e99bd603ea358eaa1c54505fd2d26faa3d9d4e Mon Sep 17 00:00:00 2001 From: hongruichen Date: Mon, 24 Jun 2024 22:11:28 +0800 Subject: [PATCH 033/143] split qnn ops into file --- ggml-qnn.cpp | 723 +--------------------------------- ggml-qnn/backend-ops.cpp | 675 +++++++++++++++++++++++++++++++ ggml-qnn/backend-ops.hpp | 17 + ggml-qnn/backend.hpp | 5 - ggml-qnn/qnn.hpp | 13 +- ggml-qnn/tensor.hpp | 1 + ggml-qnn/utils.cpp | 126 ++++++ ggml-qnn/utils.hpp | 172 +++----- tests/ggml-qnn/CMakeLists.txt | 2 + 9 files changed, 889 insertions(+), 845 deletions(-) create mode 100644 ggml-qnn/backend-ops.cpp create mode 100644 ggml-qnn/backend-ops.hpp create mode 100644 ggml-qnn/utils.cpp diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp index ffa43718410ab..750d5ff91c3d3 100644 --- a/ggml-qnn.cpp +++ b/ggml-qnn.cpp @@ -1,22 +1,14 @@ #include #include -#include #include #include -#include -#include -#include #include #include -#include -#include #include -#include #include #include #include -#include #include #include #include @@ -28,7 +20,6 @@ #include #include #include -#include #include #include #include @@ -40,8 +31,9 @@ #include "ggml-qnn/logger.hpp" #include "ggml-qnn/utils.hpp" -#include "ggml-qnn/backend.hpp" #include "ggml-qnn/tensor.hpp" +#include "ggml-qnn/backend.hpp" +#include "ggml-qnn/backend-ops.hpp" // ================================================================================================= // @@ -63,11 +55,6 @@ static int free_qnn_tensor(Qnn_Tensor_t & tensor); #define QNN_BACKEND_NAME "qnn" -typedef void (*ggml_qnn_func_t)(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, - const ggml_tensor * src1, - ggml_tensor * dst); - static struct qnn::qcom_socinfo g_qnn_soc_info_table[] = { /* Qualcomm SnapDragon 8 Gen 1 */ [qnn::SM8450] = { @@ -183,78 +170,6 @@ struct ggml_backend_qnn_buffer_type_context { // QNN backend internal helper functions // // ================================================================================================= -static bool qnn_is_valid_params(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, - const ggml_tensor * src1, ggml_tensor * dst) { - if ((nullptr == ctx) || (nullptr == src0) || (nullptr == src1) || (nullptr == dst)) { - QNN_LOG_WARN("invalid params\n"); - return false; - } - - qnn::qnn_instance *instance = nullptr; - Qnn_Tensor_t *tensor_0 = nullptr; - Qnn_Tensor_t *tensor_1 = nullptr; - Qnn_Tensor_t *tensor_2 = nullptr; - tensor_0 = (Qnn_Tensor_t *) src0->extra; - tensor_1 = (Qnn_Tensor_t *) src1->extra; - tensor_2 = (Qnn_Tensor_t *) dst->extra; - instance = ctx->instance; - if ((nullptr == instance) || (nullptr == tensor_0) || (nullptr == tensor_1) || (nullptr == tensor_2)) { - QNN_LOG_WARN("invalid params\n"); - return false; - } - - return true; -} - -#ifndef NDEBUG -#define CHECK_PARAMS(ctx, src0, src1, dst) \ - do { \ - if (!qnn_is_valid_params((ctx), (src0), (src1), (dst))) { \ - return; \ - } \ - } while (0) - -#else -#define CHECK_PARAMS(ctx, src0, src1, dst) -#endif - -#if ENABLE_QNNBACKEND_PERF -class qnn_perf { -public: - qnn_perf(const std::string & perf_name) : _perf_name(std::move(perf_name)) {}; - qnn_perf() = delete; - qnn_perf(const qnn_perf & ) = delete; - qnn_perf & 
operator= (const qnn_perf & ) = delete; - - void start() { - _begin_time = ggml_time_us(); - } - - void info() { - _end_time = ggml_time_us(); - _duration = (_end_time - _begin_time); - QNN_LOG_INFO("duration of %s : %lld microseconds\n", _perf_name.c_str(), _duration); - } - -private: - int64_t _begin_time = 0LL; - int64_t _end_time = 0LL; - int64_t _duration = 0LL; - std::string _perf_name; -}; -#else -class qnn_perf { -public: - qnn_perf(const std::string & perf_name) {} - qnn_perf() = delete; - qnn_perf(const qnn_perf & ) = delete; - qnn_perf & operator= (const qnn_perf & ) = delete; - - void start() {} - void info() {} -}; -#endif - static size_t memscpy(void * dst, size_t dst_size, const void * src, size_t copy_size) { if (!dst || !src || !dst_size || !copy_size) return 0; @@ -354,100 +269,10 @@ static int free_qnn_tensor(Qnn_Tensor_t & tensor) { // implementation of QNN backend for GGML // // ================================================================================================= -static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, - const ggml_tensor * src1, ggml_tensor * dst); -static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); - -static ggml_qnn_func_t s_op_table[GGML_OP_COUNT] = { - nullptr, // GGML_OP_NONE - nullptr, // GGML_OP_DUP - ggml_qnn_add, // GGML_OP_ADD - nullptr, // GGML_OP_ADD1 - nullptr, // GGML_OP_ACC - nullptr, // GGML_OP_SUB - nullptr, // GGML_OP_MUL - nullptr, // GGML_OP_DIV - nullptr, // GGML_OP_SQR - nullptr, // GGML_OP_SQRT - nullptr, // GGML_OP_LOG - nullptr, // GGML_OP_SUM - nullptr, // GGML_OP_SUM_ROWS - nullptr, // GGML_OP_MEAN - nullptr, // GGML_OP_ARGMAX - nullptr, // GGML_OP_REPEAT - nullptr, // GGML_OP_REPEAT_BACK - nullptr, // GGML_OP_CONCAT - nullptr, // GGML_OP_SILU_BACK - nullptr, // GGML_OP_NORM - nullptr, // GGML_OP_RMS_NORM - nullptr, // GGML_OP_RMS_NORM_BACK - nullptr, // GGML_OP_GROUP_NORM - - ggml_qnn_mul_mat, // GGML_OP_MUL_MAT - nullptr, // GGML_OP_MUL_MAT_ID - nullptr, // GGML_OP_OUT_PROD - - nullptr, // GGML_OP_SCALE - nullptr, // GGML_OP_SET - nullptr, // GGML_OP_CPY - nullptr, // GGML_OP_CONT - nullptr, // GGML_OP_RESHAPE - nullptr, // GGML_OP_VIEW - nullptr, // GGML_OP_PERMUTE - nullptr, // GGML_OP_TRANSPOSE - nullptr, // GGML_OP_GET_ROWS - nullptr, // GGML_OP_GET_ROWS_BACK - nullptr, // GGML_OP_DIAG - nullptr, // GGML_OP_DIAG_MASK_INF - nullptr, // GGML_OP_DIAG_MASK_ZERO - nullptr, // GGML_OP_SOFT_MAX - nullptr, // GGML_OP_SOFT_MAX_BACK - nullptr, // GGML_OP_ROPE - nullptr, // GGML_OP_ROPE_BACK - nullptr, // GGML_OP_CLAMP - nullptr, // GGML_OP_CONV_TRANSPOSE_1D - nullptr, // GGML_OP_IM2COL - nullptr, // GGML_OP_CONV_TRANSPOSE_2D - nullptr, // GGML_OP_POOL_1D - nullptr, // GGML_OP_POOL_2D - nullptr, // GGML_OP_UPSCALE - nullptr, // GGML_OP_PAD - nullptr, // GGML_OP_ARANGE - nullptr, // GGML_OP_TIMESTEP_EMBEDDING - nullptr, // GGML_OP_ARGSORT - nullptr, // GGML_OP_LEAKY_RELU - - nullptr, // GGML_OP_FLASH_ATTN_EXT - nullptr, // GGML_OP_FLASH_ATTN_BACK - nullptr, // GGML_OP_SSM_CONV - nullptr, // GGML_OP_SSM_SCAN - nullptr, // GGML_OP_WIN_PART - nullptr, // GGML_OP_WIN_UNPART - nullptr, // GGML_OP_GET_REL_POS - nullptr, // GGML_OP_ADD_REL_POS - - nullptr, // GGML_OP_UNARY - - nullptr, // GGML_OP_MAP_UNARY - nullptr, // GGML_OP_MAP_BINARY - - nullptr, // GGML_OP_MAP_CUSTOM1_F32 - nullptr, // GGML_OP_MAP_CUSTOM2_F32 - nullptr, // GGML_OP_MAP_CUSTOM3_F32 - - nullptr, // GGML_OP_MAP_CUSTOM1 - nullptr, // 
GGML_OP_MAP_CUSTOM2 - nullptr, // GGML_OP_MAP_CUSTOM3 - - nullptr, // GGML_OP_CROSS_ENTROPY_LOSS - nullptr, // GGML_OP_CROSS_ENTROPY_LOSS_BACK -}; - static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context * ctx, const struct ggml_tensor * tensor, bool b_dump_tensor_info) { - if (ggml_is_empty(tensor) || !s_op_table[tensor->op]) { + if (ggml_is_empty(tensor) || !qnn::ggml_qnn_op_array()[tensor->op]) { return false; } @@ -496,550 +321,10 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context * ctx, return true; } - -//TODO: this function can be removed later because there are duplicated codes with ggml_qnn_mul_mat -// keep it for illustrate how to implement a specified GGMPL OP using QNN API + QNN RPC -static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, - const ggml_tensor * src1, ggml_tensor * dst) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - bool graph_initialized = false; - qnn::qnn_instance *instance = nullptr; - std::string graph_name = "ggml_op_qnn_add"; - Qnn_GraphHandle_t graph_handle = nullptr; - Qnn_Param_t qnn_params[] = {}; - enum ggml_op ggmlop = GGML_OP_ADD; - - CHECK_PARAMS(ctx, src0, src1, dst); - instance = ctx->instance; - auto qnn_raw_interface = ctx->raw_interface; - - qnn_perf perf("ggml_qnn_add"); - perf.start(); - - std::string map_entry = std::string(ggml_op_name(ggmlop)); - if (instance->_qnn_graph_map.find(map_entry) != - instance->_qnn_graph_map.end()) { - graph_initialized = true; - auto & graph_item = instance->_qnn_graph_map[map_entry]; - graph_handle = std::get<0>(graph_item); - } - - if (!graph_initialized) { - graph_name = graph_name + "_" + std::to_string(ctx->threads) + - "_" + src0->name + "_" + src1->name; - QNN_LOG_INFO("graph name %s", graph_name.c_str()); - if (ctx->device == QNN_BACKEND_NPU) { - QnnHtpGraph_CustomConfig_t hvx_config; - hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS; - hvx_config.numHvxThreads = 8; - QnnGraph_Config_t graph_hvx_config; - graph_hvx_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_hvx_config.customConfig = &hvx_config; - - QnnHtpGraph_CustomConfig_t dlbc_config; - dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; - dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC; - dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC - QnnGraph_Config_t graph_dlbc_config; - graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_dlbc_config.customConfig = &dlbc_config; - - QnnHtpGraph_CustomConfig_t opt_config; - opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; - opt_config.optimizationOption.floatValue = 1; // 1 / 3 - QnnGraph_Config_t graph_opt_config; - graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_opt_config.customConfig = &opt_config; - - QnnHtpGraph_CustomConfig_t vtcm_config; - vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE; - vtcm_config.vtcmSizeInMB = ctx->socinfo.vtcm_size_in_mb; - QnnGraph_Config_t graph_vtcm_config; - graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_vtcm_config.customConfig = &vtcm_config; - - const QnnGraph_Config_t * p_graphconfig[] = {&graph_hvx_config, - &graph_dlbc_config, - &graph_vtcm_config, - &graph_opt_config, - NULL}; - error = qnn_raw_interface.graphCreate( - instance->get_qnn_context_handle(), graph_name.c_str(), p_graphconfig, - &graph_handle); - } else { - error = qnn_raw_interface.graphCreate( - instance->get_qnn_context_handle(), 
graph_name.c_str(), nullptr, - &graph_handle); - } - - if (QNN_SUCCESS != error) { - QNN_LOG_INFO("can't create qnn graph handle with graph name %s, " - "error = %d\n", - graph_name.c_str(), error); - goto failure; - } else { - QNN_LOG_INFO("create qnn graph handle with graph name %s ok\n", graph_name.c_str()); - } - - qnn::ggml_qnn_tensor_input tensor_input0(src0, graph_handle, ctx); - if (!tensor_input0.is_valid()) { - goto failure; - } - qnn::ggml_qnn_tensor_input tensor_input1(src1, graph_handle, ctx); - if (!tensor_input1.is_valid()) { - QNN_LOG_INFO("error = %d\n", error); - goto failure; - } - qnn::ggml_qnn_tensor_output tensor_output(dst, graph_handle, ctx); - if (!tensor_output.is_valid()) { - goto failure; - } - - Qnn_Tensor_t tensor_inputs[] = {*tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor()}; - Qnn_Tensor_t tensor_outputs[] = {*tensor_output.get_qnn_tensor()}; - Qnn_OpConfig_t op_config = { - (Qnn_OpConfigVersion_t) 1, - .v1 = {"ggml_op_add", - QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_ELEMENT_WISE_ADD, - 0, qnn_params, - 2, tensor_inputs, - 1,tensor_outputs} - }; - error = qnn_raw_interface.graphAddNode(graph_handle, op_config); - if (QNN_SUCCESS != error) { - QNN_LOG_INFO("error = %d\n", error); - goto failure; - } - error = qnn_raw_interface.graphFinalize(graph_handle, - nullptr, nullptr); - if (QNN_SUCCESS != error) { - QNN_LOG_INFO("error = %d\n", error); - goto failure; - } - error = qnn_raw_interface.graphExecute(graph_handle, - tensor_inputs, 2, - tensor_outputs, 1, - nullptr, nullptr); - if (ctx->device == QNN_BACKEND_NPU) { - if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) { - QNN_LOG_WARN("NPU crashed. SSR detected. Caused QNN graph execute error\n"); - } - } - if (QNN_SUCCESS != error) { - QNN_LOG_INFO("error = %d\n", error); - goto failure; - } - - auto graph_item = std::make_tuple(graph_handle, - tensor_input0.get_qnn_tensor(), - tensor_input1.get_qnn_tensor(), - tensor_output.get_qnn_tensor()); - instance->_qnn_graph_map[map_entry] = graph_item; - } else { - auto & graph_item = instance->_qnn_graph_map[map_entry]; - qnn::ggml_qnn_tensor_input tensor_input0(src0, std::get<1>(graph_item), ctx); - qnn::ggml_qnn_tensor_input tensor_input1(src1, std::get<2>(graph_item), ctx); - qnn::ggml_qnn_tensor_output tensor_output(dst, std::get<3>(graph_item), ctx); - - Qnn_Tensor_t tensor_inputs[] = {*tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor()}; - Qnn_Tensor_t tensor_outputs[] = {*tensor_output.get_qnn_tensor()}; - error = qnn_raw_interface.graphExecute(graph_handle, - tensor_inputs,2, - tensor_outputs,1, - nullptr, nullptr); - if (ctx->device == QNN_BACKEND_NPU) { - if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) { - QNN_LOG_WARN("NPU crashed. SSR detected. 
Caused QNN graph execute error\n"); - } - } - if (QNN_SUCCESS != error) { - QNN_LOG_INFO("error = %d\n", error); - goto failure; - } - } - -failure: - if (QNN_SUCCESS != error) { - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 - " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src0->name, src0->type, ggml_type_name(src0->type), - src0->ne[0], src0->ne[1], src0->ne[2], src0->nb[0], - src0->nb[1], src0->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 - " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src1->name, src1->type, ggml_type_name(src1->type), - src1->ne[0], src1->ne[1], src1->ne[2], src1->nb[0], - src1->nb[1], src1->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 - " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - dst->name, dst->type, ggml_type_name(dst->type), - dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], - dst->nb[1], dst->nb[2]); - } - - perf.info(); -} - -/* - * ggml_qnn_mul_mat was re-added as a standalone function because - * the following comments came from https://github.com/ggerganov/llama.cpp/pull/1632 - * MUL_MAT take most of the compute time (about 95%). - * So to speed up llama, we have to focus on MUL_MAT. - * - * We have three kinds of MUL_MAT to compute: - * mul_mat_f32: both src0 and src1 are F32. - * mul_mat_f16_f32: src0 is F16 and src1 is F32. - * mul_mat_q_f32: src0 is quantized (Q4_0, Q4_1, ...), and src1 is F32. - */ -static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, const ggml_tensor * src1, - ggml_tensor * dst) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - bool graph_initialized = false; - qnn::qnn_instance *instance = nullptr; - std::string graph_name = "ggml_op_qnn_mul_mat"; - Qnn_GraphHandle_t graph_handle = nullptr; - Qnn_Param_t qnn_params[] = {}; - enum ggml_op ggmlop = GGML_OP_MUL_MAT; - - CHECK_PARAMS(ctx, src0, src1, dst); - instance = ctx->instance; - auto qnn_raw_interface = ctx->raw_interface; - - qnn_perf perf("ggml_qnn_mul_mat"); - perf.start(); - - std::string map_entry = std::string(ggml_op_name(ggmlop)); - if (instance->_qnn_graph_map.find(map_entry) != - instance->_qnn_graph_map.end()) { - graph_initialized = true; - auto & graph_item = instance->_qnn_graph_map[map_entry]; - graph_handle = std::get<0>(graph_item); - } - - //TODO: for scenarios of quantized data in src0 - // pass-1: dequantize src0 to FP32 - // pass-2: dq-src0 * src1 - // the performance gains is worth although there is performance loss in pass-1 - - if (!graph_initialized) { - graph_name = graph_name + "_" + std::to_string(ctx->threads) + - "_" + src0->name + "_" + src1->name; - QNN_LOG_INFO("graph name %s", graph_name.c_str()); - if (ctx->device == QNN_BACKEND_NPU) { - QnnHtpGraph_CustomConfig_t hvx_config; - hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS; - hvx_config.numHvxThreads = 8; - QnnGraph_Config_t graph_hvx_config; - graph_hvx_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_hvx_config.customConfig = &hvx_config; - - QnnHtpGraph_CustomConfig_t dlbc_config; - dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; - dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC; - dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC - QnnGraph_Config_t graph_dlbc_config; - graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_dlbc_config.customConfig = &dlbc_config; - - QnnHtpGraph_CustomConfig_t opt_config; - opt_config.optimizationOption.type = 
QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; - opt_config.optimizationOption.floatValue = 1; //1 / 3 - QnnGraph_Config_t graph_opt_config; - graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_opt_config.customConfig = &opt_config; - - QnnHtpGraph_CustomConfig_t vtcm_config; - vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE; - vtcm_config.vtcmSizeInMB = ctx->socinfo.vtcm_size_in_mb; - QnnGraph_Config_t graph_vtcm_config; - graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_vtcm_config.customConfig = &vtcm_config; - - const QnnGraph_Config_t * p_graphconfig[] = {&graph_hvx_config, - &graph_dlbc_config, - &graph_vtcm_config, - &graph_opt_config, - NULL}; - error = qnn_raw_interface.graphCreate( - instance->get_qnn_context_handle(), graph_name.c_str(), p_graphconfig, - &graph_handle); - } else { - error = qnn_raw_interface.graphCreate( - instance->get_qnn_context_handle(), graph_name.c_str(), nullptr, - &graph_handle); - } - if (QNN_SUCCESS != error) { - QNN_LOG_INFO("can't create qnn graph handle with graph name %s, " - "error = %d\n", - graph_name.c_str(), error); - goto failure; - } - - qnn::ggml_qnn_tensor_input tensor_input0(src0, graph_handle, ctx); - if (!tensor_input0.is_valid()) { - goto failure; - } - qnn::ggml_qnn_tensor_input tensor_input1(src1, graph_handle, ctx); - if (!tensor_input1.is_valid()) { - goto failure; - } - qnn::ggml_qnn_tensor_output tensor_output(dst, graph_handle, ctx); - if (!tensor_output.is_valid()) { - goto failure; - } - - Qnn_Tensor_t tensor_inputs[] = {*tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor()}; - Qnn_Tensor_t tensor_outputs[] = {*tensor_output.get_qnn_tensor()}; - Qnn_OpConfig_t op_config = { - (Qnn_OpConfigVersion_t) 1, - .v1 = {"ggml_op_mul_mat", - QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_MAT_MUL, - 0, qnn_params, - 2, tensor_inputs, - 1, tensor_outputs} - }; - error = qnn_raw_interface.graphAddNode(graph_handle, op_config); - if (QNN_SUCCESS != error) { - QNN_LOG_INFO("error = %d\n", error); - goto failure; - } - error = qnn_raw_interface.graphFinalize(graph_handle, - nullptr, nullptr); - if (QNN_SUCCESS != error) { - QNN_LOG_INFO("error = %d\n", error); - goto failure; - } - error = qnn_raw_interface.graphExecute(graph_handle, - tensor_inputs, 2, - tensor_outputs, 1, - nullptr, nullptr); - if (ctx->device == QNN_BACKEND_NPU) { - if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) { - QNN_LOG_WARN("NPU crashed. SSR detected. Caused QNN graph execute error\n"); - } - } - if (QNN_SUCCESS != error) { - QNN_LOG_INFO("error = %d\n", error); - goto failure; - } - - auto graph_item = std::make_tuple(graph_handle, - tensor_input0.get_qnn_tensor(), - tensor_input1.get_qnn_tensor(), - tensor_output.get_qnn_tensor()); - instance->_qnn_graph_map[map_entry] = graph_item; - } else { - auto & graph_item= instance->_qnn_graph_map[map_entry]; - qnn::ggml_qnn_tensor_input tensor_input0(src0, std::get<1>(graph_item), ctx); - qnn::ggml_qnn_tensor_input tensor_input1(src1, std::get<2>(graph_item), ctx); - qnn::ggml_qnn_tensor_output tensor_output(dst, std::get<3>(graph_item), ctx); - - Qnn_Tensor_t tensor_inputs[] = {*tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor()}; - Qnn_Tensor_t tensor_outputs[] = {*tensor_output.get_qnn_tensor()}; - error = qnn_raw_interface.graphExecute(graph_handle, - tensor_inputs, 2, - tensor_outputs, 1, - nullptr, nullptr); - if (ctx->device == QNN_BACKEND_NPU) { - if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) { - QNN_LOG_WARN("NPU crashed. 
SSR detected. Caused QNN graph execute error\n"); - } - } - if (QNN_SUCCESS != error) { - QNN_LOG_INFO("error = %d\n", error); - goto failure; - } - } - -failure: - if (QNN_SUCCESS != error) { - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 - " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src0->name, src0->type, ggml_type_name(src0->type), - src0->ne[0], src0->ne[1], src0->ne[2], src0->nb[0], - src0->nb[1], src0->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 - " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src1->name, src1->type, ggml_type_name(src1->type), - src1->ne[0], src1->ne[1], src1->ne[2], src1->nb[0], - src1->nb[1], src1->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 - " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - dst->name, dst->type, ggml_type_name(dst->type), dst->ne[0], - dst->ne[1], dst->ne[2], dst->nb[0], dst->nb[1], dst->nb[2]); - } - - perf.info(); -} - -static void ggml_qnn_repeat(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, const ggml_tensor * src1, - ggml_tensor * dst) { -} - -static void ggml_qnn_get_rows(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, const ggml_tensor * src1, - ggml_tensor * dst) { -} - -static void ggml_qnn_acc(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, - const ggml_tensor * src1, ggml_tensor * dst) { -} - -static void ggml_qnn_div(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, - const ggml_tensor * src1, ggml_tensor * dst) { -} - -static void ggml_qnn_gelu(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, const ggml_tensor * src1, - ggml_tensor * dst) { -} - -static void ggml_qnn_silu(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, const ggml_tensor * src1, - ggml_tensor * dst) { -} - -static void ggml_qnn_gelu_quick(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, - const ggml_tensor * src1, ggml_tensor * dst) { -} - -static void ggml_qnn_tanh(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, const ggml_tensor * src1, - ggml_tensor * dst) { -} - -static void ggml_qnn_relu(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, const ggml_tensor * src1, - ggml_tensor * dst) { -} - -static void ggml_qnn_hardsigmoid(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, - const ggml_tensor * src1, ggml_tensor * dst) { -} - -static void ggml_qnn_hardswish(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, const ggml_tensor * src1, - ggml_tensor * dst) { -} - -static void ggml_qnn_leaky_relu(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, - const ggml_tensor * src1, ggml_tensor * dst) { -} - -static void ggml_qnn_sqr(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, - const ggml_tensor * src1, ggml_tensor * dst) { -} - -static void ggml_qnn_norm(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, const ggml_tensor * src1, - ggml_tensor * dst) { -} - -static void ggml_qnn_group_norm(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, - const ggml_tensor * src1, ggml_tensor * dst) { -} - -static void ggml_qnn_concat(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, const ggml_tensor * src1, - ggml_tensor * dst) { -} - -static void ggml_qnn_upscale(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, const ggml_tensor * src1, - ggml_tensor * dst) { -} - -static void ggml_qnn_pad(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, - const ggml_tensor * src1, ggml_tensor * dst) { -} - -static void 
ggml_qnn_rms_norm(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, const ggml_tensor * src1, - ggml_tensor * dst) { -} - -static void ggml_qnn_cpy(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, - const ggml_tensor * src1, ggml_tensor * dst) { -} - -static void ggml_qnn_dup(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, - const ggml_tensor * src1, ggml_tensor * dst) { - ggml_qnn_cpy(ctx, src0, dst, nullptr); - (void) src1; -} - -static void ggml_qnn_mul_mat_id(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, - const ggml_tensor * src1, ggml_tensor * dst) { -} - -static void ggml_qnn_scale(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, const ggml_tensor * src1, - ggml_tensor * dst) { -} - -static void ggml_qnn_clamp(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, const ggml_tensor * src1, - ggml_tensor * dst) { -} - -static void ggml_qnn_diag_mask_inf(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, - const ggml_tensor * src1, ggml_tensor * dst) { -} - -static void ggml_qnn_soft_max(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, const ggml_tensor * src1, - ggml_tensor * dst) { -} - -static void ggml_qnn_rope(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, const ggml_tensor * src1, - ggml_tensor * dst) { - GGML_ASSERT(ggml_is_contiguous(src0)); -} - -static void ggml_qnn_pool2d(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, const ggml_tensor * src1, - ggml_tensor * dst) { -} - -static void ggml_qnn_im2col(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, const ggml_tensor * src1, - ggml_tensor * dst) { -} - -static void ggml_qnn_sum_rows(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, const ggml_tensor * src1, - ggml_tensor * dst) { - GGML_ASSERT(ggml_is_contiguous(src0)); -} - -static void ggml_qnn_argsort(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, const ggml_tensor * src1, - ggml_tensor * dst) { - GGML_ASSERT(ggml_is_contiguous(src0)); -} - -static void ggml_qnn_nop(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, - const ggml_tensor * src1, ggml_tensor * dst) { - (void)src0; - (void)src1; - (void)dst; -} - bool ggml_qnn_compute_forward(ggml_backend_qnn_context * ctx, struct ggml_compute_params * params, struct ggml_tensor * tensor) { - ggml_qnn_func_t func = s_op_table[tensor->op]; + auto func = qnn::ggml_qnn_op_array()[tensor->op]; if (!func) { QNN_LOG_WARN("unsupported op %d", tensor->op); return false; diff --git a/ggml-qnn/backend-ops.cpp b/ggml-qnn/backend-ops.cpp new file mode 100644 index 0000000000000..a9c94a6df3102 --- /dev/null +++ b/ggml-qnn/backend-ops.cpp @@ -0,0 +1,675 @@ + +#include "backend-ops.hpp" + +#include "utils.hpp" +#include "logger.hpp" +#include "tensor.hpp" + + +static bool qnn_is_valid_params(ggml_backend_qnn_context* ctx, const ggml_tensor* src0, + const ggml_tensor* src1, ggml_tensor* dst) { + if ((nullptr == ctx) || (nullptr == src0) || (nullptr == src1) || (nullptr == dst)) { + QNN_LOG_WARN("invalid params\n"); + return false; + } + + qnn::qnn_instance* instance = nullptr; + Qnn_Tensor_t* tensor_0 = nullptr; + Qnn_Tensor_t* tensor_1 = nullptr; + Qnn_Tensor_t* tensor_2 = nullptr; + tensor_0 = (Qnn_Tensor_t*)src0->extra; + tensor_1 = (Qnn_Tensor_t*)src1->extra; + tensor_2 = (Qnn_Tensor_t*)dst->extra; + instance = ctx->instance; + if ((nullptr == instance) || (nullptr == tensor_0) || (nullptr == tensor_1) || (nullptr == tensor_2)) { + QNN_LOG_WARN("invalid params\n"); + return false; + } + + return 
true; +} + +#ifndef NDEBUG +#define CHECK_PARAMS(ctx, src0, src1, dst) \ + do { \ + if (!qnn_is_valid_params((ctx), (src0), (src1), (dst))) { \ + return; \ + } \ + } while (0) + +#else +#define CHECK_PARAMS(ctx, src0, src1, dst) +#endif + +//TODO: this function can be removed later because there are duplicated codes with ggml_qnn_mul_mat +// keep it for illustrate how to implement a specified GGMPL OP using QNN API + QNN RPC +static void ggml_qnn_add(ggml_backend_qnn_context* ctx, const ggml_tensor* src0, + const ggml_tensor* src1, ggml_tensor* dst) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + bool graph_initialized = false; + qnn::qnn_instance* instance = nullptr; + std::string graph_name = "ggml_op_qnn_add"; + Qnn_GraphHandle_t graph_handle = nullptr; + Qnn_Param_t qnn_params[] = {}; + enum ggml_op ggmlop = GGML_OP_ADD; + + CHECK_PARAMS(ctx, src0, src1, dst); + instance = ctx->instance; + auto qnn_raw_interface = ctx->raw_interface; + + qnn::qnn_perf perf("ggml_qnn_add"); + perf.start(); + + std::string map_entry = std::string(ggml_op_name(ggmlop)); + if (instance->_qnn_graph_map.find(map_entry) != + instance->_qnn_graph_map.end()) { + graph_initialized = true; + auto& graph_item = instance->_qnn_graph_map[map_entry]; + graph_handle = std::get<0>(graph_item); + } + + if (!graph_initialized) { + graph_name = graph_name + "_" + std::to_string(ctx->threads) + + "_" + src0->name + "_" + src1->name; + QNN_LOG_INFO("graph name %s", graph_name.c_str()); + if (ctx->device == QNN_BACKEND_NPU) { + QnnHtpGraph_CustomConfig_t hvx_config; + hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS; + hvx_config.numHvxThreads = 8; + QnnGraph_Config_t graph_hvx_config; + graph_hvx_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_hvx_config.customConfig = &hvx_config; + + QnnHtpGraph_CustomConfig_t dlbc_config; + dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; + dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC; + dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC + QnnGraph_Config_t graph_dlbc_config; + graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_dlbc_config.customConfig = &dlbc_config; + + QnnHtpGraph_CustomConfig_t opt_config; + opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; + opt_config.optimizationOption.floatValue = 1; // 1 / 3 + QnnGraph_Config_t graph_opt_config; + graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_opt_config.customConfig = &opt_config; + + QnnHtpGraph_CustomConfig_t vtcm_config; + vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE; + vtcm_config.vtcmSizeInMB = ctx->socinfo.vtcm_size_in_mb; + QnnGraph_Config_t graph_vtcm_config; + graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_vtcm_config.customConfig = &vtcm_config; + + const QnnGraph_Config_t* p_graphconfig[] = { &graph_hvx_config, + &graph_dlbc_config, + &graph_vtcm_config, + &graph_opt_config, + NULL }; + error = qnn_raw_interface.graphCreate( + instance->get_qnn_context_handle(), graph_name.c_str(), p_graphconfig, + &graph_handle); + } + else { + error = qnn_raw_interface.graphCreate( + instance->get_qnn_context_handle(), graph_name.c_str(), nullptr, + &graph_handle); + } + + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("can't create qnn graph handle with graph name %s, " + "error = %d\n", + graph_name.c_str(), error); + goto failure; + } + else { + QNN_LOG_INFO("create qnn graph handle with graph name %s ok\n", 
graph_name.c_str()); + } + + qnn::ggml_qnn_tensor_input tensor_input0(src0, graph_handle, ctx); + if (!tensor_input0.is_valid()) { + goto failure; + } + qnn::ggml_qnn_tensor_input tensor_input1(src1, graph_handle, ctx); + if (!tensor_input1.is_valid()) { + QNN_LOG_INFO("error = %d\n", error); + goto failure; + } + qnn::ggml_qnn_tensor_output tensor_output(dst, graph_handle, ctx); + if (!tensor_output.is_valid()) { + goto failure; + } + + Qnn_Tensor_t tensor_inputs[] = { *tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor() }; + Qnn_Tensor_t tensor_outputs[] = { *tensor_output.get_qnn_tensor() }; + Qnn_OpConfig_t op_config = { + (Qnn_OpConfigVersion_t)1, + .v1 = {"ggml_op_add", + QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_ELEMENT_WISE_ADD, + 0, qnn_params, + 2, tensor_inputs, + 1,tensor_outputs} + }; + error = qnn_raw_interface.graphAddNode(graph_handle, op_config); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + goto failure; + } + error = qnn_raw_interface.graphFinalize(graph_handle, + nullptr, nullptr); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + goto failure; + } + error = qnn_raw_interface.graphExecute(graph_handle, + tensor_inputs, 2, + tensor_outputs, 1, + nullptr, nullptr); + if (ctx->device == QNN_BACKEND_NPU) { + if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) { + QNN_LOG_WARN("NPU crashed. SSR detected. Caused QNN graph execute error\n"); + } + } + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + goto failure; + } + + auto graph_item = std::make_tuple(graph_handle, + tensor_input0.get_qnn_tensor(), + tensor_input1.get_qnn_tensor(), + tensor_output.get_qnn_tensor()); + instance->_qnn_graph_map[map_entry] = graph_item; + } + else { + auto& graph_item = instance->_qnn_graph_map[map_entry]; + qnn::ggml_qnn_tensor_input tensor_input0(src0, std::get<1>(graph_item), ctx); + qnn::ggml_qnn_tensor_input tensor_input1(src1, std::get<2>(graph_item), ctx); + qnn::ggml_qnn_tensor_output tensor_output(dst, std::get<3>(graph_item), ctx); + + Qnn_Tensor_t tensor_inputs[] = { *tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor() }; + Qnn_Tensor_t tensor_outputs[] = { *tensor_output.get_qnn_tensor() }; + error = qnn_raw_interface.graphExecute(graph_handle, + tensor_inputs, 2, + tensor_outputs, 1, + nullptr, nullptr); + if (ctx->device == QNN_BACKEND_NPU) { + if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) { + QNN_LOG_WARN("NPU crashed. SSR detected. 
Caused QNN graph execute error\n"); + } + } + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + goto failure; + } + } + +failure: + if (QNN_SUCCESS != error) { + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src0->name, src0->type, ggml_type_name(src0->type), + src0->ne[0], src0->ne[1], src0->ne[2], src0->nb[0], + src0->nb[1], src0->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src1->name, src1->type, ggml_type_name(src1->type), + src1->ne[0], src1->ne[1], src1->ne[2], src1->nb[0], + src1->nb[1], src1->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + dst->name, dst->type, ggml_type_name(dst->type), + dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], + dst->nb[1], dst->nb[2]); + } + + perf.info(); +} + +/* + * ggml_qnn_mul_mat was re-added as a standalone function because + * the following comments came from https://github.com/ggerganov/llama.cpp/pull/1632 + * MUL_MAT take most of the compute time (about 95%). + * So to speed up llama, we have to focus on MUL_MAT. + * + * We have three kinds of MUL_MAT to compute: + * mul_mat_f32: both src0 and src1 are F32. + * mul_mat_f16_f32: src0 is F16 and src1 is F32. + * mul_mat_q_f32: src0 is quantized (Q4_0, Q4_1, ...), and src1 is F32. + */ +static void ggml_qnn_mul_mat(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, const ggml_tensor* src1, + ggml_tensor* dst) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + bool graph_initialized = false; + qnn::qnn_instance* instance = nullptr; + std::string graph_name = "ggml_op_qnn_mul_mat"; + Qnn_GraphHandle_t graph_handle = nullptr; + Qnn_Param_t qnn_params[] = {}; + enum ggml_op ggmlop = GGML_OP_MUL_MAT; + + CHECK_PARAMS(ctx, src0, src1, dst); + instance = ctx->instance; + auto qnn_raw_interface = ctx->raw_interface; + + qnn::qnn_perf perf("ggml_qnn_mul_mat"); + perf.start(); + + std::string map_entry = std::string(ggml_op_name(ggmlop)); + if (instance->_qnn_graph_map.find(map_entry) != + instance->_qnn_graph_map.end()) { + graph_initialized = true; + auto& graph_item = instance->_qnn_graph_map[map_entry]; + graph_handle = std::get<0>(graph_item); + } + + //TODO: for scenarios of quantized data in src0 + // pass-1: dequantize src0 to FP32 + // pass-2: dq-src0 * src1 + // the performance gains is worth although there is performance loss in pass-1 + + if (!graph_initialized) { + graph_name = graph_name + "_" + std::to_string(ctx->threads) + + "_" + src0->name + "_" + src1->name; + QNN_LOG_INFO("graph name %s", graph_name.c_str()); + if (ctx->device == QNN_BACKEND_NPU) { + QnnHtpGraph_CustomConfig_t hvx_config; + hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS; + hvx_config.numHvxThreads = 8; + QnnGraph_Config_t graph_hvx_config; + graph_hvx_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_hvx_config.customConfig = &hvx_config; + + QnnHtpGraph_CustomConfig_t dlbc_config; + dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; + dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC; + dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC + QnnGraph_Config_t graph_dlbc_config; + graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_dlbc_config.customConfig = &dlbc_config; + + QnnHtpGraph_CustomConfig_t opt_config; + opt_config.optimizationOption.type = 
QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; + opt_config.optimizationOption.floatValue = 1; //1 / 3 + QnnGraph_Config_t graph_opt_config; + graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_opt_config.customConfig = &opt_config; + + QnnHtpGraph_CustomConfig_t vtcm_config; + vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE; + vtcm_config.vtcmSizeInMB = ctx->socinfo.vtcm_size_in_mb; + QnnGraph_Config_t graph_vtcm_config; + graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_vtcm_config.customConfig = &vtcm_config; + + const QnnGraph_Config_t* p_graphconfig[] = { &graph_hvx_config, + &graph_dlbc_config, + &graph_vtcm_config, + &graph_opt_config, + NULL }; + error = qnn_raw_interface.graphCreate( + instance->get_qnn_context_handle(), graph_name.c_str(), p_graphconfig, + &graph_handle); + } + else { + error = qnn_raw_interface.graphCreate( + instance->get_qnn_context_handle(), graph_name.c_str(), nullptr, + &graph_handle); + } + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("can't create qnn graph handle with graph name %s, " + "error = %d\n", + graph_name.c_str(), error); + goto failure; + } + + qnn::ggml_qnn_tensor_input tensor_input0(src0, graph_handle, ctx); + if (!tensor_input0.is_valid()) { + goto failure; + } + qnn::ggml_qnn_tensor_input tensor_input1(src1, graph_handle, ctx); + if (!tensor_input1.is_valid()) { + goto failure; + } + qnn::ggml_qnn_tensor_output tensor_output(dst, graph_handle, ctx); + if (!tensor_output.is_valid()) { + goto failure; + } + + Qnn_Tensor_t tensor_inputs[] = { *tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor() }; + Qnn_Tensor_t tensor_outputs[] = { *tensor_output.get_qnn_tensor() }; + Qnn_OpConfig_t op_config = { + (Qnn_OpConfigVersion_t)1, + .v1 = {"ggml_op_mul_mat", + QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_MAT_MUL, + 0, qnn_params, + 2, tensor_inputs, + 1, tensor_outputs} + }; + error = qnn_raw_interface.graphAddNode(graph_handle, op_config); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + goto failure; + } + error = qnn_raw_interface.graphFinalize(graph_handle, + nullptr, nullptr); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + goto failure; + } + error = qnn_raw_interface.graphExecute(graph_handle, + tensor_inputs, 2, + tensor_outputs, 1, + nullptr, nullptr); + if (ctx->device == QNN_BACKEND_NPU) { + if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) { + QNN_LOG_WARN("NPU crashed. SSR detected. 
Caused QNN graph execute error\n"); + } + } + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + goto failure; + } + + auto graph_item = std::make_tuple(graph_handle, + tensor_input0.get_qnn_tensor(), + tensor_input1.get_qnn_tensor(), + tensor_output.get_qnn_tensor()); + instance->_qnn_graph_map[map_entry] = graph_item; + } + else { + auto& graph_item = instance->_qnn_graph_map[map_entry]; + qnn::ggml_qnn_tensor_input tensor_input0(src0, std::get<1>(graph_item), ctx); + qnn::ggml_qnn_tensor_input tensor_input1(src1, std::get<2>(graph_item), ctx); + qnn::ggml_qnn_tensor_output tensor_output(dst, std::get<3>(graph_item), ctx); + + Qnn_Tensor_t tensor_inputs[] = { *tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor() }; + Qnn_Tensor_t tensor_outputs[] = { *tensor_output.get_qnn_tensor() }; + error = qnn_raw_interface.graphExecute(graph_handle, + tensor_inputs, 2, + tensor_outputs, 1, + nullptr, nullptr); + if (ctx->device == QNN_BACKEND_NPU) { + if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) { + QNN_LOG_WARN("NPU crashed. SSR detected. Caused QNN graph execute error\n"); + } + } + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + goto failure; + } + } + +failure: + if (QNN_SUCCESS != error) { + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src0->name, src0->type, ggml_type_name(src0->type), + src0->ne[0], src0->ne[1], src0->ne[2], src0->nb[0], + src0->nb[1], src0->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src1->name, src1->type, ggml_type_name(src1->type), + src1->ne[0], src1->ne[1], src1->ne[2], src1->nb[0], + src1->nb[1], src1->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + dst->name, dst->type, ggml_type_name(dst->type), dst->ne[0], + dst->ne[1], dst->ne[2], dst->nb[0], dst->nb[1], dst->nb[2]); + } + + perf.info(); +} + +static void ggml_qnn_repeat(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, const ggml_tensor* src1, + ggml_tensor* dst) { +} + +static void ggml_qnn_get_rows(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, const ggml_tensor* src1, + ggml_tensor* dst) { +} + +static void ggml_qnn_acc(ggml_backend_qnn_context* ctx, const ggml_tensor* src0, + const ggml_tensor* src1, ggml_tensor* dst) { +} + +static void ggml_qnn_div(ggml_backend_qnn_context* ctx, const ggml_tensor* src0, + const ggml_tensor* src1, ggml_tensor* dst) { +} + +static void ggml_qnn_gelu(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, const ggml_tensor* src1, + ggml_tensor* dst) { +} + +static void ggml_qnn_silu(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, const ggml_tensor* src1, + ggml_tensor* dst) { +} + +static void ggml_qnn_gelu_quick(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, + const ggml_tensor* src1, ggml_tensor* dst) { +} + +static void ggml_qnn_tanh(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, const ggml_tensor* src1, + ggml_tensor* dst) { +} + +static void ggml_qnn_relu(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, const ggml_tensor* src1, + ggml_tensor* dst) { +} + +static void ggml_qnn_hardsigmoid(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, + const ggml_tensor* src1, ggml_tensor* dst) { +} + +static void ggml_qnn_hardswish(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, const ggml_tensor* src1, + ggml_tensor* 
dst) { +} + +static void ggml_qnn_leaky_relu(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, + const ggml_tensor* src1, ggml_tensor* dst) { +} + +static void ggml_qnn_sqr(ggml_backend_qnn_context* ctx, const ggml_tensor* src0, + const ggml_tensor* src1, ggml_tensor* dst) { +} + +static void ggml_qnn_norm(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, const ggml_tensor* src1, + ggml_tensor* dst) { +} + +static void ggml_qnn_group_norm(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, + const ggml_tensor* src1, ggml_tensor* dst) { +} + +static void ggml_qnn_concat(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, const ggml_tensor* src1, + ggml_tensor* dst) { +} + +static void ggml_qnn_upscale(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, const ggml_tensor* src1, + ggml_tensor* dst) { +} + +static void ggml_qnn_pad(ggml_backend_qnn_context* ctx, const ggml_tensor* src0, + const ggml_tensor* src1, ggml_tensor* dst) { +} + +static void ggml_qnn_rms_norm(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, const ggml_tensor* src1, + ggml_tensor* dst) { +} + +static void ggml_qnn_cpy(ggml_backend_qnn_context* ctx, const ggml_tensor* src0, + const ggml_tensor* src1, ggml_tensor* dst) { +} + +static void ggml_qnn_dup(ggml_backend_qnn_context* ctx, const ggml_tensor* src0, + const ggml_tensor* src1, ggml_tensor* dst) { + ggml_qnn_cpy(ctx, src0, dst, nullptr); + (void)src1; +} + +static void ggml_qnn_mul_mat_id(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, + const ggml_tensor* src1, ggml_tensor* dst) { +} + +static void ggml_qnn_scale(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, const ggml_tensor* src1, + ggml_tensor* dst) { +} + +static void ggml_qnn_clamp(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, const ggml_tensor* src1, + ggml_tensor* dst) { +} + +static void ggml_qnn_diag_mask_inf(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, + const ggml_tensor* src1, ggml_tensor* dst) { +} + +static void ggml_qnn_soft_max(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, const ggml_tensor* src1, + ggml_tensor* dst) { +} + +static void ggml_qnn_rope(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, const ggml_tensor* src1, + ggml_tensor* dst) { + GGML_ASSERT(ggml_is_contiguous(src0)); +} + +static void ggml_qnn_pool2d(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, const ggml_tensor* src1, + ggml_tensor* dst) { +} + +static void ggml_qnn_im2col(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, const ggml_tensor* src1, + ggml_tensor* dst) { +} + +static void ggml_qnn_sum_rows(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, const ggml_tensor* src1, + ggml_tensor* dst) { + GGML_ASSERT(ggml_is_contiguous(src0)); +} + +static void ggml_qnn_argsort(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, const ggml_tensor* src1, + ggml_tensor* dst) { + GGML_ASSERT(ggml_is_contiguous(src0)); +} + +static void ggml_qnn_nop(ggml_backend_qnn_context* ctx, const ggml_tensor* src0, + const ggml_tensor* src1, ggml_tensor* dst) { + (void)src0; + (void)src1; + (void)dst; +} + +qnn::ggml_qnn_op_array_t qnn::ggml_qnn_op_array() { + static constexpr const qnn::ggml_qnn_op_t kQnnOpsTable[GGML_OP_COUNT] = { + nullptr, // GGML_OP_NONE + nullptr, // GGML_OP_DUP + ggml_qnn_add, // GGML_OP_ADD + nullptr, // GGML_OP_ADD1 + nullptr, // GGML_OP_ACC + nullptr, // GGML_OP_SUB + nullptr, // GGML_OP_MUL + nullptr, // GGML_OP_DIV + nullptr, // GGML_OP_SQR + nullptr, // GGML_OP_SQRT + nullptr, 
// GGML_OP_LOG + nullptr, // GGML_OP_SUM + nullptr, // GGML_OP_SUM_ROWS + nullptr, // GGML_OP_MEAN + nullptr, // GGML_OP_ARGMAX + nullptr, // GGML_OP_REPEAT + nullptr, // GGML_OP_REPEAT_BACK + nullptr, // GGML_OP_CONCAT + nullptr, // GGML_OP_SILU_BACK + nullptr, // GGML_OP_NORM + nullptr, // GGML_OP_RMS_NORM + nullptr, // GGML_OP_RMS_NORM_BACK + nullptr, // GGML_OP_GROUP_NORM + + ggml_qnn_mul_mat, // GGML_OP_MUL_MAT + nullptr, // GGML_OP_MUL_MAT_ID + nullptr, // GGML_OP_OUT_PROD + + nullptr, // GGML_OP_SCALE + nullptr, // GGML_OP_SET + nullptr, // GGML_OP_CPY + nullptr, // GGML_OP_CONT + nullptr, // GGML_OP_RESHAPE + nullptr, // GGML_OP_VIEW + nullptr, // GGML_OP_PERMUTE + nullptr, // GGML_OP_TRANSPOSE + nullptr, // GGML_OP_GET_ROWS + nullptr, // GGML_OP_GET_ROWS_BACK + nullptr, // GGML_OP_DIAG + nullptr, // GGML_OP_DIAG_MASK_INF + nullptr, // GGML_OP_DIAG_MASK_ZERO + nullptr, // GGML_OP_SOFT_MAX + nullptr, // GGML_OP_SOFT_MAX_BACK + nullptr, // GGML_OP_ROPE + nullptr, // GGML_OP_ROPE_BACK + nullptr, // GGML_OP_CLAMP + nullptr, // GGML_OP_CONV_TRANSPOSE_1D + nullptr, // GGML_OP_IM2COL + nullptr, // GGML_OP_CONV_TRANSPOSE_2D + nullptr, // GGML_OP_POOL_1D + nullptr, // GGML_OP_POOL_2D + nullptr, // GGML_OP_UPSCALE + nullptr, // GGML_OP_PAD + nullptr, // GGML_OP_ARANGE + nullptr, // GGML_OP_TIMESTEP_EMBEDDING + nullptr, // GGML_OP_ARGSORT + nullptr, // GGML_OP_LEAKY_RELU + + nullptr, // GGML_OP_FLASH_ATTN_EXT + nullptr, // GGML_OP_FLASH_ATTN_BACK + nullptr, // GGML_OP_SSM_CONV + nullptr, // GGML_OP_SSM_SCAN + nullptr, // GGML_OP_WIN_PART + nullptr, // GGML_OP_WIN_UNPART + nullptr, // GGML_OP_GET_REL_POS + nullptr, // GGML_OP_ADD_REL_POS + + nullptr, // GGML_OP_UNARY + + nullptr, // GGML_OP_MAP_UNARY + nullptr, // GGML_OP_MAP_BINARY + + nullptr, // GGML_OP_MAP_CUSTOM1_F32 + nullptr, // GGML_OP_MAP_CUSTOM2_F32 + nullptr, // GGML_OP_MAP_CUSTOM3_F32 + + nullptr, // GGML_OP_MAP_CUSTOM1 + nullptr, // GGML_OP_MAP_CUSTOM2 + nullptr, // GGML_OP_MAP_CUSTOM3 + + nullptr, // GGML_OP_CROSS_ENTROPY_LOSS + nullptr, // GGML_OP_CROSS_ENTROPY_LOSS_BACK + }; + + return kQnnOpsTable; +} diff --git a/ggml-qnn/backend-ops.hpp b/ggml-qnn/backend-ops.hpp new file mode 100644 index 0000000000000..c3dd5de302289 --- /dev/null +++ b/ggml-qnn/backend-ops.hpp @@ -0,0 +1,17 @@ +#pragma once + +#include "ggml.h" +#include "backend.hpp" + +namespace qnn { + + typedef void (*ggml_qnn_op_t)(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, + const ggml_tensor* src1, + ggml_tensor* dst); + + typedef const ggml_qnn_op_t(&ggml_qnn_op_array_t)[GGML_OP_COUNT]; + + ggml_qnn_op_array_t ggml_qnn_op_array(); + +} diff --git a/ggml-qnn/backend.hpp b/ggml-qnn/backend.hpp index b5aacf57c1aa0..dc40090ee6114 100644 --- a/ggml-qnn/backend.hpp +++ b/ggml-qnn/backend.hpp @@ -1,11 +1,6 @@ #pragma once -#include "QnnTypes.h" -#include "QnnCommon.h" -#include "QnnContext.h" -#include "QnnBackend.h" - #include "ggml.h" #include "ggml-backend.h" diff --git a/ggml-qnn/qnn.hpp b/ggml-qnn/qnn.hpp index 212b6f8521745..6caefb75644f7 100644 --- a/ggml-qnn/qnn.hpp +++ b/ggml-qnn/qnn.hpp @@ -1,21 +1,27 @@ #pragma once +#include #include +#include +#include +#include // header file of Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK // https://qpm.qualcomm.com/#/main/tools/details/qualcomm_ai_engine_direct #include "QnnTypes.h" #include "QnnCommon.h" +#include "QnnInterface.h" #include "QnnContext.h" #include "QnnBackend.h" #include "QnnGraph.h" #include "QnnProperty.h" #include "QnnTensor.h" +#include 
"System/QnnSystemInterface.h" #include "HTP/QnnHtpDevice.h" #include "HTP/QnnHtpGraph.h" +#include "qnn-types.hpp" #include "utils.hpp" -#include "logger.hpp" namespace qnn { @@ -864,9 +870,8 @@ namespace qnn { const qnn::qcom_socinfo& get_soc_info() { return _soc_info; } public: - std::map> - _qnn_graph_map; + std::map> _qnn_graph_map; private: int load_system() { diff --git a/ggml-qnn/tensor.hpp b/ggml-qnn/tensor.hpp index 687ebd8905ef4..de0d1dc2dbbef 100644 --- a/ggml-qnn/tensor.hpp +++ b/ggml-qnn/tensor.hpp @@ -4,6 +4,7 @@ #include "QnnTensor.h" #include "System/QnnSystemInterface.h" +#include "ggml-qnn.h" #include "backend.hpp" #include "qnn.hpp" diff --git a/ggml-qnn/utils.cpp b/ggml-qnn/utils.cpp new file mode 100644 index 0000000000000..798445c02fd76 --- /dev/null +++ b/ggml-qnn/utils.cpp @@ -0,0 +1,126 @@ + +#include "utils.hpp" + +#include "ggml-qnn.h" +#include "qnn-types.hpp" + +namespace qnn { + + // TODO: mapping more ggml data type to QNN data type + // ref:explanation of k-quants, https://github.com/ggerganov/llama.cpp/pull/1684 + Qnn_DataType_t datatype_from_ggml_datatype(enum ggml_type ggmltype) { + switch (ggmltype) { + case GGML_TYPE_F16: + return QNN_DATATYPE_FLOAT_16; + case GGML_TYPE_F32: + return QNN_DATATYPE_FLOAT_32; + case GGML_TYPE_I8: + return QNN_DATATYPE_INT_8; + case GGML_TYPE_Q8_0: + return QNN_DATATYPE_SFIXED_POINT_8; + case GGML_TYPE_Q4_0: + return QNN_DATATYPE_SFIXED_POINT_4; + default: + break; + } + return QNN_DATATYPE_UNDEFINED; + } + + + uint32_t get_ggml_tensor_rank(const ggml_tensor* tensor) { + uint32_t rank = 0; + for (int i = 0; i < GGML_MAX_DIMS; i++) { + if ((0 != tensor->ne[i]) && (1 != tensor->ne[i])) { + rank++; + } + } + return rank; + } + + + const char* get_backend_name(int n_backend_type) { + switch (n_backend_type) { + case QNN_BACKEND_CPU: + return "QNN-CPU"; + case QNN_BACKEND_GPU: + return "QNN-GPU"; + case QNN_BACKEND_NPU: + return "QNN-NPU"; + case QNN_BACKEND_GGML: + return "ggml"; //"fake" QNN backend, used for compare performance between QNN backend and original GGML + default: + return "unknown"; + } + } + + const char* get_chipset_desc(uint32_t chipset_id) { + switch (chipset_id) { + case SM8450: + return "SM8450"; + case SM8475: + return "SM8475"; + case SM8550: + return "SM8550"; + case SM8650: + return "SM8650"; + default: + return "unknown"; + } + } + + const char* get_htparch_desc(size_t htp_arch) { + switch (htp_arch) { + case V68: + return "QCOM_HTP_V68"; + case V69: + return "QCOM_HTP_V69"; + case V73: + return "QCOM_HTP_V73"; + case V75: + return "QCOM_HTP_V75"; + default: + return "unknown"; + } + } + + intptr_t align_to(size_t alignment, intptr_t offset) { + return offset % alignment == 0 + ? 
offset + : offset + (static_cast(alignment) - + offset % static_cast(alignment)); + } + + uint32_t get_ggml_tensor_data_size(const ggml_tensor* tensor) { + /* + size_t data_size = ggml_row_size(tensor->type, tensor->ne[0]); + size_t n_dims = qnn_get_ggml_tensor_rank(tensor); + for (int i = 1; i < n_dims; i++) { + data_size *= tensor->ne[i]; + } + + return data_size; + */ + return ggml_nbytes(tensor); + } + + // ================================================================================================= + // + // QNN backend internal helper functions + // + // ================================================================================================= + // TODO: only support GGML_OP_ADD/GGML_OP_MUL/GGML_OP_MUL_MAT + const char* opname_from_ggmlop(enum ggml_op ggmlop) { + switch (ggmlop) { + case GGML_OP_ADD: + return QNN_OP_ELEMENT_WISE_ADD; + case GGML_OP_MUL: + return QNN_OP_ELEMENT_WISE_MULTIPLY; + case GGML_OP_MUL_MAT: + return QNN_OP_MAT_MUL; + default: + break; + } + return nullptr; + } + +} diff --git a/ggml-qnn/utils.hpp b/ggml-qnn/utils.hpp index 2ec7c0f13f0ce..4889c6dc8601c 100644 --- a/ggml-qnn/utils.hpp +++ b/ggml-qnn/utils.hpp @@ -1,135 +1,34 @@ #pragma once +#include +#include +#include +#include +#include +#include + #include "QnnTypes.h" #include "ggml.h" -#include "qnn-types.hpp" +#include "logger.hpp" namespace qnn { - // TODO: mapping more ggml data type to QNN data type - // ref:explanation of k-quants, https://github.com/ggerganov/llama.cpp/pull/1684 - Qnn_DataType_t datatype_from_ggml_datatype(enum ggml_type ggmltype) { - switch (ggmltype) { - case GGML_TYPE_F16: - return QNN_DATATYPE_FLOAT_16; - case GGML_TYPE_F32: - return QNN_DATATYPE_FLOAT_32; - case GGML_TYPE_I8: - return QNN_DATATYPE_INT_8; - case GGML_TYPE_Q8_0: - return QNN_DATATYPE_SFIXED_POINT_8; - case GGML_TYPE_Q4_0: - return QNN_DATATYPE_SFIXED_POINT_4; - default: - break; - } - return QNN_DATATYPE_UNDEFINED; - } - - - uint32_t get_ggml_tensor_rank(const ggml_tensor* tensor) { - uint32_t rank = 0; - for (int i = 0; i < GGML_MAX_DIMS; i++) { - if ((0 != tensor->ne[i]) && (1 != tensor->ne[i])) { - rank++; - } - } - return rank; - } + Qnn_DataType_t datatype_from_ggml_datatype(enum ggml_type ggmltype); + uint32_t get_ggml_tensor_rank(const ggml_tensor* tensor); + const char* get_backend_name(int n_backend_type); + const char* get_chipset_desc(uint32_t chipset_id); + const char* get_htparch_desc(size_t htp_arch); + intptr_t align_to(size_t alignment, intptr_t offset); + uint32_t get_ggml_tensor_data_size(const ggml_tensor* tensor); - - const char* get_backend_name(int n_backend_type) { - switch (n_backend_type) { - case QNN_BACKEND_CPU: - return "QNN-CPU"; - case QNN_BACKEND_GPU: - return "QNN-GPU"; - case QNN_BACKEND_NPU: - return "QNN-NPU"; - case QNN_BACKEND_GGML: - return "ggml"; //"fake" QNN backend, used for compare performance between QNN backend and original GGML - default: - return "unknown"; - } - } - - const char* get_chipset_desc(uint32_t chipset_id) { - switch (chipset_id) { - case SM8450: - return "SM8450"; - case SM8475: - return "SM8475"; - case SM8550: - return "SM8550"; - case SM8650: - return "SM8650"; - default: - return "unknown"; - } - } - - const char* get_htparch_desc(size_t htp_arch) { - switch (htp_arch) { - case V68: - return "QCOM_HTP_V68"; - case V69: - return "QCOM_HTP_V69"; - case V73: - return "QCOM_HTP_V73"; - case V75: - return "QCOM_HTP_V75"; - default: - return "unknown"; - } - } + const char* opname_from_ggmlop(enum ggml_op ggmlop); template Fn 
load_qnn_functionpointers(void* handle, const char* function_name) { return reinterpret_cast(dlsym(handle, function_name)); } - intptr_t align_to(size_t alignment, intptr_t offset) { - return offset % alignment == 0 - ? offset - : offset + (static_cast(alignment) - - offset % static_cast(alignment)); - } - - uint32_t get_ggml_tensor_data_size(const ggml_tensor* tensor) { - /* - size_t data_size = ggml_row_size(tensor->type, tensor->ne[0]); - size_t n_dims = qnn_get_ggml_tensor_rank(tensor); - for (int i = 1; i < n_dims; i++) { - data_size *= tensor->ne[i]; - } - - return data_size; - */ - return ggml_nbytes(tensor); - } - - - // ================================================================================================= - // - // QNN backend internal helper functions - // - // ================================================================================================= - // TODO: only support GGML_OP_ADD/GGML_OP_MUL/GGML_OP_MUL_MAT - const char* opname_from_ggmlop(enum ggml_op ggmlop) { - switch (ggmlop) { - case GGML_OP_ADD: - return QNN_OP_ELEMENT_WISE_ADD; - case GGML_OP_MUL: - return QNN_OP_ELEMENT_WISE_MULTIPLY; - case GGML_OP_MUL_MAT: - return QNN_OP_MAT_MUL; - default: - break; - } - return nullptr; - } - inline int validate_tensor_version(Qnn_Tensor_t tensor) { if (tensor.version != QNN_TENSOR_VERSION_1) { QNN_LOG_WARN( @@ -272,6 +171,45 @@ namespace qnn { tensor.v1.memHandle = handle; } } + + +#if ENABLE_QNNBACKEND_PERF + class qnn_perf { + public: + qnn_perf(const std::string& perf_name) : _perf_name(std::move(perf_name)) {}; + qnn_perf() = delete; + qnn_perf(const qnn_perf&) = delete; + qnn_perf& operator= (const qnn_perf&) = delete; + + void start() { + _begin_time = ggml_time_us(); + } + + void info() { + _end_time = ggml_time_us(); + _duration = (_end_time - _begin_time); + QNN_LOG_INFO("duration of %s : %lld microseconds\n", _perf_name.c_str(), _duration); + } + + private: + int64_t _begin_time = 0LL; + int64_t _end_time = 0LL; + int64_t _duration = 0LL; + std::string _perf_name; + }; +#else + class qnn_perf { + public: + qnn_perf(const std::string& perf_name) {} + qnn_perf() = delete; + qnn_perf(const qnn_perf&) = delete; + qnn_perf& operator= (const qnn_perf&) = delete; + + void start() {} + void info() {} + }; +#endif + } diff --git a/tests/ggml-qnn/CMakeLists.txt b/tests/ggml-qnn/CMakeLists.txt index 77a2059ed0f0c..66e8c077a1d3a 100644 --- a/tests/ggml-qnn/CMakeLists.txt +++ b/tests/ggml-qnn/CMakeLists.txt @@ -21,6 +21,8 @@ set(SOURCE_FILES ../../ggml-backend.c ../../ggml-quants.c ../../ggml-qnn/logger.cpp + ../../ggml-qnn/utils.cpp + ../../ggml-qnn/backend-ops.cpp ../../ggml-qnn.cpp ggml-qnn-ut.cpp ) From 8b677d1b2facc248409b3356a18198437add807d Mon Sep 17 00:00:00 2001 From: hongruichen Date: Tue, 2 Jul 2024 10:40:07 +0800 Subject: [PATCH 034/143] move qnn backend into sub folder --- ggml-qnn.h => ggml/include/ggml-qnn.h | 0 ggml-qnn.cpp => ggml/src/ggml-qnn.cpp | 0 .../src/ggml-qnn}/backend-ops.cpp | 0 .../src/ggml-qnn}/backend-ops.hpp | 0 {ggml-qnn => ggml/src/ggml-qnn}/backend.hpp | 0 {ggml-qnn => ggml/src/ggml-qnn}/logger.cpp | 0 {ggml-qnn => ggml/src/ggml-qnn}/logger.hpp | 0 {ggml-qnn => ggml/src/ggml-qnn}/qnn-types.hpp | 0 {ggml-qnn => ggml/src/ggml-qnn}/qnn.hpp | 0 {ggml-qnn => ggml/src/ggml-qnn}/tensor.hpp | 0 {ggml-qnn => ggml/src/ggml-qnn}/utils.cpp | 0 {ggml-qnn => ggml/src/ggml-qnn}/utils.hpp | 0 tests/ggml-qnn/CMakeLists.txt | 40 +++++++++---------- 13 files changed, 19 insertions(+), 21 deletions(-) rename ggml-qnn.h => ggml/include/ggml-qnn.h 
(100%) rename ggml-qnn.cpp => ggml/src/ggml-qnn.cpp (100%) rename {ggml-qnn => ggml/src/ggml-qnn}/backend-ops.cpp (100%) rename {ggml-qnn => ggml/src/ggml-qnn}/backend-ops.hpp (100%) rename {ggml-qnn => ggml/src/ggml-qnn}/backend.hpp (100%) rename {ggml-qnn => ggml/src/ggml-qnn}/logger.cpp (100%) rename {ggml-qnn => ggml/src/ggml-qnn}/logger.hpp (100%) rename {ggml-qnn => ggml/src/ggml-qnn}/qnn-types.hpp (100%) rename {ggml-qnn => ggml/src/ggml-qnn}/qnn.hpp (100%) rename {ggml-qnn => ggml/src/ggml-qnn}/tensor.hpp (100%) rename {ggml-qnn => ggml/src/ggml-qnn}/utils.cpp (100%) rename {ggml-qnn => ggml/src/ggml-qnn}/utils.hpp (100%) diff --git a/ggml-qnn.h b/ggml/include/ggml-qnn.h similarity index 100% rename from ggml-qnn.h rename to ggml/include/ggml-qnn.h diff --git a/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp similarity index 100% rename from ggml-qnn.cpp rename to ggml/src/ggml-qnn.cpp diff --git a/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp similarity index 100% rename from ggml-qnn/backend-ops.cpp rename to ggml/src/ggml-qnn/backend-ops.cpp diff --git a/ggml-qnn/backend-ops.hpp b/ggml/src/ggml-qnn/backend-ops.hpp similarity index 100% rename from ggml-qnn/backend-ops.hpp rename to ggml/src/ggml-qnn/backend-ops.hpp diff --git a/ggml-qnn/backend.hpp b/ggml/src/ggml-qnn/backend.hpp similarity index 100% rename from ggml-qnn/backend.hpp rename to ggml/src/ggml-qnn/backend.hpp diff --git a/ggml-qnn/logger.cpp b/ggml/src/ggml-qnn/logger.cpp similarity index 100% rename from ggml-qnn/logger.cpp rename to ggml/src/ggml-qnn/logger.cpp diff --git a/ggml-qnn/logger.hpp b/ggml/src/ggml-qnn/logger.hpp similarity index 100% rename from ggml-qnn/logger.hpp rename to ggml/src/ggml-qnn/logger.hpp diff --git a/ggml-qnn/qnn-types.hpp b/ggml/src/ggml-qnn/qnn-types.hpp similarity index 100% rename from ggml-qnn/qnn-types.hpp rename to ggml/src/ggml-qnn/qnn-types.hpp diff --git a/ggml-qnn/qnn.hpp b/ggml/src/ggml-qnn/qnn.hpp similarity index 100% rename from ggml-qnn/qnn.hpp rename to ggml/src/ggml-qnn/qnn.hpp diff --git a/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp similarity index 100% rename from ggml-qnn/tensor.hpp rename to ggml/src/ggml-qnn/tensor.hpp diff --git a/ggml-qnn/utils.cpp b/ggml/src/ggml-qnn/utils.cpp similarity index 100% rename from ggml-qnn/utils.cpp rename to ggml/src/ggml-qnn/utils.cpp diff --git a/ggml-qnn/utils.hpp b/ggml/src/ggml-qnn/utils.hpp similarity index 100% rename from ggml-qnn/utils.hpp rename to ggml/src/ggml-qnn/utils.hpp diff --git a/tests/ggml-qnn/CMakeLists.txt b/tests/ggml-qnn/CMakeLists.txt index 66e8c077a1d3a..b4f1bd6c07482 100644 --- a/tests/ggml-qnn/CMakeLists.txt +++ b/tests/ggml-qnn/CMakeLists.txt @@ -13,18 +13,18 @@ set(QNN_INC_PATH ${QNN_SDK_PATH}/include/QNN) set(QNN_LIB_PATH ${QNN_SDK_PATH}/lib/aarch64-android) include_directories(${QNN_INC_PATH}) -include_directories(../../) # ggml.h +include_directories(../../ggml/include) # ggml.h, ggml-qnn.h set(SOURCE_FILES - ../../ggml.c - ../../ggml-alloc.c - ../../ggml-backend.c - ../../ggml-quants.c - ../../ggml-qnn/logger.cpp - ../../ggml-qnn/utils.cpp - ../../ggml-qnn/backend-ops.cpp - ../../ggml-qnn.cpp - ggml-qnn-ut.cpp + ../../ggml/src/ggml.c + ../../ggml/src/ggml-alloc.c + ../../ggml/src/ggml-backend.c + ../../ggml/src/ggml-quants.c + ../../ggml/src/ggml-qnn/logger.cpp + ../../ggml/src/ggml-qnn/utils.cpp + ../../ggml/src/ggml-qnn/backend-ops.cpp + ../../ggml/src/ggml-qnn.cpp + ggml-qnn-ut.cpp ) @@ -36,22 +36,20 @@ add_definitions(-D__ARM_NEON) add_definitions(-DGGML_USE_QNN) 
if(CMAKE_BUILD_TYPE STREQUAL "Release") -add_definitions(-DNDEBUG) -add_definitions(-O3) + add_definitions(-DNDEBUG) + add_definitions(-O3) else() -add_definitions(-O3) + add_definitions(-O3) endif() if (TARGET_SNAPDRAGON_8_GEN3) -# the below build optimization only verified and works well on Qualcomm SM8650-AB Snapdragon 8 Gen 3 -add_definitions(-march=armv8.7-a) -add_definitions(-mcpu=cortex-x1) -add_definitions(-mtune=cortex-x1) - + # the below build optimization only verified and works well on Qualcomm SM8650-AB Snapdragon 8 Gen 3 + add_definitions(-march=armv8.7-a) + add_definitions(-mcpu=cortex-x1) + add_definitions(-mtune=cortex-x1) else() -# the below build optimization might be works well on ALL Android phone equipped with Qualcomm mainstream mobile SoC -add_definitions(-mcpu=cortex-a72) - + # the below build optimization might be works well on ALL Android phone equipped with Qualcomm mainstream mobile SoC + add_definitions(-mcpu=cortex-a72) endif() add_compile_options("-Wall" "-Wno-sign-compare") From 38f88d5fb15eed11265bba11ddbd85e36ebffaa1 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Tue, 2 Jul 2024 19:46:17 +0800 Subject: [PATCH 035/143] fix compiling error after merge latest master --- ggml/src/ggml-qnn.cpp | 21 +++++---------------- ggml/src/ggml-qnn/backend-ops.cpp | 24 ++++++++++-------------- 2 files changed, 15 insertions(+), 30 deletions(-) diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index 750d5ff91c3d3..e5fc00045beb3 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -321,9 +321,7 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context * ctx, return true; } -bool ggml_qnn_compute_forward(ggml_backend_qnn_context * ctx, - struct ggml_compute_params * params, - struct ggml_tensor * tensor) { +bool ggml_qnn_compute_forward(ggml_backend_qnn_context * ctx, struct ggml_tensor * tensor) { auto func = qnn::ggml_qnn_op_array()[tensor->op]; if (!func) { QNN_LOG_WARN("unsupported op %d", tensor->op); @@ -515,13 +513,6 @@ GGML_CALL static size_t ggml_backend_qnn_buffer_type_get_max_size(ggml_backend_b return (96 * 1024 * 1024); } -GGML_CALL static bool ggml_backend_qnn_buffer_type_supports_backend( - ggml_backend_buffer_type_t buft, ggml_backend_t backend) { - GGML_UNUSED(buft); - - return ggml_backend_is_qnn(backend) || ggml_backend_is_cpu(backend); -} - GGML_CALL static bool ggml_backend_qnn_buffer_is_host(ggml_backend_buffer_type_t buft) { GGML_UNUSED(buft); return true; @@ -574,9 +565,6 @@ GGML_CALL static ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backe ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; GGML_UNUSED(ctx); - ggml_compute_params params = {}; - params.type = GGML_TASK_TYPE_COMPUTE; - params.ith = 0; for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor * node = cgraph->nodes[i]; if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || @@ -584,7 +572,7 @@ GGML_CALL static ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backe node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { continue; } - bool ok = ggml_qnn_compute_forward(ctx, ¶ms, node); + bool ok = ggml_qnn_compute_forward(ctx, node); if (!ok) { QNN_LOG_DEBUG("error: op not supported %s (%s)\n", node->name, ggml_op_name(node->op)); } @@ -616,9 +604,11 @@ static ggml_backend_i ggml_backend_qnn_interface = { /* .synchronize = */ nullptr, /* .graph_plan_create = */ nullptr, /* .graph_plan_free = */ nullptr, + /* .graph_plan_update = */ nullptr, /* .graph_plan_compute = */ nullptr, /* .graph_compute = */ 
ggml_backend_qnn_graph_compute, /* .supports_op = */ ggml_backend_qnn_supports_op, + /* .supports_buft = */ nullptr, /* .offload_op = */ ggml_backend_qnn_offload_op, /* .event_new = */ nullptr, /* .event_free = */ nullptr, @@ -702,10 +692,9 @@ ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t device) { /* .get_alignment = */ ggml_backend_qnn_buffer_type_get_alignment, /* .get_max_size = */ ggml_backend_qnn_buffer_type_get_max_size, /* .get_alloc_size = */ nullptr, // defaults to ggml_nbytes - /* .supports_backend = */ ggml_backend_qnn_buffer_type_supports_backend, /* .is_host = */ ggml_backend_qnn_buffer_is_host }, - /* .context = */ & context, + /* .context = */ &context, }; } ggml_backend_qnn_buffer_type_initialized = true; diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index a9c94a6df3102..f1fe699ab653d 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -8,21 +8,17 @@ static bool qnn_is_valid_params(ggml_backend_qnn_context* ctx, const ggml_tensor* src0, const ggml_tensor* src1, ggml_tensor* dst) { - if ((nullptr == ctx) || (nullptr == src0) || (nullptr == src1) || (nullptr == dst)) { + if (!ctx || !src0 || !src1 || !dst) { QNN_LOG_WARN("invalid params\n"); return false; } - qnn::qnn_instance* instance = nullptr; - Qnn_Tensor_t* tensor_0 = nullptr; - Qnn_Tensor_t* tensor_1 = nullptr; - Qnn_Tensor_t* tensor_2 = nullptr; - tensor_0 = (Qnn_Tensor_t*)src0->extra; - tensor_1 = (Qnn_Tensor_t*)src1->extra; - tensor_2 = (Qnn_Tensor_t*)dst->extra; - instance = ctx->instance; - if ((nullptr == instance) || (nullptr == tensor_0) || (nullptr == tensor_1) || (nullptr == tensor_2)) { - QNN_LOG_WARN("invalid params\n"); + auto* instance = ctx->instance; + auto* tensor0 = src0->extra; + auto* tensor1 = src1->extra; + auto* tensor2 = dst->extra; + if (!instance || !tensor0 || !tensor1 || !tensor2) { + QNN_LOG_WARN("invalid tensors\n"); return false; } @@ -60,7 +56,7 @@ static void ggml_qnn_add(ggml_backend_qnn_context* ctx, const ggml_tensor* src0, qnn::qnn_perf perf("ggml_qnn_add"); perf.start(); - std::string map_entry = std::string(ggml_op_name(ggmlop)); + std::string map_entry(ggml_op_name(ggmlop)); if (instance->_qnn_graph_map.find(map_entry) != instance->_qnn_graph_map.end()) { graph_initialized = true; @@ -141,8 +137,8 @@ static void ggml_qnn_add(ggml_backend_qnn_context* ctx, const ggml_tensor* src0, goto failure; } - Qnn_Tensor_t tensor_inputs[] = { *tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor() }; - Qnn_Tensor_t tensor_outputs[] = { *tensor_output.get_qnn_tensor() }; + Qnn_Tensor_t tensor_inputs[] = { *tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor() }; + Qnn_Tensor_t tensor_outputs[] = { *tensor_output.get_qnn_tensor() }; Qnn_OpConfig_t op_config = { (Qnn_OpConfigVersion_t)1, .v1 = {"ggml_op_add", From 000240cf6273d02e91ba38ba7873d6151368ec6c Mon Sep 17 00:00:00 2001 From: hongruichen Date: Thu, 4 Jul 2024 22:18:45 +0800 Subject: [PATCH 036/143] add clang format file and reformating --- ggml/include/ggml-qnn.h | 39 +- ggml/src/ggml-qnn/.clang-format | 31 + ggml/src/ggml-qnn/backend-ops.cpp | 500 ++++----- ggml/src/ggml-qnn/backend-ops.hpp | 13 +- ggml/src/ggml-qnn/backend.hpp | 5 +- ggml/src/ggml-qnn/logger.cpp | 63 +- ggml/src/ggml-qnn/logger.hpp | 36 +- ggml/src/ggml-qnn/qnn-types.hpp | 96 +- ggml/src/ggml-qnn/qnn.hpp | 1666 +++++++++++++---------------- ggml/src/ggml-qnn/tensor.hpp | 225 ++-- ggml/src/ggml-qnn/utils.cpp | 102 +- ggml/src/ggml-qnn/utils.hpp | 357 +++---- 
12 files changed, 1419 insertions(+), 1714 deletions(-) create mode 100644 ggml/src/ggml-qnn/.clang-format diff --git a/ggml/include/ggml-qnn.h b/ggml/include/ggml-qnn.h index 9ea3dcda62c64..60aaf22179647 100644 --- a/ggml/include/ggml-qnn.h +++ b/ggml/include/ggml-qnn.h @@ -1,41 +1,48 @@ #pragma once -#include "ggml.h" #include "ggml-backend.h" +#include "ggml.h" #ifdef __cplusplus extern "C" { #endif - -#define GGML_QNN_MAX_DEVICES 3 +#define GGML_QNN_MAX_DEVICES 3 enum QNNBackend { - QNN_BACKEND_CPU, - QNN_BACKEND_GPU, - QNN_BACKEND_NPU, - QNN_BACKEND_GGML, //"fake" QNN backend, used for compare performance between QNN and original GGML + QNN_BACKEND_CPU, + QNN_BACKEND_GPU, + QNN_BACKEND_NPU, + QNN_BACKEND_GGML, //"fake" QNN backend, used for compare performance between + // QNN and original GGML }; -GGML_API int ggml_backend_qnn_reg_devices(void); +GGML_API int ggml_backend_qnn_reg_devices(void); /** * - * @param device 0: QNN_BACKEND_CPU 1: QNN_BACKEND_GPU 2: QNN_BACKEND_NPU - * @param qnn_lib_path qnn library path, such as "/data/local/tmp/" on Android or specified in JNI layer + * @param device 0: QNN_BACKEND_CPU 1: QNN_BACKEND_GPU 2: + * QNN_BACKEND_NPU + * @param qnn_lib_path qnn library path, such as "/data/local/tmp/" on + * Android or specified in JNI layer * @return */ -GGML_API ggml_backend_t ggml_backend_qnn_init(size_t dev_num, const char * qnn_lib_path); +GGML_API ggml_backend_t ggml_backend_qnn_init(size_t dev_num, + const char* qnn_lib_path); -GGML_API bool ggml_backend_is_qnn(ggml_backend_t backend); +GGML_API bool ggml_backend_is_qnn(ggml_backend_t backend); -GGML_API void ggml_backend_qnn_set_n_threads(ggml_backend_t backend, int thread_counts); +GGML_API void ggml_backend_qnn_set_n_threads(ggml_backend_t backend, + int thread_counts); -GGML_API int ggml_backend_qnn_get_device_count(void); +GGML_API int ggml_backend_qnn_get_device_count(void); -GGML_API void ggml_backend_qnn_get_device_description(size_t dev_num, char * description, size_t description_size); +GGML_API void ggml_backend_qnn_get_device_description(size_t dev_num, + char* description, + size_t description_size); -GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t dev_num); +GGML_API GGML_CALL ggml_backend_buffer_type_t +ggml_backend_qnn_buffer_type(size_t dev_num); #ifdef __cplusplus } diff --git a/ggml/src/ggml-qnn/.clang-format b/ggml/src/ggml-qnn/.clang-format new file mode 100644 index 0000000000000..3b933ff10db42 --- /dev/null +++ b/ggml/src/ggml-qnn/.clang-format @@ -0,0 +1,31 @@ +--- +BasedOnStyle: Google +IndentWidth: 4 +AccessModifierOffset: -4 +AlignAfterOpenBracket: Align +AlignOperands: true +AlignTrailingComments: true +BinPackArguments: true +BinPackParameters: true +BreakBeforeBraces: Custom +BreakConstructorInitializers: AfterColon +ColumnLimit: 120 +Cpp11BracedListStyle: false +DerivePointerAlignment: false +IncludeCategories: + - Regex: '^<.*\.h>' + Priority: 1 + - Regex: '^<.*' + Priority: 2 + - Regex: '^"ggml\.h"' + Priority: 3 + - Regex: '^"ggml-.+\.h"' + Priority: 4 + - Regex: '.*' + Priority: 5 +KeepEmptyLinesAtTheStartOfBlocks: true +MaxEmptyLinesToKeep: 1 +PointerAlignment: Right +SortIncludes: true +SpacesBeforeTrailingComments: 1 +UseTab: Never \ No newline at end of file diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index f1fe699ab653d..cde1bd248cc29 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -1,22 +1,21 @@ #include "backend-ops.hpp" -#include "utils.hpp" #include 
"logger.hpp" #include "tensor.hpp" +#include "utils.hpp" - -static bool qnn_is_valid_params(ggml_backend_qnn_context* ctx, const ggml_tensor* src0, - const ggml_tensor* src1, ggml_tensor* dst) { +static bool qnn_is_valid_params(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) { if (!ctx || !src0 || !src1 || !dst) { QNN_LOG_WARN("invalid params\n"); return false; } - auto* instance = ctx->instance; - auto* tensor0 = src0->extra; - auto* tensor1 = src1->extra; - auto* tensor2 = dst->extra; + auto *instance = ctx->instance; + auto *tensor0 = src0->extra; + auto *tensor1 = src1->extra; + auto *tensor2 = dst->extra; if (!instance || !tensor0 || !tensor1 || !tensor2) { QNN_LOG_WARN("invalid tensors\n"); return false; @@ -26,28 +25,28 @@ static bool qnn_is_valid_params(ggml_backend_qnn_context* ctx, const ggml_tensor } #ifndef NDEBUG -#define CHECK_PARAMS(ctx, src0, src1, dst) \ - do { \ - if (!qnn_is_valid_params((ctx), (src0), (src1), (dst))) { \ - return; \ - } \ +#define CHECK_PARAMS(ctx, src0, src1, dst) \ + do { \ + if (!qnn_is_valid_params((ctx), (src0), (src1), (dst))) { \ + return; \ + } \ } while (0) #else #define CHECK_PARAMS(ctx, src0, src1, dst) #endif -//TODO: this function can be removed later because there are duplicated codes with ggml_qnn_mul_mat -// keep it for illustrate how to implement a specified GGMPL OP using QNN API + QNN RPC -static void ggml_qnn_add(ggml_backend_qnn_context* ctx, const ggml_tensor* src0, - const ggml_tensor* src1, ggml_tensor* dst) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - bool graph_initialized = false; - qnn::qnn_instance* instance = nullptr; - std::string graph_name = "ggml_op_qnn_add"; - Qnn_GraphHandle_t graph_handle = nullptr; - Qnn_Param_t qnn_params[] = {}; - enum ggml_op ggmlop = GGML_OP_ADD; +// TODO: this function can be removed later because there are duplicated codes with ggml_qnn_mul_mat +// keep it for illustrate how to implement a specified GGMPL OP using QNN API + QNN RPC +static void ggml_qnn_add(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + bool graph_initialized = false; + qnn::qnn_instance *instance = nullptr; + std::string graph_name = "ggml_op_qnn_add"; + Qnn_GraphHandle_t graph_handle = nullptr; + Qnn_Param_t qnn_params[] = {}; + enum ggml_op ggmlop = GGML_OP_ADD; CHECK_PARAMS(ctx, src0, src1, dst); instance = ctx->instance; @@ -57,16 +56,14 @@ static void ggml_qnn_add(ggml_backend_qnn_context* ctx, const ggml_tensor* src0, perf.start(); std::string map_entry(ggml_op_name(ggmlop)); - if (instance->_qnn_graph_map.find(map_entry) != - instance->_qnn_graph_map.end()) { + if (instance->_qnn_graph_map.find(map_entry) != instance->_qnn_graph_map.end()) { graph_initialized = true; - auto& graph_item = instance->_qnn_graph_map[map_entry]; + auto &graph_item = instance->_qnn_graph_map[map_entry]; graph_handle = std::get<0>(graph_item); } if (!graph_initialized) { - graph_name = graph_name + "_" + std::to_string(ctx->threads) + - "_" + src0->name + "_" + src1->name; + graph_name = graph_name + "_" + std::to_string(ctx->threads) + "_" + src0->name + "_" + src1->name; QNN_LOG_INFO("graph name %s", graph_name.c_str()); if (ctx->device == QNN_BACKEND_NPU) { QnnHtpGraph_CustomConfig_t hvx_config; @@ -86,7 +83,7 @@ static void ggml_qnn_add(ggml_backend_qnn_context* ctx, const ggml_tensor* src0, QnnHtpGraph_CustomConfig_t opt_config; opt_config.optimizationOption.type = 
QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; - opt_config.optimizationOption.floatValue = 1; // 1 / 3 + opt_config.optimizationOption.floatValue = 1; // 1 / 3 QnnGraph_Config_t graph_opt_config; graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; graph_opt_config.customConfig = &opt_config; @@ -98,28 +95,22 @@ static void ggml_qnn_add(ggml_backend_qnn_context* ctx, const ggml_tensor* src0, graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; graph_vtcm_config.customConfig = &vtcm_config; - const QnnGraph_Config_t* p_graphconfig[] = { &graph_hvx_config, - &graph_dlbc_config, - &graph_vtcm_config, - &graph_opt_config, - NULL }; - error = qnn_raw_interface.graphCreate( - instance->get_qnn_context_handle(), graph_name.c_str(), p_graphconfig, - &graph_handle); - } - else { - error = qnn_raw_interface.graphCreate( - instance->get_qnn_context_handle(), graph_name.c_str(), nullptr, - &graph_handle); + const QnnGraph_Config_t *p_graphconfig[] = { &graph_hvx_config, &graph_dlbc_config, &graph_vtcm_config, + &graph_opt_config, NULL }; + error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), graph_name.c_str(), p_graphconfig, + &graph_handle); + } else { + error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), graph_name.c_str(), nullptr, + &graph_handle); } if (QNN_SUCCESS != error) { - QNN_LOG_INFO("can't create qnn graph handle with graph name %s, " + QNN_LOG_INFO( + "can't create qnn graph handle with graph name %s, " "error = %d\n", graph_name.c_str(), error); goto failure; - } - else { + } else { QNN_LOG_INFO("create qnn graph handle with graph name %s ok\n", graph_name.c_str()); } @@ -139,30 +130,20 @@ static void ggml_qnn_add(ggml_backend_qnn_context* ctx, const ggml_tensor* src0, Qnn_Tensor_t tensor_inputs[] = { *tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor() }; Qnn_Tensor_t tensor_outputs[] = { *tensor_output.get_qnn_tensor() }; - Qnn_OpConfig_t op_config = { - (Qnn_OpConfigVersion_t)1, - .v1 = {"ggml_op_add", - QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_ELEMENT_WISE_ADD, - 0, qnn_params, - 2, tensor_inputs, - 1,tensor_outputs} - }; + Qnn_OpConfig_t op_config = { (Qnn_OpConfigVersion_t)1, + .v1 = { "ggml_op_add", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_ELEMENT_WISE_ADD, 0, + qnn_params, 2, tensor_inputs, 1, tensor_outputs } }; error = qnn_raw_interface.graphAddNode(graph_handle, op_config); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); goto failure; } - error = qnn_raw_interface.graphFinalize(graph_handle, - nullptr, nullptr); + error = qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); goto failure; } - error = qnn_raw_interface.graphExecute(graph_handle, - tensor_inputs, 2, - tensor_outputs, 1, - nullptr, nullptr); + error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); if (ctx->device == QNN_BACKEND_NPU) { if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) { QNN_LOG_WARN("NPU crashed. SSR detected. 
Caused QNN graph execute error\n"); @@ -173,24 +154,18 @@ static void ggml_qnn_add(ggml_backend_qnn_context* ctx, const ggml_tensor* src0, goto failure; } - auto graph_item = std::make_tuple(graph_handle, - tensor_input0.get_qnn_tensor(), - tensor_input1.get_qnn_tensor(), - tensor_output.get_qnn_tensor()); + auto graph_item = std::make_tuple(graph_handle, tensor_input0.get_qnn_tensor(), tensor_input1.get_qnn_tensor(), + tensor_output.get_qnn_tensor()); instance->_qnn_graph_map[map_entry] = graph_item; - } - else { - auto& graph_item = instance->_qnn_graph_map[map_entry]; + } else { + auto &graph_item = instance->_qnn_graph_map[map_entry]; qnn::ggml_qnn_tensor_input tensor_input0(src0, std::get<1>(graph_item), ctx); qnn::ggml_qnn_tensor_input tensor_input1(src1, std::get<2>(graph_item), ctx); qnn::ggml_qnn_tensor_output tensor_output(dst, std::get<3>(graph_item), ctx); Qnn_Tensor_t tensor_inputs[] = { *tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor() }; Qnn_Tensor_t tensor_outputs[] = { *tensor_output.get_qnn_tensor() }; - error = qnn_raw_interface.graphExecute(graph_handle, - tensor_inputs, 2, - tensor_outputs, 1, - nullptr, nullptr); + error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); if (ctx->device == QNN_BACKEND_NPU) { if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) { QNN_LOG_WARN("NPU crashed. SSR detected. Caused QNN graph execute error\n"); @@ -204,21 +179,18 @@ static void ggml_qnn_add(ggml_backend_qnn_context* ctx, const ggml_tensor* src0, failure: if (QNN_SUCCESS != error) { - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 - " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src0->name, src0->type, ggml_type_name(src0->type), - src0->ne[0], src0->ne[1], src0->ne[2], src0->nb[0], - src0->nb[1], src0->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 - " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src1->name, src1->type, ggml_type_name(src1->type), - src1->ne[0], src1->ne[1], src1->ne[2], src1->nb[0], - src1->nb[1], src1->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 - " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - dst->name, dst->type, ggml_type_name(dst->type), - dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], - dst->nb[1], dst->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 + ", nb = (%5zi, %5zi, %5zi)\n", + src0->name, src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], + src0->nb[0], src0->nb[1], src0->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 + ", nb = (%5zi, %5zi, %5zi)\n", + src1->name, src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], + src1->nb[0], src1->nb[1], src1->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 + ", nb = (%5zi, %5zi, %5zi)\n", + dst->name, dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], + dst->nb[1], dst->nb[2]); } perf.info(); @@ -235,16 +207,15 @@ static void ggml_qnn_add(ggml_backend_qnn_context* ctx, const ggml_tensor* src0, * mul_mat_f16_f32: src0 is F16 and src1 is F32. * mul_mat_q_f32: src0 is quantized (Q4_0, Q4_1, ...), and src1 is F32. 
*/ -static void ggml_qnn_mul_mat(ggml_backend_qnn_context* ctx, - const ggml_tensor* src0, const ggml_tensor* src1, - ggml_tensor* dst) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - bool graph_initialized = false; - qnn::qnn_instance* instance = nullptr; - std::string graph_name = "ggml_op_qnn_mul_mat"; - Qnn_GraphHandle_t graph_handle = nullptr; - Qnn_Param_t qnn_params[] = {}; - enum ggml_op ggmlop = GGML_OP_MUL_MAT; +static void ggml_qnn_mul_mat(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + bool graph_initialized = false; + qnn::qnn_instance *instance = nullptr; + std::string graph_name = "ggml_op_qnn_mul_mat"; + Qnn_GraphHandle_t graph_handle = nullptr; + Qnn_Param_t qnn_params[] = {}; + enum ggml_op ggmlop = GGML_OP_MUL_MAT; CHECK_PARAMS(ctx, src0, src1, dst); instance = ctx->instance; @@ -254,21 +225,19 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context* ctx, perf.start(); std::string map_entry = std::string(ggml_op_name(ggmlop)); - if (instance->_qnn_graph_map.find(map_entry) != - instance->_qnn_graph_map.end()) { + if (instance->_qnn_graph_map.find(map_entry) != instance->_qnn_graph_map.end()) { graph_initialized = true; - auto& graph_item = instance->_qnn_graph_map[map_entry]; + auto &graph_item = instance->_qnn_graph_map[map_entry]; graph_handle = std::get<0>(graph_item); } - //TODO: for scenarios of quantized data in src0 - // pass-1: dequantize src0 to FP32 - // pass-2: dq-src0 * src1 - // the performance gains is worth although there is performance loss in pass-1 + // TODO: for scenarios of quantized data in src0 + // pass-1: dequantize src0 to FP32 + // pass-2: dq-src0 * src1 + // the performance gains is worth although there is performance loss in pass-1 if (!graph_initialized) { - graph_name = graph_name + "_" + std::to_string(ctx->threads) + - "_" + src0->name + "_" + src1->name; + graph_name = graph_name + "_" + std::to_string(ctx->threads) + "_" + src0->name + "_" + src1->name; QNN_LOG_INFO("graph name %s", graph_name.c_str()); if (ctx->device == QNN_BACKEND_NPU) { QnnHtpGraph_CustomConfig_t hvx_config; @@ -288,7 +257,7 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context* ctx, QnnHtpGraph_CustomConfig_t opt_config; opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; - opt_config.optimizationOption.floatValue = 1; //1 / 3 + opt_config.optimizationOption.floatValue = 1; // 1 / 3 QnnGraph_Config_t graph_opt_config; graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; graph_opt_config.customConfig = &opt_config; @@ -300,22 +269,17 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context* ctx, graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; graph_vtcm_config.customConfig = &vtcm_config; - const QnnGraph_Config_t* p_graphconfig[] = { &graph_hvx_config, - &graph_dlbc_config, - &graph_vtcm_config, - &graph_opt_config, - NULL }; - error = qnn_raw_interface.graphCreate( - instance->get_qnn_context_handle(), graph_name.c_str(), p_graphconfig, - &graph_handle); - } - else { - error = qnn_raw_interface.graphCreate( - instance->get_qnn_context_handle(), graph_name.c_str(), nullptr, - &graph_handle); + const QnnGraph_Config_t *p_graphconfig[] = { &graph_hvx_config, &graph_dlbc_config, &graph_vtcm_config, + &graph_opt_config, NULL }; + error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), graph_name.c_str(), p_graphconfig, + &graph_handle); + } else { + error = 
qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), graph_name.c_str(), nullptr, + &graph_handle); } if (QNN_SUCCESS != error) { - QNN_LOG_INFO("can't create qnn graph handle with graph name %s, " + QNN_LOG_INFO( + "can't create qnn graph handle with graph name %s, " "error = %d\n", graph_name.c_str(), error); goto failure; @@ -334,32 +298,22 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context* ctx, goto failure; } - Qnn_Tensor_t tensor_inputs[] = { *tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor() }; - Qnn_Tensor_t tensor_outputs[] = { *tensor_output.get_qnn_tensor() }; - Qnn_OpConfig_t op_config = { - (Qnn_OpConfigVersion_t)1, - .v1 = {"ggml_op_mul_mat", - QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_MAT_MUL, - 0, qnn_params, - 2, tensor_inputs, - 1, tensor_outputs} - }; + Qnn_Tensor_t tensor_inputs[] = { *tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor() }; + Qnn_Tensor_t tensor_outputs[] = { *tensor_output.get_qnn_tensor() }; + Qnn_OpConfig_t op_config = { (Qnn_OpConfigVersion_t)1, + .v1 = { "ggml_op_mul_mat", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL, 0, + qnn_params, 2, tensor_inputs, 1, tensor_outputs } }; error = qnn_raw_interface.graphAddNode(graph_handle, op_config); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); goto failure; } - error = qnn_raw_interface.graphFinalize(graph_handle, - nullptr, nullptr); + error = qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); goto failure; } - error = qnn_raw_interface.graphExecute(graph_handle, - tensor_inputs, 2, - tensor_outputs, 1, - nullptr, nullptr); + error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); if (ctx->device == QNN_BACKEND_NPU) { if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) { QNN_LOG_WARN("NPU crashed. SSR detected. Caused QNN graph execute error\n"); @@ -370,24 +324,18 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context* ctx, goto failure; } - auto graph_item = std::make_tuple(graph_handle, - tensor_input0.get_qnn_tensor(), - tensor_input1.get_qnn_tensor(), - tensor_output.get_qnn_tensor()); + auto graph_item = std::make_tuple(graph_handle, tensor_input0.get_qnn_tensor(), tensor_input1.get_qnn_tensor(), + tensor_output.get_qnn_tensor()); instance->_qnn_graph_map[map_entry] = graph_item; - } - else { - auto& graph_item = instance->_qnn_graph_map[map_entry]; + } else { + auto &graph_item = instance->_qnn_graph_map[map_entry]; qnn::ggml_qnn_tensor_input tensor_input0(src0, std::get<1>(graph_item), ctx); qnn::ggml_qnn_tensor_input tensor_input1(src1, std::get<2>(graph_item), ctx); qnn::ggml_qnn_tensor_output tensor_output(dst, std::get<3>(graph_item), ctx); Qnn_Tensor_t tensor_inputs[] = { *tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor() }; Qnn_Tensor_t tensor_outputs[] = { *tensor_output.get_qnn_tensor() }; - error = qnn_raw_interface.graphExecute(graph_handle, - tensor_inputs, 2, - tensor_outputs, 1, - nullptr, nullptr); + error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); if (ctx->device == QNN_BACKEND_NPU) { if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) { QNN_LOG_WARN("NPU crashed. SSR detected. 
Caused QNN graph execute error\n"); @@ -401,181 +349,127 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context* ctx, failure: if (QNN_SUCCESS != error) { - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 - " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src0->name, src0->type, ggml_type_name(src0->type), - src0->ne[0], src0->ne[1], src0->ne[2], src0->nb[0], - src0->nb[1], src0->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 - " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src1->name, src1->type, ggml_type_name(src1->type), - src1->ne[0], src1->ne[1], src1->ne[2], src1->nb[0], - src1->nb[1], src1->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 - " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - dst->name, dst->type, ggml_type_name(dst->type), dst->ne[0], - dst->ne[1], dst->ne[2], dst->nb[0], dst->nb[1], dst->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 + ", nb = (%5zi, %5zi, %5zi)\n", + src0->name, src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], + src0->nb[0], src0->nb[1], src0->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 + ", nb = (%5zi, %5zi, %5zi)\n", + src1->name, src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], + src1->nb[0], src1->nb[1], src1->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 + ", nb = (%5zi, %5zi, %5zi)\n", + dst->name, dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], + dst->nb[1], dst->nb[2]); } perf.info(); } -static void ggml_qnn_repeat(ggml_backend_qnn_context* ctx, - const ggml_tensor* src0, const ggml_tensor* src1, - ggml_tensor* dst) { -} +static void ggml_qnn_repeat(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) {} -static void ggml_qnn_get_rows(ggml_backend_qnn_context* ctx, - const ggml_tensor* src0, const ggml_tensor* src1, - ggml_tensor* dst) { -} +static void ggml_qnn_get_rows(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) {} -static void ggml_qnn_acc(ggml_backend_qnn_context* ctx, const ggml_tensor* src0, - const ggml_tensor* src1, ggml_tensor* dst) { -} +static void ggml_qnn_acc(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) {} -static void ggml_qnn_div(ggml_backend_qnn_context* ctx, const ggml_tensor* src0, - const ggml_tensor* src1, ggml_tensor* dst) { -} +static void ggml_qnn_div(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) {} -static void ggml_qnn_gelu(ggml_backend_qnn_context* ctx, - const ggml_tensor* src0, const ggml_tensor* src1, - ggml_tensor* dst) { -} +static void ggml_qnn_gelu(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) {} -static void ggml_qnn_silu(ggml_backend_qnn_context* ctx, - const ggml_tensor* src0, const ggml_tensor* src1, - ggml_tensor* dst) { -} +static void ggml_qnn_silu(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) {} -static void ggml_qnn_gelu_quick(ggml_backend_qnn_context* ctx, - const ggml_tensor* src0, - const ggml_tensor* src1, ggml_tensor* dst) { -} +static void ggml_qnn_gelu_quick(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) 
{} -static void ggml_qnn_tanh(ggml_backend_qnn_context* ctx, - const ggml_tensor* src0, const ggml_tensor* src1, - ggml_tensor* dst) { -} +static void ggml_qnn_tanh(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) {} -static void ggml_qnn_relu(ggml_backend_qnn_context* ctx, - const ggml_tensor* src0, const ggml_tensor* src1, - ggml_tensor* dst) { -} +static void ggml_qnn_relu(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) {} -static void ggml_qnn_hardsigmoid(ggml_backend_qnn_context* ctx, - const ggml_tensor* src0, - const ggml_tensor* src1, ggml_tensor* dst) { -} +static void ggml_qnn_hardsigmoid(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) {} -static void ggml_qnn_hardswish(ggml_backend_qnn_context* ctx, - const ggml_tensor* src0, const ggml_tensor* src1, - ggml_tensor* dst) { -} +static void ggml_qnn_hardswish(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) {} -static void ggml_qnn_leaky_relu(ggml_backend_qnn_context* ctx, - const ggml_tensor* src0, - const ggml_tensor* src1, ggml_tensor* dst) { -} +static void ggml_qnn_leaky_relu(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) {} -static void ggml_qnn_sqr(ggml_backend_qnn_context* ctx, const ggml_tensor* src0, - const ggml_tensor* src1, ggml_tensor* dst) { -} +static void ggml_qnn_sqr(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) {} -static void ggml_qnn_norm(ggml_backend_qnn_context* ctx, - const ggml_tensor* src0, const ggml_tensor* src1, - ggml_tensor* dst) { -} +static void ggml_qnn_norm(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) {} -static void ggml_qnn_group_norm(ggml_backend_qnn_context* ctx, - const ggml_tensor* src0, - const ggml_tensor* src1, ggml_tensor* dst) { -} +static void ggml_qnn_group_norm(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) {} -static void ggml_qnn_concat(ggml_backend_qnn_context* ctx, - const ggml_tensor* src0, const ggml_tensor* src1, - ggml_tensor* dst) { -} +static void ggml_qnn_concat(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) {} -static void ggml_qnn_upscale(ggml_backend_qnn_context* ctx, - const ggml_tensor* src0, const ggml_tensor* src1, - ggml_tensor* dst) { -} +static void ggml_qnn_upscale(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) {} -static void ggml_qnn_pad(ggml_backend_qnn_context* ctx, const ggml_tensor* src0, - const ggml_tensor* src1, ggml_tensor* dst) { -} +static void ggml_qnn_pad(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) {} -static void ggml_qnn_rms_norm(ggml_backend_qnn_context* ctx, - const ggml_tensor* src0, const ggml_tensor* src1, - ggml_tensor* dst) { -} +static void ggml_qnn_rms_norm(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) {} -static void ggml_qnn_cpy(ggml_backend_qnn_context* ctx, const ggml_tensor* src0, - const ggml_tensor* src1, ggml_tensor* dst) { -} +static void ggml_qnn_cpy(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) {} -static void 
ggml_qnn_dup(ggml_backend_qnn_context* ctx, const ggml_tensor* src0, - const ggml_tensor* src1, ggml_tensor* dst) { +static void ggml_qnn_dup(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) { ggml_qnn_cpy(ctx, src0, dst, nullptr); (void)src1; } -static void ggml_qnn_mul_mat_id(ggml_backend_qnn_context* ctx, - const ggml_tensor* src0, - const ggml_tensor* src1, ggml_tensor* dst) { -} +static void ggml_qnn_mul_mat_id(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) {} -static void ggml_qnn_scale(ggml_backend_qnn_context* ctx, - const ggml_tensor* src0, const ggml_tensor* src1, - ggml_tensor* dst) { -} +static void ggml_qnn_scale(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) {} -static void ggml_qnn_clamp(ggml_backend_qnn_context* ctx, - const ggml_tensor* src0, const ggml_tensor* src1, - ggml_tensor* dst) { -} +static void ggml_qnn_clamp(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) {} -static void ggml_qnn_diag_mask_inf(ggml_backend_qnn_context* ctx, - const ggml_tensor* src0, - const ggml_tensor* src1, ggml_tensor* dst) { -} +static void ggml_qnn_diag_mask_inf(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) {} -static void ggml_qnn_soft_max(ggml_backend_qnn_context* ctx, - const ggml_tensor* src0, const ggml_tensor* src1, - ggml_tensor* dst) { -} +static void ggml_qnn_soft_max(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) {} -static void ggml_qnn_rope(ggml_backend_qnn_context* ctx, - const ggml_tensor* src0, const ggml_tensor* src1, - ggml_tensor* dst) { +static void ggml_qnn_rope(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) { GGML_ASSERT(ggml_is_contiguous(src0)); } -static void ggml_qnn_pool2d(ggml_backend_qnn_context* ctx, - const ggml_tensor* src0, const ggml_tensor* src1, - ggml_tensor* dst) { -} +static void ggml_qnn_pool2d(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) {} -static void ggml_qnn_im2col(ggml_backend_qnn_context* ctx, - const ggml_tensor* src0, const ggml_tensor* src1, - ggml_tensor* dst) { -} +static void ggml_qnn_im2col(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) {} -static void ggml_qnn_sum_rows(ggml_backend_qnn_context* ctx, - const ggml_tensor* src0, const ggml_tensor* src1, - ggml_tensor* dst) { +static void ggml_qnn_sum_rows(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) { GGML_ASSERT(ggml_is_contiguous(src0)); } -static void ggml_qnn_argsort(ggml_backend_qnn_context* ctx, - const ggml_tensor* src0, const ggml_tensor* src1, - ggml_tensor* dst) { +static void ggml_qnn_argsort(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) { GGML_ASSERT(ggml_is_contiguous(src0)); } -static void ggml_qnn_nop(ggml_backend_qnn_context* ctx, const ggml_tensor* src0, - const ggml_tensor* src1, ggml_tensor* dst) { +static void ggml_qnn_nop(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) { (void)src0; (void)src1; (void)dst; @@ -583,33 +477,33 @@ static void ggml_qnn_nop(ggml_backend_qnn_context* ctx, const ggml_tensor* src0, qnn::ggml_qnn_op_array_t 
qnn::ggml_qnn_op_array() { static constexpr const qnn::ggml_qnn_op_t kQnnOpsTable[GGML_OP_COUNT] = { - nullptr, // GGML_OP_NONE - nullptr, // GGML_OP_DUP + nullptr, // GGML_OP_NONE + nullptr, // GGML_OP_DUP ggml_qnn_add, // GGML_OP_ADD - nullptr, // GGML_OP_ADD1 - nullptr, // GGML_OP_ACC - nullptr, // GGML_OP_SUB - nullptr, // GGML_OP_MUL - nullptr, // GGML_OP_DIV - nullptr, // GGML_OP_SQR - nullptr, // GGML_OP_SQRT - nullptr, // GGML_OP_LOG - nullptr, // GGML_OP_SUM - nullptr, // GGML_OP_SUM_ROWS - nullptr, // GGML_OP_MEAN - nullptr, // GGML_OP_ARGMAX - nullptr, // GGML_OP_REPEAT - nullptr, // GGML_OP_REPEAT_BACK - nullptr, // GGML_OP_CONCAT - nullptr, // GGML_OP_SILU_BACK - nullptr, // GGML_OP_NORM - nullptr, // GGML_OP_RMS_NORM - nullptr, // GGML_OP_RMS_NORM_BACK - nullptr, // GGML_OP_GROUP_NORM + nullptr, // GGML_OP_ADD1 + nullptr, // GGML_OP_ACC + nullptr, // GGML_OP_SUB + nullptr, // GGML_OP_MUL + nullptr, // GGML_OP_DIV + nullptr, // GGML_OP_SQR + nullptr, // GGML_OP_SQRT + nullptr, // GGML_OP_LOG + nullptr, // GGML_OP_SUM + nullptr, // GGML_OP_SUM_ROWS + nullptr, // GGML_OP_MEAN + nullptr, // GGML_OP_ARGMAX + nullptr, // GGML_OP_REPEAT + nullptr, // GGML_OP_REPEAT_BACK + nullptr, // GGML_OP_CONCAT + nullptr, // GGML_OP_SILU_BACK + nullptr, // GGML_OP_NORM + nullptr, // GGML_OP_RMS_NORM + nullptr, // GGML_OP_RMS_NORM_BACK + nullptr, // GGML_OP_GROUP_NORM ggml_qnn_mul_mat, // GGML_OP_MUL_MAT - nullptr, // GGML_OP_MUL_MAT_ID - nullptr, // GGML_OP_OUT_PROD + nullptr, // GGML_OP_MUL_MAT_ID + nullptr, // GGML_OP_OUT_PROD nullptr, // GGML_OP_SCALE nullptr, // GGML_OP_SET diff --git a/ggml/src/ggml-qnn/backend-ops.hpp b/ggml/src/ggml-qnn/backend-ops.hpp index c3dd5de302289..01c23ecff9b16 100644 --- a/ggml/src/ggml-qnn/backend-ops.hpp +++ b/ggml/src/ggml-qnn/backend-ops.hpp @@ -1,17 +1,16 @@ #pragma once #include "ggml.h" + #include "backend.hpp" namespace qnn { - typedef void (*ggml_qnn_op_t)(ggml_backend_qnn_context* ctx, - const ggml_tensor* src0, - const ggml_tensor* src1, - ggml_tensor* dst); +typedef void (*ggml_qnn_op_t)(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst); - typedef const ggml_qnn_op_t(&ggml_qnn_op_array_t)[GGML_OP_COUNT]; +typedef const ggml_qnn_op_t (&ggml_qnn_op_array_t)[GGML_OP_COUNT]; - ggml_qnn_op_array_t ggml_qnn_op_array(); +ggml_qnn_op_array_t ggml_qnn_op_array(); -} +} // namespace qnn diff --git a/ggml/src/ggml-qnn/backend.hpp b/ggml/src/ggml-qnn/backend.hpp index dc40090ee6114..74bce38b7111c 100644 --- a/ggml/src/ggml-qnn/backend.hpp +++ b/ggml/src/ggml-qnn/backend.hpp @@ -2,6 +2,7 @@ #pragma once #include "ggml.h" + #include "ggml-backend.h" #include "qnn.hpp" @@ -11,8 +12,8 @@ struct ggml_backend_qnn_context { int threads; char name[GGML_MAX_NAME]; char lib[GGML_MAX_NAME]; - qnn::qnn_instance* instance; - ggml_backend* backend; + qnn::qnn_instance *instance; + ggml_backend *backend; QNN_INTERFACE_VER_TYPE raw_interface; QNN_SYSTEM_INTERFACE_VER_TYPE raw_system_interface; qnn::qcom_socinfo socinfo; diff --git a/ggml/src/ggml-qnn/logger.cpp b/ggml/src/ggml-qnn/logger.cpp index 43856c9f48a9f..8b29979224866 100644 --- a/ggml/src/ggml-qnn/logger.cpp +++ b/ggml/src/ggml-qnn/logger.cpp @@ -2,30 +2,26 @@ #include "logger.hpp" #include + #include #if (defined __ANDROID__) || (defined ANDROID) #include #endif -#define QNN_LOGBUF_LEN 4096 +#define QNN_LOGBUF_LEN 4096 -void qnn::internal_log(ggml_log_level level, const char* file, - const char* func, int line, - const char* format, ...) 
{ +void qnn::internal_log(ggml_log_level level, const char *file, const char *func, int line, const char *format, ...) { static std::mutex qnn_internal_log_mutex; - static char s_qnn_internal_log_buf[QNN_LOGBUF_LEN]; + static char s_qnn_internal_log_buf[QNN_LOGBUF_LEN]; { std::lock_guard lock(qnn_internal_log_mutex); - va_list args; + va_list args; va_start(args, format); - int len_prefix = - snprintf(s_qnn_internal_log_buf, QNN_LOGBUF_LEN, - "[%s, %d]: ", func, line); - int len = vsnprintf(s_qnn_internal_log_buf + len_prefix, - QNN_LOGBUF_LEN - len_prefix, format, args); + int len_prefix = snprintf(s_qnn_internal_log_buf, QNN_LOGBUF_LEN, "[%s, %d]: ", func, line); + int len = vsnprintf(s_qnn_internal_log_buf + len_prefix, QNN_LOGBUF_LEN - len_prefix, format, args); if (len < (QNN_LOGBUF_LEN - len_prefix)) { #if (defined __ANDROID__) || (defined ANDROID) // for Android APK @@ -38,32 +34,31 @@ void qnn::internal_log(ggml_log_level level, const char* file, } } -void qnn::sdk_logcallback(const char* fmt, QnnLog_Level_t level, - uint64_t timestamp, va_list argp) { +void qnn::sdk_logcallback(const char *fmt, QnnLog_Level_t level, uint64_t timestamp, va_list argp) { #if ENABLE_QNNSDK_LOG - static std::mutex log_mutex; + static std::mutex log_mutex; static unsigned char s_ggml_qnn_logbuf[QNN_LOGBUF_LEN]; - const char* log_level_desc = ""; + const char *log_level_desc = ""; switch (level) { - case QNN_LOG_LEVEL_ERROR: - log_level_desc = "ERROR"; - break; - case QNN_LOG_LEVEL_WARN: - log_level_desc = "WARNING"; - break; - case QNN_LOG_LEVEL_INFO: - log_level_desc = "INFO"; - break; - case QNN_LOG_LEVEL_DEBUG: - log_level_desc = "DEBUG"; - break; - case QNN_LOG_LEVEL_VERBOSE: - log_level_desc = "VERBOSE"; - break; - case QNN_LOG_LEVEL_MAX: - log_level_desc = "UNKNOWN"; - break; + case QNN_LOG_LEVEL_ERROR: + log_level_desc = "ERROR"; + break; + case QNN_LOG_LEVEL_WARN: + log_level_desc = "WARNING"; + break; + case QNN_LOG_LEVEL_INFO: + log_level_desc = "INFO"; + break; + case QNN_LOG_LEVEL_DEBUG: + log_level_desc = "DEBUG"; + break; + case QNN_LOG_LEVEL_VERBOSE: + log_level_desc = "VERBOSE"; + break; + case QNN_LOG_LEVEL_MAX: + log_level_desc = "UNKNOWN"; + break; } double ms = (double)timestamp / 1000000.0; @@ -71,7 +66,7 @@ void qnn::sdk_logcallback(const char* fmt, QnnLog_Level_t level, std::lock_guard lock(log_mutex); memset(s_ggml_qnn_logbuf, 0, QNN_LOGBUF_LEN); - vsnprintf(reinterpret_cast(s_ggml_qnn_logbuf), QNN_LOGBUF_LEN, fmt, argp); + vsnprintf(reinterpret_cast(s_ggml_qnn_logbuf), QNN_LOGBUF_LEN, fmt, argp); QNN_LOG_INFO("%8.1fms [%-7s] %s\n", ms, log_level_desc, s_ggml_qnn_logbuf); } #endif diff --git a/ggml/src/ggml-qnn/logger.hpp b/ggml/src/ggml-qnn/logger.hpp index 003436da10fae..f81a1814e9756 100644 --- a/ggml/src/ggml-qnn/logger.hpp +++ b/ggml/src/ggml-qnn/logger.hpp @@ -2,48 +2,40 @@ #include -#include "QnnTypes.h" +#include "ggml.h" + #include "QnnCommon.h" #include "QnnInterface.h" +#include "QnnTypes.h" #include "System/QnnSystemInterface.h" -#include "ggml.h" - namespace qnn { - void internal_log(ggml_log_level level, const char* file, - const char* func, int line, - const char* format, ...); - +void internal_log(ggml_log_level level, const char *file, const char *func, int line, const char *format, ...); - void sdk_logcallback(const char* fmt, QnnLog_Level_t level, - uint64_t timestamp, va_list argp); -} +void sdk_logcallback(const char *fmt, QnnLog_Level_t level, uint64_t timestamp, va_list argp); +} // namespace qnn // 
================================================================================================= // // QNN backend internal log function // // ================================================================================================= -#define QNN_LOG_ERROR(...) \ - qnn::internal_log(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#define QNN_LOG_ERROR(...) qnn::internal_log(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) -#define QNN_LOG_WARN(...) \ - qnn::internal_log(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#define QNN_LOG_WARN(...) qnn::internal_log(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) -#define QNN_LOG_INFO(...) \ - qnn::internal_log(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#define QNN_LOG_INFO(...) qnn::internal_log(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) #ifdef NDEBUG -#define ENABLE_QNNBACKEND_DEBUG 0 // for troubleshooting QNN backend -#define ENABLE_QNNSDK_LOG 0 // enable/disable QNN SDK's internal log +#define ENABLE_QNNBACKEND_DEBUG 0 // for troubleshooting QNN backend +#define ENABLE_QNNSDK_LOG 0 // enable/disable QNN SDK's internal log #else -#define ENABLE_QNNBACKEND_DEBUG 1 // for troubleshooting QNN backend -#define ENABLE_QNNSDK_LOG 1 // enable/disable QNN SDK's internal log +#define ENABLE_QNNBACKEND_DEBUG 1 // for troubleshooting QNN backend +#define ENABLE_QNNSDK_LOG 1 // enable/disable QNN SDK's internal log #endif #if ENABLE_QNNBACKEND_DEBUG -#define QNN_LOG_DEBUG(...) \ - qnn::internal_log(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#define QNN_LOG_DEBUG(...) qnn::internal_log(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) #else #define QNN_LOG_DEBUG(...) 
#endif diff --git a/ggml/src/ggml-qnn/qnn-types.hpp b/ggml/src/ggml-qnn/qnn-types.hpp index db1d592f08a20..7c245651032c0 100644 --- a/ggml/src/ggml-qnn/qnn-types.hpp +++ b/ggml/src/ggml-qnn/qnn-types.hpp @@ -1,59 +1,55 @@ #pragma once -#include "QnnTypes.h" #include "QnnCommon.h" #include "QnnInterface.h" +#include "QnnTypes.h" #include "Saver/QnnSaver.h" #include "System/QnnSystemInterface.h" namespace qnn { - // ================================================================================================= - // - // helper data type / data structure / macros / functions of - // Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK - // ref:https://github.com/pytorch/executorch/tree/main/backends/qualcomm - // ================================================================================================= - enum sdk_profile_level { - profile_off = 0, - profile_basic = 1, - profile_detail = 2 - }; - - enum qcom_htp_arch { - NONE = 0, - V68 = 68, - V69 = 69, - V73 = 73, - V75 = 75, - }; - - enum qcom_chipset { - UNKNOWN_SM = 0, - SM8450 = 36, // v69 - SM8475 = 42, // v69 - SM8550 = 43, // v73 - SM8650 = 57, // v75 - }; - - struct qcom_socinfo { - uint32_t soc_model; - size_t htp_arch; - size_t vtcm_size_in_mb; - }; - - using pfn_rpc_mem_init = void (*)(void); - using pfn_rpc_mem_deinit = void (*)(void); - using pfn_rpc_mem_alloc = void* (*) (int, uint32_t, int); - using pfn_rpc_mem_free = void (*)(void*); - using pfn_rpc_mem_to_fd = int (*)(void*); - - using pfn_qnnsaver_initialize = decltype(QnnSaver_initialize); - using pfn_qnninterface_getproviders = decltype(QnnInterface_getProviders); - using pfn_qnnsysteminterface_getproviders = decltype(QnnSystemInterface_getProviders); -} - -#define QNN_VER_PTR(x) (&((x).v1)) // TODO: remove this macro after we have a separate header for QNN - -#define RPCMEM_DEFAULT_FLAGS 1 -#define RPCMEM_HEAP_ID_SYSTEM 25 +// ================================================================================================= +// +// helper data type / data structure / macros / functions of +// Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK +// ref:https://github.com/pytorch/executorch/tree/main/backends/qualcomm +// ================================================================================================= +enum sdk_profile_level { profile_off = 0, profile_basic = 1, profile_detail = 2 }; + +enum qcom_htp_arch { + NONE = 0, + V68 = 68, + V69 = 69, + V73 = 73, + V75 = 75, +}; + +enum qcom_chipset { + UNKNOWN_SM = 0, + SM8450 = 36, // v69 + SM8475 = 42, // v69 + SM8550 = 43, // v73 + SM8650 = 57, // v75 +}; + +struct qcom_socinfo { + uint32_t soc_model; + size_t htp_arch; + size_t vtcm_size_in_mb; +}; + +using pfn_rpc_mem_init = void (*)(void); +using pfn_rpc_mem_deinit = void (*)(void); +using pfn_rpc_mem_alloc = void *(*)(int, uint32_t, int); +using pfn_rpc_mem_free = void (*)(void *); +using pfn_rpc_mem_to_fd = int (*)(void *); + +using pfn_qnnsaver_initialize = decltype(QnnSaver_initialize); +using pfn_qnninterface_getproviders = decltype(QnnInterface_getProviders); +using pfn_qnnsysteminterface_getproviders = decltype(QnnSystemInterface_getProviders); +} // namespace qnn + +#define QNN_VER_PTR(x) (&((x).v1)) // TODO: remove this macro after we have a separate header for QNN + +#define RPCMEM_DEFAULT_FLAGS 1 +#define RPCMEM_HEAP_ID_SYSTEM 25 diff --git a/ggml/src/ggml-qnn/qnn.hpp b/ggml/src/ggml-qnn/qnn.hpp index 6caefb75644f7..bccc3a4ba32ac 100644 --- a/ggml/src/ggml-qnn/qnn.hpp +++ 
b/ggml/src/ggml-qnn/qnn.hpp @@ -1,1143 +1,961 @@ #pragma once #include + +#include #include #include #include -#include // header file of Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK // https://qpm.qualcomm.com/#/main/tools/details/qualcomm_ai_engine_direct -#include "QnnTypes.h" -#include "QnnCommon.h" -#include "QnnInterface.h" -#include "QnnContext.h" -#include "QnnBackend.h" -#include "QnnGraph.h" -#include "QnnProperty.h" -#include "QnnTensor.h" -#include "System/QnnSystemInterface.h" -#include "HTP/QnnHtpDevice.h" -#include "HTP/QnnHtpGraph.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include "qnn-types.hpp" #include "utils.hpp" namespace qnn { - // ================================================================================================= - // - // wrapper class of Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK - // ref:https://github.com/pytorch/executorch/tree/main/backends/qualcomm - // ================================================================================================= - class qnn_interface { - -#define DEFINE_SHIM_FUNCTION_INTERFACE(F, pointer_name) \ - template inline auto qnn_##F(Args... args) const { \ - return (_qnn_interface->QNN_INTERFACE_VER_NAME.pointer_name)( \ - std::forward(args)...); \ +// ================================================================================================= +// +// wrapper class of Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK +// ref:https://github.com/pytorch/executorch/tree/main/backends/qualcomm +// ================================================================================================= +class qnn_interface { + +#define DEFINE_SHIM_FUNCTION_INTERFACE(F, pointer_name) \ + template \ + inline auto qnn_##F(Args... args) const { \ + return (_qnn_interface->QNN_INTERFACE_VER_NAME.pointer_name)(std::forward(args)...); \ } -#define DEFINE_SHIM_FUNCTION_SYS_INTERFACE(F, pointer_name) \ - template inline auto qnn_##F(Args... args) const { \ - return ( \ - _qnn_sys_interface->QNN_SYSTEM_INTERFACE_VER_NAME.pointer_name)( \ - std::forward(args)...); \ +#define DEFINE_SHIM_FUNCTION_SYS_INTERFACE(F, pointer_name) \ + template \ + inline auto qnn_##F(Args... 
args) const { \ + return (_qnn_sys_interface->QNN_SYSTEM_INTERFACE_VER_NAME.pointer_name)(std::forward(args)...); \ } - friend class qnn_instance; - - public: - qnn_interface() = default; + friend class qnn_instance; - // QnnBackend - DEFINE_SHIM_FUNCTION_INTERFACE(backend_create, backendCreate); +public: + qnn_interface() = default; - DEFINE_SHIM_FUNCTION_INTERFACE(backend_free, backendFree); + // QnnBackend + DEFINE_SHIM_FUNCTION_INTERFACE(backend_create, backendCreate); - DEFINE_SHIM_FUNCTION_INTERFACE(backend_register_op_package, - backendRegisterOpPackage); + DEFINE_SHIM_FUNCTION_INTERFACE(backend_free, backendFree); - DEFINE_SHIM_FUNCTION_INTERFACE(backend_validate_op_config, - backendValidateOpConfig); + DEFINE_SHIM_FUNCTION_INTERFACE(backend_register_op_package, backendRegisterOpPackage); - DEFINE_SHIM_FUNCTION_INTERFACE(backend_get_api_version, - backendGetApiVersion); + DEFINE_SHIM_FUNCTION_INTERFACE(backend_validate_op_config, backendValidateOpConfig); - // QnnDevice - DEFINE_SHIM_FUNCTION_INTERFACE(device_create, deviceCreate); + DEFINE_SHIM_FUNCTION_INTERFACE(backend_get_api_version, backendGetApiVersion); - DEFINE_SHIM_FUNCTION_INTERFACE(device_free, deviceFree); + // QnnDevice + DEFINE_SHIM_FUNCTION_INTERFACE(device_create, deviceCreate); - DEFINE_SHIM_FUNCTION_INTERFACE(device_get_infrastructure, - deviceGetInfrastructure); + DEFINE_SHIM_FUNCTION_INTERFACE(device_free, deviceFree); - DEFINE_SHIM_FUNCTION_INTERFACE(device_get_platform_info, - deviceGetPlatformInfo); + DEFINE_SHIM_FUNCTION_INTERFACE(device_get_infrastructure, deviceGetInfrastructure); - DEFINE_SHIM_FUNCTION_INTERFACE(device_get_info, deviceGetInfo); + DEFINE_SHIM_FUNCTION_INTERFACE(device_get_platform_info, deviceGetPlatformInfo); - // QnnContext - DEFINE_SHIM_FUNCTION_INTERFACE(context_create, contextCreate); + DEFINE_SHIM_FUNCTION_INTERFACE(device_get_info, deviceGetInfo); - DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary_size, - contextGetBinarySize); + // QnnContext + DEFINE_SHIM_FUNCTION_INTERFACE(context_create, contextCreate); - DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary, contextGetBinary); + DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary_size, contextGetBinarySize); - DEFINE_SHIM_FUNCTION_INTERFACE(context_create_from_binary, - contextCreateFromBinary); + DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary, contextGetBinary); - DEFINE_SHIM_FUNCTION_INTERFACE(context_free, contextFree); + DEFINE_SHIM_FUNCTION_INTERFACE(context_create_from_binary, contextCreateFromBinary); - // QnnGraph - DEFINE_SHIM_FUNCTION_INTERFACE(graph_create, graphCreate); + DEFINE_SHIM_FUNCTION_INTERFACE(context_free, contextFree); - DEFINE_SHIM_FUNCTION_INTERFACE(graph_add_node, graphAddNode); + // QnnGraph + DEFINE_SHIM_FUNCTION_INTERFACE(graph_create, graphCreate); - DEFINE_SHIM_FUNCTION_INTERFACE(graph_finalize, graphFinalize); + DEFINE_SHIM_FUNCTION_INTERFACE(graph_add_node, graphAddNode); - DEFINE_SHIM_FUNCTION_INTERFACE(graph_execute, graphExecute); + DEFINE_SHIM_FUNCTION_INTERFACE(graph_finalize, graphFinalize); - DEFINE_SHIM_FUNCTION_INTERFACE(graph_retrieve, graphRetrieve); + DEFINE_SHIM_FUNCTION_INTERFACE(graph_execute, graphExecute); - // QnnLog - DEFINE_SHIM_FUNCTION_INTERFACE(log_create, logCreate); + DEFINE_SHIM_FUNCTION_INTERFACE(graph_retrieve, graphRetrieve); - DEFINE_SHIM_FUNCTION_INTERFACE(log_free, logFree); + // QnnLog + DEFINE_SHIM_FUNCTION_INTERFACE(log_create, logCreate); - DEFINE_SHIM_FUNCTION_INTERFACE(log_set_log_level, logSetLogLevel); + DEFINE_SHIM_FUNCTION_INTERFACE(log_free, logFree); - 
// QnnProfile - DEFINE_SHIM_FUNCTION_INTERFACE(profile_create, profileCreate); + DEFINE_SHIM_FUNCTION_INTERFACE(log_set_log_level, logSetLogLevel); - DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_events, profileGetEvents); + // QnnProfile + DEFINE_SHIM_FUNCTION_INTERFACE(profile_create, profileCreate); - DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_sub_events, profileGetSubEvents); + DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_events, profileGetEvents); - DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_event_data, profileGetEventData); + DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_sub_events, profileGetSubEvents); - DEFINE_SHIM_FUNCTION_INTERFACE(profile_free, profileFree); + DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_event_data, profileGetEventData); - // QnnMem - DEFINE_SHIM_FUNCTION_INTERFACE(mem_register, memRegister); + DEFINE_SHIM_FUNCTION_INTERFACE(profile_free, profileFree); - DEFINE_SHIM_FUNCTION_INTERFACE(mem_de_register, memDeRegister); + // QnnMem + DEFINE_SHIM_FUNCTION_INTERFACE(mem_register, memRegister); - // QnnProperty - DEFINE_SHIM_FUNCTION_INTERFACE(property_has_capability, - propertyHasCapability); + DEFINE_SHIM_FUNCTION_INTERFACE(mem_de_register, memDeRegister); - // QnnTensor - DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_context_tensor, - tensorCreateContextTensor); + // QnnProperty + DEFINE_SHIM_FUNCTION_INTERFACE(property_has_capability, propertyHasCapability); - DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_graph_tensor, - tensorCreateGraphTensor); + // QnnTensor + DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_context_tensor, tensorCreateContextTensor); - // QnnSystem - DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_create, - systemContextCreate); + DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_graph_tensor, tensorCreateGraphTensor); - DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_get_binary_info, - systemContextGetBinaryInfo); - - DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_free, systemContextFree); - - void set_qnn_interface(const QnnInterface_t* qnn_interface) { - _qnn_interface = qnn_interface; - } + // QnnSystem + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_create, systemContextCreate); - void set_qnn_system_interface( - const QnnSystemInterface_t* qnn_sys_interface) { - _qnn_sys_interface = qnn_sys_interface; - } + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_get_binary_info, systemContextGetBinaryInfo); - uint32_t get_backend_id() const { return _qnn_interface->backendId; } + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_free, systemContextFree); - bool is_loaded() const { - return ((_qnn_sys_interface != nullptr) && (_qnn_interface != nullptr)); - } + void set_qnn_interface(const QnnInterface_t *qnn_interface) { _qnn_interface = qnn_interface; } - private: - const QnnInterface_t* _qnn_interface = nullptr; + void set_qnn_system_interface(const QnnSystemInterface_t *qnn_sys_interface) { + _qnn_sys_interface = qnn_sys_interface; + } - const QnnSystemInterface_t* _qnn_sys_interface = nullptr; - }; + uint32_t get_backend_id() const { return _qnn_interface->backendId; } + bool is_loaded() const { return ((_qnn_sys_interface != nullptr) && (_qnn_interface != nullptr)); } - class qnn_instance { - public: - using BackendIdType = decltype(QnnInterface_t{}.backendId); +private: + const QnnInterface_t *_qnn_interface = nullptr; - explicit qnn_instance(const std::string& lib_path, - const std::string& backend_name, - const std::string& model_name) - : _lib_path(std::move(lib_path)) - , _backend_name(std::move(backend_name)) - , 
_model_name(std::move(model_name)) {}; + const QnnSystemInterface_t *_qnn_sys_interface = nullptr; +}; - ~qnn_instance() {} +class qnn_instance { +public: + using BackendIdType = decltype(QnnInterface_t{}.backendId); - int qnn_init(const QnnSaver_Config_t** saver_config) { - BackendIdType backend_id = QNN_BACKEND_ID_NULL; - QNN_LOG_DEBUG("enter qni_init\n"); + explicit qnn_instance(const std::string &lib_path, const std::string &backend_name, const std::string &model_name) : + _lib_path(std::move(lib_path)), _backend_name(std::move(backend_name)), _model_name(std::move(model_name)) {}; - std::lock_guard lock(_init_mutex); + ~qnn_instance() {} - if (0 != load_system()) { - QNN_LOG_WARN("can not load QNN system lib, pls check why?\n"); - return 1; - } - else { - QNN_LOG_DEBUG("load QNN system lib successfully\n"); - } + int qnn_init(const QnnSaver_Config_t **saver_config) { + BackendIdType backend_id = QNN_BACKEND_ID_NULL; + QNN_LOG_DEBUG("enter qni_init\n"); - std::string backend_lib_path = _lib_path + _backend_name; - if (0 == _lib_path_to_backend_id.count(backend_lib_path)) { - int is_load_ok = load_backend(backend_lib_path, saver_config); - if (0 != is_load_ok) { - QNN_LOG_WARN("failed to load QNN backend\n"); - return 2; - } - } + std::lock_guard lock(_init_mutex); - backend_id = _lib_path_to_backend_id[backend_lib_path]; - if (0 == _loaded_backend.count(backend_id) || - 0 == _loaded_lib_handle.count(backend_id)) { - QNN_LOG_WARN("library %s is loaded but loaded backend count=%zu, " - "loaded lib_handle count=%zu\n", - backend_lib_path.c_str(), _loaded_backend.count(backend_id), - _loaded_lib_handle.count(backend_id)); - return 3; - } - - _qnn_interface.set_qnn_interface(_loaded_backend[backend_id]); - - _qnn_interface.qnn_log_create(qnn::sdk_logcallback, _qnn_log_level, &_qnn_log_handle); - if (nullptr == _qnn_log_handle) { - // NPU backend not work on Qualcomm SoC equipped low-end phone - QNN_LOG_WARN("why failed to initialize qnn log\n"); - return 4; - } - else { - QNN_LOG_DEBUG("initialize qnn log successfully\n"); - } + if (0 != load_system()) { + QNN_LOG_WARN("can not load QNN system lib, pls check why?\n"); + return 1; + } else { + QNN_LOG_DEBUG("load QNN system lib successfully\n"); + } - std::vector temp_backend_config; - _qnn_interface.qnn_backend_create( - _qnn_log_handle, - temp_backend_config.empty() ? 
nullptr : temp_backend_config.data(), - &_qnn_backend_handle); - if (nullptr == _qnn_backend_handle) { - QNN_LOG_WARN("why failed to initialize qnn backend\n"); - return 5; - } - else { - QNN_LOG_DEBUG("initialize qnn backend successfully\n"); + std::string backend_lib_path = _lib_path + _backend_name; + if (0 == _lib_path_to_backend_id.count(backend_lib_path)) { + int is_load_ok = load_backend(backend_lib_path, saver_config); + if (0 != is_load_ok) { + QNN_LOG_WARN("failed to load QNN backend\n"); + return 2; } + } - if (nullptr != _qnn_raw_interface.propertyHasCapability) { - Qnn_ErrorHandle_t qnn_status = - _qnn_raw_interface.propertyHasCapability(QNN_PROPERTY_GROUP_DEVICE); - if (QNN_PROPERTY_NOT_SUPPORTED == qnn_status) { - QNN_LOG_WARN("device property is not supported\n"); - } - if (QNN_PROPERTY_ERROR_UNKNOWN_KEY == qnn_status) { - QNN_LOG_WARN("device property is not known to backend\n"); + backend_id = _lib_path_to_backend_id[backend_lib_path]; + if (0 == _loaded_backend.count(backend_id) || 0 == _loaded_lib_handle.count(backend_id)) { + QNN_LOG_WARN( + "library %s is loaded but loaded backend count=%zu, " + "loaded lib_handle count=%zu\n", + backend_lib_path.c_str(), _loaded_backend.count(backend_id), _loaded_lib_handle.count(backend_id)); + return 3; + } + + _qnn_interface.set_qnn_interface(_loaded_backend[backend_id]); + + _qnn_interface.qnn_log_create(qnn::sdk_logcallback, _qnn_log_level, &_qnn_log_handle); + if (nullptr == _qnn_log_handle) { + // NPU backend not work on Qualcomm SoC equipped low-end phone + QNN_LOG_WARN("why failed to initialize qnn log\n"); + return 4; + } else { + QNN_LOG_DEBUG("initialize qnn log successfully\n"); + } + + std::vector temp_backend_config; + _qnn_interface.qnn_backend_create( + _qnn_log_handle, temp_backend_config.empty() ? nullptr : temp_backend_config.data(), &_qnn_backend_handle); + if (nullptr == _qnn_backend_handle) { + QNN_LOG_WARN("why failed to initialize qnn backend\n"); + return 5; + } else { + QNN_LOG_DEBUG("initialize qnn backend successfully\n"); + } + + if (nullptr != _qnn_raw_interface.propertyHasCapability) { + Qnn_ErrorHandle_t qnn_status = _qnn_raw_interface.propertyHasCapability(QNN_PROPERTY_GROUP_DEVICE); + if (QNN_PROPERTY_NOT_SUPPORTED == qnn_status) { + QNN_LOG_WARN("device property is not supported\n"); + } + if (QNN_PROPERTY_ERROR_UNKNOWN_KEY == qnn_status) { + QNN_LOG_WARN("device property is not known to backend\n"); + } + } + + Qnn_ErrorHandle_t qnn_status = QNN_SUCCESS; + if (_backend_name.find("Htp") != std::variant_npos) { + const QnnDevice_PlatformInfo_t *p_info = nullptr; + _qnn_raw_interface.deviceGetPlatformInfo(nullptr, &p_info); + QNN_LOG_INFO("device counts %d", p_info->v1.numHwDevices); + QnnDevice_HardwareDeviceInfo_t *infos = p_info->v1.hwDevices; + QnnHtpDevice_OnChipDeviceInfoExtension_t chipinfo = {}; + for (int i = 0; i < p_info->v1.numHwDevices; i++) { + QNN_LOG_INFO("deviceID:%d, deviceType:%d, numCores %d", infos[i].v1.deviceId, infos[i].v1.deviceType, + infos[i].v1.numCores); + QnnDevice_DeviceInfoExtension_t devinfo = infos[i].v1.deviceInfoExtension; + chipinfo = devinfo->onChipDevice; + QnnHtpDevice_Arch_t htp_arch = chipinfo.arch; + QNN_LOG_INFO("htp_type:%d(%s)", devinfo->devType, + (devinfo->devType == QNN_HTP_DEVICE_TYPE_ON_CHIP) ? 
"ON_CHIP" : ""); + QNN_LOG_INFO("qualcomm soc_model:%d(%s), htp_arch:%d(%s), vtcm_size:%d MB", chipinfo.socModel, + qnn::get_chipset_desc(chipinfo.socModel), htp_arch, qnn::get_htparch_desc(htp_arch), + chipinfo.vtcmSize); + _soc_info = { chipinfo.socModel, htp_arch, chipinfo.vtcmSize }; + } + _qnn_raw_interface.deviceFreePlatformInfo(nullptr, p_info); + + QnnHtpDevice_CustomConfig_t soc_customconfig; + soc_customconfig.option = QNN_HTP_DEVICE_CONFIG_OPTION_SOC; + soc_customconfig.socModel = chipinfo.socModel; + QnnDevice_Config_t soc_devconfig; + soc_devconfig.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; + soc_devconfig.customConfig = &soc_customconfig; + + QnnHtpDevice_CustomConfig_t arch_customconfig; + arch_customconfig.option = QNN_HTP_DEVICE_CONFIG_OPTION_ARCH; + arch_customconfig.arch.arch = chipinfo.arch; + arch_customconfig.arch.deviceId = 0; // Id of device to be used. If single device is used by default 0. + QnnDevice_Config_t arch_devconfig; + arch_devconfig.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; + arch_devconfig.customConfig = &arch_customconfig; + + const QnnDevice_Config_t *p_deviceconfig[] = { &soc_devconfig, &arch_devconfig, nullptr }; + qnn_status = _qnn_raw_interface.deviceCreate(_qnn_log_handle, p_deviceconfig, &_qnn_device_handle); + } else { + qnn_status = _qnn_raw_interface.deviceCreate(_qnn_log_handle, nullptr, &_qnn_device_handle); + } + if (QNN_SUCCESS != qnn_status && QNN_DEVICE_ERROR_UNSUPPORTED_FEATURE != qnn_status) { + QNN_LOG_WARN("failed to create QNN device\n"); + } else { + QNN_LOG_INFO("create QNN device successfully\n"); + } + + if (qnn::sdk_profile_level::profile_off != _profile_level) { + QNN_LOG_INFO("profiling turned on; level = %d", _profile_level); + if (qnn::sdk_profile_level::profile_basic == _profile_level) { + QNN_LOG_INFO("basic profiling requested. creating Qnn Profile object\n"); + if (QNN_PROFILE_NO_ERROR != _qnn_raw_interface.profileCreate( + _qnn_backend_handle, QNN_PROFILE_LEVEL_BASIC, &_qnn_profile_handle)) { + QNN_LOG_WARN("unable to create profile handle in the backend\n"); + return 6; + } else { + QNN_LOG_DEBUG("initialize qnn profile successfully\n"); } - } - - Qnn_ErrorHandle_t qnn_status = QNN_SUCCESS; - if (_backend_name.find("Htp") != std::variant_npos) { - const QnnDevice_PlatformInfo_t* p_info = nullptr; - _qnn_raw_interface.deviceGetPlatformInfo(nullptr, &p_info); - QNN_LOG_INFO("device counts %d", p_info->v1.numHwDevices); - QnnDevice_HardwareDeviceInfo_t* infos = p_info->v1.hwDevices; - QnnHtpDevice_OnChipDeviceInfoExtension_t chipinfo = { }; - for (int i = 0; i < p_info->v1.numHwDevices; i++) { - QNN_LOG_INFO("deviceID:%d, deviceType:%d, numCores %d", infos[i].v1.deviceId, - infos[i].v1.deviceType, infos[i].v1.numCores); - QnnDevice_DeviceInfoExtension_t devinfo = infos[i].v1.deviceInfoExtension; - chipinfo = devinfo->onChipDevice; - QnnHtpDevice_Arch_t htp_arch = chipinfo.arch; - QNN_LOG_INFO("htp_type:%d(%s)", devinfo->devType, (devinfo->devType == QNN_HTP_DEVICE_TYPE_ON_CHIP) ? "ON_CHIP" : ""); - QNN_LOG_INFO("qualcomm soc_model:%d(%s), htp_arch:%d(%s), vtcm_size:%d MB", - chipinfo.socModel, qnn::get_chipset_desc(chipinfo.socModel), - htp_arch, qnn::get_htparch_desc(htp_arch), chipinfo.vtcmSize); - _soc_info = { chipinfo.socModel, htp_arch, chipinfo.vtcmSize }; + } else if (qnn::sdk_profile_level::profile_detail == _profile_level) { + QNN_LOG_INFO("detailed profiling requested. 
Creating Qnn Profile object\n"); + if (QNN_PROFILE_NO_ERROR != _qnn_raw_interface.profileCreate(_qnn_backend_handle, + QNN_PROFILE_LEVEL_DETAILED, + &_qnn_profile_handle)) { + QNN_LOG_WARN("unable to create profile handle in the backend\n"); + return 7; + } else { + QNN_LOG_DEBUG("initialize qnn profile successfully\n"); } - _qnn_raw_interface.deviceFreePlatformInfo(nullptr, p_info); - - QnnHtpDevice_CustomConfig_t soc_customconfig; - soc_customconfig.option = QNN_HTP_DEVICE_CONFIG_OPTION_SOC; - soc_customconfig.socModel = chipinfo.socModel; - QnnDevice_Config_t soc_devconfig; - soc_devconfig.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; - soc_devconfig.customConfig = &soc_customconfig; - - QnnHtpDevice_CustomConfig_t arch_customconfig; - arch_customconfig.option = QNN_HTP_DEVICE_CONFIG_OPTION_ARCH; - arch_customconfig.arch.arch = chipinfo.arch; - arch_customconfig.arch.deviceId = 0; // Id of device to be used. If single device is used by default 0. - QnnDevice_Config_t arch_devconfig; - arch_devconfig.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; - arch_devconfig.customConfig = &arch_customconfig; - - const QnnDevice_Config_t* p_deviceconfig[] = { &soc_devconfig, &arch_devconfig, nullptr }; - qnn_status = _qnn_raw_interface.deviceCreate(_qnn_log_handle, p_deviceconfig, &_qnn_device_handle); - } - else { - qnn_status = _qnn_raw_interface.deviceCreate(_qnn_log_handle, nullptr, &_qnn_device_handle); - } - if (QNN_SUCCESS != qnn_status && - QNN_DEVICE_ERROR_UNSUPPORTED_FEATURE != qnn_status) { - QNN_LOG_WARN("failed to create QNN device\n"); - } - else { - QNN_LOG_INFO("create QNN device successfully\n"); } + } - if (qnn::sdk_profile_level::profile_off != _profile_level) { - QNN_LOG_INFO("profiling turned on; level = %d", _profile_level); - if (qnn::sdk_profile_level::profile_basic == _profile_level) { - QNN_LOG_INFO("basic profiling requested. creating Qnn Profile object\n"); - if (QNN_PROFILE_NO_ERROR != - _qnn_raw_interface.profileCreate(_qnn_backend_handle, - QNN_PROFILE_LEVEL_BASIC, - &_qnn_profile_handle)) { - QNN_LOG_WARN("unable to create profile handle in the backend\n"); - return 6; - } - else { - QNN_LOG_DEBUG("initialize qnn profile successfully\n"); - } - } - else if (qnn::sdk_profile_level::profile_detail == _profile_level) { - QNN_LOG_INFO("detailed profiling requested. Creating Qnn Profile object\n"); - if (QNN_PROFILE_NO_ERROR != - _qnn_raw_interface.profileCreate(_qnn_backend_handle, - QNN_PROFILE_LEVEL_DETAILED, - &_qnn_profile_handle)) { - QNN_LOG_WARN("unable to create profile handle in the backend\n"); - return 7; - } - else { - QNN_LOG_DEBUG("initialize qnn profile successfully\n"); - } + _rpc_lib_handle = dlopen("libcdsprpc.so", RTLD_NOW | RTLD_LOCAL); + if (nullptr == _rpc_lib_handle) { + QNN_LOG_WARN("failed to load qualcomm's rpc lib, error:%s\n", dlerror()); + return 8; + } else { + QNN_LOG_DEBUG("load rpcmem lib successfully\n"); + set_rpcmem_initialized(true); + } + _pfn_rpc_mem_init = reinterpret_cast(dlsym(_rpc_lib_handle, "rpcmem_init")); + _pfn_rpc_mem_deinit = reinterpret_cast(dlsym(_rpc_lib_handle, "rpcmem_deinit")); + _pfn_rpc_mem_alloc = reinterpret_cast(dlsym(_rpc_lib_handle, "rpcmem_alloc")); + _pfn_rpc_mem_free = reinterpret_cast(dlsym(_rpc_lib_handle, "rpcmem_free")); + _pfn_rpc_mem_to_fd = reinterpret_cast(dlsym(_rpc_lib_handle, "rpcmem_to_fd")); + if (nullptr == _pfn_rpc_mem_alloc || nullptr == _pfn_rpc_mem_free || nullptr == _pfn_rpc_mem_to_fd) { + QNN_LOG_WARN("unable to access symbols in QNN RPC lib. 
dlerror(): %s", dlerror()); + dlclose(_rpc_lib_handle); + return 9; + } + + if (nullptr != _pfn_rpc_mem_init) { // make Qualcomm's SoC equipped low-end phone happy + _pfn_rpc_mem_init(); + } + + /* TODO: not used, keep it for further usage + QnnContext_Config_t qnn_context_config = QNN_CONTEXT_CONFIG_INIT; + qnn_context_config.priority = QNN_PRIORITY_DEFAULT; + const QnnContext_Config_t * context_configs[] = {&qnn_context_config, nullptr}; + */ + _qnn_interface.qnn_context_create(_qnn_backend_handle, _qnn_device_handle, nullptr, &_qnn_context_handle); + if (nullptr == _qnn_context_handle) { + QNN_LOG_WARN("why failed to initialize qnn context\n"); + return 10; + } else { + QNN_LOG_DEBUG("initialize qnn context successfully\n"); + } + + if (_backend_name.find("Htp") != std::variant_npos) { + // TODO: faster approach to probe the accurate capacity of rpc ion memory + size_t candidate_size = 0; + uint8_t *rpc_buffer = nullptr; + const int size_in_mb = (1 << 20); + size_t probe_slots[] = { 1024, 1536, 2048 - 48, 2048 }; + size_t probe_counts = sizeof(probe_slots) / sizeof(size_t); + for (size_t idx = 0; idx < probe_counts; idx++) { + rpc_buffer = static_cast(alloc_rpcmem(probe_slots[idx] * size_in_mb, 4)); + if (nullptr == rpc_buffer) { + QNN_LOG_INFO("alloc rpcmem %d (MB) failure, %s\n", probe_slots[idx], strerror(errno)); + break; + } else { + candidate_size = probe_slots[idx]; + free_rpcmem(rpc_buffer); + rpc_buffer = nullptr; } } + if (candidate_size > _rpcmem_capacity) _rpcmem_capacity = candidate_size; + QNN_LOG_INFO("capacity of QNN rpc ion memory is about %d MB\n", _rpcmem_capacity); - _rpc_lib_handle = dlopen("libcdsprpc.so", RTLD_NOW | RTLD_LOCAL); - if (nullptr == _rpc_lib_handle) { - QNN_LOG_WARN("failed to load qualcomm's rpc lib, error:%s\n", dlerror()); - return 8; - } - else { - QNN_LOG_DEBUG("load rpcmem lib successfully\n"); - set_rpcmem_initialized(true); + if (0 != init_htp_perfinfra()) { + QNN_LOG_WARN("initialize HTP performance failure"); } - _pfn_rpc_mem_init = reinterpret_cast( - dlsym(_rpc_lib_handle, "rpcmem_init")); - _pfn_rpc_mem_deinit = reinterpret_cast( - dlsym(_rpc_lib_handle, "rpcmem_deinit")); - _pfn_rpc_mem_alloc = reinterpret_cast( - dlsym(_rpc_lib_handle, "rpcmem_alloc")); - _pfn_rpc_mem_free = reinterpret_cast( - dlsym(_rpc_lib_handle, "rpcmem_free")); - _pfn_rpc_mem_to_fd = reinterpret_cast( - dlsym(_rpc_lib_handle, "rpcmem_to_fd")); - if (nullptr == _pfn_rpc_mem_alloc || nullptr == _pfn_rpc_mem_free || - nullptr == _pfn_rpc_mem_to_fd) { - QNN_LOG_WARN("unable to access symbols in QNN RPC lib. 
dlerror(): %s", dlerror()); - dlclose(_rpc_lib_handle); - return 9; + if (0 != set_rpc_polling()) { + QNN_LOG_WARN("set RPC polling failure"); } - - if (nullptr != _pfn_rpc_mem_init) { // make Qualcomm's SoC equipped low-end phone happy - _pfn_rpc_mem_init(); + if (0 != set_high_performance_mode()) { + QNN_LOG_WARN("set HTP high performance mode failure"); } + } - /* TODO: not used, keep it for further usage - QnnContext_Config_t qnn_context_config = QNN_CONTEXT_CONFIG_INIT; - qnn_context_config.priority = QNN_PRIORITY_DEFAULT; - const QnnContext_Config_t * context_configs[] = {&qnn_context_config, nullptr}; - */ - _qnn_interface.qnn_context_create( - _qnn_backend_handle, _qnn_device_handle, - nullptr, - &_qnn_context_handle); - if (nullptr == _qnn_context_handle) { - QNN_LOG_WARN("why failed to initialize qnn context\n"); - return 10; - } - else { - QNN_LOG_DEBUG("initialize qnn context successfully\n"); - } + QNN_LOG_DEBUG("leave qni_init\n"); - if (_backend_name.find("Htp") != std::variant_npos) { - //TODO: faster approach to probe the accurate capacity of rpc ion memory - size_t candidate_size = 0; - uint8_t* rpc_buffer = nullptr; - const int size_in_mb = (1 << 20); - size_t probe_slots[] = { 1024, 1536, 2048 - 48, 2048 }; - size_t probe_counts = sizeof(probe_slots) / sizeof(size_t); - for (size_t idx = 0; idx < probe_counts; idx++) { - rpc_buffer = static_cast(alloc_rpcmem( - probe_slots[idx] * size_in_mb, 4)); - if (nullptr == rpc_buffer) { - QNN_LOG_INFO("alloc rpcmem %d (MB) failure, %s\n", - probe_slots[idx], strerror(errno)); - break; - } - else { - candidate_size = probe_slots[idx]; - free_rpcmem(rpc_buffer); - rpc_buffer = nullptr; - } - } - if (candidate_size > _rpcmem_capacity) - _rpcmem_capacity = candidate_size; - QNN_LOG_INFO("capacity of QNN rpc ion memory is about %d MB\n", _rpcmem_capacity); + return 0; + } - if (0 != init_htp_perfinfra()) { - QNN_LOG_WARN("initialize HTP performance failure"); - } - if (0 != set_rpc_polling()) { - QNN_LOG_WARN("set RPC polling failure"); - } - if (0 != set_high_performance_mode()) { - QNN_LOG_WARN("set HTP high performance mode failure"); - } - } + int qnn_finalize() { + int ret_status = 0; + Qnn_ErrorHandle_t error = QNN_SUCCESS; - QNN_LOG_DEBUG("leave qni_init\n"); + if (nullptr != _pfn_rpc_mem_deinit) // make Qualcomm's SoC equipped low-end phone happy + _pfn_rpc_mem_deinit(); - return 0; + if (dlclose(_rpc_lib_handle) != 0) { + QNN_LOG_WARN("failed to unload qualcomm's rpc lib, error:%s\n", dlerror()); + } else { + QNN_LOG_DEBUG("succeed to close rpcmem lib\n"); } - int qnn_finalize() { - int ret_status = 0; - Qnn_ErrorHandle_t error = QNN_SUCCESS; - - if (nullptr != _pfn_rpc_mem_deinit) // make Qualcomm's SoC equipped low-end phone happy - _pfn_rpc_mem_deinit(); + if (_backend_name.find("Htp") != std::variant_npos) { + _qnn_htp_perfinfra->destroyPowerConfigId(_qnn_power_configid); + } - if (dlclose(_rpc_lib_handle) != 0) { - QNN_LOG_WARN("failed to unload qualcomm's rpc lib, error:%s\n", dlerror()); - } - else { - QNN_LOG_DEBUG("succeed to close rpcmem lib\n"); + if (nullptr != _qnn_context_handle) { + error = _qnn_interface.qnn_context_free(_qnn_context_handle, _qnn_profile_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN context_handle: ID %u, error %d\n", _qnn_interface.get_backend_id(), + QNN_GET_ERROR_CODE(error)); } + _qnn_context_handle = nullptr; + } - if (_backend_name.find("Htp") != std::variant_npos) { - _qnn_htp_perfinfra->destroyPowerConfigId(_qnn_power_configid); + if (nullptr != 
_qnn_profile_handle) { + error = _qnn_interface.qnn_profile_free(_qnn_profile_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN profile_handle: ID %u, error %d\n", _qnn_interface.get_backend_id(), + QNN_GET_ERROR_CODE(error)); } + _qnn_profile_handle = nullptr; + } - if (nullptr != _qnn_context_handle) { - error = _qnn_interface.qnn_context_free(_qnn_context_handle, - _qnn_profile_handle); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN context_handle: ID %u, error %d\n", - _qnn_interface.get_backend_id(), - QNN_GET_ERROR_CODE(error)); - } - _qnn_context_handle = nullptr; + if (nullptr != _qnn_device_handle) { + error = _qnn_interface.qnn_device_free(_qnn_device_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN device_handle: ID %u, error %d\n", _qnn_interface.get_backend_id(), + QNN_GET_ERROR_CODE(error)); } + _qnn_device_handle = nullptr; + } - if (nullptr != _qnn_profile_handle) { - error = _qnn_interface.qnn_profile_free(_qnn_profile_handle); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN profile_handle: ID %u, error %d\n", - _qnn_interface.get_backend_id(), - QNN_GET_ERROR_CODE(error)); - } - _qnn_profile_handle = nullptr; + if (nullptr != _qnn_backend_handle) { + error = _qnn_interface.qnn_backend_free(_qnn_backend_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN backend_handle: ID %u, error %d\n", _qnn_interface.get_backend_id(), + QNN_GET_ERROR_CODE(error)); } + _qnn_backend_handle = nullptr; + } - if (nullptr != _qnn_device_handle) { - error = _qnn_interface.qnn_device_free(_qnn_device_handle); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN device_handle: ID %u, error %d\n", - _qnn_interface.get_backend_id(), - QNN_GET_ERROR_CODE(error)); - } - _qnn_device_handle = nullptr; + if (nullptr != _qnn_log_handle) { + error = _qnn_interface.qnn_log_free(_qnn_log_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN log_handle: ID %u, error %d\n", _qnn_interface.get_backend_id(), + QNN_GET_ERROR_CODE(error)); } + _qnn_log_handle = nullptr; + } - if (nullptr != _qnn_backend_handle) { - error = _qnn_interface.qnn_backend_free(_qnn_backend_handle); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN backend_handle: ID %u, error %d\n", - _qnn_interface.get_backend_id(), - QNN_GET_ERROR_CODE(error)); - } - _qnn_backend_handle = nullptr; - } + unload_backend(); - if (nullptr != _qnn_log_handle) { - error = _qnn_interface.qnn_log_free(_qnn_log_handle); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN log_handle: ID %u, error %d\n", - _qnn_interface.get_backend_id(), - QNN_GET_ERROR_CODE(error)); - } - _qnn_log_handle = nullptr; - } + unload_system(); - unload_backend(); + return ret_status; + } - unload_system(); + const qnn_interface &get_qnn_interface() { + if (!_qnn_interface.is_loaded()) { + QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + } + return _qnn_interface; + } - return ret_status; + const QNN_INTERFACE_VER_TYPE &get_qnn_raw_interface() { + if (!_qnn_interface.is_loaded()) { + QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); } + return _qnn_raw_interface; + } - //TODO:keep it for further usage of offload the entire cgraph to a single QNN DAG directly - // which was used in Qualcomm's dedicated AI technology -#if 0 - int init_qnn_graph(const char* graph_name, bool debug, - uint8_t do_node_validation = true, - const QnnGraph_Config_t** graph_configs = nullptr) { - int result = 0; + 
const QNN_SYSTEM_INTERFACE_VER_TYPE &get_qnn_raw_system_interface() { + if (!_qnn_interface.is_loaded()) { + QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + } + return _qnn_raw_system_interface; + } - if (nullptr == graph_name) { - QNN_LOG_WARN("graph name is null\n"); - return 1; - } + const Qnn_LogHandle_t get_qnn_log_handle() { return _qnn_log_handle; } - if (!_graph_name.empty()) { - QNN_LOG_WARN("qnn model for graph %s already initialized\n", graph_name); - return 2; - } + const Qnn_ProfileHandle_t get_qnn_profile_handle() { return _qnn_profile_handle; } - if (!do_node_validation) { - QNN_LOG_WARN("node validation disabled, backend will not perform op " - "validation prior to adding node\n"); - } + const Qnn_DeviceHandle_t get_qnn_device_handle() { return _qnn_device_handle; } - _graph_name = graph_name; - _debug_tensor = debug; - _do_node_validations = do_node_validation; + const Qnn_BackendHandle_t get_qnn_backend_handle() { return _qnn_backend_handle; } - result = _qnn_raw_interface.graphCreate(_qnn_context_handle, graph_name, - graph_configs, &_qnn_graph_handle); - if (result != QNN_GRAPH_NO_ERROR || nullptr == _qnn_graph_handle) { - QNN_LOG_WARN("failed to create graph in qnn context\n"); - return 3; - } - else { - QNN_LOG_INFO("succeed to create graph %s, %p\n", graph_name, _qnn_graph_handle); - } + const Qnn_ContextHandle_t get_qnn_context_handle() { return _qnn_context_handle; } - return 0; - } + const QnnSystemContext_Handle_t get_qnn_system_handle() { return _qnn_system_handle; } - int finalize_qnn_graph() { - if (nullptr != _qnn_graph_handle) { - if (_qnn_raw_interface.graphFinalize(_qnn_graph_handle, - _qnn_profile_handle, - nullptr) != QNN_GRAPH_NO_ERROR) { - QNN_LOG_WARN("finalizing graph failure\n"); - } - } - else { - QNN_LOG_DEBUG("qnn graph handle is null\n"); - } + const Qnn_GraphHandle_t get_qnn_graph_handle() { return _qnn_graph_handle; } - return 0; + int init_htp_perfinfra() { + QnnDevice_Infrastructure_t device_infra = nullptr; + int error = _qnn_raw_interface.deviceGetInfrastructure(&device_infra); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to get qnn device infra\n"); + return 1; + } else { + QNN_LOG_INFO("HTP backend perf_infrastructure creation ok\n"); } -#endif - const qnn_interface& get_qnn_interface() { - if (!_qnn_interface.is_loaded()) { - QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); - } - return _qnn_interface; + QnnHtpDevice_Infrastructure_t *htp_infra = static_cast(device_infra); + QnnHtpDevice_PerfInfrastructure_t *htp_perfinfra = &htp_infra->perfInfra; + uint32_t power_configid = 1; + uint32_t device_id = 0; + uint32_t core_id = 0; + htp_perfinfra->createPowerConfigId(device_id, core_id, &power_configid); + if (htp_infra->infraType != QNN_HTP_DEVICE_INFRASTRUCTURE_TYPE_PERF) { + QNN_LOG_INFO("HTP infra type = %d, which is not perf infra type", htp_infra->infraType); + } else { + QNN_LOG_INFO("HTP infra type = %d, which is perf infra type\n", htp_infra->infraType); } + _qnn_htp_perfinfra = htp_perfinfra; + _qnn_power_configid = power_configid; - const QNN_INTERFACE_VER_TYPE& get_qnn_raw_interface() { - if (!_qnn_interface.is_loaded()) { - QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); - } - return _qnn_raw_interface; - } + return 0; + } - const QNN_SYSTEM_INTERFACE_VER_TYPE& get_qnn_raw_system_interface() { - if (!_qnn_interface.is_loaded()) { - QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + int set_rpc_polling() { + if (_qnn_htp_perfinfra) { + 
QnnHtpPerfInfrastructure_PowerConfig_t rpc_polling_time; + memset(&rpc_polling_time, 0, sizeof(rpc_polling_time)); + rpc_polling_time.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME; + // use rpc polling time recommended 0-10000 us + rpc_polling_time.rpcPollingTimeConfig = 9999; + + QnnHtpPerfInfrastructure_PowerConfig_t rpc_control_latency; + memset(&rpc_control_latency, 0, sizeof(rpc_control_latency)); + rpc_control_latency.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_CONTROL_LATENCY; + // use rpc control latency recommended 100 us, refer hexagon sdk + rpc_control_latency.rpcControlLatencyConfig = 100; + + const QnnHtpPerfInfrastructure_PowerConfig_t *power_configs[] = { &rpc_polling_time, &rpc_control_latency, + nullptr }; + Qnn_ErrorHandle_t qnn_status = _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, power_configs); + if (qnn_status != QNN_SUCCESS) { + QNN_LOG_WARN("set htp perf failed\n"); + } else { + QNN_LOG_INFO("set htp perf ok\n"); } - return _qnn_raw_system_interface; + } else { + QNN_LOG_WARN("can't set htp perf\n"); } - const Qnn_LogHandle_t get_qnn_log_handle() { return _qnn_log_handle; } + return 0; + } - const Qnn_ProfileHandle_t get_qnn_profile_handle() { - return _qnn_profile_handle; - } + int set_high_performance_mode() { + if (nullptr == _qnn_htp_perfinfra) { + QNN_LOG_WARN("perf intra is null\n"); + return 1; + } + + QnnHtpPerfInfrastructure_PowerConfig_t power_config; + memset(&power_config, 0, sizeof(power_config)); + power_config.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_V3; + + power_config.dcvsV3Config.setDcvsEnable = 1; + power_config.dcvsV3Config.dcvsEnable = 0; + power_config.dcvsV3Config.contextId = _qnn_power_configid; + power_config.dcvsV3Config.powerMode = QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_PERFORMANCE_MODE; + power_config.dcvsV3Config.setSleepLatency = 1; // true to consider Latency parameter otherwise false + power_config.dcvsV3Config.sleepLatency = 40; + power_config.dcvsV3Config.setBusParams = 1; // true to consider Bus parameter otherwise false + power_config.dcvsV3Config.setCoreParams = 1; // true to consider Core parameter otherwise false + power_config.dcvsV3Config.sleepDisable = 1; // true to consider sleep/LPM modes, false to enable + power_config.dcvsV3Config.setSleepDisable = + 1; // true to consider sleep disable/enable parameter otherwise false set sleep latency parameter + // set Bus Clock Parameters + power_config.dcvsV3Config.busVoltageCornerMin = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + power_config.dcvsV3Config.busVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + power_config.dcvsV3Config.busVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + // set Core Clock Parameters + power_config.dcvsV3Config.coreVoltageCornerMin = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + power_config.dcvsV3Config.coreVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + power_config.dcvsV3Config.coreVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + + // set power config with different performance parameters + const QnnHtpPerfInfrastructure_PowerConfig_t *power_configs[] = { &power_config, nullptr }; + Qnn_ErrorHandle_t qnn_status = QNN_SUCCESS; + qnn_status = _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, power_configs); + if (qnn_status != QNN_SUCCESS) { + QNN_LOG_WARN("set htp high performance mode failed\n"); + } else { + QNN_LOG_INFO("set htp high performance mode ok\n"); + } + + return 0; + } - const Qnn_DeviceHandle_t 
get_qnn_device_handle() { - return _qnn_device_handle; - } + std::string &get_qnn_graph_name() { return _graph_name; } - const Qnn_BackendHandle_t get_qnn_backend_handle() { - return _qnn_backend_handle; - } + bool is_rpcmem_initialized() { return _rpcmem_initialized; } - const Qnn_ContextHandle_t get_qnn_context_handle() { - return _qnn_context_handle; - } + void set_rpcmem_initialized(bool initialized) { _rpcmem_initialized = initialized; } - const QnnSystemContext_Handle_t get_qnn_system_handle() { - return _qnn_system_handle; - } + size_t get_rpcmem_capacity() { return _rpcmem_capacity; } - const Qnn_GraphHandle_t get_qnn_graph_handle() { return _qnn_graph_handle; } + bool is_rpcmem_registered(Qnn_MemHandle_t handle) { return _qnn_mem_set.count(handle) != 0U; } - int init_htp_perfinfra() { - QnnDevice_Infrastructure_t device_infra = nullptr; - int error = _qnn_raw_interface.deviceGetInfrastructure(&device_infra); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to get qnn device infra\n"); - return 1; - } - else { - QNN_LOG_INFO("HTP backend perf_infrastructure creation ok\n"); - } + void *alloc_rpcmem(size_t bytes, size_t alignment) { + if (!_rpcmem_initialized) { + QNN_LOG_WARN("rpc memory not initialized\n"); + return nullptr; + } - QnnHtpDevice_Infrastructure_t* htp_infra = static_cast(device_infra); - QnnHtpDevice_PerfInfrastructure_t* htp_perfinfra = &htp_infra->perfInfra; - uint32_t power_configid = 1; - uint32_t device_id = 0; - uint32_t core_id = 0; - htp_perfinfra->createPowerConfigId(device_id, core_id, &power_configid); - if (htp_infra->infraType != QNN_HTP_DEVICE_INFRASTRUCTURE_TYPE_PERF) { - QNN_LOG_INFO("HTP infra type = %d, which is not perf infra type", htp_infra->infraType); - } - else { - QNN_LOG_INFO("HTP infra type = %d, which is perf infra type\n", htp_infra->infraType); - } - _qnn_htp_perfinfra = htp_perfinfra; - _qnn_power_configid = power_configid; - - return 0; - } - - int set_rpc_polling() { - if (_qnn_htp_perfinfra) { - QnnHtpPerfInfrastructure_PowerConfig_t rpc_polling_time; - memset(&rpc_polling_time, 0, sizeof(rpc_polling_time)); - rpc_polling_time.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME; - //use rpc polling time recommended 0-10000 us - rpc_polling_time.rpcPollingTimeConfig = 9999; - - QnnHtpPerfInfrastructure_PowerConfig_t rpc_control_latency; - memset(&rpc_control_latency, 0, sizeof(rpc_control_latency)); - rpc_control_latency.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_CONTROL_LATENCY; - //use rpc control latency recommended 100 us, refer hexagon sdk - rpc_control_latency.rpcControlLatencyConfig = 100; - - const QnnHtpPerfInfrastructure_PowerConfig_t* power_configs[] = { - &rpc_polling_time, - &rpc_control_latency, - nullptr }; - Qnn_ErrorHandle_t qnn_status = _qnn_htp_perfinfra->setPowerConfig( - _qnn_power_configid, - power_configs); - if (qnn_status != QNN_SUCCESS) { - QNN_LOG_WARN("set htp perf failed\n"); - } - else { - QNN_LOG_INFO("set htp perf ok\n"); - } - } - else { - QNN_LOG_WARN("can't set htp perf\n"); - } + auto allocate_bytes = static_cast(bytes + alignment); + void *buf = _pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, allocate_bytes); + if (buf == nullptr) { + QNN_LOG_WARN("failed to allocate rpc memory\n"); + return nullptr; + } - return 0; + auto aligned_buf = reinterpret_cast(qnn::align_to(alignment, reinterpret_cast(buf))); + bool status = _rpcmem_store_map.insert(std::pair(aligned_buf, buf)).second; + if (!status) { + QNN_LOG_WARN("failed to allocate rpc 
memory\n"); + _pfn_rpc_mem_free(buf); } - int set_high_performance_mode() { - if (nullptr == _qnn_htp_perfinfra) { - QNN_LOG_WARN("perf intra is null\n"); - return 1; - } + return aligned_buf; + } - QnnHtpPerfInfrastructure_PowerConfig_t power_config; - memset(&power_config, 0, sizeof(power_config)); - power_config.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_V3; - - power_config.dcvsV3Config.setDcvsEnable = 1; - power_config.dcvsV3Config.dcvsEnable = 0; - power_config.dcvsV3Config.contextId = _qnn_power_configid; - power_config.dcvsV3Config.powerMode = QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_PERFORMANCE_MODE; - power_config.dcvsV3Config.setSleepLatency = - 1; // true to consider Latency parameter otherwise false - power_config.dcvsV3Config.sleepLatency = 40; - power_config.dcvsV3Config.setBusParams = - 1; // true to consider Bus parameter otherwise false - power_config.dcvsV3Config.setCoreParams = - 1; // true to consider Core parameter otherwise false - power_config.dcvsV3Config.sleepDisable = - 1; // true to consider sleep/LPM modes, false to enable - power_config.dcvsV3Config.setSleepDisable = - 1; // true to consider sleep disable/enable parameter otherwise false set sleep latency parameter - // set Bus Clock Parameters - power_config.dcvsV3Config.busVoltageCornerMin = - DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - power_config.dcvsV3Config.busVoltageCornerTarget = - DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - power_config.dcvsV3Config.busVoltageCornerMax = - DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - // set Core Clock Parameters - power_config.dcvsV3Config.coreVoltageCornerMin = - DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - power_config.dcvsV3Config.coreVoltageCornerTarget = - DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - power_config.dcvsV3Config.coreVoltageCornerMax = - DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - - // set power config with different performance parameters - const QnnHtpPerfInfrastructure_PowerConfig_t* power_configs[] = { - &power_config, nullptr }; - Qnn_ErrorHandle_t qnn_status = QNN_SUCCESS; - qnn_status = _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, power_configs); - if (qnn_status != QNN_SUCCESS) { - QNN_LOG_WARN("set htp high performance mode failed\n"); - } - else { - QNN_LOG_INFO("set htp high performance mode ok\n"); - } + void free_rpcmem(void *buf) { + if (!_rpcmem_initialized) { + QNN_LOG_WARN("rpc memory not initialized\n"); + } else if (0 == _rpcmem_store_map.count(buf)) { + QNN_LOG_WARN("no allocated tensor\n"); + } else { + _pfn_rpc_mem_free(_rpcmem_store_map[buf]); + _rpcmem_store_map.erase(buf); + } + } - return 0; + int32_t rpcmem_to_fd(void *buf) { + int32_t mem_fd = -1; + if (!is_rpcmem_initialized()) { + QNN_LOG_WARN("rpc memory not initialized\n"); + } else { + mem_fd = _pfn_rpc_mem_to_fd(buf); } - std::string& get_qnn_graph_name() { return _graph_name; } + return mem_fd; + } - bool is_rpcmem_initialized() { return _rpcmem_initialized; } + int register_rpcmem(void *p_data, Qnn_Tensor_t *p_tensor) { + if (nullptr == p_data || (nullptr == p_tensor)) { + QNN_LOG_WARN("invalid param\n"); + return 1; + } - void set_rpcmem_initialized(bool initialized) { - _rpcmem_initialized = initialized; + if (!is_rpcmem_initialized()) { + QNN_LOG_WARN("rpc memory not initialized\n"); + return 2; } - size_t get_rpcmem_capacity() { return _rpcmem_capacity; } + if (is_rpcmem_allocated(p_data)) { + QNN_LOG_WARN("rpc memory already allocated\n"); + return 3; + } - bool is_rpcmem_registered(Qnn_MemHandle_t handle) { - return 
_qnn_mem_set.count(handle) != 0U; + if (is_rpcmem_registered((QNN_VER_PTR(*p_tensor)->memHandle))) { + QNN_LOG_WARN("tensor %s has been registered shared memory\n", (QNN_VER_PTR(*p_tensor)->name)); + return 4; } - void* alloc_rpcmem(size_t bytes, size_t alignment) { - if (!_rpcmem_initialized) { - QNN_LOG_WARN("rpc memory not initialized\n"); - return nullptr; - } + int32_t mem_fd = rpcmem_to_fd(p_data); + if (-1 == mem_fd) { + QNN_LOG_WARN("failed to get file descriptor\n"); + return 5; + } + QNN_LOG_INFO("mem_fd %d\n", mem_fd); + Qnn_MemDescriptor_t descriptor = { { QNN_VER_PTR(*p_tensor)->rank, QNN_VER_PTR(*p_tensor)->dimensions, + nullptr }, + QNN_VER_PTR(*p_tensor)->dataType, + QNN_MEM_TYPE_ION, + { { mem_fd } } }; + Qnn_MemHandle_t handle = nullptr; + int error = QNN_SUCCESS; + error = _qnn_interface.qnn_mem_register(_qnn_context_handle, &descriptor, + /*numDescriptors=*/1, &handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to register shared memory, error %d, %s\n", QNN_GET_ERROR_CODE(error), + strerror(error)); + return 6; + } else { + QNN_LOG_INFO("tensor %s successfully register shared memory\n", (QNN_VER_PTR(*p_tensor)->name)); + } + QNN_VER_PTR(*p_tensor)->memHandle = handle; + _qnn_mem_set.insert((std::pair(p_data, handle))); - auto allocate_bytes = static_cast(bytes + alignment); - void* buf = _pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, - allocate_bytes); - if (buf == nullptr) { - QNN_LOG_WARN("failed to allocate rpc memory\n"); - return nullptr; - } + return 0; + } - auto aligned_buf = reinterpret_cast( - qnn::align_to(alignment, reinterpret_cast(buf))); - bool status = - _rpcmem_store_map.insert(std::pair(aligned_buf, buf)).second; - if (!status) { - QNN_LOG_WARN("failed to allocate rpc memory\n"); - _pfn_rpc_mem_free(buf); + void *get_rpcmem_from_memhandle(Qnn_MemHandle_t mem_handle) { + for (std::unordered_map::iterator it = _qnn_mem_set.begin(); it != _qnn_mem_set.end(); + it++) { + Qnn_MemHandle_t mem_handle = it->second; + if (it->second == mem_handle) { + return it->first; } - - return aligned_buf; } + QNN_LOG_WARN("can't find rpcmem from qnn mem handle %p", mem_handle); + return nullptr; + } - void free_rpcmem(void* buf) { - if (!_rpcmem_initialized) { - QNN_LOG_WARN("rpc memory not initialized\n"); - } - else if (0 == _rpcmem_store_map.count(buf)) { - QNN_LOG_WARN("no allocated tensor\n"); - } - else { - _pfn_rpc_mem_free(_rpcmem_store_map[buf]); - _rpcmem_store_map.erase(buf); - } + void unregister_rpcmem() { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + + if (_qnn_mem_set.empty()) { + QNN_LOG_WARN("no rpcmem registered\n"); } - int32_t rpcmem_to_fd(void* buf) { - int32_t mem_fd = -1; - if (!is_rpcmem_initialized()) { - QNN_LOG_WARN("rpc memory not initialized\n"); - } - else { - mem_fd = _pfn_rpc_mem_to_fd(buf); + for (std::unordered_map::iterator it = _qnn_mem_set.begin(); it != _qnn_mem_set.end(); + it++) { + Qnn_MemHandle_t mem_handle = it->second; + error = _qnn_interface.qnn_mem_de_register(&mem_handle, 1); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to unregister shared memory, error %d\n", QNN_GET_ERROR_CODE(error)); } - - return mem_fd; } + _qnn_mem_set.clear(); + } - int register_rpcmem(void* p_data, Qnn_Tensor_t* p_tensor) { - if (nullptr == p_data || (nullptr == p_tensor)) { - QNN_LOG_WARN("invalid param\n"); - return 1; - } + bool is_rpcmem_allocated(void *buf) { return _qnn_mem_set.count(buf) != 0U; } - if (!is_rpcmem_initialized()) { - QNN_LOG_WARN("rpc memory not initialized\n"); - return 2; - } + const 
qnn::qcom_socinfo &get_soc_info() { return _soc_info; } - if (is_rpcmem_allocated(p_data)) { - QNN_LOG_WARN("rpc memory already allocated\n"); - return 3; - } +public: + std::map> _qnn_graph_map; - if (is_rpcmem_registered((QNN_VER_PTR(*p_tensor)->memHandle))) { - QNN_LOG_WARN("tensor %s has been registered shared memory\n", - (QNN_VER_PTR(*p_tensor)->name)); - return 4; - } +private: + int load_system() { + Qnn_ErrorHandle_t error = QNN_SUCCESS; - int32_t mem_fd = rpcmem_to_fd(p_data); - if (-1 == mem_fd) { - QNN_LOG_WARN("failed to get file descriptor\n"); - return 5; - } - QNN_LOG_INFO("mem_fd %d\n", mem_fd); - Qnn_MemDescriptor_t descriptor = { {QNN_VER_PTR(*p_tensor)->rank, - QNN_VER_PTR(*p_tensor)->dimensions, - nullptr}, - QNN_VER_PTR(*p_tensor)->dataType, - QNN_MEM_TYPE_ION, - {{mem_fd}} }; - Qnn_MemHandle_t handle = nullptr; - int error = QNN_SUCCESS; - error = _qnn_interface.qnn_mem_register(_qnn_context_handle, &descriptor, - /*numDescriptors=*/1, &handle); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to register shared memory, error %d, %s\n", - QNN_GET_ERROR_CODE(error), strerror(error)); - return 6; - } - else { - QNN_LOG_INFO("tensor %s successfully register shared memory\n", - (QNN_VER_PTR(*p_tensor)->name)); - } - QNN_VER_PTR(*p_tensor)->memHandle = handle; - _qnn_mem_set.insert((std::pair(p_data, handle))); + std::string system_lib_path = _lib_path + "libQnnSystem.so"; + QNN_LOG_DEBUG("system_lib_path:%s\n", system_lib_path.c_str()); - return 0; + _system_lib_handle = dlopen(system_lib_path.c_str(), RTLD_NOW | RTLD_LOCAL); + if (nullptr == _system_lib_handle) { + QNN_LOG_WARN("can not open QNN library %s, error: %s\n", system_lib_path.c_str(), dlerror()); + return 1; } - void* get_rpcmem_from_memhandle(Qnn_MemHandle_t mem_handle) { - for (std::unordered_map::iterator it = _qnn_mem_set.begin(); - it != _qnn_mem_set.end(); - it++) { - Qnn_MemHandle_t mem_handle = it->second; - if (it->second == mem_handle) { - return it->first; - } - } - QNN_LOG_WARN("can't find rpcmem from qnn mem handle %p", mem_handle); - return nullptr; + auto *get_providers = reinterpret_cast( + dlsym(_system_lib_handle, "QnnSystemInterface_getProviders")); + if (nullptr == get_providers) { + QNN_LOG_WARN("can not load QNN symbol QnnSystemInterface_getProviders: %s\n", dlerror()); + return 2; } - void unregister_rpcmem() { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - - if (_qnn_mem_set.empty()) { - QNN_LOG_WARN("no rpcmem registered\n"); - } - - for (std::unordered_map::iterator it = _qnn_mem_set.begin(); - it != _qnn_mem_set.end(); - it++) { - Qnn_MemHandle_t mem_handle = it->second; - error = _qnn_interface.qnn_mem_de_register(&mem_handle, 1); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to unregister shared memory, error %d\n", - QNN_GET_ERROR_CODE(error)); - } - } - _qnn_mem_set.clear(); + uint32_t num_providers = 0; + const QnnSystemInterface_t **provider_list = nullptr; + error = get_providers(&provider_list, &num_providers); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to get providers, error %d\n", QNN_GET_ERROR_CODE(error)); + return 3; } - bool is_rpcmem_allocated(void* buf) { - return _qnn_mem_set.count(buf) != 0U; + if (num_providers != _required_num_providers) { + QNN_LOG_WARN("providers is %d instead of required %d\n", num_providers, _required_num_providers); + return 4; } - const qnn::qcom_socinfo& get_soc_info() { return _soc_info; } - - public: - std::map> _qnn_graph_map; - - private: - int load_system() { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - - std::string 
system_lib_path = _lib_path + "libQnnSystem.so"; - QNN_LOG_DEBUG("system_lib_path:%s\n", system_lib_path.c_str()); + if (nullptr == provider_list) { + QNN_LOG_WARN("can not get providers\n"); + return 5; + } - _system_lib_handle = dlopen(system_lib_path.c_str(), RTLD_NOW | RTLD_LOCAL); - if (nullptr == _system_lib_handle) { - QNN_LOG_WARN("can not open QNN library %s, error: %s\n", - system_lib_path.c_str(), dlerror()); - return 1; - } - - auto* get_providers = - reinterpret_cast( - dlsym(_system_lib_handle, "QnnSystemInterface_getProviders")); - if (nullptr == get_providers) { - QNN_LOG_WARN( - "can not load QNN symbol QnnSystemInterface_getProviders: %s\n", - dlerror()); - return 2; + QNN_SYSTEM_INTERFACE_VER_TYPE qnn_system_interface; + bool found_valid_system_interface = false; + for (size_t idx = 0; idx < num_providers; idx++) { + if (QNN_SYSTEM_API_VERSION_MAJOR == provider_list[idx]->systemApiVersion.major && + QNN_SYSTEM_API_VERSION_MINOR <= provider_list[idx]->systemApiVersion.minor) { + found_valid_system_interface = true; + qnn_system_interface = provider_list[idx]->QNN_SYSTEM_INTERFACE_VER_NAME; + break; } + } + if (!found_valid_system_interface) { + QNN_LOG_WARN("unable to find a valid qnn system interface\n"); + return 6; + } else { + QNN_LOG_INFO("find a valid qnn system interface\n"); + } + set_qnn_raw_system_interface(qnn_system_interface); - uint32_t num_providers = 0; - const QnnSystemInterface_t** provider_list = nullptr; - error = get_providers(&provider_list, &num_providers); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to get providers, error %d\n", - QNN_GET_ERROR_CODE(error)); - return 3; - } + _qnn_interface.set_qnn_system_interface(provider_list[0]); - if (num_providers != _required_num_providers) { - QNN_LOG_WARN("providers is %d instead of required %d\n", num_providers, - _required_num_providers); - return 4; - } + _qnn_interface.qnn_system_context_create(&_qnn_system_handle); + if (nullptr == _qnn_system_handle) { + QNN_LOG_WARN("can not create QNN system contenxt\n"); + } else { + QNN_LOG_INFO("initialize qnn system successfully\n"); + } - if (nullptr == provider_list) { - QNN_LOG_WARN("can not get providers\n"); - return 5; - } + return 0; + } - QNN_SYSTEM_INTERFACE_VER_TYPE qnn_system_interface; - bool found_valid_system_interface = false; - for (size_t idx = 0; idx < num_providers; idx++) { - if (QNN_SYSTEM_API_VERSION_MAJOR == - provider_list[idx]->systemApiVersion.major && - QNN_SYSTEM_API_VERSION_MINOR <= - provider_list[idx]->systemApiVersion.minor) { - found_valid_system_interface = true; - qnn_system_interface = - provider_list[idx]->QNN_SYSTEM_INTERFACE_VER_NAME; - break; - } - } - if (!found_valid_system_interface) { - QNN_LOG_WARN("unable to find a valid qnn system interface\n"); - return 6; - } - else { - QNN_LOG_INFO("find a valid qnn system interface\n"); - } - set_qnn_raw_system_interface(qnn_system_interface); + int unload_system() { + int result = 0; - _qnn_interface.set_qnn_system_interface(provider_list[0]); + if (nullptr == _system_lib_handle) { + QNN_LOG_WARN("system lib handle is null\n"); + return 1; + } - _qnn_interface.qnn_system_context_create(&_qnn_system_handle); - if (nullptr == _qnn_system_handle) { - QNN_LOG_WARN("can not create QNN system contenxt\n"); - } - else { - QNN_LOG_INFO("initialize qnn system successfully\n"); + if (nullptr != _qnn_system_handle) { + result = _qnn_interface.qnn_system_context_free(_qnn_system_handle); + if (result != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN system context\n"); } - - 
return 0; + _qnn_system_handle = nullptr; } - int unload_system() { - int result = 0; - - if (nullptr == _system_lib_handle) { - QNN_LOG_WARN("system lib handle is null\n"); - return 1; - } + int dlclose_error = dlclose(_system_lib_handle); + if (dlclose_error != 0) { + QNN_LOG_WARN("failed to close QnnSystem library, error %s\n", dlerror()); + return 2; + } - if (nullptr != _qnn_system_handle) { - result = _qnn_interface.qnn_system_context_free(_qnn_system_handle); - if (result != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN system context\n"); - } - _qnn_system_handle = nullptr; - } + _system_lib_handle = nullptr; - int dlclose_error = dlclose(_system_lib_handle); - if (dlclose_error != 0) { - QNN_LOG_WARN("failed to close QnnSystem library, error %s\n", dlerror()); - return 2; - } + return result; + } - _system_lib_handle = nullptr; + int load_backend(std::string &lib_path, const QnnSaver_Config_t **saver_config) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + QNN_LOG_DEBUG("lib_path:%s\n", lib_path.c_str()); - return result; + void *lib_handle = dlopen(lib_path.c_str(), RTLD_NOW | RTLD_GLOBAL); + if (nullptr == lib_handle) { + QNN_LOG_WARN("can not open QNN library %s, with error: %s", lib_path.c_str(), dlerror()); + return 1; } - int load_backend(std::string& lib_path, const QnnSaver_Config_t** saver_config) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - QNN_LOG_DEBUG("lib_path:%s\n", lib_path.c_str()); + auto get_providers = qnn::load_qnn_functionpointers( + lib_handle, "QnnInterface_getProviders"); + if (nullptr == get_providers) { + QNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s", dlerror()); + return 2; + } - void* lib_handle = dlopen(lib_path.c_str(), RTLD_NOW | RTLD_GLOBAL); - if (nullptr == lib_handle) { - QNN_LOG_WARN("can not open QNN library %s, with error: %s", lib_path.c_str(), dlerror()); - return 1; - } + std::uint32_t num_providers = 0; + const QnnInterface_t **provider_list = nullptr; + error = get_providers(&provider_list, &num_providers); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to get providers, error %d", QNN_GET_ERROR_CODE(error)); + return 3; + } + QNN_LOG_DEBUG("num_providers=%d\n", num_providers); + if (num_providers != _required_num_providers) { + QNN_LOG_WARN("providers is %d instead of required %d", num_providers, _required_num_providers); + return 4; + } - auto get_providers = - qnn::load_qnn_functionpointers( - lib_handle, "QnnInterface_getProviders"); - if (nullptr == get_providers) { - QNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s", dlerror()); - return 2; + if (nullptr == provider_list) { + QNN_LOG_WARN("failed to get qnn interface providers\n"); + return 5; + } + bool found_valid_interface = false; + QNN_INTERFACE_VER_TYPE qnn_interface; + for (size_t idx = 0; idx < num_providers; idx++) { + if (QNN_API_VERSION_MAJOR == provider_list[idx]->apiVersion.coreApiVersion.major && + QNN_API_VERSION_MINOR <= provider_list[idx]->apiVersion.coreApiVersion.minor) { + found_valid_interface = true; + qnn_interface = provider_list[idx]->QNN_INTERFACE_VER_NAME; + break; } + } - std::uint32_t num_providers = 0; - const QnnInterface_t** provider_list = nullptr; - error = get_providers(&provider_list, &num_providers); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to get providers, error %d", QNN_GET_ERROR_CODE(error)); - return 3; - } - QNN_LOG_DEBUG("num_providers=%d\n", num_providers); - if (num_providers != _required_num_providers) { - QNN_LOG_WARN("providers is %d instead of required %d", num_providers, 
_required_num_providers); - return 4; - } + if (!found_valid_interface) { + QNN_LOG_WARN("unable to find a valid qnn interface\n"); + return 6; + } else { + QNN_LOG_INFO("find a valid qnn interface\n"); + } + set_qnn_raw_interface(qnn_interface); - if (nullptr == provider_list) { - QNN_LOG_WARN("failed to get qnn interface providers\n"); - return 5; - } - bool found_valid_interface = false; - QNN_INTERFACE_VER_TYPE qnn_interface; - for (size_t idx = 0; idx < num_providers; idx++) { - if (QNN_API_VERSION_MAJOR == - provider_list[idx]->apiVersion.coreApiVersion.major && - QNN_API_VERSION_MINOR <= - provider_list[idx]->apiVersion.coreApiVersion.minor) { - found_valid_interface = true; - qnn_interface = provider_list[idx]->QNN_INTERFACE_VER_NAME; - break; - } + BackendIdType backend_id = provider_list[0]->backendId; + _lib_path_to_backend_id[lib_path] = backend_id; + if (_loaded_backend.count(backend_id) > 0) { + QNN_LOG_WARN("lib_path %s is loaded, but backend %d already exists\n", lib_path.c_str(), backend_id); + } + _loaded_backend[backend_id] = provider_list[0]; + if (_loaded_lib_handle.count(backend_id) > 0) { + QNN_LOG_WARN("closing %p\n", _loaded_lib_handle[backend_id]); + int dlclose_error = dlclose(_loaded_lib_handle[backend_id]); + if (dlclose_error != 0) { + QNN_LOG_WARN("fail to close %p with error %s\n", _loaded_lib_handle[backend_id], dlerror()); } + } + _loaded_lib_handle[backend_id] = lib_handle; + _backend_id = backend_id; - if (!found_valid_interface) { - QNN_LOG_WARN("unable to find a valid qnn interface\n"); - return 6; - } - else { - QNN_LOG_INFO("find a valid qnn interface\n"); - } - set_qnn_raw_interface(qnn_interface); + return 0; + } - BackendIdType backend_id = provider_list[0]->backendId; - _lib_path_to_backend_id[lib_path] = backend_id; - if (_loaded_backend.count(backend_id) > 0) { - QNN_LOG_WARN("lib_path %s is loaded, but backend %d already exists\n", lib_path.c_str(), backend_id); - } - _loaded_backend[backend_id] = provider_list[0]; - if (_loaded_lib_handle.count(backend_id) > 0) { - QNN_LOG_WARN("closing %p\n", _loaded_lib_handle[backend_id]); - int dlclose_error = dlclose(_loaded_lib_handle[backend_id]); - if (dlclose_error != 0) { - QNN_LOG_WARN("fail to close %p with error %s\n", _loaded_lib_handle[backend_id], dlerror()); - } + int unload_backend() { + int dlclose_error = 0; + for (auto &it : _loaded_lib_handle) { + dlclose_error = dlclose(it.second); + if (dlclose_error != 0) { + QNN_LOG_WARN("failed to close QNN backend %d, error %s\n", it.first, dlerror()); } - _loaded_lib_handle[backend_id] = lib_handle; - _backend_id = backend_id; - - return 0; } - int unload_backend() { - int dlclose_error = 0; - for (auto& it : _loaded_lib_handle) { - dlclose_error = dlclose(it.second); - if (dlclose_error != 0) { - QNN_LOG_WARN("failed to close QNN backend %d, error %s\n", it.first, dlerror()); - } - } + _loaded_lib_handle.clear(); + _lib_path_to_backend_id.clear(); + _loaded_backend.clear(); - _loaded_lib_handle.clear(); - _lib_path_to_backend_id.clear(); - _loaded_backend.clear(); - - return 0; - } + return 0; + } - void set_qnn_raw_interface(QNN_INTERFACE_VER_TYPE& raw_interface) { - _qnn_raw_interface = raw_interface; - } + void set_qnn_raw_interface(QNN_INTERFACE_VER_TYPE &raw_interface) { _qnn_raw_interface = raw_interface; } - void set_qnn_raw_system_interface(QNN_SYSTEM_INTERFACE_VER_TYPE& raw_interface) { - _qnn_raw_system_interface = raw_interface; - } + void set_qnn_raw_system_interface(QNN_SYSTEM_INTERFACE_VER_TYPE &raw_interface) { + 
_qnn_raw_system_interface = raw_interface; + } - private: - static constexpr const int _required_num_providers = 1; +private: + static constexpr const int _required_num_providers = 1; - std::string _lib_path; - std::string _backend_name; - std::string _model_name; // Qualcomm's dedicated prebuilt model name, keep it for further usage - BackendIdType _backend_id; + std::string _lib_path; + std::string _backend_name; + std::string _model_name; // Qualcomm's dedicated prebuilt model name, keep it for further usage + BackendIdType _backend_id; - bool _debug_tensor = false; - bool _do_node_validations = true; + bool _debug_tensor = false; + bool _do_node_validations = true; - QnnLog_Level_t _qnn_log_level = QNN_LOG_LEVEL_DEBUG; + QnnLog_Level_t _qnn_log_level = QNN_LOG_LEVEL_DEBUG; - qnn::sdk_profile_level _profile_level = qnn::sdk_profile_level::profile_detail; + qnn::sdk_profile_level _profile_level = qnn::sdk_profile_level::profile_detail; - qnn_interface _qnn_interface; + qnn_interface _qnn_interface; - void* _system_lib_handle = nullptr; + void *_system_lib_handle = nullptr; - Qnn_GraphHandle_t _qnn_graph_handle = nullptr; + Qnn_GraphHandle_t _qnn_graph_handle = nullptr; - Qnn_LogHandle_t _qnn_log_handle = nullptr; + Qnn_LogHandle_t _qnn_log_handle = nullptr; - Qnn_ProfileHandle_t _qnn_profile_handle = nullptr; + Qnn_ProfileHandle_t _qnn_profile_handle = nullptr; - Qnn_DeviceHandle_t _qnn_device_handle = nullptr; + Qnn_DeviceHandle_t _qnn_device_handle = nullptr; - Qnn_BackendHandle_t _qnn_backend_handle = nullptr; + Qnn_BackendHandle_t _qnn_backend_handle = nullptr; - Qnn_ContextHandle_t _qnn_context_handle = nullptr; + Qnn_ContextHandle_t _qnn_context_handle = nullptr; - QnnSystemContext_Handle_t _qnn_system_handle = nullptr; + QnnSystemContext_Handle_t _qnn_system_handle = nullptr; - QnnHtpDevice_PerfInfrastructure_t* _qnn_htp_perfinfra = nullptr; - uint32_t _qnn_power_configid = 1; + QnnHtpDevice_PerfInfrastructure_t *_qnn_htp_perfinfra = nullptr; + uint32_t _qnn_power_configid = 1; - QNN_INTERFACE_VER_TYPE _qnn_raw_interface; - QNN_SYSTEM_INTERFACE_VER_TYPE _qnn_raw_system_interface; + QNN_INTERFACE_VER_TYPE _qnn_raw_interface; + QNN_SYSTEM_INTERFACE_VER_TYPE _qnn_raw_system_interface; - std::unordered_map _qnn_mem_set; + std::unordered_map _qnn_mem_set; - std::mutex _init_mutex; - std::unordered_map _loaded_lib_handle; - std::unordered_map _lib_path_to_backend_id; - std::unordered_map _loaded_backend; + std::mutex _init_mutex; + std::unordered_map _loaded_lib_handle; + std::unordered_map _lib_path_to_backend_id; + std::unordered_map _loaded_backend; - void* _rpc_lib_handle = nullptr; - std::atomic_bool _rpcmem_initialized{ false }; - qnn::pfn_rpc_mem_alloc _pfn_rpc_mem_alloc; - qnn::pfn_rpc_mem_free _pfn_rpc_mem_free; - qnn::pfn_rpc_mem_to_fd _pfn_rpc_mem_to_fd; - qnn::pfn_rpc_mem_init _pfn_rpc_mem_init; - qnn::pfn_rpc_mem_deinit _pfn_rpc_mem_deinit; - std::unordered_map _rpcmem_store_map; - size_t _rpcmem_capacity = 512; + void *_rpc_lib_handle = nullptr; + std::atomic_bool _rpcmem_initialized{ false }; + qnn::pfn_rpc_mem_alloc _pfn_rpc_mem_alloc; + qnn::pfn_rpc_mem_free _pfn_rpc_mem_free; + qnn::pfn_rpc_mem_to_fd _pfn_rpc_mem_to_fd; + qnn::pfn_rpc_mem_init _pfn_rpc_mem_init; + qnn::pfn_rpc_mem_deinit _pfn_rpc_mem_deinit; + std::unordered_map _rpcmem_store_map; + size_t _rpcmem_capacity = 512; - std::string _graph_name; + std::string _graph_name; - qnn::qcom_socinfo _soc_info = {}; - }; + qnn::qcom_socinfo _soc_info = {}; +}; -} +} // namespace qnn diff --git 
a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp index de0d1dc2dbbef..0ec75c03f0e53 100644 --- a/ggml/src/ggml-qnn/tensor.hpp +++ b/ggml/src/ggml-qnn/tensor.hpp @@ -1,146 +1,127 @@ #pragma once +#include "ggml-qnn.h" + #include "QnnTensor.h" #include "System/QnnSystemInterface.h" - -#include "ggml-qnn.h" #include "backend.hpp" #include "qnn.hpp" namespace qnn { - template class ggml_qnn_tensor_readwrite { - public: - ggml_qnn_tensor_readwrite(const ggml_tensor* tensor, - Qnn_GraphHandle_t graph_handle, - ggml_backend_qnn_context* ctx) - : _tensor(tensor), - _qnn_tensor(reinterpret_cast(tensor->extra)), - _context(ctx) { - _old_dimensions = QNN_VER_PTR(*_qnn_tensor)->dimensions; - const auto qnn_data_type = datatype_from_ggml_datatype(tensor->type); - const bool is_npu = ctx->device == QNN_BACKEND_NPU; - QNN_VER_PTR(*_qnn_tensor)->type = _tensorType; - if (is_npu) { - QNN_VER_PTR(*_qnn_tensor)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; - QNN_VER_PTR(*_qnn_tensor)->clientBuf = { .data = nullptr, .dataSize = 0 }; - } +template +class ggml_qnn_tensor_readwrite { +public: + explicit ggml_qnn_tensor_readwrite(const ggml_tensor *tensor, Qnn_GraphHandle_t graph_handle, + ggml_backend_qnn_context *ctx) : + _tensor(tensor), _qnn_tensor(reinterpret_cast(tensor->extra)), _context(ctx) { + _old_dimensions = QNN_VER_PTR(*_qnn_tensor)->dimensions; + const auto qnn_data_type = datatype_from_ggml_datatype(tensor->type); + const bool is_npu = ctx->device == QNN_BACKEND_NPU; + QNN_VER_PTR(*_qnn_tensor)->type = _tensorType; + if (is_npu) { + QNN_VER_PTR(*_qnn_tensor)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; + QNN_VER_PTR(*_qnn_tensor)->clientBuf = { .data = nullptr, .dataSize = 0 }; + } + + auto err = ctx->raw_interface.tensorCreateGraphTensor(graph_handle, _qnn_tensor); + if (err != QNN_SUCCESS) { + QNN_LOG_INFO("error = %d\n", err); + QNN_LOG_DEBUG("tensor%p name %s", _qnn_tensor, QNN_TENSOR_GET_NAME(*_qnn_tensor)); + _context = nullptr; + return; + } - auto err = - ctx->raw_interface.tensorCreateGraphTensor(graph_handle, _qnn_tensor); - if (err != QNN_SUCCESS) { - QNN_LOG_INFO("error = %d\n", err); - QNN_LOG_DEBUG("tensor%p name %s", _qnn_tensor, - QNN_TENSOR_GET_NAME(*_qnn_tensor)); + _dimensions[0] = (uint32_t)tensor->ne[0]; + _dimensions[1] = (uint32_t)tensor->ne[1]; + _dimensions[2] = (uint32_t)tensor->ne[2]; + _dimensions[3] = (uint32_t)tensor->ne[3]; + QNN_VER_PTR(*_qnn_tensor)->dimensions = _dimensions; + QNN_VER_PTR(*_qnn_tensor)->rank = qnn::get_ggml_tensor_rank(tensor); + QNN_VER_PTR(*_qnn_tensor)->dataType = qnn_data_type; + + if (is_npu) { + auto *instance = ctx->instance; + uint8_t *qnn_buffer = static_cast(instance->alloc_rpcmem(ggml_nbytes(tensor), alignof(void *))); + if (!qnn_buffer) { + QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); + QNN_LOG_DEBUG("tensor%p name %s", _qnn_tensor, QNN_TENSOR_GET_NAME(*_qnn_tensor)); _context = nullptr; + // No free for _qnn_tensor, because it's not registered. 
return; + } else { + QNN_LOG_INFO("alloc rpcmem successfully\n"); } - _dimensions[0] = (uint32_t)tensor->ne[0]; - _dimensions[1] = (uint32_t)tensor->ne[1]; - _dimensions[2] = (uint32_t)tensor->ne[2]; - _dimensions[3] = (uint32_t)tensor->ne[3]; - QNN_VER_PTR(*_qnn_tensor)->dimensions = _dimensions; - QNN_VER_PTR(*_qnn_tensor)->rank = qnn::get_ggml_tensor_rank(tensor); - QNN_VER_PTR(*_qnn_tensor)->dataType = qnn_data_type; - - if (is_npu) { - auto* instance = ctx->instance; - uint8_t* qnn_buffer = static_cast( - instance->alloc_rpcmem(ggml_nbytes(tensor), alignof(void*))); - if (!qnn_buffer) { - QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); - QNN_LOG_DEBUG("tensor%p name %s", _qnn_tensor, - QNN_TENSOR_GET_NAME(*_qnn_tensor)); - _context = nullptr; - // No free for _qnn_tensor, because it's not registered. - return; - } - else { - QNN_LOG_INFO("alloc rpcmem successfully\n"); - } - - instance->register_rpcmem(qnn_buffer, _qnn_tensor); - if (_tensorType == QNN_TENSOR_TYPE_APP_WRITE || - _tensorType == QNN_TENSOR_TYPE_APP_READWRITE) { - memcpy(qnn_buffer, tensor->data, ggml_nbytes(tensor)); - } - } - else { - QNN_VER_PTR(*_qnn_tensor)->clientBuf = { - tensor->data, get_ggml_tensor_data_size(tensor) }; + instance->register_rpcmem(qnn_buffer, _qnn_tensor); + if (_tensorType == QNN_TENSOR_TYPE_APP_WRITE || _tensorType == QNN_TENSOR_TYPE_APP_READWRITE) { + memcpy(qnn_buffer, tensor->data, ggml_nbytes(tensor)); } + } else { + QNN_VER_PTR(*_qnn_tensor)->clientBuf = { tensor->data, get_ggml_tensor_data_size(tensor) }; } - - ggml_qnn_tensor_readwrite(const ggml_tensor* tensor, Qnn_Tensor_t* qnn_tensor, - ggml_backend_qnn_context* ctx) - : _tensor(tensor), _qnn_tensor(qnn_tensor), _context(ctx) { - _old_dimensions = QNN_VER_PTR(*_qnn_tensor)->dimensions; - const auto qnn_data_type = qnn::datatype_from_ggml_datatype(tensor->type); - const bool is_npu = ctx->device == QNN_BACKEND_NPU; - - _dimensions[0] = (uint32_t)tensor->ne[0]; - _dimensions[1] = (uint32_t)tensor->ne[1]; - _dimensions[2] = (uint32_t)tensor->ne[2]; - _dimensions[3] = (uint32_t)tensor->ne[3]; - QNN_VER_PTR(*_qnn_tensor)->dimensions = _dimensions; - QNN_VER_PTR(*_qnn_tensor)->rank = get_ggml_tensor_rank(tensor); - QNN_VER_PTR(*_qnn_tensor)->dataType = qnn_data_type; - - if (is_npu) { - uint8_t* qnn_buffer = - static_cast(ctx->instance->get_rpcmem_from_memhandle( - QNN_VER_PTR(*_qnn_tensor)->memHandle)); - if (qnn_buffer) { - memcpy(qnn_buffer, tensor->data, ggml_nbytes(tensor)); - } - else { - QNN_LOG_WARN("can't find rpcmem from qnn mem handle\n"); - QNN_LOG_DEBUG("tensor%p name %s", _qnn_tensor, - QNN_TENSOR_GET_NAME(*_qnn_tensor)); - _context = nullptr; - return; - } - } - else { - QNN_VER_PTR(*_qnn_tensor)->clientBuf = { - tensor->data, get_ggml_tensor_data_size(tensor) }; + } + + explicit ggml_qnn_tensor_readwrite(const ggml_tensor *tensor, Qnn_Tensor_t *qnn_tensor, + ggml_backend_qnn_context *ctx) : + _tensor(tensor), _qnn_tensor(qnn_tensor), _context(ctx) { + _old_dimensions = QNN_VER_PTR(*_qnn_tensor)->dimensions; + const auto qnn_data_type = qnn::datatype_from_ggml_datatype(tensor->type); + const bool is_npu = ctx->device == QNN_BACKEND_NPU; + + _dimensions[0] = (uint32_t)tensor->ne[0]; + _dimensions[1] = (uint32_t)tensor->ne[1]; + _dimensions[2] = (uint32_t)tensor->ne[2]; + _dimensions[3] = (uint32_t)tensor->ne[3]; + QNN_VER_PTR(*_qnn_tensor)->dimensions = _dimensions; + QNN_VER_PTR(*_qnn_tensor)->rank = get_ggml_tensor_rank(tensor); + QNN_VER_PTR(*_qnn_tensor)->dataType = qnn_data_type; + + if (is_npu) { + uint8_t 
*qnn_buffer = + static_cast(ctx->instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*_qnn_tensor)->memHandle)); + if (qnn_buffer) { + memcpy(qnn_buffer, tensor->data, ggml_nbytes(tensor)); + } else { + QNN_LOG_WARN("can't find rpcmem from qnn mem handle\n"); + QNN_LOG_DEBUG("tensor%p name %s", _qnn_tensor, QNN_TENSOR_GET_NAME(*_qnn_tensor)); + _context = nullptr; + return; } + } else { + QNN_VER_PTR(*_qnn_tensor)->clientBuf = { tensor->data, get_ggml_tensor_data_size(tensor) }; + } + } + + ~ggml_qnn_tensor_readwrite() { + if ((_tensorType == QNN_TENSOR_TYPE_APP_READWRITE || _tensorType == QNN_TENSOR_TYPE_APP_READ) && _context && + _context->device == QNN_BACKEND_NPU) { + uint8_t *qnn_buffer = static_cast( + _context->instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*_qnn_tensor)->memHandle)); + memcpy(_tensor->data, qnn_buffer, ggml_nbytes(_tensor)); } - ~ggml_qnn_tensor_readwrite() { - if ((_tensorType == QNN_TENSOR_TYPE_APP_READWRITE || - _tensorType == QNN_TENSOR_TYPE_APP_READ) && - _context && _context->device == QNN_BACKEND_NPU) { - uint8_t* qnn_buffer = - static_cast(_context->instance->get_rpcmem_from_memhandle( - QNN_VER_PTR(*_qnn_tensor)->memHandle)); - memcpy(_tensor->data, qnn_buffer, ggml_nbytes(_tensor)); - } + QNN_VER_PTR(*_qnn_tensor)->dimensions = _old_dimensions; + } - QNN_VER_PTR(*_qnn_tensor)->dimensions = _old_dimensions; - } + bool is_valid() const { return _context; } + Qnn_Tensor_t *get_qnn_tensor() const { return _qnn_tensor; } + +private: + const ggml_tensor *_tensor; + Qnn_Tensor_t *_qnn_tensor; + ggml_backend_qnn_context *_context; + uint32_t *_old_dimensions; + uint32_t _dimensions[4] = {}; + + ggml_qnn_tensor_readwrite(const ggml_qnn_tensor_readwrite &) = delete; + void operator=(const ggml_qnn_tensor_readwrite &) = delete; + ggml_qnn_tensor_readwrite(ggml_qnn_tensor_readwrite &&) = delete; + void operator=(ggml_qnn_tensor_readwrite &&) = delete; +}; - bool is_valid() const { return _context; } - Qnn_Tensor_t* get_qnn_tensor() const { return _qnn_tensor; } - - private: - const ggml_tensor* _tensor; - Qnn_Tensor_t* _qnn_tensor; - ggml_backend_qnn_context* _context; - uint32_t* _old_dimensions; - uint32_t _dimensions[4] = {}; - - ggml_qnn_tensor_readwrite(const ggml_qnn_tensor_readwrite&) = delete; - void operator=(const ggml_qnn_tensor_readwrite&) = delete; - ggml_qnn_tensor_readwrite(ggml_qnn_tensor_readwrite&&) = delete; - void operator=(ggml_qnn_tensor_readwrite&&) = delete; - }; - - using ggml_qnn_tensor_output = - ggml_qnn_tensor_readwrite; - using ggml_qnn_tensor_input = - ggml_qnn_tensor_readwrite; +using ggml_qnn_tensor_output = ggml_qnn_tensor_readwrite; +using ggml_qnn_tensor_input = ggml_qnn_tensor_readwrite; } // namespace qnn diff --git a/ggml/src/ggml-qnn/utils.cpp b/ggml/src/ggml-qnn/utils.cpp index 798445c02fd76..2368b466c8187 100644 --- a/ggml/src/ggml-qnn/utils.cpp +++ b/ggml/src/ggml-qnn/utils.cpp @@ -2,14 +2,15 @@ #include "utils.hpp" #include "ggml-qnn.h" + #include "qnn-types.hpp" namespace qnn { - // TODO: mapping more ggml data type to QNN data type - // ref:explanation of k-quants, https://github.com/ggerganov/llama.cpp/pull/1684 - Qnn_DataType_t datatype_from_ggml_datatype(enum ggml_type ggmltype) { - switch (ggmltype) { +// TODO: mapping more ggml data type to QNN data type +// ref:explanation of k-quants, https://github.com/ggerganov/llama.cpp/pull/1684 +Qnn_DataType_t datatype_from_ggml_datatype(enum ggml_type ggmltype) { + switch (ggmltype) { case GGML_TYPE_F16: return QNN_DATATYPE_FLOAT_16; case GGML_TYPE_F32: @@ -22,24 +23,22 @@ 
namespace qnn { return QNN_DATATYPE_SFIXED_POINT_4; default: break; - } - return QNN_DATATYPE_UNDEFINED; } + return QNN_DATATYPE_UNDEFINED; +} - - uint32_t get_ggml_tensor_rank(const ggml_tensor* tensor) { - uint32_t rank = 0; - for (int i = 0; i < GGML_MAX_DIMS; i++) { - if ((0 != tensor->ne[i]) && (1 != tensor->ne[i])) { - rank++; - } +uint32_t get_ggml_tensor_rank(const ggml_tensor *tensor) { + uint32_t rank = 0; + for (int i = 0; i < GGML_MAX_DIMS; i++) { + if ((0 != tensor->ne[i]) && (1 != tensor->ne[i])) { + rank++; } - return rank; } + return rank; +} - - const char* get_backend_name(int n_backend_type) { - switch (n_backend_type) { +const char *get_backend_name(int n_backend_type) { + switch (n_backend_type) { case QNN_BACKEND_CPU: return "QNN-CPU"; case QNN_BACKEND_GPU: @@ -50,11 +49,11 @@ namespace qnn { return "ggml"; //"fake" QNN backend, used for compare performance between QNN backend and original GGML default: return "unknown"; - } } +} - const char* get_chipset_desc(uint32_t chipset_id) { - switch (chipset_id) { +const char *get_chipset_desc(uint32_t chipset_id) { + switch (chipset_id) { case SM8450: return "SM8450"; case SM8475: @@ -65,11 +64,11 @@ namespace qnn { return "SM8650"; default: return "unknown"; - } } +} - const char* get_htparch_desc(size_t htp_arch) { - switch (htp_arch) { +const char *get_htparch_desc(size_t htp_arch) { + switch (htp_arch) { case V68: return "QCOM_HTP_V68"; case V69: @@ -80,37 +79,36 @@ namespace qnn { return "QCOM_HTP_V75"; default: return "unknown"; - } - } - - intptr_t align_to(size_t alignment, intptr_t offset) { - return offset % alignment == 0 - ? offset - : offset + (static_cast(alignment) - - offset % static_cast(alignment)); } +} - uint32_t get_ggml_tensor_data_size(const ggml_tensor* tensor) { - /* - size_t data_size = ggml_row_size(tensor->type, tensor->ne[0]); - size_t n_dims = qnn_get_ggml_tensor_rank(tensor); - for (int i = 1; i < n_dims; i++) { - data_size *= tensor->ne[i]; - } +intptr_t align_to(size_t alignment, intptr_t offset) { + return offset % alignment == 0 + ? 
offset + : offset + (static_cast(alignment) - offset % static_cast(alignment)); +} - return data_size; - */ - return ggml_nbytes(tensor); +uint32_t get_ggml_tensor_data_size(const ggml_tensor *tensor) { + /* + size_t data_size = ggml_row_size(tensor->type, tensor->ne[0]); + size_t n_dims = qnn_get_ggml_tensor_rank(tensor); + for (int i = 1; i < n_dims; i++) { + data_size *= tensor->ne[i]; } - // ================================================================================================= - // - // QNN backend internal helper functions - // - // ================================================================================================= - // TODO: only support GGML_OP_ADD/GGML_OP_MUL/GGML_OP_MUL_MAT - const char* opname_from_ggmlop(enum ggml_op ggmlop) { - switch (ggmlop) { + return data_size; + */ + return ggml_nbytes(tensor); +} + +// ================================================================================================= +// +// QNN backend internal helper functions +// +// ================================================================================================= +// TODO: only support GGML_OP_ADD/GGML_OP_MUL/GGML_OP_MUL_MAT +const char *opname_from_ggmlop(enum ggml_op ggmlop) { + switch (ggmlop) { case GGML_OP_ADD: return QNN_OP_ELEMENT_WISE_ADD; case GGML_OP_MUL: @@ -119,8 +117,8 @@ namespace qnn { return QNN_OP_MAT_MUL; default: break; - } - return nullptr; } - + return nullptr; } + +} // namespace qnn diff --git a/ggml/src/ggml-qnn/utils.hpp b/ggml/src/ggml-qnn/utils.hpp index 4889c6dc8601c..673fb90e63de9 100644 --- a/ggml/src/ggml-qnn/utils.hpp +++ b/ggml/src/ggml-qnn/utils.hpp @@ -1,246 +1,239 @@ #pragma once -#include -#include -#include #include #include -#include +#include +#include +#include -#include "QnnTypes.h" +#include #include "ggml.h" +#include "QnnTypes.h" #include "logger.hpp" namespace qnn { - Qnn_DataType_t datatype_from_ggml_datatype(enum ggml_type ggmltype); - uint32_t get_ggml_tensor_rank(const ggml_tensor* tensor); - const char* get_backend_name(int n_backend_type); - const char* get_chipset_desc(uint32_t chipset_id); - const char* get_htparch_desc(size_t htp_arch); - intptr_t align_to(size_t alignment, intptr_t offset); - uint32_t get_ggml_tensor_data_size(const ggml_tensor* tensor); +Qnn_DataType_t datatype_from_ggml_datatype(enum ggml_type ggmltype); +uint32_t get_ggml_tensor_rank(const ggml_tensor *tensor); +const char *get_backend_name(int n_backend_type); +const char *get_chipset_desc(uint32_t chipset_id); +const char *get_htparch_desc(size_t htp_arch); +intptr_t align_to(size_t alignment, intptr_t offset); +uint32_t get_ggml_tensor_data_size(const ggml_tensor *tensor); - const char* opname_from_ggmlop(enum ggml_op ggmlop); +const char *opname_from_ggmlop(enum ggml_op ggmlop); - template Fn load_qnn_functionpointers(void* handle, const char* function_name) { - return reinterpret_cast(dlsym(handle, function_name)); - } +template +Fn load_qnn_functionpointers(void *handle, const char *function_name) { + return reinterpret_cast(dlsym(handle, function_name)); +} - inline int validate_tensor_version(Qnn_Tensor_t tensor) { - if (tensor.version != QNN_TENSOR_VERSION_1) { - QNN_LOG_WARN( - "validate_tensor_version() tensor %s, got unsupported version %d\n", - tensor.v1.name, tensor.version); - return 1; - } - return 0; +inline int validate_tensor_version(Qnn_Tensor_t tensor) { + if (tensor.version != QNN_TENSOR_VERSION_1) { + QNN_LOG_WARN("validate_tensor_version() tensor %s, got unsupported version %d\n", tensor.v1.name, + tensor.version); 
+ return 1; } + return 0; +} - inline uint32_t get_qnn_tensorid(const Qnn_Tensor_t& tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.id; - } - - return 0u; +inline uint32_t get_qnn_tensorid(const Qnn_Tensor_t &tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.id; } - inline const char* get_qnn_tensorname(const Qnn_Tensor_t& tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.name; - } - return nullptr; - } + return 0u; +} - inline Qnn_TensorType_t get_qnn_tensortype(const Qnn_Tensor_t& tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.type; - } - return QNN_TENSOR_TYPE_UNDEFINED; +inline const char *get_qnn_tensorname(const Qnn_Tensor_t &tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.name; } + return nullptr; +} - inline Qnn_TensorDataFormat_t - get_qnn_tensor_dataformat(const Qnn_Tensor_t& tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.dataFormat; - } - return QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER; +inline Qnn_TensorType_t get_qnn_tensortype(const Qnn_Tensor_t &tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.type; } + return QNN_TENSOR_TYPE_UNDEFINED; +} - inline Qnn_DataType_t - get_qnn_tensor_datatype(const Qnn_Tensor_t& tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.dataType; - } - return QNN_DATATYPE_UNDEFINED; +inline Qnn_TensorDataFormat_t get_qnn_tensor_dataformat(const Qnn_Tensor_t &tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.dataFormat; } + return QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER; +} - inline Qnn_QuantizeParams_t - get_qnn_tensor_quantparams(const Qnn_Tensor_t& tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.quantizeParams; - } - return QNN_QUANTIZE_PARAMS_INIT; +inline Qnn_DataType_t get_qnn_tensor_datatype(const Qnn_Tensor_t &tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.dataType; } + return QNN_DATATYPE_UNDEFINED; +} - inline uint32_t get_qnn_tensor_rank(const Qnn_Tensor_t& tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.rank; - } - return 0u; +inline Qnn_QuantizeParams_t get_qnn_tensor_quantparams(const Qnn_Tensor_t &tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.quantizeParams; } + return QNN_QUANTIZE_PARAMS_INIT; +} - inline uint32_t* get_qnn_tensor_dimensions(const Qnn_Tensor_t& tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.dimensions; - } - return nullptr; +inline uint32_t get_qnn_tensor_rank(const Qnn_Tensor_t &tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.rank; } + return 0u; +} - inline Qnn_TensorMemType_t get_qnn_tensor_memtype(const Qnn_Tensor_t& tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.memType; - } - return QNN_TENSORMEMTYPE_UNDEFINED; +inline uint32_t *get_qnn_tensor_dimensions(const Qnn_Tensor_t &tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.dimensions; } + return nullptr; +} - inline void set_qnn_tensor_id(Qnn_Tensor_t& tensor, uint32_t id) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.id = id; - } +inline Qnn_TensorMemType_t get_qnn_tensor_memtype(const Qnn_Tensor_t &tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.memType; } + return QNN_TENSORMEMTYPE_UNDEFINED; +} - inline void set_qnn_tensor_name(Qnn_Tensor_t& tensor, 
const char* name) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.name = name; - } +inline void set_qnn_tensor_id(Qnn_Tensor_t &tensor, uint32_t id) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.id = id; } +} - inline void set_qnn_tensor_type(Qnn_Tensor_t& tensor, Qnn_TensorType_t type) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.type = type; - } +inline void set_qnn_tensor_name(Qnn_Tensor_t &tensor, const char *name) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.name = name; } +} - inline void set_qnn_tensor_dataformat(Qnn_Tensor_t& tensor, Qnn_TensorDataFormat_t format) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.dataFormat = format; - } +inline void set_qnn_tensor_type(Qnn_Tensor_t &tensor, Qnn_TensorType_t type) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.type = type; } +} - inline void set_qnn_tensor_datatype(Qnn_Tensor_t& tensor, Qnn_DataType_t dataType) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.dataType = dataType; - } +inline void set_qnn_tensor_dataformat(Qnn_Tensor_t &tensor, Qnn_TensorDataFormat_t format) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.dataFormat = format; } +} - inline void set_qnn_tensor_quantparams(Qnn_Tensor_t& tensor, Qnn_QuantizeParams_t params) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.quantizeParams = params; - } +inline void set_qnn_tensor_datatype(Qnn_Tensor_t &tensor, Qnn_DataType_t dataType) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.dataType = dataType; } +} - inline void set_qnn_tensor_rank(Qnn_Tensor_t& tensor, uint32_t rank) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.rank = rank; - } +inline void set_qnn_tensor_quantparams(Qnn_Tensor_t &tensor, Qnn_QuantizeParams_t params) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.quantizeParams = params; } +} - inline void set_qnn_tensor_dimensions(Qnn_Tensor_t& tensor, uint32_t* dims) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.dimensions = dims; - } +inline void set_qnn_tensor_rank(Qnn_Tensor_t &tensor, uint32_t rank) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.rank = rank; } +} - inline void set_qnn_tensor_memtype(Qnn_Tensor_t& tensor, Qnn_TensorMemType_t mem_type) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.memType = mem_type; - } +inline void set_qnn_tensor_dimensions(Qnn_Tensor_t &tensor, uint32_t *dims) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.dimensions = dims; } +} - inline void set_qnn_tensor_clientbuf(Qnn_Tensor_t& tensor, Qnn_ClientBuffer_t client_buf) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.clientBuf = client_buf; - } +inline void set_qnn_tensor_memtype(Qnn_Tensor_t &tensor, Qnn_TensorMemType_t mem_type) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.memType = mem_type; } +} - inline void set_qnn_tensor_memhandle(Qnn_Tensor_t& tensor, Qnn_MemHandle_t handle) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.memHandle = handle; - } +inline void set_qnn_tensor_clientbuf(Qnn_Tensor_t &tensor, Qnn_ClientBuffer_t client_buf) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.clientBuf = client_buf; } +} +inline void set_qnn_tensor_memhandle(Qnn_Tensor_t &tensor, Qnn_MemHandle_t handle) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.memHandle = handle; + } +} #if ENABLE_QNNBACKEND_PERF - class qnn_perf { - public: - qnn_perf(const std::string& 
perf_name) : _perf_name(std::move(perf_name)) {}; - qnn_perf() = delete; - qnn_perf(const qnn_perf&) = delete; - qnn_perf& operator= (const qnn_perf&) = delete; - - void start() { - _begin_time = ggml_time_us(); - } - - void info() { - _end_time = ggml_time_us(); - _duration = (_end_time - _begin_time); - QNN_LOG_INFO("duration of %s : %lld microseconds\n", _perf_name.c_str(), _duration); - } - - private: - int64_t _begin_time = 0LL; - int64_t _end_time = 0LL; - int64_t _duration = 0LL; - std::string _perf_name; - }; +class qnn_perf { +public: + qnn_perf(const std::string &perf_name) : _perf_name(std::move(perf_name)) {}; + qnn_perf() = delete; + qnn_perf(const qnn_perf &) = delete; + qnn_perf &operator=(const qnn_perf &) = delete; + + void start() { _begin_time = ggml_time_us(); } + + void info() { + _end_time = ggml_time_us(); + _duration = (_end_time - _begin_time); + QNN_LOG_INFO("duration of %s : %lld microseconds\n", _perf_name.c_str(), _duration); + } + +private: + int64_t _begin_time = 0LL; + int64_t _end_time = 0LL; + int64_t _duration = 0LL; + std::string _perf_name; +}; #else - class qnn_perf { - public: - qnn_perf(const std::string& perf_name) {} - qnn_perf() = delete; - qnn_perf(const qnn_perf&) = delete; - qnn_perf& operator= (const qnn_perf&) = delete; - - void start() {} - void info() {} - }; +class qnn_perf { +public: + qnn_perf(const std::string &perf_name) {} + qnn_perf() = delete; + qnn_perf(const qnn_perf &) = delete; + qnn_perf &operator=(const qnn_perf &) = delete; + + void start() {} + void info() {} +}; #endif -} - +} // namespace qnn -#define VALIDATE(value, status) \ - do { \ - status = value; \ - if (status != QNN_SUCCESS) { \ - QNN_LOG_WARN("%s expected QNN_SUCCESS\n", #value); \ - return status; \ - } \ +#define VALIDATE(value, status) \ + do { \ + status = value; \ + if (status != QNN_SUCCESS) { \ + QNN_LOG_WARN("%s expected QNN_SUCCESS\n", #value); \ + return status; \ + } \ } while (0) -#define QNN_TENSOR_GET_ID(tensor) qnn::get_qnn_tensorid(tensor) -#define QNN_TENSOR_GET_NAME(tensor) qnn::get_qnn_tensorname(tensor) -#define QNN_TENSOR_GET_TYPE(tensor) qnn::get_qnn_tensortype(tensor) -#define QNN_TENSOR_GET_DATA_FORMAT(tensor) qnn::get_qnn_tensor_dataformat(tensor) -#define QNN_TENSOR_GET_DATA_TYPE(tensor) qnn::get_qnn_tensor_datatype(tensor) -#define QNN_TENSOR_GET_QUANT_PARAMS(tensor) qnn::get_qnn_tensor_quantparams(tensor) -#define QNN_TENSOR_GET_RANK(tensor) qnn::get_qnn_tensor_rank(tensor) -#define QNN_TENSOR_GET_DIMENSIONS(tensor) qnn::get_qnn_tensor_dimensions(tensor) -#define QNN_TENSOR_GET_MEM_TYPE(tensor) qnn::get_qnn_tensor_memtype(tensor) - -#define QNN_TENSOR_SET_ID(tensor, value) qnn::set_qnn_tensor_id(tensor, value) -#define QNN_TENSOR_SET_NAME(tensor, value) qnn::set_qnn_tensor_name(tensor, value) -#define QNN_TENSOR_SET_TYPE(tensor, value) qnn::set_qnn_tensor_type(tensor, value) -#define QNN_TENSOR_SET_DATA_FORMAT(tensor, value) qnn::set_qnn_tensor_dataformat(tensor, value) -#define QNN_TENSOR_SET_DATA_TYPE(tensor, value) qnn::set_qnn_tensor_datatype(tensor, value) -#define QNN_TENSOR_SET_QUANT_PARAMS(tensor, value) qnn::set_qnn_tensor_quantparams(tensor, value) -#define QNN_TENSOR_SET_RANK(tensor, value) qnn::set_qnn_tensor_rank(tensor, value) -#define QNN_TENSOR_SET_DIMENSIONS(tensor, value) qnn::set_qnn_tensor_dimensions(tensor, value) -#define QNN_TENSOR_SET_MEM_TYPE(tensor, value) qnn::set_qnn_tensor_memtype(tensor, value) -#define QNN_TENSOR_SET_CLIENT_BUF(tensor, value) qnn::set_qnn_tensor_clientbuf(tensor, value) -#define 
QNN_TENSOR_SET_MEM_HANDLE(tensor, value) qnn::set_qnn_tensor_memhandle(tensor, value) -#define VALIDATE_TENSOR_VERSION(tensor, err) VALIDATE(qnn::validate_tensor_version(tensor), err) +#define QNN_TENSOR_GET_ID(tensor) qnn::get_qnn_tensorid(tensor) +#define QNN_TENSOR_GET_NAME(tensor) qnn::get_qnn_tensorname(tensor) +#define QNN_TENSOR_GET_TYPE(tensor) qnn::get_qnn_tensortype(tensor) +#define QNN_TENSOR_GET_DATA_FORMAT(tensor) qnn::get_qnn_tensor_dataformat(tensor) +#define QNN_TENSOR_GET_DATA_TYPE(tensor) qnn::get_qnn_tensor_datatype(tensor) +#define QNN_TENSOR_GET_QUANT_PARAMS(tensor) qnn::get_qnn_tensor_quantparams(tensor) +#define QNN_TENSOR_GET_RANK(tensor) qnn::get_qnn_tensor_rank(tensor) +#define QNN_TENSOR_GET_DIMENSIONS(tensor) qnn::get_qnn_tensor_dimensions(tensor) +#define QNN_TENSOR_GET_MEM_TYPE(tensor) qnn::get_qnn_tensor_memtype(tensor) + +#define QNN_TENSOR_SET_ID(tensor, value) qnn::set_qnn_tensor_id(tensor, value) +#define QNN_TENSOR_SET_NAME(tensor, value) qnn::set_qnn_tensor_name(tensor, value) +#define QNN_TENSOR_SET_TYPE(tensor, value) qnn::set_qnn_tensor_type(tensor, value) +#define QNN_TENSOR_SET_DATA_FORMAT(tensor, value) qnn::set_qnn_tensor_dataformat(tensor, value) +#define QNN_TENSOR_SET_DATA_TYPE(tensor, value) qnn::set_qnn_tensor_datatype(tensor, value) +#define QNN_TENSOR_SET_QUANT_PARAMS(tensor, value) qnn::set_qnn_tensor_quantparams(tensor, value) +#define QNN_TENSOR_SET_RANK(tensor, value) qnn::set_qnn_tensor_rank(tensor, value) +#define QNN_TENSOR_SET_DIMENSIONS(tensor, value) qnn::set_qnn_tensor_dimensions(tensor, value) +#define QNN_TENSOR_SET_MEM_TYPE(tensor, value) qnn::set_qnn_tensor_memtype(tensor, value) +#define QNN_TENSOR_SET_CLIENT_BUF(tensor, value) qnn::set_qnn_tensor_clientbuf(tensor, value) +#define QNN_TENSOR_SET_MEM_HANDLE(tensor, value) qnn::set_qnn_tensor_memhandle(tensor, value) +#define VALIDATE_TENSOR_VERSION(tensor, err) VALIDATE(qnn::validate_tensor_version(tensor), err) From ca0d999c2ab97c11174a1f30852a311038792192 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Thu, 4 Jul 2024 23:32:21 +0800 Subject: [PATCH 037/143] add ggml_qnn_graph --- ggml/src/ggml-qnn/backend-ops.cpp | 5 +- ggml/src/ggml-qnn/graph.hpp | 136 ++++++++++++++++++++++++++++++ 2 files changed, 139 insertions(+), 2 deletions(-) create mode 100644 ggml/src/ggml-qnn/graph.hpp diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index cde1bd248cc29..3365e85b846a8 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -1,6 +1,7 @@ #include "backend-ops.hpp" +#include "graph.hpp" #include "logger.hpp" #include "tensor.hpp" #include "utils.hpp" @@ -130,7 +131,7 @@ static void ggml_qnn_add(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, Qnn_Tensor_t tensor_inputs[] = { *tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor() }; Qnn_Tensor_t tensor_outputs[] = { *tensor_output.get_qnn_tensor() }; - Qnn_OpConfig_t op_config = { (Qnn_OpConfigVersion_t)1, + Qnn_OpConfig_t op_config = { QNN_OPCONFIG_VERSION_1, .v1 = { "ggml_op_add", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_ELEMENT_WISE_ADD, 0, qnn_params, 2, tensor_inputs, 1, tensor_outputs } }; error = qnn_raw_interface.graphAddNode(graph_handle, op_config); @@ -300,7 +301,7 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context *ctx, const ggml_tensor *s Qnn_Tensor_t tensor_inputs[] = { *tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor() }; Qnn_Tensor_t tensor_outputs[] = { *tensor_output.get_qnn_tensor() }; - Qnn_OpConfig_t op_config 
= { (Qnn_OpConfigVersion_t)1, + Qnn_OpConfig_t op_config = { QNN_OPCONFIG_VERSION_1, .v1 = { "ggml_op_mul_mat", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL, 0, qnn_params, 2, tensor_inputs, 1, tensor_outputs } }; error = qnn_raw_interface.graphAddNode(graph_handle, op_config); diff --git a/ggml/src/ggml-qnn/graph.hpp b/ggml/src/ggml-qnn/graph.hpp new file mode 100644 index 0000000000000..f2c27aeb3a2be --- /dev/null +++ b/ggml/src/ggml-qnn/graph.hpp @@ -0,0 +1,136 @@ + +#pragma once + +#include + +#include "ggml-qnn.h" + +#include "logger.hpp" +#include "qnn.hpp" + +namespace qnn { + +template +class ggml_qnn_graph { +public: + typedef std::array input_tensor_array_t; + typedef std::array output_tensor_array_t; + + explicit ggml_qnn_graph(const std::string &graph_name, QNNBackend device, Qnn_ContextHandle_t qnn_context, + QNN_INTERFACE_VER_TYPE qnn_interface, size_t vtcm_size_in_mb) : + _device(device), _qnn_interface(qnn_interface) { + QNN_LOG_INFO("graph name %s", graph_name.c_str()); + + Qnn_ErrorHandle_t error = QNN_SUCCESS; + Qnn_GraphHandle_t graph_handle = nullptr; + if (device == QNN_BACKEND_NPU) { + // TODO: fix graph config here for NPU + QnnHtpGraph_CustomConfig_t hvx_config; + hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS; + hvx_config.numHvxThreads = 8; + QnnGraph_Config_t graph_hvx_config; + graph_hvx_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_hvx_config.customConfig = &hvx_config; + + QnnHtpGraph_CustomConfig_t dlbc_config; + dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; + dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC; + dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC + QnnGraph_Config_t graph_dlbc_config; + graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_dlbc_config.customConfig = &dlbc_config; + + QnnHtpGraph_CustomConfig_t opt_config; + opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; + opt_config.optimizationOption.floatValue = 1; // 1 / 3 + QnnGraph_Config_t graph_opt_config; + graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_opt_config.customConfig = &opt_config; + + QnnHtpGraph_CustomConfig_t vtcm_config; + vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE; + vtcm_config.vtcmSizeInMB = vtcm_size_in_mb; + QnnGraph_Config_t graph_vtcm_config; + graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_vtcm_config.customConfig = &vtcm_config; + + const QnnGraph_Config_t *p_graphconfig[] = { &graph_hvx_config, &graph_dlbc_config, &graph_vtcm_config, + &graph_opt_config, nullptr }; + error = qnn_interface.graphCreate(qnn_context, graph_name.c_str(), p_graphconfig, &graph_handle); + } else { + error = qnn_interface.graphCreate(qnn_context, graph_name.c_str(), nullptr, &graph_handle); + } + + if (error != QNN_SUCCESS) { + QNN_LOG_INFO( + "can't create qnn graph handle with graph name %s, " + "error = %d\n", + graph_name.c_str(), error); + return; + } else { + QNN_LOG_INFO("create qnn graph handle with graph name %s ok\n", graph_name.c_str()); + } + + _graph_handle = graph_handle; + } + + bool add_nodes(const input_tensor_array_t &tensor_inputs, const output_tensor_array_t &tensor_outputs) { + if (!is_valid()) { + QNN_LOG_ERROR("Invalid graph\n"); + return false; + } + + Qnn_Param_t qnn_params[] = {}; + Qnn_OpConfig_t op_config = { QNN_OPCONFIG_VERSION_1, + .v1 = { "ggml_op_add", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_ELEMENT_WISE_ADD, 0, + qnn_params, 
_tensor_inputs.size(), _tensor_inputs.data(), + _tensor_outputs.size(), _tensor_outputs.data() } }; + auto error = _qnn_interface.graphAddNode(_graph_handle, op_config); + if (error != QNN_SUCCESS) { + QNN_LOG_ERROR("graphAddNode.error = %d\n", error); + return false; + } + + error = _qnn_interface.graphFinalize(_graph_handle, nullptr, nullptr); + if (error != QNN_SUCCESS) { + QNN_LOG_ERROR("graphFinalize.error = %d\n", error); + return false; + } + + return true; + } + + bool execute() { + auto error = _qnn_interface.graphExecute(_graph_handle, _tensor_inputs.data(), _tensor_inputs.size(), + _tensor_outputs.data(), _tensor_outputs.size(), nullptr, nullptr); + if (_device == QNN_BACKEND_NPU) { + if (error == QNN_COMMON_ERROR_SYSTEM_COMMUNICATION) { + QNN_LOG_WARN("NPU crashed. SSR detected. Caused QNN graph execute error\n"); + } + } + + if (error != QNN_SUCCESS) { + QNN_LOG_INFO("error = %d\n", error); + return false; + } + + return true; + } + + bool is_valid() const { return _graph_handle != nullptr; } + + Qnn_GraphHandle_t get_graph_handler() const { return _graph_handle; } + +private: + const QNNBackend _device; + const QNN_INTERFACE_VER_TYPE _qnn_interface; + Qnn_GraphHandle_t _graph_handle = nullptr; + std::array _tensor_inputs; + std::array _tensor_outputs; + + ggml_qnn_graph(const ggml_qnn_graph &) = delete; + void operator=(const ggml_qnn_graph &) = delete; + ggml_qnn_graph(ggml_qnn_graph &&) = delete; + void operator=(ggml_qnn_graph &&) = delete; +}; +} // namespace qnn From 4b2ee61f62e2e666ea47fcef4717739cd66fefcc Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 5 Jul 2024 11:56:31 +0800 Subject: [PATCH 038/143] move graph map to backend object --- ggml/src/ggml-qnn.cpp | 15 +++++---------- ggml/src/ggml-qnn/backend-ops.cpp | 16 ++++++++-------- ggml/src/ggml-qnn/backend.hpp | 3 +++ ggml/src/ggml-qnn/qnn.hpp | 4 ---- 4 files changed, 16 insertions(+), 22 deletions(-) diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index e5fc00045beb3..9e6404e5c1b53 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -529,18 +529,13 @@ GGML_CALL static void ggml_backend_qnn_free(ggml_backend_t backend) { auto *instance = g_qnn_mgr[ctx->device].instance; if (instance != nullptr) { - // TODO: this should be done inside the destructor - std::map>::iterator graph_it; - for (graph_it = instance->_qnn_graph_map.begin(); - graph_it != instance->_qnn_graph_map.end(); graph_it++) { - auto & graph_item = graph_it->second; - Qnn_GraphHandle_t & graph_handle = std::get<0>(graph_item); + for (const auto &graph_item: ctx->qnn_graph_map) { + Qnn_GraphHandle_t graph_handle = std::get<0>(graph_item.second); GGML_UNUSED(graph_handle); - QNN_LOG_INFO("graph type:%s", graph_it->first.c_str()); + QNN_LOG_INFO("graph type:%s", graph_item.first.c_str()); } - instance->_qnn_graph_map.clear(); + + ctx->qnn_graph_map.clear(); instance->qnn_finalize(); delete instance; diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 3365e85b846a8..d0c132b9bca8f 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -57,9 +57,9 @@ static void ggml_qnn_add(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, perf.start(); std::string map_entry(ggml_op_name(ggmlop)); - if (instance->_qnn_graph_map.find(map_entry) != instance->_qnn_graph_map.end()) { + if (ctx->qnn_graph_map.find(map_entry) != ctx->qnn_graph_map.end()) { graph_initialized = true; - auto &graph_item = instance->_qnn_graph_map[map_entry]; + auto &graph_item = 
ctx->qnn_graph_map[map_entry]; graph_handle = std::get<0>(graph_item); } @@ -157,9 +157,9 @@ static void ggml_qnn_add(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, auto graph_item = std::make_tuple(graph_handle, tensor_input0.get_qnn_tensor(), tensor_input1.get_qnn_tensor(), tensor_output.get_qnn_tensor()); - instance->_qnn_graph_map[map_entry] = graph_item; + ctx->qnn_graph_map[map_entry] = graph_item; } else { - auto &graph_item = instance->_qnn_graph_map[map_entry]; + auto &graph_item = ctx->qnn_graph_map[map_entry]; qnn::ggml_qnn_tensor_input tensor_input0(src0, std::get<1>(graph_item), ctx); qnn::ggml_qnn_tensor_input tensor_input1(src1, std::get<2>(graph_item), ctx); qnn::ggml_qnn_tensor_output tensor_output(dst, std::get<3>(graph_item), ctx); @@ -226,9 +226,9 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context *ctx, const ggml_tensor *s perf.start(); std::string map_entry = std::string(ggml_op_name(ggmlop)); - if (instance->_qnn_graph_map.find(map_entry) != instance->_qnn_graph_map.end()) { + if (ctx->qnn_graph_map.find(map_entry) != ctx->qnn_graph_map.end()) { graph_initialized = true; - auto &graph_item = instance->_qnn_graph_map[map_entry]; + auto &graph_item = ctx->qnn_graph_map[map_entry]; graph_handle = std::get<0>(graph_item); } @@ -327,9 +327,9 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context *ctx, const ggml_tensor *s auto graph_item = std::make_tuple(graph_handle, tensor_input0.get_qnn_tensor(), tensor_input1.get_qnn_tensor(), tensor_output.get_qnn_tensor()); - instance->_qnn_graph_map[map_entry] = graph_item; + ctx->qnn_graph_map[map_entry] = graph_item; } else { - auto &graph_item = instance->_qnn_graph_map[map_entry]; + auto &graph_item = ctx->qnn_graph_map[map_entry]; qnn::ggml_qnn_tensor_input tensor_input0(src0, std::get<1>(graph_item), ctx); qnn::ggml_qnn_tensor_input tensor_input1(src1, std::get<2>(graph_item), ctx); qnn::ggml_qnn_tensor_output tensor_output(dst, std::get<3>(graph_item), ctx); diff --git a/ggml/src/ggml-qnn/backend.hpp b/ggml/src/ggml-qnn/backend.hpp index 74bce38b7111c..dd15b05807641 100644 --- a/ggml/src/ggml-qnn/backend.hpp +++ b/ggml/src/ggml-qnn/backend.hpp @@ -1,6 +1,8 @@ #pragma once +#include + #include "ggml.h" #include "ggml-backend.h" @@ -17,4 +19,5 @@ struct ggml_backend_qnn_context { QNN_INTERFACE_VER_TYPE raw_interface; QNN_SYSTEM_INTERFACE_VER_TYPE raw_system_interface; qnn::qcom_socinfo socinfo; + std::unordered_map> qnn_graph_map; }; diff --git a/ggml/src/ggml-qnn/qnn.hpp b/ggml/src/ggml-qnn/qnn.hpp index bccc3a4ba32ac..26465c96a0793 100644 --- a/ggml/src/ggml-qnn/qnn.hpp +++ b/ggml/src/ggml-qnn/qnn.hpp @@ -2,7 +2,6 @@ #include -#include #include #include #include @@ -705,9 +704,6 @@ class qnn_instance { const qnn::qcom_socinfo &get_soc_info() { return _soc_info; } -public: - std::map> _qnn_graph_map; - private: int load_system() { Qnn_ErrorHandle_t error = QNN_SUCCESS; From a688ed324b339eb2ca455becf1661272d28e1d99 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 5 Jul 2024 13:07:48 +0800 Subject: [PATCH 039/143] add op param to add_nodes --- ggml/src/ggml-qnn/graph.hpp | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-qnn/graph.hpp b/ggml/src/ggml-qnn/graph.hpp index f2c27aeb3a2be..700114d6f8a26 100644 --- a/ggml/src/ggml-qnn/graph.hpp +++ b/ggml/src/ggml-qnn/graph.hpp @@ -18,7 +18,7 @@ class ggml_qnn_graph { explicit ggml_qnn_graph(const std::string &graph_name, QNNBackend device, Qnn_ContextHandle_t qnn_context, QNN_INTERFACE_VER_TYPE qnn_interface, size_t 
vtcm_size_in_mb) : - _device(device), _qnn_interface(qnn_interface) { + _graph_name(graph_name), _device(device), _qnn_interface(qnn_interface) { QNN_LOG_INFO("graph name %s", graph_name.c_str()); Qnn_ErrorHandle_t error = QNN_SUCCESS; @@ -74,7 +74,8 @@ class ggml_qnn_graph { _graph_handle = graph_handle; } - bool add_nodes(const input_tensor_array_t &tensor_inputs, const output_tensor_array_t &tensor_outputs) { + bool add_nodes(const std::string &op_name, const input_tensor_array_t &tensor_inputs, + const output_tensor_array_t &tensor_outputs) { if (!is_valid()) { QNN_LOG_ERROR("Invalid graph\n"); return false; @@ -82,7 +83,7 @@ class ggml_qnn_graph { Qnn_Param_t qnn_params[] = {}; Qnn_OpConfig_t op_config = { QNN_OPCONFIG_VERSION_1, - .v1 = { "ggml_op_add", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_ELEMENT_WISE_ADD, 0, + .v1 = { _graph_name.c_str(), QNN_OP_PACKAGE_NAME_QTI_AISW, op_name.c_str(), 0, qnn_params, _tensor_inputs.size(), _tensor_inputs.data(), _tensor_outputs.size(), _tensor_outputs.data() } }; auto error = _qnn_interface.graphAddNode(_graph_handle, op_config); @@ -122,6 +123,7 @@ class ggml_qnn_graph { Qnn_GraphHandle_t get_graph_handler() const { return _graph_handle; } private: + const std::string _graph_name; const QNNBackend _device; const QNN_INTERFACE_VER_TYPE _qnn_interface; Qnn_GraphHandle_t _graph_handle = nullptr; @@ -133,4 +135,8 @@ class ggml_qnn_graph { ggml_qnn_graph(ggml_qnn_graph &&) = delete; void operator=(ggml_qnn_graph &&) = delete; }; + +using ggml_qnn_graph_binary = ggml_qnn_graph<2, 1>; +using ggml_qnn_graph_unary = ggml_qnn_graph<1, 1>; + } // namespace qnn From 13dc3a02c371e533ffd379446cbf53b4c3bb5599 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 5 Jul 2024 13:08:14 +0800 Subject: [PATCH 040/143] use qnn graph inside add and mul ops --- ggml/src/ggml-qnn.cpp | 2 - ggml/src/ggml-qnn/backend-ops.cpp | 325 +++++++----------------------- ggml/src/ggml-qnn/backend.hpp | 6 +- ggml/src/ggml-qnn/graph.hpp | 9 +- 4 files changed, 89 insertions(+), 253 deletions(-) diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index 9e6404e5c1b53..19c970c5f0fd6 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -530,8 +530,6 @@ GGML_CALL static void ggml_backend_qnn_free(ggml_backend_t backend) { auto *instance = g_qnn_mgr[ctx->device].instance; if (instance != nullptr) { for (const auto &graph_item: ctx->qnn_graph_map) { - Qnn_GraphHandle_t graph_handle = std::get<0>(graph_item.second); - GGML_UNUSED(graph_handle); QNN_LOG_INFO("graph type:%s", graph_item.first.c_str()); } diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index d0c132b9bca8f..79e280fcbe088 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -1,13 +1,23 @@ #include "backend-ops.hpp" +#include + #include "graph.hpp" #include "logger.hpp" #include "tensor.hpp" #include "utils.hpp" -static bool qnn_is_valid_params(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) { +namespace { + +void print_ggml_tensor(const ggml_tensor *tensor) { + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + tensor->name, tensor->type, ggml_type_name(tensor->type), tensor->ne[0], tensor->ne[1], tensor->ne[2], + tensor->nb[0], tensor->nb[1], tensor->nb[2]); +} + +bool qnn_is_valid_params(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) { if (!ctx || !src0 || !src1 || !dst) 
{ QNN_LOG_WARN("invalid params\n"); return false; @@ -25,6 +35,8 @@ static bool qnn_is_valid_params(ggml_backend_qnn_context *ctx, const ggml_tensor return true; } +} // namespace + #ifndef NDEBUG #define CHECK_PARAMS(ctx, src0, src1, dst) \ do { \ @@ -41,157 +53,65 @@ static bool qnn_is_valid_params(ggml_backend_qnn_context *ctx, const ggml_tensor // keep it for illustrate how to implement a specified GGMPL OP using QNN API + QNN RPC static void ggml_qnn_add(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - bool graph_initialized = false; - qnn::qnn_instance *instance = nullptr; - std::string graph_name = "ggml_op_qnn_add"; - Qnn_GraphHandle_t graph_handle = nullptr; - Qnn_Param_t qnn_params[] = {}; - enum ggml_op ggmlop = GGML_OP_ADD; - CHECK_PARAMS(ctx, src0, src1, dst); - instance = ctx->instance; - auto qnn_raw_interface = ctx->raw_interface; - qnn::qnn_perf perf("ggml_qnn_add"); + std::string graph_name = "ggml_op_qnn_add"; + qnn::qnn_perf perf(graph_name); perf.start(); - std::string map_entry(ggml_op_name(ggmlop)); - if (ctx->qnn_graph_map.find(map_entry) != ctx->qnn_graph_map.end()) { - graph_initialized = true; - auto &graph_item = ctx->qnn_graph_map[map_entry]; - graph_handle = std::get<0>(graph_item); - } - - if (!graph_initialized) { + bool succeed = false; + std::string graph_key(ggml_op_name(GGML_OP_ADD)); + auto it = ctx->qnn_graph_map.find(graph_key); + if (it != ctx->qnn_graph_map.end()) { + const auto &graph_item = it->second; + qnn::ggml_qnn_tensor_input tensor_input0(src0, std::get<1>(graph_item), ctx); + qnn::ggml_qnn_tensor_input tensor_input1(src1, std::get<2>(graph_item), ctx); + qnn::ggml_qnn_tensor_output tensor_output(dst, std::get<3>(graph_item), ctx); + std::get<0>(graph_item)->execute(); + } else { graph_name = graph_name + "_" + std::to_string(ctx->threads) + "_" + src0->name + "_" + src1->name; - QNN_LOG_INFO("graph name %s", graph_name.c_str()); - if (ctx->device == QNN_BACKEND_NPU) { - QnnHtpGraph_CustomConfig_t hvx_config; - hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS; - hvx_config.numHvxThreads = 8; - QnnGraph_Config_t graph_hvx_config; - graph_hvx_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_hvx_config.customConfig = &hvx_config; - - QnnHtpGraph_CustomConfig_t dlbc_config; - dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; - dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC; - dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC - QnnGraph_Config_t graph_dlbc_config; - graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_dlbc_config.customConfig = &dlbc_config; - - QnnHtpGraph_CustomConfig_t opt_config; - opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; - opt_config.optimizationOption.floatValue = 1; // 1 / 3 - QnnGraph_Config_t graph_opt_config; - graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_opt_config.customConfig = &opt_config; - - QnnHtpGraph_CustomConfig_t vtcm_config; - vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE; - vtcm_config.vtcmSizeInMB = ctx->socinfo.vtcm_size_in_mb; - QnnGraph_Config_t graph_vtcm_config; - graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_vtcm_config.customConfig = &vtcm_config; - - const QnnGraph_Config_t *p_graphconfig[] = { &graph_hvx_config, &graph_dlbc_config, &graph_vtcm_config, - &graph_opt_config, NULL }; - 
error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), graph_name.c_str(), p_graphconfig, - &graph_handle); - } else { - error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), graph_name.c_str(), nullptr, - &graph_handle); - } + auto graph = std::make_unique(graph_name, (QNNBackend)(ctx->device), + ctx->instance->get_qnn_context_handle(), + ctx->raw_interface, ctx->socinfo.vtcm_size_in_mb); - if (QNN_SUCCESS != error) { - QNN_LOG_INFO( - "can't create qnn graph handle with graph name %s, " - "error = %d\n", - graph_name.c_str(), error); + if (!graph->is_valid()) { goto failure; - } else { - QNN_LOG_INFO("create qnn graph handle with graph name %s ok\n", graph_name.c_str()); } - qnn::ggml_qnn_tensor_input tensor_input0(src0, graph_handle, ctx); + qnn::ggml_qnn_tensor_input tensor_input0(src0, graph->get_graph_handler(), ctx); if (!tensor_input0.is_valid()) { goto failure; } - qnn::ggml_qnn_tensor_input tensor_input1(src1, graph_handle, ctx); + qnn::ggml_qnn_tensor_input tensor_input1(src1, graph->get_graph_handler(), ctx); if (!tensor_input1.is_valid()) { - QNN_LOG_INFO("error = %d\n", error); goto failure; } - qnn::ggml_qnn_tensor_output tensor_output(dst, graph_handle, ctx); + qnn::ggml_qnn_tensor_output tensor_output(dst, graph->get_graph_handler(), ctx); if (!tensor_output.is_valid()) { goto failure; } - Qnn_Tensor_t tensor_inputs[] = { *tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor() }; - Qnn_Tensor_t tensor_outputs[] = { *tensor_output.get_qnn_tensor() }; - Qnn_OpConfig_t op_config = { QNN_OPCONFIG_VERSION_1, - .v1 = { "ggml_op_add", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_ELEMENT_WISE_ADD, 0, - qnn_params, 2, tensor_inputs, 1, tensor_outputs } }; - error = qnn_raw_interface.graphAddNode(graph_handle, op_config); - if (QNN_SUCCESS != error) { - QNN_LOG_INFO("error = %d\n", error); - goto failure; - } - error = qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr); - if (QNN_SUCCESS != error) { - QNN_LOG_INFO("error = %d\n", error); - goto failure; - } - error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); - if (ctx->device == QNN_BACKEND_NPU) { - if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) { - QNN_LOG_WARN("NPU crashed. SSR detected. Caused QNN graph execute error\n"); - } - } - if (QNN_SUCCESS != error) { - QNN_LOG_INFO("error = %d\n", error); + if (!graph->add_nodes(QNN_OP_ELEMENT_WISE_ADD, + { *tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor() }, + { *tensor_output.get_qnn_tensor() })) { goto failure; } - auto graph_item = std::make_tuple(graph_handle, tensor_input0.get_qnn_tensor(), tensor_input1.get_qnn_tensor(), - tensor_output.get_qnn_tensor()); - ctx->qnn_graph_map[map_entry] = graph_item; - } else { - auto &graph_item = ctx->qnn_graph_map[map_entry]; - qnn::ggml_qnn_tensor_input tensor_input0(src0, std::get<1>(graph_item), ctx); - qnn::ggml_qnn_tensor_input tensor_input1(src1, std::get<2>(graph_item), ctx); - qnn::ggml_qnn_tensor_output tensor_output(dst, std::get<3>(graph_item), ctx); - - Qnn_Tensor_t tensor_inputs[] = { *tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor() }; - Qnn_Tensor_t tensor_outputs[] = { *tensor_output.get_qnn_tensor() }; - error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); - if (ctx->device == QNN_BACKEND_NPU) { - if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) { - QNN_LOG_WARN("NPU crashed. SSR detected. 
Caused QNN graph execute error\n"); - } - } - if (QNN_SUCCESS != error) { - QNN_LOG_INFO("error = %d\n", error); + if (!graph->execute()) { goto failure; } + + ctx->qnn_graph_map[graph_key] = std::make_tuple(std::move(graph), tensor_input0.get_qnn_tensor(), + tensor_input1.get_qnn_tensor(), tensor_output.get_qnn_tensor()); } + succeed = true; + failure: - if (QNN_SUCCESS != error) { - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 - ", nb = (%5zi, %5zi, %5zi)\n", - src0->name, src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], - src0->nb[0], src0->nb[1], src0->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 - ", nb = (%5zi, %5zi, %5zi)\n", - src1->name, src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], - src1->nb[0], src1->nb[1], src1->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 - ", nb = (%5zi, %5zi, %5zi)\n", - dst->name, dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], - dst->nb[1], dst->nb[2]); + if (!succeed) { + print_ggml_tensor(src0); + print_ggml_tensor(src1); + print_ggml_tensor(dst); } perf.info(); @@ -210,158 +130,69 @@ static void ggml_qnn_add(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, */ static void ggml_qnn_mul_mat(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - bool graph_initialized = false; - qnn::qnn_instance *instance = nullptr; - std::string graph_name = "ggml_op_qnn_mul_mat"; - Qnn_GraphHandle_t graph_handle = nullptr; - Qnn_Param_t qnn_params[] = {}; - enum ggml_op ggmlop = GGML_OP_MUL_MAT; - CHECK_PARAMS(ctx, src0, src1, dst); - instance = ctx->instance; - auto qnn_raw_interface = ctx->raw_interface; - qnn::qnn_perf perf("ggml_qnn_mul_mat"); + std::string graph_name = "ggml_op_qnn_mul_mat"; + qnn::qnn_perf perf(graph_name); perf.start(); - std::string map_entry = std::string(ggml_op_name(ggmlop)); - if (ctx->qnn_graph_map.find(map_entry) != ctx->qnn_graph_map.end()) { - graph_initialized = true; - auto &graph_item = ctx->qnn_graph_map[map_entry]; - graph_handle = std::get<0>(graph_item); - } - // TODO: for scenarios of quantized data in src0 // pass-1: dequantize src0 to FP32 // pass-2: dq-src0 * src1 // the performance gains is worth although there is performance loss in pass-1 - if (!graph_initialized) { + bool succeed = false; + std::string graph_key(ggml_op_name(GGML_OP_MUL_MAT)); + auto it = ctx->qnn_graph_map.find(graph_key); + if (it != ctx->qnn_graph_map.end()) { + const auto &graph_item = it->second; + qnn::ggml_qnn_tensor_input tensor_input0(src0, std::get<1>(graph_item), ctx); + qnn::ggml_qnn_tensor_input tensor_input1(src1, std::get<2>(graph_item), ctx); + qnn::ggml_qnn_tensor_output tensor_output(dst, std::get<3>(graph_item), ctx); + std::get<0>(graph_item)->execute(); + } else { graph_name = graph_name + "_" + std::to_string(ctx->threads) + "_" + src0->name + "_" + src1->name; - QNN_LOG_INFO("graph name %s", graph_name.c_str()); - if (ctx->device == QNN_BACKEND_NPU) { - QnnHtpGraph_CustomConfig_t hvx_config; - hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS; - hvx_config.numHvxThreads = 8; - QnnGraph_Config_t graph_hvx_config; - graph_hvx_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_hvx_config.customConfig = &hvx_config; - - QnnHtpGraph_CustomConfig_t dlbc_config; - dlbc_config.option = 
QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; - dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC; - dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC - QnnGraph_Config_t graph_dlbc_config; - graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_dlbc_config.customConfig = &dlbc_config; - - QnnHtpGraph_CustomConfig_t opt_config; - opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; - opt_config.optimizationOption.floatValue = 1; // 1 / 3 - QnnGraph_Config_t graph_opt_config; - graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_opt_config.customConfig = &opt_config; - - QnnHtpGraph_CustomConfig_t vtcm_config; - vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE; - vtcm_config.vtcmSizeInMB = ctx->socinfo.vtcm_size_in_mb; - QnnGraph_Config_t graph_vtcm_config; - graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_vtcm_config.customConfig = &vtcm_config; - - const QnnGraph_Config_t *p_graphconfig[] = { &graph_hvx_config, &graph_dlbc_config, &graph_vtcm_config, - &graph_opt_config, NULL }; - error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), graph_name.c_str(), p_graphconfig, - &graph_handle); - } else { - error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), graph_name.c_str(), nullptr, - &graph_handle); - } - if (QNN_SUCCESS != error) { - QNN_LOG_INFO( - "can't create qnn graph handle with graph name %s, " - "error = %d\n", - graph_name.c_str(), error); + auto graph = std::make_unique(graph_name, (QNNBackend)(ctx->device), + ctx->instance->get_qnn_context_handle(), + ctx->raw_interface, ctx->socinfo.vtcm_size_in_mb); + + if (!graph->is_valid()) { goto failure; } - qnn::ggml_qnn_tensor_input tensor_input0(src0, graph_handle, ctx); + qnn::ggml_qnn_tensor_input tensor_input0(src0, graph->get_graph_handler(), ctx); if (!tensor_input0.is_valid()) { goto failure; } - qnn::ggml_qnn_tensor_input tensor_input1(src1, graph_handle, ctx); + qnn::ggml_qnn_tensor_input tensor_input1(src1, graph->get_graph_handler(), ctx); if (!tensor_input1.is_valid()) { goto failure; } - qnn::ggml_qnn_tensor_output tensor_output(dst, graph_handle, ctx); + qnn::ggml_qnn_tensor_output tensor_output(dst, graph->get_graph_handler(), ctx); if (!tensor_output.is_valid()) { goto failure; } - Qnn_Tensor_t tensor_inputs[] = { *tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor() }; - Qnn_Tensor_t tensor_outputs[] = { *tensor_output.get_qnn_tensor() }; - Qnn_OpConfig_t op_config = { QNN_OPCONFIG_VERSION_1, - .v1 = { "ggml_op_mul_mat", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL, 0, - qnn_params, 2, tensor_inputs, 1, tensor_outputs } }; - error = qnn_raw_interface.graphAddNode(graph_handle, op_config); - if (QNN_SUCCESS != error) { - QNN_LOG_INFO("error = %d\n", error); - goto failure; - } - error = qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr); - if (QNN_SUCCESS != error) { - QNN_LOG_INFO("error = %d\n", error); - goto failure; - } - error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); - if (ctx->device == QNN_BACKEND_NPU) { - if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) { - QNN_LOG_WARN("NPU crashed. SSR detected. 
Caused QNN graph execute error\n"); - } - } - if (QNN_SUCCESS != error) { - QNN_LOG_INFO("error = %d\n", error); + if (!graph->add_nodes(QNN_OP_MAT_MUL, { *tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor() }, + { *tensor_output.get_qnn_tensor() })) { goto failure; } - auto graph_item = std::make_tuple(graph_handle, tensor_input0.get_qnn_tensor(), tensor_input1.get_qnn_tensor(), - tensor_output.get_qnn_tensor()); - ctx->qnn_graph_map[map_entry] = graph_item; - } else { - auto &graph_item = ctx->qnn_graph_map[map_entry]; - qnn::ggml_qnn_tensor_input tensor_input0(src0, std::get<1>(graph_item), ctx); - qnn::ggml_qnn_tensor_input tensor_input1(src1, std::get<2>(graph_item), ctx); - qnn::ggml_qnn_tensor_output tensor_output(dst, std::get<3>(graph_item), ctx); - - Qnn_Tensor_t tensor_inputs[] = { *tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor() }; - Qnn_Tensor_t tensor_outputs[] = { *tensor_output.get_qnn_tensor() }; - error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); - if (ctx->device == QNN_BACKEND_NPU) { - if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) { - QNN_LOG_WARN("NPU crashed. SSR detected. Caused QNN graph execute error\n"); - } - } - if (QNN_SUCCESS != error) { - QNN_LOG_INFO("error = %d\n", error); + if (!graph->execute()) { goto failure; } + + ctx->qnn_graph_map[graph_key] = std::make_tuple(std::move(graph), tensor_input0.get_qnn_tensor(), + tensor_input1.get_qnn_tensor(), tensor_output.get_qnn_tensor()); } + succeed = true; + failure: - if (QNN_SUCCESS != error) { - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 - ", nb = (%5zi, %5zi, %5zi)\n", - src0->name, src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], - src0->nb[0], src0->nb[1], src0->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 - ", nb = (%5zi, %5zi, %5zi)\n", - src1->name, src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], - src1->nb[0], src1->nb[1], src1->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 - ", nb = (%5zi, %5zi, %5zi)\n", - dst->name, dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], - dst->nb[1], dst->nb[2]); + if (!succeed) { + print_ggml_tensor(src0); + print_ggml_tensor(src1); + print_ggml_tensor(dst); } perf.info(); diff --git a/ggml/src/ggml-qnn/backend.hpp b/ggml/src/ggml-qnn/backend.hpp index dd15b05807641..d60b334c0b2b5 100644 --- a/ggml/src/ggml-qnn/backend.hpp +++ b/ggml/src/ggml-qnn/backend.hpp @@ -1,12 +1,14 @@ #pragma once +#include #include #include "ggml.h" #include "ggml-backend.h" +#include "graph.hpp" #include "qnn.hpp" struct ggml_backend_qnn_context { @@ -19,5 +21,7 @@ struct ggml_backend_qnn_context { QNN_INTERFACE_VER_TYPE raw_interface; QNN_SYSTEM_INTERFACE_VER_TYPE raw_system_interface; qnn::qcom_socinfo socinfo; - std::unordered_map> qnn_graph_map; + std::unordered_map, Qnn_Tensor_t *, + Qnn_Tensor_t *, Qnn_Tensor_t *>> + qnn_graph_map; }; diff --git a/ggml/src/ggml-qnn/graph.hpp b/ggml/src/ggml-qnn/graph.hpp index 700114d6f8a26..1aad145c32896 100644 --- a/ggml/src/ggml-qnn/graph.hpp +++ b/ggml/src/ggml-qnn/graph.hpp @@ -81,11 +81,14 @@ class ggml_qnn_graph { return false; } + _tensor_inputs = tensor_inputs; + _tensor_outputs = tensor_outputs; + Qnn_Param_t qnn_params[] = {}; - Qnn_OpConfig_t op_config = { QNN_OPCONFIG_VERSION_1, + Qnn_OpConfig_t op_config = { .version = 
QNN_OPCONFIG_VERSION_1, .v1 = { _graph_name.c_str(), QNN_OP_PACKAGE_NAME_QTI_AISW, op_name.c_str(), 0, - qnn_params, _tensor_inputs.size(), _tensor_inputs.data(), - _tensor_outputs.size(), _tensor_outputs.data() } }; + qnn_params, (uint32_t)_tensor_inputs.size(), _tensor_inputs.data(), + (uint32_t)_tensor_outputs.size(), _tensor_outputs.data() } }; auto error = _qnn_interface.graphAddNode(_graph_handle, op_config); if (error != QNN_SUCCESS) { QNN_LOG_ERROR("graphAddNode.error = %d\n", error); From 58cec140920985f2e038512001869fdb3cf86ad8 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 5 Jul 2024 17:31:22 +0800 Subject: [PATCH 041/143] reformat --- ggml/src/ggml-qnn.cpp | 457 ++++++++++++++++-------------------- ggml/src/ggml-qnn/utils.hpp | 2 +- 2 files changed, 205 insertions(+), 254 deletions(-) diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index 19c970c5f0fd6..a590dd5f56cfb 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -1,46 +1,46 @@ +#include "ggml-qnn.h" + +#include #include #include -#include #include +#include #include #include -#include -#include -#include -#include -#include -#include -#include +#include +#include +#include #include +#include #include -#include -#include #include -#include +#include +#include #include -#include -#include -#include +#include +#include +#include +#include +#include #include #include - -#include "ggml-qnn.h" +#include #include "ggml-backend-impl.h" +#include "ggml-qnn/backend-ops.hpp" +#include "ggml-qnn/backend.hpp" #include "ggml-qnn/logger.hpp" -#include "ggml-qnn/utils.hpp" #include "ggml-qnn/tensor.hpp" -#include "ggml-qnn/backend.hpp" -#include "ggml-qnn/backend-ops.hpp" +#include "ggml-qnn/utils.hpp" // ================================================================================================= // // forward declaration // // ================================================================================================= -static int free_qnn_tensor(Qnn_Tensor_t & tensor); +static int free_qnn_tensor(Qnn_Tensor_t &tensor); // ================================================================================================= // @@ -48,37 +48,25 @@ static int free_qnn_tensor(Qnn_Tensor_t & tensor); // // ================================================================================================= #ifdef NDEBUG -#define ENABLE_QNNBACKEND_PERF 0 // enable/disable op's perf info +#define ENABLE_QNNBACKEND_PERF 0 // enable/disable op's perf info #else -#define ENABLE_QNNBACKEND_PERF 1 // enable/disable op's perf info +#define ENABLE_QNNBACKEND_PERF 1 // enable/disable op's perf info #endif -#define QNN_BACKEND_NAME "qnn" +#define QNN_BACKEND_NAME "qnn" static struct qnn::qcom_socinfo g_qnn_soc_info_table[] = { - /* Qualcomm SnapDragon 8 Gen 1 */ - [qnn::SM8450] = { - .soc_model = qnn::SM8450, - .htp_arch = qnn::V69, - .vtcm_size_in_mb = 8}, - - /* Qualcomm SnapDragon 8 Gen 1+ */ - [qnn::SM8475] = { - .soc_model = qnn::SM8475, - .htp_arch = qnn::V69, - .vtcm_size_in_mb = 8}, - - /* Qualcomm SnapDragon 8 Gen 2 */ - [qnn::SM8550] = { - .soc_model = qnn::SM8550, - .htp_arch = qnn::V73, - .vtcm_size_in_mb = 8}, - - /* Qualcomm SnapDragon 8 Gen 3 */ - [qnn::SM8650] = { - .soc_model = qnn::SM8650, - .htp_arch = qnn::V75, - .vtcm_size_in_mb = 8}, + /* Qualcomm SnapDragon 8 Gen 1 */ + [qnn::SM8450] = { .soc_model = qnn::SM8450, .htp_arch = qnn::V69, .vtcm_size_in_mb = 8 }, + + /* Qualcomm SnapDragon 8 Gen 1+ */ + [qnn::SM8475] = { .soc_model = qnn::SM8475, .htp_arch = qnn::V69, .vtcm_size_in_mb = 8 }, + + /* 
Qualcomm SnapDragon 8 Gen 2 */ + [qnn::SM8550] = { .soc_model = qnn::SM8550, .htp_arch = qnn::V73, .vtcm_size_in_mb = 8 }, + + /* Qualcomm SnapDragon 8 Gen 3 */ + [qnn::SM8650] = { .soc_model = qnn::SM8650, .htp_arch = qnn::V75, .vtcm_size_in_mb = 8 }, }; @@ -96,52 +84,50 @@ static struct qnn::qcom_socinfo g_qnn_soc_info_table[] = { // HMX(Hexagon Matrix eXtensions)/HTA(Hexagon Tensor Accelerator) static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { - [QNN_BACKEND_CPU] = {.device = 0, - .threads = 1, - .name = "qnn-cpu", - .lib = "libQnnCpu.so", - .instance = nullptr, - .backend = nullptr, - .raw_interface = {}, - .raw_system_interface = {}, - .socinfo = {}}, - - [QNN_BACKEND_GPU] = {.device = 1, - .threads = 1, - .name = "qnn-gpu", - .lib = "libQnnGpu.so", - .instance = nullptr, - .backend = nullptr, - .raw_interface = {}, - .raw_system_interface = {}, - .socinfo = {}}, - - [QNN_BACKEND_NPU] = {.device = 2, - .threads = 1, - .name = "qnn-npu", - .lib = "libQnnHtp.so", - .instance = nullptr, - .backend = nullptr, - .raw_interface = {}, - .raw_system_interface = {}, - .socinfo = {}}, + [QNN_BACKEND_CPU] = { .device = 0, + .threads = 1, + .name = "qnn-cpu", + .lib = "libQnnCpu.so", + .instance = nullptr, + .backend = nullptr, + .raw_interface = {}, + .raw_system_interface = {}, + .socinfo = {} }, + + [QNN_BACKEND_GPU] = { .device = 1, + .threads = 1, + .name = "qnn-gpu", + .lib = "libQnnGpu.so", + .instance = nullptr, + .backend = nullptr, + .raw_interface = {}, + .raw_system_interface = {}, + .socinfo = {} }, + + [QNN_BACKEND_NPU] = { .device = 2, + .threads = 1, + .name = "qnn-npu", + .lib = "libQnnHtp.so", + .instance = nullptr, + .backend = nullptr, + .raw_interface = {}, + .raw_system_interface = {}, + .socinfo = {} }, }; struct ggml_backend_qnn_buffer_context { - ggml_backend_qnn_buffer_context(size_t device) - : device(device) - , name(QNN_BACKEND_NAME + std::to_string(device)) {} + ggml_backend_qnn_buffer_context(size_t device) : device(device), name(QNN_BACKEND_NAME + std::to_string(device)) {} ~ggml_backend_qnn_buffer_context() { if (buffer) { free(buffer); } - for (auto * sub_buffer : sub_buffers) { + for (auto *sub_buffer : sub_buffers) { free(sub_buffer); } - for (auto * qnn_tensor : qnn_tensors) { + for (auto *qnn_tensor : qnn_tensors) { free_qnn_tensor(*qnn_tensor); free(qnn_tensor); } @@ -149,19 +135,19 @@ struct ggml_backend_qnn_buffer_context { sub_buffers.clear(); qnn_tensors.clear(); } - void * buffer = nullptr; + void *buffer = nullptr; - struct ggml_backend_qnn_context * backend_ctx = nullptr; + struct ggml_backend_qnn_context *backend_ctx = nullptr; - size_t buffer_size = 0; - std::vector sub_buffers; + size_t buffer_size = 0; + std::vector sub_buffers; std::vector qnn_tensors; - size_t device; - std::string name; + size_t device; + std::string name; }; struct ggml_backend_qnn_buffer_type_context { - size_t device; + size_t device; std::string name; }; @@ -170,7 +156,7 @@ struct ggml_backend_qnn_buffer_type_context { // QNN backend internal helper functions // // ================================================================================================= -static size_t memscpy(void * dst, size_t dst_size, const void * src, size_t copy_size) { +static size_t memscpy(void *dst, size_t dst_size, const void *src, size_t copy_size) { if (!dst || !src || !dst_size || !copy_size) return 0; size_t min_size = dst_size < copy_size ? 
dst_size : copy_size; @@ -180,13 +166,12 @@ static size_t memscpy(void * dst, size_t dst_size, const void * src, size_t copy return min_size; } -static int deep_copy_qnn_tensors(Qnn_Tensor_t & src, Qnn_Tensor_t & dst) { +static int deep_copy_qnn_tensors(Qnn_Tensor_t &src, Qnn_Tensor_t &dst) { int err = 0; VALIDATE_TENSOR_VERSION(src, err); dst.version = src.version; - QNN_TENSOR_SET_NAME( - dst, ::strndup(QNN_TENSOR_GET_NAME(src),std::string(QNN_TENSOR_GET_NAME(src)).size())); + QNN_TENSOR_SET_NAME(dst, ::strndup(QNN_TENSOR_GET_NAME(src), std::string(QNN_TENSOR_GET_NAME(src)).size())); if (nullptr == QNN_TENSOR_GET_NAME(dst)) { return 1; } @@ -197,7 +182,7 @@ static int deep_copy_qnn_tensors(Qnn_Tensor_t & src, Qnn_Tensor_t & dst) { QNN_TENSOR_SET_MEM_TYPE(dst, QNN_TENSOR_GET_MEM_TYPE(src)); if (QNN_TENSOR_GET_MEM_TYPE(src) == QNN_TENSORMEMTYPE_RAW) { - Qnn_ClientBuffer_t client_buf = {nullptr, 0}; + Qnn_ClientBuffer_t client_buf = { nullptr, 0 }; QNN_TENSOR_SET_CLIENT_BUF(dst, client_buf); } else if (QNN_TENSOR_GET_MEM_TYPE(src) == QNN_TENSORMEMTYPE_MEMHANDLE) { QNN_TENSOR_SET_MEM_HANDLE(dst, nullptr); @@ -205,33 +190,29 @@ static int deep_copy_qnn_tensors(Qnn_Tensor_t & src, Qnn_Tensor_t & dst) { return 1; } - Qnn_QuantizeParams_t src_qparam = QNN_TENSOR_GET_QUANT_PARAMS(src); - Qnn_QuantizationEncoding_t encoding = src_qparam.quantizationEncoding; + Qnn_QuantizeParams_t src_qparam = QNN_TENSOR_GET_QUANT_PARAMS(src); + Qnn_QuantizationEncoding_t encoding = src_qparam.quantizationEncoding; if (encoding == QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET) { - Qnn_QuantizeParams_t src_qparam_cpy = src_qparam; - Qnn_AxisScaleOffset_t & axis_scale_offset = src_qparam_cpy.axisScaleOffsetEncoding; - Qnn_ScaleOffset_t ** scaleOffset = & axis_scale_offset.scaleOffset; - size_t scaleOffsetSize = axis_scale_offset.numScaleOffsets * sizeof(Qnn_ScaleOffset_t); - *scaleOffset = (Qnn_ScaleOffset_t *) malloc(scaleOffsetSize); - memscpy(*scaleOffset, scaleOffsetSize, - src_qparam.axisScaleOffsetEncoding.scaleOffset, - scaleOffsetSize); + Qnn_QuantizeParams_t src_qparam_cpy = src_qparam; + Qnn_AxisScaleOffset_t &axis_scale_offset = src_qparam_cpy.axisScaleOffsetEncoding; + Qnn_ScaleOffset_t **scaleOffset = &axis_scale_offset.scaleOffset; + size_t scaleOffsetSize = axis_scale_offset.numScaleOffsets * sizeof(Qnn_ScaleOffset_t); + *scaleOffset = (Qnn_ScaleOffset_t *)malloc(scaleOffsetSize); + memscpy(*scaleOffset, scaleOffsetSize, src_qparam.axisScaleOffsetEncoding.scaleOffset, scaleOffsetSize); QNN_TENSOR_SET_QUANT_PARAMS(dst, src_qparam_cpy); } else if (encoding == QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET) { - Qnn_QuantizeParams_t src_qparam_cpy = src_qparam; - Qnn_BwAxisScaleOffset_t & bwaxis_scale_offset = src_qparam_cpy.bwAxisScaleOffsetEncoding; - size_t scaleSize = bwaxis_scale_offset.numElements * sizeof(float); - float ** scales = &bwaxis_scale_offset.scales; - int32_t ** offsets = &bwaxis_scale_offset.offsets; - *scales = (float *) malloc(scaleSize); - memscpy(*scales, scaleSize, src_qparam.bwAxisScaleOffsetEncoding.scales, - scaleSize); + Qnn_QuantizeParams_t src_qparam_cpy = src_qparam; + Qnn_BwAxisScaleOffset_t &bwaxis_scale_offset = src_qparam_cpy.bwAxisScaleOffsetEncoding; + size_t scaleSize = bwaxis_scale_offset.numElements * sizeof(float); + float **scales = &bwaxis_scale_offset.scales; + int32_t **offsets = &bwaxis_scale_offset.offsets; + *scales = (float *)malloc(scaleSize); + memscpy(*scales, scaleSize, src_qparam.bwAxisScaleOffsetEncoding.scales, scaleSize); if 
(bwaxis_scale_offset.offsets != nullptr) { size_t offsetSize = bwaxis_scale_offset.numElements * sizeof(int32_t); - *offsets = (int32_t *) malloc(offsetSize); - memscpy(*offsets, offsetSize, - src_qparam.bwAxisScaleOffsetEncoding.offsets, offsetSize); + *offsets = (int32_t *)malloc(offsetSize); + memscpy(*offsets, offsetSize, src_qparam.bwAxisScaleOffsetEncoding.offsets, offsetSize); } QNN_TENSOR_SET_QUANT_PARAMS(dst, src_qparam_cpy); } else { @@ -240,12 +221,13 @@ static int deep_copy_qnn_tensors(Qnn_Tensor_t & src, Qnn_Tensor_t & dst) { uint32_t rank = QNN_TENSOR_GET_RANK(src); QNN_TENSOR_SET_RANK(dst, rank); - size_t dim_size = rank * sizeof(uint32_t); - uint32_t * dimensions = (uint32_t *) malloc(dim_size); + size_t dim_size = rank * sizeof(uint32_t); + uint32_t *dimensions = (uint32_t *)malloc(dim_size); if (dimensions == nullptr) { - QNN_LOG_WARN("deep_copy_qnn_tensors() allocation error while copying " - "tensor %s\n", - QNN_TENSOR_GET_NAME(src)); + QNN_LOG_WARN( + "deep_copy_qnn_tensors() allocation error while copying " + "tensor %s\n", + QNN_TENSOR_GET_NAME(src)); return 1; } memscpy(dimensions, dim_size, QNN_TENSOR_GET_DIMENSIONS(src), dim_size); @@ -254,11 +236,11 @@ static int deep_copy_qnn_tensors(Qnn_Tensor_t & src, Qnn_Tensor_t & dst) { return err; } -static int free_qnn_tensor(Qnn_Tensor_t & tensor) { +static int free_qnn_tensor(Qnn_Tensor_t &tensor) { int err = 0; VALIDATE_TENSOR_VERSION(tensor, err); - free((void *) QNN_TENSOR_GET_NAME(tensor)); + free((void *)QNN_TENSOR_GET_NAME(tensor)); free(QNN_TENSOR_GET_DIMENSIONS(tensor)); return err; @@ -269,15 +251,14 @@ static int free_qnn_tensor(Qnn_Tensor_t & tensor) { // implementation of QNN backend for GGML // // ================================================================================================= -static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context * ctx, - const struct ggml_tensor * tensor, +static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context *ctx, const struct ggml_tensor *tensor, bool b_dump_tensor_info) { if (ggml_is_empty(tensor) || !qnn::ggml_qnn_op_array()[tensor->op]) { return false; } - const struct ggml_tensor * src0 = tensor->src[0]; - const struct ggml_tensor * src1 = tensor->src[1]; + const struct ggml_tensor *src0 = tensor->src[0]; + const struct ggml_tensor *src1 = tensor->src[1]; if (nullptr == src0 || nullptr == src1) { return false; } @@ -304,7 +285,7 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context * ctx, return false; } - //TODO: support other quantized data type + // TODO: support other quantized data type if (ggml_is_quantized(src0->type)) { if (src0->type != GGML_TYPE_Q8_0 && src0->type != GGML_TYPE_Q4_0) { return false; @@ -313,15 +294,15 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context * ctx, if (tensor->op == GGML_OP_MUL_MAT) { if (ne00 <= 32 || ne01 <= 32 || ne10 <= 32 || ne11 <= 32) { - //comment it for make UT of mul_mat with QNN RPC happy - //return false; + // comment it for make UT of mul_mat with QNN RPC happy + // return false; } } return true; } -bool ggml_qnn_compute_forward(ggml_backend_qnn_context * ctx, struct ggml_tensor * tensor) { +bool ggml_qnn_compute_forward(ggml_backend_qnn_context *ctx, struct ggml_tensor *tensor) { auto func = qnn::ggml_qnn_op_array()[tensor->op]; if (!func) { QNN_LOG_WARN("unsupported op %d", tensor->op); @@ -332,7 +313,7 @@ bool ggml_qnn_compute_forward(ggml_backend_qnn_context * ctx, struct ggml_tensor return true; } -static const char * ggml_backend_qnn_buffer_get_name(ggml_backend_buffer_t buffer) 
{ +static const char *ggml_backend_qnn_buffer_get_name(ggml_backend_buffer_t buffer) { GGML_UNUSED(buffer); return "QNN"; } @@ -342,31 +323,28 @@ GGML_CALL static bool ggml_backend_buffer_is_qnn(ggml_backend_buffer_t buffer) { } GGML_CALL static void ggml_backend_qnn_buffer_free_buffer(ggml_backend_buffer_t buffer) { - ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *) buffer->context; + ggml_backend_qnn_buffer_context *ctx = (ggml_backend_qnn_buffer_context *)buffer->context; delete ctx; } -GGML_CALL static void * ggml_backend_qnn_buffer_get_base(ggml_backend_buffer_t buffer) { - ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *) buffer->context; +GGML_CALL static void *ggml_backend_qnn_buffer_get_base(ggml_backend_buffer_t buffer) { + ggml_backend_qnn_buffer_context *ctx = (ggml_backend_qnn_buffer_context *)buffer->context; return ctx->buffer; } -GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t buffer, - ggml_tensor * tensor) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *) buffer->context; +GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + ggml_backend_qnn_buffer_context *ctx = (ggml_backend_qnn_buffer_context *)buffer->context; - static int idx = 0; - char tensor_name[GGML_MAX_NAME] = {0}; + static int idx = 0; + char tensor_name[GGML_MAX_NAME] = { 0 }; snprintf(tensor_name, GGML_MAX_NAME, "tensor_%04d", idx++); - uint32_t dimensions[] = {(uint32_t) tensor->ne[0], (uint32_t) tensor->ne[1], - (uint32_t) tensor->ne[2], - (uint32_t) tensor->ne[3]}; - Qnn_DataType_t qnn_data_type = - qnn::datatype_from_ggml_datatype(tensor->type); + uint32_t dimensions[] = { (uint32_t)tensor->ne[0], (uint32_t)tensor->ne[1], (uint32_t)tensor->ne[2], + (uint32_t)tensor->ne[3] }; + Qnn_DataType_t qnn_data_type = qnn::datatype_from_ggml_datatype(tensor->type); Qnn_TensorType_t qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; if (tensor->flags & GGML_TENSOR_FLAG_INPUT) { @@ -381,25 +359,22 @@ GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t qnn_mem_type = QNN_TENSORMEMTYPE_MEMHANDLE; } - qnn_tensor = { - .version = QNN_TENSOR_VERSION_1, - {.v1 = {.id = 0, - .name = tensor_name, - .type = qnn_tensor_type, - .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, - .dataType = qnn_data_type, - .quantizeParams = - {QNN_DEFINITION_UNDEFINED, - QNN_QUANTIZATION_ENCODING_UNDEFINED, - {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, - .offset = 0}}}, - .rank = qnn::get_ggml_tensor_rank(tensor), - .dimensions = dimensions, - .memType = qnn_mem_type, - {.clientBuf = {.data = nullptr, .dataSize = 0}}}}}; - - Qnn_Tensor_t * p_qnn_tensor = - (Qnn_Tensor_t *)calloc(1, sizeof(Qnn_Tensor_t)); + qnn_tensor = { .version = QNN_TENSOR_VERSION_1, + { .v1 = { + .id = 0, + .name = tensor_name, + .type = qnn_tensor_type, + .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, + .dataType = qnn_data_type, + .quantizeParams = { QNN_DEFINITION_UNDEFINED, + QNN_QUANTIZATION_ENCODING_UNDEFINED, + { .scaleOffsetEncoding = { .scale = 0.0000000000000000f, .offset = 0 } } }, + .rank = qnn::get_ggml_tensor_rank(tensor), + .dimensions = dimensions, + .memType = qnn_mem_type, + { .clientBuf = { .data = nullptr, .dataSize = 0 } } } } }; + + Qnn_Tensor_t *p_qnn_tensor = (Qnn_Tensor_t *)calloc(1, sizeof(Qnn_Tensor_t)); if (nullptr == p_qnn_tensor) { QNN_LOG_WARN("calloc 
failed"); return; @@ -414,24 +389,21 @@ GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t ctx->qnn_tensors.push_back(p_qnn_tensor); } -GGML_CALL static void ggml_backend_qnn_buffer_set_tensor(ggml_backend_buffer_t buffer, - ggml_tensor * tensor, const void * data, - size_t offset, size_t size) { +GGML_CALL static void ggml_backend_qnn_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor, + const void *data, size_t offset, size_t size) { GGML_UNUSED(buffer); - memcpy((char *) tensor->data + offset, data, size); + memcpy((char *)tensor->data + offset, data, size); } -GGML_CALL static void ggml_backend_qnn_buffer_get_tensor(ggml_backend_buffer_t buffer, - const ggml_tensor * tensor, void * data, - size_t offset, size_t size) { +GGML_CALL static void ggml_backend_qnn_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor *tensor, + void *data, size_t offset, size_t size) { GGML_UNUSED(buffer); - memcpy(data, (const char *) tensor->data + offset, size); + memcpy(data, (const char *)tensor->data + offset, size); } -GGML_CALL static bool ggml_backend_qnn_buffer_cpy_tensor(ggml_backend_buffer_t buffer, - const struct ggml_tensor * src, - struct ggml_tensor * dst) { +GGML_CALL static bool ggml_backend_qnn_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor *src, + struct ggml_tensor *dst) { GGML_UNUSED(buffer); if (ggml_backend_buffer_is_host(src->buffer)) { memcpy(dst->data, src->data, ggml_nbytes(src)); @@ -442,7 +414,7 @@ GGML_CALL static bool ggml_backend_qnn_buffer_cpy_tensor(ggml_backend_buffer_t b } GGML_CALL static void ggml_backend_qnn_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { - ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *) buffer->context; + ggml_backend_qnn_buffer_context *ctx = (ggml_backend_qnn_buffer_context *)buffer->context; memset(ctx->buffer, value, ctx->buffer_size); } @@ -459,13 +431,11 @@ static ggml_backend_buffer_i ggml_backend_qnn_buffer_interface = { /* .reset = */ nullptr, }; -GGML_CALL static const char * ggml_backend_qnn_buffer_type_name(ggml_backend_buffer_type_t buft) { - return "QNN"; -} +GGML_CALL static const char *ggml_backend_qnn_buffer_type_name(ggml_backend_buffer_type_t buft) { return "QNN"; } -static void * ggml_qnn_host_malloc(size_t n) { - void * data = nullptr; - int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n); +static void *ggml_qnn_host_malloc(size_t n) { + void *data = nullptr; + int result = posix_memalign((void **)&data, sysconf(_SC_PAGESIZE), n); if (result != 0) { QNN_LOG_WARN("%s: error: posix_memalign failed\n", __func__); return nullptr; @@ -474,10 +444,10 @@ static void * ggml_qnn_host_malloc(size_t n) { return data; } -GGML_CALL static ggml_backend_buffer_t ggml_backend_qnn_buffer_type_alloc_buffer( - ggml_backend_buffer_type_t buft, size_t size) { - ggml_backend_qnn_buffer_type_context * buft_ctx = (ggml_backend_qnn_buffer_type_context *)buft->context; - ggml_backend_qnn_buffer_context * ctx = new ggml_backend_qnn_buffer_context(buft_ctx->device); +GGML_CALL static ggml_backend_buffer_t ggml_backend_qnn_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, + size_t size) { + ggml_backend_qnn_buffer_type_context *buft_ctx = (ggml_backend_qnn_buffer_type_context *)buft->context; + ggml_backend_qnn_buffer_context *ctx = new ggml_backend_qnn_buffer_context(buft_ctx->device); size_t size_page = sysconf(_SC_PAGESIZE); @@ -487,7 +457,7 @@ GGML_CALL static ggml_backend_buffer_t 
ggml_backend_qnn_buffer_type_alloc_buffer } // TODO:use pre-allocated buffer in internal memory pool - ctx->buffer = ggml_qnn_host_malloc(size_aligned); + ctx->buffer = ggml_qnn_host_malloc(size_aligned); ctx->buffer_size = size_aligned; ctx->backend_ctx = &g_qnn_mgr[buft_ctx->device]; @@ -497,11 +467,10 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_qnn_buffer_type_alloc_buffer return nullptr; } - return ggml_backend_buffer_init(buft, ggml_backend_qnn_buffer_interface,ctx, size); + return ggml_backend_buffer_init(buft, ggml_backend_qnn_buffer_interface, ctx, size); } -GGML_CALL static size_t ggml_backend_qnn_buffer_type_get_alignment( - ggml_backend_buffer_type_t buft) { +GGML_CALL static size_t ggml_backend_qnn_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { GGML_UNUSED(buft); return 32; } @@ -518,18 +487,16 @@ GGML_CALL static bool ggml_backend_qnn_buffer_is_host(ggml_backend_buffer_type_t return true; } -GGML_CALL static const char * ggml_backend_qnn_name(ggml_backend_t backend) { - return "QNN"; -} +GGML_CALL static const char *ggml_backend_qnn_name(ggml_backend_t backend) { return "QNN"; } GGML_CALL static void ggml_backend_qnn_free(ggml_backend_t backend) { QNN_LOG_INFO("enter %s", __func__); - ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; + ggml_backend_qnn_context *ctx = (ggml_backend_qnn_context *)backend->context; QNN_LOG_INFO("idx %d, name:%s", ctx->device, g_qnn_mgr[ctx->device].name); auto *instance = g_qnn_mgr[ctx->device].instance; if (instance != nullptr) { - for (const auto &graph_item: ctx->qnn_graph_map) { + for (const auto &graph_item : ctx->qnn_graph_map) { QNN_LOG_INFO("graph type:%s", graph_item.first.c_str()); } @@ -548,21 +515,20 @@ GGML_CALL static void ggml_backend_qnn_free(ggml_backend_t backend) { } GGML_CALL static ggml_backend_buffer_type_t ggml_backend_qnn_get_default_buffer_type(ggml_backend_t backend) { - ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; + ggml_backend_qnn_context *ctx = (ggml_backend_qnn_context *)backend->context; return ggml_backend_qnn_buffer_type(ctx->device); } -GGML_CALL static ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { - enum ggml_status result = GGML_STATUS_SUCCESS; - ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; +GGML_CALL static ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backend, ggml_cgraph *cgraph) { + enum ggml_status result = GGML_STATUS_SUCCESS; + ggml_backend_qnn_context *ctx = (ggml_backend_qnn_context *)backend->context; GGML_UNUSED(ctx); for (int i = 0; i < cgraph->n_nodes; i++) { - ggml_tensor * node = cgraph->nodes[i]; - if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || - node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || - node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { + ggml_tensor *node = cgraph->nodes[i]; + if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || + node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { continue; } bool ok = ggml_qnn_compute_forward(ctx, node); @@ -574,15 +540,14 @@ GGML_CALL static ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backe return result; } -GGML_CALL static bool ggml_backend_qnn_supports_op(ggml_backend_t backend, - const ggml_tensor * op) { - ggml_backend_qnn_context *ctx = (ggml_backend_qnn_context *) backend->context; +GGML_CALL static bool 
ggml_backend_qnn_supports_op(ggml_backend_t backend, const ggml_tensor *op) { + ggml_backend_qnn_context *ctx = (ggml_backend_qnn_context *)backend->context; return (ggml_qnn_can_handle_op(ctx, op, false)); } -GGML_CALL static bool ggml_backend_qnn_offload_op(ggml_backend_t backend,const ggml_tensor * tensor) { - ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; +GGML_CALL static bool ggml_backend_qnn_offload_op(ggml_backend_t backend, const ggml_tensor *tensor) { + ggml_backend_qnn_context *ctx = (ggml_backend_qnn_context *)backend->context; return ggml_qnn_can_handle_op(ctx, tensor, false); } @@ -611,21 +576,19 @@ static ggml_backend_i ggml_backend_qnn_interface = { }; static ggml_guid_t ggml_backend_qnn_guid() { - static ggml_guid guid = { - 0x1a, 0x2b, 0x3c, 0x4d, 0x5e, 0x6f, 0x70, 0x81, - 0x92, 0xa3, 0xb4, 0xc5, 0xd6, 0xe7, 0xf8, 0x09 - }; + static ggml_guid guid = { 0x1a, 0x2b, 0x3c, 0x4d, 0x5e, 0x6f, 0x70, 0x81, + 0x92, 0xa3, 0xb4, 0xc5, 0xd6, 0xe7, 0xf8, 0x09 }; return &guid; } -static ggml_backend_t ggml_backend_qnn_reg_init(const char * params, void * user_data) { +static ggml_backend_t ggml_backend_qnn_reg_init(const char *params, void *user_data) { if (nullptr == params) { // QNN library path // can be hardcoded to "/data/local/tmp/" for Android command line application // or specified in JNI layer for Android APK params = "/data/local/tmp/"; } - ggml_backend_t qnn_backend = ggml_backend_qnn_init((int) (intptr_t) user_data, params); + ggml_backend_t qnn_backend = ggml_backend_qnn_init((int)(intptr_t)user_data, params); return qnn_backend; } @@ -637,19 +600,15 @@ bool ggml_backend_is_qnn(ggml_backend_t backend) { void ggml_backend_qnn_set_n_threads(ggml_backend_t backend, int n_threads) { GGML_ASSERT(ggml_backend_is_qnn(backend)); - auto * ctx = (ggml_backend_qnn_context *) backend->context; + auto *ctx = (ggml_backend_qnn_context *)backend->context; ctx->threads = n_threads; } -const char * ggml_backend_qnn_get_name(ggml_backend_t backend) { - return backend->iface.get_name(backend); -} +const char *ggml_backend_qnn_get_name(ggml_backend_t backend) { return backend->iface.get_name(backend); } -int ggml_backend_qnn_get_device_count() { - return GGML_QNN_MAX_DEVICES; -} +int ggml_backend_qnn_get_device_count() { return GGML_QNN_MAX_DEVICES; } -void ggml_backend_qnn_get_device_description(size_t dev_num, char * description, size_t description_size) { +void ggml_backend_qnn_get_device_description(size_t dev_num, char *description, size_t description_size) { if (nullptr == description || 0 == description_size) { QNN_LOG_WARN("invalid param"); return; @@ -665,9 +624,10 @@ void ggml_backend_qnn_get_device_description(size_t dev_num, char * description, ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t device) { if (device >= GGML_QNN_MAX_DEVICES) { - QNN_LOG_DEBUG("ggml_backend_qnn_buffer_type error: device_index:%d is " - "out of range [0, %d]\n", - device, GGML_QNN_MAX_DEVICES - 1); + QNN_LOG_DEBUG( + "ggml_backend_qnn_buffer_type error: device_index:%d is " + "out of range [0, %d]\n", + device, GGML_QNN_MAX_DEVICES - 1); return nullptr; } @@ -676,17 +636,15 @@ ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t device) { static bool ggml_backend_qnn_buffer_type_initialized = false; if (!ggml_backend_qnn_buffer_type_initialized) { for (size_t i = 0; i < GGML_QNN_MAX_DEVICES; i++) { - auto & context = ggml_backend_qnn_buffer_type_contexts[i]; + auto &context = ggml_backend_qnn_buffer_type_contexts[i]; context = { i, 
std::string(QNN_BACKEND_NAME) + std::to_string(i) }; ggml_backend_qnn_buffer_types[i] = { - /* .iface = */ { - /* .get_name = */ ggml_backend_qnn_buffer_type_name, - /* .alloc_buffer = */ ggml_backend_qnn_buffer_type_alloc_buffer, - /* .get_alignment = */ ggml_backend_qnn_buffer_type_get_alignment, - /* .get_max_size = */ ggml_backend_qnn_buffer_type_get_max_size, - /* .get_alloc_size = */ nullptr, // defaults to ggml_nbytes - /* .is_host = */ ggml_backend_qnn_buffer_is_host - }, + /* .iface = */ { /* .get_name = */ ggml_backend_qnn_buffer_type_name, + /* .alloc_buffer = */ ggml_backend_qnn_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_qnn_buffer_type_get_alignment, + /* .get_max_size = */ ggml_backend_qnn_buffer_type_get_max_size, + /* .get_alloc_size = */ nullptr, // defaults to ggml_nbytes + /* .is_host = */ ggml_backend_qnn_buffer_is_host }, /* .context = */ &context, }; } @@ -702,7 +660,7 @@ ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t device) { * @param qnn_lib_path qnn library path, such as "/data/local/tmp/" on Android or specified in JNI layer * @return */ -ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { +ggml_backend_t ggml_backend_qnn_init(size_t device, const char *qnn_lib_path) { int result = 0; if (nullptr == qnn_lib_path) { @@ -729,9 +687,8 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { QNN_LOG_ERROR("QNN NPU backend setenv failure"); } if (0 == setenv("ADSP_LIBRARY_PATH", - (path + - ";/vendor/dsp/cdsp;/vendor/lib/rfsa/adsp;/system/lib/" - "rfsa/adsp;/vendor/dsp/dsp;/vendor/dsp/images;/dsp") + (path + ";/vendor/dsp/cdsp;/vendor/lib/rfsa/adsp;/system/lib/" + "rfsa/adsp;/vendor/dsp/dsp;/vendor/dsp/images;/dsp") .c_str(), 1)) { QNN_LOG_INFO("QNN NPU backend setenv successfully"); @@ -740,20 +697,16 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { } } else { if (0 == setenv("LD_LIBRARY_PATH", path.c_str(), 1)) { - QNN_LOG_INFO("%s backend setenv successfully\n", - qnn::get_backend_name(device)); + QNN_LOG_INFO("%s backend setenv successfully\n", qnn::get_backend_name(device)); } else { - QNN_LOG_ERROR("%s backend setenv failure\n", - qnn::get_backend_name(device)); + QNN_LOG_ERROR("%s backend setenv failure\n", qnn::get_backend_name(device)); } } auto *instance = new qnn::qnn_instance(qnn_lib_path, g_qnn_mgr[device].lib, ""); result = instance->qnn_init(nullptr); if (0 != result) { - QNN_LOG_WARN( - "init qnn subsystem failed with qnn backend %s, pls check why\n", - qnn::get_backend_name(device)); + QNN_LOG_WARN("init qnn subsystem failed with qnn backend %s, pls check why\n", qnn::get_backend_name(device)); delete instance; return nullptr; } @@ -766,15 +719,14 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { std::string device_name = qnn::get_backend_name(device); QNN_LOG_INFO("qnn device name %s", device_name.c_str()); - g_qnn_mgr[device].instance = instance; - g_qnn_mgr[device].raw_interface = instance->get_qnn_raw_interface(); + g_qnn_mgr[device].instance = instance; + g_qnn_mgr[device].raw_interface = instance->get_qnn_raw_interface(); g_qnn_mgr[device].raw_system_interface = instance->get_qnn_raw_system_interface(); - g_qnn_mgr[device].socinfo = instance->get_soc_info(); + g_qnn_mgr[device].socinfo = instance->get_soc_info(); - ggml_backend_t qnn_backend = - new ggml_backend{/* .guid = */ ggml_backend_qnn_guid(), - /* .iface = */ ggml_backend_qnn_interface, - /* .context = */ &g_qnn_mgr[device]}; + 
ggml_backend_t qnn_backend = new ggml_backend{ /* .guid = */ ggml_backend_qnn_guid(), + /* .iface = */ ggml_backend_qnn_interface, + /* .context = */ &g_qnn_mgr[device] }; g_qnn_mgr[device].backend = qnn_backend; return qnn_backend; @@ -786,9 +738,8 @@ GGML_CALL int ggml_backend_qnn_reg_devices() { for (size_t idx = 0; idx < GGML_QNN_MAX_DEVICES; idx++) { char name[GGML_MAX_NAME]; ggml_backend_qnn_get_device_description(idx, name, GGML_MAX_NAME); - ggml_backend_register(name, ggml_backend_qnn_reg_init, - ggml_backend_qnn_buffer_type(idx), - (void *) (intptr_t) idx); + ggml_backend_register(name, ggml_backend_qnn_reg_init, ggml_backend_qnn_buffer_type(idx), + (void *)(intptr_t)idx); } return GGML_QNN_MAX_DEVICES; diff --git a/ggml/src/ggml-qnn/utils.hpp b/ggml/src/ggml-qnn/utils.hpp index 673fb90e63de9..2d830f6786b7d 100644 --- a/ggml/src/ggml-qnn/utils.hpp +++ b/ggml/src/ggml-qnn/utils.hpp @@ -30,7 +30,7 @@ Fn load_qnn_functionpointers(void *handle, const char *function_name) { return reinterpret_cast(dlsym(handle, function_name)); } -inline int validate_tensor_version(Qnn_Tensor_t tensor) { +inline int validate_tensor_version(const Qnn_Tensor_t &tensor) { if (tensor.version != QNN_TENSOR_VERSION_1) { QNN_LOG_WARN("validate_tensor_version() tensor %s, got unsupported version %d\n", tensor.v1.name, tensor.version); From 0f2e68713cd9f0d8a8de6412ade139b4fdea82b4 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 5 Jul 2024 18:38:20 +0800 Subject: [PATCH 042/143] move tensor related function to utils --- ggml/src/ggml-qnn.cpp | 131 ++++-------------------------------- ggml/src/ggml-qnn/utils.cpp | 115 +++++++++++++++++++++++++++++++ ggml/src/ggml-qnn/utils.hpp | 17 ++--- 3 files changed, 134 insertions(+), 129 deletions(-) diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index a590dd5f56cfb..d6feea0437511 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -35,13 +35,6 @@ #include "ggml-qnn/tensor.hpp" #include "ggml-qnn/utils.hpp" -// ================================================================================================= -// -// forward declaration -// -// ================================================================================================= -static int free_qnn_tensor(Qnn_Tensor_t &tensor); - // ================================================================================================= // // self-defined macro / data structure @@ -128,7 +121,7 @@ struct ggml_backend_qnn_buffer_context { } for (auto *qnn_tensor : qnn_tensors) { - free_qnn_tensor(*qnn_tensor); + qnn::device_tensor_free(*qnn_tensor); free(qnn_tensor); } @@ -156,95 +149,6 @@ struct ggml_backend_qnn_buffer_type_context { // QNN backend internal helper functions // // ================================================================================================= -static size_t memscpy(void *dst, size_t dst_size, const void *src, size_t copy_size) { - if (!dst || !src || !dst_size || !copy_size) return 0; - - size_t min_size = dst_size < copy_size ? 
dst_size : copy_size; - - memcpy(dst, src, min_size); - - return min_size; -} - -static int deep_copy_qnn_tensors(Qnn_Tensor_t &src, Qnn_Tensor_t &dst) { - int err = 0; - VALIDATE_TENSOR_VERSION(src, err); - - dst.version = src.version; - QNN_TENSOR_SET_NAME(dst, ::strndup(QNN_TENSOR_GET_NAME(src), std::string(QNN_TENSOR_GET_NAME(src)).size())); - if (nullptr == QNN_TENSOR_GET_NAME(dst)) { - return 1; - } - QNN_TENSOR_SET_ID(dst, QNN_TENSOR_GET_ID(src)); - QNN_TENSOR_SET_TYPE(dst, QNN_TENSOR_GET_TYPE(src)); - QNN_TENSOR_SET_DATA_FORMAT(dst, QNN_TENSOR_GET_DATA_FORMAT(src)); - QNN_TENSOR_SET_DATA_TYPE(dst, QNN_TENSOR_GET_DATA_TYPE(src)); - QNN_TENSOR_SET_MEM_TYPE(dst, QNN_TENSOR_GET_MEM_TYPE(src)); - - if (QNN_TENSOR_GET_MEM_TYPE(src) == QNN_TENSORMEMTYPE_RAW) { - Qnn_ClientBuffer_t client_buf = { nullptr, 0 }; - QNN_TENSOR_SET_CLIENT_BUF(dst, client_buf); - } else if (QNN_TENSOR_GET_MEM_TYPE(src) == QNN_TENSORMEMTYPE_MEMHANDLE) { - QNN_TENSOR_SET_MEM_HANDLE(dst, nullptr); - } else { - return 1; - } - - Qnn_QuantizeParams_t src_qparam = QNN_TENSOR_GET_QUANT_PARAMS(src); - Qnn_QuantizationEncoding_t encoding = src_qparam.quantizationEncoding; - if (encoding == QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET) { - Qnn_QuantizeParams_t src_qparam_cpy = src_qparam; - Qnn_AxisScaleOffset_t &axis_scale_offset = src_qparam_cpy.axisScaleOffsetEncoding; - Qnn_ScaleOffset_t **scaleOffset = &axis_scale_offset.scaleOffset; - size_t scaleOffsetSize = axis_scale_offset.numScaleOffsets * sizeof(Qnn_ScaleOffset_t); - *scaleOffset = (Qnn_ScaleOffset_t *)malloc(scaleOffsetSize); - memscpy(*scaleOffset, scaleOffsetSize, src_qparam.axisScaleOffsetEncoding.scaleOffset, scaleOffsetSize); - QNN_TENSOR_SET_QUANT_PARAMS(dst, src_qparam_cpy); - } else if (encoding == QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET) { - Qnn_QuantizeParams_t src_qparam_cpy = src_qparam; - Qnn_BwAxisScaleOffset_t &bwaxis_scale_offset = src_qparam_cpy.bwAxisScaleOffsetEncoding; - size_t scaleSize = bwaxis_scale_offset.numElements * sizeof(float); - float **scales = &bwaxis_scale_offset.scales; - int32_t **offsets = &bwaxis_scale_offset.offsets; - *scales = (float *)malloc(scaleSize); - memscpy(*scales, scaleSize, src_qparam.bwAxisScaleOffsetEncoding.scales, scaleSize); - - if (bwaxis_scale_offset.offsets != nullptr) { - size_t offsetSize = bwaxis_scale_offset.numElements * sizeof(int32_t); - *offsets = (int32_t *)malloc(offsetSize); - memscpy(*offsets, offsetSize, src_qparam.bwAxisScaleOffsetEncoding.offsets, offsetSize); - } - QNN_TENSOR_SET_QUANT_PARAMS(dst, src_qparam_cpy); - } else { - QNN_TENSOR_SET_QUANT_PARAMS(dst, src_qparam); - } - - uint32_t rank = QNN_TENSOR_GET_RANK(src); - QNN_TENSOR_SET_RANK(dst, rank); - size_t dim_size = rank * sizeof(uint32_t); - uint32_t *dimensions = (uint32_t *)malloc(dim_size); - if (dimensions == nullptr) { - QNN_LOG_WARN( - "deep_copy_qnn_tensors() allocation error while copying " - "tensor %s\n", - QNN_TENSOR_GET_NAME(src)); - return 1; - } - memscpy(dimensions, dim_size, QNN_TENSOR_GET_DIMENSIONS(src), dim_size); - QNN_TENSOR_SET_DIMENSIONS(dst, dimensions); - - return err; -} - -static int free_qnn_tensor(Qnn_Tensor_t &tensor) { - int err = 0; - VALIDATE_TENSOR_VERSION(tensor, err); - - free((void *)QNN_TENSOR_GET_NAME(tensor)); - free(QNN_TENSOR_GET_DIMENSIONS(tensor)); - - return err; -} // ================================================================================================= // @@ -335,9 +239,14 @@ GGML_CALL static void *ggml_backend_qnn_buffer_get_base(ggml_backend_buffer_t bu } 
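
Note (reading aid, not part of the patch content): the hunk that continues below rewrites ggml_backend_qnn_buffer_init_tensor on top of the device_tensor_* helpers that this patch moves into ggml/src/ggml-qnn/utils.cpp. The sketch here only illustrates the intended lifecycle of those helpers; the signatures of qnn::device_tensor_init, qnn::device_tensor_deep_copy, qnn::device_tensor_free and qnn::datatype_from_ggml_datatype are taken from the hunks in this series, while the header path, example shape and tensor name are illustrative assumptions only.

    // Illustrative sketch, not code from the patch.
    #include <cstdlib>              // calloc / free
    #include "ggml.h"               // GGML_TYPE_F32
    #include "ggml-qnn/utils.hpp"   // assumed to declare the qnn::device_tensor_* helpers

    static Qnn_Tensor_t *make_persistent_tensor_sketch() {
        uint32_t dimensions[] = { 4, 4, 1, 1 };   // example shape, not taken from the patch

        // 1. Fill a short-lived descriptor; device_tensor_init only wires up the
        //    QNN_TENSOR_VERSION_1 fields, it does not allocate anything itself.
        Qnn_Tensor_t stack_tensor;
        qnn::device_tensor_init(stack_tensor, /* rank */ 4, QNN_TENSORMEMTYPE_RAW,
                                "tensor_example", QNN_TENSOR_TYPE_APP_WRITE,
                                qnn::datatype_from_ggml_datatype(GGML_TYPE_F32), dimensions);

        // 2. Deep-copy into a heap object: the copy duplicates the name and the
        //    dimension array, so it stays valid after the stack descriptor is gone.
        Qnn_Tensor_t *persistent = (Qnn_Tensor_t *)calloc(1, sizeof(Qnn_Tensor_t));
        if (!persistent) {
            return nullptr;
        }
        if (qnn::device_tensor_deep_copy(stack_tensor, *persistent) != QNN_SUCCESS) {
            free(persistent);
            return nullptr;
        }

        // 3. Callers eventually release it with:
        //    qnn::device_tensor_free(*persistent); free(persistent);
        return persistent;
    }

This is the same init / deep-copy / free-on-failure pattern the buffer code follows in the hunk below.
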
GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; ggml_backend_qnn_buffer_context *ctx = (ggml_backend_qnn_buffer_context *)buffer->context; + Qnn_Tensor_t *p_qnn_tensor = (Qnn_Tensor_t *)calloc(1, sizeof(Qnn_Tensor_t)); + if (!p_qnn_tensor) { + QNN_LOG_WARN("calloc failed"); + return; + } + static int idx = 0; char tensor_name[GGML_MAX_NAME] = { 0 }; snprintf(tensor_name, GGML_MAX_NAME, "tensor_%04d", idx++); @@ -352,39 +261,23 @@ GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t } else if (tensor->flags & GGML_TENSOR_FLAG_OUTPUT) { qnn_tensor_type = QNN_TENSOR_TYPE_APP_READ; } - Qnn_Tensor_t qnn_tensor = QNN_TENSOR_INIT; Qnn_TensorMemType_t qnn_mem_type = QNN_TENSORMEMTYPE_RAW; if (ctx->device == QNN_BACKEND_GPU) { qnn_mem_type = QNN_TENSORMEMTYPE_MEMHANDLE; } - qnn_tensor = { .version = QNN_TENSOR_VERSION_1, - { .v1 = { - .id = 0, - .name = tensor_name, - .type = qnn_tensor_type, - .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, - .dataType = qnn_data_type, - .quantizeParams = { QNN_DEFINITION_UNDEFINED, - QNN_QUANTIZATION_ENCODING_UNDEFINED, - { .scaleOffsetEncoding = { .scale = 0.0000000000000000f, .offset = 0 } } }, - .rank = qnn::get_ggml_tensor_rank(tensor), - .dimensions = dimensions, - .memType = qnn_mem_type, - { .clientBuf = { .data = nullptr, .dataSize = 0 } } } } }; + Qnn_Tensor_t qnn_tensor; + qnn::device_tensor_init(qnn_tensor, qnn::get_ggml_tensor_rank(tensor), qnn_mem_type, tensor_name, qnn_tensor_type, + qnn_data_type, dimensions); - Qnn_Tensor_t *p_qnn_tensor = (Qnn_Tensor_t *)calloc(1, sizeof(Qnn_Tensor_t)); - if (nullptr == p_qnn_tensor) { - QNN_LOG_WARN("calloc failed"); - return; - } - error = deep_copy_qnn_tensors(qnn_tensor, *p_qnn_tensor); + Qnn_ErrorHandle_t error = qnn::device_tensor_deep_copy(qnn_tensor, *p_qnn_tensor); if (error != QNN_SUCCESS) { free(p_qnn_tensor); QNN_LOG_WARN("init tensor failed"); return; } + tensor->extra = p_qnn_tensor; ctx->qnn_tensors.push_back(p_qnn_tensor); } diff --git a/ggml/src/ggml-qnn/utils.cpp b/ggml/src/ggml-qnn/utils.cpp index 2368b466c8187..89982449a8eba 100644 --- a/ggml/src/ggml-qnn/utils.cpp +++ b/ggml/src/ggml-qnn/utils.cpp @@ -5,6 +5,20 @@ #include "qnn-types.hpp" +namespace { + +size_t memscpy(void *dst, size_t dst_size, const void *src, size_t copy_size) { + if (!dst || !src || !dst_size || !copy_size) return 0; + + size_t min_size = dst_size < copy_size ? 
dst_size : copy_size; + + memcpy(dst, src, min_size); + + return min_size; +} + +} // namespace + namespace qnn { // TODO: mapping more ggml data type to QNN data type @@ -121,4 +135,105 @@ const char *opname_from_ggmlop(enum ggml_op ggmlop) { return nullptr; } +void device_tensor_init(Qnn_Tensor_t &tensor, uint32_t rank, Qnn_TensorMemType_t mem_type, const char *tensor_name, + Qnn_TensorType_t qnn_tensor_type, Qnn_DataType_t qnn_data_type, uint32_t *dimensions) { + tensor = QNN_TENSOR_INIT; + tensor = { .version = QNN_TENSOR_VERSION_1, + { .v1 = { .id = 0, + .name = tensor_name, + .type = qnn_tensor_type, + .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, + .dataType = qnn_data_type, + .quantizeParams = { QNN_DEFINITION_UNDEFINED, + QNN_QUANTIZATION_ENCODING_UNDEFINED, + { .scaleOffsetEncoding = { .scale = 0.0000000000000000f, .offset = 0 } } }, + .rank = rank, + .dimensions = dimensions, + .memType = mem_type, + { .clientBuf = {} } } } }; +} + +Qnn_ErrorHandle_t device_tensor_deep_copy(const Qnn_Tensor_t &src, Qnn_Tensor_t &dst) { + Qnn_ErrorHandle_t err = validate_tensor_version(src); + if (err != QNN_SUCCESS) { + QNN_LOG_WARN("validate_tensor_version expected QNN_SUCCESS\n"); + return err; + } + + dst.version = src.version; + QNN_TENSOR_SET_NAME(dst, ::strndup(QNN_TENSOR_GET_NAME(src), std::string(QNN_TENSOR_GET_NAME(src)).size())); + if (nullptr == QNN_TENSOR_GET_NAME(dst)) { + return (Qnn_ErrorHandle_t)1; + } + QNN_TENSOR_SET_ID(dst, QNN_TENSOR_GET_ID(src)); + QNN_TENSOR_SET_TYPE(dst, QNN_TENSOR_GET_TYPE(src)); + QNN_TENSOR_SET_DATA_FORMAT(dst, QNN_TENSOR_GET_DATA_FORMAT(src)); + QNN_TENSOR_SET_DATA_TYPE(dst, QNN_TENSOR_GET_DATA_TYPE(src)); + QNN_TENSOR_SET_MEM_TYPE(dst, QNN_TENSOR_GET_MEM_TYPE(src)); + + if (QNN_TENSOR_GET_MEM_TYPE(src) == QNN_TENSORMEMTYPE_RAW) { + Qnn_ClientBuffer_t client_buf = { nullptr, 0 }; + QNN_TENSOR_SET_CLIENT_BUF(dst, client_buf); + } else if (QNN_TENSOR_GET_MEM_TYPE(src) == QNN_TENSORMEMTYPE_MEMHANDLE) { + QNN_TENSOR_SET_MEM_HANDLE(dst, nullptr); + } else { + return (Qnn_ErrorHandle_t)1; + } + + Qnn_QuantizeParams_t src_qparam = QNN_TENSOR_GET_QUANT_PARAMS(src); + Qnn_QuantizationEncoding_t encoding = src_qparam.quantizationEncoding; + if (encoding == QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET) { + Qnn_QuantizeParams_t src_qparam_cpy = src_qparam; + Qnn_AxisScaleOffset_t &axis_scale_offset = src_qparam_cpy.axisScaleOffsetEncoding; + Qnn_ScaleOffset_t **scaleOffset = &axis_scale_offset.scaleOffset; + size_t scaleOffsetSize = axis_scale_offset.numScaleOffsets * sizeof(Qnn_ScaleOffset_t); + *scaleOffset = (Qnn_ScaleOffset_t *)malloc(scaleOffsetSize); + memscpy(*scaleOffset, scaleOffsetSize, src_qparam.axisScaleOffsetEncoding.scaleOffset, scaleOffsetSize); + QNN_TENSOR_SET_QUANT_PARAMS(dst, src_qparam_cpy); + } else if (encoding == QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET) { + Qnn_QuantizeParams_t src_qparam_cpy = src_qparam; + Qnn_BwAxisScaleOffset_t &bwaxis_scale_offset = src_qparam_cpy.bwAxisScaleOffsetEncoding; + size_t scaleSize = bwaxis_scale_offset.numElements * sizeof(float); + float **scales = &bwaxis_scale_offset.scales; + int32_t **offsets = &bwaxis_scale_offset.offsets; + *scales = (float *)malloc(scaleSize); + memscpy(*scales, scaleSize, src_qparam.bwAxisScaleOffsetEncoding.scales, scaleSize); + + if (bwaxis_scale_offset.offsets != nullptr) { + size_t offsetSize = bwaxis_scale_offset.numElements * sizeof(int32_t); + *offsets = (int32_t *)malloc(offsetSize); + memscpy(*offsets, offsetSize, src_qparam.bwAxisScaleOffsetEncoding.offsets, 
offsetSize); + } + QNN_TENSOR_SET_QUANT_PARAMS(dst, src_qparam_cpy); + } else { + QNN_TENSOR_SET_QUANT_PARAMS(dst, src_qparam); + } + + uint32_t rank = QNN_TENSOR_GET_RANK(src); + QNN_TENSOR_SET_RANK(dst, rank); + size_t dim_size = rank * sizeof(uint32_t); + uint32_t *dimensions = (uint32_t *)malloc(dim_size); + if (dimensions == nullptr) { + QNN_LOG_WARN( + "deep_copy_qnn_tensors() allocation error while copying " + "tensor %s\n", + QNN_TENSOR_GET_NAME(src)); + return (Qnn_ErrorHandle_t)1; + } + memscpy(dimensions, dim_size, QNN_TENSOR_GET_DIMENSIONS(src), dim_size); + QNN_TENSOR_SET_DIMENSIONS(dst, dimensions); + + return err; +} + +void device_tensor_free(Qnn_Tensor_t &tensor) { + if (validate_tensor_version(tensor) != QNN_SUCCESS) { + QNN_LOG_WARN("validate_tensor_version expected QNN_SUCCESS\n"); + return; + } + + free((void *)QNN_TENSOR_GET_NAME(tensor)); + free(QNN_TENSOR_GET_DIMENSIONS(tensor)); +} + } // namespace qnn diff --git a/ggml/src/ggml-qnn/utils.hpp b/ggml/src/ggml-qnn/utils.hpp index 2d830f6786b7d..aa824379a8d9b 100644 --- a/ggml/src/ggml-qnn/utils.hpp +++ b/ggml/src/ggml-qnn/utils.hpp @@ -169,6 +169,13 @@ inline void set_qnn_tensor_memhandle(Qnn_Tensor_t &tensor, Qnn_MemHandle_t handl } } +void device_tensor_init(Qnn_Tensor_t &tensor, uint32_t rank, Qnn_TensorMemType_t mem_type, const char *tensor_name, + Qnn_TensorType_t qnn_tensor_type, Qnn_DataType_t qnn_data_type, uint32_t *dimensions); + +Qnn_ErrorHandle_t device_tensor_deep_copy(const Qnn_Tensor_t &src, Qnn_Tensor_t &dst); + +void device_tensor_free(Qnn_Tensor_t &tensor); + #if ENABLE_QNNBACKEND_PERF class qnn_perf { public: @@ -206,15 +213,6 @@ class qnn_perf { } // namespace qnn -#define VALIDATE(value, status) \ - do { \ - status = value; \ - if (status != QNN_SUCCESS) { \ - QNN_LOG_WARN("%s expected QNN_SUCCESS\n", #value); \ - return status; \ - } \ - } while (0) - #define QNN_TENSOR_GET_ID(tensor) qnn::get_qnn_tensorid(tensor) #define QNN_TENSOR_GET_NAME(tensor) qnn::get_qnn_tensorname(tensor) #define QNN_TENSOR_GET_TYPE(tensor) qnn::get_qnn_tensortype(tensor) @@ -236,4 +234,3 @@ class qnn_perf { #define QNN_TENSOR_SET_MEM_TYPE(tensor, value) qnn::set_qnn_tensor_memtype(tensor, value) #define QNN_TENSOR_SET_CLIENT_BUF(tensor, value) qnn::set_qnn_tensor_clientbuf(tensor, value) #define QNN_TENSOR_SET_MEM_HANDLE(tensor, value) qnn::set_qnn_tensor_memhandle(tensor, value) -#define VALIDATE_TENSOR_VERSION(tensor, err) VALIDATE(qnn::validate_tensor_version(tensor), err) From 4b0f6b0cd6f24b16a2fc8022345161811c01bcc2 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 5 Jul 2024 19:34:56 +0800 Subject: [PATCH 043/143] add helper function to get Qnn_TensorType_t from ggml_tensor --- ggml/src/ggml-qnn.cpp | 16 ++++------------ ggml/src/ggml-qnn/tensor.hpp | 4 ++-- ggml/src/ggml-qnn/utils.cpp | 16 ++++++++++++++-- ggml/src/ggml-qnn/utils.hpp | 4 +++- 4 files changed, 23 insertions(+), 17 deletions(-) diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index d6feea0437511..632ce8ee5c19e 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -250,23 +250,15 @@ GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t static int idx = 0; char tensor_name[GGML_MAX_NAME] = { 0 }; snprintf(tensor_name, GGML_MAX_NAME, "tensor_%04d", idx++); - - uint32_t dimensions[] = { (uint32_t)tensor->ne[0], (uint32_t)tensor->ne[1], (uint32_t)tensor->ne[2], - (uint32_t)tensor->ne[3] }; - Qnn_DataType_t qnn_data_type = qnn::datatype_from_ggml_datatype(tensor->type); - Qnn_TensorType_t 
qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; - - if (tensor->flags & GGML_TENSOR_FLAG_INPUT) { - qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; - } else if (tensor->flags & GGML_TENSOR_FLAG_OUTPUT) { - qnn_tensor_type = QNN_TENSOR_TYPE_APP_READ; - } - + Qnn_DataType_t qnn_data_type = qnn::device_datatype_from_ggml_datatype(tensor->type); + Qnn_TensorType_t qnn_tensor_type = qnn::device_tensortype_from_ggml_tensor(tensor); Qnn_TensorMemType_t qnn_mem_type = QNN_TENSORMEMTYPE_RAW; if (ctx->device == QNN_BACKEND_GPU) { qnn_mem_type = QNN_TENSORMEMTYPE_MEMHANDLE; } + uint32_t dimensions[] = { (uint32_t)tensor->ne[0], (uint32_t)tensor->ne[1], (uint32_t)tensor->ne[2], + (uint32_t)tensor->ne[3] }; Qnn_Tensor_t qnn_tensor; qnn::device_tensor_init(qnn_tensor, qnn::get_ggml_tensor_rank(tensor), qnn_mem_type, tensor_name, qnn_tensor_type, qnn_data_type, dimensions); diff --git a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp index 0ec75c03f0e53..8a9196616fcae 100644 --- a/ggml/src/ggml-qnn/tensor.hpp +++ b/ggml/src/ggml-qnn/tensor.hpp @@ -17,7 +17,7 @@ class ggml_qnn_tensor_readwrite { ggml_backend_qnn_context *ctx) : _tensor(tensor), _qnn_tensor(reinterpret_cast(tensor->extra)), _context(ctx) { _old_dimensions = QNN_VER_PTR(*_qnn_tensor)->dimensions; - const auto qnn_data_type = datatype_from_ggml_datatype(tensor->type); + const auto qnn_data_type = device_datatype_from_ggml_datatype(tensor->type); const bool is_npu = ctx->device == QNN_BACKEND_NPU; QNN_VER_PTR(*_qnn_tensor)->type = _tensorType; if (is_npu) { @@ -67,7 +67,7 @@ class ggml_qnn_tensor_readwrite { ggml_backend_qnn_context *ctx) : _tensor(tensor), _qnn_tensor(qnn_tensor), _context(ctx) { _old_dimensions = QNN_VER_PTR(*_qnn_tensor)->dimensions; - const auto qnn_data_type = qnn::datatype_from_ggml_datatype(tensor->type); + const auto qnn_data_type = device_datatype_from_ggml_datatype(tensor->type); const bool is_npu = ctx->device == QNN_BACKEND_NPU; _dimensions[0] = (uint32_t)tensor->ne[0]; diff --git a/ggml/src/ggml-qnn/utils.cpp b/ggml/src/ggml-qnn/utils.cpp index 89982449a8eba..7c25314f731f0 100644 --- a/ggml/src/ggml-qnn/utils.cpp +++ b/ggml/src/ggml-qnn/utils.cpp @@ -23,8 +23,8 @@ namespace qnn { // TODO: mapping more ggml data type to QNN data type // ref:explanation of k-quants, https://github.com/ggerganov/llama.cpp/pull/1684 -Qnn_DataType_t datatype_from_ggml_datatype(enum ggml_type ggmltype) { - switch (ggmltype) { +Qnn_DataType_t device_datatype_from_ggml_datatype(ggml_type ggml_type) { + switch (ggml_type) { case GGML_TYPE_F16: return QNN_DATATYPE_FLOAT_16; case GGML_TYPE_F32: @@ -41,6 +41,18 @@ Qnn_DataType_t datatype_from_ggml_datatype(enum ggml_type ggmltype) { return QNN_DATATYPE_UNDEFINED; } +Qnn_TensorType_t device_tensortype_from_ggml_tensor(ggml_tensor *ggml_tensor) { + Qnn_TensorType_t qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; + + if (ggml_tensor->flags & GGML_TENSOR_FLAG_INPUT) { + qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; + } else if (ggml_tensor->flags & GGML_TENSOR_FLAG_OUTPUT) { + qnn_tensor_type = QNN_TENSOR_TYPE_APP_READ; + } + + return qnn_tensor_type; +} + uint32_t get_ggml_tensor_rank(const ggml_tensor *tensor) { uint32_t rank = 0; for (int i = 0; i < GGML_MAX_DIMS; i++) { diff --git a/ggml/src/ggml-qnn/utils.hpp b/ggml/src/ggml-qnn/utils.hpp index aa824379a8d9b..87d908f1e15fb 100644 --- a/ggml/src/ggml-qnn/utils.hpp +++ b/ggml/src/ggml-qnn/utils.hpp @@ -15,7 +15,6 @@ namespace qnn { -Qnn_DataType_t datatype_from_ggml_datatype(enum ggml_type ggmltype); uint32_t get_ggml_tensor_rank(const 
ggml_tensor *tensor); const char *get_backend_name(int n_backend_type); const char *get_chipset_desc(uint32_t chipset_id); @@ -169,6 +168,9 @@ inline void set_qnn_tensor_memhandle(Qnn_Tensor_t &tensor, Qnn_MemHandle_t handl } } +Qnn_DataType_t device_datatype_from_ggml_datatype(ggml_type ggml_type); +Qnn_TensorType_t device_tensortype_from_ggml_tensor(ggml_tensor *ggml_tensor); + void device_tensor_init(Qnn_Tensor_t &tensor, uint32_t rank, Qnn_TensorMemType_t mem_type, const char *tensor_name, Qnn_TensorType_t qnn_tensor_type, Qnn_DataType_t qnn_data_type, uint32_t *dimensions); From 263ffa962ea3ebfceaa1f9f52c24c67930fbface Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 5 Jul 2024 23:07:27 +0800 Subject: [PATCH 044/143] small opt of the qnn graph config init --- ggml/src/ggml-qnn/graph.hpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-qnn/graph.hpp b/ggml/src/ggml-qnn/graph.hpp index 1aad145c32896..651fc1c5301ec 100644 --- a/ggml/src/ggml-qnn/graph.hpp +++ b/ggml/src/ggml-qnn/graph.hpp @@ -54,9 +54,9 @@ class ggml_qnn_graph { graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; graph_vtcm_config.customConfig = &vtcm_config; - const QnnGraph_Config_t *p_graphconfig[] = { &graph_hvx_config, &graph_dlbc_config, &graph_vtcm_config, + const QnnGraph_Config_t *graph_configs[] = { &graph_hvx_config, &graph_dlbc_config, &graph_vtcm_config, &graph_opt_config, nullptr }; - error = qnn_interface.graphCreate(qnn_context, graph_name.c_str(), p_graphconfig, &graph_handle); + error = qnn_interface.graphCreate(qnn_context, graph_name.c_str(), graph_configs, &graph_handle); } else { error = qnn_interface.graphCreate(qnn_context, graph_name.c_str(), nullptr, &graph_handle); } @@ -67,10 +67,9 @@ class ggml_qnn_graph { "error = %d\n", graph_name.c_str(), error); return; - } else { - QNN_LOG_INFO("create qnn graph handle with graph name %s ok\n", graph_name.c_str()); } + QNN_LOG_INFO("create qnn graph handle with graph name %s ok\n", graph_name.c_str()); _graph_handle = graph_handle; } From 874216b9c887ebbde5eba476d51b3c4db1e1f3a5 Mon Sep 17 00:00:00 2001 From: Hongrui Chen Date: Sun, 7 Jul 2024 22:32:43 +0800 Subject: [PATCH 045/143] remove unused members --- ggml/src/ggml-qnn.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index 632ce8ee5c19e..d4d9e2cd5d202 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -116,16 +116,11 @@ struct ggml_backend_qnn_buffer_context { free(buffer); } - for (auto *sub_buffer : sub_buffers) { - free(sub_buffer); - } - for (auto *qnn_tensor : qnn_tensors) { qnn::device_tensor_free(*qnn_tensor); free(qnn_tensor); } - sub_buffers.clear(); qnn_tensors.clear(); } void *buffer = nullptr; @@ -133,7 +128,6 @@ struct ggml_backend_qnn_buffer_context { struct ggml_backend_qnn_context *backend_ctx = nullptr; size_t buffer_size = 0; - std::vector sub_buffers; std::vector qnn_tensors; size_t device; std::string name; From 5f2e3918f6ac0d597ec5004180814fd14edfed97 Mon Sep 17 00:00:00 2001 From: Hongrui Chen Date: Sun, 7 Jul 2024 23:51:12 +0800 Subject: [PATCH 046/143] refactoring ggml_qnn_tensor --- ggml/src/ggml-qnn.cpp | 67 +++------ ggml/src/ggml-qnn/backend-ops.cpp | 157 ++++++++++++--------- ggml/src/ggml-qnn/backend.hpp | 6 +- ggml/src/ggml-qnn/graph.hpp | 18 +++ ggml/src/ggml-qnn/qnn-types.hpp | 2 - ggml/src/ggml-qnn/qnn.hpp | 15 +- ggml/src/ggml-qnn/tensor.hpp | 224 ++++++++++++++++++++---------- ggml/src/ggml-qnn/utils.hpp | 8 ++ 8 files changed, 293 
insertions(+), 204 deletions(-) diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index d4d9e2cd5d202..3584c41120ae6 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -1,5 +1,6 @@ #include "ggml-qnn.h" +#include #include #include #include @@ -81,7 +82,6 @@ static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { .threads = 1, .name = "qnn-cpu", .lib = "libQnnCpu.so", - .instance = nullptr, .backend = nullptr, .raw_interface = {}, .raw_system_interface = {}, @@ -91,7 +91,6 @@ static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { .threads = 1, .name = "qnn-gpu", .lib = "libQnnGpu.so", - .instance = nullptr, .backend = nullptr, .raw_interface = {}, .raw_system_interface = {}, @@ -101,7 +100,6 @@ static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { .threads = 1, .name = "qnn-npu", .lib = "libQnnHtp.so", - .instance = nullptr, .backend = nullptr, .raw_interface = {}, .raw_system_interface = {}, @@ -112,23 +110,16 @@ struct ggml_backend_qnn_buffer_context { ggml_backend_qnn_buffer_context(size_t device) : device(device), name(QNN_BACKEND_NAME + std::to_string(device)) {} ~ggml_backend_qnn_buffer_context() { + tensors.clear(); if (buffer) { free(buffer); } - - for (auto *qnn_tensor : qnn_tensors) { - qnn::device_tensor_free(*qnn_tensor); - free(qnn_tensor); - } - - qnn_tensors.clear(); } - void *buffer = nullptr; + void *buffer = nullptr; struct ggml_backend_qnn_context *backend_ctx = nullptr; - + std::list> tensors; size_t buffer_size = 0; - std::vector qnn_tensors; size_t device; std::string name; }; @@ -235,37 +226,14 @@ GGML_CALL static void *ggml_backend_qnn_buffer_get_base(ggml_backend_buffer_t bu GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor) { ggml_backend_qnn_buffer_context *ctx = (ggml_backend_qnn_buffer_context *)buffer->context; - Qnn_Tensor_t *p_qnn_tensor = (Qnn_Tensor_t *)calloc(1, sizeof(Qnn_Tensor_t)); - if (!p_qnn_tensor) { - QNN_LOG_WARN("calloc failed"); - return; - } - - static int idx = 0; - char tensor_name[GGML_MAX_NAME] = { 0 }; - snprintf(tensor_name, GGML_MAX_NAME, "tensor_%04d", idx++); - Qnn_DataType_t qnn_data_type = qnn::device_datatype_from_ggml_datatype(tensor->type); - Qnn_TensorType_t qnn_tensor_type = qnn::device_tensortype_from_ggml_tensor(tensor); - Qnn_TensorMemType_t qnn_mem_type = QNN_TENSORMEMTYPE_RAW; - if (ctx->device == QNN_BACKEND_GPU) { - qnn_mem_type = QNN_TENSORMEMTYPE_MEMHANDLE; - } - - uint32_t dimensions[] = { (uint32_t)tensor->ne[0], (uint32_t)tensor->ne[1], (uint32_t)tensor->ne[2], - (uint32_t)tensor->ne[3] }; - Qnn_Tensor_t qnn_tensor; - qnn::device_tensor_init(qnn_tensor, qnn::get_ggml_tensor_rank(tensor), qnn_mem_type, tensor_name, qnn_tensor_type, - qnn_data_type, dimensions); - - Qnn_ErrorHandle_t error = qnn::device_tensor_deep_copy(qnn_tensor, *p_qnn_tensor); - if (error != QNN_SUCCESS) { - free(p_qnn_tensor); - QNN_LOG_WARN("init tensor failed"); + auto instance = ctx->backend_ctx->instance; + auto qnn_tensor = std::make_unique(tensor, (QNNBackend)(ctx->device), instance); + if (!qnn_tensor->is_valid()) { + QNN_LOG_WARN("Create ggml_qnn_tensor failed"); return; } - tensor->extra = p_qnn_tensor; - ctx->qnn_tensors.push_back(p_qnn_tensor); + ctx->tensors.push_back(std::move(qnn_tensor)); } GGML_CALL static void ggml_backend_qnn_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor, @@ -373,17 +341,16 @@ GGML_CALL static void ggml_backend_qnn_free(ggml_backend_t backend) { 
ggml_backend_qnn_context *ctx = (ggml_backend_qnn_context *)backend->context; QNN_LOG_INFO("idx %d, name:%s", ctx->device, g_qnn_mgr[ctx->device].name); - auto *instance = g_qnn_mgr[ctx->device].instance; - if (instance != nullptr) { - for (const auto &graph_item : ctx->qnn_graph_map) { + auto instance = g_qnn_mgr[ctx->device].instance; + if (instance) { + for (const auto &graph_item : ctx->qnn_binary_graph_cache) { QNN_LOG_INFO("graph type:%s", graph_item.first.c_str()); } - ctx->qnn_graph_map.clear(); + ctx->qnn_binary_graph_cache.clear(); instance->qnn_finalize(); - delete instance; - g_qnn_mgr[ctx->device].instance = nullptr; + g_qnn_mgr[ctx->device].instance.reset(); } if (g_qnn_mgr[ctx->device].backend != nullptr) { @@ -582,17 +549,15 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char *qnn_lib_path) { } } - auto *instance = new qnn::qnn_instance(qnn_lib_path, g_qnn_mgr[device].lib, ""); + auto instance = std::make_shared(qnn_lib_path, g_qnn_mgr[device].lib, ""); result = instance->qnn_init(nullptr); - if (0 != result) { + if (result != 0) { QNN_LOG_WARN("init qnn subsystem failed with qnn backend %s, pls check why\n", qnn::get_backend_name(device)); - delete instance; return nullptr; } auto qnn_interface = instance->get_qnn_interface(); if (!qnn_interface.is_loaded()) { QNN_LOG_WARN("qnn subsystem failure\n"); - delete instance; return nullptr; } diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 79e280fcbe088..1914e64dcff27 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -23,10 +23,10 @@ bool qnn_is_valid_params(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, return false; } - auto *instance = ctx->instance; - auto *tensor0 = src0->extra; - auto *tensor1 = src1->extra; - auto *tensor2 = dst->extra; + auto instance = ctx->instance; + auto *tensor0 = qnn::ggml_qnn_tensor::from_ggml_tensor(src0); + auto *tensor1 = qnn::ggml_qnn_tensor::from_ggml_tensor(src1); + auto *tensor2 = qnn::ggml_qnn_tensor::from_ggml_tensor(dst); if (!instance || !tensor0 || !tensor1 || !tensor2) { QNN_LOG_WARN("invalid tensors\n"); return false; @@ -35,6 +35,80 @@ bool qnn_is_valid_params(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, return true; } +template +bool qnn_bind_tensors_to_graph(qnn::ggml_qnn_graph<_InputSize, _OutputSize> *graph, const std::string &op_name, + const std::array &inputs, + const std::array &outputs) { + std::array qnn_input_tensors; + for (size_t i = 0; i < inputs.size(); ++i) { + auto tensor = qnn::ggml_qnn_tensor::from_ggml_tensor(inputs[i]); + if (!tensor || !tensor->bind_to_graph(*graph)) { + return false; + } + + qnn_input_tensors[i] = tensor->get_qnn_tensor(); + } + + std::array qnn_output_tensors; + for (size_t i = 0; i < outputs.size(); ++i) { + auto tensor = qnn::ggml_qnn_tensor::from_ggml_tensor(outputs[i]); + if (!tensor || !tensor->bind_to_graph(*graph)) { + return false; + } + + qnn_output_tensors[i] = tensor->get_qnn_tensor(); + } + + if (!graph->add_nodes(op_name, qnn_input_tensors, qnn_output_tensors)) { + return false; + } + + return true; +} + +template +bool write_to_qnn_tensors(const std::array &inputs) { + for (auto &input : inputs) { + auto tensor = qnn::ggml_qnn_tensor::from_ggml_tensor(input); + if (!tensor || !tensor->write_to_qnn_tensor()) { + return false; + } + } + + return true; +} + +template +bool read_from_qnn_tensors(const std::array &outputs) { + for (auto &output : outputs) { + auto tensor = qnn::ggml_qnn_tensor::from_ggml_tensor(output); 
+ if (!tensor || !tensor->read_from_qnn_tensor()) { + return false; + } + } + + return true; +} + +template +bool execute_graph(qnn::ggml_qnn_graph<_InputSize, _OutputSize> *graph, + const std::array &inputs, + const std::array &outputs) { + if (!write_to_qnn_tensors<_InputSize>(inputs)) { + return false; + } + + if (!graph->execute()) { + return false; + } + + if (!read_from_qnn_tensors<_OutputSize>(outputs)) { + return false; + } + + return true; +} + } // namespace #ifndef NDEBUG @@ -61,13 +135,10 @@ static void ggml_qnn_add(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, bool succeed = false; std::string graph_key(ggml_op_name(GGML_OP_ADD)); - auto it = ctx->qnn_graph_map.find(graph_key); - if (it != ctx->qnn_graph_map.end()) { - const auto &graph_item = it->second; - qnn::ggml_qnn_tensor_input tensor_input0(src0, std::get<1>(graph_item), ctx); - qnn::ggml_qnn_tensor_input tensor_input1(src1, std::get<2>(graph_item), ctx); - qnn::ggml_qnn_tensor_output tensor_output(dst, std::get<3>(graph_item), ctx); - std::get<0>(graph_item)->execute(); + auto it = ctx->qnn_binary_graph_cache.find(graph_key); + qnn::ggml_qnn_graph_binary *graph_ptr = nullptr; + if (it != ctx->qnn_binary_graph_cache.end()) { + graph_ptr = it->second.get(); } else { graph_name = graph_name + "_" + std::to_string(ctx->threads) + "_" + src0->name + "_" + src1->name; auto graph = std::make_unique(graph_name, (QNNBackend)(ctx->device), @@ -78,34 +149,15 @@ static void ggml_qnn_add(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, goto failure; } - qnn::ggml_qnn_tensor_input tensor_input0(src0, graph->get_graph_handler(), ctx); - if (!tensor_input0.is_valid()) { - goto failure; - } - qnn::ggml_qnn_tensor_input tensor_input1(src1, graph->get_graph_handler(), ctx); - if (!tensor_input1.is_valid()) { - goto failure; - } - qnn::ggml_qnn_tensor_output tensor_output(dst, graph->get_graph_handler(), ctx); - if (!tensor_output.is_valid()) { + if (!qnn_bind_tensors_to_graph<2, 1>(graph.get(), QNN_OP_ELEMENT_WISE_ADD, { src0, src1 }, { dst })) { goto failure; } - if (!graph->add_nodes(QNN_OP_ELEMENT_WISE_ADD, - { *tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor() }, - { *tensor_output.get_qnn_tensor() })) { - goto failure; - } - - if (!graph->execute()) { - goto failure; - } - - ctx->qnn_graph_map[graph_key] = std::make_tuple(std::move(graph), tensor_input0.get_qnn_tensor(), - tensor_input1.get_qnn_tensor(), tensor_output.get_qnn_tensor()); + graph_ptr = graph.get(); + ctx->qnn_binary_graph_cache[graph_key] = std::move(graph); } - succeed = true; + succeed = execute_graph<2, 1>(graph_ptr, { src0, src1 }, { dst }); failure: if (!succeed) { @@ -143,13 +195,10 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context *ctx, const ggml_tensor *s bool succeed = false; std::string graph_key(ggml_op_name(GGML_OP_MUL_MAT)); - auto it = ctx->qnn_graph_map.find(graph_key); - if (it != ctx->qnn_graph_map.end()) { - const auto &graph_item = it->second; - qnn::ggml_qnn_tensor_input tensor_input0(src0, std::get<1>(graph_item), ctx); - qnn::ggml_qnn_tensor_input tensor_input1(src1, std::get<2>(graph_item), ctx); - qnn::ggml_qnn_tensor_output tensor_output(dst, std::get<3>(graph_item), ctx); - std::get<0>(graph_item)->execute(); + auto it = ctx->qnn_binary_graph_cache.find(graph_key); + qnn::ggml_qnn_graph_binary *graph_ptr = nullptr; + if (it != ctx->qnn_binary_graph_cache.end()) { + graph_ptr = it->second.get(); } else { graph_name = graph_name + "_" + std::to_string(ctx->threads) + "_" + src0->name + "_" + src1->name; 
auto graph = std::make_unique(graph_name, (QNNBackend)(ctx->device), @@ -160,33 +209,15 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context *ctx, const ggml_tensor *s goto failure; } - qnn::ggml_qnn_tensor_input tensor_input0(src0, graph->get_graph_handler(), ctx); - if (!tensor_input0.is_valid()) { - goto failure; - } - qnn::ggml_qnn_tensor_input tensor_input1(src1, graph->get_graph_handler(), ctx); - if (!tensor_input1.is_valid()) { - goto failure; - } - qnn::ggml_qnn_tensor_output tensor_output(dst, graph->get_graph_handler(), ctx); - if (!tensor_output.is_valid()) { - goto failure; - } - - if (!graph->add_nodes(QNN_OP_MAT_MUL, { *tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor() }, - { *tensor_output.get_qnn_tensor() })) { - goto failure; - } - - if (!graph->execute()) { + if (!qnn_bind_tensors_to_graph<2, 1>(graph.get(), QNN_OP_MAT_MUL, { src0, src1 }, { dst })) { goto failure; } - ctx->qnn_graph_map[graph_key] = std::make_tuple(std::move(graph), tensor_input0.get_qnn_tensor(), - tensor_input1.get_qnn_tensor(), tensor_output.get_qnn_tensor()); + graph_ptr = graph.get(); + ctx->qnn_binary_graph_cache[graph_key] = std::move(graph); } - succeed = true; + succeed = execute_graph<2, 1>(graph_ptr, { src0, src1 }, { dst }); failure: if (!succeed) { diff --git a/ggml/src/ggml-qnn/backend.hpp b/ggml/src/ggml-qnn/backend.hpp index d60b334c0b2b5..48b243577ca1f 100644 --- a/ggml/src/ggml-qnn/backend.hpp +++ b/ggml/src/ggml-qnn/backend.hpp @@ -16,12 +16,10 @@ struct ggml_backend_qnn_context { int threads; char name[GGML_MAX_NAME]; char lib[GGML_MAX_NAME]; - qnn::qnn_instance *instance; + std::shared_ptr instance; ggml_backend *backend; QNN_INTERFACE_VER_TYPE raw_interface; QNN_SYSTEM_INTERFACE_VER_TYPE raw_system_interface; qnn::qcom_socinfo socinfo; - std::unordered_map, Qnn_Tensor_t *, - Qnn_Tensor_t *, Qnn_Tensor_t *>> - qnn_graph_map; + std::unordered_map> qnn_binary_graph_cache; }; diff --git a/ggml/src/ggml-qnn/graph.hpp b/ggml/src/ggml-qnn/graph.hpp index 651fc1c5301ec..6f9628cbd7739 100644 --- a/ggml/src/ggml-qnn/graph.hpp +++ b/ggml/src/ggml-qnn/graph.hpp @@ -73,6 +73,22 @@ class ggml_qnn_graph { _graph_handle = graph_handle; } + bool create_graph_tensor(Qnn_Tensor_t &tensor) { + if (!is_valid()) { + QNN_LOG_ERROR("Invalid graph\n"); + return false; + } + + auto err = _qnn_interface.tensorCreateGraphTensor(_graph_handle, &tensor); + if (err != QNN_SUCCESS) { + QNN_LOG_INFO("error = %d\n", err); + QNN_LOG_DEBUG("tensor%p name %s", _qnn_tensor, QNN_TENSOR_GET_NAME(*_qnn_tensor)); + return false; + } + + return true; + } + bool add_nodes(const std::string &op_name, const input_tensor_array_t &tensor_inputs, const output_tensor_array_t &tensor_outputs) { if (!is_valid()) { @@ -124,6 +140,8 @@ class ggml_qnn_graph { Qnn_GraphHandle_t get_graph_handler() const { return _graph_handle; } + const std::string &get_name() const { return _graph_name; } + private: const std::string _graph_name; const QNNBackend _device; diff --git a/ggml/src/ggml-qnn/qnn-types.hpp b/ggml/src/ggml-qnn/qnn-types.hpp index 7c245651032c0..58ca8648b0b03 100644 --- a/ggml/src/ggml-qnn/qnn-types.hpp +++ b/ggml/src/ggml-qnn/qnn-types.hpp @@ -49,7 +49,5 @@ using pfn_qnninterface_getproviders = decltype(QnnInterface_getProviders); using pfn_qnnsysteminterface_getproviders = decltype(QnnSystemInterface_getProviders); } // namespace qnn -#define QNN_VER_PTR(x) (&((x).v1)) // TODO: remove this macro after we have a separate header for QNN - #define RPCMEM_DEFAULT_FLAGS 1 #define RPCMEM_HEAP_ID_SYSTEM 25 diff 
--git a/ggml/src/ggml-qnn/qnn.hpp b/ggml/src/ggml-qnn/qnn.hpp index 26465c96a0793..400ce005bfe2b 100644 --- a/ggml/src/ggml-qnn/qnn.hpp +++ b/ggml/src/ggml-qnn/qnn.hpp @@ -637,20 +637,20 @@ class qnn_instance { return 3; } - if (is_rpcmem_registered((QNN_VER_PTR(*p_tensor)->memHandle))) { - QNN_LOG_WARN("tensor %s has been registered shared memory\n", (QNN_VER_PTR(*p_tensor)->name)); + if (is_rpcmem_registered(QNN_TENSOR_GET_MEM_HANDLE(*p_tensor))) { + QNN_LOG_WARN("tensor %s has been registered shared memory\n", QNN_TENSOR_GET_NAME(*p_tensor)); return 4; } int32_t mem_fd = rpcmem_to_fd(p_data); - if (-1 == mem_fd) { + if (mem_fd == -1) { QNN_LOG_WARN("failed to get file descriptor\n"); return 5; } QNN_LOG_INFO("mem_fd %d\n", mem_fd); - Qnn_MemDescriptor_t descriptor = { { QNN_VER_PTR(*p_tensor)->rank, QNN_VER_PTR(*p_tensor)->dimensions, + Qnn_MemDescriptor_t descriptor = { { QNN_TENSOR_GET_RANK(*p_tensor), QNN_TENSOR_GET_DIMENSIONS(*p_tensor), nullptr }, - QNN_VER_PTR(*p_tensor)->dataType, + QNN_TENSOR_GET_DATA_TYPE(*p_tensor), QNN_MEM_TYPE_ION, { { mem_fd } } }; Qnn_MemHandle_t handle = nullptr; @@ -662,9 +662,10 @@ class qnn_instance { strerror(error)); return 6; } else { - QNN_LOG_INFO("tensor %s successfully register shared memory\n", (QNN_VER_PTR(*p_tensor)->name)); + QNN_LOG_INFO("tensor %s successfully register shared memory\n", QNN_TENSOR_GET_NAME(*p_tensor)); } - QNN_VER_PTR(*p_tensor)->memHandle = handle; + + QNN_TENSOR_SET_MEM_HANDLE(*p_tensor, handle); _qnn_mem_set.insert((std::pair(p_data, handle))); return 0; diff --git a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp index 8a9196616fcae..335aafe533d0d 100644 --- a/ggml/src/ggml-qnn/tensor.hpp +++ b/ggml/src/ggml-qnn/tensor.hpp @@ -1,127 +1,197 @@ #pragma once +#include +#include +#include + #include "ggml-qnn.h" #include "QnnTensor.h" #include "System/QnnSystemInterface.h" #include "backend.hpp" +#include "graph.hpp" #include "qnn.hpp" +#include "utils.hpp" namespace qnn { -template -class ggml_qnn_tensor_readwrite { +class ggml_qnn_tensor { public: - explicit ggml_qnn_tensor_readwrite(const ggml_tensor *tensor, Qnn_GraphHandle_t graph_handle, - ggml_backend_qnn_context *ctx) : - _tensor(tensor), _qnn_tensor(reinterpret_cast(tensor->extra)), _context(ctx) { - _old_dimensions = QNN_VER_PTR(*_qnn_tensor)->dimensions; - const auto qnn_data_type = device_datatype_from_ggml_datatype(tensor->type); - const bool is_npu = ctx->device == QNN_BACKEND_NPU; - QNN_VER_PTR(*_qnn_tensor)->type = _tensorType; - if (is_npu) { - QNN_VER_PTR(*_qnn_tensor)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; - QNN_VER_PTR(*_qnn_tensor)->clientBuf = { .data = nullptr, .dataSize = 0 }; + static ggml_qnn_tensor *from_ggml_tensor(const ggml_tensor *tensor) { + if (!tensor) { + return nullptr; } - auto err = ctx->raw_interface.tensorCreateGraphTensor(graph_handle, _qnn_tensor); - if (err != QNN_SUCCESS) { - QNN_LOG_INFO("error = %d\n", err); - QNN_LOG_DEBUG("tensor%p name %s", _qnn_tensor, QNN_TENSOR_GET_NAME(*_qnn_tensor)); - _context = nullptr; - return; + return static_cast(tensor->extra); + } + + explicit ggml_qnn_tensor(ggml_tensor *tensor, QNNBackend device, std::shared_ptr qnn_instance) : + _tensor(tensor), _device(device), _qnn_instance(qnn_instance) { + _tensor_name = ggml_get_name(tensor); + if (_tensor_name.empty()) { + static std::atomic_uint32_t unnamed_tensor_count = 0; + char buffer[GGML_MAX_NAME] = {}; + snprintf(buffer, sizeof(buffer), "unnamed_%p", unnamed_tensor_count++); + _tensor_name = buffer; } + 
QNN_TENSOR_SET_NAME(_qnn_tensor, _tensor_name.c_str()); _dimensions[0] = (uint32_t)tensor->ne[0]; _dimensions[1] = (uint32_t)tensor->ne[1]; _dimensions[2] = (uint32_t)tensor->ne[2]; _dimensions[3] = (uint32_t)tensor->ne[3]; - QNN_VER_PTR(*_qnn_tensor)->dimensions = _dimensions; - QNN_VER_PTR(*_qnn_tensor)->rank = qnn::get_ggml_tensor_rank(tensor); - QNN_VER_PTR(*_qnn_tensor)->dataType = qnn_data_type; - + QNN_TENSOR_SET_DIMENSIONS(_qnn_tensor, _dimensions); + QNN_TENSOR_SET_TYPE(_qnn_tensor, device_tensortype_from_ggml_tensor(tensor)); + QNN_TENSOR_SET_DATA_FORMAT(_qnn_tensor, QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER); + QNN_TENSOR_SET_DATA_TYPE(_qnn_tensor, device_datatype_from_ggml_datatype(tensor->type)); + // TODO: set the quantizeParams base on the tensor type + QNN_TENSOR_SET_RANK(_qnn_tensor, qnn::get_ggml_tensor_rank(tensor)); + + const bool is_npu = device == QNN_BACKEND_NPU; if (is_npu) { - auto *instance = ctx->instance; - uint8_t *qnn_buffer = static_cast(instance->alloc_rpcmem(ggml_nbytes(tensor), alignof(void *))); - if (!qnn_buffer) { - QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); - QNN_LOG_DEBUG("tensor%p name %s", _qnn_tensor, QNN_TENSOR_GET_NAME(*_qnn_tensor)); - _context = nullptr; - // No free for _qnn_tensor, because it's not registered. - return; + QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_MEMHANDLE); + QNN_TENSOR_SET_MEM_HANDLE(_qnn_tensor, nullptr); + } else { + QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW); + Qnn_ClientBuffer_t client_buf = { tensor->data, get_ggml_tensor_data_size(tensor) }; + QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf); + } + + tensor->extra = this; + } + + template + bool bind_to_graph(ggml_qnn_graph<_InputSize, _OutputSize> &graph) { + if (!is_valid()) { + QNN_LOG_WARN("tensor %s not valid", _tensor_name.c_str()); + return false; + } + + if (_graph_handle) { + if (_graph_handle != graph.get_graph_handler()) { + QNN_LOG_WARN("tensor %s has been bound to another graph", _tensor_name.c_str()); + return false; } else { - QNN_LOG_INFO("alloc rpcmem successfully\n"); + QNN_LOG_INFO("tensor %s already bound to same graph %s", _tensor_name.c_str(), + graph.get_name().c_str()); + return true; } + } - instance->register_rpcmem(qnn_buffer, _qnn_tensor); - if (_tensorType == QNN_TENSOR_TYPE_APP_WRITE || _tensorType == QNN_TENSOR_TYPE_APP_READWRITE) { - memcpy(qnn_buffer, tensor->data, ggml_nbytes(tensor)); - } - } else { - QNN_VER_PTR(*_qnn_tensor)->clientBuf = { tensor->data, get_ggml_tensor_data_size(tensor) }; + Qnn_Tensor_t tensor = _qnn_tensor; + if (!graph.create_graph_tensor(tensor)) { + QNN_LOG_WARN("create graph tensor failed, tensor %s", _tensor_name.c_str()); + return false; } + + if (!alloc_rpc_mem()) { + QNN_LOG_WARN("alloc rpc mem failed, tensor %s", _tensor_name.c_str()); + return false; + } + + QNN_TENSOR_SET_ID(_qnn_tensor, QNN_TENSOR_GET_ID(tensor)); + _graph_handle = graph.get_graph_handler(); + return true; } - explicit ggml_qnn_tensor_readwrite(const ggml_tensor *tensor, Qnn_Tensor_t *qnn_tensor, - ggml_backend_qnn_context *ctx) : - _tensor(tensor), _qnn_tensor(qnn_tensor), _context(ctx) { - _old_dimensions = QNN_VER_PTR(*_qnn_tensor)->dimensions; - const auto qnn_data_type = device_datatype_from_ggml_datatype(tensor->type); - const bool is_npu = ctx->device == QNN_BACKEND_NPU; + bool write_to_qnn_tensor() { + if (!is_valid()) { + QNN_LOG_WARN("tensor %s not valid", _tensor_name.c_str()); + return false; + } - _dimensions[0] = (uint32_t)tensor->ne[0]; - _dimensions[1] = 
(uint32_t)tensor->ne[1]; - _dimensions[2] = (uint32_t)tensor->ne[2]; - _dimensions[3] = (uint32_t)tensor->ne[3]; - QNN_VER_PTR(*_qnn_tensor)->dimensions = _dimensions; - QNN_VER_PTR(*_qnn_tensor)->rank = get_ggml_tensor_rank(tensor); - QNN_VER_PTR(*_qnn_tensor)->dataType = qnn_data_type; + auto tensor_type = QNN_TENSOR_GET_TYPE(_qnn_tensor); + if (tensor_type != QNN_TENSOR_TYPE_APP_WRITE && tensor_type != QNN_TENSOR_TYPE_APP_READWRITE) { + QNN_LOG_WARN("tensor %s not writable", _tensor_name.c_str()); + return false; + } - if (is_npu) { - uint8_t *qnn_buffer = - static_cast(ctx->instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*_qnn_tensor)->memHandle)); + if (should_use_mem_handle()) { + uint8_t *qnn_buffer = static_cast( + _qnn_instance->get_rpcmem_from_memhandle(QNN_TENSOR_GET_MEM_HANDLE(_qnn_tensor))); if (qnn_buffer) { - memcpy(qnn_buffer, tensor->data, ggml_nbytes(tensor)); + memcpy(qnn_buffer, _tensor->data, ggml_nbytes(_tensor)); } else { QNN_LOG_WARN("can't find rpcmem from qnn mem handle\n"); - QNN_LOG_DEBUG("tensor%p name %s", _qnn_tensor, QNN_TENSOR_GET_NAME(*_qnn_tensor)); - _context = nullptr; - return; + return false; } - } else { - QNN_VER_PTR(*_qnn_tensor)->clientBuf = { tensor->data, get_ggml_tensor_data_size(tensor) }; } + + // For CPU and GPU, the data is already in the tensor. + return true; } - ~ggml_qnn_tensor_readwrite() { - if ((_tensorType == QNN_TENSOR_TYPE_APP_READWRITE || _tensorType == QNN_TENSOR_TYPE_APP_READ) && _context && - _context->device == QNN_BACKEND_NPU) { + bool read_from_qnn_tensor() { + if (!is_valid()) { + QNN_LOG_WARN("tensor %s not valid", _tensor_name.c_str()); + return false; + } + + auto tensor_type = QNN_TENSOR_GET_TYPE(_qnn_tensor); + if (tensor_type != QNN_TENSOR_TYPE_APP_READ && tensor_type != QNN_TENSOR_TYPE_APP_READWRITE) { + QNN_LOG_WARN("tensor %s not readable", _tensor_name.c_str()); + return false; + } + + if (should_use_mem_handle()) { uint8_t *qnn_buffer = static_cast( - _context->instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*_qnn_tensor)->memHandle)); - memcpy(_tensor->data, qnn_buffer, ggml_nbytes(_tensor)); + _qnn_instance->get_rpcmem_from_memhandle(QNN_TENSOR_GET_MEM_HANDLE(_qnn_tensor))); + if (qnn_buffer) { + memcpy(_tensor->data, qnn_buffer, ggml_nbytes(_tensor)); + } else { + QNN_LOG_WARN("can't find rpcmem from qnn mem handle\n"); + return false; + } } - QNN_VER_PTR(*_qnn_tensor)->dimensions = _old_dimensions; + // For CPU and GPU, the data is already in the tensor. 
+ return true; } - bool is_valid() const { return _context; } - Qnn_Tensor_t *get_qnn_tensor() const { return _qnn_tensor; } + bool is_valid() const { return _tensor; } + const Qnn_Tensor_t &get_qnn_tensor() const { return _qnn_tensor; } private: + bool alloc_rpc_mem() { + if (!should_use_mem_handle()) { + return true; + } + + uint8_t *qnn_buffer = + static_cast(_qnn_instance->alloc_rpcmem(ggml_nbytes(_tensor), alignof(void *))); + if (!qnn_buffer) { + QNN_LOG_WARN("alloc rpc mem failure, %s\n", strerror(errno)); + QNN_LOG_DEBUG("tensor name %s", _tensor_name.c_str()); + return false; + } else { + QNN_LOG_INFO("alloc rpcmem successfully\n"); + } + + auto error = _qnn_instance->register_rpcmem(qnn_buffer, &_qnn_tensor); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("register rpc mem failure, %d\n", (int)error); + QNN_LOG_DEBUG("tensor name %s", _tensor_name.c_str()); + return false; + } + + return true; + } + + bool should_use_mem_handle() const { return _device == QNN_BACKEND_NPU; } + const ggml_tensor *_tensor; - Qnn_Tensor_t *_qnn_tensor; - ggml_backend_qnn_context *_context; - uint32_t *_old_dimensions; + QNNBackend _device; + std::shared_ptr _qnn_instance; + Qnn_Tensor_t _qnn_tensor = QNN_TENSOR_INIT; uint32_t _dimensions[4] = {}; + std::string _tensor_name; + Qnn_GraphHandle_t _graph_handle = nullptr; - ggml_qnn_tensor_readwrite(const ggml_qnn_tensor_readwrite &) = delete; - void operator=(const ggml_qnn_tensor_readwrite &) = delete; - ggml_qnn_tensor_readwrite(ggml_qnn_tensor_readwrite &&) = delete; - void operator=(ggml_qnn_tensor_readwrite &&) = delete; + ggml_qnn_tensor(const ggml_qnn_tensor &) = delete; + void operator=(const ggml_qnn_tensor &) = delete; + ggml_qnn_tensor(ggml_qnn_tensor &&) = delete; + void operator=(ggml_qnn_tensor &&) = delete; }; -using ggml_qnn_tensor_output = ggml_qnn_tensor_readwrite; -using ggml_qnn_tensor_input = ggml_qnn_tensor_readwrite; - } // namespace qnn diff --git a/ggml/src/ggml-qnn/utils.hpp b/ggml/src/ggml-qnn/utils.hpp index 87d908f1e15fb..84cd8354e2d59 100644 --- a/ggml/src/ggml-qnn/utils.hpp +++ b/ggml/src/ggml-qnn/utils.hpp @@ -102,6 +102,13 @@ inline Qnn_TensorMemType_t get_qnn_tensor_memtype(const Qnn_Tensor_t &tensor) { return QNN_TENSORMEMTYPE_UNDEFINED; } +inline Qnn_MemHandle_t get_qnn_tensor_memhandle(const Qnn_Tensor_t &tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.memHandle; + } + return nullptr; +} + inline void set_qnn_tensor_id(Qnn_Tensor_t &tensor, uint32_t id) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.id = id; @@ -224,6 +231,7 @@ class qnn_perf { #define QNN_TENSOR_GET_RANK(tensor) qnn::get_qnn_tensor_rank(tensor) #define QNN_TENSOR_GET_DIMENSIONS(tensor) qnn::get_qnn_tensor_dimensions(tensor) #define QNN_TENSOR_GET_MEM_TYPE(tensor) qnn::get_qnn_tensor_memtype(tensor) +#define QNN_TENSOR_GET_MEM_HANDLE(tensor) qnn::get_qnn_tensor_memhandle(tensor) #define QNN_TENSOR_SET_ID(tensor, value) qnn::set_qnn_tensor_id(tensor, value) #define QNN_TENSOR_SET_NAME(tensor, value) qnn::set_qnn_tensor_name(tensor, value) From af869fd636fdfe0656dd94b4d9fc9d6f254207ea Mon Sep 17 00:00:00 2001 From: hongruichen Date: Tue, 9 Jul 2024 23:21:55 +0800 Subject: [PATCH 047/143] fix compiling error in debug build --- ggml/src/ggml-qnn/graph.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-qnn/graph.hpp b/ggml/src/ggml-qnn/graph.hpp index 6f9628cbd7739..01c44fe374eef 100644 --- a/ggml/src/ggml-qnn/graph.hpp +++ b/ggml/src/ggml-qnn/graph.hpp @@ -82,7 +82,7 @@ class 
ggml_qnn_graph { auto err = _qnn_interface.tensorCreateGraphTensor(_graph_handle, &tensor); if (err != QNN_SUCCESS) { QNN_LOG_INFO("error = %d\n", err); - QNN_LOG_DEBUG("tensor%p name %s", _qnn_tensor, QNN_TENSOR_GET_NAME(*_qnn_tensor)); + QNN_LOG_DEBUG("tensor%p name %s", &tensor, QNN_TENSOR_GET_NAME(tensor)); return false; } From a7be0693ba1645ec1cf32bc13117229dde668e86 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Tue, 9 Jul 2024 20:35:58 +0800 Subject: [PATCH 048/143] add log --- ggml/src/ggml-qnn/tensor.hpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp index 335aafe533d0d..e023fb7fc0157 100644 --- a/ggml/src/ggml-qnn/tensor.hpp +++ b/ggml/src/ggml-qnn/tensor.hpp @@ -11,6 +11,7 @@ #include "System/QnnSystemInterface.h" #include "backend.hpp" #include "graph.hpp" +#include "logger.hpp" #include "qnn.hpp" #include "utils.hpp" @@ -59,6 +60,7 @@ class ggml_qnn_tensor { } tensor->extra = this; + QNN_LOG_DEBUG("create tensor %s with device %d", _tensor_name.c_str(), device); } template @@ -92,6 +94,8 @@ class ggml_qnn_tensor { QNN_TENSOR_SET_ID(_qnn_tensor, QNN_TENSOR_GET_ID(tensor)); _graph_handle = graph.get_graph_handler(); + + QNN_LOG_DEBUG("bind tensor %s to graph %s", _tensor_name.c_str(), graph.get_name().c_str()); return true; } @@ -164,10 +168,10 @@ class ggml_qnn_tensor { QNN_LOG_WARN("alloc rpc mem failure, %s\n", strerror(errno)); QNN_LOG_DEBUG("tensor name %s", _tensor_name.c_str()); return false; - } else { - QNN_LOG_INFO("alloc rpcmem successfully\n"); } + QNN_LOG_INFO("tensor %s: alloc rpcmem(%p) successfully\n", _tensor_name.c_str(), qnn_buffer); + auto error = _qnn_instance->register_rpcmem(qnn_buffer, &_qnn_tensor); if (error != QNN_SUCCESS) { QNN_LOG_WARN("register rpc mem failure, %d\n", (int)error); From 9add256efee043640952127c947efedc014fc79e Mon Sep 17 00:00:00 2001 From: hongruichen Date: Wed, 10 Jul 2024 00:31:13 +0800 Subject: [PATCH 049/143] use helper function instead --- ggml/src/ggml-qnn/tensor.hpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp index e023fb7fc0157..e966e638bee1f 100644 --- a/ggml/src/ggml-qnn/tensor.hpp +++ b/ggml/src/ggml-qnn/tensor.hpp @@ -49,8 +49,7 @@ class ggml_qnn_tensor { // TODO: set the quantizeParams base on the tensor type QNN_TENSOR_SET_RANK(_qnn_tensor, qnn::get_ggml_tensor_rank(tensor)); - const bool is_npu = device == QNN_BACKEND_NPU; - if (is_npu) { + if (should_use_mem_handle()) { QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_MEMHANDLE); QNN_TENSOR_SET_MEM_HANDLE(_qnn_tensor, nullptr); } else { @@ -171,7 +170,7 @@ class ggml_qnn_tensor { } QNN_LOG_INFO("tensor %s: alloc rpcmem(%p) successfully\n", _tensor_name.c_str(), qnn_buffer); - + auto error = _qnn_instance->register_rpcmem(qnn_buffer, &_qnn_tensor); if (error != QNN_SUCCESS) { QNN_LOG_WARN("register rpc mem failure, %d\n", (int)error); From dc7d83e121e48b5df6af52986723ad8f4946846d Mon Sep 17 00:00:00 2001 From: hongruichen Date: Wed, 10 Jul 2024 00:33:23 +0800 Subject: [PATCH 050/143] add log --- ggml/src/ggml-qnn/logger.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-qnn/logger.cpp b/ggml/src/ggml-qnn/logger.cpp index 8b29979224866..5a1ad13ba40ce 100644 --- a/ggml/src/ggml-qnn/logger.cpp +++ b/ggml/src/ggml-qnn/logger.cpp @@ -67,7 +67,7 @@ void qnn::sdk_logcallback(const char *fmt, QnnLog_Level_t level, uint64_t timest memset(s_ggml_qnn_logbuf, 0, 
QNN_LOGBUF_LEN); vsnprintf(reinterpret_cast(s_ggml_qnn_logbuf), QNN_LOGBUF_LEN, fmt, argp); - QNN_LOG_INFO("%8.1fms [%-7s] %s\n", ms, log_level_desc, s_ggml_qnn_logbuf); + QNN_LOG_INFO("%8.1fms [%-7s] %s", ms, log_level_desc, s_ggml_qnn_logbuf); } #endif } From e97d3a6c48941451b7281766371092f6987b285e Mon Sep 17 00:00:00 2001 From: hongruichen Date: Wed, 10 Jul 2024 11:56:01 +0800 Subject: [PATCH 051/143] fix tensor buffer allocation add log commit qnn buffer after changed add log register_rpc_mem 2 times update input tensors before graph finalize default to QNN_TENSORMEMTYPE_RAW set new tensors at execute move write input tensors to exec check if mem registered before actual do register rpc mem once allocated --- ggml/src/ggml-qnn/backend-ops.cpp | 49 ++++++++++---------- ggml/src/ggml-qnn/graph.hpp | 6 ++- ggml/src/ggml-qnn/qnn.hpp | 3 +- ggml/src/ggml-qnn/tensor.hpp | 76 +++++++++++++++++++------------ 4 files changed, 77 insertions(+), 57 deletions(-) diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 1914e64dcff27..bafe5ca160e66 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -66,44 +66,43 @@ bool qnn_bind_tensors_to_graph(qnn::ggml_qnn_graph<_InputSize, _OutputSize> *gra return true; } -template -bool write_to_qnn_tensors(const std::array &inputs) { - for (auto &input : inputs) { - auto tensor = qnn::ggml_qnn_tensor::from_ggml_tensor(input); +template +bool execute_graph(qnn::ggml_qnn_graph<_InputSize, _OutputSize> *graph, + const std::array &inputs, + const std::array &outputs) { + + std::array qnn_input_tensors; + for (size_t i = 0; i < inputs.size(); ++i) { + auto tensor = qnn::ggml_qnn_tensor::from_ggml_tensor(inputs[i]); if (!tensor || !tensor->write_to_qnn_tensor()) { + QNN_LOG_WARN("write_to_qnn_tensor failed\n"); return false; } - } - return true; -} + qnn_input_tensors[i] = tensor->get_qnn_tensor(); + } -template -bool read_from_qnn_tensors(const std::array &outputs) { - for (auto &output : outputs) { - auto tensor = qnn::ggml_qnn_tensor::from_ggml_tensor(output); - if (!tensor || !tensor->read_from_qnn_tensor()) { + std::array qnn_output_tensors; + for (size_t i = 0; i < outputs.size(); ++i) { + auto tensor = qnn::ggml_qnn_tensor::from_ggml_tensor(outputs[i]); + if (!tensor) { return false; } - } - - return true; -} -template -bool execute_graph(qnn::ggml_qnn_graph<_InputSize, _OutputSize> *graph, - const std::array &inputs, - const std::array &outputs) { - if (!write_to_qnn_tensors<_InputSize>(inputs)) { - return false; + qnn_output_tensors[i] = tensor->get_qnn_tensor(); } - if (!graph->execute()) { + if (!graph->execute(qnn_input_tensors, qnn_output_tensors)) { + QNN_LOG_WARN("execute failed\n"); return false; } - if (!read_from_qnn_tensors<_OutputSize>(outputs)) { - return false; + for (auto &output : outputs) { + auto tensor = qnn::ggml_qnn_tensor::from_ggml_tensor(output); + if (!tensor || !tensor->read_from_qnn_tensor()) { + QNN_LOG_WARN("read_from_qnn_tensors failed\n"); + return false; + } } return true; diff --git a/ggml/src/ggml-qnn/graph.hpp b/ggml/src/ggml-qnn/graph.hpp index 01c44fe374eef..cb04b1efda0fc 100644 --- a/ggml/src/ggml-qnn/graph.hpp +++ b/ggml/src/ggml-qnn/graph.hpp @@ -96,6 +96,7 @@ class ggml_qnn_graph { return false; } + QNN_LOG_DEBUG("graph name %s, add_nodes start", _graph_name.c_str()); _tensor_inputs = tensor_inputs; _tensor_outputs = tensor_outputs; @@ -116,10 +117,13 @@ class ggml_qnn_graph { return false; } + QNN_LOG_DEBUG("graph name %s, add_nodes succeed", 
_graph_name.c_str()); return true; } - bool execute() { + bool execute(const input_tensor_array_t &tensor_inputs, const output_tensor_array_t &tensor_outputs) { + _tensor_inputs = tensor_inputs; + _tensor_outputs = tensor_outputs; auto error = _qnn_interface.graphExecute(_graph_handle, _tensor_inputs.data(), _tensor_inputs.size(), _tensor_outputs.data(), _tensor_outputs.size(), nullptr, nullptr); if (_device == QNN_BACKEND_NPU) { diff --git a/ggml/src/ggml-qnn/qnn.hpp b/ggml/src/ggml-qnn/qnn.hpp index 400ce005bfe2b..9d60d2f6c551c 100644 --- a/ggml/src/ggml-qnn/qnn.hpp +++ b/ggml/src/ggml-qnn/qnn.hpp @@ -661,13 +661,12 @@ class qnn_instance { QNN_LOG_WARN("failed to register shared memory, error %d, %s\n", QNN_GET_ERROR_CODE(error), strerror(error)); return 6; - } else { - QNN_LOG_INFO("tensor %s successfully register shared memory\n", QNN_TENSOR_GET_NAME(*p_tensor)); } QNN_TENSOR_SET_MEM_HANDLE(*p_tensor, handle); _qnn_mem_set.insert((std::pair(p_data, handle))); + QNN_LOG_INFO("tensor %s successfully register shared memory handler: %p\n", QNN_TENSOR_GET_NAME(*p_tensor), handle); return 0; } diff --git a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp index e966e638bee1f..aeab605693caf 100644 --- a/ggml/src/ggml-qnn/tensor.hpp +++ b/ggml/src/ggml-qnn/tensor.hpp @@ -49,14 +49,9 @@ class ggml_qnn_tensor { // TODO: set the quantizeParams base on the tensor type QNN_TENSOR_SET_RANK(_qnn_tensor, qnn::get_ggml_tensor_rank(tensor)); - if (should_use_mem_handle()) { - QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_MEMHANDLE); - QNN_TENSOR_SET_MEM_HANDLE(_qnn_tensor, nullptr); - } else { - QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW); - Qnn_ClientBuffer_t client_buf = { tensor->data, get_ggml_tensor_data_size(tensor) }; - QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf); - } + QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW); + Qnn_ClientBuffer_t client_buf = {}; + QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf); tensor->extra = this; QNN_LOG_DEBUG("create tensor %s with device %d", _tensor_name.c_str(), device); @@ -86,9 +81,26 @@ class ggml_qnn_tensor { return false; } - if (!alloc_rpc_mem()) { - QNN_LOG_WARN("alloc rpc mem failed, tensor %s", _tensor_name.c_str()); - return false; + if (should_use_mem_handle()) { + _qnn_rpc_buffer = alloc_rpc_mem(); + if (!_qnn_rpc_buffer) { + QNN_LOG_WARN("alloc rpc mem failed, tensor %s", _tensor_name.c_str()); + return false; + } + + auto tensor_type = QNN_TENSOR_GET_TYPE(_qnn_tensor); + if (!register_rpc_mem(_qnn_rpc_buffer)) { + QNN_LOG_WARN("commit rpc mem failure\n"); + return false; + } + + QNN_LOG_DEBUG("tensor %s, use mem handle %p", _tensor_name.c_str(), QNN_TENSOR_GET_MEM_HANDLE(_qnn_tensor)); + } else { + QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW); + Qnn_ClientBuffer_t client_buf = { _tensor->data, get_ggml_tensor_data_size(_tensor) }; + QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf); + QNN_LOG_DEBUG("tensor %s, use client buffer %p size %d", _tensor_name.c_str(), client_buf.data, + (int)client_buf.dataSize); } QNN_TENSOR_SET_ID(_qnn_tensor, QNN_TENSOR_GET_ID(tensor)); @@ -111,10 +123,8 @@ class ggml_qnn_tensor { } if (should_use_mem_handle()) { - uint8_t *qnn_buffer = static_cast( - _qnn_instance->get_rpcmem_from_memhandle(QNN_TENSOR_GET_MEM_HANDLE(_qnn_tensor))); - if (qnn_buffer) { - memcpy(qnn_buffer, _tensor->data, ggml_nbytes(_tensor)); + if (_qnn_rpc_buffer) { + memcpy(_qnn_rpc_buffer, _tensor->data, ggml_nbytes(_tensor)); } else { QNN_LOG_WARN("can't find rpcmem from qnn 
mem handle\n"); return false; @@ -122,6 +132,7 @@ class ggml_qnn_tensor { } // For CPU and GPU, the data is already in the tensor. + QNN_LOG_DEBUG("write tensor %s to qnn", _tensor_name.c_str()); return true; } @@ -138,10 +149,8 @@ class ggml_qnn_tensor { } if (should_use_mem_handle()) { - uint8_t *qnn_buffer = static_cast( - _qnn_instance->get_rpcmem_from_memhandle(QNN_TENSOR_GET_MEM_HANDLE(_qnn_tensor))); - if (qnn_buffer) { - memcpy(_tensor->data, qnn_buffer, ggml_nbytes(_tensor)); + if (_qnn_rpc_buffer) { + memcpy(_tensor->data, _qnn_rpc_buffer, ggml_nbytes(_tensor)); } else { QNN_LOG_WARN("can't find rpcmem from qnn mem handle\n"); return false; @@ -149,6 +158,7 @@ class ggml_qnn_tensor { } // For CPU and GPU, the data is already in the tensor. + QNN_LOG_DEBUG("read tensor %s from qnn", _tensor_name.c_str()); return true; } @@ -156,28 +166,35 @@ class ggml_qnn_tensor { const Qnn_Tensor_t &get_qnn_tensor() const { return _qnn_tensor; } private: - bool alloc_rpc_mem() { - if (!should_use_mem_handle()) { - return true; - } - - uint8_t *qnn_buffer = + uint8_t *alloc_rpc_mem() { + uint8_t *qnn_rpc_buffer = static_cast(_qnn_instance->alloc_rpcmem(ggml_nbytes(_tensor), alignof(void *))); - if (!qnn_buffer) { + if (!qnn_rpc_buffer) { QNN_LOG_WARN("alloc rpc mem failure, %s\n", strerror(errno)); QNN_LOG_DEBUG("tensor name %s", _tensor_name.c_str()); - return false; + return nullptr; } - QNN_LOG_INFO("tensor %s: alloc rpcmem(%p) successfully\n", _tensor_name.c_str(), qnn_buffer); + QNN_LOG_INFO("tensor %s: alloc rpcmem(%p) successfully\n", _tensor_name.c_str(), qnn_rpc_buffer); + return qnn_rpc_buffer; + } + + bool register_rpc_mem(uint8_t *qnn_rpc_buffer) { + if (_qnn_instance->is_rpcmem_registered(QNN_TENSOR_GET_MEM_HANDLE(_qnn_tensor))) { + QNN_LOG_INFO("tensor %s: rpcmem(%p) already registered\n", _tensor_name.c_str(), qnn_rpc_buffer); + return true; + } - auto error = _qnn_instance->register_rpcmem(qnn_buffer, &_qnn_tensor); + auto error = _qnn_instance->register_rpcmem(qnn_rpc_buffer, &_qnn_tensor); if (error != QNN_SUCCESS) { QNN_LOG_WARN("register rpc mem failure, %d\n", (int)error); QNN_LOG_DEBUG("tensor name %s", _tensor_name.c_str()); return false; } + // The mem handle will be set at qnn_instance::register_rpcmem + QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_MEMHANDLE); + QNN_LOG_INFO("tensor %s: register rpcmem(%p) successfully\n", _tensor_name.c_str(), qnn_rpc_buffer); return true; } @@ -190,6 +207,7 @@ class ggml_qnn_tensor { uint32_t _dimensions[4] = {}; std::string _tensor_name; Qnn_GraphHandle_t _graph_handle = nullptr; + uint8_t *_qnn_rpc_buffer = nullptr; ggml_qnn_tensor(const ggml_qnn_tensor &) = delete; void operator=(const ggml_qnn_tensor &) = delete; From 3feb574bf05191f2d2306f6b56bc7c81805f7f0d Mon Sep 17 00:00:00 2001 From: hongruichen Date: Wed, 10 Jul 2024 19:40:02 +0800 Subject: [PATCH 052/143] merge register_rpc_mem into alloc_rpc_mem --- ggml/src/ggml-qnn/tensor.hpp | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp index aeab605693caf..8a825b57de57b 100644 --- a/ggml/src/ggml-qnn/tensor.hpp +++ b/ggml/src/ggml-qnn/tensor.hpp @@ -2,6 +2,7 @@ #pragma once #include +#include #include #include @@ -9,7 +10,6 @@ #include "QnnTensor.h" #include "System/QnnSystemInterface.h" -#include "backend.hpp" #include "graph.hpp" #include "logger.hpp" #include "qnn.hpp" @@ -88,12 +88,6 @@ class ggml_qnn_tensor { return false; } - auto tensor_type = 
QNN_TENSOR_GET_TYPE(_qnn_tensor); - if (!register_rpc_mem(_qnn_rpc_buffer)) { - QNN_LOG_WARN("commit rpc mem failure\n"); - return false; - } - QNN_LOG_DEBUG("tensor %s, use mem handle %p", _tensor_name.c_str(), QNN_TENSOR_GET_MEM_HANDLE(_qnn_tensor)); } else { QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW); @@ -176,26 +170,18 @@ class ggml_qnn_tensor { } QNN_LOG_INFO("tensor %s: alloc rpcmem(%p) successfully\n", _tensor_name.c_str(), qnn_rpc_buffer); - return qnn_rpc_buffer; - } - - bool register_rpc_mem(uint8_t *qnn_rpc_buffer) { - if (_qnn_instance->is_rpcmem_registered(QNN_TENSOR_GET_MEM_HANDLE(_qnn_tensor))) { - QNN_LOG_INFO("tensor %s: rpcmem(%p) already registered\n", _tensor_name.c_str(), qnn_rpc_buffer); - return true; - } - auto error = _qnn_instance->register_rpcmem(qnn_rpc_buffer, &_qnn_tensor); if (error != QNN_SUCCESS) { QNN_LOG_WARN("register rpc mem failure, %d\n", (int)error); QNN_LOG_DEBUG("tensor name %s", _tensor_name.c_str()); - return false; + _qnn_instance->free_rpcmem(qnn_rpc_buffer); + return nullptr; } // The mem handle will be set at qnn_instance::register_rpcmem QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_MEMHANDLE); QNN_LOG_INFO("tensor %s: register rpcmem(%p) successfully\n", _tensor_name.c_str(), qnn_rpc_buffer); - return true; + return qnn_rpc_buffer; } bool should_use_mem_handle() const { return _device == QNN_BACKEND_NPU; } From b49b501e267f96747554f321196275b4f81ae5f9 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Wed, 10 Jul 2024 19:48:57 +0800 Subject: [PATCH 053/143] fix sprintf type --- ggml/src/ggml-qnn/tensor.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp index 8a825b57de57b..e6bb63c54481c 100644 --- a/ggml/src/ggml-qnn/tensor.hpp +++ b/ggml/src/ggml-qnn/tensor.hpp @@ -33,7 +33,7 @@ class ggml_qnn_tensor { if (_tensor_name.empty()) { static std::atomic_uint32_t unnamed_tensor_count = 0; char buffer[GGML_MAX_NAME] = {}; - snprintf(buffer, sizeof(buffer), "unnamed_%p", unnamed_tensor_count++); + snprintf(buffer, sizeof(buffer), "unnamed_%d", (int)(unnamed_tensor_count++)); _tensor_name = buffer; } From 80051cfc4d9f340d4bb5eed1eddcdbdf98e5da51 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Wed, 10 Jul 2024 19:57:47 +0800 Subject: [PATCH 054/143] remove unused variables --- ggml/src/ggml-qnn/backend-ops.cpp | 2 -- ggml/src/ggml-qnn/qnn.hpp | 3 --- 2 files changed, 5 deletions(-) diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index bafe5ca160e66..c84c59e1e0c2b 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -122,8 +122,6 @@ bool execute_graph(qnn::ggml_qnn_graph<_InputSize, _OutputSize> *graph, #define CHECK_PARAMS(ctx, src0, src1, dst) #endif -// TODO: this function can be removed later because there are duplicated codes with ggml_qnn_mul_mat -// keep it for illustrate how to implement a specified GGMPL OP using QNN API + QNN RPC static void ggml_qnn_add(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst) { CHECK_PARAMS(ctx, src0, src1, dst); diff --git a/ggml/src/ggml-qnn/qnn.hpp b/ggml/src/ggml-qnn/qnn.hpp index 9d60d2f6c551c..10549a6c5e413 100644 --- a/ggml/src/ggml-qnn/qnn.hpp +++ b/ggml/src/ggml-qnn/qnn.hpp @@ -901,9 +901,6 @@ class qnn_instance { std::string _model_name; // Qualcomm's dedicated prebuilt model name, keep it for further usage BackendIdType _backend_id; - bool _debug_tensor = false; - bool 
_do_node_validations = true; - QnnLog_Level_t _qnn_log_level = QNN_LOG_LEVEL_DEBUG; qnn::sdk_profile_level _profile_level = qnn::sdk_profile_level::profile_detail; From b6f29273f0bffbb59e5d7d1d99e479983da6a740 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Wed, 10 Jul 2024 23:00:31 +0800 Subject: [PATCH 055/143] add function to get graph from cache --- ggml/src/ggml-qnn/backend-ops.cpp | 106 +++++++++++++----------------- ggml/src/ggml-qnn/utils.hpp | 2 + 2 files changed, 47 insertions(+), 61 deletions(-) diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index c84c59e1e0c2b..2627e23fd8e3c 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -108,6 +108,41 @@ bool execute_graph(qnn::ggml_qnn_graph<_InputSize, _OutputSize> *graph, return true; } +template +qnn::ggml_qnn_graph_binary *get_qnn_graph_from_cache(ggml_backend_qnn_context *ctx, ggml_op op, + const std::string &qnn_op, + const std::array &inputs, + const std::array &outputs) { + const std::string graph_key(ggml_op_name(op)); + auto it = ctx->qnn_binary_graph_cache.find(graph_key); + qnn::ggml_qnn_graph_binary *graph_ptr = nullptr; + if (it != ctx->qnn_binary_graph_cache.end()) { + graph_ptr = it->second.get(); + } else { + std::string graph_name = graph_key + "_" + std::to_string(ctx->threads); + for (auto &input: inputs) { + graph_name += "_"; + graph_name += input->name; + } + auto graph = std::make_unique(graph_name, (QNNBackend)(ctx->device), + ctx->instance->get_qnn_context_handle(), + ctx->raw_interface, ctx->socinfo.vtcm_size_in_mb); + + if (!graph->is_valid()) { + return nullptr; + } + + if (!qnn_bind_tensors_to_graph<2, 1>(graph.get(), qnn_op.c_str(), inputs, outputs)) { + return nullptr; + } + + graph_ptr = graph.get(); + ctx->qnn_binary_graph_cache[graph_key] = std::move(graph); + } + + return graph_ptr; +} + } // namespace #ifndef NDEBUG @@ -126,44 +161,21 @@ static void ggml_qnn_add(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, ggml_tensor *dst) { CHECK_PARAMS(ctx, src0, src1, dst); - std::string graph_name = "ggml_op_qnn_add"; - qnn::qnn_perf perf(graph_name); + qnn::qnn_perf perf("ggml_op_qnn_add"); perf.start(); bool succeed = false; - std::string graph_key(ggml_op_name(GGML_OP_ADD)); - auto it = ctx->qnn_binary_graph_cache.find(graph_key); - qnn::ggml_qnn_graph_binary *graph_ptr = nullptr; - if (it != ctx->qnn_binary_graph_cache.end()) { - graph_ptr = it->second.get(); - } else { - graph_name = graph_name + "_" + std::to_string(ctx->threads) + "_" + src0->name + "_" + src1->name; - auto graph = std::make_unique(graph_name, (QNNBackend)(ctx->device), - ctx->instance->get_qnn_context_handle(), - ctx->raw_interface, ctx->socinfo.vtcm_size_in_mb); - - if (!graph->is_valid()) { - goto failure; - } - - if (!qnn_bind_tensors_to_graph<2, 1>(graph.get(), QNN_OP_ELEMENT_WISE_ADD, { src0, src1 }, { dst })) { - goto failure; - } - - graph_ptr = graph.get(); - ctx->qnn_binary_graph_cache[graph_key] = std::move(graph); + qnn::ggml_qnn_graph_binary *graph_ptr = + get_qnn_graph_from_cache<2, 1>(ctx, GGML_OP_ADD, QNN_OP_ELEMENT_WISE_ADD, { src0, src1 }, { dst }); + if (graph_ptr) { + succeed = execute_graph<2, 1>(graph_ptr, { src0, src1 }, { dst }); } - succeed = execute_graph<2, 1>(graph_ptr, { src0, src1 }, { dst }); - -failure: if (!succeed) { print_ggml_tensor(src0); print_ggml_tensor(src1); print_ggml_tensor(dst); } - - perf.info(); } /* @@ -181,49 +193,21 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context *ctx, const ggml_tensor *s 
ggml_tensor *dst) { CHECK_PARAMS(ctx, src0, src1, dst); - std::string graph_name = "ggml_op_qnn_mul_mat"; - qnn::qnn_perf perf(graph_name); + qnn::qnn_perf perf("ggml_op_qnn_mul_mat"); perf.start(); - // TODO: for scenarios of quantized data in src0 - // pass-1: dequantize src0 to FP32 - // pass-2: dq-src0 * src1 - // the performance gains is worth although there is performance loss in pass-1 - bool succeed = false; - std::string graph_key(ggml_op_name(GGML_OP_MUL_MAT)); - auto it = ctx->qnn_binary_graph_cache.find(graph_key); - qnn::ggml_qnn_graph_binary *graph_ptr = nullptr; - if (it != ctx->qnn_binary_graph_cache.end()) { - graph_ptr = it->second.get(); - } else { - graph_name = graph_name + "_" + std::to_string(ctx->threads) + "_" + src0->name + "_" + src1->name; - auto graph = std::make_unique(graph_name, (QNNBackend)(ctx->device), - ctx->instance->get_qnn_context_handle(), - ctx->raw_interface, ctx->socinfo.vtcm_size_in_mb); - - if (!graph->is_valid()) { - goto failure; - } - - if (!qnn_bind_tensors_to_graph<2, 1>(graph.get(), QNN_OP_MAT_MUL, { src0, src1 }, { dst })) { - goto failure; - } - - graph_ptr = graph.get(); - ctx->qnn_binary_graph_cache[graph_key] = std::move(graph); + qnn::ggml_qnn_graph_binary *graph_ptr = + get_qnn_graph_from_cache<2, 1>(ctx, GGML_OP_MUL_MAT, QNN_OP_MAT_MUL, { src0, src1 }, { dst }); + if (graph_ptr) { + succeed = execute_graph<2, 1>(graph_ptr, { src0, src1 }, { dst }); } - succeed = execute_graph<2, 1>(graph_ptr, { src0, src1 }, { dst }); - -failure: if (!succeed) { print_ggml_tensor(src0); print_ggml_tensor(src1); print_ggml_tensor(dst); } - - perf.info(); } static void ggml_qnn_repeat(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, diff --git a/ggml/src/ggml-qnn/utils.hpp b/ggml/src/ggml-qnn/utils.hpp index 84cd8354e2d59..4a01347d0fc1b 100644 --- a/ggml/src/ggml-qnn/utils.hpp +++ b/ggml/src/ggml-qnn/utils.hpp @@ -189,6 +189,7 @@ void device_tensor_free(Qnn_Tensor_t &tensor); class qnn_perf { public: qnn_perf(const std::string &perf_name) : _perf_name(std::move(perf_name)) {}; + ~qnn_perf() { info(); } qnn_perf() = delete; qnn_perf(const qnn_perf &) = delete; qnn_perf &operator=(const qnn_perf &) = delete; @@ -211,6 +212,7 @@ class qnn_perf { class qnn_perf { public: qnn_perf(const std::string &perf_name) {} + ~qnn_perf() { info(); } qnn_perf() = delete; qnn_perf(const qnn_perf &) = delete; qnn_perf &operator=(const qnn_perf &) = delete; From 7ea28a6fac55a6723a02a8a873fc830b581b36c9 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Wed, 10 Jul 2024 23:39:03 +0800 Subject: [PATCH 056/143] add helper function for binary op --- ggml/src/ggml-qnn/backend-ops.cpp | 150 +++++++++++++++++++++++------- 1 file changed, 116 insertions(+), 34 deletions(-) diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 2627e23fd8e3c..5871a7b6ef211 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -120,7 +120,7 @@ qnn::ggml_qnn_graph_binary *get_qnn_graph_from_cache(ggml_backend_qnn_context *c graph_ptr = it->second.get(); } else { std::string graph_name = graph_key + "_" + std::to_string(ctx->threads); - for (auto &input: inputs) { + for (auto &input : inputs) { graph_name += "_"; graph_name += input->name; } @@ -143,6 +143,116 @@ qnn::ggml_qnn_graph_binary *get_qnn_graph_from_cache(ggml_backend_qnn_context *c return graph_ptr; } +constexpr const char *kGgmlOpToQnnOp[] = { + nullptr, // GGML_OP_NONE + nullptr, // GGML_OP_DUP + QNN_OP_ELEMENT_WISE_ADD, // GGML_OP_ADD + 
nullptr, // GGML_OP_ADD1 + nullptr, // GGML_OP_ACC + nullptr, // GGML_OP_SUB + nullptr, // GGML_OP_MUL + nullptr, // GGML_OP_DIV + nullptr, // GGML_OP_SQR + nullptr, // GGML_OP_SQRT + nullptr, // GGML_OP_LOG + nullptr, // GGML_OP_SUM + nullptr, // GGML_OP_SUM_ROWS + nullptr, // GGML_OP_MEAN + nullptr, // GGML_OP_ARGMAX + nullptr, // GGML_OP_REPEAT + nullptr, // GGML_OP_REPEAT_BACK + nullptr, // GGML_OP_CONCAT + nullptr, // GGML_OP_SILU_BACK + nullptr, // GGML_OP_NORM + nullptr, // GGML_OP_RMS_NORM + nullptr, // GGML_OP_RMS_NORM_BACK + nullptr, // GGML_OP_GROUP_NORM + + QNN_OP_MAT_MUL, // GGML_OP_MUL_MAT + nullptr, // GGML_OP_MUL_MAT_ID + nullptr, // GGML_OP_OUT_PROD + + nullptr, // GGML_OP_SCALE + nullptr, // GGML_OP_SET + nullptr, // GGML_OP_CPY + nullptr, // GGML_OP_CONT + nullptr, // GGML_OP_RESHAPE + nullptr, // GGML_OP_VIEW + nullptr, // GGML_OP_PERMUTE + nullptr, // GGML_OP_TRANSPOSE + nullptr, // GGML_OP_GET_ROWS + nullptr, // GGML_OP_GET_ROWS_BACK + nullptr, // GGML_OP_DIAG + nullptr, // GGML_OP_DIAG_MASK_INF + nullptr, // GGML_OP_DIAG_MASK_ZERO + nullptr, // GGML_OP_SOFT_MAX + nullptr, // GGML_OP_SOFT_MAX_BACK + nullptr, // GGML_OP_ROPE + nullptr, // GGML_OP_ROPE_BACK + nullptr, // GGML_OP_CLAMP + nullptr, // GGML_OP_CONV_TRANSPOSE_1D + nullptr, // GGML_OP_IM2COL + nullptr, // GGML_OP_CONV_TRANSPOSE_2D + nullptr, // GGML_OP_POOL_1D + nullptr, // GGML_OP_POOL_2D + nullptr, // GGML_OP_UPSCALE + nullptr, // GGML_OP_PAD + nullptr, // GGML_OP_ARANGE + nullptr, // GGML_OP_TIMESTEP_EMBEDDING + nullptr, // GGML_OP_ARGSORT + nullptr, // GGML_OP_LEAKY_RELU + + nullptr, // GGML_OP_FLASH_ATTN_EXT + nullptr, // GGML_OP_FLASH_ATTN_BACK + nullptr, // GGML_OP_SSM_CONV + nullptr, // GGML_OP_SSM_SCAN + nullptr, // GGML_OP_WIN_PART + nullptr, // GGML_OP_WIN_UNPART + nullptr, // GGML_OP_GET_REL_POS + nullptr, // GGML_OP_ADD_REL_POS + + nullptr, // GGML_OP_UNARY + + nullptr, // GGML_OP_MAP_UNARY + nullptr, // GGML_OP_MAP_BINARY + + nullptr, // GGML_OP_MAP_CUSTOM1_F32 + nullptr, // GGML_OP_MAP_CUSTOM2_F32 + nullptr, // GGML_OP_MAP_CUSTOM3_F32 + + nullptr, // GGML_OP_MAP_CUSTOM1 + nullptr, // GGML_OP_MAP_CUSTOM2 + nullptr, // GGML_OP_MAP_CUSTOM3 + + nullptr, // GGML_OP_CROSS_ENTROPY_LOSS + nullptr, // GGML_OP_CROSS_ENTROPY_LOSS_BACK +}; + +static_assert(sizeof(kGgmlOpToQnnOp) / sizeof(kGgmlOpToQnnOp[0]) == GGML_OP_COUNT, + "GGML_OP_COUNT does not match the size of the ops table"); + +template +void qnn_binary_op_impl(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) { + static_assert(kGgmlOpToQnnOp[_GgmlOp] != nullptr, "GGML_OP does not have a corresponding QNN_OP"); + + qnn::qnn_perf perf(ggml_op_name(_GgmlOp)); + perf.start(); + + bool succeed = false; + qnn::ggml_qnn_graph_binary *graph_ptr = + get_qnn_graph_from_cache<2, 1>(ctx, _GgmlOp, kGgmlOpToQnnOp[_GgmlOp], { src0, src1 }, { dst }); + if (graph_ptr) { + succeed = execute_graph<2, 1>(graph_ptr, { src0, src1 }, { dst }); + } + + if (!succeed) { + print_ggml_tensor(src0); + print_ggml_tensor(src1); + print_ggml_tensor(dst); + } +} + } // namespace #ifndef NDEBUG @@ -160,22 +270,7 @@ qnn::ggml_qnn_graph_binary *get_qnn_graph_from_cache(ggml_backend_qnn_context *c static void ggml_qnn_add(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst) { CHECK_PARAMS(ctx, src0, src1, dst); - - qnn::qnn_perf perf("ggml_op_qnn_add"); - perf.start(); - - bool succeed = false; - qnn::ggml_qnn_graph_binary *graph_ptr = - get_qnn_graph_from_cache<2, 1>(ctx, GGML_OP_ADD, 
QNN_OP_ELEMENT_WISE_ADD, { src0, src1 }, { dst }); - if (graph_ptr) { - succeed = execute_graph<2, 1>(graph_ptr, { src0, src1 }, { dst }); - } - - if (!succeed) { - print_ggml_tensor(src0); - print_ggml_tensor(src1); - print_ggml_tensor(dst); - } + qnn_binary_op_impl(ctx, src0, src1, dst); } /* @@ -192,22 +287,7 @@ static void ggml_qnn_add(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, static void ggml_qnn_mul_mat(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst) { CHECK_PARAMS(ctx, src0, src1, dst); - - qnn::qnn_perf perf("ggml_op_qnn_mul_mat"); - perf.start(); - - bool succeed = false; - qnn::ggml_qnn_graph_binary *graph_ptr = - get_qnn_graph_from_cache<2, 1>(ctx, GGML_OP_MUL_MAT, QNN_OP_MAT_MUL, { src0, src1 }, { dst }); - if (graph_ptr) { - succeed = execute_graph<2, 1>(graph_ptr, { src0, src1 }, { dst }); - } - - if (!succeed) { - print_ggml_tensor(src0); - print_ggml_tensor(src1); - print_ggml_tensor(dst); - } + qnn_binary_op_impl(ctx, src0, src1, dst); } static void ggml_qnn_repeat(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, @@ -320,7 +400,7 @@ static void ggml_qnn_nop(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, } qnn::ggml_qnn_op_array_t qnn::ggml_qnn_op_array() { - static constexpr const qnn::ggml_qnn_op_t kQnnOpsTable[GGML_OP_COUNT] = { + static constexpr const qnn::ggml_qnn_op_t kQnnOpsTable[] = { nullptr, // GGML_OP_NONE nullptr, // GGML_OP_DUP ggml_qnn_add, // GGML_OP_ADD @@ -405,5 +485,7 @@ qnn::ggml_qnn_op_array_t qnn::ggml_qnn_op_array() { nullptr, // GGML_OP_CROSS_ENTROPY_LOSS_BACK }; + static_assert(sizeof(kQnnOpsTable) / sizeof(kQnnOpsTable[0]) == GGML_OP_COUNT, + "GGML_OP_COUNT does not match the size of the ops table"); return kQnnOpsTable; } From 8932135fdb8ee05a9e4c44a64137a28c35bf05bc Mon Sep 17 00:00:00 2001 From: hongruichen Date: Thu, 11 Jul 2024 00:07:00 +0800 Subject: [PATCH 057/143] add sqrt and mul ops --- ggml/src/ggml-qnn/backend-ops.cpp | 137 +++++++++++++++--------------- 1 file changed, 70 insertions(+), 67 deletions(-) diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 5871a7b6ef211..ca48f79bbc44b 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -8,6 +8,18 @@ #include "tensor.hpp" #include "utils.hpp" +#ifndef NDEBUG +#define CHECK_PARAMS(ctx, src0, src1, dst) \ + do { \ + if (!qnn_is_valid_params((ctx), (src0), (src1), (dst))) { \ + return; \ + } \ + } while (0) + +#else +#define CHECK_PARAMS(ctx, src0, src1, dst) +#endif + namespace { void print_ggml_tensor(const ggml_tensor *tensor) { @@ -144,29 +156,29 @@ qnn::ggml_qnn_graph_binary *get_qnn_graph_from_cache(ggml_backend_qnn_context *c } constexpr const char *kGgmlOpToQnnOp[] = { - nullptr, // GGML_OP_NONE - nullptr, // GGML_OP_DUP - QNN_OP_ELEMENT_WISE_ADD, // GGML_OP_ADD - nullptr, // GGML_OP_ADD1 - nullptr, // GGML_OP_ACC - nullptr, // GGML_OP_SUB - nullptr, // GGML_OP_MUL - nullptr, // GGML_OP_DIV - nullptr, // GGML_OP_SQR - nullptr, // GGML_OP_SQRT - nullptr, // GGML_OP_LOG - nullptr, // GGML_OP_SUM - nullptr, // GGML_OP_SUM_ROWS - nullptr, // GGML_OP_MEAN - nullptr, // GGML_OP_ARGMAX - nullptr, // GGML_OP_REPEAT - nullptr, // GGML_OP_REPEAT_BACK - nullptr, // GGML_OP_CONCAT - nullptr, // GGML_OP_SILU_BACK - nullptr, // GGML_OP_NORM - nullptr, // GGML_OP_RMS_NORM - nullptr, // GGML_OP_RMS_NORM_BACK - nullptr, // GGML_OP_GROUP_NORM + nullptr, // GGML_OP_NONE + nullptr, // GGML_OP_DUP + QNN_OP_ELEMENT_WISE_ADD, // 
GGML_OP_ADD + nullptr, // GGML_OP_ADD1 + nullptr, // GGML_OP_ACC + nullptr, // GGML_OP_SUB + QNN_OP_ELEMENT_WISE_MULTIPLY, // GGML_OP_MUL + nullptr, // GGML_OP_DIV + nullptr, // GGML_OP_SQR + QNN_OP_ELEMENT_WISE_SQUARE_ROOT, // GGML_OP_SQRT + nullptr, // GGML_OP_LOG + nullptr, // GGML_OP_SUM + nullptr, // GGML_OP_SUM_ROWS + nullptr, // GGML_OP_MEAN + nullptr, // GGML_OP_ARGMAX + nullptr, // GGML_OP_REPEAT + nullptr, // GGML_OP_REPEAT_BACK + nullptr, // GGML_OP_CONCAT + nullptr, // GGML_OP_SILU_BACK + nullptr, // GGML_OP_NORM + nullptr, // GGML_OP_RMS_NORM + nullptr, // GGML_OP_RMS_NORM_BACK + nullptr, // GGML_OP_GROUP_NORM QNN_OP_MAT_MUL, // GGML_OP_MUL_MAT nullptr, // GGML_OP_MUL_MAT_ID @@ -236,6 +248,8 @@ void qnn_binary_op_impl(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, ggml_tensor *dst) { static_assert(kGgmlOpToQnnOp[_GgmlOp] != nullptr, "GGML_OP does not have a corresponding QNN_OP"); + CHECK_PARAMS(ctx, src0, src1, dst); + qnn::qnn_perf perf(ggml_op_name(_GgmlOp)); perf.start(); @@ -255,24 +269,16 @@ void qnn_binary_op_impl(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, } // namespace -#ifndef NDEBUG -#define CHECK_PARAMS(ctx, src0, src1, dst) \ - do { \ - if (!qnn_is_valid_params((ctx), (src0), (src1), (dst))) { \ - return; \ - } \ - } while (0) - -#else -#define CHECK_PARAMS(ctx, src0, src1, dst) -#endif - static void ggml_qnn_add(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst) { - CHECK_PARAMS(ctx, src0, src1, dst); qnn_binary_op_impl(ctx, src0, src1, dst); } +static void ggml_qnn_mul(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) { + qnn_binary_op_impl(ctx, src0, src1, dst); +} + /* * ggml_qnn_mul_mat was re-added as a standalone function because * the following comments came from https://github.com/ggerganov/llama.cpp/pull/1632 @@ -286,7 +292,6 @@ static void ggml_qnn_add(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, */ static void ggml_qnn_mul_mat(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst) { - CHECK_PARAMS(ctx, src0, src1, dst); qnn_binary_op_impl(ctx, src0, src1, dst); } @@ -329,6 +334,11 @@ static void ggml_qnn_leaky_relu(ggml_backend_qnn_context *ctx, const ggml_tensor static void ggml_qnn_sqr(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst) {} +static void ggml_qnn_sqrt(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) { + qnn_binary_op_impl(ctx, src0, src1, dst); +} + static void ggml_qnn_norm(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst) {} @@ -392,38 +402,31 @@ static void ggml_qnn_argsort(ggml_backend_qnn_context *ctx, const ggml_tensor *s GGML_ASSERT(ggml_is_contiguous(src0)); } -static void ggml_qnn_nop(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) { - (void)src0; - (void)src1; - (void)dst; -} - qnn::ggml_qnn_op_array_t qnn::ggml_qnn_op_array() { static constexpr const qnn::ggml_qnn_op_t kQnnOpsTable[] = { - nullptr, // GGML_OP_NONE - nullptr, // GGML_OP_DUP - ggml_qnn_add, // GGML_OP_ADD - nullptr, // GGML_OP_ADD1 - nullptr, // GGML_OP_ACC - nullptr, // GGML_OP_SUB - nullptr, // GGML_OP_MUL - nullptr, // GGML_OP_DIV - nullptr, // GGML_OP_SQR - nullptr, // GGML_OP_SQRT - nullptr, // GGML_OP_LOG - nullptr, // GGML_OP_SUM - nullptr, // GGML_OP_SUM_ROWS - nullptr, // 
GGML_OP_MEAN - nullptr, // GGML_OP_ARGMAX - nullptr, // GGML_OP_REPEAT - nullptr, // GGML_OP_REPEAT_BACK - nullptr, // GGML_OP_CONCAT - nullptr, // GGML_OP_SILU_BACK - nullptr, // GGML_OP_NORM - nullptr, // GGML_OP_RMS_NORM - nullptr, // GGML_OP_RMS_NORM_BACK - nullptr, // GGML_OP_GROUP_NORM + nullptr, // GGML_OP_NONE + nullptr, // GGML_OP_DUP + ggml_qnn_add, // GGML_OP_ADD + nullptr, // GGML_OP_ADD1 + nullptr, // GGML_OP_ACC + nullptr, // GGML_OP_SUB + ggml_qnn_mul, // GGML_OP_MUL + nullptr, // GGML_OP_DIV + nullptr, // GGML_OP_SQR + ggml_qnn_sqrt, // GGML_OP_SQRT + nullptr, // GGML_OP_LOG + nullptr, // GGML_OP_SUM + nullptr, // GGML_OP_SUM_ROWS + nullptr, // GGML_OP_MEAN + nullptr, // GGML_OP_ARGMAX + nullptr, // GGML_OP_REPEAT + nullptr, // GGML_OP_REPEAT_BACK + nullptr, // GGML_OP_CONCAT + nullptr, // GGML_OP_SILU_BACK + nullptr, // GGML_OP_NORM + nullptr, // GGML_OP_RMS_NORM + nullptr, // GGML_OP_RMS_NORM_BACK + nullptr, // GGML_OP_GROUP_NORM ggml_qnn_mul_mat, // GGML_OP_MUL_MAT nullptr, // GGML_OP_MUL_MAT_ID From be3aa9631fb43fd39a078f8f55b3646ffe0492d9 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Thu, 11 Jul 2024 00:09:56 +0800 Subject: [PATCH 058/143] use template function directly --- ggml/src/ggml-qnn/backend-ops.cpp | 187 +++++------------------------- 1 file changed, 27 insertions(+), 160 deletions(-) diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index ca48f79bbc44b..1aaba32c93176 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -269,168 +269,35 @@ void qnn_binary_op_impl(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, } // namespace -static void ggml_qnn_add(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) { - qnn_binary_op_impl(ctx, src0, src1, dst); -} - -static void ggml_qnn_mul(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) { - qnn_binary_op_impl(ctx, src0, src1, dst); -} - -/* - * ggml_qnn_mul_mat was re-added as a standalone function because - * the following comments came from https://github.com/ggerganov/llama.cpp/pull/1632 - * MUL_MAT take most of the compute time (about 95%). - * So to speed up llama, we have to focus on MUL_MAT. - * - * We have three kinds of MUL_MAT to compute: - * mul_mat_f32: both src0 and src1 are F32. - * mul_mat_f16_f32: src0 is F16 and src1 is F32. - * mul_mat_q_f32: src0 is quantized (Q4_0, Q4_1, ...), and src1 is F32. 
- */ -static void ggml_qnn_mul_mat(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) { - qnn_binary_op_impl(ctx, src0, src1, dst); -} - -static void ggml_qnn_repeat(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) {} - -static void ggml_qnn_get_rows(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) {} - -static void ggml_qnn_acc(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) {} - -static void ggml_qnn_div(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) {} - -static void ggml_qnn_gelu(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) {} - -static void ggml_qnn_silu(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) {} - -static void ggml_qnn_gelu_quick(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) {} - -static void ggml_qnn_tanh(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) {} - -static void ggml_qnn_relu(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) {} - -static void ggml_qnn_hardsigmoid(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) {} - -static void ggml_qnn_hardswish(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) {} - -static void ggml_qnn_leaky_relu(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) {} - -static void ggml_qnn_sqr(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) {} - -static void ggml_qnn_sqrt(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) { - qnn_binary_op_impl(ctx, src0, src1, dst); -} - -static void ggml_qnn_norm(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) {} - -static void ggml_qnn_group_norm(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) {} - -static void ggml_qnn_concat(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) {} - -static void ggml_qnn_upscale(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) {} - -static void ggml_qnn_pad(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) {} - -static void ggml_qnn_rms_norm(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) {} - -static void ggml_qnn_cpy(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) {} - -static void ggml_qnn_dup(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) { - ggml_qnn_cpy(ctx, src0, dst, nullptr); - (void)src1; -} - -static void ggml_qnn_mul_mat_id(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) {} - -static void ggml_qnn_scale(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) {} - -static void 
ggml_qnn_clamp(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) {} - -static void ggml_qnn_diag_mask_inf(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) {} - -static void ggml_qnn_soft_max(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) {} - -static void ggml_qnn_rope(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) { - GGML_ASSERT(ggml_is_contiguous(src0)); -} - -static void ggml_qnn_pool2d(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) {} - -static void ggml_qnn_im2col(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) {} - -static void ggml_qnn_sum_rows(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) { - GGML_ASSERT(ggml_is_contiguous(src0)); -} - -static void ggml_qnn_argsort(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) { - GGML_ASSERT(ggml_is_contiguous(src0)); -} - qnn::ggml_qnn_op_array_t qnn::ggml_qnn_op_array() { static constexpr const qnn::ggml_qnn_op_t kQnnOpsTable[] = { - nullptr, // GGML_OP_NONE - nullptr, // GGML_OP_DUP - ggml_qnn_add, // GGML_OP_ADD - nullptr, // GGML_OP_ADD1 - nullptr, // GGML_OP_ACC - nullptr, // GGML_OP_SUB - ggml_qnn_mul, // GGML_OP_MUL - nullptr, // GGML_OP_DIV - nullptr, // GGML_OP_SQR - ggml_qnn_sqrt, // GGML_OP_SQRT - nullptr, // GGML_OP_LOG - nullptr, // GGML_OP_SUM - nullptr, // GGML_OP_SUM_ROWS - nullptr, // GGML_OP_MEAN - nullptr, // GGML_OP_ARGMAX - nullptr, // GGML_OP_REPEAT - nullptr, // GGML_OP_REPEAT_BACK - nullptr, // GGML_OP_CONCAT - nullptr, // GGML_OP_SILU_BACK - nullptr, // GGML_OP_NORM - nullptr, // GGML_OP_RMS_NORM - nullptr, // GGML_OP_RMS_NORM_BACK - nullptr, // GGML_OP_GROUP_NORM - - ggml_qnn_mul_mat, // GGML_OP_MUL_MAT - nullptr, // GGML_OP_MUL_MAT_ID - nullptr, // GGML_OP_OUT_PROD + nullptr, // GGML_OP_NONE + nullptr, // GGML_OP_DUP + qnn_binary_op_impl, // GGML_OP_ADD + nullptr, // GGML_OP_ADD1 + nullptr, // GGML_OP_ACC + nullptr, // GGML_OP_SUB + qnn_binary_op_impl, // GGML_OP_MUL + nullptr, // GGML_OP_DIV + nullptr, // GGML_OP_SQR + qnn_binary_op_impl, // GGML_OP_SQRT + nullptr, // GGML_OP_LOG + nullptr, // GGML_OP_SUM + nullptr, // GGML_OP_SUM_ROWS + nullptr, // GGML_OP_MEAN + nullptr, // GGML_OP_ARGMAX + nullptr, // GGML_OP_REPEAT + nullptr, // GGML_OP_REPEAT_BACK + nullptr, // GGML_OP_CONCAT + nullptr, // GGML_OP_SILU_BACK + nullptr, // GGML_OP_NORM + nullptr, // GGML_OP_RMS_NORM + nullptr, // GGML_OP_RMS_NORM_BACK + nullptr, // GGML_OP_GROUP_NORM + + qnn_binary_op_impl, // GGML_OP_MUL_MAT + nullptr, // GGML_OP_MUL_MAT_ID + nullptr, // GGML_OP_OUT_PROD nullptr, // GGML_OP_SCALE nullptr, // GGML_OP_SET From f0894d897a587f2244a49fd3161feeb6244c9e01 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 12 Jul 2024 19:40:55 +0800 Subject: [PATCH 059/143] wip wip --- ggml/src/ggml-qnn/backend-ops.cpp | 92 +++++++++++++++---------------- 1 file changed, 46 insertions(+), 46 deletions(-) diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 1aaba32c93176..94a5d3c28a9d1 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -156,29 +156,29 @@ qnn::ggml_qnn_graph_binary *get_qnn_graph_from_cache(ggml_backend_qnn_context *c } constexpr const char 
*kGgmlOpToQnnOp[] = { - nullptr, // GGML_OP_NONE - nullptr, // GGML_OP_DUP - QNN_OP_ELEMENT_WISE_ADD, // GGML_OP_ADD - nullptr, // GGML_OP_ADD1 - nullptr, // GGML_OP_ACC - nullptr, // GGML_OP_SUB - QNN_OP_ELEMENT_WISE_MULTIPLY, // GGML_OP_MUL - nullptr, // GGML_OP_DIV - nullptr, // GGML_OP_SQR - QNN_OP_ELEMENT_WISE_SQUARE_ROOT, // GGML_OP_SQRT - nullptr, // GGML_OP_LOG - nullptr, // GGML_OP_SUM - nullptr, // GGML_OP_SUM_ROWS - nullptr, // GGML_OP_MEAN - nullptr, // GGML_OP_ARGMAX - nullptr, // GGML_OP_REPEAT - nullptr, // GGML_OP_REPEAT_BACK - nullptr, // GGML_OP_CONCAT - nullptr, // GGML_OP_SILU_BACK - nullptr, // GGML_OP_NORM - nullptr, // GGML_OP_RMS_NORM - nullptr, // GGML_OP_RMS_NORM_BACK - nullptr, // GGML_OP_GROUP_NORM + nullptr, // GGML_OP_NONE + nullptr, // GGML_OP_DUP + QNN_OP_ELEMENT_WISE_ADD, // GGML_OP_ADD + nullptr, // GGML_OP_ADD1 + nullptr, // GGML_OP_ACC + nullptr, // GGML_OP_SUB + QNN_OP_ELEMENT_WISE_MULTIPLY, // GGML_OP_MUL + nullptr, // GGML_OP_DIV + nullptr, // GGML_OP_SQR + nullptr, // GGML_OP_SQRT + nullptr, // GGML_OP_LOG + nullptr, // GGML_OP_SUM + nullptr, // GGML_OP_SUM_ROWS + nullptr, // GGML_OP_MEAN + nullptr, // GGML_OP_ARGMAX + nullptr, // GGML_OP_REPEAT + nullptr, // GGML_OP_REPEAT_BACK + nullptr, // GGML_OP_CONCAT + nullptr, // GGML_OP_SILU_BACK + nullptr, // GGML_OP_NORM + nullptr, // GGML_OP_RMS_NORM + nullptr, // GGML_OP_RMS_NORM_BACK + nullptr, // GGML_OP_GROUP_NORM QNN_OP_MAT_MUL, // GGML_OP_MUL_MAT nullptr, // GGML_OP_MUL_MAT_ID @@ -271,29 +271,29 @@ void qnn_binary_op_impl(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, qnn::ggml_qnn_op_array_t qnn::ggml_qnn_op_array() { static constexpr const qnn::ggml_qnn_op_t kQnnOpsTable[] = { - nullptr, // GGML_OP_NONE - nullptr, // GGML_OP_DUP - qnn_binary_op_impl, // GGML_OP_ADD - nullptr, // GGML_OP_ADD1 - nullptr, // GGML_OP_ACC - nullptr, // GGML_OP_SUB - qnn_binary_op_impl, // GGML_OP_MUL - nullptr, // GGML_OP_DIV - nullptr, // GGML_OP_SQR - qnn_binary_op_impl, // GGML_OP_SQRT - nullptr, // GGML_OP_LOG - nullptr, // GGML_OP_SUM - nullptr, // GGML_OP_SUM_ROWS - nullptr, // GGML_OP_MEAN - nullptr, // GGML_OP_ARGMAX - nullptr, // GGML_OP_REPEAT - nullptr, // GGML_OP_REPEAT_BACK - nullptr, // GGML_OP_CONCAT - nullptr, // GGML_OP_SILU_BACK - nullptr, // GGML_OP_NORM - nullptr, // GGML_OP_RMS_NORM - nullptr, // GGML_OP_RMS_NORM_BACK - nullptr, // GGML_OP_GROUP_NORM + nullptr, // GGML_OP_NONE + nullptr, // GGML_OP_DUP + qnn_binary_op_impl, // GGML_OP_ADD + nullptr, // GGML_OP_ADD1 + nullptr, // GGML_OP_ACC + nullptr, // GGML_OP_SUB + qnn_binary_op_impl, // GGML_OP_MUL + nullptr, // GGML_OP_DIV + nullptr, // GGML_OP_SQR + nullptr, // GGML_OP_SQRT + nullptr, // GGML_OP_LOG + nullptr, // GGML_OP_SUM + nullptr, // GGML_OP_SUM_ROWS + nullptr, // GGML_OP_MEAN + nullptr, // GGML_OP_ARGMAX + nullptr, // GGML_OP_REPEAT + nullptr, // GGML_OP_REPEAT_BACK + nullptr, // GGML_OP_CONCAT + nullptr, // GGML_OP_SILU_BACK + nullptr, // GGML_OP_NORM + nullptr, // GGML_OP_RMS_NORM + nullptr, // GGML_OP_RMS_NORM_BACK + nullptr, // GGML_OP_GROUP_NORM qnn_binary_op_impl, // GGML_OP_MUL_MAT nullptr, // GGML_OP_MUL_MAT_ID From 0eb595cc6e7691d8fbf30b7b04cdf8dd7eb108e3 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 12 Jul 2024 19:52:35 +0800 Subject: [PATCH 060/143] use table to simpilify the op mapping --- tests/ggml-qnn/ggml-qnn-ut.cpp | 69 +++++++++++++++++++++++++--------- 1 file changed, 52 insertions(+), 17 deletions(-) diff --git a/tests/ggml-qnn/ggml-qnn-ut.cpp b/tests/ggml-qnn/ggml-qnn-ut.cpp index 
ff01e62f983c7..0c3fbf71ebdbf 100644 --- a/tests/ggml-qnn/ggml-qnn-ut.cpp +++ b/tests/ggml-qnn/ggml-qnn-ut.cpp @@ -327,6 +327,41 @@ static void show_usage() { ); } + +typedef ggml_tensor * (*ggml_op_binary_t)( + ggml_context * ctx, + ggml_tensor * a, + ggml_tensor * b); + +static constexpr const ggml_op_binary_t kBinaryOps[] = { + nullptr, // GGML_OP_NONE + nullptr, // GGML_OP_DUP + ggml_add, // GGML_OP_ADD + nullptr, // GGML_OP_ADD1 + nullptr, // GGML_OP_ACC + nullptr, // GGML_OP_SUB + ggml_mul, // GGML_OP_MUL + nullptr, // GGML_OP_DIV + nullptr, // GGML_OP_SQR + nullptr, // GGML_OP_SQRT + nullptr, // GGML_OP_LOG + nullptr, // GGML_OP_SUM + nullptr, // GGML_OP_SUM_ROWS + nullptr, // GGML_OP_MEAN + nullptr, // GGML_OP_ARGMAX + nullptr, // GGML_OP_REPEAT + nullptr, // GGML_OP_REPEAT_BACK + nullptr, // GGML_OP_CONCAT + nullptr, // GGML_OP_SILU_BACK + nullptr, // GGML_OP_NORM + nullptr, // GGML_OP_RMS_NORM + nullptr, // GGML_OP_RMS_NORM_BACK + nullptr, // GGML_OP_GROUP_NORM + ggml_mul_mat, // GGML_OP_MUL_MAT +}; + +static_assert(kBinaryOps[GGML_OP_MUL_MAT] == ggml_mul_mat, "ggml_mul_mat at wrong index, check kBinaryOps"); + static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) { int64_t n_begin_time = 0LL; int64_t n_end_time = 0LL; @@ -398,19 +433,15 @@ static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) { ggml_set_input(src0); ggml_set_input(src1); - switch (n_ggml_op_type) { - case GGML_OP_ADD: - dst = ggml_add(ctx, src0, src1); - break; - case GGML_OP_MUL_MAT: - dst = ggml_mul_mat(ctx, src0, src1); - break; - default: - QNN_LOG_WARN("ggml op %d(%s) not supported", n_ggml_op_type, - ggml_op_name((enum ggml_op) n_ggml_op_type)); - ggml_free(ctx); - ggml_backend_free(backend); - return 3; + auto binary_op = kBinaryOps[n_ggml_op_type]; + if (binary_op) { + dst = binary_op(ctx, src0, src1); + } else { + QNN_LOG_WARN("ggml op %d(%s) not supported", n_ggml_op_type, + ggml_op_name((enum ggml_op) n_ggml_op_type)); + ggml_free(ctx); + ggml_backend_free(backend); + return 3; } ggml_set_output(dst); @@ -473,6 +504,11 @@ static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) { return 0; } +static const std::unordered_map kMapStringToGGMLOp = { + {"GGML_OP_ADD", GGML_OP_ADD}, + {"GGML_OP_MUL_MAT", GGML_OP_MUL_MAT}, +}; + int main(int argc, char * argv[]) { int num_threads = 4; int n_backend_type = QNN_BACKEND_CPU; @@ -481,10 +517,9 @@ int main(int argc, char * argv[]) { for (int i = 1; i < argc; i++) { if (0 == strcmp(argv[i], "-t")) { if (i + 1 < argc) { - if (0 == memcmp(argv[i + 1], "GGML_OP_ADD", 11)) { - n_ggml_op_type = GGML_OP_ADD; - } else if (0 == memcmp(argv[i + 1], "GGML_OP_MUL_MAT", 15)) { - n_ggml_op_type = GGML_OP_MUL_MAT; + auto it = kMapStringToGGMLOp.find(argv[i + 1]); + if (it != kMapStringToGGMLOp.end()) { + n_ggml_op_type = it->second; } else { show_usage(); return 1; From e3aa43adbdabc75b9f6fcf5bf5bf4ab9899df0a7 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 12 Jul 2024 23:26:11 +0800 Subject: [PATCH 061/143] suppress warning --- ggml/src/ggml-qnn/backend-ops.cpp | 37 ++++++++++++++++++------------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 94a5d3c28a9d1..30f2e402cf16f 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -9,25 +9,9 @@ #include "utils.hpp" #ifndef NDEBUG -#define CHECK_PARAMS(ctx, src0, src1, dst) \ - do { \ - if (!qnn_is_valid_params((ctx), (src0), (src1), 
(dst))) { \ - return; \ - } \ - } while (0) - -#else -#define CHECK_PARAMS(ctx, src0, src1, dst) -#endif namespace { -void print_ggml_tensor(const ggml_tensor *tensor) { - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - tensor->name, tensor->type, ggml_type_name(tensor->type), tensor->ne[0], tensor->ne[1], tensor->ne[2], - tensor->nb[0], tensor->nb[1], tensor->nb[2]); -} - bool qnn_is_valid_params(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst) { if (!ctx || !src0 || !src1 || !dst) { @@ -47,6 +31,27 @@ bool qnn_is_valid_params(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, return true; } +} // namespace + +#define CHECK_PARAMS(ctx, src0, src1, dst) \ + do { \ + if (!qnn_is_valid_params((ctx), (src0), (src1), (dst))) { \ + return; \ + } \ + } while (0) + +#else +#define CHECK_PARAMS(ctx, src0, src1, dst) +#endif + +namespace { + +void print_ggml_tensor(const ggml_tensor *tensor) { + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + tensor->name, tensor->type, ggml_type_name(tensor->type), tensor->ne[0], tensor->ne[1], tensor->ne[2], + tensor->nb[0], tensor->nb[1], tensor->nb[2]); +} + template bool qnn_bind_tensors_to_graph(qnn::ggml_qnn_graph<_InputSize, _OutputSize> *graph, const std::string &op_name, const std::array &inputs, From 7cbc4fbd8c9bd15513d0b47e1fe88e722bd863d5 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 12 Jul 2024 23:26:38 +0800 Subject: [PATCH 062/143] add mul --- tests/ggml-qnn/ggml-qnn-ut.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/ggml-qnn/ggml-qnn-ut.cpp b/tests/ggml-qnn/ggml-qnn-ut.cpp index 0c3fbf71ebdbf..96dfa2bcfe27e 100644 --- a/tests/ggml-qnn/ggml-qnn-ut.cpp +++ b/tests/ggml-qnn/ggml-qnn-ut.cpp @@ -507,6 +507,7 @@ static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) { static const std::unordered_map kMapStringToGGMLOp = { {"GGML_OP_ADD", GGML_OP_ADD}, {"GGML_OP_MUL_MAT", GGML_OP_MUL_MAT}, + {"GGML_OP_MUL", GGML_OP_MUL}, }; int main(int argc, char * argv[]) { From 100ccd5e7fb5bafa92d57ea87108461f91bcfcc6 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Sat, 13 Jul 2024 00:06:58 +0800 Subject: [PATCH 063/143] add unary op template and more ops --- ggml/src/ggml-qnn.cpp | 37 ++--- ggml/src/ggml-qnn/backend-ops.cpp | 243 ++++++++++++++++++++++++------ ggml/src/ggml-qnn/backend-ops.hpp | 11 +- ggml/src/ggml-qnn/backend.hpp | 8 +- 4 files changed, 225 insertions(+), 74 deletions(-) diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index 3584c41120ae6..de1fefe497e58 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -1,6 +1,5 @@ #include "ggml-qnn.h" -#include #include #include #include @@ -15,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -142,7 +142,8 @@ struct ggml_backend_qnn_buffer_type_context { // ================================================================================================= static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context *ctx, const struct ggml_tensor *tensor, bool b_dump_tensor_info) { - if (ggml_is_empty(tensor) || !qnn::ggml_qnn_op_array()[tensor->op]) { + if (ggml_is_empty(tensor) || + (!qnn::ggml_qnn_unary_op_array()[tensor->op] && !qnn::ggml_qnn_binary_op_array()[tensor->op])) { return false; } @@ -161,19 +162,6 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context *ctx, const struct g return false; } - // TODO: support 
other GGML OPs using QNN API - // a GENERAL approach could fix this problem in a standalone PR of refine ggml backend - // subsystem for hybrid inference between CPU&GPU / CPU&NPU easily(less the 100 LoC and no - // side-effect to the existing codes) for ANY ggml backends which the backend's - // ggml_backend_xxx_buffer_is_host return true. this approach could be found at: - // https://github.com/ggerganov/llama.cpp/pull/7641 - bool supported_op = false; - supported_op = (tensor->op == GGML_OP_ADD); - supported_op = ((tensor->op == GGML_OP_ADD) || (tensor->op == GGML_OP_MUL_MAT)); - if (!supported_op) { - return false; - } - // TODO: support other quantized data type if (ggml_is_quantized(src0->type)) { if (src0->type != GGML_TYPE_Q8_0 && src0->type != GGML_TYPE_Q4_0) { @@ -192,14 +180,18 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context *ctx, const struct g } bool ggml_qnn_compute_forward(ggml_backend_qnn_context *ctx, struct ggml_tensor *tensor) { - auto func = qnn::ggml_qnn_op_array()[tensor->op]; - if (!func) { - QNN_LOG_WARN("unsupported op %d", tensor->op); - return false; + auto unary_op = qnn::ggml_qnn_unary_op_array()[tensor->op]; + if (unary_op) { + return unary_op(ctx, tensor->src[0], tensor); } - func(ctx, tensor->src[0], tensor->src[1], tensor); - return true; + auto binary_op = qnn::ggml_qnn_binary_op_array()[tensor->op]; + if (binary_op) { + return binary_op(ctx, tensor->src[0], tensor->src[1], tensor); + } + + QNN_LOG_WARN("unsupported op %d", tensor->op); + return false; } static const char *ggml_backend_qnn_buffer_get_name(ggml_backend_buffer_t buffer) { @@ -232,7 +224,7 @@ GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t QNN_LOG_WARN("Create ggml_qnn_tensor failed"); return; } - + ctx->tensors.push_back(std::move(qnn_tensor)); } @@ -343,6 +335,7 @@ GGML_CALL static void ggml_backend_qnn_free(ggml_backend_t backend) { auto instance = g_qnn_mgr[ctx->device].instance; if (instance) { + ctx->qnn_unary_graph_cache.clear(); for (const auto &graph_item : ctx->qnn_binary_graph_cache) { QNN_LOG_INFO("graph type:%s", graph_item.first.c_str()); } diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 30f2e402cf16f..a516d8b06c344 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -12,6 +12,23 @@ namespace { +bool qnn_is_valid_params(ggml_backend_qnn_context *ctx, const ggml_tensor *src, ggml_tensor *dst) { + if (!ctx || !src || !dst) { + QNN_LOG_WARN("invalid params\n"); + return false; + } + + auto instance = ctx->instance; + auto *tensor0 = qnn::ggml_qnn_tensor::from_ggml_tensor(src); + auto *tensor1 = qnn::ggml_qnn_tensor::from_ggml_tensor(dst); + if (!instance || !tensor0 || !tensor1) { + QNN_LOG_WARN("invalid tensors\n"); + return false; + } + + return true; +} + bool qnn_is_valid_params(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst) { if (!ctx || !src0 || !src1 || !dst) { @@ -33,15 +50,13 @@ bool qnn_is_valid_params(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, } // namespace -#define CHECK_PARAMS(ctx, src0, src1, dst) \ - do { \ - if (!qnn_is_valid_params((ctx), (src0), (src1), (dst))) { \ - return; \ - } \ - } while (0) +#define CHECK_PARAMS(ctx, ...) \ + if (!qnn_is_valid_params((ctx), __VA_ARGS__)) { \ + return false; \ + } #else -#define CHECK_PARAMS(ctx, src0, src1, dst) +#define CHECK_PARAMS(ctx, ...) 
#endif namespace { @@ -125,15 +140,33 @@ bool execute_graph(qnn::ggml_qnn_graph<_InputSize, _OutputSize> *graph, return true; } +qnn::ggml_qnn_unary_graph_cache_t &get_qnn_graph_cache(ggml_backend_qnn_context *ctx, + const std::array &inputs, + const std::array &outputs) { + GGML_UNUSED(inputs); + GGML_UNUSED(outputs); + return ctx->qnn_unary_graph_cache; +} + +qnn::ggml_qnn_binary_graph_cache_t &get_qnn_graph_cache(ggml_backend_qnn_context *ctx, + const std::array &inputs, + const std::array &outputs) { + GGML_UNUSED(inputs); + GGML_UNUSED(outputs); + return ctx->qnn_binary_graph_cache; +} + template -qnn::ggml_qnn_graph_binary *get_qnn_graph_from_cache(ggml_backend_qnn_context *ctx, ggml_op op, - const std::string &qnn_op, - const std::array &inputs, - const std::array &outputs) { +qnn::ggml_qnn_graph<_InputSize, _OutputSize> *get_qnn_graph_from_cache( + ggml_backend_qnn_context *ctx, ggml_op op, const std::string &qnn_op, + const std::array &inputs, const std::array &outputs) { + using graph_t = qnn::ggml_qnn_graph<_InputSize, _OutputSize>; + + auto &graph_cache = get_qnn_graph_cache(ctx, inputs, outputs); const std::string graph_key(ggml_op_name(op)); - auto it = ctx->qnn_binary_graph_cache.find(graph_key); - qnn::ggml_qnn_graph_binary *graph_ptr = nullptr; - if (it != ctx->qnn_binary_graph_cache.end()) { + auto it = graph_cache.find(graph_key); + graph_t *graph_ptr = nullptr; + if (it != graph_cache.end()) { graph_ptr = it->second.get(); } else { std::string graph_name = graph_key + "_" + std::to_string(ctx->threads); @@ -141,49 +174,49 @@ qnn::ggml_qnn_graph_binary *get_qnn_graph_from_cache(ggml_backend_qnn_context *c graph_name += "_"; graph_name += input->name; } - auto graph = std::make_unique(graph_name, (QNNBackend)(ctx->device), - ctx->instance->get_qnn_context_handle(), - ctx->raw_interface, ctx->socinfo.vtcm_size_in_mb); + auto graph = + std::make_unique(graph_name, (QNNBackend)(ctx->device), ctx->instance->get_qnn_context_handle(), + ctx->raw_interface, ctx->socinfo.vtcm_size_in_mb); if (!graph->is_valid()) { return nullptr; } - if (!qnn_bind_tensors_to_graph<2, 1>(graph.get(), qnn_op.c_str(), inputs, outputs)) { + if (!qnn_bind_tensors_to_graph<_InputSize, _OutputSize>(graph.get(), qnn_op.c_str(), inputs, outputs)) { return nullptr; } graph_ptr = graph.get(); - ctx->qnn_binary_graph_cache[graph_key] = std::move(graph); + graph_cache[graph_key] = std::move(graph); } return graph_ptr; } constexpr const char *kGgmlOpToQnnOp[] = { - nullptr, // GGML_OP_NONE - nullptr, // GGML_OP_DUP - QNN_OP_ELEMENT_WISE_ADD, // GGML_OP_ADD - nullptr, // GGML_OP_ADD1 - nullptr, // GGML_OP_ACC - nullptr, // GGML_OP_SUB - QNN_OP_ELEMENT_WISE_MULTIPLY, // GGML_OP_MUL - nullptr, // GGML_OP_DIV - nullptr, // GGML_OP_SQR - nullptr, // GGML_OP_SQRT - nullptr, // GGML_OP_LOG - nullptr, // GGML_OP_SUM - nullptr, // GGML_OP_SUM_ROWS - nullptr, // GGML_OP_MEAN - nullptr, // GGML_OP_ARGMAX - nullptr, // GGML_OP_REPEAT - nullptr, // GGML_OP_REPEAT_BACK - nullptr, // GGML_OP_CONCAT - nullptr, // GGML_OP_SILU_BACK - nullptr, // GGML_OP_NORM - nullptr, // GGML_OP_RMS_NORM - nullptr, // GGML_OP_RMS_NORM_BACK - nullptr, // GGML_OP_GROUP_NORM + nullptr, // GGML_OP_NONE + nullptr, // GGML_OP_DUP + QNN_OP_ELEMENT_WISE_ADD, // GGML_OP_ADD + nullptr, // GGML_OP_ADD1 + nullptr, // GGML_OP_ACC + QNN_OP_ELEMENT_WISE_SUBTRACT, // GGML_OP_SUB + QNN_OP_ELEMENT_WISE_MULTIPLY, // GGML_OP_MUL + QNN_OP_ELEMENT_WISE_DIVIDE, // GGML_OP_DIV + nullptr, // GGML_OP_SQR + QNN_OP_ELEMENT_WISE_SQUARE_ROOT, // GGML_OP_SQRT + nullptr, // 
GGML_OP_LOG + nullptr, // GGML_OP_SUM + nullptr, // GGML_OP_SUM_ROWS + nullptr, // GGML_OP_MEAN + nullptr, // GGML_OP_ARGMAX + nullptr, // GGML_OP_REPEAT + nullptr, // GGML_OP_REPEAT_BACK + nullptr, // GGML_OP_CONCAT + nullptr, // GGML_OP_SILU_BACK + nullptr, // GGML_OP_NORM + nullptr, // GGML_OP_RMS_NORM + nullptr, // GGML_OP_RMS_NORM_BACK + nullptr, // GGML_OP_GROUP_NORM QNN_OP_MAT_MUL, // GGML_OP_MUL_MAT nullptr, // GGML_OP_MUL_MAT_ID @@ -249,7 +282,7 @@ static_assert(sizeof(kGgmlOpToQnnOp) / sizeof(kGgmlOpToQnnOp[0]) == GGML_OP_COUN "GGML_OP_COUNT does not match the size of the ops table"); template -void qnn_binary_op_impl(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, +bool qnn_binary_op_impl(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst) { static_assert(kGgmlOpToQnnOp[_GgmlOp] != nullptr, "GGML_OP does not have a corresponding QNN_OP"); @@ -270,21 +303,137 @@ void qnn_binary_op_impl(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, print_ggml_tensor(src1); print_ggml_tensor(dst); } + + return succeed; +} + +template +bool qnn_unary_op_impl(ggml_backend_qnn_context *ctx, const ggml_tensor *src, ggml_tensor *dst) { + static_assert(kGgmlOpToQnnOp[_GgmlOp] != nullptr, "GGML_OP does not have a corresponding QNN_OP"); + + CHECK_PARAMS(ctx, src, dst); + + qnn::qnn_perf perf(ggml_op_name(_GgmlOp)); + perf.start(); + + bool succeed = false; + auto *graph_ptr = get_qnn_graph_from_cache<1, 1>(ctx, _GgmlOp, kGgmlOpToQnnOp[_GgmlOp], { src }, { dst }); + if (graph_ptr) { + succeed = execute_graph<1, 1>(graph_ptr, { src }, { dst }); + } + + if (!succeed) { + print_ggml_tensor(src); + print_ggml_tensor(dst); + } + + return succeed; } } // namespace -qnn::ggml_qnn_op_array_t qnn::ggml_qnn_op_array() { - static constexpr const qnn::ggml_qnn_op_t kQnnOpsTable[] = { +qnn::ggml_qnn_unary_op_array_t qnn::ggml_qnn_unary_op_array() { + static constexpr const qnn::ggml_qnn_unary_op_t kQnnOpsTable[] = { nullptr, // GGML_OP_NONE nullptr, // GGML_OP_DUP - qnn_binary_op_impl, // GGML_OP_ADD + nullptr, // GGML_OP_ADD nullptr, // GGML_OP_ADD1 nullptr, // GGML_OP_ACC nullptr, // GGML_OP_SUB - qnn_binary_op_impl, // GGML_OP_MUL + nullptr, // GGML_OP_MUL nullptr, // GGML_OP_DIV nullptr, // GGML_OP_SQR + qnn_unary_op_impl, // GGML_OP_SQRT + nullptr, // GGML_OP_LOG + nullptr, // GGML_OP_SUM + nullptr, // GGML_OP_SUM_ROWS + nullptr, // GGML_OP_MEAN + nullptr, // GGML_OP_ARGMAX + nullptr, // GGML_OP_REPEAT + nullptr, // GGML_OP_REPEAT_BACK + nullptr, // GGML_OP_CONCAT + nullptr, // GGML_OP_SILU_BACK + nullptr, // GGML_OP_NORM + nullptr, // GGML_OP_RMS_NORM + nullptr, // GGML_OP_RMS_NORM_BACK + nullptr, // GGML_OP_GROUP_NORM + + nullptr, // GGML_OP_MUL_MAT + nullptr, // GGML_OP_MUL_MAT_ID + nullptr, // GGML_OP_OUT_PROD + + nullptr, // GGML_OP_SCALE + nullptr, // GGML_OP_SET + nullptr, // GGML_OP_CPY + nullptr, // GGML_OP_CONT + nullptr, // GGML_OP_RESHAPE + nullptr, // GGML_OP_VIEW + nullptr, // GGML_OP_PERMUTE + nullptr, // GGML_OP_TRANSPOSE + nullptr, // GGML_OP_GET_ROWS + nullptr, // GGML_OP_GET_ROWS_BACK + nullptr, // GGML_OP_DIAG + nullptr, // GGML_OP_DIAG_MASK_INF + nullptr, // GGML_OP_DIAG_MASK_ZERO + nullptr, // GGML_OP_SOFT_MAX + nullptr, // GGML_OP_SOFT_MAX_BACK + nullptr, // GGML_OP_ROPE + nullptr, // GGML_OP_ROPE_BACK + nullptr, // GGML_OP_CLAMP + nullptr, // GGML_OP_CONV_TRANSPOSE_1D + nullptr, // GGML_OP_IM2COL + nullptr, // GGML_OP_CONV_TRANSPOSE_2D + nullptr, // GGML_OP_POOL_1D + nullptr, // GGML_OP_POOL_2D + 
nullptr, // GGML_OP_UPSCALE + nullptr, // GGML_OP_PAD + nullptr, // GGML_OP_ARANGE + nullptr, // GGML_OP_TIMESTEP_EMBEDDING + nullptr, // GGML_OP_ARGSORT + nullptr, // GGML_OP_LEAKY_RELU + + nullptr, // GGML_OP_FLASH_ATTN_EXT + nullptr, // GGML_OP_FLASH_ATTN_BACK + nullptr, // GGML_OP_SSM_CONV + nullptr, // GGML_OP_SSM_SCAN + nullptr, // GGML_OP_WIN_PART + nullptr, // GGML_OP_WIN_UNPART + nullptr, // GGML_OP_GET_REL_POS + nullptr, // GGML_OP_ADD_REL_POS + + nullptr, // GGML_OP_UNARY + + nullptr, // GGML_OP_MAP_UNARY + nullptr, // GGML_OP_MAP_BINARY + + nullptr, // GGML_OP_MAP_CUSTOM1_F32 + nullptr, // GGML_OP_MAP_CUSTOM2_F32 + nullptr, // GGML_OP_MAP_CUSTOM3_F32 + + nullptr, // GGML_OP_MAP_CUSTOM1 + nullptr, // GGML_OP_MAP_CUSTOM2 + nullptr, // GGML_OP_MAP_CUSTOM3 + + nullptr, // GGML_OP_CROSS_ENTROPY_LOSS + nullptr, // GGML_OP_CROSS_ENTROPY_LOSS_BACK + }; + + static_assert(sizeof(kQnnOpsTable) / sizeof(kQnnOpsTable[0]) == GGML_OP_COUNT, + "GGML_OP_COUNT does not match the size of the ops table"); + return kQnnOpsTable; +} + +qnn::ggml_qnn_binary_op_array_t qnn::ggml_qnn_binary_op_array() { + static constexpr const qnn::ggml_qnn_binary_op_t kQnnOpsTable[] = { + nullptr, // GGML_OP_NONE + nullptr, // GGML_OP_DUP + qnn_binary_op_impl, // GGML_OP_ADD + nullptr, // GGML_OP_ADD1 + nullptr, // GGML_OP_ACC + qnn_binary_op_impl, // GGML_OP_SUB + qnn_binary_op_impl, // GGML_OP_MUL + qnn_binary_op_impl, // GGML_OP_DIV + nullptr, // GGML_OP_SQR nullptr, // GGML_OP_SQRT nullptr, // GGML_OP_LOG nullptr, // GGML_OP_SUM diff --git a/ggml/src/ggml-qnn/backend-ops.hpp b/ggml/src/ggml-qnn/backend-ops.hpp index 01c23ecff9b16..8d94fc6c25424 100644 --- a/ggml/src/ggml-qnn/backend-ops.hpp +++ b/ggml/src/ggml-qnn/backend-ops.hpp @@ -6,11 +6,14 @@ namespace qnn { -typedef void (*ggml_qnn_op_t)(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst); +typedef bool (*ggml_qnn_unary_op_t)(ggml_backend_qnn_context *ctx, const ggml_tensor *src, ggml_tensor *dst); +typedef bool (*ggml_qnn_binary_op_t)(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst); -typedef const ggml_qnn_op_t (&ggml_qnn_op_array_t)[GGML_OP_COUNT]; +typedef const ggml_qnn_unary_op_t (&ggml_qnn_unary_op_array_t)[GGML_OP_COUNT]; +typedef const ggml_qnn_binary_op_t (&ggml_qnn_binary_op_array_t)[GGML_OP_COUNT]; -ggml_qnn_op_array_t ggml_qnn_op_array(); +ggml_qnn_unary_op_array_t ggml_qnn_unary_op_array(); +ggml_qnn_binary_op_array_t ggml_qnn_binary_op_array(); } // namespace qnn diff --git a/ggml/src/ggml-qnn/backend.hpp b/ggml/src/ggml-qnn/backend.hpp index 48b243577ca1f..0ec927779cc31 100644 --- a/ggml/src/ggml-qnn/backend.hpp +++ b/ggml/src/ggml-qnn/backend.hpp @@ -11,6 +11,11 @@ #include "graph.hpp" #include "qnn.hpp" +namespace qnn { +typedef std::unordered_map> ggml_qnn_unary_graph_cache_t; +typedef std::unordered_map> ggml_qnn_binary_graph_cache_t; +} // namespace qnn + struct ggml_backend_qnn_context { int device; int threads; @@ -21,5 +26,6 @@ struct ggml_backend_qnn_context { QNN_INTERFACE_VER_TYPE raw_interface; QNN_SYSTEM_INTERFACE_VER_TYPE raw_system_interface; qnn::qcom_socinfo socinfo; - std::unordered_map> qnn_binary_graph_cache; + qnn::ggml_qnn_unary_graph_cache_t qnn_unary_graph_cache; + qnn::ggml_qnn_binary_graph_cache_t qnn_binary_graph_cache; }; From c1e2283887c3fb6d09b2b3fdedd3847f1060ddfa Mon Sep 17 00:00:00 2001 From: hongruichen Date: Sat, 13 Jul 2024 10:55:36 +0800 Subject: [PATCH 064/143] expose op at unit test --- 
tests/ggml-qnn/ggml-qnn-ut.cpp | 50 ++++++++++++++++++++++++++++++---- 1 file changed, 44 insertions(+), 6 deletions(-) diff --git a/tests/ggml-qnn/ggml-qnn-ut.cpp b/tests/ggml-qnn/ggml-qnn-ut.cpp index 96dfa2bcfe27e..dea336966061c 100644 --- a/tests/ggml-qnn/ggml-qnn-ut.cpp +++ b/tests/ggml-qnn/ggml-qnn-ut.cpp @@ -327,21 +327,51 @@ static void show_usage() { ); } +typedef ggml_tensor * (*ggml_op_unary_t)( + ggml_context * ctx, + ggml_tensor * a); typedef ggml_tensor * (*ggml_op_binary_t)( ggml_context * ctx, ggml_tensor * a, ggml_tensor * b); +static constexpr const ggml_op_unary_t kUnaryOps[] = { + nullptr, // GGML_OP_NONE + nullptr, // GGML_OP_DUP + nullptr, // GGML_OP_ADD + nullptr, // GGML_OP_ADD1 + nullptr, // GGML_OP_ACC + nullptr, // GGML_OP_SUB + nullptr, // GGML_OP_MUL + nullptr, // GGML_OP_DIV + nullptr, // GGML_OP_SQR + ggml_sqrt, // GGML_OP_SQRT + nullptr, // GGML_OP_LOG + nullptr, // GGML_OP_SUM + nullptr, // GGML_OP_SUM_ROWS + nullptr, // GGML_OP_MEAN + nullptr, // GGML_OP_ARGMAX + nullptr, // GGML_OP_REPEAT + nullptr, // GGML_OP_REPEAT_BACK + nullptr, // GGML_OP_CONCAT + nullptr, // GGML_OP_SILU_BACK + nullptr, // GGML_OP_NORM + nullptr, // GGML_OP_RMS_NORM + nullptr, // GGML_OP_RMS_NORM_BACK + nullptr, // GGML_OP_GROUP_NORM + nullptr, // GGML_OP_MUL_MAT +}; + static constexpr const ggml_op_binary_t kBinaryOps[] = { nullptr, // GGML_OP_NONE nullptr, // GGML_OP_DUP ggml_add, // GGML_OP_ADD nullptr, // GGML_OP_ADD1 nullptr, // GGML_OP_ACC - nullptr, // GGML_OP_SUB + ggml_sub, // GGML_OP_SUB ggml_mul, // GGML_OP_MUL - nullptr, // GGML_OP_DIV + ggml_div, // GGML_OP_DIV nullptr, // GGML_OP_SQR nullptr, // GGML_OP_SQRT nullptr, // GGML_OP_LOG @@ -433,8 +463,11 @@ static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) { ggml_set_input(src0); ggml_set_input(src1); + auto unary_op = kUnaryOps[n_ggml_op_type]; auto binary_op = kBinaryOps[n_ggml_op_type]; - if (binary_op) { + if (unary_op) { + dst = unary_op(ctx, src0); + } else if (binary_op) { dst = binary_op(ctx, src0, src1); } else { QNN_LOG_WARN("ggml op %d(%s) not supported", n_ggml_op_type, @@ -504,10 +537,15 @@ static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) { return 0; } +#define DEFINE_OP(op) { #op, op } + static const std::unordered_map kMapStringToGGMLOp = { - {"GGML_OP_ADD", GGML_OP_ADD}, - {"GGML_OP_MUL_MAT", GGML_OP_MUL_MAT}, - {"GGML_OP_MUL", GGML_OP_MUL}, + DEFINE_OP(GGML_OP_ADD), + DEFINE_OP(GGML_OP_SUB), + DEFINE_OP(GGML_OP_MUL), + DEFINE_OP(GGML_OP_DIV), + DEFINE_OP(GGML_OP_SQRT), + DEFINE_OP(GGML_OP_MUL_MAT), }; int main(int argc, char * argv[]) { From 148ceab70c8b08a93345819b817cf08e19a5316a Mon Sep 17 00:00:00 2001 From: hongruichen Date: Sun, 14 Jul 2024 22:57:09 +0800 Subject: [PATCH 065/143] add log op --- ggml/src/ggml-qnn/backend-ops.cpp | 4 ++-- tests/ggml-qnn/ggml-qnn-ut.cpp | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index a516d8b06c344..711f707531228 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -204,7 +204,7 @@ constexpr const char *kGgmlOpToQnnOp[] = { QNN_OP_ELEMENT_WISE_DIVIDE, // GGML_OP_DIV nullptr, // GGML_OP_SQR QNN_OP_ELEMENT_WISE_SQUARE_ROOT, // GGML_OP_SQRT - nullptr, // GGML_OP_LOG + QNN_OP_ELEMENT_WISE_LOG, // GGML_OP_LOG nullptr, // GGML_OP_SUM nullptr, // GGML_OP_SUM_ROWS nullptr, // GGML_OP_MEAN @@ -344,7 +344,7 @@ qnn::ggml_qnn_unary_op_array_t qnn::ggml_qnn_unary_op_array() { nullptr, // 
GGML_OP_DIV nullptr, // GGML_OP_SQR qnn_unary_op_impl, // GGML_OP_SQRT - nullptr, // GGML_OP_LOG + qnn_unary_op_impl, // GGML_OP_LOG nullptr, // GGML_OP_SUM nullptr, // GGML_OP_SUM_ROWS nullptr, // GGML_OP_MEAN diff --git a/tests/ggml-qnn/ggml-qnn-ut.cpp b/tests/ggml-qnn/ggml-qnn-ut.cpp index dea336966061c..59e561f130e75 100644 --- a/tests/ggml-qnn/ggml-qnn-ut.cpp +++ b/tests/ggml-qnn/ggml-qnn-ut.cpp @@ -347,7 +347,7 @@ static constexpr const ggml_op_unary_t kUnaryOps[] = { nullptr, // GGML_OP_DIV nullptr, // GGML_OP_SQR ggml_sqrt, // GGML_OP_SQRT - nullptr, // GGML_OP_LOG + ggml_log, // GGML_OP_LOG nullptr, // GGML_OP_SUM nullptr, // GGML_OP_SUM_ROWS nullptr, // GGML_OP_MEAN @@ -546,6 +546,7 @@ static const std::unordered_map kMapStringToGGMLOp = { DEFINE_OP(GGML_OP_DIV), DEFINE_OP(GGML_OP_SQRT), DEFINE_OP(GGML_OP_MUL_MAT), + DEFINE_OP(GGML_OP_LOG), }; int main(int argc, char * argv[]) { From 30b40006cc3371ef7c5b4d43b44a5a4d4ec3d907 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Sun, 14 Jul 2024 23:50:11 +0800 Subject: [PATCH 066/143] remove unused declarations --- ggml/src/ggml-qnn.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index de1fefe497e58..f1de6b18591c6 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -142,7 +142,7 @@ struct ggml_backend_qnn_buffer_type_context { // ================================================================================================= static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context *ctx, const struct ggml_tensor *tensor, bool b_dump_tensor_info) { - if (ggml_is_empty(tensor) || + if (ggml_is_empty(tensor) || (!qnn::ggml_qnn_unary_op_array()[tensor->op] && !qnn::ggml_qnn_binary_op_array()[tensor->op])) { return false; } @@ -569,9 +569,7 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char *qnn_lib_path) { return qnn_backend; } -extern "C" GGML_CALL int ggml_backend_qnn_reg_devices(void); - -GGML_CALL int ggml_backend_qnn_reg_devices() { +int ggml_backend_qnn_reg_devices() { for (size_t idx = 0; idx < GGML_QNN_MAX_DEVICES; idx++) { char name[GGML_MAX_NAME]; ggml_backend_qnn_get_device_description(idx, name, GGML_MAX_NAME); From c46b4deea9bbf02000a355f37fcddc27a3a0ad76 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Mon, 15 Jul 2024 10:23:12 +0800 Subject: [PATCH 067/143] [unit test] init all tensor by one function --- tests/ggml-qnn/ggml-qnn-ut.cpp | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/tests/ggml-qnn/ggml-qnn-ut.cpp b/tests/ggml-qnn/ggml-qnn-ut.cpp index 59e561f130e75..f19a6355d30fe 100644 --- a/tests/ggml-qnn/ggml-qnn-ut.cpp +++ b/tests/ggml-qnn/ggml-qnn-ut.cpp @@ -80,11 +80,11 @@ static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const static const char * get_qnn_backend_name(int n_backend_type) { switch (n_backend_type) { - case 0: + case QNN_BACKEND_CPU: return "QNN-CPU"; - case 1: + case QNN_BACKEND_GPU: return "QNN-GPU"; - case 2: + case QNN_BACKEND_NPU: return "QNN-NPU"; case 3: return "ggml"; @@ -494,16 +494,7 @@ static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) { gf = ggml_new_graph(ctx); ggml_build_forward_expand(gf, dst); - if (n_backend_type != QNN_BACKEND_GGML) { - initialize_tensors(ctx); - } else { - if (qtype == GGML_TYPE_F32) { - ggml_set_f32(src0, 2.f); - } else { - initialize_tensors(ctx); - } - ggml_set_f32(src1, 3.f); - } + initialize_tensors(ctx); ggml_graph_compute_helper(backend, gf, work_buffer, num_threads, 
nullptr, nullptr); From 4410fd65630be9f782fe0d2e484de08ada18dcaa Mon Sep 17 00:00:00 2001 From: hongruichen Date: Mon, 15 Jul 2024 10:30:57 +0800 Subject: [PATCH 068/143] format with clang-format --- tests/ggml-qnn/ggml-qnn-ut.cpp | 371 ++++++++++++++++----------------- 1 file changed, 174 insertions(+), 197 deletions(-) diff --git a/tests/ggml-qnn/ggml-qnn-ut.cpp b/tests/ggml-qnn/ggml-qnn-ut.cpp index f19a6355d30fe..fefb262445dc0 100644 --- a/tests/ggml-qnn/ggml-qnn-ut.cpp +++ b/tests/ggml-qnn/ggml-qnn-ut.cpp @@ -1,67 +1,67 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include #include #include -#include +#include #include +#include #include -#include +#include +#include +#include +#include +#include +#include #include +#include +#include -#include -#include -#include -#include -#include -#include -#include -#include +#include +#include +#include #include -#include +#include #include -#include -#include +#include +#include #include -#include +#include +#include #include -#include +#include +#include +#include +#include +#include +#include #include -#include -#include #include #include +#include #include "ggml.h" + #include "ggml-alloc.h" #include "ggml-backend.h" #include "ggml-qnn.h" -#define GGML_QNN_DEBUG 1 +#define GGML_QNN_DEBUG 1 #define GGML_QNN_LOGBUF_LEN 4096 -#define QNN_LOG_ERROR(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) -#define QNN_LOG_WARN(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) -#define QNN_LOG_INFO(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#define QNN_LOG_ERROR(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#define QNN_LOG_WARN(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#define QNN_LOG_INFO(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) #if GGML_QNN_DEBUG -#define QNN_LOG_DEBUG(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#define QNN_LOG_DEBUG(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) #else #define QNN_LOG_DEBUG(...) #endif -static void tensor_dump(const ggml_tensor * tensor, const char * name); +static void tensor_dump(const ggml_tensor *tensor, const char *name); #define TENSOR_DUMP(tensor) tensor_dump(tensor, #tensor) -static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...) { +static void ggml_qnn_log_internal(ggml_log_level level, const char *file, const char *func, int line, + const char *format, ...) 
{ static std::mutex ggml_qnn_log_internal_mutex; static char s_ggml_qnn_log_internal_buf[GGML_QNN_LOGBUF_LEN]; @@ -78,7 +78,7 @@ static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const } } -static const char * get_qnn_backend_name(int n_backend_type) { +static const char *get_qnn_backend_name(int n_backend_type) { switch (n_backend_type) { case QNN_BACKEND_CPU: return "QNN-CPU"; @@ -93,13 +93,9 @@ static const char * get_qnn_backend_name(int n_backend_type) { } } -static bool ggml_graph_compute_helper( - struct ggml_backend * backend, - struct ggml_cgraph * graph, - std::vector & buf, - int n_threads, - ggml_abort_callback abort_callback, - void * abort_callback_data) { +static bool ggml_graph_compute_helper(struct ggml_backend *backend, struct ggml_cgraph *graph, + std::vector &buf, int n_threads, ggml_abort_callback abort_callback, + void *abort_callback_data) { struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); plan.abort_callback = abort_callback; @@ -129,8 +125,8 @@ static bool ggml_graph_compute_helper( #define QK8_0 32 typedef struct { - uint16_t d; // delta - int8_t qs[QK8_0]; // quants + uint16_t d; // delta + int8_t qs[QK8_0]; // quants } block_q8_0; static inline float ggml_compute_fp16_to_fp32(uint16_t h) { @@ -141,12 +137,11 @@ static inline float ggml_compute_fp16_to_fp32(uint16_t h) { #define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) -static void tensor_dump(const ggml_tensor * tensor, const char * name) { - QNN_LOG_DEBUG("dump ggml tensor %s(%s): type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - name, tensor->name, - tensor->type, ggml_type_name(tensor->type), - tensor->ne[0], tensor->ne[1], tensor->ne[2], - tensor->nb[0], tensor->nb[1], tensor->nb[2]); +static void tensor_dump(const ggml_tensor *tensor, const char *name) { + QNN_LOG_DEBUG("dump ggml tensor %s(%s): type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 + ", nb = (%5zi, %5zi, %5zi)\n", + name, tensor->name, tensor->type, ggml_type_name(tensor->type), tensor->ne[0], tensor->ne[1], + tensor->ne[2], tensor->nb[0], tensor->nb[1], tensor->nb[2]); float value = 0; std::ostringstream tmposs; @@ -160,10 +155,8 @@ static void tensor_dump(const ggml_tensor * tensor, const char * name) { for (int i = 0; i < tensor->ne[2]; i++) { for (int j = 0; j < tensor->ne[1]; j++) { for (int k = 0; k < tensor->ne[0]; k++) { - value = ((int8_t *) tensor->data)[h * tensor->ne[2] + i * tensor->ne[1] + - j * tensor->ne[0] + k]; - tmposs << std::setw(8) << std::fixed << std::setprecision(2) << value - << " "; + value = ((int8_t *)tensor->data)[h * tensor->ne[2] + i * tensor->ne[1] + j * tensor->ne[0] + k]; + tmposs << std::setw(8) << std::fixed << std::setprecision(2) << value << " "; } tmposs << "\n"; } @@ -181,10 +174,8 @@ static void tensor_dump(const ggml_tensor * tensor, const char * name) { for (int i = 0; i < tensor->ne[2]; i++) { for (int j = 0; j < tensor->ne[1]; j++) { for (int k = 0; k < tensor->ne[0]; k++) { - value = ((float *) tensor->data)[h * tensor->ne[2] + i * tensor->ne[1] + - j * tensor->ne[0] + k]; - tmposs << std::setw(8) << std::fixed << std::setprecision(2) << value - << " "; + value = ((float *)tensor->data)[h * tensor->ne[2] + i * tensor->ne[1] + j * tensor->ne[0] + k]; + tmposs << std::setw(8) << std::fixed << std::setprecision(2) << value << " "; } tmposs << "\n"; } @@ -202,11 +193,11 @@ static void tensor_dump(const ggml_tensor * tensor, const char * name) { for (int i = 0; i < tensor->ne[2]; i++) { for (int j = 0; 
j < tensor->ne[1]; j++) { for (int k = 0; k < tensor->ne[0]; k++) { - unsigned short tmpvalue = ((unsigned short *) tensor->data)[h * tensor->ne[2] + i * tensor->ne[1] + - j * tensor->ne[0] + k]; + unsigned short tmpvalue = + ((unsigned short *) + tensor->data)[h * tensor->ne[2] + i * tensor->ne[1] + j * tensor->ne[0] + k]; value = GGML_FP16_TO_FP32(tmpvalue); - tmposs << std::setw(8) << std::fixed << std::setprecision(2) << value - << " "; + tmposs << std::setw(8) << std::fixed << std::setprecision(2) << value << " "; } tmposs << "\n"; } @@ -220,15 +211,14 @@ static void tensor_dump(const ggml_tensor * tensor, const char * name) { } if (tensor->type == GGML_TYPE_Q8_0) { - block_q8_0 * tmp = ((block_q8_0 *)tensor->data); - for (int j = 0; j < tensor->ne[1]; j++) { - int n = tensor->ne[0] / QK8_0; //blocks per row + block_q8_0 *tmp = ((block_q8_0 *)tensor->data); + for (int j = 0; j < tensor->ne[1]; j++) { + int n = tensor->ne[0] / QK8_0; // blocks per row for (int z = 0; z < n; z++) { - const float d = GGML_FP16_TO_FP32(tmp[ j * n + z ].d); + const float d = GGML_FP16_TO_FP32(tmp[j * n + z].d); for (int k = 0; k < QK8_0; k++) { value = tmp[j * n + z].qs[k] * d; - tmposs << std::setw(8) << std::fixed << std::setprecision(2) << value - << " "; + tmposs << std::setw(8) << std::fixed << std::setprecision(2) << value << " "; } } tmposs << "\n"; @@ -241,7 +231,7 @@ static void tensor_dump(const ggml_tensor * tensor, const char * name) { } } -static uint32_t get_tensor_rank(const ggml_tensor * tensor) { +static uint32_t get_tensor_rank(const ggml_tensor *tensor) { uint32_t rank = 0; for (int i = 0; i < GGML_MAX_DIMS; i++) { if ((0 != tensor->ne[i]) && (1 != tensor->ne[i])) { @@ -251,7 +241,7 @@ static uint32_t get_tensor_rank(const ggml_tensor * tensor) { return rank; } -static uint32_t get_tensor_data_size(const ggml_tensor * tensor) { +static uint32_t get_tensor_data_size(const ggml_tensor *tensor) { size_t data_size = ggml_row_size(tensor->type, tensor->ne[0]); size_t n_dims = get_tensor_rank(tensor); for (int i = 1; i < n_dims; i++) { @@ -264,8 +254,8 @@ static uint32_t get_tensor_data_size(const ggml_tensor * tensor) { return ggml_nbytes(tensor); } -//ref: https://github.com/ggerganov/llama.cpp/blob/master/tests/test-backend-ops.cpp#L20 -static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float max = 1.0f) { +// ref: https://github.com/ggerganov/llama.cpp/blob/master/tests/test-backend-ops.cpp#L20 +static void init_tensor_uniform(ggml_tensor *tensor, float min = -1.0f, float max = 1.0f) { size_t size = ggml_nelements(tensor); std::vector data(size); for (size_t i = 0; i < size; i++) { @@ -274,7 +264,7 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_I32) { #ifdef GGML_USE_QNN - memcpy((char*)tensor->data, data.data(), size * sizeof(float)); + memcpy((char *)tensor->data, data.data(), size * sizeof(float)); #else ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float)); #endif @@ -282,25 +272,25 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m GGML_ASSERT(size % ggml_blck_size(tensor->type) == 0); std::vector dataq(ggml_row_size(tensor->type, size)); std::vector imatrix(tensor->ne[0], 1.0f); // dummy importance matrix - const float * im = imatrix.data(); + const float *im = imatrix.data(); if (!ggml_quantize_requires_imatrix(tensor->type)) { // when the imatrix is optional, we want to test both quantization with and 
without imatrix // use one of the random numbers to decide - if (data[0] > 0.5f*(min + max)) { + if (data[0] > 0.5f * (min + max)) { im = nullptr; } } - ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size/tensor->ne[0], tensor->ne[0], im); + ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size / tensor->ne[0], tensor->ne[0], im); GGML_ASSERT(ggml_validate_row_data(tensor->type, dataq.data(), dataq.size())); #ifdef GGML_USE_QNN - memcpy((char*)tensor->data, dataq.data(), dataq.size()); + memcpy((char *)tensor->data, dataq.data(), dataq.size()); #else ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size()); #endif } else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) { // This is going to create some weird integers though. #ifdef GGML_USE_QNN - memcpy((char*)tensor->data, data.data(), ggml_nbytes(tensor)); + memcpy((char *)tensor->data, data.data(), ggml_nbytes(tensor)); #else ggml_backend_tensor_set(tensor, data.data(), 0, ggml_nbytes(tensor)); #endif @@ -309,125 +299,117 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m } } -//ref: https://github.com/ggerganov/llama.cpp/blob/master/tests/test-backend-ops.cpp#L310 -static void initialize_tensors(ggml_context * ctx) { - for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { +// ref: https://github.com/ggerganov/llama.cpp/blob/master/tests/test-backend-ops.cpp#L310 +static void initialize_tensors(ggml_context *ctx) { + for (ggml_tensor *t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { init_tensor_uniform(t); } } static void show_usage() { - printf(" " \ - "\nUsage: test_qnn_ops [options]\n" \ - "\n" \ - "Options:\n" \ - " -t GGML_OP_ADD / GGML_OP_MULMAT\n" \ - " -b 0(QNN_CPU) 1(QNN_GPU) 2(QNN_NPU) 3(ggml)\n" \ - " ?/h print usage infomation\n\n" - ); + printf( + " " + "\nUsage: test_qnn_ops [options]\n" + "\n" + "Options:\n" + " -t GGML_OP_ADD / GGML_OP_MULMAT\n" + " -b 0(QNN_CPU) 1(QNN_GPU) 2(QNN_NPU) 3(ggml)\n" + " ?/h print usage infomation\n\n"); } -typedef ggml_tensor * (*ggml_op_unary_t)( - ggml_context * ctx, - ggml_tensor * a); +typedef ggml_tensor *(*ggml_op_unary_t)(ggml_context *ctx, ggml_tensor *a); -typedef ggml_tensor * (*ggml_op_binary_t)( - ggml_context * ctx, - ggml_tensor * a, - ggml_tensor * b); +typedef ggml_tensor *(*ggml_op_binary_t)(ggml_context *ctx, ggml_tensor *a, ggml_tensor *b); static constexpr const ggml_op_unary_t kUnaryOps[] = { - nullptr, // GGML_OP_NONE - nullptr, // GGML_OP_DUP - nullptr, // GGML_OP_ADD - nullptr, // GGML_OP_ADD1 - nullptr, // GGML_OP_ACC - nullptr, // GGML_OP_SUB - nullptr, // GGML_OP_MUL - nullptr, // GGML_OP_DIV - nullptr, // GGML_OP_SQR - ggml_sqrt, // GGML_OP_SQRT - ggml_log, // GGML_OP_LOG - nullptr, // GGML_OP_SUM - nullptr, // GGML_OP_SUM_ROWS - nullptr, // GGML_OP_MEAN - nullptr, // GGML_OP_ARGMAX - nullptr, // GGML_OP_REPEAT - nullptr, // GGML_OP_REPEAT_BACK - nullptr, // GGML_OP_CONCAT - nullptr, // GGML_OP_SILU_BACK - nullptr, // GGML_OP_NORM - nullptr, // GGML_OP_RMS_NORM - nullptr, // GGML_OP_RMS_NORM_BACK - nullptr, // GGML_OP_GROUP_NORM - nullptr, // GGML_OP_MUL_MAT + nullptr, // GGML_OP_NONE + nullptr, // GGML_OP_DUP + nullptr, // GGML_OP_ADD + nullptr, // GGML_OP_ADD1 + nullptr, // GGML_OP_ACC + nullptr, // GGML_OP_SUB + nullptr, // GGML_OP_MUL + nullptr, // GGML_OP_DIV + nullptr, // GGML_OP_SQR + ggml_sqrt, // GGML_OP_SQRT + ggml_log, // GGML_OP_LOG + nullptr, // 
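
init_tensor_uniform above fills each tensor with uniform random values before copying or quantizing them into the backend buffer. The following is a minimal, ggml-independent sketch of that kind of initialization using <random>; the function name and buffer type are illustrative only:

    #include <random>
    #include <vector>

    // fill a flat float buffer with values drawn uniformly from [min, max)
    static void fill_uniform(std::vector<float> &data, float min = -1.0f, float max = 1.0f) {
        static std::mt19937 rng(1234); // fixed seed keeps test runs reproducible
        std::uniform_real_distribution<float> dist(min, max);
        for (auto &v : data) {
            v = dist(rng);
        }
    }
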
GGML_OP_SUM + nullptr, // GGML_OP_SUM_ROWS + nullptr, // GGML_OP_MEAN + nullptr, // GGML_OP_ARGMAX + nullptr, // GGML_OP_REPEAT + nullptr, // GGML_OP_REPEAT_BACK + nullptr, // GGML_OP_CONCAT + nullptr, // GGML_OP_SILU_BACK + nullptr, // GGML_OP_NORM + nullptr, // GGML_OP_RMS_NORM + nullptr, // GGML_OP_RMS_NORM_BACK + nullptr, // GGML_OP_GROUP_NORM + nullptr, // GGML_OP_MUL_MAT }; static constexpr const ggml_op_binary_t kBinaryOps[] = { - nullptr, // GGML_OP_NONE - nullptr, // GGML_OP_DUP - ggml_add, // GGML_OP_ADD - nullptr, // GGML_OP_ADD1 - nullptr, // GGML_OP_ACC - ggml_sub, // GGML_OP_SUB - ggml_mul, // GGML_OP_MUL - ggml_div, // GGML_OP_DIV - nullptr, // GGML_OP_SQR - nullptr, // GGML_OP_SQRT - nullptr, // GGML_OP_LOG - nullptr, // GGML_OP_SUM - nullptr, // GGML_OP_SUM_ROWS - nullptr, // GGML_OP_MEAN - nullptr, // GGML_OP_ARGMAX - nullptr, // GGML_OP_REPEAT - nullptr, // GGML_OP_REPEAT_BACK - nullptr, // GGML_OP_CONCAT - nullptr, // GGML_OP_SILU_BACK - nullptr, // GGML_OP_NORM - nullptr, // GGML_OP_RMS_NORM - nullptr, // GGML_OP_RMS_NORM_BACK - nullptr, // GGML_OP_GROUP_NORM - ggml_mul_mat, // GGML_OP_MUL_MAT + nullptr, // GGML_OP_NONE + nullptr, // GGML_OP_DUP + ggml_add, // GGML_OP_ADD + nullptr, // GGML_OP_ADD1 + nullptr, // GGML_OP_ACC + ggml_sub, // GGML_OP_SUB + ggml_mul, // GGML_OP_MUL + ggml_div, // GGML_OP_DIV + nullptr, // GGML_OP_SQR + nullptr, // GGML_OP_SQRT + nullptr, // GGML_OP_LOG + nullptr, // GGML_OP_SUM + nullptr, // GGML_OP_SUM_ROWS + nullptr, // GGML_OP_MEAN + nullptr, // GGML_OP_ARGMAX + nullptr, // GGML_OP_REPEAT + nullptr, // GGML_OP_REPEAT_BACK + nullptr, // GGML_OP_CONCAT + nullptr, // GGML_OP_SILU_BACK + nullptr, // GGML_OP_NORM + nullptr, // GGML_OP_RMS_NORM + nullptr, // GGML_OP_RMS_NORM_BACK + nullptr, // GGML_OP_GROUP_NORM + ggml_mul_mat, // GGML_OP_MUL_MAT }; static_assert(kBinaryOps[GGML_OP_MUL_MAT] == ggml_mul_mat, "ggml_mul_mat at wrong index, check kBinaryOps"); static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) { - int64_t n_begin_time = 0LL; - int64_t n_end_time = 0LL; - int64_t n_duration = 0LL; - size_t ctx_size = 0; - int sizey = 4; - int sizex = 4; - - struct ggml_context * ctx = nullptr; - struct ggml_cgraph * gf = nullptr; - struct ggml_tensor * src0 = nullptr; - struct ggml_tensor * src1 = nullptr; - struct ggml_tensor * dst = nullptr; - ggml_backend_t backend = nullptr; - ggml_backend_buffer_t buffer= nullptr; - - ggml_type qtype = GGML_TYPE_I8; - qtype = GGML_TYPE_F16; - qtype = GGML_TYPE_Q8_0; - qtype = GGML_TYPE_F32; + int64_t n_begin_time = 0LL; + int64_t n_end_time = 0LL; + int64_t n_duration = 0LL; + size_t ctx_size = 0; + int sizey = 4; + int sizex = 4; + + struct ggml_context *ctx = nullptr; + struct ggml_cgraph *gf = nullptr; + struct ggml_tensor *src0 = nullptr; + struct ggml_tensor *src1 = nullptr; + struct ggml_tensor *dst = nullptr; + ggml_backend_t backend = nullptr; + ggml_backend_buffer_t buffer = nullptr; + + ggml_type qtype = GGML_TYPE_I8; + qtype = GGML_TYPE_F16; + qtype = GGML_TYPE_Q8_0; + qtype = GGML_TYPE_F32; std::vector work_buffer; QNN_LOG_DEBUG("enter qnn_ggml_op\n"); - QNN_LOG_DEBUG("ggml op:%d(%s)\n", n_ggml_op_type, ggml_op_name((enum ggml_op) n_ggml_op_type)); + QNN_LOG_DEBUG("ggml op:%d(%s)\n", n_ggml_op_type, ggml_op_name((enum ggml_op)n_ggml_op_type)); n_begin_time = ggml_time_us(); ctx_size += 1024 * 1024 * 32; - QNN_LOG_DEBUG("Allocating Memory of size %zi bytes, %zi MB\n", ctx_size, - (ctx_size / 1024 / 1024)); + QNN_LOG_DEBUG("Allocating Memory of size %zi bytes, %zi MB\n", 
ctx_size, (ctx_size / 1024 / 1024)); - struct ggml_init_params params = { - /*.mem_size =*/ ctx_size, - /*.mem_buffer =*/ NULL, - /* no_alloc =*/ 0 - }; + struct ggml_init_params params = { /*.mem_size =*/ctx_size, + /*.mem_buffer =*/NULL, + /* no_alloc =*/0 }; if (n_backend_type != QNN_BACKEND_GGML) { params.no_alloc = true; @@ -470,8 +452,7 @@ static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) { } else if (binary_op) { dst = binary_op(ctx, src0, src1); } else { - QNN_LOG_WARN("ggml op %d(%s) not supported", n_ggml_op_type, - ggml_op_name((enum ggml_op) n_ggml_op_type)); + QNN_LOG_WARN("ggml op %d(%s) not supported", n_ggml_op_type, ggml_op_name((enum ggml_op)n_ggml_op_type)); ggml_free(ctx); ggml_backend_free(backend); return 3; @@ -504,17 +485,17 @@ static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) { TENSOR_DUMP(src1); TENSOR_DUMP(dst); } else { - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src0->name, - src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 + ", nb = (%5zi, %5zi, %5zi)\n", + src0->name, src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], src0->nb[0], src0->nb[1], src0->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src1->name, - src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 + ", nb = (%5zi, %5zi, %5zi)\n", + src1->name, src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], src1->nb[0], src1->nb[1], src1->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - dst->name, - dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 + ", nb = (%5zi, %5zi, %5zi)\n", + dst->name, dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], dst->nb[1], dst->nb[2]); } @@ -524,26 +505,22 @@ static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) { n_end_time = ggml_time_us(); n_duration = (n_end_time - n_begin_time) / 1000; - QNN_LOG_DEBUG("duration of ut GGML_OP_%s using QNN backend %s: %lld milliseconds\n", ggml_op_name((enum ggml_op)n_ggml_op_type), get_qnn_backend_name(n_backend_type), n_duration); + QNN_LOG_DEBUG("duration of ut GGML_OP_%s using QNN backend %s: %lld milliseconds\n", + ggml_op_name((enum ggml_op)n_ggml_op_type), get_qnn_backend_name(n_backend_type), n_duration); return 0; } #define DEFINE_OP(op) { #op, op } static const std::unordered_map kMapStringToGGMLOp = { - DEFINE_OP(GGML_OP_ADD), - DEFINE_OP(GGML_OP_SUB), - DEFINE_OP(GGML_OP_MUL), - DEFINE_OP(GGML_OP_DIV), - DEFINE_OP(GGML_OP_SQRT), - DEFINE_OP(GGML_OP_MUL_MAT), - DEFINE_OP(GGML_OP_LOG), + DEFINE_OP(GGML_OP_ADD), DEFINE_OP(GGML_OP_SUB), DEFINE_OP(GGML_OP_MUL), DEFINE_OP(GGML_OP_DIV), + DEFINE_OP(GGML_OP_SQRT), DEFINE_OP(GGML_OP_MUL_MAT), DEFINE_OP(GGML_OP_LOG), }; -int main(int argc, char * argv[]) { - int num_threads = 4; - int n_backend_type = QNN_BACKEND_CPU; - int n_ggml_op_type = GGML_OP_ADD; +int main(int argc, char *argv[]) { + int num_threads = 4; + int n_backend_type = QNN_BACKEND_CPU; + 
int n_ggml_op_type = GGML_OP_ADD; for (int i = 1; i < argc; i++) { if (0 == strcmp(argv[i], "-t")) { @@ -561,7 +538,7 @@ int main(int argc, char * argv[]) { if (i + 1 < argc) { int backend = atoi(argv[i + 1]); if (backend <= QNN_BACKEND_GGML) - n_backend_type = backend; + n_backend_type = backend; else { show_usage(); return 1; @@ -575,9 +552,9 @@ int main(int argc, char * argv[]) { } QNN_LOG_DEBUG("enter qnn_ggml_op\n"); - QNN_LOG_DEBUG("backend %d, ggml op:%d(%s)", n_backend_type, n_ggml_op_type, ggml_op_name((enum ggml_op) n_ggml_op_type)); + QNN_LOG_DEBUG("backend %d, ggml op:%d(%s)", n_backend_type, n_ggml_op_type, + ggml_op_name((enum ggml_op)n_ggml_op_type)); qnn_op_ut(num_threads, n_backend_type, n_ggml_op_type); - return 0; } From cd5a7331f7cd1d79ab482c2a454e2ef963fff0ee Mon Sep 17 00:00:00 2001 From: hongruichen Date: Mon, 15 Jul 2024 10:50:33 +0800 Subject: [PATCH 069/143] add cpu backend as cross reference --- tests/ggml-qnn/ggml-qnn-ut.cpp | 39 ++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 14 deletions(-) diff --git a/tests/ggml-qnn/ggml-qnn-ut.cpp b/tests/ggml-qnn/ggml-qnn-ut.cpp index fefb262445dc0..a87781e52b070 100644 --- a/tests/ggml-qnn/ggml-qnn-ut.cpp +++ b/tests/ggml-qnn/ggml-qnn-ut.cpp @@ -244,7 +244,7 @@ static uint32_t get_tensor_rank(const ggml_tensor *tensor) { static uint32_t get_tensor_data_size(const ggml_tensor *tensor) { size_t data_size = ggml_row_size(tensor->type, tensor->ne[0]); size_t n_dims = get_tensor_rank(tensor); - for (int i = 1; i < n_dims; i++) { + for (size_t i = 1; i < n_dims; i++) { data_size *= tensor->ne[i]; } @@ -377,7 +377,8 @@ static constexpr const ggml_op_binary_t kBinaryOps[] = { static_assert(kBinaryOps[GGML_OP_MUL_MAT] == ggml_mul_mat, "ggml_mul_mat at wrong index, check kBinaryOps"); -static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) { +static void qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type, ggml_type qtype, + std::vector &results) { int64_t n_begin_time = 0LL; int64_t n_end_time = 0LL; int64_t n_duration = 0LL; @@ -393,11 +394,6 @@ static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) { ggml_backend_t backend = nullptr; ggml_backend_buffer_t buffer = nullptr; - ggml_type qtype = GGML_TYPE_I8; - qtype = GGML_TYPE_F16; - qtype = GGML_TYPE_Q8_0; - qtype = GGML_TYPE_F32; - std::vector work_buffer; QNN_LOG_DEBUG("enter qnn_ggml_op\n"); QNN_LOG_DEBUG("ggml op:%d(%s)\n", n_ggml_op_type, ggml_op_name((enum ggml_op)n_ggml_op_type)); @@ -416,14 +412,14 @@ static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) { backend = ggml_backend_qnn_init(n_backend_type, "/data/local/tmp/"); if (nullptr == backend) { QNN_LOG_ERROR("create qnn backend %d(%s) failed\n", n_backend_type, get_qnn_backend_name(n_backend_type)); - return 1; + return; } } ctx = ggml_init(params); if (!ctx) { QNN_LOG_ERROR("%s: ggml_init() failed\n"); - return 2; + return; } QNN_LOG_DEBUG("creating new tensors\n"); @@ -455,7 +451,7 @@ static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) { QNN_LOG_WARN("ggml op %d(%s) not supported", n_ggml_op_type, ggml_op_name((enum ggml_op)n_ggml_op_type)); ggml_free(ctx); ggml_backend_free(backend); - return 3; + return; } ggml_set_output(dst); @@ -466,7 +462,7 @@ static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) { QNN_LOG_ERROR("%s: failed to allocate backend buffer\n", __func__); ggml_free(ctx); ggml_backend_free(backend); - return 4; + return; } } #endif @@ -484,6 
+480,8 @@ static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) { TENSOR_DUMP(src0); TENSOR_DUMP(src1); TENSOR_DUMP(dst); + results.resize(ggml_nbytes(dst)); + memcpy(results.data(), ggml_get_data(dst), ggml_nbytes(dst)); } else { QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", @@ -507,7 +505,6 @@ static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) { n_duration = (n_end_time - n_begin_time) / 1000; QNN_LOG_DEBUG("duration of ut GGML_OP_%s using QNN backend %s: %lld milliseconds\n", ggml_op_name((enum ggml_op)n_ggml_op_type), get_qnn_backend_name(n_backend_type), n_duration); - return 0; } #define DEFINE_OP(op) { #op, op } @@ -517,6 +514,10 @@ static const std::unordered_map kMapStringToGGMLOp = { DEFINE_OP(GGML_OP_SQRT), DEFINE_OP(GGML_OP_MUL_MAT), DEFINE_OP(GGML_OP_LOG), }; +#define CONSOLE_RED "\033[31m" +#define CONSOLE_GREEN "\033[32m" +#define CONSOLE_RESET "\033[0m" + int main(int argc, char *argv[]) { int num_threads = 4; int n_backend_type = QNN_BACKEND_CPU; @@ -554,7 +555,17 @@ int main(int argc, char *argv[]) { QNN_LOG_DEBUG("enter qnn_ggml_op\n"); QNN_LOG_DEBUG("backend %d, ggml op:%d(%s)", n_backend_type, n_ggml_op_type, ggml_op_name((enum ggml_op)n_ggml_op_type)); - qnn_op_ut(num_threads, n_backend_type, n_ggml_op_type); - return 0; + std::vector results; + qnn_op_ut(num_threads, n_backend_type, n_ggml_op_type, GGML_TYPE_F32, results); + std::vector cpu_results; + qnn_op_ut(num_threads, QNN_BACKEND_GGML, n_ggml_op_type, GGML_TYPE_F32, cpu_results); + + if (results == cpu_results) { + QNN_LOG_INFO(CONSOLE_GREEN "[Result] results equal!" CONSOLE_RESET); + return 0; + } else { + QNN_LOG_ERROR(CONSOLE_RED "[Result] results not equal!" CONSOLE_RESET); + return 1; + } } From f32327e2b2182b170013f123b5469cab6d731d22 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Mon, 15 Jul 2024 11:19:01 +0800 Subject: [PATCH 070/143] remove multiply declearation of log in unit test --- ggml/src/ggml-qnn/logger.cpp | 2 - ggml/src/ggml-qnn/logger.hpp | 2 + tests/ggml-qnn/CMakeLists.txt | 4 ++ tests/ggml-qnn/ggml-qnn-ut.cpp | 71 ++++++++++------------------------ 4 files changed, 26 insertions(+), 53 deletions(-) diff --git a/ggml/src/ggml-qnn/logger.cpp b/ggml/src/ggml-qnn/logger.cpp index 5a1ad13ba40ce..8b74b90edf476 100644 --- a/ggml/src/ggml-qnn/logger.cpp +++ b/ggml/src/ggml-qnn/logger.cpp @@ -9,8 +9,6 @@ #include #endif -#define QNN_LOGBUF_LEN 4096 - void qnn::internal_log(ggml_log_level level, const char *file, const char *func, int line, const char *format, ...) 
{ static std::mutex qnn_internal_log_mutex; static char s_qnn_internal_log_buf[QNN_LOGBUF_LEN]; diff --git a/ggml/src/ggml-qnn/logger.hpp b/ggml/src/ggml-qnn/logger.hpp index f81a1814e9756..b4bab0c006691 100644 --- a/ggml/src/ggml-qnn/logger.hpp +++ b/ggml/src/ggml-qnn/logger.hpp @@ -9,6 +9,8 @@ #include "QnnTypes.h" #include "System/QnnSystemInterface.h" +#define QNN_LOGBUF_LEN 4096 + namespace qnn { void internal_log(ggml_log_level level, const char *file, const char *func, int line, const char *format, ...); diff --git a/tests/ggml-qnn/CMakeLists.txt b/tests/ggml-qnn/CMakeLists.txt index b4f1bd6c07482..e72cc13e78ce4 100644 --- a/tests/ggml-qnn/CMakeLists.txt +++ b/tests/ggml-qnn/CMakeLists.txt @@ -61,3 +61,7 @@ link_libraries(${LOG_LIB} android) add_executable(${TARGET_NAME} ${SOURCE_FILES} ) + +target_include_directories(${TARGET_NAME} PRIVATE + ../../ggml/src/ggml-qnn/ +) diff --git a/tests/ggml-qnn/ggml-qnn-ut.cpp b/tests/ggml-qnn/ggml-qnn-ut.cpp index a87781e52b070..2fea53e620dd8 100644 --- a/tests/ggml-qnn/ggml-qnn-ut.cpp +++ b/tests/ggml-qnn/ggml-qnn-ut.cpp @@ -43,40 +43,7 @@ #include "ggml-backend.h" #include "ggml-qnn.h" -#define GGML_QNN_DEBUG 1 -#define GGML_QNN_LOGBUF_LEN 4096 - -#define QNN_LOG_ERROR(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) -#define QNN_LOG_WARN(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) -#define QNN_LOG_INFO(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) - -#if GGML_QNN_DEBUG -#define QNN_LOG_DEBUG(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) -#else -#define QNN_LOG_DEBUG(...) -#endif - -static void tensor_dump(const ggml_tensor *tensor, const char *name); - -#define TENSOR_DUMP(tensor) tensor_dump(tensor, #tensor) - -static void ggml_qnn_log_internal(ggml_log_level level, const char *file, const char *func, int line, - const char *format, ...) 
{ - static std::mutex ggml_qnn_log_internal_mutex; - static char s_ggml_qnn_log_internal_buf[GGML_QNN_LOGBUF_LEN]; - - { - std::lock_guard lock(ggml_qnn_log_internal_mutex); - va_list args; - va_start(args, format); - int len_prefix = snprintf(s_ggml_qnn_log_internal_buf, GGML_QNN_LOGBUF_LEN, "[%s, %d]: ", func, line); - int len = vsnprintf(s_ggml_qnn_log_internal_buf + len_prefix, GGML_QNN_LOGBUF_LEN - len_prefix, format, args); - if (len < (GGML_QNN_LOGBUF_LEN - len_prefix)) { - printf("%s\n", s_ggml_qnn_log_internal_buf); - } - va_end(args); - } -} +#include "logger.hpp" static const char *get_qnn_backend_name(int n_backend_type) { switch (n_backend_type) { @@ -86,7 +53,7 @@ static const char *get_qnn_backend_name(int n_backend_type) { return "QNN-GPU"; case QNN_BACKEND_NPU: return "QNN-NPU"; - case 3: + case QNN_BACKEND_GGML: return "ggml"; default: return "unknown"; @@ -137,11 +104,13 @@ static inline float ggml_compute_fp16_to_fp32(uint16_t h) { #define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) +#define TENSOR_DUMP(tensor) tensor_dump(tensor, #tensor) + static void tensor_dump(const ggml_tensor *tensor, const char *name) { - QNN_LOG_DEBUG("dump ggml tensor %s(%s): type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 - ", nb = (%5zi, %5zi, %5zi)\n", - name, tensor->name, tensor->type, ggml_type_name(tensor->type), tensor->ne[0], tensor->ne[1], - tensor->ne[2], tensor->nb[0], tensor->nb[1], tensor->nb[2]); + QNN_LOG_INFO("dump ggml tensor %s(%s): type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 + ", nb = (%5zi, %5zi, %5zi)\n", + name, tensor->name, tensor->type, ggml_type_name(tensor->type), tensor->ne[0], tensor->ne[1], + tensor->ne[2], tensor->nb[0], tensor->nb[1], tensor->nb[2]); float value = 0; std::ostringstream tmposs; @@ -162,8 +131,8 @@ static void tensor_dump(const ggml_tensor *tensor, const char *name) { } } } - if (strlen(tmposs.str().c_str()) <= (GGML_QNN_LOGBUF_LEN - 96)) { - QNN_LOG_DEBUG("\n%s\n", tmposs.str().c_str()); + if (strlen(tmposs.str().c_str()) <= (QNN_LOGBUF_LEN - 96)) { + QNN_LOG_INFO("\n%s\n", tmposs.str().c_str()); tmposs.clear(); tmposs.str(""); } @@ -181,8 +150,8 @@ static void tensor_dump(const ggml_tensor *tensor, const char *name) { } } } - if (strlen(tmposs.str().c_str()) <= (GGML_QNN_LOGBUF_LEN - 96)) { - QNN_LOG_DEBUG("\n%s\n", tmposs.str().c_str()); + if (strlen(tmposs.str().c_str()) <= (QNN_LOGBUF_LEN - 96)) { + QNN_LOG_INFO("\n%s\n", tmposs.str().c_str()); tmposs.clear(); tmposs.str(""); } @@ -203,8 +172,8 @@ static void tensor_dump(const ggml_tensor *tensor, const char *name) { } } } - if (strlen(tmposs.str().c_str()) <= (GGML_QNN_LOGBUF_LEN - 96)) { - QNN_LOG_DEBUG("\n%s\n", tmposs.str().c_str()); + if (strlen(tmposs.str().c_str()) <= (QNN_LOGBUF_LEN - 96)) { + QNN_LOG_INFO("\n%s\n", tmposs.str().c_str()); tmposs.clear(); tmposs.str(""); } @@ -223,8 +192,8 @@ static void tensor_dump(const ggml_tensor *tensor, const char *name) { } tmposs << "\n"; } - if (strlen(tmposs.str().c_str()) <= (GGML_QNN_LOGBUF_LEN - 96)) { - QNN_LOG_DEBUG("\n%s\n", tmposs.str().c_str()); + if (strlen(tmposs.str().c_str()) <= (QNN_LOGBUF_LEN - 96)) { + QNN_LOG_INFO("\n%s\n", tmposs.str().c_str()); tmposs.clear(); tmposs.str(""); } @@ -480,8 +449,6 @@ static void qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type, g TENSOR_DUMP(src0); TENSOR_DUMP(src1); TENSOR_DUMP(dst); - results.resize(ggml_nbytes(dst)); - memcpy(results.data(), ggml_get_data(dst), ggml_nbytes(dst)); } else { QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" 
PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", @@ -497,6 +464,8 @@ static void qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type, g dst->nb[1], dst->nb[2]); } + results.resize(ggml_nbytes(dst)); + memcpy(results.data(), ggml_get_data(dst), ggml_nbytes(dst)); ggml_free(ctx); ggml_backend_buffer_free(buffer); ggml_backend_free(backend); @@ -562,10 +531,10 @@ int main(int argc, char *argv[]) { qnn_op_ut(num_threads, QNN_BACKEND_GGML, n_ggml_op_type, GGML_TYPE_F32, cpu_results); if (results == cpu_results) { - QNN_LOG_INFO(CONSOLE_GREEN "[Result] results equal!" CONSOLE_RESET); + QNN_LOG_INFO(CONSOLE_GREEN "[Success] results equal to CPU backend!" CONSOLE_RESET); return 0; } else { - QNN_LOG_ERROR(CONSOLE_RED "[Result] results not equal!" CONSOLE_RESET); + QNN_LOG_ERROR(CONSOLE_RED "[Failed] results mismatch with CPU backend!" CONSOLE_RESET); return 1; } } From ff601abc1ce99cef1de1787add056b406f09a544 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Tue, 16 Jul 2024 00:05:40 +0800 Subject: [PATCH 071/143] add todo --- tests/ggml-qnn/ggml-qnn-ut.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/ggml-qnn/ggml-qnn-ut.cpp b/tests/ggml-qnn/ggml-qnn-ut.cpp index 2fea53e620dd8..71cb86a71bdf1 100644 --- a/tests/ggml-qnn/ggml-qnn-ut.cpp +++ b/tests/ggml-qnn/ggml-qnn-ut.cpp @@ -211,6 +211,7 @@ static uint32_t get_tensor_rank(const ggml_tensor *tensor) { } static uint32_t get_tensor_data_size(const ggml_tensor *tensor) { +#if ENABLE_QNNSDK_LOG size_t data_size = ggml_row_size(tensor->type, tensor->ne[0]); size_t n_dims = get_tensor_rank(tensor); for (size_t i = 1; i < n_dims; i++) { @@ -219,6 +220,7 @@ static uint32_t get_tensor_data_size(const ggml_tensor *tensor) { QNN_LOG_DEBUG("get_tensor_data_size %d", data_size); QNN_LOG_DEBUG("ggml_nbytes(tensor) %d", ggml_nbytes(tensor)); +#endif return ggml_nbytes(tensor); } @@ -530,6 +532,8 @@ int main(int argc, char *argv[]) { std::vector cpu_results; qnn_op_ut(num_threads, QNN_BACKEND_GGML, n_ggml_op_type, GGML_TYPE_F32, cpu_results); + // TODO: theoretically, the results should be the same, but the results may be different due to the different hardware + // a better way to compare the results is to compare the floating point numbers with allowed error if (results == cpu_results) { QNN_LOG_INFO(CONSOLE_GREEN "[Success] results equal to CPU backend!" 
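
The TODO added above points at the limitation of the byte-exact results == cpu_results check: different backends can legitimately produce slightly different floating point results. A sketch of the tolerance-based comparison it suggests, not part of the patch and with illustrative names and tolerances:

    #include <cmath>
    #include <cstddef>
    #include <vector>

    // element-wise comparison with a combined absolute/relative tolerance
    static bool all_close(const std::vector<float> &a, const std::vector<float> &b,
                          float abs_tol = 1e-4f, float rel_tol = 1e-3f) {
        if (a.size() != b.size()) return false;
        for (size_t i = 0; i < a.size(); i++) {
            float diff    = std::fabs(a[i] - b[i]);
            float allowed = abs_tol + rel_tol * std::fabs(b[i]);
            if (diff > allowed) return false;
        }
        return true;
    }

Since the test stores the outputs as raw bytes copied out of the dst tensor, the buffers would first have to be viewed as float (for GGML_TYPE_F32) before applying such a comparison.
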
CONSOLE_RESET); return 0; From 0301b500cd2ce15935b4c3139427e72872f231cb Mon Sep 17 00:00:00 2001 From: hongruichen Date: Tue, 16 Jul 2024 22:52:16 +0800 Subject: [PATCH 072/143] refactoring: prevent leak the QNN_INTERFACE_VER_TYPE and QNN_SYSTEM_INTERFACE_VER_TYPE outside of qnn.hpp --- ggml/include/ggml-qnn.h | 26 +- ggml/src/ggml-qnn.cpp | 39 +-- ggml/src/ggml-qnn/backend-ops.cpp | 2 +- ggml/src/ggml-qnn/backend.hpp | 15 +- ggml/src/ggml-qnn/graph.hpp | 22 +- ggml/src/ggml-qnn/qnn-lib.cpp | 35 +++ ggml/src/ggml-qnn/{qnn.hpp => qnn-lib.hpp} | 304 +++++++++------------ ggml/src/ggml-qnn/tensor.hpp | 2 +- tests/ggml-qnn/CMakeLists.txt | 1 + 9 files changed, 212 insertions(+), 234 deletions(-) create mode 100644 ggml/src/ggml-qnn/qnn-lib.cpp rename ggml/src/ggml-qnn/{qnn.hpp => qnn-lib.hpp} (81%) diff --git a/ggml/include/ggml-qnn.h b/ggml/include/ggml-qnn.h index 60aaf22179647..026c6ddf06672 100644 --- a/ggml/include/ggml-qnn.h +++ b/ggml/include/ggml-qnn.h @@ -1,8 +1,9 @@ #pragma once -#include "ggml-backend.h" #include "ggml.h" +#include "ggml-backend.h" + #ifdef __cplusplus extern "C" { #endif @@ -10,11 +11,11 @@ extern "C" { #define GGML_QNN_MAX_DEVICES 3 enum QNNBackend { - QNN_BACKEND_CPU, - QNN_BACKEND_GPU, - QNN_BACKEND_NPU, - QNN_BACKEND_GGML, //"fake" QNN backend, used for compare performance between - // QNN and original GGML + QNN_BACKEND_CPU = 0, + QNN_BACKEND_GPU, + QNN_BACKEND_NPU, + QNN_BACKEND_GGML, //"fake" QNN backend, used for compare performance between + // QNN and original GGML }; GGML_API int ggml_backend_qnn_reg_devices(void); @@ -27,22 +28,17 @@ GGML_API int ggml_backend_qnn_reg_devices(void); * Android or specified in JNI layer * @return */ -GGML_API ggml_backend_t ggml_backend_qnn_init(size_t dev_num, - const char* qnn_lib_path); +GGML_API ggml_backend_t ggml_backend_qnn_init(size_t dev_num, const char *qnn_lib_path); GGML_API bool ggml_backend_is_qnn(ggml_backend_t backend); -GGML_API void ggml_backend_qnn_set_n_threads(ggml_backend_t backend, - int thread_counts); +GGML_API void ggml_backend_qnn_set_n_threads(ggml_backend_t backend, int thread_counts); GGML_API int ggml_backend_qnn_get_device_count(void); -GGML_API void ggml_backend_qnn_get_device_description(size_t dev_num, - char* description, - size_t description_size); +GGML_API void ggml_backend_qnn_get_device_description(size_t dev_num, char *description, size_t description_size); -GGML_API GGML_CALL ggml_backend_buffer_type_t -ggml_backend_qnn_buffer_type(size_t dev_num); +GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t dev_num); #ifdef __cplusplus } diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index f1de6b18591c6..46f7e64bcdedb 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -78,32 +78,9 @@ static struct qnn::qcom_socinfo g_qnn_soc_info_table[] = { // HMX(Hexagon Matrix eXtensions)/HTA(Hexagon Tensor Accelerator) static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { - [QNN_BACKEND_CPU] = { .device = 0, - .threads = 1, - .name = "qnn-cpu", - .lib = "libQnnCpu.so", - .backend = nullptr, - .raw_interface = {}, - .raw_system_interface = {}, - .socinfo = {} }, - - [QNN_BACKEND_GPU] = { .device = 1, - .threads = 1, - .name = "qnn-gpu", - .lib = "libQnnGpu.so", - .backend = nullptr, - .raw_interface = {}, - .raw_system_interface = {}, - .socinfo = {} }, - - [QNN_BACKEND_NPU] = { .device = 2, - .threads = 1, - .name = "qnn-npu", - .lib = "libQnnHtp.so", - .backend = nullptr, - .raw_interface = {}, - .raw_system_interface = 
{}, - .socinfo = {} }, + ggml_backend_qnn_context(QNN_BACKEND_CPU, 1, "qnn-cpu", "libQnnCpu.so"), /* QNN_BACKEND_CPU */ + ggml_backend_qnn_context(QNN_BACKEND_GPU, 1, "qnn-gpu", "libQnnGpu.so"), /* QNN_BACKEND_GPU */ + ggml_backend_qnn_context(QNN_BACKEND_NPU, 1, "qnn-npu", "libQnnHtp.so"), /* QNN_BACKEND_NPU */ }; struct ggml_backend_qnn_buffer_context { @@ -549,17 +526,17 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char *qnn_lib_path) { return nullptr; } auto qnn_interface = instance->get_qnn_interface(); - if (!qnn_interface.is_loaded()) { + if (!qnn_interface) { QNN_LOG_WARN("qnn subsystem failure\n"); return nullptr; } std::string device_name = qnn::get_backend_name(device); QNN_LOG_INFO("qnn device name %s", device_name.c_str()); - g_qnn_mgr[device].instance = instance; - g_qnn_mgr[device].raw_interface = instance->get_qnn_raw_interface(); - g_qnn_mgr[device].raw_system_interface = instance->get_qnn_raw_system_interface(); - g_qnn_mgr[device].socinfo = instance->get_soc_info(); + auto &qnn_device = g_qnn_mgr[device]; + qnn_device.instance = instance; + qnn_device.qnn_interface = qnn_interface; + qnn_device.socinfo = instance->get_soc_info(); ggml_backend_t qnn_backend = new ggml_backend{ /* .guid = */ ggml_backend_qnn_guid(), /* .iface = */ ggml_backend_qnn_interface, diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 711f707531228..e1a8c4da5ed40 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -176,7 +176,7 @@ qnn::ggml_qnn_graph<_InputSize, _OutputSize> *get_qnn_graph_from_cache( } auto graph = std::make_unique(graph_name, (QNNBackend)(ctx->device), ctx->instance->get_qnn_context_handle(), - ctx->raw_interface, ctx->socinfo.vtcm_size_in_mb); + ctx->qnn_interface, ctx->socinfo.vtcm_size_in_mb); if (!graph->is_valid()) { return nullptr; diff --git a/ggml/src/ggml-qnn/backend.hpp b/ggml/src/ggml-qnn/backend.hpp index 0ec927779cc31..32f3c6cd445f6 100644 --- a/ggml/src/ggml-qnn/backend.hpp +++ b/ggml/src/ggml-qnn/backend.hpp @@ -9,7 +9,7 @@ #include "ggml-backend.h" #include "graph.hpp" -#include "qnn.hpp" +#include "qnn-lib.hpp" namespace qnn { typedef std::unordered_map> ggml_qnn_unary_graph_cache_t; @@ -21,11 +21,16 @@ struct ggml_backend_qnn_context { int threads; char name[GGML_MAX_NAME]; char lib[GGML_MAX_NAME]; + ggml_backend *backend = nullptr; + qnn::qcom_socinfo socinfo = {}; std::shared_ptr instance; - ggml_backend *backend; - QNN_INTERFACE_VER_TYPE raw_interface; - QNN_SYSTEM_INTERFACE_VER_TYPE raw_system_interface; - qnn::qcom_socinfo socinfo; + std::shared_ptr qnn_interface; qnn::ggml_qnn_unary_graph_cache_t qnn_unary_graph_cache; qnn::ggml_qnn_binary_graph_cache_t qnn_binary_graph_cache; + + explicit ggml_backend_qnn_context(int device, int threads, const char *name, const char *lib) : + device(device), threads(threads) { + strncpy(this->name, name, GGML_MAX_NAME); + strncpy(this->lib, lib, GGML_MAX_NAME); + } }; diff --git a/ggml/src/ggml-qnn/graph.hpp b/ggml/src/ggml-qnn/graph.hpp index cb04b1efda0fc..e4900906ce3e9 100644 --- a/ggml/src/ggml-qnn/graph.hpp +++ b/ggml/src/ggml-qnn/graph.hpp @@ -2,11 +2,12 @@ #pragma once #include +#include #include "ggml-qnn.h" #include "logger.hpp" -#include "qnn.hpp" +#include "qnn-lib.hpp" namespace qnn { @@ -17,7 +18,7 @@ class ggml_qnn_graph { typedef std::array output_tensor_array_t; explicit ggml_qnn_graph(const std::string &graph_name, QNNBackend device, Qnn_ContextHandle_t qnn_context, - QNN_INTERFACE_VER_TYPE qnn_interface, size_t 
vtcm_size_in_mb) : + std::shared_ptr qnn_interface, size_t vtcm_size_in_mb) : _graph_name(graph_name), _device(device), _qnn_interface(qnn_interface) { QNN_LOG_INFO("graph name %s", graph_name.c_str()); @@ -56,9 +57,9 @@ class ggml_qnn_graph { const QnnGraph_Config_t *graph_configs[] = { &graph_hvx_config, &graph_dlbc_config, &graph_vtcm_config, &graph_opt_config, nullptr }; - error = qnn_interface.graphCreate(qnn_context, graph_name.c_str(), graph_configs, &graph_handle); + error = qnn_interface->qnn_graph_create(qnn_context, graph_name.c_str(), graph_configs, &graph_handle); } else { - error = qnn_interface.graphCreate(qnn_context, graph_name.c_str(), nullptr, &graph_handle); + error = qnn_interface->qnn_graph_create(qnn_context, graph_name.c_str(), nullptr, &graph_handle); } if (error != QNN_SUCCESS) { @@ -79,7 +80,7 @@ class ggml_qnn_graph { return false; } - auto err = _qnn_interface.tensorCreateGraphTensor(_graph_handle, &tensor); + auto err = _qnn_interface->qnn_tensor_create_graph_tensor(_graph_handle, &tensor); if (err != QNN_SUCCESS) { QNN_LOG_INFO("error = %d\n", err); QNN_LOG_DEBUG("tensor%p name %s", &tensor, QNN_TENSOR_GET_NAME(tensor)); @@ -105,13 +106,13 @@ class ggml_qnn_graph { .v1 = { _graph_name.c_str(), QNN_OP_PACKAGE_NAME_QTI_AISW, op_name.c_str(), 0, qnn_params, (uint32_t)_tensor_inputs.size(), _tensor_inputs.data(), (uint32_t)_tensor_outputs.size(), _tensor_outputs.data() } }; - auto error = _qnn_interface.graphAddNode(_graph_handle, op_config); + auto error = _qnn_interface->qnn_graph_add_node(_graph_handle, op_config); if (error != QNN_SUCCESS) { QNN_LOG_ERROR("graphAddNode.error = %d\n", error); return false; } - error = _qnn_interface.graphFinalize(_graph_handle, nullptr, nullptr); + error = _qnn_interface->qnn_graph_finalize(_graph_handle, nullptr, nullptr); if (error != QNN_SUCCESS) { QNN_LOG_ERROR("graphFinalize.error = %d\n", error); return false; @@ -124,8 +125,9 @@ class ggml_qnn_graph { bool execute(const input_tensor_array_t &tensor_inputs, const output_tensor_array_t &tensor_outputs) { _tensor_inputs = tensor_inputs; _tensor_outputs = tensor_outputs; - auto error = _qnn_interface.graphExecute(_graph_handle, _tensor_inputs.data(), _tensor_inputs.size(), - _tensor_outputs.data(), _tensor_outputs.size(), nullptr, nullptr); + auto error = + _qnn_interface->qnn_graph_execute(_graph_handle, _tensor_inputs.data(), _tensor_inputs.size(), + _tensor_outputs.data(), _tensor_outputs.size(), nullptr, nullptr); if (_device == QNN_BACKEND_NPU) { if (error == QNN_COMMON_ERROR_SYSTEM_COMMUNICATION) { QNN_LOG_WARN("NPU crashed. SSR detected. 
Caused QNN graph execute error\n"); @@ -149,7 +151,7 @@ class ggml_qnn_graph { private: const std::string _graph_name; const QNNBackend _device; - const QNN_INTERFACE_VER_TYPE _qnn_interface; + std::shared_ptr _qnn_interface; Qnn_GraphHandle_t _graph_handle = nullptr; std::array _tensor_inputs; std::array _tensor_outputs; diff --git a/ggml/src/ggml-qnn/qnn-lib.cpp b/ggml/src/ggml-qnn/qnn-lib.cpp new file mode 100644 index 0000000000000..a7553c4ac2b75 --- /dev/null +++ b/ggml/src/ggml-qnn/qnn-lib.cpp @@ -0,0 +1,35 @@ + +#include "qnn-lib.hpp" + +namespace qnn { + +qnn_system_interface::qnn_system_interface(const QnnSystemInterface_t &qnn_sys_interface, dl_handler_t lib_handle) : + _qnn_sys_interface(qnn_sys_interface), _lib_handle(lib_handle) { + qnn_system_context_create(&_qnn_system_handle); + if (_qnn_system_handle) { + QNN_LOG_INFO("initialize qnn system successfully\n"); + } else { + QNN_LOG_WARN("can not create QNN system contenxt\n"); + } +} + +qnn_system_interface::~qnn_system_interface() { + if (_qnn_system_handle) { + if (qnn_system_context_free(_qnn_system_handle) != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN system context\n"); + } + } else { + QNN_LOG_WARN("system handle is null\n"); + } + + if (_lib_handle) { + int dlclose_error = dl_unload(_lib_handle); + if (dlclose_error != 0) { + QNN_LOG_WARN("failed to close QnnSystem library, error %s\n", dl_error()); + } + } else { + QNN_LOG_WARN("system lib handle is null\n"); + } +} + +} // namespace qnn diff --git a/ggml/src/ggml-qnn/qnn.hpp b/ggml/src/ggml-qnn/qnn-lib.hpp similarity index 81% rename from ggml/src/ggml-qnn/qnn.hpp rename to ggml/src/ggml-qnn/qnn-lib.hpp index 10549a6c5e413..7307c9f63e75f 100644 --- a/ggml/src/ggml-qnn/qnn.hpp +++ b/ggml/src/ggml-qnn/qnn-lib.hpp @@ -25,29 +25,64 @@ namespace qnn { +// TODO: those function should be moved to a separate file, and have separate implementation for each platform +typedef void *dl_handler_t; + +inline dl_handler_t dl_load(const std::string &lib_path) { return dlopen(lib_path.c_str(), RTLD_NOW | RTLD_LOCAL); } + +inline void *dl_sym(dl_handler_t handle, const std::string &symbol) { return dlsym(handle, symbol.c_str()); } + +inline int dl_unload(dl_handler_t handle) { return dlclose(handle); } + +inline const char *dl_error() { return dlerror(); } + // ================================================================================================= // // wrapper class of Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK // ref:https://github.com/pytorch/executorch/tree/main/backends/qualcomm // ================================================================================================= -class qnn_interface { -#define DEFINE_SHIM_FUNCTION_INTERFACE(F, pointer_name) \ - template \ - inline auto qnn_##F(Args... args) const { \ - return (_qnn_interface->QNN_INTERFACE_VER_NAME.pointer_name)(std::forward(args)...); \ - } +class qnn_system_interface { -#define DEFINE_SHIM_FUNCTION_SYS_INTERFACE(F, pointer_name) \ - template \ - inline auto qnn_##F(Args... args) const { \ - return (_qnn_sys_interface->QNN_SYSTEM_INTERFACE_VER_NAME.pointer_name)(std::forward(args)...); \ +#define DEFINE_SHIM_FUNCTION_SYS_INTERFACE(F, pointer_name) \ + template \ + inline auto qnn_##F(Args... 
args) const { \ + return (_qnn_sys_interface.QNN_SYSTEM_INTERFACE_VER_NAME.pointer_name)(std::forward(args)...); \ } - friend class qnn_instance; +public: + qnn_system_interface(const QnnSystemInterface_t &qnn_sys_interface, dl_handler_t lib_handle); + ~qnn_system_interface(); + bool is_valid() const { return _qnn_system_handle != nullptr; } + + // QnnSystem + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_create, systemContextCreate); + + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_get_binary_info, systemContextGetBinaryInfo); + + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_free, systemContextFree); + +private: + qnn_system_interface(const qnn_system_interface &) = delete; + void operator=(const qnn_system_interface &) = delete; + qnn_system_interface(qnn_system_interface &&) = delete; + void operator=(qnn_system_interface &&) = delete; + + const QnnSystemInterface_t _qnn_sys_interface = {}; + dl_handler_t _lib_handle = nullptr; + QnnSystemContext_Handle_t _qnn_system_handle = nullptr; +}; + +class qnn_interface { + +#define DEFINE_SHIM_FUNCTION_INTERFACE(F, pointer_name) \ + template \ + inline auto qnn_##F(Args... args) const { \ + return (_qnn_interface.QNN_INTERFACE_VER_NAME.pointer_name)(std::forward(args)...); \ + } public: - qnn_interface() = default; + qnn_interface(const QnnInterface_t &qnn_interface) : _qnn_interface(qnn_interface) {} // QnnBackend DEFINE_SHIM_FUNCTION_INTERFACE(backend_create, backendCreate); @@ -59,7 +94,6 @@ class qnn_interface { DEFINE_SHIM_FUNCTION_INTERFACE(backend_validate_op_config, backendValidateOpConfig); DEFINE_SHIM_FUNCTION_INTERFACE(backend_get_api_version, backendGetApiVersion); - // QnnDevice DEFINE_SHIM_FUNCTION_INTERFACE(device_create, deviceCreate); @@ -69,6 +103,8 @@ class qnn_interface { DEFINE_SHIM_FUNCTION_INTERFACE(device_get_platform_info, deviceGetPlatformInfo); + DEFINE_SHIM_FUNCTION_INTERFACE(device_free_platform_info, deviceFreePlatformInfo); + DEFINE_SHIM_FUNCTION_INTERFACE(device_get_info, deviceGetInfo); // QnnContext @@ -124,27 +160,15 @@ class qnn_interface { DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_graph_tensor, tensorCreateGraphTensor); - // QnnSystem - DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_create, systemContextCreate); - - DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_get_binary_info, systemContextGetBinaryInfo); - - DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_free, systemContextFree); - - void set_qnn_interface(const QnnInterface_t *qnn_interface) { _qnn_interface = qnn_interface; } - - void set_qnn_system_interface(const QnnSystemInterface_t *qnn_sys_interface) { - _qnn_sys_interface = qnn_sys_interface; - } - - uint32_t get_backend_id() const { return _qnn_interface->backendId; } - - bool is_loaded() const { return ((_qnn_sys_interface != nullptr) && (_qnn_interface != nullptr)); } + uint32_t get_backend_id() const { return _qnn_interface.backendId; } private: - const QnnInterface_t *_qnn_interface = nullptr; + qnn_interface(const qnn_interface &) = delete; + void operator=(const qnn_interface &) = delete; + qnn_interface(qnn_interface &&) = delete; + void operator=(qnn_interface &&) = delete; - const QnnSystemInterface_t *_qnn_sys_interface = nullptr; + const QnnInterface_t _qnn_interface = {}; }; class qnn_instance { @@ -161,8 +185,7 @@ class qnn_instance { QNN_LOG_DEBUG("enter qni_init\n"); std::lock_guard lock(_init_mutex); - - if (0 != load_system()) { + if (load_system() != 0) { QNN_LOG_WARN("can not load QNN system lib, pls check why?\n"); return 1; } else { @@ -170,16 
+193,16 @@ class qnn_instance { } std::string backend_lib_path = _lib_path + _backend_name; - if (0 == _lib_path_to_backend_id.count(backend_lib_path)) { + if (_lib_path_to_backend_id.count(backend_lib_path) == 0) { int is_load_ok = load_backend(backend_lib_path, saver_config); - if (0 != is_load_ok) { + if (is_load_ok != 0) { QNN_LOG_WARN("failed to load QNN backend\n"); return 2; } } backend_id = _lib_path_to_backend_id[backend_lib_path]; - if (0 == _loaded_backend.count(backend_id) || 0 == _loaded_lib_handle.count(backend_id)) { + if (_loaded_backend.count(backend_id) == 0 || _loaded_lib_handle.count(backend_id) == 0) { QNN_LOG_WARN( "library %s is loaded but loaded backend count=%zu, " "loaded lib_handle count=%zu\n", @@ -187,9 +210,8 @@ class qnn_instance { return 3; } - _qnn_interface.set_qnn_interface(_loaded_backend[backend_id]); - - _qnn_interface.qnn_log_create(qnn::sdk_logcallback, _qnn_log_level, &_qnn_log_handle); + _qnn_interface = std::make_shared(*_loaded_backend[backend_id]); + _qnn_interface->qnn_log_create(qnn::sdk_logcallback, _qnn_log_level, &_qnn_log_handle); if (nullptr == _qnn_log_handle) { // NPU backend not work on Qualcomm SoC equipped low-end phone QNN_LOG_WARN("why failed to initialize qnn log\n"); @@ -199,7 +221,7 @@ class qnn_instance { } std::vector temp_backend_config; - _qnn_interface.qnn_backend_create( + _qnn_interface->qnn_backend_create( _qnn_log_handle, temp_backend_config.empty() ? nullptr : temp_backend_config.data(), &_qnn_backend_handle); if (nullptr == _qnn_backend_handle) { QNN_LOG_WARN("why failed to initialize qnn backend\n"); @@ -208,20 +230,18 @@ class qnn_instance { QNN_LOG_DEBUG("initialize qnn backend successfully\n"); } - if (nullptr != _qnn_raw_interface.propertyHasCapability) { - Qnn_ErrorHandle_t qnn_status = _qnn_raw_interface.propertyHasCapability(QNN_PROPERTY_GROUP_DEVICE); - if (QNN_PROPERTY_NOT_SUPPORTED == qnn_status) { - QNN_LOG_WARN("device property is not supported\n"); - } - if (QNN_PROPERTY_ERROR_UNKNOWN_KEY == qnn_status) { - QNN_LOG_WARN("device property is not known to backend\n"); - } + Qnn_ErrorHandle_t qnn_status = _qnn_interface->qnn_property_has_capability(QNN_PROPERTY_GROUP_DEVICE); + if (QNN_PROPERTY_NOT_SUPPORTED == qnn_status) { + QNN_LOG_WARN("device property is not supported\n"); + } + if (QNN_PROPERTY_ERROR_UNKNOWN_KEY == qnn_status) { + QNN_LOG_WARN("device property is not known to backend\n"); } - Qnn_ErrorHandle_t qnn_status = QNN_SUCCESS; + qnn_status = QNN_SUCCESS; if (_backend_name.find("Htp") != std::variant_npos) { const QnnDevice_PlatformInfo_t *p_info = nullptr; - _qnn_raw_interface.deviceGetPlatformInfo(nullptr, &p_info); + _qnn_interface->qnn_device_get_platform_info(nullptr, &p_info); QNN_LOG_INFO("device counts %d", p_info->v1.numHwDevices); QnnDevice_HardwareDeviceInfo_t *infos = p_info->v1.hwDevices; QnnHtpDevice_OnChipDeviceInfoExtension_t chipinfo = {}; @@ -238,7 +258,7 @@ class qnn_instance { chipinfo.vtcmSize); _soc_info = { chipinfo.socModel, htp_arch, chipinfo.vtcmSize }; } - _qnn_raw_interface.deviceFreePlatformInfo(nullptr, p_info); + _qnn_interface->qnn_device_free_platform_info(nullptr, p_info); QnnHtpDevice_CustomConfig_t soc_customconfig; soc_customconfig.option = QNN_HTP_DEVICE_CONFIG_OPTION_SOC; @@ -256,9 +276,9 @@ class qnn_instance { arch_devconfig.customConfig = &arch_customconfig; const QnnDevice_Config_t *p_deviceconfig[] = { &soc_devconfig, &arch_devconfig, nullptr }; - qnn_status = _qnn_raw_interface.deviceCreate(_qnn_log_handle, p_deviceconfig, &_qnn_device_handle); + 
qnn_status = _qnn_interface->qnn_device_create(_qnn_log_handle, p_deviceconfig, &_qnn_device_handle); } else { - qnn_status = _qnn_raw_interface.deviceCreate(_qnn_log_handle, nullptr, &_qnn_device_handle); + qnn_status = _qnn_interface->qnn_device_create(_qnn_log_handle, nullptr, &_qnn_device_handle); } if (QNN_SUCCESS != qnn_status && QNN_DEVICE_ERROR_UNSUPPORTED_FEATURE != qnn_status) { QNN_LOG_WARN("failed to create QNN device\n"); @@ -270,7 +290,7 @@ class qnn_instance { QNN_LOG_INFO("profiling turned on; level = %d", _profile_level); if (qnn::sdk_profile_level::profile_basic == _profile_level) { QNN_LOG_INFO("basic profiling requested. creating Qnn Profile object\n"); - if (QNN_PROFILE_NO_ERROR != _qnn_raw_interface.profileCreate( + if (QNN_PROFILE_NO_ERROR != _qnn_interface->qnn_profile_create( _qnn_backend_handle, QNN_PROFILE_LEVEL_BASIC, &_qnn_profile_handle)) { QNN_LOG_WARN("unable to create profile handle in the backend\n"); return 6; @@ -279,9 +299,9 @@ class qnn_instance { } } else if (qnn::sdk_profile_level::profile_detail == _profile_level) { QNN_LOG_INFO("detailed profiling requested. Creating Qnn Profile object\n"); - if (QNN_PROFILE_NO_ERROR != _qnn_raw_interface.profileCreate(_qnn_backend_handle, - QNN_PROFILE_LEVEL_DETAILED, - &_qnn_profile_handle)) { + if (QNN_PROFILE_NO_ERROR != _qnn_interface->qnn_profile_create(_qnn_backend_handle, + QNN_PROFILE_LEVEL_DETAILED, + &_qnn_profile_handle)) { QNN_LOG_WARN("unable to create profile handle in the backend\n"); return 7; } else { @@ -290,22 +310,22 @@ class qnn_instance { } } - _rpc_lib_handle = dlopen("libcdsprpc.so", RTLD_NOW | RTLD_LOCAL); + _rpc_lib_handle = dl_load("libcdsprpc.so"); if (nullptr == _rpc_lib_handle) { - QNN_LOG_WARN("failed to load qualcomm's rpc lib, error:%s\n", dlerror()); + QNN_LOG_WARN("failed to load qualcomm's rpc lib, error:%s\n", dl_error()); return 8; } else { QNN_LOG_DEBUG("load rpcmem lib successfully\n"); set_rpcmem_initialized(true); } - _pfn_rpc_mem_init = reinterpret_cast(dlsym(_rpc_lib_handle, "rpcmem_init")); - _pfn_rpc_mem_deinit = reinterpret_cast(dlsym(_rpc_lib_handle, "rpcmem_deinit")); - _pfn_rpc_mem_alloc = reinterpret_cast(dlsym(_rpc_lib_handle, "rpcmem_alloc")); - _pfn_rpc_mem_free = reinterpret_cast(dlsym(_rpc_lib_handle, "rpcmem_free")); - _pfn_rpc_mem_to_fd = reinterpret_cast(dlsym(_rpc_lib_handle, "rpcmem_to_fd")); + _pfn_rpc_mem_init = reinterpret_cast(dl_sym(_rpc_lib_handle, "rpcmem_init")); + _pfn_rpc_mem_deinit = reinterpret_cast(dl_sym(_rpc_lib_handle, "rpcmem_deinit")); + _pfn_rpc_mem_alloc = reinterpret_cast(dl_sym(_rpc_lib_handle, "rpcmem_alloc")); + _pfn_rpc_mem_free = reinterpret_cast(dl_sym(_rpc_lib_handle, "rpcmem_free")); + _pfn_rpc_mem_to_fd = reinterpret_cast(dl_sym(_rpc_lib_handle, "rpcmem_to_fd")); if (nullptr == _pfn_rpc_mem_alloc || nullptr == _pfn_rpc_mem_free || nullptr == _pfn_rpc_mem_to_fd) { - QNN_LOG_WARN("unable to access symbols in QNN RPC lib. dlerror(): %s", dlerror()); - dlclose(_rpc_lib_handle); + QNN_LOG_WARN("unable to access symbols in QNN RPC lib. 
error: %s", dl_error()); + dl_unload(_rpc_lib_handle); return 9; } @@ -318,7 +338,7 @@ class qnn_instance { qnn_context_config.priority = QNN_PRIORITY_DEFAULT; const QnnContext_Config_t * context_configs[] = {&qnn_context_config, nullptr}; */ - _qnn_interface.qnn_context_create(_qnn_backend_handle, _qnn_device_handle, nullptr, &_qnn_context_handle); + _qnn_interface->qnn_context_create(_qnn_backend_handle, _qnn_device_handle, nullptr, &_qnn_context_handle); if (nullptr == _qnn_context_handle) { QNN_LOG_WARN("why failed to initialize qnn context\n"); return 10; @@ -370,8 +390,8 @@ class qnn_instance { if (nullptr != _pfn_rpc_mem_deinit) // make Qualcomm's SoC equipped low-end phone happy _pfn_rpc_mem_deinit(); - if (dlclose(_rpc_lib_handle) != 0) { - QNN_LOG_WARN("failed to unload qualcomm's rpc lib, error:%s\n", dlerror()); + if (dl_unload(_rpc_lib_handle) != 0) { + QNN_LOG_WARN("failed to unload qualcomm's rpc lib, error:%s\n", dl_error()); } else { QNN_LOG_DEBUG("succeed to close rpcmem lib\n"); } @@ -381,45 +401,45 @@ class qnn_instance { } if (nullptr != _qnn_context_handle) { - error = _qnn_interface.qnn_context_free(_qnn_context_handle, _qnn_profile_handle); + error = _qnn_interface->qnn_context_free(_qnn_context_handle, _qnn_profile_handle); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN context_handle: ID %u, error %d\n", _qnn_interface.get_backend_id(), + QNN_LOG_WARN("failed to free QNN context_handle: ID %u, error %d\n", _qnn_interface->get_backend_id(), QNN_GET_ERROR_CODE(error)); } _qnn_context_handle = nullptr; } if (nullptr != _qnn_profile_handle) { - error = _qnn_interface.qnn_profile_free(_qnn_profile_handle); + error = _qnn_interface->qnn_profile_free(_qnn_profile_handle); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN profile_handle: ID %u, error %d\n", _qnn_interface.get_backend_id(), + QNN_LOG_WARN("failed to free QNN profile_handle: ID %u, error %d\n", _qnn_interface->get_backend_id(), QNN_GET_ERROR_CODE(error)); } _qnn_profile_handle = nullptr; } if (nullptr != _qnn_device_handle) { - error = _qnn_interface.qnn_device_free(_qnn_device_handle); + error = _qnn_interface->qnn_device_free(_qnn_device_handle); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN device_handle: ID %u, error %d\n", _qnn_interface.get_backend_id(), + QNN_LOG_WARN("failed to free QNN device_handle: ID %u, error %d\n", _qnn_interface->get_backend_id(), QNN_GET_ERROR_CODE(error)); } _qnn_device_handle = nullptr; } if (nullptr != _qnn_backend_handle) { - error = _qnn_interface.qnn_backend_free(_qnn_backend_handle); + error = _qnn_interface->qnn_backend_free(_qnn_backend_handle); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN backend_handle: ID %u, error %d\n", _qnn_interface.get_backend_id(), + QNN_LOG_WARN("failed to free QNN backend_handle: ID %u, error %d\n", _qnn_interface->get_backend_id(), QNN_GET_ERROR_CODE(error)); } _qnn_backend_handle = nullptr; } if (nullptr != _qnn_log_handle) { - error = _qnn_interface.qnn_log_free(_qnn_log_handle); + error = _qnn_interface->qnn_log_free(_qnn_log_handle); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN log_handle: ID %u, error %d\n", _qnn_interface.get_backend_id(), + QNN_LOG_WARN("failed to free QNN log_handle: ID %u, error %d\n", _qnn_interface->get_backend_id(), QNN_GET_ERROR_CODE(error)); } _qnn_log_handle = nullptr; @@ -427,32 +447,18 @@ class qnn_instance { unload_backend(); - unload_system(); + _qnn_sys_interface.reset(); return ret_status; } - const qnn_interface 
&get_qnn_interface() { - if (!_qnn_interface.is_loaded()) { + std::shared_ptr get_qnn_interface() { + if (!_qnn_interface) { QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); } return _qnn_interface; } - const QNN_INTERFACE_VER_TYPE &get_qnn_raw_interface() { - if (!_qnn_interface.is_loaded()) { - QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); - } - return _qnn_raw_interface; - } - - const QNN_SYSTEM_INTERFACE_VER_TYPE &get_qnn_raw_system_interface() { - if (!_qnn_interface.is_loaded()) { - QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); - } - return _qnn_raw_system_interface; - } - const Qnn_LogHandle_t get_qnn_log_handle() { return _qnn_log_handle; } const Qnn_ProfileHandle_t get_qnn_profile_handle() { return _qnn_profile_handle; } @@ -463,13 +469,11 @@ class qnn_instance { const Qnn_ContextHandle_t get_qnn_context_handle() { return _qnn_context_handle; } - const QnnSystemContext_Handle_t get_qnn_system_handle() { return _qnn_system_handle; } - const Qnn_GraphHandle_t get_qnn_graph_handle() { return _qnn_graph_handle; } int init_htp_perfinfra() { QnnDevice_Infrastructure_t device_infra = nullptr; - int error = _qnn_raw_interface.deviceGetInfrastructure(&device_infra); + int error = _qnn_interface->qnn_device_get_infrastructure(&device_infra); if (error != QNN_SUCCESS) { QNN_LOG_WARN("failed to get qnn device infra\n"); return 1; @@ -655,8 +659,8 @@ class qnn_instance { { { mem_fd } } }; Qnn_MemHandle_t handle = nullptr; int error = QNN_SUCCESS; - error = _qnn_interface.qnn_mem_register(_qnn_context_handle, &descriptor, - /*numDescriptors=*/1, &handle); + error = _qnn_interface->qnn_mem_register(_qnn_context_handle, &descriptor, + /*numDescriptors=*/1, &handle); if (error != QNN_SUCCESS) { QNN_LOG_WARN("failed to register shared memory, error %d, %s\n", QNN_GET_ERROR_CODE(error), strerror(error)); @@ -666,7 +670,8 @@ class qnn_instance { QNN_TENSOR_SET_MEM_HANDLE(*p_tensor, handle); _qnn_mem_set.insert((std::pair(p_data, handle))); - QNN_LOG_INFO("tensor %s successfully register shared memory handler: %p\n", QNN_TENSOR_GET_NAME(*p_tensor), handle); + QNN_LOG_INFO("tensor %s successfully register shared memory handler: %p\n", QNN_TENSOR_GET_NAME(*p_tensor), + handle); return 0; } @@ -692,7 +697,7 @@ class qnn_instance { for (std::unordered_map::iterator it = _qnn_mem_set.begin(); it != _qnn_mem_set.end(); it++) { Qnn_MemHandle_t mem_handle = it->second; - error = _qnn_interface.qnn_mem_de_register(&mem_handle, 1); + error = _qnn_interface->qnn_mem_de_register(&mem_handle, 1); if (error != QNN_SUCCESS) { QNN_LOG_WARN("failed to unregister shared memory, error %d\n", QNN_GET_ERROR_CODE(error)); } @@ -711,16 +716,16 @@ class qnn_instance { std::string system_lib_path = _lib_path + "libQnnSystem.so"; QNN_LOG_DEBUG("system_lib_path:%s\n", system_lib_path.c_str()); - _system_lib_handle = dlopen(system_lib_path.c_str(), RTLD_NOW | RTLD_LOCAL); - if (nullptr == _system_lib_handle) { - QNN_LOG_WARN("can not open QNN library %s, error: %s\n", system_lib_path.c_str(), dlerror()); + auto system_lib_handle = dl_load(system_lib_path); + if (!system_lib_handle) { + QNN_LOG_WARN("can not load QNN library %s, error: %s\n", system_lib_path.c_str(), dl_error()); return 1; } auto *get_providers = reinterpret_cast( - dlsym(_system_lib_handle, "QnnSystemInterface_getProviders")); - if (nullptr == get_providers) { - QNN_LOG_WARN("can not load QNN symbol QnnSystemInterface_getProviders: %s\n", dlerror()); + dl_sym(system_lib_handle, "QnnSystemInterface_getProviders")); + 
if (!get_providers) { + QNN_LOG_WARN("can not load QNN symbol QnnSystemInterface_getProviders: %s\n", dl_error()); return 2; } @@ -737,7 +742,7 @@ class qnn_instance { return 4; } - if (nullptr == provider_list) { + if (!provider_list) { QNN_LOG_WARN("can not get providers\n"); return 5; } @@ -758,61 +763,31 @@ class qnn_instance { } else { QNN_LOG_INFO("find a valid qnn system interface\n"); } - set_qnn_raw_system_interface(qnn_system_interface); - _qnn_interface.set_qnn_system_interface(provider_list[0]); - - _qnn_interface.qnn_system_context_create(&_qnn_system_handle); - if (nullptr == _qnn_system_handle) { - QNN_LOG_WARN("can not create QNN system contenxt\n"); - } else { - QNN_LOG_INFO("initialize qnn system successfully\n"); + auto qnn_sys_interface = std::make_shared(*provider_list[0], system_lib_handle); + if (!qnn_sys_interface->is_valid()) { + QNN_LOG_WARN("failed to create QNN system interface\n"); + return 7; } + _qnn_sys_interface = qnn_sys_interface; return 0; } - int unload_system() { - int result = 0; - - if (nullptr == _system_lib_handle) { - QNN_LOG_WARN("system lib handle is null\n"); - return 1; - } - - if (nullptr != _qnn_system_handle) { - result = _qnn_interface.qnn_system_context_free(_qnn_system_handle); - if (result != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN system context\n"); - } - _qnn_system_handle = nullptr; - } - - int dlclose_error = dlclose(_system_lib_handle); - if (dlclose_error != 0) { - QNN_LOG_WARN("failed to close QnnSystem library, error %s\n", dlerror()); - return 2; - } - - _system_lib_handle = nullptr; - - return result; - } - int load_backend(std::string &lib_path, const QnnSaver_Config_t **saver_config) { Qnn_ErrorHandle_t error = QNN_SUCCESS; QNN_LOG_DEBUG("lib_path:%s\n", lib_path.c_str()); - void *lib_handle = dlopen(lib_path.c_str(), RTLD_NOW | RTLD_GLOBAL); - if (nullptr == lib_handle) { - QNN_LOG_WARN("can not open QNN library %s, with error: %s", lib_path.c_str(), dlerror()); + auto lib_handle = dl_load(lib_path.c_str()); + if (!lib_handle) { + QNN_LOG_WARN("can not open QNN library %s, with error: %s", lib_path.c_str(), dl_error()); return 1; } auto get_providers = qnn::load_qnn_functionpointers( lib_handle, "QnnInterface_getProviders"); - if (nullptr == get_providers) { - QNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s", dlerror()); + if (!get_providers) { + QNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s", dl_error()); return 2; } @@ -829,7 +804,7 @@ class qnn_instance { return 4; } - if (nullptr == provider_list) { + if (!provider_list) { QNN_LOG_WARN("failed to get qnn interface providers\n"); return 5; } @@ -850,7 +825,6 @@ class qnn_instance { } else { QNN_LOG_INFO("find a valid qnn interface\n"); } - set_qnn_raw_interface(qnn_interface); BackendIdType backend_id = provider_list[0]->backendId; _lib_path_to_backend_id[lib_path] = backend_id; @@ -860,9 +834,9 @@ class qnn_instance { _loaded_backend[backend_id] = provider_list[0]; if (_loaded_lib_handle.count(backend_id) > 0) { QNN_LOG_WARN("closing %p\n", _loaded_lib_handle[backend_id]); - int dlclose_error = dlclose(_loaded_lib_handle[backend_id]); + int dlclose_error = dl_unload(_loaded_lib_handle[backend_id]); if (dlclose_error != 0) { - QNN_LOG_WARN("fail to close %p with error %s\n", _loaded_lib_handle[backend_id], dlerror()); + QNN_LOG_WARN("fail to close %p with error %s\n", _loaded_lib_handle[backend_id], dl_error()); } } _loaded_lib_handle[backend_id] = lib_handle; @@ -874,9 +848,9 @@ class qnn_instance { int unload_backend() { 
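        // unload_backend() closes every backend library handle cached in _loaded_lib_handle
        // through the portable dl_unload() wrapper and logs any failure text reported by
        // dl_error() -- the counterpart of the dl_load()/dl_sym() calls made in load_backend().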
int dlclose_error = 0; for (auto &it : _loaded_lib_handle) { - dlclose_error = dlclose(it.second); + dlclose_error = dl_unload(it.second); if (dlclose_error != 0) { - QNN_LOG_WARN("failed to close QNN backend %d, error %s\n", it.first, dlerror()); + QNN_LOG_WARN("failed to close QNN backend %d, error %s\n", it.first, dl_error()); } } @@ -887,12 +861,6 @@ class qnn_instance { return 0; } - void set_qnn_raw_interface(QNN_INTERFACE_VER_TYPE &raw_interface) { _qnn_raw_interface = raw_interface; } - - void set_qnn_raw_system_interface(QNN_SYSTEM_INTERFACE_VER_TYPE &raw_interface) { - _qnn_raw_system_interface = raw_interface; - } - private: static constexpr const int _required_num_providers = 1; @@ -905,9 +873,8 @@ class qnn_instance { qnn::sdk_profile_level _profile_level = qnn::sdk_profile_level::profile_detail; - qnn_interface _qnn_interface; - - void *_system_lib_handle = nullptr; + std::shared_ptr _qnn_sys_interface; + std::shared_ptr _qnn_interface; Qnn_GraphHandle_t _qnn_graph_handle = nullptr; @@ -921,14 +888,9 @@ class qnn_instance { Qnn_ContextHandle_t _qnn_context_handle = nullptr; - QnnSystemContext_Handle_t _qnn_system_handle = nullptr; - QnnHtpDevice_PerfInfrastructure_t *_qnn_htp_perfinfra = nullptr; uint32_t _qnn_power_configid = 1; - QNN_INTERFACE_VER_TYPE _qnn_raw_interface; - QNN_SYSTEM_INTERFACE_VER_TYPE _qnn_raw_system_interface; - std::unordered_map _qnn_mem_set; std::mutex _init_mutex; @@ -936,7 +898,7 @@ class qnn_instance { std::unordered_map _lib_path_to_backend_id; std::unordered_map _loaded_backend; - void *_rpc_lib_handle = nullptr; + dl_handler_t _rpc_lib_handle = nullptr; std::atomic_bool _rpcmem_initialized{ false }; qnn::pfn_rpc_mem_alloc _pfn_rpc_mem_alloc; qnn::pfn_rpc_mem_free _pfn_rpc_mem_free; diff --git a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp index e6bb63c54481c..e5dc436adaa5c 100644 --- a/ggml/src/ggml-qnn/tensor.hpp +++ b/ggml/src/ggml-qnn/tensor.hpp @@ -12,7 +12,7 @@ #include "System/QnnSystemInterface.h" #include "graph.hpp" #include "logger.hpp" -#include "qnn.hpp" +#include "qnn-lib.hpp" #include "utils.hpp" namespace qnn { diff --git a/tests/ggml-qnn/CMakeLists.txt b/tests/ggml-qnn/CMakeLists.txt index e72cc13e78ce4..f9678d3d88f00 100644 --- a/tests/ggml-qnn/CMakeLists.txt +++ b/tests/ggml-qnn/CMakeLists.txt @@ -20,6 +20,7 @@ set(SOURCE_FILES ../../ggml/src/ggml-alloc.c ../../ggml/src/ggml-backend.c ../../ggml/src/ggml-quants.c + ../../ggml/src/ggml-qnn/qnn-lib.cpp ../../ggml/src/ggml-qnn/logger.cpp ../../ggml/src/ggml-qnn/utils.cpp ../../ggml/src/ggml-qnn/backend-ops.cpp From b1ef302991577ce1ab0913e71848436542f71ad1 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Wed, 17 Jul 2024 12:21:33 +0800 Subject: [PATCH 073/143] refactoring: remove depend of dlsym at utils.hpp --- ggml/src/ggml-qnn/qnn-lib.hpp | 13 +++++++++---- ggml/src/ggml-qnn/utils.hpp | 5 ----- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/ggml/src/ggml-qnn/qnn-lib.hpp b/ggml/src/ggml-qnn/qnn-lib.hpp index 7307c9f63e75f..a676f989566e5 100644 --- a/ggml/src/ggml-qnn/qnn-lib.hpp +++ b/ggml/src/ggml-qnn/qnn-lib.hpp @@ -36,6 +36,11 @@ inline int dl_unload(dl_handler_t handle) { return dlclose(handle); } inline const char *dl_error() { return dlerror(); } +template +Fn dl_sym_typed(dl_handler_t handle, const std::string &function_name) { + return reinterpret_cast(dl_sym(handle, function_name)); +} + // ================================================================================================= // // wrapper class of Qualcomm QNN(Qualcomm Neural 
Network, aka Qualcomm AI Engine Direct) SDK @@ -722,8 +727,8 @@ class qnn_instance { return 1; } - auto *get_providers = reinterpret_cast( - dl_sym(system_lib_handle, "QnnSystemInterface_getProviders")); + auto *get_providers = dl_sym_typed( + system_lib_handle, "QnnSystemInterface_getProviders"); if (!get_providers) { QNN_LOG_WARN("can not load QNN symbol QnnSystemInterface_getProviders: %s\n", dl_error()); return 2; @@ -784,8 +789,8 @@ class qnn_instance { return 1; } - auto get_providers = qnn::load_qnn_functionpointers( - lib_handle, "QnnInterface_getProviders"); + auto get_providers = + qnn::dl_sym_typed(lib_handle, "QnnInterface_getProviders"); if (!get_providers) { QNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s", dl_error()); return 2; diff --git a/ggml/src/ggml-qnn/utils.hpp b/ggml/src/ggml-qnn/utils.hpp index 4a01347d0fc1b..66c3eeba471e2 100644 --- a/ggml/src/ggml-qnn/utils.hpp +++ b/ggml/src/ggml-qnn/utils.hpp @@ -24,11 +24,6 @@ uint32_t get_ggml_tensor_data_size(const ggml_tensor *tensor); const char *opname_from_ggmlop(enum ggml_op ggmlop); -template -Fn load_qnn_functionpointers(void *handle, const char *function_name) { - return reinterpret_cast(dlsym(handle, function_name)); -} - inline int validate_tensor_version(const Qnn_Tensor_t &tensor) { if (tensor.version != QNN_TENSOR_VERSION_1) { QNN_LOG_WARN("validate_tensor_version() tensor %s, got unsupported version %d\n", tensor.v1.name, From 63dc587dffae40c0cd7f1468859f2d430039a29e Mon Sep 17 00:00:00 2001 From: hongruichen Date: Wed, 17 Jul 2024 13:34:05 +0800 Subject: [PATCH 074/143] refactoring: make the buffer alloc and free stay in same class --- ggml/src/ggml-qnn.cpp | 86 +++++++++++++++++++++---------------- ggml/src/ggml-qnn/utils.cpp | 21 ++++++++- ggml/src/ggml-qnn/utils.hpp | 3 ++ 3 files changed, 73 insertions(+), 37 deletions(-) diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index 46f7e64bcdedb..46fdf87a64a9c 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -83,22 +83,54 @@ static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { ggml_backend_qnn_context(QNN_BACKEND_NPU, 1, "qnn-npu", "libQnnHtp.so"), /* QNN_BACKEND_NPU */ }; -struct ggml_backend_qnn_buffer_context { - ggml_backend_qnn_buffer_context(size_t device) : device(device), name(QNN_BACKEND_NAME + std::to_string(device)) {} +class ggml_backend_qnn_buffer_context { +public: + ggml_backend_qnn_buffer_context(QNNBackend device, std::shared_ptr instance, size_t size) : + _device(device), _instance(instance), _name(QNN_BACKEND_NAME + std::to_string(device)) { + + size_t size_page = sysconf(_SC_PAGESIZE); + + // TODO: for qnn npu, a better way here is to reuse the buffer allocated by qnn rpc, will save an extra copy + _buffer = qnn::align_alloc(size_page, size); + + if (!_buffer) { + QNN_LOG_WARN("failed to allocate %.2f MiB\n", float(size / (1 << 20))); + return; + } + + _buffer_size = size; + } ~ggml_backend_qnn_buffer_context() { - tensors.clear(); - if (buffer) { - free(buffer); + _tensors.clear(); + + // the free will do nothing if the _buffer is nullptr + qnn::align_free(_buffer); + } + + bool is_valid() const { return _buffer != nullptr; } + + bool init_tensor(ggml_tensor *tensor) { + auto qnn_tensor = std::make_unique(tensor, _device, _instance); + if (!qnn_tensor->is_valid()) { + QNN_LOG_WARN("Create ggml_qnn_tensor failed"); + return false; } + + _tensors.push_back(std::move(qnn_tensor)); + return true; } - void *buffer = nullptr; - struct ggml_backend_qnn_context *backend_ctx = nullptr; 
- std::list> tensors; - size_t buffer_size = 0; - size_t device; - std::string name; + void *get_buffer() { return _buffer; } + size_t get_buffer_size() { return _buffer_size; } + +private: + QNNBackend _device; + std::shared_ptr _instance; + std::string _name; + std::list> _tensors; + void *_buffer = nullptr; + size_t _buffer_size = 0; }; struct ggml_backend_qnn_buffer_type_context { @@ -189,20 +221,16 @@ GGML_CALL static void ggml_backend_qnn_buffer_free_buffer(ggml_backend_buffer_t GGML_CALL static void *ggml_backend_qnn_buffer_get_base(ggml_backend_buffer_t buffer) { ggml_backend_qnn_buffer_context *ctx = (ggml_backend_qnn_buffer_context *)buffer->context; - return ctx->buffer; + return ctx->get_buffer(); } GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor) { ggml_backend_qnn_buffer_context *ctx = (ggml_backend_qnn_buffer_context *)buffer->context; - auto instance = ctx->backend_ctx->instance; - auto qnn_tensor = std::make_unique(tensor, (QNNBackend)(ctx->device), instance); - if (!qnn_tensor->is_valid()) { - QNN_LOG_WARN("Create ggml_qnn_tensor failed"); + if (!ctx->init_tensor(tensor)) { + QNN_LOG_WARN("init ggml_qnn_tensor failed"); return; } - - ctx->tensors.push_back(std::move(qnn_tensor)); } GGML_CALL static void ggml_backend_qnn_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor, @@ -232,7 +260,7 @@ GGML_CALL static bool ggml_backend_qnn_buffer_cpy_tensor(ggml_backend_buffer_t b GGML_CALL static void ggml_backend_qnn_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { ggml_backend_qnn_buffer_context *ctx = (ggml_backend_qnn_buffer_context *)buffer->context; - memset(ctx->buffer, value, ctx->buffer_size); + memset(ctx->get_buffer(), value, ctx->get_buffer_size()); } static ggml_backend_buffer_i ggml_backend_qnn_buffer_interface = { @@ -263,23 +291,9 @@ static void *ggml_qnn_host_malloc(size_t n) { GGML_CALL static ggml_backend_buffer_t ggml_backend_qnn_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { ggml_backend_qnn_buffer_type_context *buft_ctx = (ggml_backend_qnn_buffer_type_context *)buft->context; - ggml_backend_qnn_buffer_context *ctx = new ggml_backend_qnn_buffer_context(buft_ctx->device); - - size_t size_page = sysconf(_SC_PAGESIZE); - - size_t size_aligned = size; - if ((size_aligned % size_page) != 0) { - size_aligned += (size_page - (size_aligned % size_page)); - } - - // TODO:use pre-allocated buffer in internal memory pool - ctx->buffer = ggml_qnn_host_malloc(size_aligned); - ctx->buffer_size = size_aligned; - - ctx->backend_ctx = &g_qnn_mgr[buft_ctx->device]; - - if (nullptr == ctx->buffer) { - QNN_LOG_WARN("%s: failed to allocate %.2f MiB\n", __func__, size / (1 << 20)); + ggml_backend_qnn_buffer_context *ctx = + new ggml_backend_qnn_buffer_context((QNNBackend)buft_ctx->device, g_qnn_mgr[buft_ctx->device].instance, size); + if (!ctx->is_valid()) { return nullptr; } diff --git a/ggml/src/ggml-qnn/utils.cpp b/ggml/src/ggml-qnn/utils.cpp index 7c25314f731f0..2b594bfa0503b 100644 --- a/ggml/src/ggml-qnn/utils.cpp +++ b/ggml/src/ggml-qnn/utils.cpp @@ -1,6 +1,8 @@ #include "utils.hpp" +#include + #include "ggml-qnn.h" #include "qnn-types.hpp" @@ -111,7 +113,7 @@ const char *get_htparch_desc(size_t htp_arch) { intptr_t align_to(size_t alignment, intptr_t offset) { return offset % alignment == 0 ? 
offset - : offset + (static_cast(alignment) - offset % static_cast(alignment)); + : offset + (static_cast(alignment) - (offset % static_cast(alignment))); } uint32_t get_ggml_tensor_data_size(const ggml_tensor *tensor) { @@ -127,6 +129,23 @@ uint32_t get_ggml_tensor_data_size(const ggml_tensor *tensor) { return ggml_nbytes(tensor); } +void *align_alloc(size_t alignment, size_t size) { + size_t size_aligned = size; + if ((size_aligned % alignment) != 0) { + size_aligned += (alignment - (size_aligned % alignment)); + } + + void *data = std::aligned_alloc(alignment, size_aligned); + if (!data) { + QNN_LOG_WARN("aligned_alloc failed\n"); + return nullptr; + } + + return data; +} + +void align_free(void *ptr) { std::free(ptr); } + // ================================================================================================= // // QNN backend internal helper functions diff --git a/ggml/src/ggml-qnn/utils.hpp b/ggml/src/ggml-qnn/utils.hpp index 66c3eeba471e2..b264f2326c7b2 100644 --- a/ggml/src/ggml-qnn/utils.hpp +++ b/ggml/src/ggml-qnn/utils.hpp @@ -22,6 +22,9 @@ const char *get_htparch_desc(size_t htp_arch); intptr_t align_to(size_t alignment, intptr_t offset); uint32_t get_ggml_tensor_data_size(const ggml_tensor *tensor); +void *align_alloc(size_t alignment, size_t size); +void align_free(void *ptr); + const char *opname_from_ggmlop(enum ggml_op ggmlop); inline int validate_tensor_version(const Qnn_Tensor_t &tensor) { From bb13795dce15c783c75ad92d1ea50ea214912324 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Wed, 17 Jul 2024 14:13:42 +0800 Subject: [PATCH 075/143] refactoring: remove unused functions and variables --- ggml/src/ggml-qnn.cpp | 26 -------- ggml/src/ggml-qnn/utils.cpp | 115 ------------------------------------ ggml/src/ggml-qnn/utils.hpp | 7 --- 3 files changed, 148 deletions(-) diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index 46fdf87a64a9c..13998a73ef7aa 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -49,21 +49,6 @@ #define QNN_BACKEND_NAME "qnn" -static struct qnn::qcom_socinfo g_qnn_soc_info_table[] = { - /* Qualcomm SnapDragon 8 Gen 1 */ - [qnn::SM8450] = { .soc_model = qnn::SM8450, .htp_arch = qnn::V69, .vtcm_size_in_mb = 8 }, - - /* Qualcomm SnapDragon 8 Gen 1+ */ - [qnn::SM8475] = { .soc_model = qnn::SM8475, .htp_arch = qnn::V69, .vtcm_size_in_mb = 8 }, - - /* Qualcomm SnapDragon 8 Gen 2 */ - [qnn::SM8550] = { .soc_model = qnn::SM8550, .htp_arch = qnn::V73, .vtcm_size_in_mb = 8 }, - - /* Qualcomm SnapDragon 8 Gen 3 */ - [qnn::SM8650] = { .soc_model = qnn::SM8650, .htp_arch = qnn::V75, .vtcm_size_in_mb = 8 }, - -}; - // according to the QNN SDK Reference Guide, // CPU - Choose a non-quantized model.Quantized models are currently incompatible with the CPU backend // GPU - Choose a non-quantized model.Quantized models are currently incompatible with the GPU backend @@ -277,17 +262,6 @@ static ggml_backend_buffer_i ggml_backend_qnn_buffer_interface = { GGML_CALL static const char *ggml_backend_qnn_buffer_type_name(ggml_backend_buffer_type_t buft) { return "QNN"; } -static void *ggml_qnn_host_malloc(size_t n) { - void *data = nullptr; - int result = posix_memalign((void **)&data, sysconf(_SC_PAGESIZE), n); - if (result != 0) { - QNN_LOG_WARN("%s: error: posix_memalign failed\n", __func__); - return nullptr; - } - - return data; -} - GGML_CALL static ggml_backend_buffer_t ggml_backend_qnn_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { ggml_backend_qnn_buffer_type_context *buft_ctx = 
(ggml_backend_qnn_buffer_type_context *)buft->context; diff --git a/ggml/src/ggml-qnn/utils.cpp b/ggml/src/ggml-qnn/utils.cpp index 2b594bfa0503b..11358395219ca 100644 --- a/ggml/src/ggml-qnn/utils.cpp +++ b/ggml/src/ggml-qnn/utils.cpp @@ -7,20 +7,6 @@ #include "qnn-types.hpp" -namespace { - -size_t memscpy(void *dst, size_t dst_size, const void *src, size_t copy_size) { - if (!dst || !src || !dst_size || !copy_size) return 0; - - size_t min_size = dst_size < copy_size ? dst_size : copy_size; - - memcpy(dst, src, min_size); - - return min_size; -} - -} // namespace - namespace qnn { // TODO: mapping more ggml data type to QNN data type @@ -166,105 +152,4 @@ const char *opname_from_ggmlop(enum ggml_op ggmlop) { return nullptr; } -void device_tensor_init(Qnn_Tensor_t &tensor, uint32_t rank, Qnn_TensorMemType_t mem_type, const char *tensor_name, - Qnn_TensorType_t qnn_tensor_type, Qnn_DataType_t qnn_data_type, uint32_t *dimensions) { - tensor = QNN_TENSOR_INIT; - tensor = { .version = QNN_TENSOR_VERSION_1, - { .v1 = { .id = 0, - .name = tensor_name, - .type = qnn_tensor_type, - .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, - .dataType = qnn_data_type, - .quantizeParams = { QNN_DEFINITION_UNDEFINED, - QNN_QUANTIZATION_ENCODING_UNDEFINED, - { .scaleOffsetEncoding = { .scale = 0.0000000000000000f, .offset = 0 } } }, - .rank = rank, - .dimensions = dimensions, - .memType = mem_type, - { .clientBuf = {} } } } }; -} - -Qnn_ErrorHandle_t device_tensor_deep_copy(const Qnn_Tensor_t &src, Qnn_Tensor_t &dst) { - Qnn_ErrorHandle_t err = validate_tensor_version(src); - if (err != QNN_SUCCESS) { - QNN_LOG_WARN("validate_tensor_version expected QNN_SUCCESS\n"); - return err; - } - - dst.version = src.version; - QNN_TENSOR_SET_NAME(dst, ::strndup(QNN_TENSOR_GET_NAME(src), std::string(QNN_TENSOR_GET_NAME(src)).size())); - if (nullptr == QNN_TENSOR_GET_NAME(dst)) { - return (Qnn_ErrorHandle_t)1; - } - QNN_TENSOR_SET_ID(dst, QNN_TENSOR_GET_ID(src)); - QNN_TENSOR_SET_TYPE(dst, QNN_TENSOR_GET_TYPE(src)); - QNN_TENSOR_SET_DATA_FORMAT(dst, QNN_TENSOR_GET_DATA_FORMAT(src)); - QNN_TENSOR_SET_DATA_TYPE(dst, QNN_TENSOR_GET_DATA_TYPE(src)); - QNN_TENSOR_SET_MEM_TYPE(dst, QNN_TENSOR_GET_MEM_TYPE(src)); - - if (QNN_TENSOR_GET_MEM_TYPE(src) == QNN_TENSORMEMTYPE_RAW) { - Qnn_ClientBuffer_t client_buf = { nullptr, 0 }; - QNN_TENSOR_SET_CLIENT_BUF(dst, client_buf); - } else if (QNN_TENSOR_GET_MEM_TYPE(src) == QNN_TENSORMEMTYPE_MEMHANDLE) { - QNN_TENSOR_SET_MEM_HANDLE(dst, nullptr); - } else { - return (Qnn_ErrorHandle_t)1; - } - - Qnn_QuantizeParams_t src_qparam = QNN_TENSOR_GET_QUANT_PARAMS(src); - Qnn_QuantizationEncoding_t encoding = src_qparam.quantizationEncoding; - if (encoding == QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET) { - Qnn_QuantizeParams_t src_qparam_cpy = src_qparam; - Qnn_AxisScaleOffset_t &axis_scale_offset = src_qparam_cpy.axisScaleOffsetEncoding; - Qnn_ScaleOffset_t **scaleOffset = &axis_scale_offset.scaleOffset; - size_t scaleOffsetSize = axis_scale_offset.numScaleOffsets * sizeof(Qnn_ScaleOffset_t); - *scaleOffset = (Qnn_ScaleOffset_t *)malloc(scaleOffsetSize); - memscpy(*scaleOffset, scaleOffsetSize, src_qparam.axisScaleOffsetEncoding.scaleOffset, scaleOffsetSize); - QNN_TENSOR_SET_QUANT_PARAMS(dst, src_qparam_cpy); - } else if (encoding == QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET) { - Qnn_QuantizeParams_t src_qparam_cpy = src_qparam; - Qnn_BwAxisScaleOffset_t &bwaxis_scale_offset = src_qparam_cpy.bwAxisScaleOffsetEncoding; - size_t scaleSize = bwaxis_scale_offset.numElements * sizeof(float); 
- float **scales = &bwaxis_scale_offset.scales; - int32_t **offsets = &bwaxis_scale_offset.offsets; - *scales = (float *)malloc(scaleSize); - memscpy(*scales, scaleSize, src_qparam.bwAxisScaleOffsetEncoding.scales, scaleSize); - - if (bwaxis_scale_offset.offsets != nullptr) { - size_t offsetSize = bwaxis_scale_offset.numElements * sizeof(int32_t); - *offsets = (int32_t *)malloc(offsetSize); - memscpy(*offsets, offsetSize, src_qparam.bwAxisScaleOffsetEncoding.offsets, offsetSize); - } - QNN_TENSOR_SET_QUANT_PARAMS(dst, src_qparam_cpy); - } else { - QNN_TENSOR_SET_QUANT_PARAMS(dst, src_qparam); - } - - uint32_t rank = QNN_TENSOR_GET_RANK(src); - QNN_TENSOR_SET_RANK(dst, rank); - size_t dim_size = rank * sizeof(uint32_t); - uint32_t *dimensions = (uint32_t *)malloc(dim_size); - if (dimensions == nullptr) { - QNN_LOG_WARN( - "deep_copy_qnn_tensors() allocation error while copying " - "tensor %s\n", - QNN_TENSOR_GET_NAME(src)); - return (Qnn_ErrorHandle_t)1; - } - memscpy(dimensions, dim_size, QNN_TENSOR_GET_DIMENSIONS(src), dim_size); - QNN_TENSOR_SET_DIMENSIONS(dst, dimensions); - - return err; -} - -void device_tensor_free(Qnn_Tensor_t &tensor) { - if (validate_tensor_version(tensor) != QNN_SUCCESS) { - QNN_LOG_WARN("validate_tensor_version expected QNN_SUCCESS\n"); - return; - } - - free((void *)QNN_TENSOR_GET_NAME(tensor)); - free(QNN_TENSOR_GET_DIMENSIONS(tensor)); -} - } // namespace qnn diff --git a/ggml/src/ggml-qnn/utils.hpp b/ggml/src/ggml-qnn/utils.hpp index b264f2326c7b2..d00673e9a47ce 100644 --- a/ggml/src/ggml-qnn/utils.hpp +++ b/ggml/src/ggml-qnn/utils.hpp @@ -176,13 +176,6 @@ inline void set_qnn_tensor_memhandle(Qnn_Tensor_t &tensor, Qnn_MemHandle_t handl Qnn_DataType_t device_datatype_from_ggml_datatype(ggml_type ggml_type); Qnn_TensorType_t device_tensortype_from_ggml_tensor(ggml_tensor *ggml_tensor); -void device_tensor_init(Qnn_Tensor_t &tensor, uint32_t rank, Qnn_TensorMemType_t mem_type, const char *tensor_name, - Qnn_TensorType_t qnn_tensor_type, Qnn_DataType_t qnn_data_type, uint32_t *dimensions); - -Qnn_ErrorHandle_t device_tensor_deep_copy(const Qnn_Tensor_t &src, Qnn_Tensor_t &dst); - -void device_tensor_free(Qnn_Tensor_t &tensor); - #if ENABLE_QNNBACKEND_PERF class qnn_perf { public: From eed960575fa8a4819c9a0e240a302ab9f1119a77 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Wed, 17 Jul 2024 19:43:01 +0800 Subject: [PATCH 076/143] add build step of QNN backend at ggml --- CMakeLists.txt | 1 + ggml/CMakeLists.txt | 3 ++- ggml/src/CMakeLists.txt | 28 ++++++++++++++++++++++++++++ 3 files changed, 31 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 67dcf86d4fab7..1afc63c639089 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -106,6 +106,7 @@ llama_option_depr(WARNING LLAMA_NATIVE GGML_NATIVE) llama_option_depr(WARNING LLAMA_RPC GGML_RPC) llama_option_depr(WARNING LLAMA_SYCL GGML_SYCL) llama_option_depr(WARNING LLAMA_SYCL_F16 GGML_SYCL_F16) +llama_option_depr(WARNING LLAMA_QNN GGML_QNN) # # build the library diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 649ac3dcc4f63..294653804b5f8 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -145,6 +145,7 @@ option(GGML_SYCL "ggml: use SYCL" option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF) set (GGML_SYCL_TARGET "INTEL" CACHE STRING "ggml: sycl target device") +option(GGML_QNN "ggml: use QNN" OFF) # extra artifacts option(GGML_BUILD_TESTS "ggml: build tests" ${GGML_STANDALONE}) @@ -157,7 +158,7 @@ option(GGML_BUILD_EXAMPLES "ggml: build examples" 
${GGML_STANDALONE}) set(CMAKE_C_STANDARD 11) set(CMAKE_C_STANDARD_REQUIRED true) -if (GGML_SYCL) +if (GGML_SYCL OR GGML_QNN) set(CMAKE_CXX_STANDARD 17) else() set(CMAKE_CXX_STANDARD 11) diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index cbadaf4d931c3..e2ba88a1781e0 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -770,6 +770,33 @@ if (GGML_CPU_HBM) target_link_libraries(ggml PUBLIC memkind) endif() +if (GGML_QNN) + if (CMAKE_SYSTEM_NAME STREQUAL "Android") + find_library(LOG_LIB log) + find_library(ANDROID_LIB android) + set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${LOG_LIB} ${ANDROID_LIB}) + else() + message(FATAL_ERROR "QNN now only available on Android") + endif() + + if (NOT DEFINED GGML_QNN_SDK_PATH) + # try read from environment variable + if (DEFINED ENV{QNN_SDK_PATH}) + set(GGML_QNN_SDK_PATH $ENV{QNN_SDK_PATH}) + else() + message(FATAL_ERROR "GGML_QNN_SDK_PATH not defined") + endif() + endif() + + message("QNN_SDK_PATH: ${GGML_QNN_SDK_PATH}") + file(GLOB GGML_SOURCES_QNN "ggml-qnn/*.cpp") + list(APPEND GGML_SOURCES_QNN "ggml-qnn.cpp") + set(GGML_HEADERS_QNN ../include/ggml-qnn.h) + set(QNN_INC_PATH ${GGML_QNN_SDK_PATH}/include/QNN) + set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} ${QNN_INC_PATH} "ggml-qnn") + list(APPEND GGML_CDEF_PUBLIC GGML_USE_QNN) +endif() + function(get_flags CCID CCVER) set(C_FLAGS "") set(CXX_FLAGS "") @@ -1184,6 +1211,7 @@ add_library(ggml ${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM} ${GGML_SOURCES_BLAS} ${GGML_HEADERS_BLAS} ${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE} + ${GGML_SOURCES_QNN} ${GGML_HEADERS_QNN} ggml-aarch64.c ggml-aarch64.h ) From 454deef83c14ae33543d289ac40d3a6ad277a3cf Mon Sep 17 00:00:00 2001 From: hongruichen Date: Wed, 17 Jul 2024 20:53:53 +0800 Subject: [PATCH 077/143] register qnn backend --- ggml/include/ggml-qnn.h | 2 -- ggml/src/ggml-backend.c | 5 +++++ ggml/src/ggml-qnn.cpp | 4 +--- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/ggml/include/ggml-qnn.h b/ggml/include/ggml-qnn.h index 026c6ddf06672..2433af1668408 100644 --- a/ggml/include/ggml-qnn.h +++ b/ggml/include/ggml-qnn.h @@ -18,8 +18,6 @@ enum QNNBackend { // QNN and original GGML }; -GGML_API int ggml_backend_qnn_reg_devices(void); - /** * * @param device 0: QNN_BACKEND_CPU 1: QNN_BACKEND_GPU 2: diff --git a/ggml/src/ggml-backend.c b/ggml/src/ggml-backend.c index dbbaa3941febe..80272855de860 100644 --- a/ggml/src/ggml-backend.c +++ b/ggml/src/ggml-backend.c @@ -445,6 +445,11 @@ GGML_CALL static void ggml_backend_registry_init(void) { extern GGML_CALL void ggml_backend_kompute_reg_devices(void); ggml_backend_kompute_reg_devices(); #endif + +#ifdef GGML_USE_QNN + extern GGML_CALL void ggml_backend_qnn_reg_devices(void); + ggml_backend_qnn_reg_devices(); +#endif } GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) { diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index 13998a73ef7aa..f8031bb0fd516 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -534,13 +534,11 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char *qnn_lib_path) { return qnn_backend; } -int ggml_backend_qnn_reg_devices() { +extern "C" GGML_CALL void ggml_backend_qnn_reg_devices() { for (size_t idx = 0; idx < GGML_QNN_MAX_DEVICES; idx++) { char name[GGML_MAX_NAME]; ggml_backend_qnn_get_device_description(idx, name, GGML_MAX_NAME); ggml_backend_register(name, ggml_backend_qnn_reg_init, ggml_backend_qnn_buffer_type(idx), 
(void *)(intptr_t)idx); } - - return GGML_QNN_MAX_DEVICES; } From 2502b57203c69916eb7fde14ed46a3b2199ebbcc Mon Sep 17 00:00:00 2001 From: hongruichen Date: Wed, 17 Jul 2024 21:39:25 +0800 Subject: [PATCH 078/143] fix warnings --- ggml/src/ggml-qnn.cpp | 18 ++++++++++++------ ggml/src/ggml-qnn/graph.hpp | 3 +-- ggml/src/ggml-qnn/logger.cpp | 9 ++++++--- ggml/src/ggml-qnn/qnn-lib.hpp | 24 +++++++++++++++--------- ggml/src/ggml-qnn/utils.hpp | 2 +- 5 files changed, 35 insertions(+), 21 deletions(-) diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index f8031bb0fd516..8ba258d632f38 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -173,7 +173,7 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context *ctx, const struct g return true; } -bool ggml_qnn_compute_forward(ggml_backend_qnn_context *ctx, struct ggml_tensor *tensor) { +static bool ggml_qnn_compute_forward(ggml_backend_qnn_context *ctx, struct ggml_tensor *tensor) { auto unary_op = qnn::ggml_qnn_unary_op_array()[tensor->op]; if (unary_op) { return unary_op(ctx, tensor->src[0], tensor); @@ -260,7 +260,10 @@ static ggml_backend_buffer_i ggml_backend_qnn_buffer_interface = { /* .reset = */ nullptr, }; -GGML_CALL static const char *ggml_backend_qnn_buffer_type_name(ggml_backend_buffer_type_t buft) { return "QNN"; } +GGML_CALL static const char *ggml_backend_qnn_buffer_type_name(ggml_backend_buffer_type_t buft) { + GGML_UNUSED(buft); + return "QNN"; +} GGML_CALL static ggml_backend_buffer_t ggml_backend_qnn_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { @@ -291,7 +294,10 @@ GGML_CALL static bool ggml_backend_qnn_buffer_is_host(ggml_backend_buffer_type_t return true; } -GGML_CALL static const char *ggml_backend_qnn_name(ggml_backend_t backend) { return "QNN"; } +GGML_CALL static const char *ggml_backend_qnn_name(ggml_backend_t backend) { + GGML_UNUSED(backend); + return "QNN"; +} GGML_CALL static void ggml_backend_qnn_free(ggml_backend_t backend) { QNN_LOG_INFO("enter %s", __func__); @@ -408,8 +414,6 @@ void ggml_backend_qnn_set_n_threads(ggml_backend_t backend, int n_threads) { ctx->threads = n_threads; } -const char *ggml_backend_qnn_get_name(ggml_backend_t backend) { return backend->iface.get_name(backend); } - int ggml_backend_qnn_get_device_count() { return GGML_QNN_MAX_DEVICES; } void ggml_backend_qnn_get_device_description(size_t dev_num, char *description, size_t description_size) { @@ -534,7 +538,9 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char *qnn_lib_path) { return qnn_backend; } -extern "C" GGML_CALL void ggml_backend_qnn_reg_devices() { +extern "C" GGML_CALL void ggml_backend_qnn_reg_devices(); + +GGML_CALL void ggml_backend_qnn_reg_devices() { for (size_t idx = 0; idx < GGML_QNN_MAX_DEVICES; idx++) { char name[GGML_MAX_NAME]; ggml_backend_qnn_get_device_description(idx, name, GGML_MAX_NAME); diff --git a/ggml/src/ggml-qnn/graph.hpp b/ggml/src/ggml-qnn/graph.hpp index e4900906ce3e9..9621ad1b4dd68 100644 --- a/ggml/src/ggml-qnn/graph.hpp +++ b/ggml/src/ggml-qnn/graph.hpp @@ -101,10 +101,9 @@ class ggml_qnn_graph { _tensor_inputs = tensor_inputs; _tensor_outputs = tensor_outputs; - Qnn_Param_t qnn_params[] = {}; Qnn_OpConfig_t op_config = { .version = QNN_OPCONFIG_VERSION_1, .v1 = { _graph_name.c_str(), QNN_OP_PACKAGE_NAME_QTI_AISW, op_name.c_str(), 0, - qnn_params, (uint32_t)_tensor_inputs.size(), _tensor_inputs.data(), + nullptr, (uint32_t)_tensor_inputs.size(), _tensor_inputs.data(), (uint32_t)_tensor_outputs.size(), _tensor_outputs.data() } }; auto 
error = _qnn_interface->qnn_graph_add_node(_graph_handle, op_config); if (error != QNN_SUCCESS) { diff --git a/ggml/src/ggml-qnn/logger.cpp b/ggml/src/ggml-qnn/logger.cpp index 8b74b90edf476..fc37161edba17 100644 --- a/ggml/src/ggml-qnn/logger.cpp +++ b/ggml/src/ggml-qnn/logger.cpp @@ -9,7 +9,8 @@ #include #endif -void qnn::internal_log(ggml_log_level level, const char *file, const char *func, int line, const char *format, ...) { +void qnn::internal_log(ggml_log_level level, const char * /*file*/, const char *func, int line, const char *format, + ...) { static std::mutex qnn_internal_log_mutex; static char s_qnn_internal_log_buf[QNN_LOGBUF_LEN]; @@ -32,8 +33,8 @@ void qnn::internal_log(ggml_log_level level, const char *file, const char *func, } } -void qnn::sdk_logcallback(const char *fmt, QnnLog_Level_t level, uint64_t timestamp, va_list argp) { #if ENABLE_QNNSDK_LOG +void qnn::sdk_logcallback(const char *fmt, QnnLog_Level_t level, uint64_t timestamp, va_list argp) { static std::mutex log_mutex; static unsigned char s_ggml_qnn_logbuf[QNN_LOGBUF_LEN]; @@ -67,5 +68,7 @@ void qnn::sdk_logcallback(const char *fmt, QnnLog_Level_t level, uint64_t timest vsnprintf(reinterpret_cast(s_ggml_qnn_logbuf), QNN_LOGBUF_LEN, fmt, argp); QNN_LOG_INFO("%8.1fms [%-7s] %s", ms, log_level_desc, s_ggml_qnn_logbuf); } -#endif } +#else +void qnn::sdk_logcallback(const char *, QnnLog_Level_t, uint64_t, va_list) {} +#endif diff --git a/ggml/src/ggml-qnn/qnn-lib.hpp b/ggml/src/ggml-qnn/qnn-lib.hpp index a676f989566e5..a46901695aa6d 100644 --- a/ggml/src/ggml-qnn/qnn-lib.hpp +++ b/ggml/src/ggml-qnn/qnn-lib.hpp @@ -47,6 +47,10 @@ Fn dl_sym_typed(dl_handler_t handle, const std::string &function_name) { // ref:https://github.com/pytorch/executorch/tree/main/backends/qualcomm // ================================================================================================= +// TODO: fix this for other compilers +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wextra-semi" + class qnn_system_interface { #define DEFINE_SHIM_FUNCTION_SYS_INTERFACE(F, pointer_name) \ @@ -176,12 +180,14 @@ class qnn_interface { const QnnInterface_t _qnn_interface = {}; }; +#pragma GCC diagnostic pop + class qnn_instance { public: using BackendIdType = decltype(QnnInterface_t{}.backendId); explicit qnn_instance(const std::string &lib_path, const std::string &backend_name, const std::string &model_name) : - _lib_path(std::move(lib_path)), _backend_name(std::move(backend_name)), _model_name(std::move(model_name)) {}; + _lib_path(std::move(lib_path)), _backend_name(std::move(backend_name)), _model_name(std::move(model_name)) {} ~qnn_instance() {} @@ -250,7 +256,7 @@ class qnn_instance { QNN_LOG_INFO("device counts %d", p_info->v1.numHwDevices); QnnDevice_HardwareDeviceInfo_t *infos = p_info->v1.hwDevices; QnnHtpDevice_OnChipDeviceInfoExtension_t chipinfo = {}; - for (int i = 0; i < p_info->v1.numHwDevices; i++) { + for (uint32_t i = 0; i < p_info->v1.numHwDevices; i++) { QNN_LOG_INFO("deviceID:%d, deviceType:%d, numCores %d", infos[i].v1.deviceId, infos[i].v1.deviceType, infos[i].v1.numCores); QnnDevice_DeviceInfoExtension_t devinfo = infos[i].v1.deviceInfoExtension; @@ -464,17 +470,17 @@ class qnn_instance { return _qnn_interface; } - const Qnn_LogHandle_t get_qnn_log_handle() { return _qnn_log_handle; } + Qnn_LogHandle_t get_qnn_log_handle() { return _qnn_log_handle; } - const Qnn_ProfileHandle_t get_qnn_profile_handle() { return _qnn_profile_handle; } + Qnn_ProfileHandle_t get_qnn_profile_handle() { return 
_qnn_profile_handle; } - const Qnn_DeviceHandle_t get_qnn_device_handle() { return _qnn_device_handle; } + Qnn_DeviceHandle_t get_qnn_device_handle() { return _qnn_device_handle; } - const Qnn_BackendHandle_t get_qnn_backend_handle() { return _qnn_backend_handle; } + Qnn_BackendHandle_t get_qnn_backend_handle() { return _qnn_backend_handle; } - const Qnn_ContextHandle_t get_qnn_context_handle() { return _qnn_context_handle; } + Qnn_ContextHandle_t get_qnn_context_handle() { return _qnn_context_handle; } - const Qnn_GraphHandle_t get_qnn_graph_handle() { return _qnn_graph_handle; } + Qnn_GraphHandle_t get_qnn_graph_handle() { return _qnn_graph_handle; } int init_htp_perfinfra() { QnnDevice_Infrastructure_t device_infra = nullptr; @@ -779,7 +785,7 @@ class qnn_instance { return 0; } - int load_backend(std::string &lib_path, const QnnSaver_Config_t **saver_config) { + int load_backend(std::string &lib_path, const QnnSaver_Config_t ** /*saver_config*/) { Qnn_ErrorHandle_t error = QNN_SUCCESS; QNN_LOG_DEBUG("lib_path:%s\n", lib_path.c_str()); diff --git a/ggml/src/ggml-qnn/utils.hpp b/ggml/src/ggml-qnn/utils.hpp index d00673e9a47ce..e8f1bf71e88be 100644 --- a/ggml/src/ggml-qnn/utils.hpp +++ b/ggml/src/ggml-qnn/utils.hpp @@ -202,7 +202,7 @@ class qnn_perf { #else class qnn_perf { public: - qnn_perf(const std::string &perf_name) {} + qnn_perf(const std::string &) {} ~qnn_perf() { info(); } qnn_perf() = delete; qnn_perf(const qnn_perf &) = delete; From b7d781ec81eb2bdeedabdf540fdbec37cfb02e90 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Wed, 17 Jul 2024 23:08:16 +0800 Subject: [PATCH 079/143] remove qnn dedicated unit tests since we're now using the `test-backend-ops` to cross-validate backend ops --- tests/ggml-qnn/CMakeLists.txt | 68 --- tests/ggml-qnn/ggml-qnn-ut-build-run.sh | 207 --------- tests/ggml-qnn/ggml-qnn-ut.cpp | 544 ------------------------ 3 files changed, 819 deletions(-) delete mode 100644 tests/ggml-qnn/CMakeLists.txt delete mode 100755 tests/ggml-qnn/ggml-qnn-ut-build-run.sh delete mode 100644 tests/ggml-qnn/ggml-qnn-ut.cpp diff --git a/tests/ggml-qnn/CMakeLists.txt b/tests/ggml-qnn/CMakeLists.txt deleted file mode 100644 index f9678d3d88f00..0000000000000 --- a/tests/ggml-qnn/CMakeLists.txt +++ /dev/null @@ -1,68 +0,0 @@ -cmake_minimum_required(VERSION 3.22.1) -project(ggml-qnn-test) - -set(CMAKE_VERBOSE_MAKEFILE on) -set(CMAKE_CXX_STANDARD 17) -set(CMAKE_CXX_STANDARD_REQUIRED ON) -set(CMAKE_POSITION_INDEPENDENT_CODE ON) - -#set to OFF if target Android phone is not equipped with Qualcomm Snapdragon 8 Gen 3 -set(TARGET_SNAPDRAGON_8_GEN3 ON) - -set(QNN_INC_PATH ${QNN_SDK_PATH}/include/QNN) -set(QNN_LIB_PATH ${QNN_SDK_PATH}/lib/aarch64-android) - -include_directories(${QNN_INC_PATH}) -include_directories(../../ggml/include) # ggml.h, ggml-qnn.h - -set(SOURCE_FILES - ../../ggml/src/ggml.c - ../../ggml/src/ggml-alloc.c - ../../ggml/src/ggml-backend.c - ../../ggml/src/ggml-quants.c - ../../ggml/src/ggml-qnn/qnn-lib.cpp - ../../ggml/src/ggml-qnn/logger.cpp - ../../ggml/src/ggml-qnn/utils.cpp - ../../ggml/src/ggml-qnn/backend-ops.cpp - ../../ggml/src/ggml-qnn.cpp - ggml-qnn-ut.cpp -) - - -message("QNN_SDK_PATH : ${QNN_SDK_PATH}") -message("QNN_INC_PATH : ${QNN_INC_PATH}") -message("QNN_LIB_PATH : ${QNN_LIB_PATH}") - -add_definitions(-D__ARM_NEON) -add_definitions(-DGGML_USE_QNN) - -if(CMAKE_BUILD_TYPE STREQUAL "Release") - add_definitions(-DNDEBUG) - add_definitions(-O3) -else() - add_definitions(-O3) -endif() - -if (TARGET_SNAPDRAGON_8_GEN3) - # the below build optimization only 
verified and works well on Qualcomm SM8650-AB Snapdragon 8 Gen 3 - add_definitions(-march=armv8.7-a) - add_definitions(-mcpu=cortex-x1) - add_definitions(-mtune=cortex-x1) -else() - # the below build optimization might be works well on ALL Android phone equipped with Qualcomm mainstream mobile SoC - add_definitions(-mcpu=cortex-a72) -endif() - -add_compile_options("-Wall" "-Wno-sign-compare") - -find_library(LOG_LIB log) - -link_libraries(${LOG_LIB} android) - -add_executable(${TARGET_NAME} - ${SOURCE_FILES} -) - -target_include_directories(${TARGET_NAME} PRIVATE - ../../ggml/src/ggml-qnn/ -) diff --git a/tests/ggml-qnn/ggml-qnn-ut-build-run.sh b/tests/ggml-qnn/ggml-qnn-ut-build-run.sh deleted file mode 100755 index e12b987b8d69d..0000000000000 --- a/tests/ggml-qnn/ggml-qnn-ut-build-run.sh +++ /dev/null @@ -1,207 +0,0 @@ -#!/bin/bash - -set -e - -#https://qpm.qualcomm.com/#/main/tools/details/qualcomm_ai_engine_direct -#https://developer.qualcomm.com/software/hexagon-dsp-sdk/tools -#QNN SDK released on 20240531 -QNN_SDK_PATH=/opt/qcom/aistack/qairt/2.23.0.240531/ - -ANDROID_NDK=`pwd`/android-ndk-r26c -ANDROID_PLATFORM=android-34 - -GGML_QNN_UT=ggml-qnn-ut -REMOTE_PATH=/data/local/tmp/ -BUILDTYPE=Release -BUILDTYPE=Debug - - -function dump_vars() -{ - echo -e "ANDROID_NDK: ${ANDROID_NDK}" - echo -e "QNN_SDK_PATH: ${QNN_SDK_PATH}" -} - - -function show_pwd() -{ - echo -e "current working path:$(pwd)\n" -} - - -function check_qnn_sdk() -{ - if [ ! -d ${QNN_SDK_PATH} ]; then - echo -e "QNN_SDK_PATH ${QNN_SDK_PATH} not exist, pls check or download it from https://qpm.qualcomm.com/#/main/tools/details/qualcomm_ai_engine_direct...\n" - exit 1 - fi -} - - -function check_and_download_ndk() -{ - is_android_ndk_exist=1 - - if [ ! -d ${ANDROID_NDK} ]; then - is_android_ndk_exist=0 - fi - - if [ ! -f ${ANDROID_NDK}/build/cmake/android.toolchain.cmake ]; then - is_android_ndk_exist=0 - fi - - if [ ${is_android_ndk_exist} -eq 0 ]; then - - if [ ! -f android-ndk-r26c-linux.zip ]; then - wget --no-config --quiet --show-progress -O android-ndk-r26c-linux.zip https://dl.google.com/android/repository/android-ndk-r26c-linux.zip - fi - - unzip android-ndk-r26c-linux.zip - - if [ $? -ne 0 ]; then - printf "failed to download android ndk to %s \n" "${ANDROID_NDK}" - exit 1 - fi - - printf "android ndk saved to ${ANDROID_NDK} \n\n" - else - printf "android ndk already exist:${ANDROID_NDK} \n\n" - fi -} - - -function build_arm64 -{ - cmake -H. 
-B./out/arm64-v8a -DTARGET_NAME=${GGML_QNN_UT} -DCMAKE_BUILD_TYPE=${BUILDTYPE} -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=${ANDROID_PLATFORM} -DANDROID_NDK=${ANDROID_NDK} -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake -DQNN_SDK_PATH=${QNN_SDK_PATH} - - cd ./out/arm64-v8a - make - - ls -lah ${GGML_QNN_UT} - /bin/cp ${GGML_QNN_UT} ../../ - cd - -} - - -function remove_temp_dir() -{ - if [ -d out ]; then - echo "remove out directory in `pwd`" - rm -rf out - fi -} - - -function update_qnn_libs() -{ - check_qnn_sdk - - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnSystem.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnCpu.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnGpu.so ${REMOTE_PATH}/ - - #the QNN NPU(aka HTP) backend only verified on Qualcomm Snapdragon 8 Gen 3 equipped Android phone - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtp.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpNetRunExtensions.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpPrepare.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpV75Stub.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/hexagon-v75/unsigned/libQnnHtpV75Skel.so ${REMOTE_PATH}/ -} - - -function check_qnn_libs() -{ - #reuse the cached qnn libs in Android phone - adb shell ls ${REMOTE_PATH}/libQnnCpu.so - if [ $? -eq 0 ]; then - printf "QNN libs already exist on Android phone\n" - else - update_qnn_libs - fi -} - - -function build_ggml_qnn_ut() -{ - show_pwd - check_and_download_ndk - check_qnn_sdk - dump_vars - remove_temp_dir - build_arm64 -} - - -function run_ggml_qnn_ut() -{ - check_qnn_libs - - #upload the latest ggml_qnn_test - adb push ${GGML_QNN_UT} ${REMOTE_PATH} - adb shell chmod +x ${REMOTE_PATH}/${GGML_QNN_UT} - - case "$ggmlop" in - GGML_OP_ADD) - adb shell ${REMOTE_PATH}/${GGML_QNN_UT} -t GGML_OP_ADD -b $qnnbackend - ;; - - GGML_OP_MUL_MAT) - adb shell ${REMOTE_PATH}/${GGML_QNN_UT} -t GGML_OP_MUL_MAT -b $qnnbackend - ;; - - *) - printf " \n$arg not supported currently\n" - show_usage - exit 1 - ;; - esac -} - - -function show_usage() -{ - echo "Usage:" - echo " $0 build (build Android command line UT program)" - echo " $0 updateqnnlibs (upload the latest QNN libs to Android phone)" - echo " $0 GGML_OP_ADD 0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU) / 3(ggml)" - echo " $0 GGML_OP_MUL_MAT 0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU) / 3(ggml)" - echo -e "\n\n\n" -} - - -unset ggmlop -unset qnnbackend - -check_qnn_sdk - -if [ $# == 0 ]; then - show_usage - exit 1 -elif [ $# == 1 ]; then - if [ "$1" == "-h" ]; then - #avoid upload command line program to Android phone in this scenario - show_usage - exit 1 - elif [ "$1" == "help" ]; then - #avoid upload command line program to Android phone in this scenario - show_usage - exit 1 - elif [ "$1" == "build" ]; then - build_ggml_qnn_ut - exit 0 - elif [ "$1" == "updateqnnlibs" ]; then - update_qnn_libs - exit 0 - else - ggmlop=$1 - qnnbackend=0 - run_ggml_qnn_ut - fi -elif [ $# == 2 ]; then - ggmlop=$1 - qnnbackend=$2 - run_ggml_qnn_ut -else - show_usage - exit 1 -fi diff --git a/tests/ggml-qnn/ggml-qnn-ut.cpp b/tests/ggml-qnn/ggml-qnn-ut.cpp deleted file mode 100644 index 71cb86a71bdf1..0000000000000 --- a/tests/ggml-qnn/ggml-qnn-ut.cpp +++ /dev/null @@ -1,544 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include 
-#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "ggml.h" - -#include "ggml-alloc.h" -#include "ggml-backend.h" -#include "ggml-qnn.h" - -#include "logger.hpp" - -static const char *get_qnn_backend_name(int n_backend_type) { - switch (n_backend_type) { - case QNN_BACKEND_CPU: - return "QNN-CPU"; - case QNN_BACKEND_GPU: - return "QNN-GPU"; - case QNN_BACKEND_NPU: - return "QNN-NPU"; - case QNN_BACKEND_GGML: - return "ggml"; - default: - return "unknown"; - } -} - -static bool ggml_graph_compute_helper(struct ggml_backend *backend, struct ggml_cgraph *graph, - std::vector &buf, int n_threads, ggml_abort_callback abort_callback, - void *abort_callback_data) { - struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); - - plan.abort_callback = abort_callback; - plan.abort_callback_data = abort_callback_data; - - if (plan.work_size > 0) { - buf.resize(plan.work_size); - plan.work_data = buf.data(); - } - - if (ggml_backend_is_cpu(backend)) { - ggml_backend_cpu_set_n_threads(backend, n_threads); - } - -#ifdef GGML_USE_QNN - if (ggml_backend_is_qnn(backend)) { - ggml_backend_qnn_set_n_threads(backend, n_threads); - } -#endif - - if (nullptr != backend) - return ggml_backend_graph_compute(backend, graph) == GGML_STATUS_SUCCESS; - else - return ggml_graph_compute(graph, &plan); -} - -#define QK8_0 32 - -typedef struct { - uint16_t d; // delta - int8_t qs[QK8_0]; // quants -} block_q8_0; - -static inline float ggml_compute_fp16_to_fp32(uint16_t h) { - __fp16 tmp; - memcpy(&tmp, &h, sizeof(uint16_t)); - return (float)tmp; -} - -#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) - -#define TENSOR_DUMP(tensor) tensor_dump(tensor, #tensor) - -static void tensor_dump(const ggml_tensor *tensor, const char *name) { - QNN_LOG_INFO("dump ggml tensor %s(%s): type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 - ", nb = (%5zi, %5zi, %5zi)\n", - name, tensor->name, tensor->type, ggml_type_name(tensor->type), tensor->ne[0], tensor->ne[1], - tensor->ne[2], tensor->nb[0], tensor->nb[1], tensor->nb[2]); - - float value = 0; - std::ostringstream tmposs; - if (nullptr == tensor) { - QNN_LOG_WARN("tensor is null"); - return; - } - - if (tensor->type == GGML_TYPE_I8) { - for (int h = 0; h < tensor->ne[3]; h++) { - for (int i = 0; i < tensor->ne[2]; i++) { - for (int j = 0; j < tensor->ne[1]; j++) { - for (int k = 0; k < tensor->ne[0]; k++) { - value = ((int8_t *)tensor->data)[h * tensor->ne[2] + i * tensor->ne[1] + j * tensor->ne[0] + k]; - tmposs << std::setw(8) << std::fixed << std::setprecision(2) << value << " "; - } - tmposs << "\n"; - } - } - } - if (strlen(tmposs.str().c_str()) <= (QNN_LOGBUF_LEN - 96)) { - QNN_LOG_INFO("\n%s\n", tmposs.str().c_str()); - tmposs.clear(); - tmposs.str(""); - } - } - - if (tensor->type == GGML_TYPE_F32) { - for (int h = 0; h < tensor->ne[3]; h++) { - for (int i = 0; i < tensor->ne[2]; i++) { - for (int j = 0; j < tensor->ne[1]; j++) { - for (int k = 0; k < tensor->ne[0]; k++) { - value = ((float *)tensor->data)[h * tensor->ne[2] + i * tensor->ne[1] + j * tensor->ne[0] + k]; - tmposs << std::setw(8) << std::fixed << std::setprecision(2) << value << " "; - } - tmposs << "\n"; - } - } - } - if (strlen(tmposs.str().c_str()) <= (QNN_LOGBUF_LEN - 96)) { - QNN_LOG_INFO("\n%s\n", tmposs.str().c_str()); - tmposs.clear(); - tmposs.str(""); - } - } - - if (tensor->type == GGML_TYPE_F16) { - for (int h = 0; h < 
tensor->ne[3]; h++) { - for (int i = 0; i < tensor->ne[2]; i++) { - for (int j = 0; j < tensor->ne[1]; j++) { - for (int k = 0; k < tensor->ne[0]; k++) { - unsigned short tmpvalue = - ((unsigned short *) - tensor->data)[h * tensor->ne[2] + i * tensor->ne[1] + j * tensor->ne[0] + k]; - value = GGML_FP16_TO_FP32(tmpvalue); - tmposs << std::setw(8) << std::fixed << std::setprecision(2) << value << " "; - } - tmposs << "\n"; - } - } - } - if (strlen(tmposs.str().c_str()) <= (QNN_LOGBUF_LEN - 96)) { - QNN_LOG_INFO("\n%s\n", tmposs.str().c_str()); - tmposs.clear(); - tmposs.str(""); - } - } - - if (tensor->type == GGML_TYPE_Q8_0) { - block_q8_0 *tmp = ((block_q8_0 *)tensor->data); - for (int j = 0; j < tensor->ne[1]; j++) { - int n = tensor->ne[0] / QK8_0; // blocks per row - for (int z = 0; z < n; z++) { - const float d = GGML_FP16_TO_FP32(tmp[j * n + z].d); - for (int k = 0; k < QK8_0; k++) { - value = tmp[j * n + z].qs[k] * d; - tmposs << std::setw(8) << std::fixed << std::setprecision(2) << value << " "; - } - } - tmposs << "\n"; - } - if (strlen(tmposs.str().c_str()) <= (QNN_LOGBUF_LEN - 96)) { - QNN_LOG_INFO("\n%s\n", tmposs.str().c_str()); - tmposs.clear(); - tmposs.str(""); - } - } -} - -static uint32_t get_tensor_rank(const ggml_tensor *tensor) { - uint32_t rank = 0; - for (int i = 0; i < GGML_MAX_DIMS; i++) { - if ((0 != tensor->ne[i]) && (1 != tensor->ne[i])) { - rank++; - } - } - return rank; -} - -static uint32_t get_tensor_data_size(const ggml_tensor *tensor) { -#if ENABLE_QNNSDK_LOG - size_t data_size = ggml_row_size(tensor->type, tensor->ne[0]); - size_t n_dims = get_tensor_rank(tensor); - for (size_t i = 1; i < n_dims; i++) { - data_size *= tensor->ne[i]; - } - - QNN_LOG_DEBUG("get_tensor_data_size %d", data_size); - QNN_LOG_DEBUG("ggml_nbytes(tensor) %d", ggml_nbytes(tensor)); -#endif - - return ggml_nbytes(tensor); -} - -// ref: https://github.com/ggerganov/llama.cpp/blob/master/tests/test-backend-ops.cpp#L20 -static void init_tensor_uniform(ggml_tensor *tensor, float min = -1.0f, float max = 1.0f) { - size_t size = ggml_nelements(tensor); - std::vector data(size); - for (size_t i = 0; i < size; i++) { - data[i] = i + 1; - } - - if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_I32) { -#ifdef GGML_USE_QNN - memcpy((char *)tensor->data, data.data(), size * sizeof(float)); -#else - ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float)); -#endif - } else if (ggml_is_quantized(tensor->type) || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_BF16) { - GGML_ASSERT(size % ggml_blck_size(tensor->type) == 0); - std::vector dataq(ggml_row_size(tensor->type, size)); - std::vector imatrix(tensor->ne[0], 1.0f); // dummy importance matrix - const float *im = imatrix.data(); - if (!ggml_quantize_requires_imatrix(tensor->type)) { - // when the imatrix is optional, we want to test both quantization with and without imatrix - // use one of the random numbers to decide - if (data[0] > 0.5f * (min + max)) { - im = nullptr; - } - } - ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size / tensor->ne[0], tensor->ne[0], im); - GGML_ASSERT(ggml_validate_row_data(tensor->type, dataq.data(), dataq.size())); -#ifdef GGML_USE_QNN - memcpy((char *)tensor->data, dataq.data(), dataq.size()); -#else - ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size()); -#endif - } else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) { - // This is going to create some weird integers though. 
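// [editor's note] Illustrative sketch, not part of the original test: the Q8_0
// branch of tensor_dump() above dequantizes blocks inline; pulled out as a helper
// (reusing the block_q8_0 / QK8_0 / GGML_FP16_TO_FP32 definitions earlier in this
// file, helper name made up here) it reads as follows. Each block carries an fp16
// scale d plus QK8_0 = 32 signed 8-bit quants, and element k dequantizes to
// d * qs[k] (e.g. d = 0.05, qs[k] = -12 gives -0.6).
static void dequantize_row_q8_0_ref(const block_q8_0 *blocks, float *out, int n_blocks) {
    for (int z = 0; z < n_blocks; z++) {
        const float d = GGML_FP16_TO_FP32(blocks[z].d); // scale shared by the whole block
        for (int k = 0; k < QK8_0; k++) {
            out[z * QK8_0 + k] = d * blocks[z].qs[k];   // int8 quant times scale
        }
    }
}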
-#ifdef GGML_USE_QNN - memcpy((char *)tensor->data, data.data(), ggml_nbytes(tensor)); -#else - ggml_backend_tensor_set(tensor, data.data(), 0, ggml_nbytes(tensor)); -#endif - } else { - GGML_ASSERT(false); - } -} - -// ref: https://github.com/ggerganov/llama.cpp/blob/master/tests/test-backend-ops.cpp#L310 -static void initialize_tensors(ggml_context *ctx) { - for (ggml_tensor *t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { - init_tensor_uniform(t); - } -} - -static void show_usage() { - printf( - " " - "\nUsage: test_qnn_ops [options]\n" - "\n" - "Options:\n" - " -t GGML_OP_ADD / GGML_OP_MULMAT\n" - " -b 0(QNN_CPU) 1(QNN_GPU) 2(QNN_NPU) 3(ggml)\n" - " ?/h print usage infomation\n\n"); -} - -typedef ggml_tensor *(*ggml_op_unary_t)(ggml_context *ctx, ggml_tensor *a); - -typedef ggml_tensor *(*ggml_op_binary_t)(ggml_context *ctx, ggml_tensor *a, ggml_tensor *b); - -static constexpr const ggml_op_unary_t kUnaryOps[] = { - nullptr, // GGML_OP_NONE - nullptr, // GGML_OP_DUP - nullptr, // GGML_OP_ADD - nullptr, // GGML_OP_ADD1 - nullptr, // GGML_OP_ACC - nullptr, // GGML_OP_SUB - nullptr, // GGML_OP_MUL - nullptr, // GGML_OP_DIV - nullptr, // GGML_OP_SQR - ggml_sqrt, // GGML_OP_SQRT - ggml_log, // GGML_OP_LOG - nullptr, // GGML_OP_SUM - nullptr, // GGML_OP_SUM_ROWS - nullptr, // GGML_OP_MEAN - nullptr, // GGML_OP_ARGMAX - nullptr, // GGML_OP_REPEAT - nullptr, // GGML_OP_REPEAT_BACK - nullptr, // GGML_OP_CONCAT - nullptr, // GGML_OP_SILU_BACK - nullptr, // GGML_OP_NORM - nullptr, // GGML_OP_RMS_NORM - nullptr, // GGML_OP_RMS_NORM_BACK - nullptr, // GGML_OP_GROUP_NORM - nullptr, // GGML_OP_MUL_MAT -}; - -static constexpr const ggml_op_binary_t kBinaryOps[] = { - nullptr, // GGML_OP_NONE - nullptr, // GGML_OP_DUP - ggml_add, // GGML_OP_ADD - nullptr, // GGML_OP_ADD1 - nullptr, // GGML_OP_ACC - ggml_sub, // GGML_OP_SUB - ggml_mul, // GGML_OP_MUL - ggml_div, // GGML_OP_DIV - nullptr, // GGML_OP_SQR - nullptr, // GGML_OP_SQRT - nullptr, // GGML_OP_LOG - nullptr, // GGML_OP_SUM - nullptr, // GGML_OP_SUM_ROWS - nullptr, // GGML_OP_MEAN - nullptr, // GGML_OP_ARGMAX - nullptr, // GGML_OP_REPEAT - nullptr, // GGML_OP_REPEAT_BACK - nullptr, // GGML_OP_CONCAT - nullptr, // GGML_OP_SILU_BACK - nullptr, // GGML_OP_NORM - nullptr, // GGML_OP_RMS_NORM - nullptr, // GGML_OP_RMS_NORM_BACK - nullptr, // GGML_OP_GROUP_NORM - ggml_mul_mat, // GGML_OP_MUL_MAT -}; - -static_assert(kBinaryOps[GGML_OP_MUL_MAT] == ggml_mul_mat, "ggml_mul_mat at wrong index, check kBinaryOps"); - -static void qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type, ggml_type qtype, - std::vector &results) { - int64_t n_begin_time = 0LL; - int64_t n_end_time = 0LL; - int64_t n_duration = 0LL; - size_t ctx_size = 0; - int sizey = 4; - int sizex = 4; - - struct ggml_context *ctx = nullptr; - struct ggml_cgraph *gf = nullptr; - struct ggml_tensor *src0 = nullptr; - struct ggml_tensor *src1 = nullptr; - struct ggml_tensor *dst = nullptr; - ggml_backend_t backend = nullptr; - ggml_backend_buffer_t buffer = nullptr; - - std::vector work_buffer; - QNN_LOG_DEBUG("enter qnn_ggml_op\n"); - QNN_LOG_DEBUG("ggml op:%d(%s)\n", n_ggml_op_type, ggml_op_name((enum ggml_op)n_ggml_op_type)); - - n_begin_time = ggml_time_us(); - - ctx_size += 1024 * 1024 * 32; - QNN_LOG_DEBUG("Allocating Memory of size %zi bytes, %zi MB\n", ctx_size, (ctx_size / 1024 / 1024)); - - struct ggml_init_params params = { /*.mem_size =*/ctx_size, - /*.mem_buffer =*/NULL, - /* no_alloc =*/0 }; - - if (n_backend_type != QNN_BACKEND_GGML) 
{ - params.no_alloc = true; - backend = ggml_backend_qnn_init(n_backend_type, "/data/local/tmp/"); - if (nullptr == backend) { - QNN_LOG_ERROR("create qnn backend %d(%s) failed\n", n_backend_type, get_qnn_backend_name(n_backend_type)); - return; - } - } - - ctx = ggml_init(params); - if (!ctx) { - QNN_LOG_ERROR("%s: ggml_init() failed\n"); - return; - } - - QNN_LOG_DEBUG("creating new tensors\n"); - QNN_LOG_DEBUG("ggml_blck_size(%s) %d\n", ggml_type_name(qtype), ggml_blck_size(qtype)); - QNN_LOG_DEBUG("ggml_type_size(%s) %d\n", ggml_type_name(qtype), ggml_type_size(qtype)); - if (ggml_is_quantized(qtype)) { - sizex = ggml_blck_size(qtype); - - if (n_ggml_op_type == GGML_OP_MUL_MAT) { - sizex = ggml_blck_size(qtype) * 2; - } - } - QNN_LOG_DEBUG("sizex: %d\n", sizex); - QNN_LOG_DEBUG("sizey: %d\n", sizey); - - src0 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey); - src1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey); - - ggml_set_input(src0); - ggml_set_input(src1); - - auto unary_op = kUnaryOps[n_ggml_op_type]; - auto binary_op = kBinaryOps[n_ggml_op_type]; - if (unary_op) { - dst = unary_op(ctx, src0); - } else if (binary_op) { - dst = binary_op(ctx, src0, src1); - } else { - QNN_LOG_WARN("ggml op %d(%s) not supported", n_ggml_op_type, ggml_op_name((enum ggml_op)n_ggml_op_type)); - ggml_free(ctx); - ggml_backend_free(backend); - return; - } - - ggml_set_output(dst); -#ifdef GGML_USE_QNN - if (n_backend_type != QNN_BACKEND_GGML) { - buffer = ggml_backend_alloc_ctx_tensors(ctx, backend); - if (!buffer) { - QNN_LOG_ERROR("%s: failed to allocate backend buffer\n", __func__); - ggml_free(ctx); - ggml_backend_free(backend); - return; - } - } -#endif - - QNN_LOG_DEBUG("creating compute graph\n"); - gf = ggml_new_graph(ctx); - ggml_build_forward_expand(gf, dst); - - initialize_tensors(ctx); - - ggml_graph_compute_helper(backend, gf, work_buffer, num_threads, nullptr, nullptr); - - if (get_tensor_data_size(dst) < (32 * 32)) { - QNN_LOG_DEBUG("dump tensors:\n"); - TENSOR_DUMP(src0); - TENSOR_DUMP(src1); - TENSOR_DUMP(dst); - } else { - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 - ", nb = (%5zi, %5zi, %5zi)\n", - src0->name, src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], - src0->nb[0], src0->nb[1], src0->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 - ", nb = (%5zi, %5zi, %5zi)\n", - src1->name, src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], - src1->nb[0], src1->nb[1], src1->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 - ", nb = (%5zi, %5zi, %5zi)\n", - dst->name, dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], - dst->nb[1], dst->nb[2]); - } - - results.resize(ggml_nbytes(dst)); - memcpy(results.data(), ggml_get_data(dst), ggml_nbytes(dst)); - ggml_free(ctx); - ggml_backend_buffer_free(buffer); - ggml_backend_free(backend); - - n_end_time = ggml_time_us(); - n_duration = (n_end_time - n_begin_time) / 1000; - QNN_LOG_DEBUG("duration of ut GGML_OP_%s using QNN backend %s: %lld milliseconds\n", - ggml_op_name((enum ggml_op)n_ggml_op_type), get_qnn_backend_name(n_backend_type), n_duration); -} - -#define DEFINE_OP(op) { #op, op } - -static const std::unordered_map kMapStringToGGMLOp = { - DEFINE_OP(GGML_OP_ADD), DEFINE_OP(GGML_OP_SUB), DEFINE_OP(GGML_OP_MUL), DEFINE_OP(GGML_OP_DIV), - DEFINE_OP(GGML_OP_SQRT), DEFINE_OP(GGML_OP_MUL_MAT), DEFINE_OP(GGML_OP_LOG), -}; 
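Editor's note: the main() routine that follows compares the QNN and CPU result buffers byte-for-byte, and its TODO observes that comparing floats with an allowed error would be more robust across different hardware. A minimal sketch of such a check, assuming both buffers hold F32 data and that <cmath>, <cstring> and <vector> are available (the helper name and tolerance values are illustrative, not part of this patch):

    static bool results_match_f32(const std::vector<uint8_t> &lhs, const std::vector<uint8_t> &rhs,
                                  float abs_eps = 1e-4f, float rel_eps = 1e-3f) {
        if (lhs.size() != rhs.size() || (lhs.size() % sizeof(float)) != 0) {
            return false;                                 // size mismatch: nothing comparable
        }
        const size_t n = lhs.size() / sizeof(float);
        std::vector<float> a(n), b(n);
        memcpy(a.data(), lhs.data(), lhs.size());         // copy out of the raw byte buffers
        memcpy(b.data(), rhs.data(), rhs.size());
        for (size_t i = 0; i < n; i++) {
            const float diff = fabsf(a[i] - b[i]);
            if (diff > abs_eps && diff > rel_eps * fmaxf(fabsf(a[i]), fabsf(b[i]))) {
                return false;                             // beyond both absolute and relative tolerance
            }
        }
        return true;
    }

With such a helper, main() could accept results_match_f32(results, cpu_results) in place of the exact results == cpu_results check shown below.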
- -#define CONSOLE_RED "\033[31m" -#define CONSOLE_GREEN "\033[32m" -#define CONSOLE_RESET "\033[0m" - -int main(int argc, char *argv[]) { - int num_threads = 4; - int n_backend_type = QNN_BACKEND_CPU; - int n_ggml_op_type = GGML_OP_ADD; - - for (int i = 1; i < argc; i++) { - if (0 == strcmp(argv[i], "-t")) { - if (i + 1 < argc) { - auto it = kMapStringToGGMLOp.find(argv[i + 1]); - if (it != kMapStringToGGMLOp.end()) { - n_ggml_op_type = it->second; - } else { - show_usage(); - return 1; - } - i++; - } - } else if (0 == strcmp(argv[i], "-b")) { - if (i + 1 < argc) { - int backend = atoi(argv[i + 1]); - if (backend <= QNN_BACKEND_GGML) - n_backend_type = backend; - else { - show_usage(); - return 1; - } - i++; - } - } else { - show_usage(); - return 1; - } - } - - QNN_LOG_DEBUG("enter qnn_ggml_op\n"); - QNN_LOG_DEBUG("backend %d, ggml op:%d(%s)", n_backend_type, n_ggml_op_type, - ggml_op_name((enum ggml_op)n_ggml_op_type)); - - std::vector results; - qnn_op_ut(num_threads, n_backend_type, n_ggml_op_type, GGML_TYPE_F32, results); - std::vector cpu_results; - qnn_op_ut(num_threads, QNN_BACKEND_GGML, n_ggml_op_type, GGML_TYPE_F32, cpu_results); - - // TODO: theoretically, the results should be the same, but the results may be different due to the different hardware - // a better way to compare the results is to compare the floating point numbers with allowed error - if (results == cpu_results) { - QNN_LOG_INFO(CONSOLE_GREEN "[Success] results equal to CPU backend!" CONSOLE_RESET); - return 0; - } else { - QNN_LOG_ERROR(CONSOLE_RED "[Failed] results mismatch with CPU backend!" CONSOLE_RESET); - return 1; - } -} From 6457a68bd7273eef0843d3ed6faf70f7012d0731 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Wed, 17 Jul 2024 23:24:29 +0800 Subject: [PATCH 080/143] disable qnn profiling in release build --- ggml/src/ggml-qnn/qnn-lib.hpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ggml/src/ggml-qnn/qnn-lib.hpp b/ggml/src/ggml-qnn/qnn-lib.hpp index a46901695aa6d..136b1af08b086 100644 --- a/ggml/src/ggml-qnn/qnn-lib.hpp +++ b/ggml/src/ggml-qnn/qnn-lib.hpp @@ -882,7 +882,11 @@ class qnn_instance { QnnLog_Level_t _qnn_log_level = QNN_LOG_LEVEL_DEBUG; +#ifdef NDEBUG + qnn::sdk_profile_level _profile_level = qnn::sdk_profile_level::profile_off; +#else qnn::sdk_profile_level _profile_level = qnn::sdk_profile_level::profile_detail; +#endif std::shared_ptr _qnn_sys_interface; std::shared_ptr _qnn_interface; From c76fc9aa2f8585d9840366f1dad387eae30b2c4c Mon Sep 17 00:00:00 2001 From: hongruichen Date: Wed, 17 Jul 2024 23:30:14 +0800 Subject: [PATCH 081/143] fix warnings --- ggml/src/ggml-qnn.cpp | 7 +++---- ggml/src/ggml-qnn/graph.hpp | 8 ++++---- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index 8ba258d632f38..0e5e86e4add4e 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -134,8 +134,7 @@ struct ggml_backend_qnn_buffer_type_context { // implementation of QNN backend for GGML // // ================================================================================================= -static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context *ctx, const struct ggml_tensor *tensor, - bool b_dump_tensor_info) { +static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context *ctx, const struct ggml_tensor *tensor) { if (ggml_is_empty(tensor) || (!qnn::ggml_qnn_unary_op_array()[tensor->op] && !qnn::ggml_qnn_binary_op_array()[tensor->op])) { return false; @@ -353,13 +352,13 @@ GGML_CALL static ggml_status 
ggml_backend_qnn_graph_compute(ggml_backend_t backe GGML_CALL static bool ggml_backend_qnn_supports_op(ggml_backend_t backend, const ggml_tensor *op) { ggml_backend_qnn_context *ctx = (ggml_backend_qnn_context *)backend->context; - return (ggml_qnn_can_handle_op(ctx, op, false)); + return ggml_qnn_can_handle_op(ctx, op); } GGML_CALL static bool ggml_backend_qnn_offload_op(ggml_backend_t backend, const ggml_tensor *tensor) { ggml_backend_qnn_context *ctx = (ggml_backend_qnn_context *)backend->context; - return ggml_qnn_can_handle_op(ctx, tensor, false); + return ggml_qnn_can_handle_op(ctx, tensor); } static ggml_backend_i ggml_backend_qnn_interface = { diff --git a/ggml/src/ggml-qnn/graph.hpp b/ggml/src/ggml-qnn/graph.hpp index 9621ad1b4dd68..462ed92034b2c 100644 --- a/ggml/src/ggml-qnn/graph.hpp +++ b/ggml/src/ggml-qnn/graph.hpp @@ -101,10 +101,10 @@ class ggml_qnn_graph { _tensor_inputs = tensor_inputs; _tensor_outputs = tensor_outputs; - Qnn_OpConfig_t op_config = { .version = QNN_OPCONFIG_VERSION_1, - .v1 = { _graph_name.c_str(), QNN_OP_PACKAGE_NAME_QTI_AISW, op_name.c_str(), 0, - nullptr, (uint32_t)_tensor_inputs.size(), _tensor_inputs.data(), - (uint32_t)_tensor_outputs.size(), _tensor_outputs.data() } }; + Qnn_OpConfig_t op_config = { /*.version = */ QNN_OPCONFIG_VERSION_1, + /*.v1 = */ { _graph_name.c_str(), QNN_OP_PACKAGE_NAME_QTI_AISW, op_name.c_str(), 0, + nullptr, (uint32_t)_tensor_inputs.size(), _tensor_inputs.data(), + (uint32_t)_tensor_outputs.size(), _tensor_outputs.data() } }; auto error = _qnn_interface->qnn_graph_add_node(_graph_handle, op_config); if (error != QNN_SUCCESS) { QNN_LOG_ERROR("graphAddNode.error = %d\n", error); From ce199b2de788a5e314761f29868d724070beb254 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Wed, 17 Jul 2024 23:43:22 +0800 Subject: [PATCH 082/143] refactoring: downgrade some log to debug level --- ggml/src/ggml-qnn/qnn-lib.hpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-qnn/qnn-lib.hpp b/ggml/src/ggml-qnn/qnn-lib.hpp index 136b1af08b086..6d0ee05671a8c 100644 --- a/ggml/src/ggml-qnn/qnn-lib.hpp +++ b/ggml/src/ggml-qnn/qnn-lib.hpp @@ -2,6 +2,7 @@ #include +#include #include #include #include @@ -366,8 +367,8 @@ class qnn_instance { size_t probe_counts = sizeof(probe_slots) / sizeof(size_t); for (size_t idx = 0; idx < probe_counts; idx++) { rpc_buffer = static_cast(alloc_rpcmem(probe_slots[idx] * size_in_mb, 4)); - if (nullptr == rpc_buffer) { - QNN_LOG_INFO("alloc rpcmem %d (MB) failure, %s\n", probe_slots[idx], strerror(errno)); + if (!rpc_buffer) { + QNN_LOG_DEBUG("alloc rpcmem %d (MB) failure, %s\n", probe_slots[idx], strerror(errno)); break; } else { candidate_size = probe_slots[idx]; @@ -375,7 +376,8 @@ class qnn_instance { rpc_buffer = nullptr; } } - if (candidate_size > _rpcmem_capacity) _rpcmem_capacity = candidate_size; + + _rpcmem_capacity = std::max(candidate_size, _rpcmem_capacity); QNN_LOG_INFO("capacity of QNN rpc ion memory is about %d MB\n", _rpcmem_capacity); if (0 != init_htp_perfinfra()) { @@ -600,7 +602,7 @@ class qnn_instance { auto allocate_bytes = static_cast(bytes + alignment); void *buf = _pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, allocate_bytes); if (buf == nullptr) { - QNN_LOG_WARN("failed to allocate rpc memory\n"); + QNN_LOG_WARN("failed to allocate rpc memory, size: %d MB\n", (int)(allocate_bytes / (1 << 20))); return nullptr; } From d82b3a0bdb3ad491e22b4a5b182ff75a5a0597d3 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Thu, 18 Jul 2024 10:25:45 
+0800 Subject: [PATCH 083/143] feat: add GGML_UNARY_OP_GELU --- ggml/src/ggml-qnn.cpp | 25 ++++++++------ ggml/src/ggml-qnn/backend-ops.cpp | 56 ++++++++++++++++++++++++------- ggml/src/ggml-qnn/backend-ops.hpp | 4 ++- 3 files changed, 60 insertions(+), 25 deletions(-) diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index 0e5e86e4add4e..282a3d85941b8 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -135,14 +135,19 @@ struct ggml_backend_qnn_buffer_type_context { // // ================================================================================================= static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context *ctx, const struct ggml_tensor *tensor) { - if (ggml_is_empty(tensor) || - (!qnn::ggml_qnn_unary_op_array()[tensor->op] && !qnn::ggml_qnn_binary_op_array()[tensor->op])) { + if (ggml_is_empty(tensor)) { + return false; + } + + if (!qnn::ggml_qnn_unary_op_array()[tensor->op] && !qnn::ggml_qnn_binary_op_array()[tensor->op] && + (tensor->op != GGML_OP_UNARY || + qnn::ggml_qnn_unary_op_array()[qnn::kGgmlUnaryOpStart + ggml_get_unary_op(tensor)])) { return false; } const struct ggml_tensor *src0 = tensor->src[0]; const struct ggml_tensor *src1 = tensor->src[1]; - if (nullptr == src0 || nullptr == src1) { + if (!src0 || !src1) { return false; } @@ -162,18 +167,16 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context *ctx, const struct g } } - if (tensor->op == GGML_OP_MUL_MAT) { - if (ne00 <= 32 || ne01 <= 32 || ne10 <= 32 || ne11 <= 32) { - // comment it for make UT of mul_mat with QNN RPC happy - // return false; - } - } - return true; } static bool ggml_qnn_compute_forward(ggml_backend_qnn_context *ctx, struct ggml_tensor *tensor) { - auto unary_op = qnn::ggml_qnn_unary_op_array()[tensor->op]; + size_t unary_op_idx = tensor->op; + if (tensor->op == GGML_OP_UNARY) { + unary_op_idx = qnn::kGgmlUnaryOpStart + ggml_get_unary_op(tensor); + } + + auto unary_op = qnn::ggml_qnn_unary_op_array()[unary_op_idx]; if (unary_op) { return unary_op(ctx, tensor->src[0], tensor); } diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index e1a8c4da5ed40..6367e7c7064d1 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -158,12 +158,16 @@ qnn::ggml_qnn_binary_graph_cache_t &get_qnn_graph_cache(ggml_backend_qnn_context template qnn::ggml_qnn_graph<_InputSize, _OutputSize> *get_qnn_graph_from_cache( - ggml_backend_qnn_context *ctx, ggml_op op, const std::string &qnn_op, + ggml_backend_qnn_context *ctx, size_t op, const std::string &qnn_op, const std::array &inputs, const std::array &outputs) { using graph_t = qnn::ggml_qnn_graph<_InputSize, _OutputSize>; + GGML_ASSERT(op < (GGML_OP_COUNT + GGML_UNARY_OP_COUNT)); + auto &graph_cache = get_qnn_graph_cache(ctx, inputs, outputs); - const std::string graph_key(ggml_op_name(op)); + const auto *op_name = op < qnn::kGgmlUnaryOpStart ? 
ggml_op_name(ggml_op(op)) + : ggml_unary_op_name(ggml_unary_op(op - qnn::kGgmlUnaryOpStart)); + const std::string graph_key(op_name); auto it = graph_cache.find(graph_key); graph_t *graph_ptr = nullptr; if (it != graph_cache.end()) { @@ -276,10 +280,27 @@ constexpr const char *kGgmlOpToQnnOp[] = { nullptr, // GGML_OP_CROSS_ENTROPY_LOSS nullptr, // GGML_OP_CROSS_ENTROPY_LOSS_BACK + + // ggml_unary_op + nullptr, // GGML_UNARY_OP_ABS + nullptr, // GGML_UNARY_OP_SGN + nullptr, // GGML_UNARY_OP_NEG + nullptr, // GGML_UNARY_OP_STEP + nullptr, // GGML_UNARY_OP_TANH + nullptr, // GGML_UNARY_OP_ELU + nullptr, // GGML_UNARY_OP_RELU + nullptr, // GGML_UNARY_OP_SIGMOID + QNN_OP_GELU, // GGML_UNARY_OP_GELU + nullptr, // GGML_UNARY_OP_GELU_QUICK + nullptr, // GGML_UNARY_OP_SILU + nullptr, // GGML_UNARY_OP_HARDSWISH + nullptr, // GGML_UNARY_OP_HARDSIGMOID }; -static_assert(sizeof(kGgmlOpToQnnOp) / sizeof(kGgmlOpToQnnOp[0]) == GGML_OP_COUNT, - "GGML_OP_COUNT does not match the size of the ops table"); +static_assert(sizeof(kGgmlOpToQnnOp) / sizeof(kGgmlOpToQnnOp[0]) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT), + "GGML_OP_COUNT does not match the size of the kGgmlOpToQnnOp table"); +static_assert(kGgmlOpToQnnOp[GGML_UNARY_OP_GELU + qnn::kGgmlUnaryOpStart] != nullptr, + "GGML_UNARY_OP_GELU does not correspond to QNN_OP_GELU"); template bool qnn_binary_op_impl(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, @@ -288,9 +309,6 @@ bool qnn_binary_op_impl(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, CHECK_PARAMS(ctx, src0, src1, dst); - qnn::qnn_perf perf(ggml_op_name(_GgmlOp)); - perf.start(); - bool succeed = false; qnn::ggml_qnn_graph_binary *graph_ptr = get_qnn_graph_from_cache<2, 1>(ctx, _GgmlOp, kGgmlOpToQnnOp[_GgmlOp], { src0, src1 }, { dst }); @@ -307,15 +325,12 @@ bool qnn_binary_op_impl(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, return succeed; } -template +template bool qnn_unary_op_impl(ggml_backend_qnn_context *ctx, const ggml_tensor *src, ggml_tensor *dst) { static_assert(kGgmlOpToQnnOp[_GgmlOp] != nullptr, "GGML_OP does not have a corresponding QNN_OP"); CHECK_PARAMS(ctx, src, dst); - qnn::qnn_perf perf(ggml_op_name(_GgmlOp)); - perf.start(); - bool succeed = false; auto *graph_ptr = get_qnn_graph_from_cache<1, 1>(ctx, _GgmlOp, kGgmlOpToQnnOp[_GgmlOp], { src }, { dst }); if (graph_ptr) { @@ -416,10 +431,25 @@ qnn::ggml_qnn_unary_op_array_t qnn::ggml_qnn_unary_op_array() { nullptr, // GGML_OP_CROSS_ENTROPY_LOSS nullptr, // GGML_OP_CROSS_ENTROPY_LOSS_BACK + + // ggml_unary_op + nullptr, // GGML_UNARY_OP_ABS + nullptr, // GGML_UNARY_OP_SGN + nullptr, // GGML_UNARY_OP_NEG + nullptr, // GGML_UNARY_OP_STEP + nullptr, // GGML_UNARY_OP_TANH + nullptr, // GGML_UNARY_OP_ELU + nullptr, // GGML_UNARY_OP_RELU + nullptr, // GGML_UNARY_OP_SIGMOID + qnn_unary_op_impl, // GGML_UNARY_OP_GELU + nullptr, // GGML_UNARY_OP_GELU_QUICK + nullptr, // GGML_UNARY_OP_SILU + nullptr, // GGML_UNARY_OP_HARDSWISH + nullptr, // GGML_UNARY_OP_HARDSIGMOID }; - static_assert(sizeof(kQnnOpsTable) / sizeof(kQnnOpsTable[0]) == GGML_OP_COUNT, - "GGML_OP_COUNT does not match the size of the ops table"); + static_assert(sizeof(kQnnOpsTable) / sizeof(kQnnOpsTable[0]) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT), + "GGML_OP_COUNT does not match the size of the kQnnOpsTable table"); return kQnnOpsTable; } diff --git a/ggml/src/ggml-qnn/backend-ops.hpp b/ggml/src/ggml-qnn/backend-ops.hpp index 8d94fc6c25424..8cc2dc366fbfa 100644 --- a/ggml/src/ggml-qnn/backend-ops.hpp +++ 
b/ggml/src/ggml-qnn/backend-ops.hpp @@ -10,9 +10,11 @@ typedef bool (*ggml_qnn_unary_op_t)(ggml_backend_qnn_context *ctx, const ggml_te typedef bool (*ggml_qnn_binary_op_t)(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst); -typedef const ggml_qnn_unary_op_t (&ggml_qnn_unary_op_array_t)[GGML_OP_COUNT]; +typedef const ggml_qnn_unary_op_t (&ggml_qnn_unary_op_array_t)[GGML_OP_COUNT + GGML_UNARY_OP_COUNT]; typedef const ggml_qnn_binary_op_t (&ggml_qnn_binary_op_array_t)[GGML_OP_COUNT]; +constexpr const size_t kGgmlUnaryOpStart = GGML_OP_COUNT; + ggml_qnn_unary_op_array_t ggml_qnn_unary_op_array(); ggml_qnn_binary_op_array_t ggml_qnn_binary_op_array(); From 15f5cc450c53c890a4656b01bc3f220d3d27095a Mon Sep 17 00:00:00 2001 From: hongruichen Date: Thu, 18 Jul 2024 19:44:05 +0800 Subject: [PATCH 084/143] bug: fix allocation size overflow at log --- ggml/src/ggml-qnn/qnn-lib.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-qnn/qnn-lib.hpp b/ggml/src/ggml-qnn/qnn-lib.hpp index 6d0ee05671a8c..517df493ccb16 100644 --- a/ggml/src/ggml-qnn/qnn-lib.hpp +++ b/ggml/src/ggml-qnn/qnn-lib.hpp @@ -599,9 +599,9 @@ class qnn_instance { return nullptr; } - auto allocate_bytes = static_cast(bytes + alignment); - void *buf = _pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, allocate_bytes); - if (buf == nullptr) { + auto allocate_bytes = static_cast(bytes + alignment); + void *buf = _pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, (int)allocate_bytes); + if (!buf) { QNN_LOG_WARN("failed to allocate rpc memory, size: %d MB\n", (int)(allocate_bytes / (1 << 20))); return nullptr; } From 665f823748d13feab4cc747caec1d6896e83ec87 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Thu, 18 Jul 2024 20:26:05 +0800 Subject: [PATCH 085/143] fix op checker --- ggml/src/ggml-qnn.cpp | 88 ++++++++++++++++++++++--------------------- 1 file changed, 45 insertions(+), 43 deletions(-) diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index 282a3d85941b8..3f228935c6fbb 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -134,42 +134,6 @@ struct ggml_backend_qnn_buffer_type_context { // implementation of QNN backend for GGML // // ================================================================================================= -static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context *ctx, const struct ggml_tensor *tensor) { - if (ggml_is_empty(tensor)) { - return false; - } - - if (!qnn::ggml_qnn_unary_op_array()[tensor->op] && !qnn::ggml_qnn_binary_op_array()[tensor->op] && - (tensor->op != GGML_OP_UNARY || - qnn::ggml_qnn_unary_op_array()[qnn::kGgmlUnaryOpStart + ggml_get_unary_op(tensor)])) { - return false; - } - - const struct ggml_tensor *src0 = tensor->src[0]; - const struct ggml_tensor *src1 = tensor->src[1]; - if (!src0 || !src1) { - return false; - } - - const auto ne00 = src0->ne[0]; - const auto ne01 = src0->ne[1]; - const auto ne10 = src1->ne[0]; - const auto ne11 = src1->ne[1]; - // make qnn_get_ggml_tensor_rank and QNN SDK happy - if (ne00 <= 1 || ne01 <= 1 || ne10 <= 1 || ne11 <= 1) { - return false; - } - - // TODO: support other quantized data type - if (ggml_is_quantized(src0->type)) { - if (src0->type != GGML_TYPE_Q8_0 && src0->type != GGML_TYPE_Q4_0) { - return false; - } - } - - return true; -} - static bool ggml_qnn_compute_forward(ggml_backend_qnn_context *ctx, struct ggml_tensor *tensor) { size_t unary_op_idx = tensor->op; if (tensor->op == GGML_OP_UNARY) { @@ -297,8 
+261,8 @@ GGML_CALL static bool ggml_backend_qnn_buffer_is_host(ggml_backend_buffer_type_t } GGML_CALL static const char *ggml_backend_qnn_name(ggml_backend_t backend) { - GGML_UNUSED(backend); - return "QNN"; + ggml_backend_qnn_context *ctx = (ggml_backend_qnn_context *)backend->context; + return g_qnn_mgr[ctx->device].name; } GGML_CALL static void ggml_backend_qnn_free(ggml_backend_t backend) { @@ -353,15 +317,53 @@ GGML_CALL static ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backe } GGML_CALL static bool ggml_backend_qnn_supports_op(ggml_backend_t backend, const ggml_tensor *op) { - ggml_backend_qnn_context *ctx = (ggml_backend_qnn_context *)backend->context; + GGML_UNUSED(backend); + + if (op->op == GGML_OP_NONE) { + return true; + } + + if (op->op == GGML_OP_UNARY) { + if (!qnn::ggml_qnn_unary_op_array()[qnn::kGgmlUnaryOpStart + ggml_get_unary_op(op)]) { + QNN_LOG_DEBUG("unsupported unary op %d", ggml_get_unary_op(op)); + return false; + } + + if (!op->src[0]) { + QNN_LOG_DEBUG("src0 is nullptr"); + return false; + } + } else { + if (!qnn::ggml_qnn_unary_op_array()[op->op] && !qnn::ggml_qnn_binary_op_array()[op->op]) { + QNN_LOG_DEBUG("unsupported op %d", op->op); + return false; + } - return ggml_qnn_can_handle_op(ctx, op); + if (!op->src[0] || !op->src[1]) { + QNN_LOG_DEBUG("src0 or src1 is nullptr"); + return false; + } + } + + switch (op->src[0]->type) { + case GGML_TYPE_F32: + case GGML_TYPE_F16: + case GGML_TYPE_I8: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q4_0: + break; + default: + QNN_LOG_DEBUG("unsupported src0 type %d", op->src[0]->type); + return false; + } + + return true; } -GGML_CALL static bool ggml_backend_qnn_offload_op(ggml_backend_t backend, const ggml_tensor *tensor) { - ggml_backend_qnn_context *ctx = (ggml_backend_qnn_context *)backend->context; +GGML_CALL static bool ggml_backend_qnn_offload_op(ggml_backend_t backend, const ggml_tensor *op) { + GGML_UNUSED(backend); - return ggml_qnn_can_handle_op(ctx, tensor); + return op->ne[0] > 1 && op->ne[1] > 1; } static ggml_backend_i ggml_backend_qnn_interface = { From ce3d09e5f2bfa95ca448e7cc053040108d0373e3 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 19 Jul 2024 10:13:56 +0800 Subject: [PATCH 086/143] tried fix the add node error 6005 --- ggml/src/ggml-qnn.cpp | 6 +----- ggml/src/ggml-qnn/graph.hpp | 6 ++++-- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index 3f228935c6fbb..d62a8074ef823 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -319,10 +319,6 @@ GGML_CALL static ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backe GGML_CALL static bool ggml_backend_qnn_supports_op(ggml_backend_t backend, const ggml_tensor *op) { GGML_UNUSED(backend); - if (op->op == GGML_OP_NONE) { - return true; - } - if (op->op == GGML_OP_UNARY) { if (!qnn::ggml_qnn_unary_op_array()[qnn::kGgmlUnaryOpStart + ggml_get_unary_op(op)]) { QNN_LOG_DEBUG("unsupported unary op %d", ggml_get_unary_op(op)); @@ -333,7 +329,7 @@ GGML_CALL static bool ggml_backend_qnn_supports_op(ggml_backend_t backend, const QNN_LOG_DEBUG("src0 is nullptr"); return false; } - } else { + } else if (op->op != GGML_OP_NONE) { if (!qnn::ggml_qnn_unary_op_array()[op->op] && !qnn::ggml_qnn_binary_op_array()[op->op]) { QNN_LOG_DEBUG("unsupported op %d", op->op); return false; diff --git a/ggml/src/ggml-qnn/graph.hpp b/ggml/src/ggml-qnn/graph.hpp index 462ed92034b2c..5fe5dc83d3a72 100644 --- a/ggml/src/ggml-qnn/graph.hpp +++ b/ggml/src/ggml-qnn/graph.hpp @@ -102,8 
+102,9 @@ class ggml_qnn_graph { _tensor_outputs = tensor_outputs; Qnn_OpConfig_t op_config = { /*.version = */ QNN_OPCONFIG_VERSION_1, - /*.v1 = */ { _graph_name.c_str(), QNN_OP_PACKAGE_NAME_QTI_AISW, op_name.c_str(), 0, - nullptr, (uint32_t)_tensor_inputs.size(), _tensor_inputs.data(), + /*.v1 = */ { _graph_name.c_str(), QNN_OP_PACKAGE_NAME_QTI_AISW, op_name.c_str(), + (uint32_t)_param_types.size(), _param_types.data(), + (uint32_t)_tensor_inputs.size(), _tensor_inputs.data(), (uint32_t)_tensor_outputs.size(), _tensor_outputs.data() } }; auto error = _qnn_interface->qnn_graph_add_node(_graph_handle, op_config); if (error != QNN_SUCCESS) { @@ -154,6 +155,7 @@ class ggml_qnn_graph { Qnn_GraphHandle_t _graph_handle = nullptr; std::array _tensor_inputs; std::array _tensor_outputs; + std::vector _param_types; ggml_qnn_graph(const ggml_qnn_graph &) = delete; void operator=(const ggml_qnn_graph &) = delete; From f45fbec8f43a4c2bf50726fdf777255d232e56a7 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 19 Jul 2024 12:59:38 +0800 Subject: [PATCH 087/143] Revert "tried fix the add node error 6005" This reverts commit ce3d09e5f2bfa95ca448e7cc053040108d0373e3. --- ggml/src/ggml-qnn.cpp | 6 +++++- ggml/src/ggml-qnn/graph.hpp | 6 ++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index d62a8074ef823..3f228935c6fbb 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -319,6 +319,10 @@ GGML_CALL static ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backe GGML_CALL static bool ggml_backend_qnn_supports_op(ggml_backend_t backend, const ggml_tensor *op) { GGML_UNUSED(backend); + if (op->op == GGML_OP_NONE) { + return true; + } + if (op->op == GGML_OP_UNARY) { if (!qnn::ggml_qnn_unary_op_array()[qnn::kGgmlUnaryOpStart + ggml_get_unary_op(op)]) { QNN_LOG_DEBUG("unsupported unary op %d", ggml_get_unary_op(op)); @@ -329,7 +333,7 @@ GGML_CALL static bool ggml_backend_qnn_supports_op(ggml_backend_t backend, const QNN_LOG_DEBUG("src0 is nullptr"); return false; } - } else if (op->op != GGML_OP_NONE) { + } else { if (!qnn::ggml_qnn_unary_op_array()[op->op] && !qnn::ggml_qnn_binary_op_array()[op->op]) { QNN_LOG_DEBUG("unsupported op %d", op->op); return false; diff --git a/ggml/src/ggml-qnn/graph.hpp b/ggml/src/ggml-qnn/graph.hpp index 5fe5dc83d3a72..462ed92034b2c 100644 --- a/ggml/src/ggml-qnn/graph.hpp +++ b/ggml/src/ggml-qnn/graph.hpp @@ -102,9 +102,8 @@ class ggml_qnn_graph { _tensor_outputs = tensor_outputs; Qnn_OpConfig_t op_config = { /*.version = */ QNN_OPCONFIG_VERSION_1, - /*.v1 = */ { _graph_name.c_str(), QNN_OP_PACKAGE_NAME_QTI_AISW, op_name.c_str(), - (uint32_t)_param_types.size(), _param_types.data(), - (uint32_t)_tensor_inputs.size(), _tensor_inputs.data(), + /*.v1 = */ { _graph_name.c_str(), QNN_OP_PACKAGE_NAME_QTI_AISW, op_name.c_str(), 0, + nullptr, (uint32_t)_tensor_inputs.size(), _tensor_inputs.data(), (uint32_t)_tensor_outputs.size(), _tensor_outputs.data() } }; auto error = _qnn_interface->qnn_graph_add_node(_graph_handle, op_config); if (error != QNN_SUCCESS) { @@ -155,7 +154,6 @@ class ggml_qnn_graph { Qnn_GraphHandle_t _graph_handle = nullptr; std::array _tensor_inputs; std::array _tensor_outputs; - std::vector _param_types; ggml_qnn_graph(const ggml_qnn_graph &) = delete; void operator=(const ggml_qnn_graph &) = delete; From 0153a23d3f51f66eff0beceeac1cf287ddc66b7a Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 19 Jul 2024 15:22:23 +0800 Subject: [PATCH 088/143] fix support ops This reverts 
commit f45fbec8f43a4c2bf50726fdf777255d232e56a7. --- ggml/src/ggml-qnn.cpp | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index 3f228935c6fbb..e448d73821cb3 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -319,10 +319,6 @@ GGML_CALL static ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backe GGML_CALL static bool ggml_backend_qnn_supports_op(ggml_backend_t backend, const ggml_tensor *op) { GGML_UNUSED(backend); - if (op->op == GGML_OP_NONE) { - return true; - } - if (op->op == GGML_OP_UNARY) { if (!qnn::ggml_qnn_unary_op_array()[qnn::kGgmlUnaryOpStart + ggml_get_unary_op(op)]) { QNN_LOG_DEBUG("unsupported unary op %d", ggml_get_unary_op(op)); @@ -333,7 +329,7 @@ GGML_CALL static bool ggml_backend_qnn_supports_op(ggml_backend_t backend, const QNN_LOG_DEBUG("src0 is nullptr"); return false; } - } else { + } else if (op->op != GGML_OP_NONE) { if (!qnn::ggml_qnn_unary_op_array()[op->op] && !qnn::ggml_qnn_binary_op_array()[op->op]) { QNN_LOG_DEBUG("unsupported op %d", op->op); return false; @@ -345,7 +341,7 @@ GGML_CALL static bool ggml_backend_qnn_supports_op(ggml_backend_t backend, const } } - switch (op->src[0]->type) { + switch (op->type) { case GGML_TYPE_F32: case GGML_TYPE_F16: case GGML_TYPE_I8: From a607995f95adc182a2d519bc86021776c27d1708 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 19 Jul 2024 15:35:55 +0800 Subject: [PATCH 089/143] Reapply "tried fix the add node error 6005" This reverts commit f45fbec8f43a4c2bf50726fdf777255d232e56a7. --- ggml/src/ggml-qnn/graph.hpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-qnn/graph.hpp b/ggml/src/ggml-qnn/graph.hpp index 462ed92034b2c..5fe5dc83d3a72 100644 --- a/ggml/src/ggml-qnn/graph.hpp +++ b/ggml/src/ggml-qnn/graph.hpp @@ -102,8 +102,9 @@ class ggml_qnn_graph { _tensor_outputs = tensor_outputs; Qnn_OpConfig_t op_config = { /*.version = */ QNN_OPCONFIG_VERSION_1, - /*.v1 = */ { _graph_name.c_str(), QNN_OP_PACKAGE_NAME_QTI_AISW, op_name.c_str(), 0, - nullptr, (uint32_t)_tensor_inputs.size(), _tensor_inputs.data(), + /*.v1 = */ { _graph_name.c_str(), QNN_OP_PACKAGE_NAME_QTI_AISW, op_name.c_str(), + (uint32_t)_param_types.size(), _param_types.data(), + (uint32_t)_tensor_inputs.size(), _tensor_inputs.data(), (uint32_t)_tensor_outputs.size(), _tensor_outputs.data() } }; auto error = _qnn_interface->qnn_graph_add_node(_graph_handle, op_config); if (error != QNN_SUCCESS) { @@ -154,6 +155,7 @@ class ggml_qnn_graph { Qnn_GraphHandle_t _graph_handle = nullptr; std::array _tensor_inputs; std::array _tensor_outputs; + std::vector _param_types; ggml_qnn_graph(const ggml_qnn_graph &) = delete; void operator=(const ggml_qnn_graph &) = delete; From b1b5cc10b1d3d922f9b8e0350458a4c7b3143815 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 19 Jul 2024 22:51:17 +0800 Subject: [PATCH 090/143] add function to convert qnn error into string --- ggml/src/ggml-qnn/graph.hpp | 14 ++++++++++++-- ggml/src/ggml-qnn/utils.cpp | 33 +++++++++++++++++++++++++++++++++ ggml/src/ggml-qnn/utils.hpp | 2 ++ 3 files changed, 47 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-qnn/graph.hpp b/ggml/src/ggml-qnn/graph.hpp index 5fe5dc83d3a72..2d412dffd743a 100644 --- a/ggml/src/ggml-qnn/graph.hpp +++ b/ggml/src/ggml-qnn/graph.hpp @@ -108,13 +108,23 @@ class ggml_qnn_graph { (uint32_t)_tensor_outputs.size(), _tensor_outputs.data() } }; auto error = _qnn_interface->qnn_graph_add_node(_graph_handle, op_config); if (error != 
QNN_SUCCESS) { - QNN_LOG_ERROR("graphAddNode.error = %d\n", error); + auto *error_str = get_qnn_error_string(error); + if (error_str) { + QNN_LOG_ERROR("qnn_graph_add_node.error: %s\n", error_str); + } else { + QNN_LOG_ERROR("qnn_graph_add_node.error: %d\n", error); + } return false; } error = _qnn_interface->qnn_graph_finalize(_graph_handle, nullptr, nullptr); if (error != QNN_SUCCESS) { - QNN_LOG_ERROR("graphFinalize.error = %d\n", error); + auto *error_str = get_qnn_error_string(error); + if (error_str) { + QNN_LOG_ERROR("qnn_graph_finalize.error: %s\n", error_str); + } else { + QNN_LOG_ERROR("qnn_graph_finalize.error: %d\n", error); + } return false; } diff --git a/ggml/src/ggml-qnn/utils.cpp b/ggml/src/ggml-qnn/utils.cpp index 11358395219ca..e36142f283d0d 100644 --- a/ggml/src/ggml-qnn/utils.cpp +++ b/ggml/src/ggml-qnn/utils.cpp @@ -152,4 +152,37 @@ const char *opname_from_ggmlop(enum ggml_op ggmlop) { return nullptr; } +const char *get_qnn_error_string(Qnn_ErrorHandle_t error) { + switch (error) { + case QNN_SUCCESS: + return "QNN_SUCCESS"; + case QNN_GRAPH_ERROR_UNSUPPORTED_FEATURE: + return "QNN_GRAPH_ERROR_UNSUPPORTED_FEATURE"; + case QNN_GRAPH_ERROR_MEM_ALLOC: + return "QNN_GRAPH_ERROR_MEM_ALLOC"; + case QNN_GRAPH_ERROR_GENERAL: + return "QNN_GRAPH_ERROR_GENERAL"; + case QNN_GRAPH_ERROR_INVALID_ARGUMENT: + return "QNN_GRAPH_ERROR_INVALID_ARGUMENT"; + case QNN_GRAPH_ERROR_INVALID_HANDLE: + return "QNN_GRAPH_ERROR_INVALID_HANDLE"; + case QNN_GRAPH_ERROR_GRAPH_DOES_NOT_EXIST: + return "QNN_GRAPH_ERROR_GRAPH_DOES_NOT_EXIST"; + case QNN_GRAPH_ERROR_INVALID_NAME: + return "QNN_GRAPH_ERROR_INVALID_NAME"; + case QNN_GRAPH_ERROR_INVALID_TENSOR: + return "QNN_GRAPH_ERROR_INVALID_TENSOR"; + case QNN_GRAPH_ERROR_INVALID_OP_CONFIG: + return "QNN_GRAPH_ERROR_INVALID_OP_CONFIG"; + case QNN_GRAPH_ERROR_SET_PROFILE: + return "QNN_GRAPH_ERROR_SET_PROFILE"; + case QNN_GRAPH_ERROR_UNCONNECTED_NODE: + return "QNN_GRAPH_ERROR_UNCONNECTED_NODE"; + case QNN_GRAPH_ERROR_CREATE_FAILED: + return "QNN_GRAPH_ERROR_CREATE_FAILED"; + default: + return nullptr; + } +} + } // namespace qnn diff --git a/ggml/src/ggml-qnn/utils.hpp b/ggml/src/ggml-qnn/utils.hpp index e8f1bf71e88be..e91a5ae8730d6 100644 --- a/ggml/src/ggml-qnn/utils.hpp +++ b/ggml/src/ggml-qnn/utils.hpp @@ -27,6 +27,8 @@ void align_free(void *ptr); const char *opname_from_ggmlop(enum ggml_op ggmlop); +const char *get_qnn_error_string(Qnn_ErrorHandle_t error); + inline int validate_tensor_version(const Qnn_Tensor_t &tensor) { if (tensor.version != QNN_TENSOR_VERSION_1) { QNN_LOG_WARN("validate_tensor_version() tensor %s, got unsupported version %d\n", tensor.v1.name, From 1679dcf47ea660254a7b7ccdbbcbd4d858370d5c Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 19 Jul 2024 22:56:00 +0800 Subject: [PATCH 091/143] fix: check all dimentions in `can offload` --- ggml/src/ggml-qnn.cpp | 11 ++++++++++- ggml/src/ggml-qnn/utils.cpp | 2 ++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index e448d73821cb3..aadf53c35d872 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -359,7 +359,16 @@ GGML_CALL static bool ggml_backend_qnn_supports_op(ggml_backend_t backend, const GGML_CALL static bool ggml_backend_qnn_offload_op(ggml_backend_t backend, const ggml_tensor *op) { GGML_UNUSED(backend); - return op->ne[0] > 1 && op->ne[1] > 1; + size_t dims = ggml_n_dims(op); + bool can_offload = false; + for (size_t i = 0; i < dims; i++) { + if (op->ne[i] > 1) { + can_offload = true; + break; + } 
+ } + + return can_offload; } static ggml_backend_i ggml_backend_qnn_interface = { diff --git a/ggml/src/ggml-qnn/utils.cpp b/ggml/src/ggml-qnn/utils.cpp index e36142f283d0d..70a898b95a63a 100644 --- a/ggml/src/ggml-qnn/utils.cpp +++ b/ggml/src/ggml-qnn/utils.cpp @@ -153,6 +153,8 @@ const char *opname_from_ggmlop(enum ggml_op ggmlop) { } const char *get_qnn_error_string(Qnn_ErrorHandle_t error) { + // A complete list of error codes can be found at here: + // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/api_error_codes.html switch (error) { case QNN_SUCCESS: return "QNN_SUCCESS"; From 28a00e5e6c9cc691bca6c49d706b7fbf81ba9625 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Sat, 20 Jul 2024 14:10:00 +0800 Subject: [PATCH 092/143] fix: try fix QNN_GRAPH_ERROR_INVALID_OP_CONFIG --- ggml/src/ggml-qnn.cpp | 6 ------ ggml/src/ggml-qnn/graph.hpp | 19 +++++++++++++------ 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index aadf53c35d872..3ca0dc607eb60 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -123,12 +123,6 @@ struct ggml_backend_qnn_buffer_type_context { std::string name; }; -// ================================================================================================= -// -// QNN backend internal helper functions -// -// ================================================================================================= - // ================================================================================================= // // implementation of QNN backend for GGML diff --git a/ggml/src/ggml-qnn/graph.hpp b/ggml/src/ggml-qnn/graph.hpp index 2d412dffd743a..30f96a994cd84 100644 --- a/ggml/src/ggml-qnn/graph.hpp +++ b/ggml/src/ggml-qnn/graph.hpp @@ -101,12 +101,19 @@ class ggml_qnn_graph { _tensor_inputs = tensor_inputs; _tensor_outputs = tensor_outputs; - Qnn_OpConfig_t op_config = { /*.version = */ QNN_OPCONFIG_VERSION_1, - /*.v1 = */ { _graph_name.c_str(), QNN_OP_PACKAGE_NAME_QTI_AISW, op_name.c_str(), - (uint32_t)_param_types.size(), _param_types.data(), - (uint32_t)_tensor_inputs.size(), _tensor_inputs.data(), - (uint32_t)_tensor_outputs.size(), _tensor_outputs.data() } }; - auto error = _qnn_interface->qnn_graph_add_node(_graph_handle, op_config); + Qnn_OpConfig_t config = QNN_OPCONFIG_INIT; + config.version = QNN_OPCONFIG_VERSION_1; + auto &op_config = config.v1; + op_config.name = _graph_name.c_str(); + op_config.packageName = QNN_OP_PACKAGE_NAME_QTI_AISW; + op_config.typeName = op_name.c_str(); + op_config.numOfParams = (uint32_t)_param_types.size(); + op_config.params = _param_types.data(); + op_config.numOfInputs = (uint32_t)_tensor_inputs.size(); + op_config.inputTensors = _tensor_inputs.data(); + op_config.numOfOutputs = (uint32_t)_tensor_outputs.size(); + op_config.outputTensors = _tensor_outputs.data(); + auto error = _qnn_interface->qnn_graph_add_node(_graph_handle, config); if (error != QNN_SUCCESS) { auto *error_str = get_qnn_error_string(error); if (error_str) { From 27299463ae74b8d72ce84780a61f25ad77634f0f Mon Sep 17 00:00:00 2001 From: hongruichen Date: Sat, 20 Jul 2024 14:23:44 +0800 Subject: [PATCH 093/143] fix: try fix tensor type error --- ggml/src/ggml-qnn/backend-ops.cpp | 4 ++-- ggml/src/ggml-qnn/tensor.hpp | 10 +++++++--- ggml/src/ggml-qnn/utils.cpp | 2 +- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 6367e7c7064d1..1e79205986918 100644 --- 
a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -74,7 +74,7 @@ bool qnn_bind_tensors_to_graph(qnn::ggml_qnn_graph<_InputSize, _OutputSize> *gra std::array qnn_input_tensors; for (size_t i = 0; i < inputs.size(); ++i) { auto tensor = qnn::ggml_qnn_tensor::from_ggml_tensor(inputs[i]); - if (!tensor || !tensor->bind_to_graph(*graph)) { + if (!tensor || !tensor->bind_to_graph(*graph, true)) { return false; } @@ -84,7 +84,7 @@ bool qnn_bind_tensors_to_graph(qnn::ggml_qnn_graph<_InputSize, _OutputSize> *gra std::array qnn_output_tensors; for (size_t i = 0; i < outputs.size(); ++i) { auto tensor = qnn::ggml_qnn_tensor::from_ggml_tensor(outputs[i]); - if (!tensor || !tensor->bind_to_graph(*graph)) { + if (!tensor || !tensor->bind_to_graph(*graph, false)) { return false; } diff --git a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp index e5dc436adaa5c..9137b5d86381c 100644 --- a/ggml/src/ggml-qnn/tensor.hpp +++ b/ggml/src/ggml-qnn/tensor.hpp @@ -43,7 +43,8 @@ class ggml_qnn_tensor { _dimensions[2] = (uint32_t)tensor->ne[2]; _dimensions[3] = (uint32_t)tensor->ne[3]; QNN_TENSOR_SET_DIMENSIONS(_qnn_tensor, _dimensions); - QNN_TENSOR_SET_TYPE(_qnn_tensor, device_tensortype_from_ggml_tensor(tensor)); + auto qnn_tensor_type = device_tensortype_from_ggml_tensor(tensor); + QNN_TENSOR_SET_TYPE(_qnn_tensor, qnn_tensor_type); QNN_TENSOR_SET_DATA_FORMAT(_qnn_tensor, QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER); QNN_TENSOR_SET_DATA_TYPE(_qnn_tensor, device_datatype_from_ggml_datatype(tensor->type)); // TODO: set the quantizeParams base on the tensor type @@ -54,11 +55,11 @@ class ggml_qnn_tensor { QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf); tensor->extra = this; - QNN_LOG_DEBUG("create tensor %s with device %d", _tensor_name.c_str(), device); + QNN_LOG_DEBUG("create tensor %s, device: %d, qnn_type: %d", _tensor_name.c_str(), device, (int)qnn_tensor_type); } template - bool bind_to_graph(ggml_qnn_graph<_InputSize, _OutputSize> &graph) { + bool bind_to_graph(ggml_qnn_graph<_InputSize, _OutputSize> &graph, bool is_input) { if (!is_valid()) { QNN_LOG_WARN("tensor %s not valid", _tensor_name.c_str()); return false; @@ -75,6 +76,9 @@ class ggml_qnn_tensor { } } + Qnn_TensorType_t new_tensor_type = is_input ? 
QNN_TENSOR_TYPE_APP_WRITE : QNN_TENSOR_TYPE_APP_READ; + QNN_TENSOR_SET_TYPE(_qnn_tensor, new_tensor_type); + QNN_LOG_INFO("tensor %s changed to type %d", _tensor_name.c_str(), new_tensor_type); Qnn_Tensor_t tensor = _qnn_tensor; if (!graph.create_graph_tensor(tensor)) { QNN_LOG_WARN("create graph tensor failed, tensor %s", _tensor_name.c_str()); diff --git a/ggml/src/ggml-qnn/utils.cpp b/ggml/src/ggml-qnn/utils.cpp index 70a898b95a63a..820b72b8969f8 100644 --- a/ggml/src/ggml-qnn/utils.cpp +++ b/ggml/src/ggml-qnn/utils.cpp @@ -30,7 +30,7 @@ Qnn_DataType_t device_datatype_from_ggml_datatype(ggml_type ggml_type) { } Qnn_TensorType_t device_tensortype_from_ggml_tensor(ggml_tensor *ggml_tensor) { - Qnn_TensorType_t qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; + Qnn_TensorType_t qnn_tensor_type = QNN_TENSOR_TYPE_NATIVE; if (ggml_tensor->flags & GGML_TENSOR_FLAG_INPUT) { qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; From 51f95d698004def0202d642ac0fe7e49f566d13d Mon Sep 17 00:00:00 2001 From: hongruichen Date: Sat, 20 Jul 2024 16:11:35 +0800 Subject: [PATCH 094/143] fix: dimension could be wrong for tensor liked 1x1x8 --- ggml/src/ggml-qnn/tensor.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp index 9137b5d86381c..7fb71de38787f 100644 --- a/ggml/src/ggml-qnn/tensor.hpp +++ b/ggml/src/ggml-qnn/tensor.hpp @@ -48,7 +48,7 @@ class ggml_qnn_tensor { QNN_TENSOR_SET_DATA_FORMAT(_qnn_tensor, QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER); QNN_TENSOR_SET_DATA_TYPE(_qnn_tensor, device_datatype_from_ggml_datatype(tensor->type)); // TODO: set the quantizeParams base on the tensor type - QNN_TENSOR_SET_RANK(_qnn_tensor, qnn::get_ggml_tensor_rank(tensor)); + QNN_TENSOR_SET_RANK(_qnn_tensor, (uint32_t)ggml_n_dims(tensor)); QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW); Qnn_ClientBuffer_t client_buf = {}; From 5f3b1ae3b0997c92399629ed9f98a2dff1b76eac Mon Sep 17 00:00:00 2001 From: hongruichen Date: Sat, 20 Jul 2024 16:21:09 +0800 Subject: [PATCH 095/143] fix: try fix graph cache with append the tensors name --- ggml/src/ggml-qnn.cpp | 7 ++++++- ggml/src/ggml-qnn/backend-ops.cpp | 19 ++++++++++++------- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index 3ca0dc607eb60..46718af09dbcb 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -96,9 +96,14 @@ class ggml_backend_qnn_buffer_context { bool is_valid() const { return _buffer != nullptr; } bool init_tensor(ggml_tensor *tensor) { + if (qnn::ggml_qnn_tensor::from_ggml_tensor(tensor)) { + QNN_LOG_INFO("tensor %s already initialized", tensor->name); + return true; + } + auto qnn_tensor = std::make_unique(tensor, _device, _instance); if (!qnn_tensor->is_valid()) { - QNN_LOG_WARN("Create ggml_qnn_tensor failed"); + QNN_LOG_WARN("create ggml_qnn_tensor failed"); return false; } diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 1e79205986918..f6eb61731381d 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -167,19 +167,23 @@ qnn::ggml_qnn_graph<_InputSize, _OutputSize> *get_qnn_graph_from_cache( auto &graph_cache = get_qnn_graph_cache(ctx, inputs, outputs); const auto *op_name = op < qnn::kGgmlUnaryOpStart ? 
ggml_op_name(ggml_op(op)) : ggml_unary_op_name(ggml_unary_op(op - qnn::kGgmlUnaryOpStart)); - const std::string graph_key(op_name); + std::string graph_key(op_name); + for (auto &input : inputs) { + graph_key += "_"; + graph_key += input->name; + } + for (auto &output : outputs) { + graph_key += "_"; + graph_key += output->name; + } + auto it = graph_cache.find(graph_key); graph_t *graph_ptr = nullptr; if (it != graph_cache.end()) { graph_ptr = it->second.get(); } else { - std::string graph_name = graph_key + "_" + std::to_string(ctx->threads); - for (auto &input : inputs) { - graph_name += "_"; - graph_name += input->name; - } auto graph = - std::make_unique(graph_name, (QNNBackend)(ctx->device), ctx->instance->get_qnn_context_handle(), + std::make_unique(graph_key, (QNNBackend)(ctx->device), ctx->instance->get_qnn_context_handle(), ctx->qnn_interface, ctx->socinfo.vtcm_size_in_mb); if (!graph->is_valid()) { @@ -187,6 +191,7 @@ qnn::ggml_qnn_graph<_InputSize, _OutputSize> *get_qnn_graph_from_cache( } if (!qnn_bind_tensors_to_graph<_InputSize, _OutputSize>(graph.get(), qnn_op.c_str(), inputs, outputs)) { + QNN_LOG_ERROR("qnn_bind_tensors_to_graph failed\n"); return nullptr; } From b173c4e061b1a4bb3bd3ed2e4b968b51a143c4b5 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Sat, 20 Jul 2024 16:57:51 +0800 Subject: [PATCH 096/143] feat: update tensor name when bind to graph --- ggml/src/ggml-qnn/backend-ops.cpp | 1 + ggml/src/ggml-qnn/tensor.hpp | 33 +++++++++++++++++++------------ 2 files changed, 21 insertions(+), 13 deletions(-) diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index f6eb61731381d..6896454aa55e7 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -180,6 +180,7 @@ qnn::ggml_qnn_graph<_InputSize, _OutputSize> *get_qnn_graph_from_cache( auto it = graph_cache.find(graph_key); graph_t *graph_ptr = nullptr; if (it != graph_cache.end()) { + QNN_LOG_DEBUG("found graph %s in cache\n", graph_key.c_str()); graph_ptr = it->second.get(); } else { auto graph = diff --git a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp index 7fb71de38787f..49e9258c38a60 100644 --- a/ggml/src/ggml-qnn/tensor.hpp +++ b/ggml/src/ggml-qnn/tensor.hpp @@ -29,14 +29,7 @@ class ggml_qnn_tensor { explicit ggml_qnn_tensor(ggml_tensor *tensor, QNNBackend device, std::shared_ptr qnn_instance) : _tensor(tensor), _device(device), _qnn_instance(qnn_instance) { - _tensor_name = ggml_get_name(tensor); - if (_tensor_name.empty()) { - static std::atomic_uint32_t unnamed_tensor_count = 0; - char buffer[GGML_MAX_NAME] = {}; - snprintf(buffer, sizeof(buffer), "unnamed_%d", (int)(unnamed_tensor_count++)); - _tensor_name = buffer; - } - + update_tensor_name(); QNN_TENSOR_SET_NAME(_qnn_tensor, _tensor_name.c_str()); _dimensions[0] = (uint32_t)tensor->ne[0]; _dimensions[1] = (uint32_t)tensor->ne[1]; @@ -79,6 +72,7 @@ class ggml_qnn_tensor { Qnn_TensorType_t new_tensor_type = is_input ? 
QNN_TENSOR_TYPE_APP_WRITE : QNN_TENSOR_TYPE_APP_READ; QNN_TENSOR_SET_TYPE(_qnn_tensor, new_tensor_type); QNN_LOG_INFO("tensor %s changed to type %d", _tensor_name.c_str(), new_tensor_type); + update_tensor_name(); Qnn_Tensor_t tensor = _qnn_tensor; if (!graph.create_graph_tensor(tensor)) { QNN_LOG_WARN("create graph tensor failed, tensor %s", _tensor_name.c_str()); @@ -116,15 +110,14 @@ class ggml_qnn_tensor { auto tensor_type = QNN_TENSOR_GET_TYPE(_qnn_tensor); if (tensor_type != QNN_TENSOR_TYPE_APP_WRITE && tensor_type != QNN_TENSOR_TYPE_APP_READWRITE) { - QNN_LOG_WARN("tensor %s not writable", _tensor_name.c_str()); - return false; + QNN_LOG_WARN("tensor %s type(%d) not WRITE", _tensor_name.c_str(), (int)tensor_type); } if (should_use_mem_handle()) { if (_qnn_rpc_buffer) { memcpy(_qnn_rpc_buffer, _tensor->data, ggml_nbytes(_tensor)); } else { - QNN_LOG_WARN("can't find rpcmem from qnn mem handle\n"); + QNN_LOG_WARN("tensor %s: can't find rpcmem from qnn mem handle\n", _tensor_name.c_str()); return false; } } @@ -142,8 +135,7 @@ class ggml_qnn_tensor { auto tensor_type = QNN_TENSOR_GET_TYPE(_qnn_tensor); if (tensor_type != QNN_TENSOR_TYPE_APP_READ && tensor_type != QNN_TENSOR_TYPE_APP_READWRITE) { - QNN_LOG_WARN("tensor %s not readable", _tensor_name.c_str()); - return false; + QNN_LOG_WARN("tensor %s type(%d) not READ", _tensor_name.c_str(), (int)tensor_type); } if (should_use_mem_handle()) { @@ -190,6 +182,21 @@ class ggml_qnn_tensor { bool should_use_mem_handle() const { return _device == QNN_BACKEND_NPU; } + void update_tensor_name() { + auto *tensor_name = ggml_get_name(_tensor); + if (!strnlen(tensor_name, GGML_MAX_NAME)) { + if (_tensor_name.empty()) { + static std::atomic_uint32_t unnamed_tensor_count = 0; + char buffer[GGML_MAX_NAME] = {}; + snprintf(buffer, sizeof(buffer), "unnamed_%d", (int)(unnamed_tensor_count++)); + _tensor_name = buffer; + } + } else { + QNN_LOG_DEBUG("tensor name changed: %s -> %s", _tensor_name.c_str(), tensor_name); + _tensor_name = tensor_name; + } + } + const ggml_tensor *_tensor; QNNBackend _device; std::shared_ptr _qnn_instance; From 3b47056c97a01fd176ce46ce969b95d71884919f Mon Sep 17 00:00:00 2001 From: hongruichen Date: Mon, 22 Jul 2024 12:45:26 +0800 Subject: [PATCH 097/143] refactoring: change the tensor binding mode between qnn tensor and ggml tensor --- ggml/src/ggml-qnn.cpp | 36 +------ ggml/src/ggml-qnn/backend-ops.cpp | 126 ++++------------------ ggml/src/ggml-qnn/backend-ops.hpp | 4 +- ggml/src/ggml-qnn/backend.hpp | 6 +- ggml/src/ggml-qnn/graph.hpp | 120 ++++++++++++++------- ggml/src/ggml-qnn/qnn-lib.hpp | 2 +- ggml/src/ggml-qnn/tensor.hpp | 169 +++++++++++++++--------------- ggml/src/ggml-qnn/utils.hpp | 109 ++++++++++--------- 8 files changed, 256 insertions(+), 316 deletions(-) diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index 46718af09dbcb..87653cfb1f741 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -87,30 +87,12 @@ class ggml_backend_qnn_buffer_context { } ~ggml_backend_qnn_buffer_context() { - _tensors.clear(); - // the free will do nothing if the _buffer is nullptr qnn::align_free(_buffer); } bool is_valid() const { return _buffer != nullptr; } - bool init_tensor(ggml_tensor *tensor) { - if (qnn::ggml_qnn_tensor::from_ggml_tensor(tensor)) { - QNN_LOG_INFO("tensor %s already initialized", tensor->name); - return true; - } - - auto qnn_tensor = std::make_unique(tensor, _device, _instance); - if (!qnn_tensor->is_valid()) { - QNN_LOG_WARN("create ggml_qnn_tensor failed"); - return false; - } - - 
_tensors.push_back(std::move(qnn_tensor)); - return true; - } - void *get_buffer() { return _buffer; } size_t get_buffer_size() { return _buffer_size; } @@ -118,7 +100,6 @@ class ggml_backend_qnn_buffer_context { QNNBackend _device; std::shared_ptr _instance; std::string _name; - std::list> _tensors; void *_buffer = nullptr; size_t _buffer_size = 0; }; @@ -175,12 +156,9 @@ GGML_CALL static void *ggml_backend_qnn_buffer_get_base(ggml_backend_buffer_t bu } GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor) { - ggml_backend_qnn_buffer_context *ctx = (ggml_backend_qnn_buffer_context *)buffer->context; - - if (!ctx->init_tensor(tensor)) { - QNN_LOG_WARN("init ggml_qnn_tensor failed"); - return; - } + // Do nothing here, the qnn tensor will be create along with the graph. + GGML_UNUSED(buffer); + GGML_UNUSED(tensor); } GGML_CALL static void ggml_backend_qnn_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor, @@ -271,13 +249,7 @@ GGML_CALL static void ggml_backend_qnn_free(ggml_backend_t backend) { auto instance = g_qnn_mgr[ctx->device].instance; if (instance) { - ctx->qnn_unary_graph_cache.clear(); - for (const auto &graph_item : ctx->qnn_binary_graph_cache) { - QNN_LOG_INFO("graph type:%s", graph_item.first.c_str()); - } - - ctx->qnn_binary_graph_cache.clear(); - + ctx->qnn_graph_cache.clear(); instance->qnn_finalize(); g_qnn_mgr[ctx->device].instance.reset(); } diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 6896454aa55e7..bd87cfc9e66eb 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -19,10 +19,8 @@ bool qnn_is_valid_params(ggml_backend_qnn_context *ctx, const ggml_tensor *src, } auto instance = ctx->instance; - auto *tensor0 = qnn::ggml_qnn_tensor::from_ggml_tensor(src); - auto *tensor1 = qnn::ggml_qnn_tensor::from_ggml_tensor(dst); - if (!instance || !tensor0 || !tensor1) { - QNN_LOG_WARN("invalid tensors\n"); + if (!instance) { + QNN_LOG_WARN("invalid instance\n"); return false; } @@ -37,11 +35,8 @@ bool qnn_is_valid_params(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, } auto instance = ctx->instance; - auto *tensor0 = qnn::ggml_qnn_tensor::from_ggml_tensor(src0); - auto *tensor1 = qnn::ggml_qnn_tensor::from_ggml_tensor(src1); - auto *tensor2 = qnn::ggml_qnn_tensor::from_ggml_tensor(dst); - if (!instance || !tensor0 || !tensor1 || !tensor2) { - QNN_LOG_WARN("invalid tensors\n"); + if (!instance) { + QNN_LOG_WARN("invalid instance\n"); return false; } @@ -67,104 +62,29 @@ void print_ggml_tensor(const ggml_tensor *tensor) { tensor->nb[0], tensor->nb[1], tensor->nb[2]); } -template -bool qnn_bind_tensors_to_graph(qnn::ggml_qnn_graph<_InputSize, _OutputSize> *graph, const std::string &op_name, - const std::array &inputs, - const std::array &outputs) { - std::array qnn_input_tensors; - for (size_t i = 0; i < inputs.size(); ++i) { - auto tensor = qnn::ggml_qnn_tensor::from_ggml_tensor(inputs[i]); - if (!tensor || !tensor->bind_to_graph(*graph, true)) { - return false; - } - - qnn_input_tensors[i] = tensor->get_qnn_tensor(); - } - - std::array qnn_output_tensors; - for (size_t i = 0; i < outputs.size(); ++i) { - auto tensor = qnn::ggml_qnn_tensor::from_ggml_tensor(outputs[i]); - if (!tensor || !tensor->bind_to_graph(*graph, false)) { - return false; - } - - qnn_output_tensors[i] = tensor->get_qnn_tensor(); - } - - if (!graph->add_nodes(op_name, qnn_input_tensors, qnn_output_tensors)) { - return false; - } - - return true; +template 
+qnn::ggml_tensor_array_t to_ggml_tensor_array(const std::array &array) { + return qnn::ggml_tensor_array_t(array.data(), array.data() + _Size); } template -bool execute_graph(qnn::ggml_qnn_graph<_InputSize, _OutputSize> *graph, - const std::array &inputs, +bool execute_graph(qnn::ggml_qnn_graph *graph, const std::array &inputs, const std::array &outputs) { - - std::array qnn_input_tensors; - for (size_t i = 0; i < inputs.size(); ++i) { - auto tensor = qnn::ggml_qnn_tensor::from_ggml_tensor(inputs[i]); - if (!tensor || !tensor->write_to_qnn_tensor()) { - QNN_LOG_WARN("write_to_qnn_tensor failed\n"); - return false; - } - - qnn_input_tensors[i] = tensor->get_qnn_tensor(); - } - - std::array qnn_output_tensors; - for (size_t i = 0; i < outputs.size(); ++i) { - auto tensor = qnn::ggml_qnn_tensor::from_ggml_tensor(outputs[i]); - if (!tensor) { - return false; - } - - qnn_output_tensors[i] = tensor->get_qnn_tensor(); - } - - if (!graph->execute(qnn_input_tensors, qnn_output_tensors)) { + if (!graph->execute(to_ggml_tensor_array<_InputSize>(inputs), to_ggml_tensor_array<_OutputSize>(outputs))) { QNN_LOG_WARN("execute failed\n"); return false; } - for (auto &output : outputs) { - auto tensor = qnn::ggml_qnn_tensor::from_ggml_tensor(output); - if (!tensor || !tensor->read_from_qnn_tensor()) { - QNN_LOG_WARN("read_from_qnn_tensors failed\n"); - return false; - } - } - return true; } -qnn::ggml_qnn_unary_graph_cache_t &get_qnn_graph_cache(ggml_backend_qnn_context *ctx, - const std::array &inputs, - const std::array &outputs) { - GGML_UNUSED(inputs); - GGML_UNUSED(outputs); - return ctx->qnn_unary_graph_cache; -} - -qnn::ggml_qnn_binary_graph_cache_t &get_qnn_graph_cache(ggml_backend_qnn_context *ctx, - const std::array &inputs, - const std::array &outputs) { - GGML_UNUSED(inputs); - GGML_UNUSED(outputs); - return ctx->qnn_binary_graph_cache; -} - template -qnn::ggml_qnn_graph<_InputSize, _OutputSize> *get_qnn_graph_from_cache( - ggml_backend_qnn_context *ctx, size_t op, const std::string &qnn_op, - const std::array &inputs, const std::array &outputs) { - using graph_t = qnn::ggml_qnn_graph<_InputSize, _OutputSize>; - +qnn::ggml_qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_context *ctx, size_t op, const std::string &qnn_op, + const std::array &inputs, + const std::array &outputs) { GGML_ASSERT(op < (GGML_OP_COUNT + GGML_UNARY_OP_COUNT)); - auto &graph_cache = get_qnn_graph_cache(ctx, inputs, outputs); + auto &graph_cache = ctx->qnn_graph_cache; const auto *op_name = op < qnn::kGgmlUnaryOpStart ? 
ggml_op_name(ggml_op(op)) : ggml_unary_op_name(ggml_unary_op(op - qnn::kGgmlUnaryOpStart)); std::string graph_key(op_name); @@ -178,21 +98,21 @@ qnn::ggml_qnn_graph<_InputSize, _OutputSize> *get_qnn_graph_from_cache( } auto it = graph_cache.find(graph_key); - graph_t *graph_ptr = nullptr; + qnn::ggml_qnn_graph *graph_ptr = nullptr; if (it != graph_cache.end()) { QNN_LOG_DEBUG("found graph %s in cache\n", graph_key.c_str()); graph_ptr = it->second.get(); } else { - auto graph = - std::make_unique(graph_key, (QNNBackend)(ctx->device), ctx->instance->get_qnn_context_handle(), - ctx->qnn_interface, ctx->socinfo.vtcm_size_in_mb); + auto graph = std::make_unique(graph_key, (QNNBackend)(ctx->device), ctx->instance, + ctx->socinfo.vtcm_size_in_mb); if (!graph->is_valid()) { return nullptr; } - if (!qnn_bind_tensors_to_graph<_InputSize, _OutputSize>(graph.get(), qnn_op.c_str(), inputs, outputs)) { - QNN_LOG_ERROR("qnn_bind_tensors_to_graph failed\n"); + if (!graph->build_graph(qnn_op, to_ggml_tensor_array<_InputSize>(inputs), + to_ggml_tensor_array<_OutputSize>(outputs))) { + QNN_LOG_ERROR("build_graph failed\n"); return nullptr; } @@ -309,15 +229,13 @@ static_assert(kGgmlOpToQnnOp[GGML_UNARY_OP_GELU + qnn::kGgmlUnaryOpStart] != nul "GGML_UNARY_OP_GELU does not correspond to QNN_OP_GELU"); template -bool qnn_binary_op_impl(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) { +bool qnn_binary_op_impl(ggml_backend_qnn_context *ctx, ggml_tensor *src0, ggml_tensor *src1, ggml_tensor *dst) { static_assert(kGgmlOpToQnnOp[_GgmlOp] != nullptr, "GGML_OP does not have a corresponding QNN_OP"); CHECK_PARAMS(ctx, src0, src1, dst); bool succeed = false; - qnn::ggml_qnn_graph_binary *graph_ptr = - get_qnn_graph_from_cache<2, 1>(ctx, _GgmlOp, kGgmlOpToQnnOp[_GgmlOp], { src0, src1 }, { dst }); + auto *graph_ptr = get_qnn_graph_from_cache<2, 1>(ctx, _GgmlOp, kGgmlOpToQnnOp[_GgmlOp], { src0, src1 }, { dst }); if (graph_ptr) { succeed = execute_graph<2, 1>(graph_ptr, { src0, src1 }, { dst }); } @@ -332,7 +250,7 @@ bool qnn_binary_op_impl(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, } template -bool qnn_unary_op_impl(ggml_backend_qnn_context *ctx, const ggml_tensor *src, ggml_tensor *dst) { +bool qnn_unary_op_impl(ggml_backend_qnn_context *ctx, ggml_tensor *src, ggml_tensor *dst) { static_assert(kGgmlOpToQnnOp[_GgmlOp] != nullptr, "GGML_OP does not have a corresponding QNN_OP"); CHECK_PARAMS(ctx, src, dst); diff --git a/ggml/src/ggml-qnn/backend-ops.hpp b/ggml/src/ggml-qnn/backend-ops.hpp index 8cc2dc366fbfa..614bcf651b86b 100644 --- a/ggml/src/ggml-qnn/backend-ops.hpp +++ b/ggml/src/ggml-qnn/backend-ops.hpp @@ -6,8 +6,8 @@ namespace qnn { -typedef bool (*ggml_qnn_unary_op_t)(ggml_backend_qnn_context *ctx, const ggml_tensor *src, ggml_tensor *dst); -typedef bool (*ggml_qnn_binary_op_t)(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, +typedef bool (*ggml_qnn_unary_op_t)(ggml_backend_qnn_context *ctx, ggml_tensor *src, ggml_tensor *dst); +typedef bool (*ggml_qnn_binary_op_t)(ggml_backend_qnn_context *ctx, ggml_tensor *src0, ggml_tensor *src1, ggml_tensor *dst); typedef const ggml_qnn_unary_op_t (&ggml_qnn_unary_op_array_t)[GGML_OP_COUNT + GGML_UNARY_OP_COUNT]; diff --git a/ggml/src/ggml-qnn/backend.hpp b/ggml/src/ggml-qnn/backend.hpp index 32f3c6cd445f6..b2f93a8f7a9e5 100644 --- a/ggml/src/ggml-qnn/backend.hpp +++ b/ggml/src/ggml-qnn/backend.hpp @@ -12,8 +12,7 @@ #include "qnn-lib.hpp" namespace qnn { -typedef 
std::unordered_map> ggml_qnn_unary_graph_cache_t; -typedef std::unordered_map> ggml_qnn_binary_graph_cache_t; +typedef std::unordered_map> ggml_qnn_graph_cache_t; } // namespace qnn struct ggml_backend_qnn_context { @@ -25,8 +24,7 @@ struct ggml_backend_qnn_context { qnn::qcom_socinfo socinfo = {}; std::shared_ptr instance; std::shared_ptr qnn_interface; - qnn::ggml_qnn_unary_graph_cache_t qnn_unary_graph_cache; - qnn::ggml_qnn_binary_graph_cache_t qnn_binary_graph_cache; + qnn::ggml_qnn_graph_cache_t qnn_graph_cache; explicit ggml_backend_qnn_context(int device, int threads, const char *name, const char *lib) : device(device), threads(threads) { diff --git a/ggml/src/ggml-qnn/graph.hpp b/ggml/src/ggml-qnn/graph.hpp index 30f96a994cd84..9941365f7e897 100644 --- a/ggml/src/ggml-qnn/graph.hpp +++ b/ggml/src/ggml-qnn/graph.hpp @@ -1,27 +1,29 @@ #pragma once -#include +#include #include +#include #include "ggml-qnn.h" #include "logger.hpp" #include "qnn-lib.hpp" +#include "tensor.hpp" namespace qnn { -template +using ggml_tensor_array_t = std::vector; + class ggml_qnn_graph { public: - typedef std::array input_tensor_array_t; - typedef std::array output_tensor_array_t; - - explicit ggml_qnn_graph(const std::string &graph_name, QNNBackend device, Qnn_ContextHandle_t qnn_context, - std::shared_ptr qnn_interface, size_t vtcm_size_in_mb) : - _graph_name(graph_name), _device(device), _qnn_interface(qnn_interface) { + explicit ggml_qnn_graph(const std::string &graph_name, QNNBackend device, + std::shared_ptr qnn_instance, size_t vtcm_size_in_mb) : + _graph_name(graph_name), _device(device), _qnn_instance(qnn_instance) { QNN_LOG_INFO("graph name %s", graph_name.c_str()); + auto qnn_interface = qnn_instance->get_qnn_interface(); + auto qnn_context = qnn_instance->get_qnn_context_handle(); Qnn_ErrorHandle_t error = QNN_SUCCESS; Qnn_GraphHandle_t graph_handle = nullptr; if (device == QNN_BACKEND_NPU) { @@ -72,35 +74,53 @@ class ggml_qnn_graph { QNN_LOG_INFO("create qnn graph handle with graph name %s ok\n", graph_name.c_str()); _graph_handle = graph_handle; + _qnn_interface = qnn_interface; } - bool create_graph_tensor(Qnn_Tensor_t &tensor) { + ~ggml_qnn_graph() { QNN_LOG_DEBUG("graph name %s, destroy", _graph_name.c_str()); } + + bool build_graph(const std::string &op_name, const ggml_tensor_array_t &tensor_inputs, + const ggml_tensor_array_t &tensor_outputs) { if (!is_valid()) { QNN_LOG_ERROR("Invalid graph\n"); return false; } - auto err = _qnn_interface->qnn_tensor_create_graph_tensor(_graph_handle, &tensor); - if (err != QNN_SUCCESS) { - QNN_LOG_INFO("error = %d\n", err); - QNN_LOG_DEBUG("tensor%p name %s", &tensor, QNN_TENSOR_GET_NAME(tensor)); - return false; + QNN_LOG_DEBUG("graph name %s, add_nodes start", _graph_name.c_str()); + _qnn_tensor_inputs.resize(tensor_inputs.size()); + _tensor_inputs.resize(tensor_inputs.size()); + for (size_t i = 0; i < tensor_inputs.size(); i++) { + char buffer[GGML_MAX_NAME] = {}; + snprintf(buffer, GGML_MAX_NAME, "src%d", (int)i); + auto qnn_tensor = + std::make_shared(std::string(buffer), _device, _graph_handle, _qnn_instance); + auto *ggml_tensor = tensor_inputs[i]; + if (!qnn_tensor->bind_ggml_tensor(ggml_tensor, true)) { + QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor)); + return false; + } + + _qnn_tensor_inputs[i] = qnn_tensor->get_qnn_tensor(); + _tensor_inputs[i] = qnn_tensor; } - return true; - } + _qnn_tensor_outputs.resize(tensor_outputs.size()); + _tensor_outputs.resize(tensor_outputs.size()); + for (size_t i = 0; i < 
tensor_outputs.size(); i++) { + char buffer[GGML_MAX_NAME] = {}; + snprintf(buffer, GGML_MAX_NAME, "dst%d", (int)i); + auto qnn_tensor = + std::make_shared(std::string(buffer), _device, _graph_handle, _qnn_instance); + auto *ggml_tensor = tensor_inputs[i]; + if (!qnn_tensor->bind_ggml_tensor(ggml_tensor, false)) { + QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor)); + return false; + } - bool add_nodes(const std::string &op_name, const input_tensor_array_t &tensor_inputs, - const output_tensor_array_t &tensor_outputs) { - if (!is_valid()) { - QNN_LOG_ERROR("Invalid graph\n"); - return false; + _qnn_tensor_outputs[i] = qnn_tensor->get_qnn_tensor(); + _tensor_outputs[i] = qnn_tensor; } - QNN_LOG_DEBUG("graph name %s, add_nodes start", _graph_name.c_str()); - _tensor_inputs = tensor_inputs; - _tensor_outputs = tensor_outputs; - Qnn_OpConfig_t config = QNN_OPCONFIG_INIT; config.version = QNN_OPCONFIG_VERSION_1; auto &op_config = config.v1; @@ -109,10 +129,10 @@ class ggml_qnn_graph { op_config.typeName = op_name.c_str(); op_config.numOfParams = (uint32_t)_param_types.size(); op_config.params = _param_types.data(); - op_config.numOfInputs = (uint32_t)_tensor_inputs.size(); - op_config.inputTensors = _tensor_inputs.data(); - op_config.numOfOutputs = (uint32_t)_tensor_outputs.size(); - op_config.outputTensors = _tensor_outputs.data(); + op_config.numOfInputs = (uint32_t)_qnn_tensor_inputs.size(); + op_config.inputTensors = _qnn_tensor_inputs.data(); + op_config.numOfOutputs = (uint32_t)_qnn_tensor_outputs.size(); + op_config.outputTensors = _qnn_tensor_outputs.data(); auto error = _qnn_interface->qnn_graph_add_node(_graph_handle, config); if (error != QNN_SUCCESS) { auto *error_str = get_qnn_error_string(error); @@ -139,12 +159,32 @@ class ggml_qnn_graph { return true; } - bool execute(const input_tensor_array_t &tensor_inputs, const output_tensor_array_t &tensor_outputs) { - _tensor_inputs = tensor_inputs; - _tensor_outputs = tensor_outputs; + bool execute(const ggml_tensor_array_t &tensor_inputs, const ggml_tensor_array_t &tensor_outputs) { + GGML_ASSERT(tensor_inputs.size() == _tensor_inputs.size()); + GGML_ASSERT(tensor_outputs.size() == _tensor_outputs.size()); + for (size_t i = 0; i < tensor_inputs.size(); i++) { + auto *ggml_tensor = tensor_inputs[i]; + if (!_tensor_inputs[i]->bind_ggml_tensor(ggml_tensor, true)) { + QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor)); + return false; + } + + _qnn_tensor_inputs[i] = _tensor_inputs[i]->get_qnn_tensor(); + } + + for (size_t i = 0; i < tensor_outputs.size(); i++) { + auto *ggml_tensor = tensor_inputs[i]; + if (!_tensor_outputs[i]->bind_ggml_tensor(ggml_tensor, false)) { + QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor)); + return false; + } + + _qnn_tensor_outputs[i] = _tensor_outputs[i]->get_qnn_tensor(); + } + auto error = - _qnn_interface->qnn_graph_execute(_graph_handle, _tensor_inputs.data(), _tensor_inputs.size(), - _tensor_outputs.data(), _tensor_outputs.size(), nullptr, nullptr); + _qnn_interface->qnn_graph_execute(_graph_handle, _qnn_tensor_inputs.data(), _qnn_tensor_inputs.size(), + _qnn_tensor_outputs.data(), _qnn_tensor_outputs.size(), nullptr, nullptr); if (_device == QNN_BACKEND_NPU) { if (error == QNN_COMMON_ERROR_SYSTEM_COMMUNICATION) { QNN_LOG_WARN("NPU crashed. SSR detected. 
Caused QNN graph execute error\n"); @@ -168,10 +208,13 @@ class ggml_qnn_graph { private: const std::string _graph_name; const QNNBackend _device; - std::shared_ptr _qnn_interface; Qnn_GraphHandle_t _graph_handle = nullptr; - std::array _tensor_inputs; - std::array _tensor_outputs; + std::shared_ptr _qnn_instance; + std::shared_ptr _qnn_interface; + std::vector> _tensor_inputs; + std::vector> _tensor_outputs; + std::vector _qnn_tensor_inputs; + std::vector _qnn_tensor_outputs; std::vector _param_types; ggml_qnn_graph(const ggml_qnn_graph &) = delete; @@ -180,7 +223,4 @@ class ggml_qnn_graph { void operator=(ggml_qnn_graph &&) = delete; }; -using ggml_qnn_graph_binary = ggml_qnn_graph<2, 1>; -using ggml_qnn_graph_unary = ggml_qnn_graph<1, 1>; - } // namespace qnn diff --git a/ggml/src/ggml-qnn/qnn-lib.hpp b/ggml/src/ggml-qnn/qnn-lib.hpp index 517df493ccb16..4e1dcb34c119f 100644 --- a/ggml/src/ggml-qnn/qnn-lib.hpp +++ b/ggml/src/ggml-qnn/qnn-lib.hpp @@ -366,7 +366,7 @@ class qnn_instance { size_t probe_slots[] = { 1024, 1536, 2048 - 48, 2048 }; size_t probe_counts = sizeof(probe_slots) / sizeof(size_t); for (size_t idx = 0; idx < probe_counts; idx++) { - rpc_buffer = static_cast(alloc_rpcmem(probe_slots[idx] * size_in_mb, 4)); + rpc_buffer = static_cast(alloc_rpcmem(probe_slots[idx] * size_in_mb, sizeof(void *))); if (!rpc_buffer) { QNN_LOG_DEBUG("alloc rpcmem %d (MB) failure, %s\n", probe_slots[idx], strerror(errno)); break; diff --git a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp index 49e9258c38a60..5e45266b40b9b 100644 --- a/ggml/src/ggml-qnn/tensor.hpp +++ b/ggml/src/ggml-qnn/tensor.hpp @@ -10,7 +10,6 @@ #include "QnnTensor.h" #include "System/QnnSystemInterface.h" -#include "graph.hpp" #include "logger.hpp" #include "qnn-lib.hpp" #include "utils.hpp" @@ -19,68 +18,47 @@ namespace qnn { class ggml_qnn_tensor { public: - static ggml_qnn_tensor *from_ggml_tensor(const ggml_tensor *tensor) { - if (!tensor) { - return nullptr; - } - - return static_cast(tensor->extra); - } - - explicit ggml_qnn_tensor(ggml_tensor *tensor, QNNBackend device, std::shared_ptr qnn_instance) : - _tensor(tensor), _device(device), _qnn_instance(qnn_instance) { - update_tensor_name(); + explicit ggml_qnn_tensor(const std::string &name, QNNBackend device, Qnn_GraphHandle_t graph_handle, + std::shared_ptr qnn_instance) : + _tensor_name(name), _device(device), _qnn_instance(qnn_instance), _graph_handle(graph_handle) { QNN_TENSOR_SET_NAME(_qnn_tensor, _tensor_name.c_str()); - _dimensions[0] = (uint32_t)tensor->ne[0]; - _dimensions[1] = (uint32_t)tensor->ne[1]; - _dimensions[2] = (uint32_t)tensor->ne[2]; - _dimensions[3] = (uint32_t)tensor->ne[3]; - QNN_TENSOR_SET_DIMENSIONS(_qnn_tensor, _dimensions); - auto qnn_tensor_type = device_tensortype_from_ggml_tensor(tensor); - QNN_TENSOR_SET_TYPE(_qnn_tensor, qnn_tensor_type); + QNN_TENSOR_SET_DIMENSIONS(_qnn_tensor, _dimensions.data()); + QNN_TENSOR_SET_TYPE(_qnn_tensor, QNN_TENSOR_TYPE_NATIVE); QNN_TENSOR_SET_DATA_FORMAT(_qnn_tensor, QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER); - QNN_TENSOR_SET_DATA_TYPE(_qnn_tensor, device_datatype_from_ggml_datatype(tensor->type)); - // TODO: set the quantizeParams base on the tensor type - QNN_TENSOR_SET_RANK(_qnn_tensor, (uint32_t)ggml_n_dims(tensor)); - - QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW); - Qnn_ClientBuffer_t client_buf = {}; - QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf); - - tensor->extra = this; - QNN_LOG_DEBUG("create tensor %s, device: %d, qnn_type: %d", _tensor_name.c_str(), device, 
(int)qnn_tensor_type); + QNN_LOG_DEBUG("create tensor %s, device: %d", _tensor_name.c_str(), device); } - template - bool bind_to_graph(ggml_qnn_graph<_InputSize, _OutputSize> &graph, bool is_input) { - if (!is_valid()) { - QNN_LOG_WARN("tensor %s not valid", _tensor_name.c_str()); - return false; - } - - if (_graph_handle) { - if (_graph_handle != graph.get_graph_handler()) { - QNN_LOG_WARN("tensor %s has been bound to another graph", _tensor_name.c_str()); + bool bind_ggml_tensor(ggml_tensor *tensor, bool is_input) { + if (_tensor) { + if (_tensor != tensor) { + QNN_LOG_WARN("tensor %s has been bound to another ggml tensor %s", _tensor_name.c_str(), + ggml_get_name(_tensor)); return false; } else { - QNN_LOG_INFO("tensor %s already bound to same graph %s", _tensor_name.c_str(), - graph.get_name().c_str()); + QNN_LOG_INFO("tensor %s already bound to same ggml tensor %s", _tensor_name.c_str(), + ggml_get_name(_tensor)); return true; } } + update_params_from_ggml_tensor(tensor); Qnn_TensorType_t new_tensor_type = is_input ? QNN_TENSOR_TYPE_APP_WRITE : QNN_TENSOR_TYPE_APP_READ; QNN_TENSOR_SET_TYPE(_qnn_tensor, new_tensor_type); QNN_LOG_INFO("tensor %s changed to type %d", _tensor_name.c_str(), new_tensor_type); - update_tensor_name(); - Qnn_Tensor_t tensor = _qnn_tensor; - if (!graph.create_graph_tensor(tensor)) { - QNN_LOG_WARN("create graph tensor failed, tensor %s", _tensor_name.c_str()); - return false; + + if (!QNN_TENSOR_GET_ID(_qnn_tensor)) { + Qnn_Tensor_t qnn_tensor = _qnn_tensor; + auto qnn_interface = _qnn_instance->get_qnn_interface(); + auto error = qnn_interface->qnn_tensor_create_graph_tensor(_graph_handle, &qnn_tensor); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("create graph tensor failed, tensor %s, error: %d\n", _tensor_name.c_str(), error); + return false; + } + QNN_TENSOR_SET_ID(_qnn_tensor, QNN_TENSOR_GET_ID(qnn_tensor)); } if (should_use_mem_handle()) { - _qnn_rpc_buffer = alloc_rpc_mem(); + _qnn_rpc_buffer = alloc_rpc_mem(ggml_nbytes(tensor)); if (!_qnn_rpc_buffer) { QNN_LOG_WARN("alloc rpc mem failed, tensor %s", _tensor_name.c_str()); return false; @@ -89,28 +67,59 @@ class ggml_qnn_tensor { QNN_LOG_DEBUG("tensor %s, use mem handle %p", _tensor_name.c_str(), QNN_TENSOR_GET_MEM_HANDLE(_qnn_tensor)); } else { QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW); - Qnn_ClientBuffer_t client_buf = { _tensor->data, get_ggml_tensor_data_size(_tensor) }; + Qnn_ClientBuffer_t client_buf = { tensor->data, get_ggml_tensor_data_size(tensor) }; QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf); QNN_LOG_DEBUG("tensor %s, use client buffer %p size %d", _tensor_name.c_str(), client_buf.data, (int)client_buf.dataSize); } - QNN_TENSOR_SET_ID(_qnn_tensor, QNN_TENSOR_GET_ID(tensor)); - _graph_handle = graph.get_graph_handler(); + _tensor = tensor; - QNN_LOG_DEBUG("bind tensor %s to graph %s", _tensor_name.c_str(), graph.get_name().c_str()); + if (!write_to_qnn_tensor()) { + QNN_LOG_WARN("write to qnn tensor failed, tensor %s", _tensor_name.c_str()); + return false; + } + + QNN_LOG_DEBUG("bind tensor %s to ggml tensor %s", _tensor_name.c_str(), ggml_get_name(tensor)); return true; } - bool write_to_qnn_tensor() { - if (!is_valid()) { - QNN_LOG_WARN("tensor %s not valid", _tensor_name.c_str()); + bool unbind_ggml_tensor() { + if (!_graph_handle) { + QNN_LOG_WARN("tensor %s not bound to any graph", _tensor_name.c_str()); + return false; + } + + if (!_tensor) { + QNN_LOG_DEBUG("tensor %s not bound to ggml tensor", _tensor_name.c_str()); + return true; + } + + if 
(!read_from_qnn_tensor()) { + QNN_LOG_WARN("read from qnn tensor failed, tensor %s", _tensor_name.c_str()); return false; } + if (!should_use_mem_handle()) { + QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW); + Qnn_ClientBuffer_t client_buf = {}; + QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf); + QNN_LOG_DEBUG("tensor %s, clear client buffer", _tensor_name.c_str()); + } + + _tensor = nullptr; + QNN_LOG_DEBUG("unbind tensor: %s from ggml tensor: %s", _tensor_name.c_str(), _tensor->name); + return true; + } + + const Qnn_Tensor_t &get_qnn_tensor() const { return _qnn_tensor; } + +private: + bool write_to_qnn_tensor() { auto tensor_type = QNN_TENSOR_GET_TYPE(_qnn_tensor); if (tensor_type != QNN_TENSOR_TYPE_APP_WRITE && tensor_type != QNN_TENSOR_TYPE_APP_READWRITE) { - QNN_LOG_WARN("tensor %s type(%d) not WRITE", _tensor_name.c_str(), (int)tensor_type); + QNN_LOG_DEBUG("tensor %s type(%d) not WRITE", _tensor_name.c_str(), (int)tensor_type); + return true; } if (should_use_mem_handle()) { @@ -128,14 +137,10 @@ class ggml_qnn_tensor { } bool read_from_qnn_tensor() { - if (!is_valid()) { - QNN_LOG_WARN("tensor %s not valid", _tensor_name.c_str()); - return false; - } - auto tensor_type = QNN_TENSOR_GET_TYPE(_qnn_tensor); if (tensor_type != QNN_TENSOR_TYPE_APP_READ && tensor_type != QNN_TENSOR_TYPE_APP_READWRITE) { - QNN_LOG_WARN("tensor %s type(%d) not READ", _tensor_name.c_str(), (int)tensor_type); + QNN_LOG_DEBUG("tensor %s type(%d) not READ", _tensor_name.c_str(), (int)tensor_type); + return true; } if (should_use_mem_handle()) { @@ -152,13 +157,8 @@ class ggml_qnn_tensor { return true; } - bool is_valid() const { return _tensor; } - const Qnn_Tensor_t &get_qnn_tensor() const { return _qnn_tensor; } - -private: - uint8_t *alloc_rpc_mem() { - uint8_t *qnn_rpc_buffer = - static_cast(_qnn_instance->alloc_rpcmem(ggml_nbytes(_tensor), alignof(void *))); + uint8_t *alloc_rpc_mem(size_t bytes) { + uint8_t *qnn_rpc_buffer = static_cast(_qnn_instance->alloc_rpcmem(bytes, alignof(void *))); if (!qnn_rpc_buffer) { QNN_LOG_WARN("alloc rpc mem failure, %s\n", strerror(errno)); QNN_LOG_DEBUG("tensor name %s", _tensor_name.c_str()); @@ -180,29 +180,28 @@ class ggml_qnn_tensor { return qnn_rpc_buffer; } - bool should_use_mem_handle() const { return _device == QNN_BACKEND_NPU; } + void update_params_from_ggml_tensor(ggml_tensor *tensor) { + _dimensions[0] = (uint32_t)tensor->ne[0]; + _dimensions[1] = (uint32_t)tensor->ne[1]; + _dimensions[2] = (uint32_t)tensor->ne[2]; + _dimensions[3] = (uint32_t)tensor->ne[3]; + QNN_TENSOR_SET_DATA_TYPE(_qnn_tensor, device_datatype_from_ggml_datatype(tensor->type)); + // TODO: set the quantizeParams base on the tensor type + QNN_TENSOR_SET_RANK(_qnn_tensor, (uint32_t)ggml_n_dims(tensor)); - void update_tensor_name() { - auto *tensor_name = ggml_get_name(_tensor); - if (!strnlen(tensor_name, GGML_MAX_NAME)) { - if (_tensor_name.empty()) { - static std::atomic_uint32_t unnamed_tensor_count = 0; - char buffer[GGML_MAX_NAME] = {}; - snprintf(buffer, sizeof(buffer), "unnamed_%d", (int)(unnamed_tensor_count++)); - _tensor_name = buffer; - } - } else { - QNN_LOG_DEBUG("tensor name changed: %s -> %s", _tensor_name.c_str(), tensor_name); - _tensor_name = tensor_name; - } + QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW); + Qnn_ClientBuffer_t client_buf = {}; + QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf); } + bool should_use_mem_handle() const { return _device == QNN_BACKEND_NPU; } + + std::string _tensor_name; const ggml_tensor *_tensor; QNNBackend 
_device; std::shared_ptr _qnn_instance; - Qnn_Tensor_t _qnn_tensor = QNN_TENSOR_INIT; - uint32_t _dimensions[4] = {}; - std::string _tensor_name; + Qnn_Tensor_t _qnn_tensor = qnn_tensor_init(kDefaultQnnTensorVersion); + std::array _dimensions = {}; Qnn_GraphHandle_t _graph_handle = nullptr; uint8_t *_qnn_rpc_buffer = nullptr; diff --git a/ggml/src/ggml-qnn/utils.hpp b/ggml/src/ggml-qnn/utils.hpp index e91a5ae8730d6..c2da6cb27eaf7 100644 --- a/ggml/src/ggml-qnn/utils.hpp +++ b/ggml/src/ggml-qnn/utils.hpp @@ -13,6 +13,8 @@ #include "QnnTypes.h" #include "logger.hpp" +#define QNN_TENSOR_VER(x) ((x).v2) + namespace qnn { uint32_t get_ggml_tensor_rank(const ggml_tensor *tensor); @@ -29,149 +31,159 @@ const char *opname_from_ggmlop(enum ggml_op ggmlop); const char *get_qnn_error_string(Qnn_ErrorHandle_t error); -inline int validate_tensor_version(const Qnn_Tensor_t &tensor) { - if (tensor.version != QNN_TENSOR_VERSION_1) { - QNN_LOG_WARN("validate_tensor_version() tensor %s, got unsupported version %d\n", tensor.v1.name, - tensor.version); - return 1; +constexpr const Qnn_TensorVersion_t kDefaultQnnTensorVersion = QNN_TENSOR_VERSION_2; + +inline Qnn_Tensor_t qnn_tensor_init(Qnn_TensorVersion_t version) { + Qnn_Tensor_t tensor; + tensor.version = version; + if (version == QNN_TENSOR_VERSION_1) { + tensor.v1 = QNN_TENSOR_V1_INIT; + } else if (version == QNN_TENSOR_VERSION_2) { + tensor.v2 = QNN_TENSOR_V2_INIT; } - return 0; + return tensor; } inline uint32_t get_qnn_tensorid(const Qnn_Tensor_t &tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.id; + if (tensor.version == kDefaultQnnTensorVersion) { + return QNN_TENSOR_VER(tensor).id; } return 0u; } inline const char *get_qnn_tensorname(const Qnn_Tensor_t &tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.name; + if (tensor.version == kDefaultQnnTensorVersion) { + return QNN_TENSOR_VER(tensor).name; } return nullptr; } inline Qnn_TensorType_t get_qnn_tensortype(const Qnn_Tensor_t &tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.type; + if (tensor.version == kDefaultQnnTensorVersion) { + return QNN_TENSOR_VER(tensor).type; } return QNN_TENSOR_TYPE_UNDEFINED; } inline Qnn_TensorDataFormat_t get_qnn_tensor_dataformat(const Qnn_Tensor_t &tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.dataFormat; + if (tensor.version == kDefaultQnnTensorVersion) { + return QNN_TENSOR_VER(tensor).dataFormat; } return QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER; } inline Qnn_DataType_t get_qnn_tensor_datatype(const Qnn_Tensor_t &tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.dataType; + if (tensor.version == kDefaultQnnTensorVersion) { + return QNN_TENSOR_VER(tensor).dataType; } return QNN_DATATYPE_UNDEFINED; } inline Qnn_QuantizeParams_t get_qnn_tensor_quantparams(const Qnn_Tensor_t &tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.quantizeParams; + if (tensor.version == kDefaultQnnTensorVersion) { + return QNN_TENSOR_VER(tensor).quantizeParams; } return QNN_QUANTIZE_PARAMS_INIT; } inline uint32_t get_qnn_tensor_rank(const Qnn_Tensor_t &tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.rank; + if (tensor.version == kDefaultQnnTensorVersion) { + return QNN_TENSOR_VER(tensor).rank; } return 0u; } inline uint32_t *get_qnn_tensor_dimensions(const Qnn_Tensor_t &tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.dimensions; + if (tensor.version == 
kDefaultQnnTensorVersion) { + return QNN_TENSOR_VER(tensor).dimensions; } return nullptr; } inline Qnn_TensorMemType_t get_qnn_tensor_memtype(const Qnn_Tensor_t &tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.memType; + if (tensor.version == kDefaultQnnTensorVersion) { + return QNN_TENSOR_VER(tensor).memType; } return QNN_TENSORMEMTYPE_UNDEFINED; } inline Qnn_MemHandle_t get_qnn_tensor_memhandle(const Qnn_Tensor_t &tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.memHandle; + if (tensor.version == kDefaultQnnTensorVersion) { + return QNN_TENSOR_VER(tensor).memHandle; } return nullptr; } inline void set_qnn_tensor_id(Qnn_Tensor_t &tensor, uint32_t id) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.id = id; + if (tensor.version == kDefaultQnnTensorVersion) { + QNN_TENSOR_VER(tensor).id = id; } } inline void set_qnn_tensor_name(Qnn_Tensor_t &tensor, const char *name) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.name = name; + if (tensor.version == kDefaultQnnTensorVersion) { + QNN_TENSOR_VER(tensor).name = name; } } inline void set_qnn_tensor_type(Qnn_Tensor_t &tensor, Qnn_TensorType_t type) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.type = type; + if (tensor.version == kDefaultQnnTensorVersion) { + QNN_TENSOR_VER(tensor).type = type; } } inline void set_qnn_tensor_dataformat(Qnn_Tensor_t &tensor, Qnn_TensorDataFormat_t format) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.dataFormat = format; + if (tensor.version == kDefaultQnnTensorVersion) { + QNN_TENSOR_VER(tensor).dataFormat = format; } } inline void set_qnn_tensor_datatype(Qnn_Tensor_t &tensor, Qnn_DataType_t dataType) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.dataType = dataType; + if (tensor.version == kDefaultQnnTensorVersion) { + QNN_TENSOR_VER(tensor).dataType = dataType; } } inline void set_qnn_tensor_quantparams(Qnn_Tensor_t &tensor, Qnn_QuantizeParams_t params) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.quantizeParams = params; + if (tensor.version == kDefaultQnnTensorVersion) { + QNN_TENSOR_VER(tensor).quantizeParams = params; } } inline void set_qnn_tensor_rank(Qnn_Tensor_t &tensor, uint32_t rank) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.rank = rank; + if (tensor.version == kDefaultQnnTensorVersion) { + QNN_TENSOR_VER(tensor).rank = rank; } } inline void set_qnn_tensor_dimensions(Qnn_Tensor_t &tensor, uint32_t *dims) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.dimensions = dims; + if (tensor.version == kDefaultQnnTensorVersion) { + QNN_TENSOR_VER(tensor).dimensions = dims; } } inline void set_qnn_tensor_memtype(Qnn_Tensor_t &tensor, Qnn_TensorMemType_t mem_type) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.memType = mem_type; + if (tensor.version == kDefaultQnnTensorVersion) { + QNN_TENSOR_VER(tensor).memType = mem_type; } } inline void set_qnn_tensor_clientbuf(Qnn_Tensor_t &tensor, Qnn_ClientBuffer_t client_buf) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.clientBuf = client_buf; + if (tensor.version == kDefaultQnnTensorVersion) { + QNN_TENSOR_VER(tensor).clientBuf = client_buf; } } inline void set_qnn_tensor_memhandle(Qnn_Tensor_t &tensor, Qnn_MemHandle_t handle) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.memHandle = handle; + if (tensor.version == kDefaultQnnTensorVersion) { + QNN_TENSOR_VER(tensor).memHandle = handle; + } +} + +inline void 
set_qnn_tensor_dyn_dimensions(Qnn_Tensor_t &tensor, uint8_t *isDynamicDimensions) { + if (tensor.version == kDefaultQnnTensorVersion) { + QNN_TENSOR_VER(tensor).isDynamicDimensions = isDynamicDimensions; } } @@ -239,3 +251,4 @@ class qnn_perf { #define QNN_TENSOR_SET_MEM_TYPE(tensor, value) qnn::set_qnn_tensor_memtype(tensor, value) #define QNN_TENSOR_SET_CLIENT_BUF(tensor, value) qnn::set_qnn_tensor_clientbuf(tensor, value) #define QNN_TENSOR_SET_MEM_HANDLE(tensor, value) qnn::set_qnn_tensor_memhandle(tensor, value) +#define QNN_TENSOR_SET_DYN_DIMENSIONS(tensor, value) qnn::set_qnn_tensor_dyn_dimensions(tensor, value) From 706793f078acab0d952a088fd78ebb8af342df7b Mon Sep 17 00:00:00 2001 From: hongruichen Date: Mon, 22 Jul 2024 21:34:33 +0800 Subject: [PATCH 098/143] fix: back to qnn tensor v1 to fix the create tensor error --- ggml/src/ggml-qnn/utils.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-qnn/utils.hpp b/ggml/src/ggml-qnn/utils.hpp index c2da6cb27eaf7..b7f29bdaa5663 100644 --- a/ggml/src/ggml-qnn/utils.hpp +++ b/ggml/src/ggml-qnn/utils.hpp @@ -13,7 +13,7 @@ #include "QnnTypes.h" #include "logger.hpp" -#define QNN_TENSOR_VER(x) ((x).v2) +#define QNN_TENSOR_VER(x) ((x).v1) namespace qnn { @@ -31,7 +31,7 @@ const char *opname_from_ggmlop(enum ggml_op ggmlop); const char *get_qnn_error_string(Qnn_ErrorHandle_t error); -constexpr const Qnn_TensorVersion_t kDefaultQnnTensorVersion = QNN_TENSOR_VERSION_2; +constexpr const Qnn_TensorVersion_t kDefaultQnnTensorVersion = QNN_TENSOR_VERSION_1; inline Qnn_Tensor_t qnn_tensor_init(Qnn_TensorVersion_t version) { Qnn_Tensor_t tensor; @@ -182,8 +182,8 @@ inline void set_qnn_tensor_memhandle(Qnn_Tensor_t &tensor, Qnn_MemHandle_t handl } inline void set_qnn_tensor_dyn_dimensions(Qnn_Tensor_t &tensor, uint8_t *isDynamicDimensions) { - if (tensor.version == kDefaultQnnTensorVersion) { - QNN_TENSOR_VER(tensor).isDynamicDimensions = isDynamicDimensions; + if (tensor.version == QNN_TENSOR_VERSION_2) { + tensor.v2.isDynamicDimensions = isDynamicDimensions; } } From f843e5aaf5a6f03af71984c197a2e1b92ef5e707 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Mon, 22 Jul 2024 23:41:23 +0800 Subject: [PATCH 099/143] fix: 1.free up rpc memory at destruct 2. 
unbind tesnsor --- ggml/src/ggml-qnn/graph.hpp | 16 ++++++++++++---- ggml/src/ggml-qnn/tensor.hpp | 24 ++++++++++++++++-------- 2 files changed, 28 insertions(+), 12 deletions(-) diff --git a/ggml/src/ggml-qnn/graph.hpp b/ggml/src/ggml-qnn/graph.hpp index 9941365f7e897..c82b7d66ae1cf 100644 --- a/ggml/src/ggml-qnn/graph.hpp +++ b/ggml/src/ggml-qnn/graph.hpp @@ -86,7 +86,7 @@ class ggml_qnn_graph { return false; } - QNN_LOG_DEBUG("graph name %s, add_nodes start", _graph_name.c_str()); + QNN_LOG_DEBUG("graph name %s, build_graph start", _graph_name.c_str()); _qnn_tensor_inputs.resize(tensor_inputs.size()); _tensor_inputs.resize(tensor_inputs.size()); for (size_t i = 0; i < tensor_inputs.size(); i++) { @@ -111,7 +111,7 @@ class ggml_qnn_graph { snprintf(buffer, GGML_MAX_NAME, "dst%d", (int)i); auto qnn_tensor = std::make_shared(std::string(buffer), _device, _graph_handle, _qnn_instance); - auto *ggml_tensor = tensor_inputs[i]; + auto *ggml_tensor = tensor_outputs[i]; if (!qnn_tensor->bind_ggml_tensor(ggml_tensor, false)) { QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor)); return false; @@ -155,7 +155,7 @@ class ggml_qnn_graph { return false; } - QNN_LOG_DEBUG("graph name %s, add_nodes succeed", _graph_name.c_str()); + QNN_LOG_DEBUG("graph name %s, build_graph succeed", _graph_name.c_str()); return true; } @@ -173,7 +173,7 @@ class ggml_qnn_graph { } for (size_t i = 0; i < tensor_outputs.size(); i++) { - auto *ggml_tensor = tensor_inputs[i]; + auto *ggml_tensor = tensor_outputs[i]; if (!_tensor_outputs[i]->bind_ggml_tensor(ggml_tensor, false)) { QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor)); return false; @@ -191,6 +191,14 @@ class ggml_qnn_graph { } } + for (auto tensor : _tensor_inputs) { + tensor->unbind_ggml_tensor(); + } + + for (auto tensor : _tensor_outputs) { + tensor->unbind_ggml_tensor(); + } + if (error != QNN_SUCCESS) { QNN_LOG_INFO("error = %d\n", error); return false; diff --git a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp index 5e45266b40b9b..7709936ed9618 100644 --- a/ggml/src/ggml-qnn/tensor.hpp +++ b/ggml/src/ggml-qnn/tensor.hpp @@ -28,17 +28,22 @@ class ggml_qnn_tensor { QNN_LOG_DEBUG("create tensor %s, device: %d", _tensor_name.c_str(), device); } + ~ggml_qnn_tensor() { + if (_qnn_instance && _qnn_rpc_buffer) { + _qnn_instance->free_rpcmem(_qnn_rpc_buffer); + } + } + bool bind_ggml_tensor(ggml_tensor *tensor, bool is_input) { if (_tensor) { if (_tensor != tensor) { QNN_LOG_WARN("tensor %s has been bound to another ggml tensor %s", _tensor_name.c_str(), ggml_get_name(_tensor)); return false; - } else { - QNN_LOG_INFO("tensor %s already bound to same ggml tensor %s", _tensor_name.c_str(), - ggml_get_name(_tensor)); - return true; } + QNN_LOG_INFO("tensor %s already bound to same ggml tensor %s", _tensor_name.c_str(), + ggml_get_name(_tensor)); + return true; } update_params_from_ggml_tensor(tensor); @@ -55,13 +60,16 @@ class ggml_qnn_tensor { return false; } QNN_TENSOR_SET_ID(_qnn_tensor, QNN_TENSOR_GET_ID(qnn_tensor)); + QNN_LOG_DEBUG("create graph tensor %s, id: %d", _tensor_name.c_str(), QNN_TENSOR_GET_ID(qnn_tensor)); } if (should_use_mem_handle()) { - _qnn_rpc_buffer = alloc_rpc_mem(ggml_nbytes(tensor)); if (!_qnn_rpc_buffer) { - QNN_LOG_WARN("alloc rpc mem failed, tensor %s", _tensor_name.c_str()); - return false; + _qnn_rpc_buffer = alloc_rpc_mem(ggml_nbytes(tensor)); + if (!_qnn_rpc_buffer) { + QNN_LOG_WARN("alloc rpc mem failed, tensor %s", _tensor_name.c_str()); + return false; + } } QNN_LOG_DEBUG("tensor 
%s, use mem handle %p", _tensor_name.c_str(), QNN_TENSOR_GET_MEM_HANDLE(_qnn_tensor)); @@ -107,8 +115,8 @@ class ggml_qnn_tensor { QNN_LOG_DEBUG("tensor %s, clear client buffer", _tensor_name.c_str()); } + QNN_LOG_DEBUG("unbind tensor: %s from ggml tensor: %s", _tensor_name.c_str(), ggml_get_name(_tensor)); _tensor = nullptr; - QNN_LOG_DEBUG("unbind tensor: %s from ggml tensor: %s", _tensor_name.c_str(), _tensor->name); return true; } From ee305cc17158b35c20e137c2a55fdde2108edcbd Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 26 Jul 2024 22:33:30 +0800 Subject: [PATCH 100/143] refactoring: split qnn rpc buffer into dedicated class --- ggml/src/ggml-qnn/buffer.hpp | 56 +++++++++++++++++++++++ ggml/src/ggml-qnn/qnn-lib.hpp | 84 ++++++++++++++--------------------- ggml/src/ggml-qnn/tensor.hpp | 46 ++++++------------- 3 files changed, 103 insertions(+), 83 deletions(-) create mode 100644 ggml/src/ggml-qnn/buffer.hpp diff --git a/ggml/src/ggml-qnn/buffer.hpp b/ggml/src/ggml-qnn/buffer.hpp new file mode 100644 index 0000000000000..db8e8ccaf24fa --- /dev/null +++ b/ggml/src/ggml-qnn/buffer.hpp @@ -0,0 +1,56 @@ +#pragma once + +#include + +#include "logger.hpp" +#include "qnn-lib.hpp" + +namespace qnn { +class ggml_qnn_rpc_buffer { +public: + ggml_qnn_rpc_buffer(std::shared_ptr qnn_instance, size_t size, uint32_t rank, uint32_t *dimensions, + Qnn_DataType_t data_type) : + _qnn_instance(qnn_instance), _size(size) { + + auto *qnn_rpc_buffer = static_cast(qnn_instance->alloc_rpcmem(size, alignof(void *))); + _qnn_rpc_mem_handle = qnn_instance->register_rpcmem(qnn_rpc_buffer, rank, dimensions, data_type); + if (!_qnn_rpc_mem_handle) { + qnn_instance->free_rpcmem(qnn_rpc_buffer); + QNN_LOG_WARN("register rpc mem failure\n"); + return; + } + + _qnn_rpc_buffer = qnn_rpc_buffer; + QNN_LOG_DEBUG("alloc rpcmem(%p) successfully, size %d\n", _qnn_rpc_buffer, (int)size); + } + ~ggml_qnn_rpc_buffer() { + if (_qnn_instance) { + if (_qnn_rpc_mem_handle) { + _qnn_instance->unregister_rpcmem(_qnn_rpc_mem_handle); + } + + if (_qnn_rpc_buffer) { + _qnn_instance->free_rpcmem(_qnn_rpc_buffer); + } + } + } + + bool is_valid() const { return _qnn_rpc_buffer && _qnn_rpc_mem_handle; } + + uint8_t *get_buffer() const { return _qnn_rpc_buffer; } + size_t get_size() const { return _size; } + Qnn_MemHandle_t get_mem_handle() const { return _qnn_rpc_mem_handle; } + +private: + std::shared_ptr _qnn_instance; + size_t _size = 0; + uint8_t *_qnn_rpc_buffer = nullptr; + Qnn_MemHandle_t _qnn_rpc_mem_handle = nullptr; + + ggml_qnn_rpc_buffer(const ggml_qnn_rpc_buffer &) = delete; + void operator=(const ggml_qnn_rpc_buffer &) = delete; + ggml_qnn_rpc_buffer(ggml_qnn_rpc_buffer &&) = delete; + void operator=(ggml_qnn_rpc_buffer &&) = delete; +}; + +} // namespace qnn diff --git a/ggml/src/ggml-qnn/qnn-lib.hpp b/ggml/src/ggml-qnn/qnn-lib.hpp index 4e1dcb34c119f..aa142c74adf82 100644 --- a/ggml/src/ggml-qnn/qnn-lib.hpp +++ b/ggml/src/ggml-qnn/qnn-lib.hpp @@ -638,84 +638,68 @@ class qnn_instance { return mem_fd; } - int register_rpcmem(void *p_data, Qnn_Tensor_t *p_tensor) { - if (nullptr == p_data || (nullptr == p_tensor)) { + void *get_rpcmem_from_memhandle(Qnn_MemHandle_t mem_handle) { + for (std::unordered_map::iterator it = _qnn_mem_set.begin(); it != _qnn_mem_set.end(); + it++) { + Qnn_MemHandle_t mem_handle = it->second; + if (it->second == mem_handle) { + return it->first; + } + } + QNN_LOG_WARN("can't find rpcmem from qnn mem handle %p", mem_handle); + return nullptr; + } + + Qnn_MemHandle_t register_rpcmem(void *p_data, 
uint32_t rank, uint32_t *dimensions, Qnn_DataType_t data_type) { + if (!p_data) { QNN_LOG_WARN("invalid param\n"); - return 1; + return nullptr; } if (!is_rpcmem_initialized()) { QNN_LOG_WARN("rpc memory not initialized\n"); - return 2; + return nullptr; } if (is_rpcmem_allocated(p_data)) { QNN_LOG_WARN("rpc memory already allocated\n"); - return 3; - } - - if (is_rpcmem_registered(QNN_TENSOR_GET_MEM_HANDLE(*p_tensor))) { - QNN_LOG_WARN("tensor %s has been registered shared memory\n", QNN_TENSOR_GET_NAME(*p_tensor)); - return 4; + return nullptr; } - int32_t mem_fd = rpcmem_to_fd(p_data); + auto mem_fd = rpcmem_to_fd(p_data); if (mem_fd == -1) { QNN_LOG_WARN("failed to get file descriptor\n"); - return 5; + return nullptr; } + QNN_LOG_INFO("mem_fd %d\n", mem_fd); - Qnn_MemDescriptor_t descriptor = { { QNN_TENSOR_GET_RANK(*p_tensor), QNN_TENSOR_GET_DIMENSIONS(*p_tensor), - nullptr }, - QNN_TENSOR_GET_DATA_TYPE(*p_tensor), - QNN_MEM_TYPE_ION, - { { mem_fd } } }; + Qnn_MemDescriptor_t descriptor = { { rank, dimensions, nullptr }, data_type, QNN_MEM_TYPE_ION, { { mem_fd } } }; Qnn_MemHandle_t handle = nullptr; - int error = QNN_SUCCESS; - error = _qnn_interface->qnn_mem_register(_qnn_context_handle, &descriptor, - /*numDescriptors=*/1, &handle); + auto error = _qnn_interface->qnn_mem_register(_qnn_context_handle, &descriptor, + /*numDescriptors=*/1, &handle); if (error != QNN_SUCCESS) { QNN_LOG_WARN("failed to register shared memory, error %d, %s\n", QNN_GET_ERROR_CODE(error), strerror(error)); - return 6; + return nullptr; } - QNN_TENSOR_SET_MEM_HANDLE(*p_tensor, handle); _qnn_mem_set.insert((std::pair(p_data, handle))); - QNN_LOG_INFO("tensor %s successfully register shared memory handler: %p\n", QNN_TENSOR_GET_NAME(*p_tensor), - handle); - return 0; + QNN_LOG_INFO("successfully register shared memory handler: %p\n", handle); + return handle; } - void *get_rpcmem_from_memhandle(Qnn_MemHandle_t mem_handle) { - for (std::unordered_map::iterator it = _qnn_mem_set.begin(); it != _qnn_mem_set.end(); - it++) { - Qnn_MemHandle_t mem_handle = it->second; - if (it->second == mem_handle) { - return it->first; - } - } - QNN_LOG_WARN("can't find rpcmem from qnn mem handle %p", mem_handle); - return nullptr; - } - - void unregister_rpcmem() { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - - if (_qnn_mem_set.empty()) { - QNN_LOG_WARN("no rpcmem registered\n"); + void unregister_rpcmem(Qnn_MemHandle_t mem_handle) { + Qnn_ErrorHandle_t error = _qnn_interface->qnn_mem_de_register(&mem_handle, 1); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to unregister shared memory, error %d\n", QNN_GET_ERROR_CODE(error)); } - for (std::unordered_map::iterator it = _qnn_mem_set.begin(); it != _qnn_mem_set.end(); - it++) { - Qnn_MemHandle_t mem_handle = it->second; - error = _qnn_interface->qnn_mem_de_register(&mem_handle, 1); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to unregister shared memory, error %d\n", QNN_GET_ERROR_CODE(error)); - } + auto it = std::find_if(_qnn_mem_set.begin(), _qnn_mem_set.end(), + [mem_handle](const auto &kv) { return kv.second == mem_handle; }); + if (it != _qnn_mem_set.end()) { + _qnn_mem_set.erase(it); } - _qnn_mem_set.clear(); } bool is_rpcmem_allocated(void *buf) { return _qnn_mem_set.count(buf) != 0U; } diff --git a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp index 7709936ed9618..c4ea7a4095d5f 100644 --- a/ggml/src/ggml-qnn/tensor.hpp +++ b/ggml/src/ggml-qnn/tensor.hpp @@ -10,6 +10,7 @@ #include "QnnTensor.h" #include "System/QnnSystemInterface.h" +#include 
"buffer.hpp" #include "logger.hpp" #include "qnn-lib.hpp" #include "utils.hpp" @@ -28,11 +29,7 @@ class ggml_qnn_tensor { QNN_LOG_DEBUG("create tensor %s, device: %d", _tensor_name.c_str(), device); } - ~ggml_qnn_tensor() { - if (_qnn_instance && _qnn_rpc_buffer) { - _qnn_instance->free_rpcmem(_qnn_rpc_buffer); - } - } + ~ggml_qnn_tensor() { _qnn_rpc_buffer.reset(); } bool bind_ggml_tensor(ggml_tensor *tensor, bool is_input) { if (_tensor) { @@ -65,13 +62,19 @@ class ggml_qnn_tensor { if (should_use_mem_handle()) { if (!_qnn_rpc_buffer) { - _qnn_rpc_buffer = alloc_rpc_mem(ggml_nbytes(tensor)); - if (!_qnn_rpc_buffer) { + auto qnn_rpc_buffer = std::make_unique( + _qnn_instance, ggml_nbytes(tensor), QNN_TENSOR_GET_RANK(_qnn_tensor), + QNN_TENSOR_GET_DIMENSIONS(_qnn_tensor), QNN_TENSOR_GET_DATA_TYPE(_qnn_tensor)); + if (!qnn_rpc_buffer->is_valid()) { QNN_LOG_WARN("alloc rpc mem failed, tensor %s", _tensor_name.c_str()); return false; } + + _qnn_rpc_buffer = std::move(qnn_rpc_buffer); } + QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_MEMHANDLE); + QNN_TENSOR_SET_MEM_HANDLE(_qnn_tensor, _qnn_rpc_buffer->get_mem_handle()); QNN_LOG_DEBUG("tensor %s, use mem handle %p", _tensor_name.c_str(), QNN_TENSOR_GET_MEM_HANDLE(_qnn_tensor)); } else { QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW); @@ -132,7 +135,7 @@ class ggml_qnn_tensor { if (should_use_mem_handle()) { if (_qnn_rpc_buffer) { - memcpy(_qnn_rpc_buffer, _tensor->data, ggml_nbytes(_tensor)); + memcpy(_qnn_rpc_buffer->get_buffer(), _tensor->data, ggml_nbytes(_tensor)); } else { QNN_LOG_WARN("tensor %s: can't find rpcmem from qnn mem handle\n", _tensor_name.c_str()); return false; @@ -153,7 +156,7 @@ class ggml_qnn_tensor { if (should_use_mem_handle()) { if (_qnn_rpc_buffer) { - memcpy(_tensor->data, _qnn_rpc_buffer, ggml_nbytes(_tensor)); + memcpy(_tensor->data, _qnn_rpc_buffer->get_buffer(), ggml_nbytes(_tensor)); } else { QNN_LOG_WARN("can't find rpcmem from qnn mem handle\n"); return false; @@ -165,29 +168,6 @@ class ggml_qnn_tensor { return true; } - uint8_t *alloc_rpc_mem(size_t bytes) { - uint8_t *qnn_rpc_buffer = static_cast(_qnn_instance->alloc_rpcmem(bytes, alignof(void *))); - if (!qnn_rpc_buffer) { - QNN_LOG_WARN("alloc rpc mem failure, %s\n", strerror(errno)); - QNN_LOG_DEBUG("tensor name %s", _tensor_name.c_str()); - return nullptr; - } - - QNN_LOG_INFO("tensor %s: alloc rpcmem(%p) successfully\n", _tensor_name.c_str(), qnn_rpc_buffer); - auto error = _qnn_instance->register_rpcmem(qnn_rpc_buffer, &_qnn_tensor); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("register rpc mem failure, %d\n", (int)error); - QNN_LOG_DEBUG("tensor name %s", _tensor_name.c_str()); - _qnn_instance->free_rpcmem(qnn_rpc_buffer); - return nullptr; - } - - // The mem handle will be set at qnn_instance::register_rpcmem - QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_MEMHANDLE); - QNN_LOG_INFO("tensor %s: register rpcmem(%p) successfully\n", _tensor_name.c_str(), qnn_rpc_buffer); - return qnn_rpc_buffer; - } - void update_params_from_ggml_tensor(ggml_tensor *tensor) { _dimensions[0] = (uint32_t)tensor->ne[0]; _dimensions[1] = (uint32_t)tensor->ne[1]; @@ -211,7 +191,7 @@ class ggml_qnn_tensor { Qnn_Tensor_t _qnn_tensor = qnn_tensor_init(kDefaultQnnTensorVersion); std::array _dimensions = {}; Qnn_GraphHandle_t _graph_handle = nullptr; - uint8_t *_qnn_rpc_buffer = nullptr; + std::unique_ptr _qnn_rpc_buffer; ggml_qnn_tensor(const ggml_qnn_tensor &) = delete; void operator=(const ggml_qnn_tensor &) = delete; From 
47735cb5896332745508f74a81d9fe4c45e96de1 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 26 Jul 2024 23:03:09 +0800 Subject: [PATCH 101/143] fix: try fix error in 2nd run by appending dimension into graph key --- ggml/src/ggml-qnn/backend-ops.cpp | 36 ++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index bd87cfc9e66eb..b138257b8c9f6 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -78,6 +78,31 @@ bool execute_graph(qnn::ggml_qnn_graph *graph, const std::array +std::string get_graph_key(const std::string &op_name, const std::array &inputs, + const std::array &outputs) { + constexpr static const auto append_dimensions = [](std::string &key, const ggml_tensor *tensor) { + key += "_"; + key += std::to_string(tensor->ne[0]); + key += "x"; + key += std::to_string(tensor->ne[1]); + key += "x"; + key += std::to_string(tensor->ne[2]); + key += "x"; + key += std::to_string(tensor->ne[3]); + }; + + std::string graph_key(op_name); + for (auto &input : inputs) { + append_dimensions(graph_key, input); + } + for (auto &output : outputs) { + append_dimensions(graph_key, output); + } + + return graph_key; +} + template qnn::ggml_qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_context *ctx, size_t op, const std::string &qnn_op, const std::array &inputs, @@ -87,16 +112,7 @@ qnn::ggml_qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_context *ctx, siz auto &graph_cache = ctx->qnn_graph_cache; const auto *op_name = op < qnn::kGgmlUnaryOpStart ? ggml_op_name(ggml_op(op)) : ggml_unary_op_name(ggml_unary_op(op - qnn::kGgmlUnaryOpStart)); - std::string graph_key(op_name); - for (auto &input : inputs) { - graph_key += "_"; - graph_key += input->name; - } - for (auto &output : outputs) { - graph_key += "_"; - graph_key += output->name; - } - + auto graph_key = get_graph_key<_InputSize, _OutputSize>(op_name, inputs, outputs); auto it = graph_cache.find(graph_key); qnn::ggml_qnn_graph *graph_ptr = nullptr; if (it != graph_cache.end()) { From be9a8c73a0d5822d75c07f551d6efe0280287924 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 26 Jul 2024 23:07:25 +0800 Subject: [PATCH 102/143] fix: suppress warning --- ggml/src/ggml-qnn.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index 87653cfb1f741..6472d3e154367 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -71,7 +71,7 @@ static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { class ggml_backend_qnn_buffer_context { public: ggml_backend_qnn_buffer_context(QNNBackend device, std::shared_ptr instance, size_t size) : - _device(device), _instance(instance), _name(QNN_BACKEND_NAME + std::to_string(device)) { + _instance(instance), _name(QNN_BACKEND_NAME + std::to_string(device)) { size_t size_page = sysconf(_SC_PAGESIZE); @@ -97,7 +97,6 @@ class ggml_backend_qnn_buffer_context { size_t get_buffer_size() { return _buffer_size; } private: - QNNBackend _device; std::shared_ptr _instance; std::string _name; void *_buffer = nullptr; From 18aa6654d5c2af7c5dcea7d57abeb8f260ab0678 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Sat, 27 Jul 2024 10:38:43 +0800 Subject: [PATCH 103/143] refactoring: opt graph key gen --- ggml/src/ggml-qnn/backend-ops.cpp | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 
b138257b8c9f6..1f8b75e5e3e0e 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -82,14 +82,10 @@ template std::string get_graph_key(const std::string &op_name, const std::array &inputs, const std::array &outputs) { constexpr static const auto append_dimensions = [](std::string &key, const ggml_tensor *tensor) { - key += "_"; - key += std::to_string(tensor->ne[0]); - key += "x"; - key += std::to_string(tensor->ne[1]); - key += "x"; - key += std::to_string(tensor->ne[2]); - key += "x"; - key += std::to_string(tensor->ne[3]); + char buffer[256] = {}; + snprintf(buffer, sizeof(buffer), "_%ldx%ldx%ldx%ld", (long)tensor->ne[0], (long)tensor->ne[1], + (long)tensor->ne[2], (long)tensor->ne[3]); + key += buffer; }; std::string graph_key(op_name); @@ -99,7 +95,7 @@ std::string get_graph_key(const std::string &op_name, const std::array Date: Sat, 27 Jul 2024 10:47:18 +0800 Subject: [PATCH 104/143] refactoring: remove dup code --- ggml/src/ggml-qnn/buffer.hpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-qnn/buffer.hpp b/ggml/src/ggml-qnn/buffer.hpp index db8e8ccaf24fa..4b4b2daaa75b4 100644 --- a/ggml/src/ggml-qnn/buffer.hpp +++ b/ggml/src/ggml-qnn/buffer.hpp @@ -12,15 +12,14 @@ class ggml_qnn_rpc_buffer { Qnn_DataType_t data_type) : _qnn_instance(qnn_instance), _size(size) { - auto *qnn_rpc_buffer = static_cast(qnn_instance->alloc_rpcmem(size, alignof(void *))); - _qnn_rpc_mem_handle = qnn_instance->register_rpcmem(qnn_rpc_buffer, rank, dimensions, data_type); - if (!_qnn_rpc_mem_handle) { - qnn_instance->free_rpcmem(qnn_rpc_buffer); + _qnn_rpc_buffer = static_cast(qnn_instance->alloc_rpcmem(size, alignof(void *))); + _qnn_rpc_mem_handle = qnn_instance->register_rpcmem(_qnn_rpc_buffer, rank, dimensions, data_type); + if (!_qnn_rpc_buffer || !_qnn_rpc_mem_handle) { QNN_LOG_WARN("register rpc mem failure\n"); + // let the destructor free the buffer return; } - _qnn_rpc_buffer = qnn_rpc_buffer; QNN_LOG_DEBUG("alloc rpcmem(%p) successfully, size %d\n", _qnn_rpc_buffer, (int)size); } ~ggml_qnn_rpc_buffer() { From ccfec7010657313bb030c1f58d7a78433f4435b7 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Sat, 27 Jul 2024 11:22:29 +0800 Subject: [PATCH 105/143] refactoring: remove unused get_rpcmem_from_memhandle func --- ggml/src/ggml-qnn/qnn-lib.hpp | 39 +++++++++++++---------------------- 1 file changed, 14 insertions(+), 25 deletions(-) diff --git a/ggml/src/ggml-qnn/qnn-lib.hpp b/ggml/src/ggml-qnn/qnn-lib.hpp index aa142c74adf82..da986e2e4c4ff 100644 --- a/ggml/src/ggml-qnn/qnn-lib.hpp +++ b/ggml/src/ggml-qnn/qnn-lib.hpp @@ -591,8 +591,6 @@ class qnn_instance { size_t get_rpcmem_capacity() { return _rpcmem_capacity; } - bool is_rpcmem_registered(Qnn_MemHandle_t handle) { return _qnn_mem_set.count(handle) != 0U; } - void *alloc_rpcmem(size_t bytes, size_t alignment) { if (!_rpcmem_initialized) { QNN_LOG_WARN("rpc memory not initialized\n"); @@ -619,7 +617,7 @@ class qnn_instance { void free_rpcmem(void *buf) { if (!_rpcmem_initialized) { QNN_LOG_WARN("rpc memory not initialized\n"); - } else if (0 == _rpcmem_store_map.count(buf)) { + } else if (_rpcmem_store_map.count(buf) == 0) { QNN_LOG_WARN("no allocated tensor\n"); } else { _pfn_rpc_mem_free(_rpcmem_store_map[buf]); @@ -638,18 +636,6 @@ class qnn_instance { return mem_fd; } - void *get_rpcmem_from_memhandle(Qnn_MemHandle_t mem_handle) { - for (std::unordered_map::iterator it = _qnn_mem_set.begin(); it != _qnn_mem_set.end(); - it++) { - Qnn_MemHandle_t mem_handle = 
it->second; - if (it->second == mem_handle) { - return it->first; - } - } - QNN_LOG_WARN("can't find rpcmem from qnn mem handle %p", mem_handle); - return nullptr; - } - Qnn_MemHandle_t register_rpcmem(void *p_data, uint32_t rank, uint32_t *dimensions, Qnn_DataType_t data_type) { if (!p_data) { QNN_LOG_WARN("invalid param\n"); @@ -661,9 +647,9 @@ class qnn_instance { return nullptr; } - if (is_rpcmem_allocated(p_data)) { - QNN_LOG_WARN("rpc memory already allocated\n"); - return nullptr; + if (is_rpcmem_registered(p_data)) { + QNN_LOG_WARN("rpc memory already registered\n"); + return _qnn_rpc_buffer_to_handles[p_data]; } auto mem_fd = rpcmem_to_fd(p_data); @@ -683,8 +669,7 @@ class qnn_instance { return nullptr; } - _qnn_mem_set.insert((std::pair(p_data, handle))); - + _qnn_rpc_buffer_to_handles.insert({ p_data, handle }); QNN_LOG_INFO("successfully register shared memory handler: %p\n", handle); return handle; } @@ -695,14 +680,18 @@ class qnn_instance { QNN_LOG_WARN("failed to unregister shared memory, error %d\n", QNN_GET_ERROR_CODE(error)); } - auto it = std::find_if(_qnn_mem_set.begin(), _qnn_mem_set.end(), + auto it = std::find_if(_qnn_rpc_buffer_to_handles.begin(), _qnn_rpc_buffer_to_handles.end(), [mem_handle](const auto &kv) { return kv.second == mem_handle; }); - if (it != _qnn_mem_set.end()) { - _qnn_mem_set.erase(it); + if (it == _qnn_rpc_buffer_to_handles.end()) { + QNN_LOG_WARN("failed to find shared memory handler: %p\n", mem_handle); + return; } + + _qnn_rpc_buffer_to_handles.erase(it); } - bool is_rpcmem_allocated(void *buf) { return _qnn_mem_set.count(buf) != 0U; } + bool is_rpcmem_allocated(void *buf) { return _rpcmem_store_map.count(buf) != 0; } + bool is_rpcmem_registered(void *buf) { return _qnn_rpc_buffer_to_handles.count(buf) != 0U; } const qnn::qcom_socinfo &get_soc_info() { return _soc_info; } @@ -892,7 +881,7 @@ class qnn_instance { QnnHtpDevice_PerfInfrastructure_t *_qnn_htp_perfinfra = nullptr; uint32_t _qnn_power_configid = 1; - std::unordered_map _qnn_mem_set; + std::unordered_map _qnn_rpc_buffer_to_handles; std::mutex _init_mutex; std::unordered_map _loaded_lib_handle; From 867c91bfaff57ba20eca24228c9568fa33b6769e Mon Sep 17 00:00:00 2001 From: hongruichen Date: Sat, 27 Jul 2024 11:56:21 +0800 Subject: [PATCH 106/143] feat: add error string for QnnOpPackage_Error_t --- ggml/src/ggml-qnn/utils.cpp | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-qnn/utils.cpp b/ggml/src/ggml-qnn/utils.cpp index 820b72b8969f8..e44d6dbccee42 100644 --- a/ggml/src/ggml-qnn/utils.cpp +++ b/ggml/src/ggml-qnn/utils.cpp @@ -158,12 +158,14 @@ const char *get_qnn_error_string(Qnn_ErrorHandle_t error) { switch (error) { case QNN_SUCCESS: return "QNN_SUCCESS"; + case QNN_COMMON_ERROR_GENERAL: + return "QNN_COMMON_ERROR_GENERAL"; + + // QnnGraph_Error_t case QNN_GRAPH_ERROR_UNSUPPORTED_FEATURE: return "QNN_GRAPH_ERROR_UNSUPPORTED_FEATURE"; case QNN_GRAPH_ERROR_MEM_ALLOC: return "QNN_GRAPH_ERROR_MEM_ALLOC"; - case QNN_GRAPH_ERROR_GENERAL: - return "QNN_GRAPH_ERROR_GENERAL"; case QNN_GRAPH_ERROR_INVALID_ARGUMENT: return "QNN_GRAPH_ERROR_INVALID_ARGUMENT"; case QNN_GRAPH_ERROR_INVALID_HANDLE: @@ -182,6 +184,22 @@ const char *get_qnn_error_string(Qnn_ErrorHandle_t error) { return "QNN_GRAPH_ERROR_UNCONNECTED_NODE"; case QNN_GRAPH_ERROR_CREATE_FAILED: return "QNN_GRAPH_ERROR_CREATE_FAILED"; + + // QnnOpPackage_Error_t + case QNN_OP_PACKAGE_ERROR_LIBRARY_ALREADY_INITIALIZED: + return "QNN_OP_PACKAGE_ERROR_LIBRARY_ALREADY_INITIALIZED"; + 
case QNN_OP_PACKAGE_ERROR_LIBRARY_NOT_INITIALIZED: + return "QNN_OP_PACKAGE_ERROR_LIBRARY_NOT_INITIALIZED"; + case QNN_OP_PACKAGE_ERROR_INVALID_HANDLE: + return "QNN_OP_PACKAGE_ERROR_INVALID_HANDLE"; + case QNN_OP_PACKAGE_ERROR_INVALID_INFRASTRUCTURE: + return "QNN_OP_PACKAGE_ERROR_INVALID_INFRASTRUCTURE"; + case QNN_OP_PACKAGE_ERROR_INVALID_INFO: + return "QNN_OP_PACKAGE_ERROR_INVALID_INFO"; + case QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE: + return "QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE"; + case QNN_OP_PACKAGE_ERROR_INVALID_ARGUMENT: + return "QNN_OP_PACKAGE_ERROR_INVALID_ARGUMENT"; default: return nullptr; } From 5da73f8085e9b3276ec7dba2c30c8ad775b7bcdd Mon Sep 17 00:00:00 2001 From: hongruichen Date: Sat, 27 Jul 2024 12:52:59 +0800 Subject: [PATCH 107/143] refactoring: move forward and supports_op into ops file --- ggml/src/ggml-qnn.cpp | 55 +------------- ggml/src/ggml-qnn/backend-ops.cpp | 115 ++++++++++++++++++++++++------ ggml/src/ggml-qnn/backend-ops.hpp | 13 +--- 3 files changed, 97 insertions(+), 86 deletions(-) diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index 6472d3e154367..22b57b1758a54 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -114,23 +114,7 @@ struct ggml_backend_qnn_buffer_type_context { // // ================================================================================================= static bool ggml_qnn_compute_forward(ggml_backend_qnn_context *ctx, struct ggml_tensor *tensor) { - size_t unary_op_idx = tensor->op; - if (tensor->op == GGML_OP_UNARY) { - unary_op_idx = qnn::kGgmlUnaryOpStart + ggml_get_unary_op(tensor); - } - - auto unary_op = qnn::ggml_qnn_unary_op_array()[unary_op_idx]; - if (unary_op) { - return unary_op(ctx, tensor->src[0], tensor); - } - - auto binary_op = qnn::ggml_qnn_binary_op_array()[tensor->op]; - if (binary_op) { - return binary_op(ctx, tensor->src[0], tensor->src[1], tensor); - } - - QNN_LOG_WARN("unsupported op %d", tensor->op); - return false; + return qnn::ggml_qnn_forward(ctx, tensor); } static const char *ggml_backend_qnn_buffer_get_name(ggml_backend_buffer_t buffer) { @@ -288,42 +272,7 @@ GGML_CALL static ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backe GGML_CALL static bool ggml_backend_qnn_supports_op(ggml_backend_t backend, const ggml_tensor *op) { GGML_UNUSED(backend); - - if (op->op == GGML_OP_UNARY) { - if (!qnn::ggml_qnn_unary_op_array()[qnn::kGgmlUnaryOpStart + ggml_get_unary_op(op)]) { - QNN_LOG_DEBUG("unsupported unary op %d", ggml_get_unary_op(op)); - return false; - } - - if (!op->src[0]) { - QNN_LOG_DEBUG("src0 is nullptr"); - return false; - } - } else if (op->op != GGML_OP_NONE) { - if (!qnn::ggml_qnn_unary_op_array()[op->op] && !qnn::ggml_qnn_binary_op_array()[op->op]) { - QNN_LOG_DEBUG("unsupported op %d", op->op); - return false; - } - - if (!op->src[0] || !op->src[1]) { - QNN_LOG_DEBUG("src0 or src1 is nullptr"); - return false; - } - } - - switch (op->type) { - case GGML_TYPE_F32: - case GGML_TYPE_F16: - case GGML_TYPE_I8: - case GGML_TYPE_Q8_0: - case GGML_TYPE_Q4_0: - break; - default: - QNN_LOG_DEBUG("unsupported src0 type %d", op->src[0]->type); - return false; - } - - return true; + return qnn::ggml_qnn_supports_op(op); } GGML_CALL static bool ggml_backend_qnn_offload_op(ggml_backend_t backend, const ggml_tensor *op) { diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 1f8b75e5e3e0e..20a4178fd2303 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -56,6 +56,15 @@ bool 
qnn_is_valid_params(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, namespace { +typedef bool (*ggml_qnn_unary_op_t)(ggml_backend_qnn_context *ctx, ggml_tensor *src, ggml_tensor *dst); +typedef bool (*ggml_qnn_binary_op_t)(ggml_backend_qnn_context *ctx, ggml_tensor *src0, ggml_tensor *src1, + ggml_tensor *dst); + +typedef const ggml_qnn_unary_op_t (&ggml_qnn_unary_op_array_t)[GGML_OP_COUNT + GGML_UNARY_OP_COUNT]; +typedef const ggml_qnn_binary_op_t (&ggml_qnn_binary_op_array_t)[GGML_OP_COUNT]; + +constexpr const size_t kGgmlUnaryOpStart = GGML_OP_COUNT; + void print_ggml_tensor(const ggml_tensor *tensor) { QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", tensor->name, tensor->type, ggml_type_name(tensor->type), tensor->ne[0], tensor->ne[1], tensor->ne[2], @@ -106,8 +115,8 @@ qnn::ggml_qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_context *ctx, siz GGML_ASSERT(op < (GGML_OP_COUNT + GGML_UNARY_OP_COUNT)); auto &graph_cache = ctx->qnn_graph_cache; - const auto *op_name = op < qnn::kGgmlUnaryOpStart ? ggml_op_name(ggml_op(op)) - : ggml_unary_op_name(ggml_unary_op(op - qnn::kGgmlUnaryOpStart)); + const auto *op_name = + op < kGgmlUnaryOpStart ? ggml_op_name(ggml_op(op)) : ggml_unary_op_name(ggml_unary_op(op - kGgmlUnaryOpStart)); auto graph_key = get_graph_key<_InputSize, _OutputSize>(op_name, inputs, outputs); auto it = graph_cache.find(graph_key); qnn::ggml_qnn_graph *graph_ptr = nullptr; @@ -237,7 +246,7 @@ constexpr const char *kGgmlOpToQnnOp[] = { static_assert(sizeof(kGgmlOpToQnnOp) / sizeof(kGgmlOpToQnnOp[0]) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT), "GGML_OP_COUNT does not match the size of the kGgmlOpToQnnOp table"); -static_assert(kGgmlOpToQnnOp[GGML_UNARY_OP_GELU + qnn::kGgmlUnaryOpStart] != nullptr, +static_assert(kGgmlOpToQnnOp[GGML_UNARY_OP_GELU + kGgmlUnaryOpStart] != nullptr, "GGML_UNARY_OP_GELU does not correspond to QNN_OP_GELU"); template @@ -281,10 +290,8 @@ bool qnn_unary_op_impl(ggml_backend_qnn_context *ctx, ggml_tensor *src, ggml_ten return succeed; } -} // namespace - -qnn::ggml_qnn_unary_op_array_t qnn::ggml_qnn_unary_op_array() { - static constexpr const qnn::ggml_qnn_unary_op_t kQnnOpsTable[] = { +ggml_qnn_unary_op_array_t ggml_qnn_unary_op_array() { + static constexpr const ggml_qnn_unary_op_t kQnnOpsTable[] = { nullptr, // GGML_OP_NONE nullptr, // GGML_OP_DUP nullptr, // GGML_OP_ADD @@ -369,19 +376,19 @@ qnn::ggml_qnn_unary_op_array_t qnn::ggml_qnn_unary_op_array() { nullptr, // GGML_OP_CROSS_ENTROPY_LOSS_BACK // ggml_unary_op - nullptr, // GGML_UNARY_OP_ABS - nullptr, // GGML_UNARY_OP_SGN - nullptr, // GGML_UNARY_OP_NEG - nullptr, // GGML_UNARY_OP_STEP - nullptr, // GGML_UNARY_OP_TANH - nullptr, // GGML_UNARY_OP_ELU - nullptr, // GGML_UNARY_OP_RELU - nullptr, // GGML_UNARY_OP_SIGMOID - qnn_unary_op_impl, // GGML_UNARY_OP_GELU - nullptr, // GGML_UNARY_OP_GELU_QUICK - nullptr, // GGML_UNARY_OP_SILU - nullptr, // GGML_UNARY_OP_HARDSWISH - nullptr, // GGML_UNARY_OP_HARDSIGMOID + nullptr, // GGML_UNARY_OP_ABS + nullptr, // GGML_UNARY_OP_SGN + nullptr, // GGML_UNARY_OP_NEG + nullptr, // GGML_UNARY_OP_STEP + nullptr, // GGML_UNARY_OP_TANH + nullptr, // GGML_UNARY_OP_ELU + nullptr, // GGML_UNARY_OP_RELU + nullptr, // GGML_UNARY_OP_SIGMOID + qnn_unary_op_impl, // GGML_UNARY_OP_GELU + nullptr, // GGML_UNARY_OP_GELU_QUICK + nullptr, // GGML_UNARY_OP_SILU + nullptr, // GGML_UNARY_OP_HARDSWISH + nullptr, // GGML_UNARY_OP_HARDSIGMOID }; static_assert(sizeof(kQnnOpsTable) / sizeof(kQnnOpsTable[0]) == 
(GGML_OP_COUNT + GGML_UNARY_OP_COUNT), @@ -389,8 +396,8 @@ qnn::ggml_qnn_unary_op_array_t qnn::ggml_qnn_unary_op_array() { return kQnnOpsTable; } -qnn::ggml_qnn_binary_op_array_t qnn::ggml_qnn_binary_op_array() { - static constexpr const qnn::ggml_qnn_binary_op_t kQnnOpsTable[] = { +ggml_qnn_binary_op_array_t ggml_qnn_binary_op_array() { + static constexpr const ggml_qnn_binary_op_t kQnnOpsTable[] = { nullptr, // GGML_OP_NONE nullptr, // GGML_OP_DUP qnn_binary_op_impl, // GGML_OP_ADD @@ -479,3 +486,67 @@ qnn::ggml_qnn_binary_op_array_t qnn::ggml_qnn_binary_op_array() { "GGML_OP_COUNT does not match the size of the ops table"); return kQnnOpsTable; } + +} // namespace + +namespace qnn { + +bool ggml_qnn_supports_op(const ggml_tensor *op) { + if (op->op == GGML_OP_UNARY) { + if (!ggml_qnn_unary_op_array()[kGgmlUnaryOpStart + ggml_get_unary_op(op)]) { + QNN_LOG_DEBUG("unsupported unary op %d", ggml_get_unary_op(op)); + return false; + } + + if (!op->src[0]) { + QNN_LOG_DEBUG("src0 is nullptr"); + return false; + } + } else if (op->op != GGML_OP_NONE) { + if (!ggml_qnn_unary_op_array()[op->op] && !ggml_qnn_binary_op_array()[op->op]) { + QNN_LOG_DEBUG("unsupported op %d", op->op); + return false; + } + + if (!op->src[0] || !op->src[1]) { + QNN_LOG_DEBUG("src0 or src1 is nullptr"); + return false; + } + } + + switch (op->type) { + case GGML_TYPE_F32: + case GGML_TYPE_F16: + case GGML_TYPE_I8: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q4_0: + break; + default: + QNN_LOG_DEBUG("unsupported src0 type %d", op->src[0]->type); + return false; + } + + return true; +} + +bool ggml_qnn_forward(ggml_backend_qnn_context *ctx, struct ggml_tensor *tensor) { + size_t unary_op_idx = tensor->op; + if (tensor->op == GGML_OP_UNARY) { + unary_op_idx = kGgmlUnaryOpStart + ggml_get_unary_op(tensor); + } + + auto unary_op = ggml_qnn_unary_op_array()[unary_op_idx]; + if (unary_op) { + return unary_op(ctx, tensor->src[0], tensor); + } + + auto binary_op = ggml_qnn_binary_op_array()[tensor->op]; + if (binary_op) { + return binary_op(ctx, tensor->src[0], tensor->src[1], tensor); + } + + QNN_LOG_WARN("unsupported op %d", tensor->op); + return false; +} + +} // namespace qnn diff --git a/ggml/src/ggml-qnn/backend-ops.hpp b/ggml/src/ggml-qnn/backend-ops.hpp index 614bcf651b86b..ed4ce994f787b 100644 --- a/ggml/src/ggml-qnn/backend-ops.hpp +++ b/ggml/src/ggml-qnn/backend-ops.hpp @@ -6,16 +6,7 @@ namespace qnn { -typedef bool (*ggml_qnn_unary_op_t)(ggml_backend_qnn_context *ctx, ggml_tensor *src, ggml_tensor *dst); -typedef bool (*ggml_qnn_binary_op_t)(ggml_backend_qnn_context *ctx, ggml_tensor *src0, ggml_tensor *src1, - ggml_tensor *dst); - -typedef const ggml_qnn_unary_op_t (&ggml_qnn_unary_op_array_t)[GGML_OP_COUNT + GGML_UNARY_OP_COUNT]; -typedef const ggml_qnn_binary_op_t (&ggml_qnn_binary_op_array_t)[GGML_OP_COUNT]; - -constexpr const size_t kGgmlUnaryOpStart = GGML_OP_COUNT; - -ggml_qnn_unary_op_array_t ggml_qnn_unary_op_array(); -ggml_qnn_binary_op_array_t ggml_qnn_binary_op_array(); +bool ggml_qnn_supports_op(const ggml_tensor *op); +bool ggml_qnn_forward(ggml_backend_qnn_context *ctx, struct ggml_tensor *tensor); } // namespace qnn From e0c9b34016d949be19bd65d55fe81b11fc89d327 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Sat, 27 Jul 2024 13:31:57 +0800 Subject: [PATCH 108/143] feat: check if dims equal for add looks qnn add can only applied to matrix with equal dimensions --- ggml/src/ggml-qnn/backend-ops.cpp | 32 +++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git 
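PATCH 107 above routes both supports_op and forward through function-pointer tables indexed by the op id. The following is a generic sketch of that table-dispatch pattern; the handler names and op indices are invented for illustration and are unrelated to the real GGML op enumeration:

    #include <array>
    #include <cstdio>

    using op_handler_t = bool (*)(int src, int dst);

    static bool handle_add(int src, int dst) {
        std::printf("add: %d + %d\n", src, dst);
        return true;
    }

    // nullptr entries mark unsupported ops, mirroring the tables above.
    static const std::array<op_handler_t, 3> kHandlers = { { nullptr, handle_add, nullptr } };

    static bool dispatch(std::size_t op, int src, int dst) {
        if (op >= kHandlers.size() || !kHandlers[op]) {
            return false; // unsupported op
        }
        return kHandlers[op](src, dst);
    }

A single table keeps the "is it supported" check and the execution path in sync, since both consult the same entry.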
a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 20a4178fd2303..fdba6cac24d82 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -43,6 +43,27 @@ bool qnn_is_valid_params(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, return true; } +bool is_tensor_dimensions_equal(const ggml_tensor *l, const ggml_tensor *r) { + const auto dim_l = ggml_n_dims(l); + if (dim_l != ggml_n_dims(r)) { + return false; + } + + for (int i = 0; i < dim_l; i++) { + if (l->ne[i] != r->ne[i]) { + return false; + } + } + + return true; +} + +void print_ggml_tensor(const ggml_tensor *tensor) { + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + tensor->name, tensor->type, ggml_type_name(tensor->type), tensor->ne[0], tensor->ne[1], tensor->ne[2], + tensor->nb[0], tensor->nb[1], tensor->nb[2]); +} + } // namespace #define CHECK_PARAMS(ctx, ...) \ @@ -65,12 +86,6 @@ typedef const ggml_qnn_binary_op_t (&ggml_qnn_binary_op_array_t)[GGML_OP_COUNT]; constexpr const size_t kGgmlUnaryOpStart = GGML_OP_COUNT; -void print_ggml_tensor(const ggml_tensor *tensor) { - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - tensor->name, tensor->type, ggml_type_name(tensor->type), tensor->ne[0], tensor->ne[1], tensor->ne[2], - tensor->nb[0], tensor->nb[1], tensor->nb[2]); -} - template qnn::ggml_tensor_array_t to_ggml_tensor_array(const std::array &array) { return qnn::ggml_tensor_array_t(array.data(), array.data() + _Size); @@ -512,6 +527,11 @@ bool ggml_qnn_supports_op(const ggml_tensor *op) { QNN_LOG_DEBUG("src0 or src1 is nullptr"); return false; } + + if (op->op == GGML_OP_ADD && !is_tensor_dimensions_equal(op->src[0], op->src[1])) { + QNN_LOG_DEBUG("src0 and src1 dimensions are not equal"); + return false; + } } switch (op->type) { From 8ab1f15fe396b6d46fe27cb5039855927cb0639b Mon Sep 17 00:00:00 2001 From: hongruichen Date: Sat, 27 Jul 2024 13:43:07 +0800 Subject: [PATCH 109/143] refactoring: remove internal functions, use op table directly --- ggml/src/ggml-qnn/backend-ops.cpp | 388 +++++++++++++++--------------- 1 file changed, 191 insertions(+), 197 deletions(-) diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index fdba6cac24d82..a560417438c28 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -304,203 +304,197 @@ bool qnn_unary_op_impl(ggml_backend_qnn_context *ctx, ggml_tensor *src, ggml_ten return succeed; } +constexpr const ggml_qnn_unary_op_t kQnnUnaryOpsTable[] = { -ggml_qnn_unary_op_array_t ggml_qnn_unary_op_array() { - static constexpr const ggml_qnn_unary_op_t kQnnOpsTable[] = { - nullptr, // GGML_OP_NONE - nullptr, // GGML_OP_DUP - nullptr, // GGML_OP_ADD - nullptr, // GGML_OP_ADD1 - nullptr, // GGML_OP_ACC - nullptr, // GGML_OP_SUB - nullptr, // GGML_OP_MUL - nullptr, // GGML_OP_DIV - nullptr, // GGML_OP_SQR - qnn_unary_op_impl, // GGML_OP_SQRT - qnn_unary_op_impl, // GGML_OP_LOG - nullptr, // GGML_OP_SUM - nullptr, // GGML_OP_SUM_ROWS - nullptr, // GGML_OP_MEAN - nullptr, // GGML_OP_ARGMAX - nullptr, // GGML_OP_REPEAT - nullptr, // GGML_OP_REPEAT_BACK - nullptr, // GGML_OP_CONCAT - nullptr, // GGML_OP_SILU_BACK - nullptr, // GGML_OP_NORM - nullptr, // GGML_OP_RMS_NORM - nullptr, // GGML_OP_RMS_NORM_BACK - nullptr, // GGML_OP_GROUP_NORM - - nullptr, // GGML_OP_MUL_MAT - nullptr, // GGML_OP_MUL_MAT_ID - nullptr, // GGML_OP_OUT_PROD - - nullptr, 
// GGML_OP_SCALE - nullptr, // GGML_OP_SET - nullptr, // GGML_OP_CPY - nullptr, // GGML_OP_CONT - nullptr, // GGML_OP_RESHAPE - nullptr, // GGML_OP_VIEW - nullptr, // GGML_OP_PERMUTE - nullptr, // GGML_OP_TRANSPOSE - nullptr, // GGML_OP_GET_ROWS - nullptr, // GGML_OP_GET_ROWS_BACK - nullptr, // GGML_OP_DIAG - nullptr, // GGML_OP_DIAG_MASK_INF - nullptr, // GGML_OP_DIAG_MASK_ZERO - nullptr, // GGML_OP_SOFT_MAX - nullptr, // GGML_OP_SOFT_MAX_BACK - nullptr, // GGML_OP_ROPE - nullptr, // GGML_OP_ROPE_BACK - nullptr, // GGML_OP_CLAMP - nullptr, // GGML_OP_CONV_TRANSPOSE_1D - nullptr, // GGML_OP_IM2COL - nullptr, // GGML_OP_CONV_TRANSPOSE_2D - nullptr, // GGML_OP_POOL_1D - nullptr, // GGML_OP_POOL_2D - nullptr, // GGML_OP_UPSCALE - nullptr, // GGML_OP_PAD - nullptr, // GGML_OP_ARANGE - nullptr, // GGML_OP_TIMESTEP_EMBEDDING - nullptr, // GGML_OP_ARGSORT - nullptr, // GGML_OP_LEAKY_RELU - - nullptr, // GGML_OP_FLASH_ATTN_EXT - nullptr, // GGML_OP_FLASH_ATTN_BACK - nullptr, // GGML_OP_SSM_CONV - nullptr, // GGML_OP_SSM_SCAN - nullptr, // GGML_OP_WIN_PART - nullptr, // GGML_OP_WIN_UNPART - nullptr, // GGML_OP_GET_REL_POS - nullptr, // GGML_OP_ADD_REL_POS - - nullptr, // GGML_OP_UNARY - - nullptr, // GGML_OP_MAP_UNARY - nullptr, // GGML_OP_MAP_BINARY - - nullptr, // GGML_OP_MAP_CUSTOM1_F32 - nullptr, // GGML_OP_MAP_CUSTOM2_F32 - nullptr, // GGML_OP_MAP_CUSTOM3_F32 - - nullptr, // GGML_OP_MAP_CUSTOM1 - nullptr, // GGML_OP_MAP_CUSTOM2 - nullptr, // GGML_OP_MAP_CUSTOM3 - - nullptr, // GGML_OP_CROSS_ENTROPY_LOSS - nullptr, // GGML_OP_CROSS_ENTROPY_LOSS_BACK - - // ggml_unary_op - nullptr, // GGML_UNARY_OP_ABS - nullptr, // GGML_UNARY_OP_SGN - nullptr, // GGML_UNARY_OP_NEG - nullptr, // GGML_UNARY_OP_STEP - nullptr, // GGML_UNARY_OP_TANH - nullptr, // GGML_UNARY_OP_ELU - nullptr, // GGML_UNARY_OP_RELU - nullptr, // GGML_UNARY_OP_SIGMOID - qnn_unary_op_impl, // GGML_UNARY_OP_GELU - nullptr, // GGML_UNARY_OP_GELU_QUICK - nullptr, // GGML_UNARY_OP_SILU - nullptr, // GGML_UNARY_OP_HARDSWISH - nullptr, // GGML_UNARY_OP_HARDSIGMOID - }; + nullptr, // GGML_OP_NONE + nullptr, // GGML_OP_DUP + nullptr, // GGML_OP_ADD + nullptr, // GGML_OP_ADD1 + nullptr, // GGML_OP_ACC + nullptr, // GGML_OP_SUB + nullptr, // GGML_OP_MUL + nullptr, // GGML_OP_DIV + nullptr, // GGML_OP_SQR + qnn_unary_op_impl, // GGML_OP_SQRT + qnn_unary_op_impl, // GGML_OP_LOG + nullptr, // GGML_OP_SUM + nullptr, // GGML_OP_SUM_ROWS + nullptr, // GGML_OP_MEAN + nullptr, // GGML_OP_ARGMAX + nullptr, // GGML_OP_REPEAT + nullptr, // GGML_OP_REPEAT_BACK + nullptr, // GGML_OP_CONCAT + nullptr, // GGML_OP_SILU_BACK + nullptr, // GGML_OP_NORM + nullptr, // GGML_OP_RMS_NORM + nullptr, // GGML_OP_RMS_NORM_BACK + nullptr, // GGML_OP_GROUP_NORM - static_assert(sizeof(kQnnOpsTable) / sizeof(kQnnOpsTable[0]) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT), - "GGML_OP_COUNT does not match the size of the kQnnOpsTable table"); - return kQnnOpsTable; -} + nullptr, // GGML_OP_MUL_MAT + nullptr, // GGML_OP_MUL_MAT_ID + nullptr, // GGML_OP_OUT_PROD -ggml_qnn_binary_op_array_t ggml_qnn_binary_op_array() { - static constexpr const ggml_qnn_binary_op_t kQnnOpsTable[] = { - nullptr, // GGML_OP_NONE - nullptr, // GGML_OP_DUP - qnn_binary_op_impl, // GGML_OP_ADD - nullptr, // GGML_OP_ADD1 - nullptr, // GGML_OP_ACC - qnn_binary_op_impl, // GGML_OP_SUB - qnn_binary_op_impl, // GGML_OP_MUL - qnn_binary_op_impl, // GGML_OP_DIV - nullptr, // GGML_OP_SQR - nullptr, // GGML_OP_SQRT - nullptr, // GGML_OP_LOG - nullptr, // GGML_OP_SUM - nullptr, // GGML_OP_SUM_ROWS - nullptr, // 
GGML_OP_MEAN - nullptr, // GGML_OP_ARGMAX - nullptr, // GGML_OP_REPEAT - nullptr, // GGML_OP_REPEAT_BACK - nullptr, // GGML_OP_CONCAT - nullptr, // GGML_OP_SILU_BACK - nullptr, // GGML_OP_NORM - nullptr, // GGML_OP_RMS_NORM - nullptr, // GGML_OP_RMS_NORM_BACK - nullptr, // GGML_OP_GROUP_NORM - - qnn_binary_op_impl, // GGML_OP_MUL_MAT - nullptr, // GGML_OP_MUL_MAT_ID - nullptr, // GGML_OP_OUT_PROD - - nullptr, // GGML_OP_SCALE - nullptr, // GGML_OP_SET - nullptr, // GGML_OP_CPY - nullptr, // GGML_OP_CONT - nullptr, // GGML_OP_RESHAPE - nullptr, // GGML_OP_VIEW - nullptr, // GGML_OP_PERMUTE - nullptr, // GGML_OP_TRANSPOSE - nullptr, // GGML_OP_GET_ROWS - nullptr, // GGML_OP_GET_ROWS_BACK - nullptr, // GGML_OP_DIAG - nullptr, // GGML_OP_DIAG_MASK_INF - nullptr, // GGML_OP_DIAG_MASK_ZERO - nullptr, // GGML_OP_SOFT_MAX - nullptr, // GGML_OP_SOFT_MAX_BACK - nullptr, // GGML_OP_ROPE - nullptr, // GGML_OP_ROPE_BACK - nullptr, // GGML_OP_CLAMP - nullptr, // GGML_OP_CONV_TRANSPOSE_1D - nullptr, // GGML_OP_IM2COL - nullptr, // GGML_OP_CONV_TRANSPOSE_2D - nullptr, // GGML_OP_POOL_1D - nullptr, // GGML_OP_POOL_2D - nullptr, // GGML_OP_UPSCALE - nullptr, // GGML_OP_PAD - nullptr, // GGML_OP_ARANGE - nullptr, // GGML_OP_TIMESTEP_EMBEDDING - nullptr, // GGML_OP_ARGSORT - nullptr, // GGML_OP_LEAKY_RELU - - nullptr, // GGML_OP_FLASH_ATTN_EXT - nullptr, // GGML_OP_FLASH_ATTN_BACK - nullptr, // GGML_OP_SSM_CONV - nullptr, // GGML_OP_SSM_SCAN - nullptr, // GGML_OP_WIN_PART - nullptr, // GGML_OP_WIN_UNPART - nullptr, // GGML_OP_GET_REL_POS - nullptr, // GGML_OP_ADD_REL_POS - - nullptr, // GGML_OP_UNARY - - nullptr, // GGML_OP_MAP_UNARY - nullptr, // GGML_OP_MAP_BINARY - - nullptr, // GGML_OP_MAP_CUSTOM1_F32 - nullptr, // GGML_OP_MAP_CUSTOM2_F32 - nullptr, // GGML_OP_MAP_CUSTOM3_F32 - - nullptr, // GGML_OP_MAP_CUSTOM1 - nullptr, // GGML_OP_MAP_CUSTOM2 - nullptr, // GGML_OP_MAP_CUSTOM3 - - nullptr, // GGML_OP_CROSS_ENTROPY_LOSS - nullptr, // GGML_OP_CROSS_ENTROPY_LOSS_BACK - }; + nullptr, // GGML_OP_SCALE + nullptr, // GGML_OP_SET + nullptr, // GGML_OP_CPY + nullptr, // GGML_OP_CONT + nullptr, // GGML_OP_RESHAPE + nullptr, // GGML_OP_VIEW + nullptr, // GGML_OP_PERMUTE + nullptr, // GGML_OP_TRANSPOSE + nullptr, // GGML_OP_GET_ROWS + nullptr, // GGML_OP_GET_ROWS_BACK + nullptr, // GGML_OP_DIAG + nullptr, // GGML_OP_DIAG_MASK_INF + nullptr, // GGML_OP_DIAG_MASK_ZERO + nullptr, // GGML_OP_SOFT_MAX + nullptr, // GGML_OP_SOFT_MAX_BACK + nullptr, // GGML_OP_ROPE + nullptr, // GGML_OP_ROPE_BACK + nullptr, // GGML_OP_CLAMP + nullptr, // GGML_OP_CONV_TRANSPOSE_1D + nullptr, // GGML_OP_IM2COL + nullptr, // GGML_OP_CONV_TRANSPOSE_2D + nullptr, // GGML_OP_POOL_1D + nullptr, // GGML_OP_POOL_2D + nullptr, // GGML_OP_UPSCALE + nullptr, // GGML_OP_PAD + nullptr, // GGML_OP_ARANGE + nullptr, // GGML_OP_TIMESTEP_EMBEDDING + nullptr, // GGML_OP_ARGSORT + nullptr, // GGML_OP_LEAKY_RELU - static_assert(sizeof(kQnnOpsTable) / sizeof(kQnnOpsTable[0]) == GGML_OP_COUNT, - "GGML_OP_COUNT does not match the size of the ops table"); - return kQnnOpsTable; -} + nullptr, // GGML_OP_FLASH_ATTN_EXT + nullptr, // GGML_OP_FLASH_ATTN_BACK + nullptr, // GGML_OP_SSM_CONV + nullptr, // GGML_OP_SSM_SCAN + nullptr, // GGML_OP_WIN_PART + nullptr, // GGML_OP_WIN_UNPART + nullptr, // GGML_OP_GET_REL_POS + nullptr, // GGML_OP_ADD_REL_POS + + nullptr, // GGML_OP_UNARY + + nullptr, // GGML_OP_MAP_UNARY + nullptr, // GGML_OP_MAP_BINARY + + nullptr, // GGML_OP_MAP_CUSTOM1_F32 + nullptr, // GGML_OP_MAP_CUSTOM2_F32 + nullptr, // GGML_OP_MAP_CUSTOM3_F32 + + 
nullptr, // GGML_OP_MAP_CUSTOM1 + nullptr, // GGML_OP_MAP_CUSTOM2 + nullptr, // GGML_OP_MAP_CUSTOM3 + + nullptr, // GGML_OP_CROSS_ENTROPY_LOSS + nullptr, // GGML_OP_CROSS_ENTROPY_LOSS_BACK + + // ggml_unary_op + nullptr, // GGML_UNARY_OP_ABS + nullptr, // GGML_UNARY_OP_SGN + nullptr, // GGML_UNARY_OP_NEG + nullptr, // GGML_UNARY_OP_STEP + nullptr, // GGML_UNARY_OP_TANH + nullptr, // GGML_UNARY_OP_ELU + nullptr, // GGML_UNARY_OP_RELU + nullptr, // GGML_UNARY_OP_SIGMOID + qnn_unary_op_impl, // GGML_UNARY_OP_GELU + nullptr, // GGML_UNARY_OP_GELU_QUICK + nullptr, // GGML_UNARY_OP_SILU + nullptr, // GGML_UNARY_OP_HARDSWISH + nullptr, // GGML_UNARY_OP_HARDSIGMOID +}; + +static_assert(sizeof(kQnnUnaryOpsTable) / sizeof(kQnnUnaryOpsTable[0]) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT), + "GGML_OP_COUNT does not match the size of the kQnnUnaryOpsTable table"); + +static constexpr const ggml_qnn_binary_op_t kQnnBinaryOpsTable[] = { + nullptr, // GGML_OP_NONE + nullptr, // GGML_OP_DUP + qnn_binary_op_impl, // GGML_OP_ADD + nullptr, // GGML_OP_ADD1 + nullptr, // GGML_OP_ACC + qnn_binary_op_impl, // GGML_OP_SUB + qnn_binary_op_impl, // GGML_OP_MUL + qnn_binary_op_impl, // GGML_OP_DIV + nullptr, // GGML_OP_SQR + nullptr, // GGML_OP_SQRT + nullptr, // GGML_OP_LOG + nullptr, // GGML_OP_SUM + nullptr, // GGML_OP_SUM_ROWS + nullptr, // GGML_OP_MEAN + nullptr, // GGML_OP_ARGMAX + nullptr, // GGML_OP_REPEAT + nullptr, // GGML_OP_REPEAT_BACK + nullptr, // GGML_OP_CONCAT + nullptr, // GGML_OP_SILU_BACK + nullptr, // GGML_OP_NORM + nullptr, // GGML_OP_RMS_NORM + nullptr, // GGML_OP_RMS_NORM_BACK + nullptr, // GGML_OP_GROUP_NORM + + qnn_binary_op_impl, // GGML_OP_MUL_MAT + nullptr, // GGML_OP_MUL_MAT_ID + nullptr, // GGML_OP_OUT_PROD + + nullptr, // GGML_OP_SCALE + nullptr, // GGML_OP_SET + nullptr, // GGML_OP_CPY + nullptr, // GGML_OP_CONT + nullptr, // GGML_OP_RESHAPE + nullptr, // GGML_OP_VIEW + nullptr, // GGML_OP_PERMUTE + nullptr, // GGML_OP_TRANSPOSE + nullptr, // GGML_OP_GET_ROWS + nullptr, // GGML_OP_GET_ROWS_BACK + nullptr, // GGML_OP_DIAG + nullptr, // GGML_OP_DIAG_MASK_INF + nullptr, // GGML_OP_DIAG_MASK_ZERO + nullptr, // GGML_OP_SOFT_MAX + nullptr, // GGML_OP_SOFT_MAX_BACK + nullptr, // GGML_OP_ROPE + nullptr, // GGML_OP_ROPE_BACK + nullptr, // GGML_OP_CLAMP + nullptr, // GGML_OP_CONV_TRANSPOSE_1D + nullptr, // GGML_OP_IM2COL + nullptr, // GGML_OP_CONV_TRANSPOSE_2D + nullptr, // GGML_OP_POOL_1D + nullptr, // GGML_OP_POOL_2D + nullptr, // GGML_OP_UPSCALE + nullptr, // GGML_OP_PAD + nullptr, // GGML_OP_ARANGE + nullptr, // GGML_OP_TIMESTEP_EMBEDDING + nullptr, // GGML_OP_ARGSORT + nullptr, // GGML_OP_LEAKY_RELU + + nullptr, // GGML_OP_FLASH_ATTN_EXT + nullptr, // GGML_OP_FLASH_ATTN_BACK + nullptr, // GGML_OP_SSM_CONV + nullptr, // GGML_OP_SSM_SCAN + nullptr, // GGML_OP_WIN_PART + nullptr, // GGML_OP_WIN_UNPART + nullptr, // GGML_OP_GET_REL_POS + nullptr, // GGML_OP_ADD_REL_POS + + nullptr, // GGML_OP_UNARY + + nullptr, // GGML_OP_MAP_UNARY + nullptr, // GGML_OP_MAP_BINARY + + nullptr, // GGML_OP_MAP_CUSTOM1_F32 + nullptr, // GGML_OP_MAP_CUSTOM2_F32 + nullptr, // GGML_OP_MAP_CUSTOM3_F32 + + nullptr, // GGML_OP_MAP_CUSTOM1 + nullptr, // GGML_OP_MAP_CUSTOM2 + nullptr, // GGML_OP_MAP_CUSTOM3 + + nullptr, // GGML_OP_CROSS_ENTROPY_LOSS + nullptr, // GGML_OP_CROSS_ENTROPY_LOSS_BACK +}; + +static_assert(sizeof(kQnnBinaryOpsTable) / sizeof(kQnnBinaryOpsTable[0]) == GGML_OP_COUNT, + "GGML_OP_COUNT does not match the size of the kQnnBinaryOpsTable table"); } // namespace @@ -508,7 +502,7 @@ namespace qnn { bool 
ggml_qnn_supports_op(const ggml_tensor *op) { if (op->op == GGML_OP_UNARY) { - if (!ggml_qnn_unary_op_array()[kGgmlUnaryOpStart + ggml_get_unary_op(op)]) { + if (!kQnnUnaryOpsTable[kGgmlUnaryOpStart + ggml_get_unary_op(op)]) { QNN_LOG_DEBUG("unsupported unary op %d", ggml_get_unary_op(op)); return false; } @@ -518,7 +512,7 @@ bool ggml_qnn_supports_op(const ggml_tensor *op) { return false; } } else if (op->op != GGML_OP_NONE) { - if (!ggml_qnn_unary_op_array()[op->op] && !ggml_qnn_binary_op_array()[op->op]) { + if (!kQnnUnaryOpsTable[op->op] && !kQnnBinaryOpsTable[op->op]) { QNN_LOG_DEBUG("unsupported op %d", op->op); return false; } @@ -555,12 +549,12 @@ bool ggml_qnn_forward(ggml_backend_qnn_context *ctx, struct ggml_tensor *tensor) unary_op_idx = kGgmlUnaryOpStart + ggml_get_unary_op(tensor); } - auto unary_op = ggml_qnn_unary_op_array()[unary_op_idx]; + auto unary_op = kQnnUnaryOpsTable[unary_op_idx]; if (unary_op) { return unary_op(ctx, tensor->src[0], tensor); } - auto binary_op = ggml_qnn_binary_op_array()[tensor->op]; + auto binary_op = kQnnBinaryOpsTable[tensor->op]; if (binary_op) { return binary_op(ctx, tensor->src[0], tensor->src[1], tensor); } From e33b5c983749287ed06818b1d966b354b2ba6dc9 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Sat, 27 Jul 2024 13:49:49 +0800 Subject: [PATCH 110/143] refactoring: print the name of unsupport op --- ggml/src/ggml-qnn/backend-ops.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index a560417438c28..89def7ec636d8 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -559,7 +559,7 @@ bool ggml_qnn_forward(ggml_backend_qnn_context *ctx, struct ggml_tensor *tensor) return binary_op(ctx, tensor->src[0], tensor->src[1], tensor); } - QNN_LOG_WARN("unsupported op %d", tensor->op); + QNN_LOG_WARN("unsupported op %s", ggml_op_desc(tensor)); return false; } From 1f9d2a7e22e902ced1df6842c4e9b769435a5b98 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Sun, 28 Jul 2024 22:05:51 +0800 Subject: [PATCH 111/143] refactoring: improve tensor print --- ggml/src/ggml-qnn/backend-ops.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 89def7ec636d8..4d83fd5d1a9c6 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -59,9 +59,9 @@ bool is_tensor_dimensions_equal(const ggml_tensor *l, const ggml_tensor *r) { } void print_ggml_tensor(const ggml_tensor *tensor) { - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - tensor->name, tensor->type, ggml_type_name(tensor->type), tensor->ne[0], tensor->ne[1], tensor->ne[2], - tensor->nb[0], tensor->nb[1], tensor->nb[2]); + QNN_LOG_DEBUG("%s: type:%s ne: %ldx%ldx%ldx%ld, nb: %ldx%ldx%ldx%ld\n", tensor->name, ggml_type_name(tensor->type), + (long)tensor->ne[0], (long)tensor->ne[1], (long)tensor->ne[2], (long)tensor->ne[3], + (long)tensor->nb[0], (long)tensor->nb[1], (long)tensor->nb[2], (long)tensor->nb[3]); } } // namespace From 6da82947df06f182f93829e849c0ac33f68a10ea Mon Sep 17 00:00:00 2001 From: hongruichen Date: Mon, 29 Jul 2024 15:51:54 +0800 Subject: [PATCH 112/143] refactoring: set the default qnn lib search path at CMakeLists.txt by GGML_QNN_DEFAULT_LIB_SEARCH_PATH --- ggml/include/ggml-qnn.h | 8 +++----- ggml/src/CMakeLists.txt | 2 ++ ggml/src/ggml-qnn.cpp | 35 
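PATCH 108 above rejects GGML_OP_ADD when the two source tensors differ in shape. A standalone sketch of that guard, where std::array<long, 4> stands in for ggml's ne[] field and is an assumption of this example:

    #include <array>
    #include <cstddef>

    static bool dims_equal(const std::array<long, 4> &a, const std::array<long, 4> &b) {
        for (std::size_t i = 0; i < a.size(); ++i) {
            if (a[i] != b[i]) {
                return false; // element-wise ops like ADD need matching shapes here
            }
        }
        return true;
    }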
+++++++++++++---------------------- ggml/src/ggml-qnn/logger.cpp | 4 ++-- src/llama.cpp | 19 ++++++++----------- 5 files changed, 28 insertions(+), 40 deletions(-) diff --git a/ggml/include/ggml-qnn.h b/ggml/include/ggml-qnn.h index 2433af1668408..b8c7da8fbbf87 100644 --- a/ggml/include/ggml-qnn.h +++ b/ggml/include/ggml-qnn.h @@ -20,13 +20,11 @@ enum QNNBackend { /** * - * @param device 0: QNN_BACKEND_CPU 1: QNN_BACKEND_GPU 2: - * QNN_BACKEND_NPU - * @param qnn_lib_path qnn library path, such as "/data/local/tmp/" on - * Android or specified in JNI layer + * @param device 0: QNN_BACKEND_CPU 1: QNN_BACKEND_GPU 2:QNN_BACKEND_NPU + * @param extend_lib_search_path extened lib search path for searching QNN backend dynamic libs * @return */ -GGML_API ggml_backend_t ggml_backend_qnn_init(size_t dev_num, const char *qnn_lib_path); +GGML_API ggml_backend_t ggml_backend_qnn_init(size_t dev_num, const char *extend_lib_search_path); GGML_API bool ggml_backend_is_qnn(ggml_backend_t backend); diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index dffff42e7e530..59a7014dbeff3 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -889,10 +889,12 @@ if (GGML_QNN) find_library(LOG_LIB log) find_library(ANDROID_LIB android) set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${LOG_LIB} ${ANDROID_LIB}) + set(GGML_QNN_DEFAULT_LIB_SEARCH_PATH "\"/data/local/tmp/\"") else() message(FATAL_ERROR "QNN now only available on Android") endif() + add_compile_definitions(GGML_QNN_DEFAULT_LIB_SEARCH_PATH=${GGML_QNN_DEFAULT_LIB_SEARCH_PATH}) if (NOT DEFINED GGML_QNN_SDK_PATH) # try read from environment variable if (DEFINED ENV{QNN_SDK_PATH}) diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index 22b57b1758a54..6ed5ecb2e03f2 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -319,15 +319,8 @@ static ggml_guid_t ggml_backend_qnn_guid() { return &guid; } -static ggml_backend_t ggml_backend_qnn_reg_init(const char *params, void *user_data) { - if (nullptr == params) { - // QNN library path - // can be hardcoded to "/data/local/tmp/" for Android command line application - // or specified in JNI layer for Android APK - params = "/data/local/tmp/"; - } - ggml_backend_t qnn_backend = ggml_backend_qnn_init((int)(intptr_t)user_data, params); - +static ggml_backend_t ggml_backend_qnn_reg_init(const char *extend_lib_search_path, void *user_data) { + ggml_backend_t qnn_backend = ggml_backend_qnn_init((int)(intptr_t)user_data, extend_lib_search_path); return qnn_backend; } @@ -390,28 +383,25 @@ ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t device) { return &ggml_backend_qnn_buffer_types[device]; } -/** - * - * @param device 0: QNN_BACKEND_CPU 1: QNN_BACKEND_GPU 2: QNN_BACKEND_NPU - * @param qnn_lib_path qnn library path, such as "/data/local/tmp/" on Android or specified in JNI layer - * @return - */ -ggml_backend_t ggml_backend_qnn_init(size_t device, const char *qnn_lib_path) { +ggml_backend_t ggml_backend_qnn_init(size_t device, const char *extend_lib_search_path) { int result = 0; - if (nullptr == qnn_lib_path) { - QNN_LOG_ERROR("invalid qnn lib path\n"); - return nullptr; + if (!extend_lib_search_path) { + extend_lib_search_path = GGML_QNN_DEFAULT_LIB_SEARCH_PATH; + QNN_LOG_WARN("extend_lib_search_path is nullptr, will use " GGML_QNN_DEFAULT_LIB_SEARCH_PATH " as default"); } QNN_LOG_DEBUG("device %d", device); - QNN_LOG_DEBUG("qnn_lib_path %s", qnn_lib_path); + QNN_LOG_DEBUG("extend_lib_search_path %s", extend_lib_search_path); if (device >= GGML_QNN_MAX_DEVICES) { 
QNN_LOG_ERROR("invalid device %d", device); return nullptr; } - std::string path = qnn_lib_path; + std::string path = extend_lib_search_path; + +// TODO: Fix this for other platforms +#if defined(__ANDROID__) || defined(ANDROID) if (QNN_BACKEND_NPU == device) { if (0 == setenv("LD_LIBRARY_PATH", (path + ":/vendor/dsp/cdsp:/vendor/lib64:/vendor/dsp/" @@ -438,8 +428,9 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char *qnn_lib_path) { QNN_LOG_ERROR("%s backend setenv failure\n", qnn::get_backend_name(device)); } } +#endif - auto instance = std::make_shared(qnn_lib_path, g_qnn_mgr[device].lib, ""); + auto instance = std::make_shared(extend_lib_search_path, g_qnn_mgr[device].lib, ""); result = instance->qnn_init(nullptr); if (result != 0) { QNN_LOG_WARN("init qnn subsystem failed with qnn backend %s, pls check why\n", qnn::get_backend_name(device)); diff --git a/ggml/src/ggml-qnn/logger.cpp b/ggml/src/ggml-qnn/logger.cpp index fc37161edba17..187e9088c779c 100644 --- a/ggml/src/ggml-qnn/logger.cpp +++ b/ggml/src/ggml-qnn/logger.cpp @@ -5,7 +5,7 @@ #include -#if (defined __ANDROID__) || (defined ANDROID) +#if defined(__ANDROID__) || defined(ANDROID) #include #endif @@ -22,7 +22,7 @@ void qnn::internal_log(ggml_log_level level, const char * /*file*/, const char * int len_prefix = snprintf(s_qnn_internal_log_buf, QNN_LOGBUF_LEN, "[%s, %d]: ", func, line); int len = vsnprintf(s_qnn_internal_log_buf + len_prefix, QNN_LOGBUF_LEN - len_prefix, format, args); if (len < (QNN_LOGBUF_LEN - len_prefix)) { -#if (defined __ANDROID__) || (defined ANDROID) +#if defined(__ANDROID__) || defined(ANDROID) // for Android APK __android_log_print(level, "ggml-qnn", "%s\n", s_qnn_internal_log_buf); #endif diff --git a/src/llama.cpp b/src/llama.cpp index 670c5c83707dd..82d52ca84ecfd 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -16706,18 +16706,15 @@ struct llama_context * llama_new_context_with_model( } } #elif defined(GGML_USE_QNN) - if (model->n_gpu_layers > 0) { - //the second param is data path of prebuit QNN libs provided by Qualcomm - //can be hardcoded to "/data/local/tmp/" for Android command line application - //or specified in JNI layer for Android APK application - ggml_backend_t backend = ggml_backend_qnn_init(model->main_gpu, "/data/local/tmp/"); - if (nullptr == backend) { - LLAMA_LOG_ERROR("%s: failed to initialize QNN backend\n", __func__); - llama_free(ctx); - return nullptr; - } - ctx->backends.push_back(backend); + if (model->n_gpu_layers > 0) { + ggml_backend_t backend = ggml_backend_qnn_init(model->main_gpu, nullptr); + if (nullptr == backend) { + LLAMA_LOG_ERROR("%s: failed to initialize QNN backend\n", __func__); + llama_free(ctx); + return nullptr; } + ctx->backends.push_back(backend); + } #endif #ifdef GGML_USE_BLAS From 9a5f802bb6763ef51866687c97c4b936e33242d6 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Mon, 29 Jul 2024 22:18:48 +0800 Subject: [PATCH 113/143] refactoring: add convient macro to disable copy and move of class --- ggml/src/ggml-qnn/graph.hpp | 6 ++---- ggml/src/ggml-qnn/qnn-types.hpp | 8 ++++++++ ggml/src/ggml-qnn/tensor.hpp | 6 ++---- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/ggml/src/ggml-qnn/graph.hpp b/ggml/src/ggml-qnn/graph.hpp index c82b7d66ae1cf..01190e18346c7 100644 --- a/ggml/src/ggml-qnn/graph.hpp +++ b/ggml/src/ggml-qnn/graph.hpp @@ -225,10 +225,8 @@ class ggml_qnn_graph { std::vector _qnn_tensor_outputs; std::vector _param_types; - ggml_qnn_graph(const ggml_qnn_graph &) = delete; - void operator=(const ggml_qnn_graph &) = 
delete; - ggml_qnn_graph(ggml_qnn_graph &&) = delete; - void operator=(ggml_qnn_graph &&) = delete; + DISABLE_COPY(ggml_qnn_graph); + DISABLE_MOVE(ggml_qnn_graph); }; } // namespace qnn diff --git a/ggml/src/ggml-qnn/qnn-types.hpp b/ggml/src/ggml-qnn/qnn-types.hpp index 58ca8648b0b03..8fce790defb61 100644 --- a/ggml/src/ggml-qnn/qnn-types.hpp +++ b/ggml/src/ggml-qnn/qnn-types.hpp @@ -51,3 +51,11 @@ using pfn_qnnsysteminterface_getproviders = decltype(QnnSystemInterface_getProvi #define RPCMEM_DEFAULT_FLAGS 1 #define RPCMEM_HEAP_ID_SYSTEM 25 + +#define DISABLE_COPY(class_name) \ + class_name(const class_name &) = delete; \ + void operator=(const class_name &) = delete + +#define DISABLE_MOVE(class_name) \ + class_name(class_name &&) = delete; \ + void operator=(class_name &&) = delete diff --git a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp index c4ea7a4095d5f..07fbfde7828a7 100644 --- a/ggml/src/ggml-qnn/tensor.hpp +++ b/ggml/src/ggml-qnn/tensor.hpp @@ -193,10 +193,8 @@ class ggml_qnn_tensor { Qnn_GraphHandle_t _graph_handle = nullptr; std::unique_ptr _qnn_rpc_buffer; - ggml_qnn_tensor(const ggml_qnn_tensor &) = delete; - void operator=(const ggml_qnn_tensor &) = delete; - ggml_qnn_tensor(ggml_qnn_tensor &&) = delete; - void operator=(ggml_qnn_tensor &&) = delete; + DISABLE_COPY(ggml_qnn_tensor); + DISABLE_MOVE(ggml_qnn_tensor); }; } // namespace qnn From 74eb05a13b1c624efa4ba24eaa46839461a3e62d Mon Sep 17 00:00:00 2001 From: hongruichen Date: Mon, 29 Jul 2024 23:12:51 +0800 Subject: [PATCH 114/143] feat: add ggml_qnn_op_config for handle different op --- ggml/src/ggml-qnn/backend-ops.cpp | 88 +++++++++++++++++++------------ ggml/src/ggml-qnn/graph.hpp | 48 +++++++---------- ggml/src/ggml-qnn/op-config.hpp | 73 +++++++++++++++++++++++++ ggml/src/ggml-qnn/tensor.hpp | 2 - 4 files changed, 146 insertions(+), 65 deletions(-) create mode 100644 ggml/src/ggml-qnn/op-config.hpp diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 4d83fd5d1a9c6..d264ec766f808 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -5,6 +5,7 @@ #include "graph.hpp" #include "logger.hpp" +#include "op-config.hpp" #include "tensor.hpp" #include "utils.hpp" @@ -123,40 +124,22 @@ std::string get_graph_key(const std::string &op_name, const std::array -qnn::ggml_qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_context *ctx, size_t op, const std::string &qnn_op, - const std::array &inputs, - const std::array &outputs) { - GGML_ASSERT(op < (GGML_OP_COUNT + GGML_UNARY_OP_COUNT)); - - auto &graph_cache = ctx->qnn_graph_cache; - const auto *op_name = - op < kGgmlUnaryOpStart ? 
ggml_op_name(ggml_op(op)) : ggml_unary_op_name(ggml_unary_op(op - kGgmlUnaryOpStart)); - auto graph_key = get_graph_key<_InputSize, _OutputSize>(op_name, inputs, outputs); - auto it = graph_cache.find(graph_key); - qnn::ggml_qnn_graph *graph_ptr = nullptr; - if (it != graph_cache.end()) { - QNN_LOG_DEBUG("found graph %s in cache\n", graph_key.c_str()); - graph_ptr = it->second.get(); - } else { - auto graph = std::make_unique(graph_key, (QNNBackend)(ctx->device), ctx->instance, - ctx->socinfo.vtcm_size_in_mb); - - if (!graph->is_valid()) { - return nullptr; - } - - if (!graph->build_graph(qnn_op, to_ggml_tensor_array<_InputSize>(inputs), - to_ggml_tensor_array<_OutputSize>(outputs))) { - QNN_LOG_ERROR("build_graph failed\n"); - return nullptr; - } - - graph_ptr = graph.get(); - graph_cache[graph_key] = std::move(graph); +qnn::ggml_op_constructor_t generate_common_op_constructor(const std::string &op_name) { + if (op_name == QNN_OP_MAT_MUL) { + // For QNN_OP_MAT_MUL, we need to transpose the input tensor + return [](const std::string &name) { + auto config = std::make_unique(name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL); + Qnn_Scalar_t scalar = QNN_SCALAR_INIT; + scalar.dataType = QNN_DATATYPE_BOOL_8; + scalar.bool8Value = true; + config->add_scalar_param(QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN0, scalar); + return config; + }; } - return graph_ptr; + return [op_name](const std::string &name) { + return std::make_unique(name, QNN_OP_PACKAGE_NAME_QTI_AISW, op_name); + }; } constexpr const char *kGgmlOpToQnnOp[] = { @@ -264,6 +247,42 @@ static_assert(sizeof(kGgmlOpToQnnOp) / sizeof(kGgmlOpToQnnOp[0]) == (GGML_OP_COU static_assert(kGgmlOpToQnnOp[GGML_UNARY_OP_GELU + kGgmlUnaryOpStart] != nullptr, "GGML_UNARY_OP_GELU does not correspond to QNN_OP_GELU"); +template +qnn::ggml_qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_context *ctx, size_t op, + const std::array &inputs, + const std::array &outputs) { + GGML_ASSERT(op < (GGML_OP_COUNT + GGML_UNARY_OP_COUNT)); + + auto &graph_cache = ctx->qnn_graph_cache; + const auto *op_name = + op < kGgmlUnaryOpStart ? 
ggml_op_name(ggml_op(op)) : ggml_unary_op_name(ggml_unary_op(op - kGgmlUnaryOpStart)); + auto graph_key = get_graph_key<_InputSize, _OutputSize>(op_name, inputs, outputs); + auto it = graph_cache.find(graph_key); + qnn::ggml_qnn_graph *graph_ptr = nullptr; + if (it != graph_cache.end()) { + QNN_LOG_DEBUG("found graph %s in cache\n", graph_key.c_str()); + graph_ptr = it->second.get(); + } else { + auto graph = std::make_unique(graph_key, (QNNBackend)(ctx->device), ctx->instance, + ctx->socinfo.vtcm_size_in_mb); + if (!graph->is_valid()) { + return nullptr; + } + + auto op_constructor = generate_common_op_constructor(kGgmlOpToQnnOp[op]); + if (!graph->build_graph(op_constructor, to_ggml_tensor_array<_InputSize>(inputs), + to_ggml_tensor_array<_OutputSize>(outputs))) { + QNN_LOG_ERROR("build_graph failed\n"); + return nullptr; + } + + graph_ptr = graph.get(); + graph_cache[graph_key] = std::move(graph); + } + + return graph_ptr; +} + template bool qnn_binary_op_impl(ggml_backend_qnn_context *ctx, ggml_tensor *src0, ggml_tensor *src1, ggml_tensor *dst) { static_assert(kGgmlOpToQnnOp[_GgmlOp] != nullptr, "GGML_OP does not have a corresponding QNN_OP"); @@ -271,7 +290,7 @@ bool qnn_binary_op_impl(ggml_backend_qnn_context *ctx, ggml_tensor *src0, ggml_t CHECK_PARAMS(ctx, src0, src1, dst); bool succeed = false; - auto *graph_ptr = get_qnn_graph_from_cache<2, 1>(ctx, _GgmlOp, kGgmlOpToQnnOp[_GgmlOp], { src0, src1 }, { dst }); + auto *graph_ptr = get_qnn_graph_from_cache<2, 1>(ctx, _GgmlOp, { src0, src1 }, { dst }); if (graph_ptr) { succeed = execute_graph<2, 1>(graph_ptr, { src0, src1 }, { dst }); } @@ -292,7 +311,7 @@ bool qnn_unary_op_impl(ggml_backend_qnn_context *ctx, ggml_tensor *src, ggml_ten CHECK_PARAMS(ctx, src, dst); bool succeed = false; - auto *graph_ptr = get_qnn_graph_from_cache<1, 1>(ctx, _GgmlOp, kGgmlOpToQnnOp[_GgmlOp], { src }, { dst }); + auto *graph_ptr = get_qnn_graph_from_cache<1, 1>(ctx, _GgmlOp, { src }, { dst }); if (graph_ptr) { succeed = execute_graph<1, 1>(graph_ptr, { src }, { dst }); } @@ -305,7 +324,6 @@ bool qnn_unary_op_impl(ggml_backend_qnn_context *ctx, ggml_tensor *src, ggml_ten return succeed; } constexpr const ggml_qnn_unary_op_t kQnnUnaryOpsTable[] = { - nullptr, // GGML_OP_NONE nullptr, // GGML_OP_DUP nullptr, // GGML_OP_ADD diff --git a/ggml/src/ggml-qnn/graph.hpp b/ggml/src/ggml-qnn/graph.hpp index 01190e18346c7..1beb4b31b0c77 100644 --- a/ggml/src/ggml-qnn/graph.hpp +++ b/ggml/src/ggml-qnn/graph.hpp @@ -2,18 +2,22 @@ #pragma once #include +#include #include +#include #include #include "ggml-qnn.h" #include "logger.hpp" +#include "op-config.hpp" #include "qnn-lib.hpp" #include "tensor.hpp" namespace qnn { using ggml_tensor_array_t = std::vector; +using ggml_op_constructor_t = std::function(const std::string &)>; class ggml_qnn_graph { public: @@ -79,15 +83,15 @@ class ggml_qnn_graph { ~ggml_qnn_graph() { QNN_LOG_DEBUG("graph name %s, destroy", _graph_name.c_str()); } - bool build_graph(const std::string &op_name, const ggml_tensor_array_t &tensor_inputs, + bool build_graph(ggml_op_constructor_t op_constructor, const ggml_tensor_array_t &tensor_inputs, const ggml_tensor_array_t &tensor_outputs) { + GGML_ASSERT(op_constructor); if (!is_valid()) { QNN_LOG_ERROR("Invalid graph\n"); return false; } QNN_LOG_DEBUG("graph name %s, build_graph start", _graph_name.c_str()); - _qnn_tensor_inputs.resize(tensor_inputs.size()); _tensor_inputs.resize(tensor_inputs.size()); for (size_t i = 0; i < tensor_inputs.size(); i++) { char buffer[GGML_MAX_NAME] = {}; @@ -100,11 
+104,9 @@ class ggml_qnn_graph { return false; } - _qnn_tensor_inputs[i] = qnn_tensor->get_qnn_tensor(); _tensor_inputs[i] = qnn_tensor; } - _qnn_tensor_outputs.resize(tensor_outputs.size()); _tensor_outputs.resize(tensor_outputs.size()); for (size_t i = 0; i < tensor_outputs.size(); i++) { char buffer[GGML_MAX_NAME] = {}; @@ -117,23 +119,13 @@ class ggml_qnn_graph { return false; } - _qnn_tensor_outputs[i] = qnn_tensor->get_qnn_tensor(); _tensor_outputs[i] = qnn_tensor; } - Qnn_OpConfig_t config = QNN_OPCONFIG_INIT; - config.version = QNN_OPCONFIG_VERSION_1; - auto &op_config = config.v1; - op_config.name = _graph_name.c_str(); - op_config.packageName = QNN_OP_PACKAGE_NAME_QTI_AISW; - op_config.typeName = op_name.c_str(); - op_config.numOfParams = (uint32_t)_param_types.size(); - op_config.params = _param_types.data(); - op_config.numOfInputs = (uint32_t)_qnn_tensor_inputs.size(); - op_config.inputTensors = _qnn_tensor_inputs.data(); - op_config.numOfOutputs = (uint32_t)_qnn_tensor_outputs.size(); - op_config.outputTensors = _qnn_tensor_outputs.data(); - auto error = _qnn_interface->qnn_graph_add_node(_graph_handle, config); + _op_config = op_constructor(_graph_name); + _op_config->set_input_tensors(_tensor_inputs); + _op_config->set_output_tensors(_tensor_outputs); + auto error = _qnn_interface->qnn_graph_add_node(_graph_handle, _op_config->get_op_config()); if (error != QNN_SUCCESS) { auto *error_str = get_qnn_error_string(error); if (error_str) { @@ -168,8 +160,6 @@ class ggml_qnn_graph { QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor)); return false; } - - _qnn_tensor_inputs[i] = _tensor_inputs[i]->get_qnn_tensor(); } for (size_t i = 0; i < tensor_outputs.size(); i++) { @@ -178,13 +168,16 @@ class ggml_qnn_graph { QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor)); return false; } - - _qnn_tensor_outputs[i] = _tensor_outputs[i]->get_qnn_tensor(); } + _op_config->set_input_tensors(_tensor_inputs); + _op_config->set_output_tensors(_tensor_outputs); + auto &qnn_tensor_inputs = _op_config->get_qnn_input_tensors(); + auto &qnn_tensor_outputs = _op_config->get_qnn_output_tensors(); + auto error = - _qnn_interface->qnn_graph_execute(_graph_handle, _qnn_tensor_inputs.data(), _qnn_tensor_inputs.size(), - _qnn_tensor_outputs.data(), _qnn_tensor_outputs.size(), nullptr, nullptr); + _qnn_interface->qnn_graph_execute(_graph_handle, qnn_tensor_inputs.data(), qnn_tensor_inputs.size(), + qnn_tensor_outputs.data(), qnn_tensor_outputs.size(), nullptr, nullptr); if (_device == QNN_BACKEND_NPU) { if (error == QNN_COMMON_ERROR_SYSTEM_COMMUNICATION) { QNN_LOG_WARN("NPU crashed. SSR detected. 
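PATCH 114 above wraps op construction in ggml_qnn_op_config and selects a constructor per op, for example enabling transpose on the first MatMul input. A simplified sketch of that factory-lambda pattern; OpConfig and make_constructor are stand-ins for this example, not the real ggml-qnn classes:

    #include <functional>
    #include <memory>
    #include <string>

    struct OpConfig {
        std::string name;
        std::string type;
        bool transpose_in0 = false;
    };

    using op_constructor_t = std::function<std::unique_ptr<OpConfig>(const std::string &)>;

    static op_constructor_t make_constructor(const std::string &op_type) {
        return [op_type](const std::string &name) {
            auto config = std::make_unique<OpConfig>();
            config->name = name;
            config->type = op_type;
            if (op_type == "MatMul") {
                config->transpose_in0 = true; // mirrors the MAT_MUL special case above
            }
            return config;
        };
    }

Keeping op-specific parameters inside the constructor lets the graph-building code stay generic: it only asks the factory for a configured op, regardless of which operator it is.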
Caused QNN graph execute error\n"); @@ -219,10 +212,9 @@ class ggml_qnn_graph { Qnn_GraphHandle_t _graph_handle = nullptr; std::shared_ptr _qnn_instance; std::shared_ptr _qnn_interface; - std::vector> _tensor_inputs; - std::vector> _tensor_outputs; - std::vector _qnn_tensor_inputs; - std::vector _qnn_tensor_outputs; + std::vector> _tensor_inputs; + std::vector> _tensor_outputs; + std::unique_ptr _op_config; std::vector _param_types; DISABLE_COPY(ggml_qnn_graph); diff --git a/ggml/src/ggml-qnn/op-config.hpp b/ggml/src/ggml-qnn/op-config.hpp new file mode 100644 index 0000000000000..de75c93581168 --- /dev/null +++ b/ggml/src/ggml-qnn/op-config.hpp @@ -0,0 +1,73 @@ +#pragma once + +#include +#include + +#include "ggml-qnn.h" + +#include "logger.hpp" +#include "qnn-lib.hpp" +#include "qnn-types.hpp" +#include "tensor.hpp" + +namespace qnn { +class ggml_qnn_op_config { +public: + explicit ggml_qnn_op_config(const std::string &name, const std::string &package_name, const std::string &op_type) : + _name(name), _package_name(package_name), _op_type(op_type) {} + + void set_input_tensors(const std::vector> &tensor_inputs) { + _qnn_tensor_inputs.resize(tensor_inputs.size()); + for (size_t i = 0; i < tensor_inputs.size(); i++) { + _qnn_tensor_inputs[i] = tensor_inputs[i]->get_qnn_tensor(); + } + } + + void set_output_tensors(const std::vector> &tensor_outputs) { + _qnn_tensor_outputs.resize(tensor_outputs.size()); + for (size_t i = 0; i < tensor_outputs.size(); i++) { + _qnn_tensor_outputs[i] = tensor_outputs[i]->get_qnn_tensor(); + } + } + + void add_scalar_param(const std::string &name, const Qnn_Scalar_t scalar) { + _param_names.push_back(name); + Qnn_Param_t param = QNN_PARAM_INIT; + param.paramType = QNN_PARAMTYPE_SCALAR; + param.name = _param_names.back().c_str(); + param.scalarParam = scalar; + _param_types.push_back(param); + } + + std::vector &get_qnn_input_tensors() { return _qnn_tensor_inputs; } + std::vector &get_qnn_output_tensors() { return _qnn_tensor_outputs; } + + Qnn_OpConfig_t get_op_config() { + Qnn_OpConfig_t config = QNN_OPCONFIG_INIT; + config.version = QNN_OPCONFIG_VERSION_1; + auto &op_config = config.v1; + op_config.name = _name.c_str(); + op_config.packageName = _package_name.c_str(); + op_config.typeName = _op_type.c_str(); + op_config.numOfParams = (uint32_t)_param_types.size(); + op_config.params = _param_types.data(); + op_config.numOfInputs = (uint32_t)_qnn_tensor_inputs.size(); + op_config.inputTensors = _qnn_tensor_inputs.data(); + op_config.numOfOutputs = (uint32_t)_qnn_tensor_outputs.size(); + op_config.outputTensors = _qnn_tensor_outputs.data(); + return config; + } + +private: + std::string _name; + std::string _package_name; + std::string _op_type; + std::vector _qnn_tensor_inputs; + std::vector _qnn_tensor_outputs; + std::vector _param_types; + std::vector _param_names; + + DISABLE_COPY(ggml_qnn_op_config); + DISABLE_MOVE(ggml_qnn_op_config); +}; +} // namespace qnn diff --git a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp index 07fbfde7828a7..b3181ed230e3d 100644 --- a/ggml/src/ggml-qnn/tensor.hpp +++ b/ggml/src/ggml-qnn/tensor.hpp @@ -8,8 +8,6 @@ #include "ggml-qnn.h" -#include "QnnTensor.h" -#include "System/QnnSystemInterface.h" #include "buffer.hpp" #include "logger.hpp" #include "qnn-lib.hpp" From 47f6e02eda0798f5f288f69dcb2956dd5703ff07 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Wed, 31 Jul 2024 22:44:21 +0800 Subject: [PATCH 115/143] fix: try fix the tensor rank of mul mat --- ggml/src/ggml-qnn/backend-ops.cpp | 1 + 
ggml/src/ggml-qnn/graph.hpp | 17 +++++++++++++---- ggml/src/ggml-qnn/op-config.hpp | 8 ++++---- ggml/src/ggml-qnn/tensor.hpp | 13 ++++++++----- 4 files changed, 26 insertions(+), 13 deletions(-) diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index d264ec766f808..52f078a962ae8 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -133,6 +133,7 @@ qnn::ggml_op_constructor_t generate_common_op_constructor(const std::string &op_ scalar.dataType = QNN_DATATYPE_BOOL_8; scalar.bool8Value = true; config->add_scalar_param(QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN0, scalar); + QNN_LOG_DEBUG("add scalar param %s\n", QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN0); return config; }; } diff --git a/ggml/src/ggml-qnn/graph.hpp b/ggml/src/ggml-qnn/graph.hpp index 1beb4b31b0c77..3f1a0ef163208 100644 --- a/ggml/src/ggml-qnn/graph.hpp +++ b/ggml/src/ggml-qnn/graph.hpp @@ -91,6 +91,14 @@ class ggml_qnn_graph { return false; } + // get the max tensor rank + for (auto tensor : tensor_inputs) { + _tensor_rank = std::max(_tensor_rank, ggml_n_dims(tensor)); + } + for (auto tensor : tensor_outputs) { + _tensor_rank = std::max(_tensor_rank, ggml_n_dims(tensor)); + } + QNN_LOG_DEBUG("graph name %s, build_graph start", _graph_name.c_str()); _tensor_inputs.resize(tensor_inputs.size()); for (size_t i = 0; i < tensor_inputs.size(); i++) { @@ -99,7 +107,7 @@ class ggml_qnn_graph { auto qnn_tensor = std::make_shared(std::string(buffer), _device, _graph_handle, _qnn_instance); auto *ggml_tensor = tensor_inputs[i]; - if (!qnn_tensor->bind_ggml_tensor(ggml_tensor, true)) { + if (!qnn_tensor->bind_ggml_tensor(ggml_tensor, true, _tensor_rank)) { QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor)); return false; } @@ -114,7 +122,7 @@ class ggml_qnn_graph { auto qnn_tensor = std::make_shared(std::string(buffer), _device, _graph_handle, _qnn_instance); auto *ggml_tensor = tensor_outputs[i]; - if (!qnn_tensor->bind_ggml_tensor(ggml_tensor, false)) { + if (!qnn_tensor->bind_ggml_tensor(ggml_tensor, false, _tensor_rank)) { QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor)); return false; } @@ -156,7 +164,7 @@ class ggml_qnn_graph { GGML_ASSERT(tensor_outputs.size() == _tensor_outputs.size()); for (size_t i = 0; i < tensor_inputs.size(); i++) { auto *ggml_tensor = tensor_inputs[i]; - if (!_tensor_inputs[i]->bind_ggml_tensor(ggml_tensor, true)) { + if (!_tensor_inputs[i]->bind_ggml_tensor(ggml_tensor, true, _tensor_rank)) { QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor)); return false; } @@ -164,7 +172,7 @@ class ggml_qnn_graph { for (size_t i = 0; i < tensor_outputs.size(); i++) { auto *ggml_tensor = tensor_outputs[i]; - if (!_tensor_outputs[i]->bind_ggml_tensor(ggml_tensor, false)) { + if (!_tensor_outputs[i]->bind_ggml_tensor(ggml_tensor, false, _tensor_rank)) { QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor)); return false; } @@ -216,6 +224,7 @@ class ggml_qnn_graph { std::vector> _tensor_outputs; std::unique_ptr _op_config; std::vector _param_types; + int _tensor_rank = 0; DISABLE_COPY(ggml_qnn_graph); DISABLE_MOVE(ggml_qnn_graph); diff --git a/ggml/src/ggml-qnn/op-config.hpp b/ggml/src/ggml-qnn/op-config.hpp index de75c93581168..7852ee84dc12f 100644 --- a/ggml/src/ggml-qnn/op-config.hpp +++ b/ggml/src/ggml-qnn/op-config.hpp @@ -36,7 +36,7 @@ class ggml_qnn_op_config { param.paramType = QNN_PARAMTYPE_SCALAR; param.name = _param_names.back().c_str(); param.scalarParam = scalar; - 
_param_types.push_back(param); + _parameters.push_back(param); } std::vector &get_qnn_input_tensors() { return _qnn_tensor_inputs; } @@ -49,8 +49,8 @@ class ggml_qnn_op_config { op_config.name = _name.c_str(); op_config.packageName = _package_name.c_str(); op_config.typeName = _op_type.c_str(); - op_config.numOfParams = (uint32_t)_param_types.size(); - op_config.params = _param_types.data(); + op_config.numOfParams = (uint32_t)_parameters.size(); + op_config.params = _parameters.data(); op_config.numOfInputs = (uint32_t)_qnn_tensor_inputs.size(); op_config.inputTensors = _qnn_tensor_inputs.data(); op_config.numOfOutputs = (uint32_t)_qnn_tensor_outputs.size(); @@ -64,7 +64,7 @@ class ggml_qnn_op_config { std::string _op_type; std::vector _qnn_tensor_inputs; std::vector _qnn_tensor_outputs; - std::vector _param_types; + std::vector _parameters; std::vector _param_names; DISABLE_COPY(ggml_qnn_op_config); diff --git a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp index b3181ed230e3d..0c724e2871d45 100644 --- a/ggml/src/ggml-qnn/tensor.hpp +++ b/ggml/src/ggml-qnn/tensor.hpp @@ -29,7 +29,7 @@ class ggml_qnn_tensor { ~ggml_qnn_tensor() { _qnn_rpc_buffer.reset(); } - bool bind_ggml_tensor(ggml_tensor *tensor, bool is_input) { + bool bind_ggml_tensor(ggml_tensor *tensor, bool is_input, int prev_max_rank) { if (_tensor) { if (_tensor != tensor) { QNN_LOG_WARN("tensor %s has been bound to another ggml tensor %s", _tensor_name.c_str(), @@ -41,7 +41,7 @@ class ggml_qnn_tensor { return true; } - update_params_from_ggml_tensor(tensor); + update_params_from_ggml_tensor(tensor, prev_max_rank); Qnn_TensorType_t new_tensor_type = is_input ? QNN_TENSOR_TYPE_APP_WRITE : QNN_TENSOR_TYPE_APP_READ; QNN_TENSOR_SET_TYPE(_qnn_tensor, new_tensor_type); QNN_LOG_INFO("tensor %s changed to type %d", _tensor_name.c_str(), new_tensor_type); @@ -54,8 +54,10 @@ class ggml_qnn_tensor { QNN_LOG_WARN("create graph tensor failed, tensor %s, error: %d\n", _tensor_name.c_str(), error); return false; } + QNN_TENSOR_SET_ID(_qnn_tensor, QNN_TENSOR_GET_ID(qnn_tensor)); - QNN_LOG_DEBUG("create graph tensor %s, id: %d", _tensor_name.c_str(), QNN_TENSOR_GET_ID(qnn_tensor)); + QNN_LOG_DEBUG("create graph tensor %s, id: %d, rank: %d", _tensor_name.c_str(), + QNN_TENSOR_GET_ID(qnn_tensor), QNN_TENSOR_GET_RANK(qnn_tensor)); } if (should_use_mem_handle()) { @@ -166,14 +168,15 @@ class ggml_qnn_tensor { return true; } - void update_params_from_ggml_tensor(ggml_tensor *tensor) { + void update_params_from_ggml_tensor(ggml_tensor *tensor, int prev_max_rank) { _dimensions[0] = (uint32_t)tensor->ne[0]; _dimensions[1] = (uint32_t)tensor->ne[1]; _dimensions[2] = (uint32_t)tensor->ne[2]; _dimensions[3] = (uint32_t)tensor->ne[3]; QNN_TENSOR_SET_DATA_TYPE(_qnn_tensor, device_datatype_from_ggml_datatype(tensor->type)); // TODO: set the quantizeParams base on the tensor type - QNN_TENSOR_SET_RANK(_qnn_tensor, (uint32_t)ggml_n_dims(tensor)); + + QNN_TENSOR_SET_RANK(_qnn_tensor, (uint32_t)std::max(prev_max_rank, ggml_n_dims(tensor))); QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW); Qnn_ClientBuffer_t client_buf = {}; From dedadf2a20a30dfc0a8b4d195a21a13c53c3ce00 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E3=81=BF=E3=82=83=E3=82=93?= Date: Tue, 20 Aug 2024 11:20:23 +0900 Subject: [PATCH 116/143] =?UTF-8?q?Fixed=20a=20bug=20where=20debug=20code?= =?UTF-8?q?=20was=20included=20in=20the=20release,=20resulting=20i?= =?UTF-8?q?=E2=80=A6=20(#1)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * 
Fixed a bug where debug code was included in the release, resulting in an undefined function error. * Change the path of the QNN library when building in termux environment * Revert "Change the path of the QNN library when building in termux environment" This reverts commit c6e26a3679da2608940e2163e090adf75d667400. * Changed so that GGML_QNN_DEFAULT_LIB_SEARCH_PATH can be set from command line arguments --- ggml/src/CMakeLists.txt | 5 +++-- ggml/src/ggml-qnn/backend-ops.cpp | 6 ++++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index b3c287b7872db..0252499e3cb57 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -885,12 +885,13 @@ if (GGML_QNN) find_library(LOG_LIB log) find_library(ANDROID_LIB android) set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${LOG_LIB} ${ANDROID_LIB}) - set(GGML_QNN_DEFAULT_LIB_SEARCH_PATH "\"/data/local/tmp/\"") + set(GGML_QNN_DEFAULT_LIB_SEARCH_PATH "/data/local/tmp/" CACHE STRING "customized library search path for QNN backend") else() message(FATAL_ERROR "QNN now only available on Android") endif() - add_compile_definitions(GGML_QNN_DEFAULT_LIB_SEARCH_PATH=${GGML_QNN_DEFAULT_LIB_SEARCH_PATH}) + string(REGEX REPLACE "/$" "" GGML_QNN_DEFAULT_LIB_SEARCH_PATH "${GGML_QNN_DEFAULT_LIB_SEARCH_PATH}") + add_compile_definitions(GGML_QNN_DEFAULT_LIB_SEARCH_PATH="${GGML_QNN_DEFAULT_LIB_SEARCH_PATH}/") if (NOT DEFINED GGML_QNN_SDK_PATH) # try read from environment variable if (DEFINED ENV{QNN_SDK_PATH}) diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 52f078a962ae8..d6d6dddf85b23 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -296,11 +296,13 @@ bool qnn_binary_op_impl(ggml_backend_qnn_context *ctx, ggml_tensor *src0, ggml_t succeed = execute_graph<2, 1>(graph_ptr, { src0, src1 }, { dst }); } +#ifndef NDEBUG if (!succeed) { print_ggml_tensor(src0); print_ggml_tensor(src1); print_ggml_tensor(dst); } +#endif return succeed; } @@ -317,10 +319,12 @@ bool qnn_unary_op_impl(ggml_backend_qnn_context *ctx, ggml_tensor *src, ggml_ten succeed = execute_graph<1, 1>(graph_ptr, { src }, { dst }); } +#ifndef NDEBUG if (!succeed) { print_ggml_tensor(src); print_ggml_tensor(dst); } +#endif return succeed; } @@ -541,10 +545,12 @@ bool ggml_qnn_supports_op(const ggml_tensor *op) { return false; } +#ifndef NDEBUG if (op->op == GGML_OP_ADD && !is_tensor_dimensions_equal(op->src[0], op->src[1])) { QNN_LOG_DEBUG("src0 and src1 dimensions are not equal"); return false; } +#endif } switch (op->type) { From 481cb3a0c507e7ee117ad2164d62272adcc3fef9 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Sat, 7 Sep 2024 12:22:53 +0800 Subject: [PATCH 117/143] fix compiling error --- ggml/src/ggml-qnn/backend-ops.cpp | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index d6d6dddf85b23..5829e0fadbe92 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -155,6 +155,8 @@ constexpr const char *kGgmlOpToQnnOp[] = { nullptr, // GGML_OP_SQR QNN_OP_ELEMENT_WISE_SQUARE_ROOT, // GGML_OP_SQRT QNN_OP_ELEMENT_WISE_LOG, // GGML_OP_LOG + nullptr, // GGML_OP_SIN + nullptr, // GGML_OP_COS nullptr, // GGML_OP_SUM nullptr, // GGML_OP_SUM_ROWS nullptr, // GGML_OP_MEAN @@ -192,9 +194,11 @@ constexpr const char *kGgmlOpToQnnOp[] = { nullptr, // GGML_OP_CLAMP nullptr, // GGML_OP_CONV_TRANSPOSE_1D nullptr, // GGML_OP_IM2COL + nullptr, // 
GGML_OP_IM2COL_BACK nullptr, // GGML_OP_CONV_TRANSPOSE_2D nullptr, // GGML_OP_POOL_1D nullptr, // GGML_OP_POOL_2D + nullptr, // GGML_OP_POOL_2D_BACK nullptr, // GGML_OP_UPSCALE nullptr, // GGML_OP_PAD nullptr, // GGML_OP_ARANGE @@ -210,6 +214,7 @@ constexpr const char *kGgmlOpToQnnOp[] = { nullptr, // GGML_OP_WIN_UNPART nullptr, // GGML_OP_GET_REL_POS nullptr, // GGML_OP_ADD_REL_POS + nullptr, // GGML_OP_RWKV_WKV nullptr, // GGML_OP_UNARY @@ -241,6 +246,7 @@ constexpr const char *kGgmlOpToQnnOp[] = { nullptr, // GGML_UNARY_OP_SILU nullptr, // GGML_UNARY_OP_HARDSWISH nullptr, // GGML_UNARY_OP_HARDSIGMOID + nullptr, // GGML_UNARY_OP_EXP }; static_assert(sizeof(kGgmlOpToQnnOp) / sizeof(kGgmlOpToQnnOp[0]) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT), @@ -340,6 +346,8 @@ constexpr const ggml_qnn_unary_op_t kQnnUnaryOpsTable[] = { nullptr, // GGML_OP_SQR qnn_unary_op_impl, // GGML_OP_SQRT qnn_unary_op_impl, // GGML_OP_LOG + nullptr, // GGML_OP_SIN + nullptr, // GGML_OP_COS nullptr, // GGML_OP_SUM nullptr, // GGML_OP_SUM_ROWS nullptr, // GGML_OP_MEAN @@ -377,9 +385,11 @@ constexpr const ggml_qnn_unary_op_t kQnnUnaryOpsTable[] = { nullptr, // GGML_OP_CLAMP nullptr, // GGML_OP_CONV_TRANSPOSE_1D nullptr, // GGML_OP_IM2COL + nullptr, // GGML_OP_IM2COL_BACK nullptr, // GGML_OP_CONV_TRANSPOSE_2D nullptr, // GGML_OP_POOL_1D nullptr, // GGML_OP_POOL_2D + nullptr, // GGML_OP_POOL_2D_BACK nullptr, // GGML_OP_UPSCALE nullptr, // GGML_OP_PAD nullptr, // GGML_OP_ARANGE @@ -395,6 +405,7 @@ constexpr const ggml_qnn_unary_op_t kQnnUnaryOpsTable[] = { nullptr, // GGML_OP_WIN_UNPART nullptr, // GGML_OP_GET_REL_POS nullptr, // GGML_OP_ADD_REL_POS + nullptr, // GGML_OP_RWKV_WKV nullptr, // GGML_OP_UNARY @@ -426,6 +437,7 @@ constexpr const ggml_qnn_unary_op_t kQnnUnaryOpsTable[] = { nullptr, // GGML_UNARY_OP_SILU nullptr, // GGML_UNARY_OP_HARDSWISH nullptr, // GGML_UNARY_OP_HARDSIGMOID + nullptr, // GGML_UNARY_OP_EXP }; static_assert(sizeof(kQnnUnaryOpsTable) / sizeof(kQnnUnaryOpsTable[0]) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT), @@ -443,6 +455,8 @@ static constexpr const ggml_qnn_binary_op_t kQnnBinaryOpsTable[] = { nullptr, // GGML_OP_SQR nullptr, // GGML_OP_SQRT nullptr, // GGML_OP_LOG + nullptr, // GGML_OP_SIN + nullptr, // GGML_OP_COS nullptr, // GGML_OP_SUM nullptr, // GGML_OP_SUM_ROWS nullptr, // GGML_OP_MEAN @@ -480,9 +494,11 @@ static constexpr const ggml_qnn_binary_op_t kQnnBinaryOpsTable[] = { nullptr, // GGML_OP_CLAMP nullptr, // GGML_OP_CONV_TRANSPOSE_1D nullptr, // GGML_OP_IM2COL + nullptr, // GGML_OP_IM2COL_BACK nullptr, // GGML_OP_CONV_TRANSPOSE_2D nullptr, // GGML_OP_POOL_1D nullptr, // GGML_OP_POOL_2D + nullptr, // GGML_OP_POOL_2D_BACK nullptr, // GGML_OP_UPSCALE nullptr, // GGML_OP_PAD nullptr, // GGML_OP_ARANGE @@ -498,6 +514,7 @@ static constexpr const ggml_qnn_binary_op_t kQnnBinaryOpsTable[] = { nullptr, // GGML_OP_WIN_UNPART nullptr, // GGML_OP_GET_REL_POS nullptr, // GGML_OP_ADD_REL_POS + nullptr, // GGML_OP_RWKV_WKV nullptr, // GGML_OP_UNARY From b7aea0438e080a6d090196a3d84e3947c111c600 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Wed, 18 Sep 2024 12:56:47 +0800 Subject: [PATCH 118/143] fix compiling error --- ggml/src/CMakeLists.txt | 2 +- ggml/src/ggml-qnn.cpp | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 1edbd736d5e20..185f588be144c 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -910,7 +910,7 @@ if (GGML_QNN) if (CMAKE_SYSTEM_NAME STREQUAL "Android") find_library(LOG_LIB log) 
find_library(ANDROID_LIB android) - set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${LOG_LIB} ${ANDROID_LIB}) + set(GGML_EXTRA_LIBS_PRIVATE ${GGML_EXTRA_LIBS_PRIVATE} ${LOG_LIB} ${ANDROID_LIB}) set(GGML_QNN_DEFAULT_LIB_SEARCH_PATH "/data/local/tmp/" CACHE STRING "customized library search path for QNN backend") else() message(FATAL_ERROR "QNN now only available on Android") diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index 6ed5ecb2e03f2..b5b18e04aa20c 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -29,6 +29,7 @@ #include #include "ggml-backend-impl.h" +#include "ggml-impl.h" #include "ggml-qnn/backend-ops.hpp" #include "ggml-qnn/backend.hpp" From a1ceaae4ad73ecfb6a1c76bb8ed4a8f452790e3c Mon Sep 17 00:00:00 2001 From: Hongrui Chen Date: Sat, 28 Sep 2024 23:06:17 +0800 Subject: [PATCH 119/143] fix compiling error at older ndk (r23c) --- ggml/src/ggml-qnn/qnn-lib.hpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-qnn/qnn-lib.hpp b/ggml/src/ggml-qnn/qnn-lib.hpp index da986e2e4c4ff..d55f730f80d84 100644 --- a/ggml/src/ggml-qnn/qnn-lib.hpp +++ b/ggml/src/ggml-qnn/qnn-lib.hpp @@ -6,6 +6,7 @@ #include #include #include +#include // header file of Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK // https://qpm.qualcomm.com/#/main/tools/details/qualcomm_ai_engine_direct @@ -251,7 +252,7 @@ class qnn_instance { } qnn_status = QNN_SUCCESS; - if (_backend_name.find("Htp") != std::variant_npos) { + if (_backend_name.find("Htp") != _backend_name.npos) { const QnnDevice_PlatformInfo_t *p_info = nullptr; _qnn_interface->qnn_device_get_platform_info(nullptr, &p_info); QNN_LOG_INFO("device counts %d", p_info->v1.numHwDevices); @@ -358,7 +359,7 @@ class qnn_instance { QNN_LOG_DEBUG("initialize qnn context successfully\n"); } - if (_backend_name.find("Htp") != std::variant_npos) { + if (_backend_name.find("Htp") != _backend_name.npos) { // TODO: faster approach to probe the accurate capacity of rpc ion memory size_t candidate_size = 0; uint8_t *rpc_buffer = nullptr; @@ -409,7 +410,7 @@ class qnn_instance { QNN_LOG_DEBUG("succeed to close rpcmem lib\n"); } - if (_backend_name.find("Htp") != std::variant_npos) { + if (_backend_name.find("Htp") != _backend_name.npos) { _qnn_htp_perfinfra->destroyPowerConfigId(_qnn_power_configid); } From 1da8a3e67831edaf9ba5ad7a8fcfaf9a01d6090f Mon Sep 17 00:00:00 2001 From: hongruichen Date: Mon, 30 Sep 2024 10:37:23 +0800 Subject: [PATCH 120/143] fix compiling error after merge --- ggml/src/ggml-qnn.cpp | 1 + ggml/src/ggml-qnn/backend-ops.cpp | 3 +++ ggml/src/ggml-qnn/tensor.hpp | 1 + 3 files changed, 5 insertions(+) diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index b5b18e04aa20c..3e3fb5778c883 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -180,6 +180,7 @@ static ggml_backend_buffer_i ggml_backend_qnn_buffer_interface = { /* .free_buffer = */ ggml_backend_qnn_buffer_free_buffer, /* .get_base = */ ggml_backend_qnn_buffer_get_base, /* .init_tensor = */ ggml_backend_qnn_buffer_init_tensor, + /* .memset_tensor = */ nullptr, /* .set_tensor = */ ggml_backend_qnn_buffer_set_tensor, /* .get_tensor = */ ggml_backend_qnn_buffer_get_tensor, /* .cpy_tensor = */ ggml_backend_qnn_buffer_cpy_tensor, diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 5829e0fadbe92..6a83f4561807a 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -231,6 +231,7 @@ constexpr const char *kGgmlOpToQnnOp[] = { 
nullptr, // GGML_OP_CROSS_ENTROPY_LOSS nullptr, // GGML_OP_CROSS_ENTROPY_LOSS_BACK + nullptr, // GGML_OP_OPT_STEP_ADAMW // ggml_unary_op nullptr, // GGML_UNARY_OP_ABS @@ -422,6 +423,7 @@ constexpr const ggml_qnn_unary_op_t kQnnUnaryOpsTable[] = { nullptr, // GGML_OP_CROSS_ENTROPY_LOSS nullptr, // GGML_OP_CROSS_ENTROPY_LOSS_BACK + nullptr, // GGML_OP_OPT_STEP_ADAMW // ggml_unary_op nullptr, // GGML_UNARY_OP_ABS @@ -531,6 +533,7 @@ static constexpr const ggml_qnn_binary_op_t kQnnBinaryOpsTable[] = { nullptr, // GGML_OP_CROSS_ENTROPY_LOSS nullptr, // GGML_OP_CROSS_ENTROPY_LOSS_BACK + nullptr, // GGML_OP_OPT_STEP_ADAMW }; static_assert(sizeof(kQnnBinaryOpsTable) / sizeof(kQnnBinaryOpsTable[0]) == GGML_OP_COUNT, diff --git a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp index 0c724e2871d45..c465d17f25506 100644 --- a/ggml/src/ggml-qnn/tensor.hpp +++ b/ggml/src/ggml-qnn/tensor.hpp @@ -1,6 +1,7 @@ #pragma once +#include #include #include #include From 181cf52888015417d812b50c76af9ee0032aec11 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Thu, 10 Oct 2024 10:29:51 +0800 Subject: [PATCH 121/143] adapt new register backend interface and fix missing ops --- ggml/include/ggml-qnn.h | 11 +- ggml/src/ggml-backend.cpp | 8 + ggml/src/ggml-qnn.cpp | 443 ++++++++++++++++++------------ ggml/src/ggml-qnn/backend-ops.cpp | 23 +- ggml/src/ggml-qnn/backend-ops.hpp | 2 +- ggml/src/ggml-qnn/backend.hpp | 25 +- src/llama.cpp | 4 - 7 files changed, 308 insertions(+), 208 deletions(-) diff --git a/ggml/include/ggml-qnn.h b/ggml/include/ggml-qnn.h index b8c7da8fbbf87..23835f23cb0ec 100644 --- a/ggml/include/ggml-qnn.h +++ b/ggml/include/ggml-qnn.h @@ -8,6 +8,7 @@ extern "C" { #endif +#define GGML_QNN_NAME "QNN" #define GGML_QNN_MAX_DEVICES 3 enum QNNBackend { @@ -20,21 +21,17 @@ enum QNNBackend { /** * - * @param device 0: QNN_BACKEND_CPU 1: QNN_BACKEND_GPU 2:QNN_BACKEND_NPU + * @param index 0: QNN_BACKEND_CPU 1: QNN_BACKEND_GPU 2:QNN_BACKEND_NPU * @param extend_lib_search_path extened lib search path for searching QNN backend dynamic libs * @return */ -GGML_API ggml_backend_t ggml_backend_qnn_init(size_t dev_num, const char *extend_lib_search_path); +GGML_API ggml_backend_t ggml_backend_qnn_init(size_t index, const char *extend_lib_search_path); GGML_API bool ggml_backend_is_qnn(ggml_backend_t backend); -GGML_API void ggml_backend_qnn_set_n_threads(ggml_backend_t backend, int thread_counts); - GGML_API int ggml_backend_qnn_get_device_count(void); -GGML_API void ggml_backend_qnn_get_device_description(size_t dev_num, char *description, size_t description_size); - -GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t dev_num); +GGML_API ggml_backend_reg_t ggml_backend_qnn_reg(void); #ifdef __cplusplus } diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 0551764fe3fb0..f70c9f6e42f09 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -525,6 +525,10 @@ void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * na #include "ggml-cuda.h" #endif +#ifdef GGML_USE_QNN +#include "ggml-qnn.h" +#endif + struct ggml_backend_registry { std::vector backends; std::vector devices; @@ -534,6 +538,10 @@ struct ggml_backend_registry { register_backend(ggml_backend_cuda_reg()); #endif +#ifdef GGML_USE_QNN + register_backend(ggml_backend_qnn_reg()); +#endif + register_backend(ggml_backend_cpu_reg()); // TODO: sycl, metal, vulkan, kompute, cann diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index 
3e3fb5778c883..2d2b4745d13f1 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -1,11 +1,5 @@ #include "ggml-qnn.h" -#include -#include -#include -#include -#include -#include #include #include @@ -50,23 +44,19 @@ #define QNN_BACKEND_NAME "qnn" -// according to the QNN SDK Reference Guide, -// CPU - Choose a non-quantized model.Quantized models are currently incompatible with the CPU backend -// GPU - Choose a non-quantized model.Quantized models are currently incompatible with the GPU backend -// HTP - Choose a quantized model. Quantized models are required when running on the HTP backend -// DSP - Choose a quantized model. Quantized models are required when running on the DSP backend -// HTA - Choose a quantized model. Quantized models are required when running on the HTA backend -// -// only focus on Qualcomm CPU/GPU/NPU backend in this implementation of QNN backend for ggml currently, -// CPU: Qualcomm Kryo CPU -// GPU: Qualcomm Adreno GPU -// NPU: Qualcomm NPU: aka HTP(Hexagon Tensor Processor), ~= cDSP(Compute DSP) + -// HMX(Hexagon Matrix eXtensions)/HTA(Hexagon Tensor Accelerator) - -static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { - ggml_backend_qnn_context(QNN_BACKEND_CPU, 1, "qnn-cpu", "libQnnCpu.so"), /* QNN_BACKEND_CPU */ - ggml_backend_qnn_context(QNN_BACKEND_GPU, 1, "qnn-gpu", "libQnnGpu.so"), /* QNN_BACKEND_GPU */ - ggml_backend_qnn_context(QNN_BACKEND_NPU, 1, "qnn-npu", "libQnnHtp.so"), /* QNN_BACKEND_NPU */ +namespace { + +struct qnn_device_caps { + const char *name; + const char *description; + const char *lib_name; + enum ggml_backend_dev_type type; +}; + +const qnn_device_caps kDeviceCaps[GGML_QNN_MAX_DEVICES]{ + { "qnn-cpu", "Qualcomm Kryo CPU", "libQnnCpu.so", GGML_BACKEND_DEVICE_TYPE_CPU }, /* QNN_BACKEND_CPU */ + { "qnn-gpu", "Qualcomm Adreno GPU", "libQnnGpu.so", GGML_BACKEND_DEVICE_TYPE_GPU }, /* QNN_BACKEND_GPU */ + { "qnn-npu", "Qualcomm NPU", "libQnnHtp.so", GGML_BACKEND_DEVICE_TYPE_GPU }, /* QNN_BACKEND_NPU */ }; class ggml_backend_qnn_buffer_context { @@ -74,6 +64,7 @@ class ggml_backend_qnn_buffer_context { ggml_backend_qnn_buffer_context(QNNBackend device, std::shared_ptr instance, size_t size) : _instance(instance), _name(QNN_BACKEND_NAME + std::to_string(device)) { + // TODO: fix this for other platforms size_t size_page = sysconf(_SC_PAGESIZE); // TODO: for qnn npu, a better way here is to reuse the buffer allocated by qnn rpc, will save an extra copy @@ -105,61 +96,60 @@ class ggml_backend_qnn_buffer_context { }; struct ggml_backend_qnn_buffer_type_context { - size_t device; std::string name; }; -// ================================================================================================= -// -// implementation of QNN backend for GGML -// -// ================================================================================================= -static bool ggml_qnn_compute_forward(ggml_backend_qnn_context *ctx, struct ggml_tensor *tensor) { - return qnn::ggml_qnn_forward(ctx, tensor); +ggml_backend_qnn_device_context *get_device_context(ggml_backend_dev_t dev) { + return reinterpret_cast(dev->context); } -static const char *ggml_backend_qnn_buffer_get_name(ggml_backend_buffer_t buffer) { +/* + * ----------------------------------------------------------------------------------------------- + * qnn backend buffer object + * ----------------------------------------------------------------------------------------------- + */ +const char *ggml_backend_qnn_buffer_get_name(ggml_backend_buffer_t buffer) { 
GGML_UNUSED(buffer); - return "QNN"; + return GGML_QNN_NAME; } -GGML_CALL static bool ggml_backend_buffer_is_qnn(ggml_backend_buffer_t buffer) { +bool ggml_backend_buffer_is_qnn(ggml_backend_buffer_t buffer) { return buffer->iface.get_name == ggml_backend_qnn_buffer_get_name; } -GGML_CALL static void ggml_backend_qnn_buffer_free_buffer(ggml_backend_buffer_t buffer) { +void ggml_backend_qnn_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_backend_qnn_buffer_context *ctx = (ggml_backend_qnn_buffer_context *)buffer->context; delete ctx; } -GGML_CALL static void *ggml_backend_qnn_buffer_get_base(ggml_backend_buffer_t buffer) { +void *ggml_backend_qnn_buffer_get_base(ggml_backend_buffer_t buffer) { ggml_backend_qnn_buffer_context *ctx = (ggml_backend_qnn_buffer_context *)buffer->context; return ctx->get_buffer(); } -GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor) { +void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor) { // Do nothing here, the qnn tensor will be create along with the graph. GGML_UNUSED(buffer); GGML_UNUSED(tensor); } -GGML_CALL static void ggml_backend_qnn_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor, - const void *data, size_t offset, size_t size) { +void ggml_backend_qnn_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor, const void *data, + size_t offset, size_t size) { GGML_UNUSED(buffer); memcpy((char *)tensor->data + offset, data, size); } -GGML_CALL static void ggml_backend_qnn_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor *tensor, - void *data, size_t offset, size_t size) { +void ggml_backend_qnn_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor *tensor, void *data, + size_t offset, size_t size) { GGML_UNUSED(buffer); memcpy(data, (const char *)tensor->data + offset, size); } -GGML_CALL static bool ggml_backend_qnn_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor *src, - struct ggml_tensor *dst) { +bool ggml_backend_qnn_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor *src, + struct ggml_tensor *dst) { GGML_UNUSED(buffer); if (ggml_backend_buffer_is_host(src->buffer)) { memcpy(dst->data, src->data, ggml_nbytes(src)); @@ -169,13 +159,13 @@ GGML_CALL static bool ggml_backend_qnn_buffer_cpy_tensor(ggml_backend_buffer_t b return false; } -GGML_CALL static void ggml_backend_qnn_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { +void ggml_backend_qnn_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { ggml_backend_qnn_buffer_context *ctx = (ggml_backend_qnn_buffer_context *)buffer->context; memset(ctx->get_buffer(), value, ctx->get_buffer_size()); } -static ggml_backend_buffer_i ggml_backend_qnn_buffer_interface = { +ggml_backend_buffer_i ggml_backend_qnn_buffer_interface = { /* .get_name = */ ggml_backend_qnn_buffer_get_name, /* .free_buffer = */ ggml_backend_qnn_buffer_free_buffer, /* .get_base = */ ggml_backend_qnn_buffer_get_base, @@ -188,16 +178,20 @@ static ggml_backend_buffer_i ggml_backend_qnn_buffer_interface = { /* .reset = */ nullptr, }; -GGML_CALL static const char *ggml_backend_qnn_buffer_type_name(ggml_backend_buffer_type_t buft) { +/* + * ----------------------------------------------------------------------------------------------- + * qnn backend object + * ----------------------------------------------------------------------------------------------- + */ +const char 
*ggml_backend_qnn_buffer_type_name(ggml_backend_buffer_type_t buft) { GGML_UNUSED(buft); - return "QNN"; + return GGML_QNN_NAME; } -GGML_CALL static ggml_backend_buffer_t ggml_backend_qnn_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, - size_t size) { - ggml_backend_qnn_buffer_type_context *buft_ctx = (ggml_backend_qnn_buffer_type_context *)buft->context; +ggml_backend_buffer_t ggml_backend_qnn_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + auto *dev_ctx = get_device_context(buft->device); ggml_backend_qnn_buffer_context *ctx = - new ggml_backend_qnn_buffer_context((QNNBackend)buft_ctx->device, g_qnn_mgr[buft_ctx->device].instance, size); + new ggml_backend_qnn_buffer_context((QNNBackend)dev_ctx->device, dev_ctx->instance, size); if (!ctx->is_valid()) { return nullptr; } @@ -205,65 +199,84 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_qnn_buffer_type_alloc_buffer return ggml_backend_buffer_init(buft, ggml_backend_qnn_buffer_interface, ctx, size); } -GGML_CALL static size_t ggml_backend_qnn_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { +size_t ggml_backend_qnn_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { GGML_UNUSED(buft); return 32; } // TODO: this value is an experimental value, works fine with whisper/llm/minicpm-v inference on Android -GGML_CALL static size_t ggml_backend_qnn_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { +size_t ggml_backend_qnn_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { GGML_UNUSED(buft); return (96 * 1024 * 1024); } -GGML_CALL static bool ggml_backend_qnn_buffer_is_host(ggml_backend_buffer_type_t buft) { +bool ggml_backend_qnn_buffer_is_host(ggml_backend_buffer_type_t buft) { + // TODO: fix this GGML_UNUSED(buft); return true; } -GGML_CALL static const char *ggml_backend_qnn_name(ggml_backend_t backend) { - ggml_backend_qnn_context *ctx = (ggml_backend_qnn_context *)backend->context; - return g_qnn_mgr[ctx->device].name; +const char *ggml_backend_qnn_name(ggml_backend_t backend) { + auto *device_ctx = get_device_context(backend->device); + return device_ctx->name.c_str(); } -GGML_CALL static void ggml_backend_qnn_free(ggml_backend_t backend) { - QNN_LOG_INFO("enter %s", __func__); - ggml_backend_qnn_context *ctx = (ggml_backend_qnn_context *)backend->context; - QNN_LOG_INFO("idx %d, name:%s", ctx->device, g_qnn_mgr[ctx->device].name); +void ggml_backend_qnn_free(ggml_backend_t backend) { + auto *device_ctx = get_device_context(backend->device); + QNN_LOG_INFO("idx %d, name:%s", device_ctx->device, device_ctx->name.c_str()); - auto instance = g_qnn_mgr[ctx->device].instance; + auto &instance = device_ctx->instance; if (instance) { - ctx->qnn_graph_cache.clear(); + device_ctx->qnn_graph_cache.clear(); + device_ctx->qnn_interface.reset(); instance->qnn_finalize(); - g_qnn_mgr[ctx->device].instance.reset(); + instance.reset(); } +} - if (g_qnn_mgr[ctx->device].backend != nullptr) { - delete backend; - g_qnn_mgr[ctx->device].backend = nullptr; +ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(ggml_backend_dev_t dev) { + static ggml_backend_qnn_buffer_type_context ggml_backend_qnn_buffer_type_contexts[GGML_QNN_MAX_DEVICES]; + static ggml_backend_buffer_type ggml_backend_qnn_buffer_types[GGML_QNN_MAX_DEVICES]; + static bool ggml_backend_qnn_buffer_type_initialized = false; + auto *dev_ctx = get_device_context(dev); + if (!ggml_backend_qnn_buffer_type_initialized) { + for (size_t i = 0; i < GGML_QNN_MAX_DEVICES; i++) { + auto &context = 
ggml_backend_qnn_buffer_type_contexts[i]; + context = { std::string(QNN_BACKEND_NAME) + std::to_string(i) }; + ggml_backend_qnn_buffer_types[i] = { + /* .iface = */ { + /* .get_name = */ ggml_backend_qnn_buffer_type_name, + /* .alloc_buffer = */ ggml_backend_qnn_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_qnn_buffer_type_get_alignment, + /* .get_max_size = */ ggml_backend_qnn_buffer_type_get_max_size, + /* .get_alloc_size = */ nullptr, // defaults to ggml_nbytes + /* .is_host = */ ggml_backend_qnn_buffer_is_host, + }, + /* .device */ dev, + /* .context = */ &context, + }; + } + ggml_backend_qnn_buffer_type_initialized = true; } - QNN_LOG_INFO("leave %s", __func__); -} -GGML_CALL static ggml_backend_buffer_type_t ggml_backend_qnn_get_default_buffer_type(ggml_backend_t backend) { - ggml_backend_qnn_context *ctx = (ggml_backend_qnn_context *)backend->context; + return &ggml_backend_qnn_buffer_types[dev_ctx->device]; +} - return ggml_backend_qnn_buffer_type(ctx->device); +ggml_backend_buffer_type_t ggml_backend_qnn_get_default_buffer_type(ggml_backend_t backend) { + return ggml_backend_qnn_buffer_type(backend->device); } -GGML_CALL static ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backend, ggml_cgraph *cgraph) { +ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backend, ggml_cgraph *cgraph) { enum ggml_status result = GGML_STATUS_SUCCESS; - ggml_backend_qnn_context *ctx = (ggml_backend_qnn_context *)backend->context; - GGML_UNUSED(ctx); - + auto *device_ctx = get_device_context(backend->device); for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor *node = cgraph->nodes[i]; if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { continue; } - bool ok = ggml_qnn_compute_forward(ctx, node); + bool ok = qnn::ggml_qnn_forward(device_ctx, node); if (!ok) { QNN_LOG_DEBUG("error: op not supported %s (%s)\n", node->name, ggml_op_name(node->op)); } @@ -272,12 +285,12 @@ GGML_CALL static ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backe return result; } -GGML_CALL static bool ggml_backend_qnn_supports_op(ggml_backend_t backend, const ggml_tensor *op) { +bool ggml_backend_qnn_supports_op(ggml_backend_t backend, const ggml_tensor *op) { GGML_UNUSED(backend); return qnn::ggml_qnn_supports_op(op); } -GGML_CALL static bool ggml_backend_qnn_offload_op(ggml_backend_t backend, const ggml_tensor *op) { +bool ggml_backend_qnn_offload_op(ggml_backend_t backend, const ggml_tensor *op) { GGML_UNUSED(backend); size_t dims = ggml_n_dims(op); @@ -292,7 +305,7 @@ GGML_CALL static bool ggml_backend_qnn_offload_op(ggml_backend_t backend, const return can_offload; } -static ggml_backend_i ggml_backend_qnn_interface = { +ggml_backend_i ggml_backend_qnn_interface = { /* .get_name = */ ggml_backend_qnn_name, /* .free = */ ggml_backend_qnn_free, /* .get_default_buffer_type = */ ggml_backend_qnn_get_default_buffer_type, @@ -305,106 +318,75 @@ static ggml_backend_i ggml_backend_qnn_interface = { /* .graph_plan_update = */ nullptr, /* .graph_plan_compute = */ nullptr, /* .graph_compute = */ ggml_backend_qnn_graph_compute, - /* .supports_op = */ ggml_backend_qnn_supports_op, - /* .supports_buft = */ nullptr, - /* .offload_op = */ ggml_backend_qnn_offload_op, - /* .event_new = */ nullptr, - /* .event_free = */ nullptr, + /* .supports_op = */ nullptr, // moved to device + /* .supports_buft = */ nullptr, // moved to device + /* .offload_op = */ nullptr, 
// moved to device /* .event_record = */ nullptr, /* .event_wait = */ nullptr, - /* .event_synchronize = */ nullptr, }; -static ggml_guid_t ggml_backend_qnn_guid() { - static ggml_guid guid = { 0x1a, 0x2b, 0x3c, 0x4d, 0x5e, 0x6f, 0x70, 0x81, - 0x92, 0xa3, 0xb4, 0xc5, 0xd6, 0xe7, 0xf8, 0x09 }; - return &guid; -} - -static ggml_backend_t ggml_backend_qnn_reg_init(const char *extend_lib_search_path, void *user_data) { - ggml_backend_t qnn_backend = ggml_backend_qnn_init((int)(intptr_t)user_data, extend_lib_search_path); - return qnn_backend; +/* + * ----------------------------------------------------------------------------------------------- + * qnn backend device object + * ----------------------------------------------------------------------------------------------- + */ +const char *ggml_backend_qnn_device_get_name(ggml_backend_dev_t dev) { + const auto &caps = kDeviceCaps[get_device_context(dev)->device]; + return caps.name; } -bool ggml_backend_is_qnn(ggml_backend_t backend) { - return backend != nullptr && ggml_guid_matches(backend->guid, ggml_backend_qnn_guid()); +const char *ggml_backend_qnn_device_get_description(ggml_backend_dev_t dev) { + const auto &caps = kDeviceCaps[get_device_context(dev)->device]; + return caps.description; } -void ggml_backend_qnn_set_n_threads(ggml_backend_t backend, int n_threads) { - GGML_ASSERT(ggml_backend_is_qnn(backend)); +void ggml_backend_qnn_device_get_memory(ggml_backend_dev_t dev, size_t *free, size_t *total) { + // TODO: get memory info + *free = 0; + *total = 0; - auto *ctx = (ggml_backend_qnn_context *)backend->context; - ctx->threads = n_threads; + GGML_UNUSED(dev); } -int ggml_backend_qnn_get_device_count() { return GGML_QNN_MAX_DEVICES; } - -void ggml_backend_qnn_get_device_description(size_t dev_num, char *description, size_t description_size) { - if (nullptr == description || 0 == description_size) { - QNN_LOG_WARN("invalid param"); - return; - } - - if (dev_num >= GGML_QNN_MAX_DEVICES) { - QNN_LOG_WARN("invalid param"); - return; - } - - snprintf(description, description_size, "%s", g_qnn_mgr[dev_num].name); +enum ggml_backend_dev_type ggml_backend_qnn_device_get_type(ggml_backend_dev_t dev) { + // TODO: for cpu backend, we should return GGML_BACKEND_DEVICE_TYPE_CPU + GGML_UNUSED(dev); + return GGML_BACKEND_DEVICE_TYPE_GPU; } -ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t device) { - if (device >= GGML_QNN_MAX_DEVICES) { - QNN_LOG_DEBUG( - "ggml_backend_qnn_buffer_type error: device_index:%d is " - "out of range [0, %d]\n", - device, GGML_QNN_MAX_DEVICES - 1); - return nullptr; - } - - static ggml_backend_qnn_buffer_type_context ggml_backend_qnn_buffer_type_contexts[GGML_QNN_MAX_DEVICES]; - static ggml_backend_buffer_type ggml_backend_qnn_buffer_types[GGML_QNN_MAX_DEVICES]; - static bool ggml_backend_qnn_buffer_type_initialized = false; - if (!ggml_backend_qnn_buffer_type_initialized) { - for (size_t i = 0; i < GGML_QNN_MAX_DEVICES; i++) { - auto &context = ggml_backend_qnn_buffer_type_contexts[i]; - context = { i, std::string(QNN_BACKEND_NAME) + std::to_string(i) }; - ggml_backend_qnn_buffer_types[i] = { - /* .iface = */ { /* .get_name = */ ggml_backend_qnn_buffer_type_name, - /* .alloc_buffer = */ ggml_backend_qnn_buffer_type_alloc_buffer, - /* .get_alignment = */ ggml_backend_qnn_buffer_type_get_alignment, - /* .get_max_size = */ ggml_backend_qnn_buffer_type_get_max_size, - /* .get_alloc_size = */ nullptr, // defaults to ggml_nbytes - /* .is_host = */ ggml_backend_qnn_buffer_is_host }, - /* .context = */ &context, - 
}; - } - ggml_backend_qnn_buffer_type_initialized = true; - } - - return &ggml_backend_qnn_buffer_types[device]; +void ggml_backend_qnn_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props *props) { + props->name = ggml_backend_qnn_device_get_name(dev); + props->description = ggml_backend_qnn_device_get_description(dev); + props->type = ggml_backend_qnn_device_get_type(dev); + ggml_backend_qnn_device_get_memory(dev, &props->memory_free, &props->memory_total); + props->caps = { + /* async */ false, + /* host_buffer */ false, + /* events */ false, + }; } -ggml_backend_t ggml_backend_qnn_init(size_t device, const char *extend_lib_search_path) { - int result = 0; +ggml_guid_t ggml_backend_qnn_guid() { + static ggml_guid guid = { 0x1a, 0x2b, 0x3c, 0x4d, 0x5e, 0x6f, 0x70, 0x81, + 0x92, 0xa3, 0xb4, 0xc5, 0xd6, 0xe7, 0xf8, 0x09 }; + return &guid; +} +ggml_backend_t ggml_backend_qnn_init_with_device_context(ggml_backend_dev_t dev, const char *extend_lib_search_path) { if (!extend_lib_search_path) { extend_lib_search_path = GGML_QNN_DEFAULT_LIB_SEARCH_PATH; QNN_LOG_WARN("extend_lib_search_path is nullptr, will use " GGML_QNN_DEFAULT_LIB_SEARCH_PATH " as default"); } - QNN_LOG_DEBUG("device %d", device); + auto *dev_ctx = get_device_context(dev); + auto device_index = dev_ctx->device; + QNN_LOG_DEBUG("device %d", device_index); QNN_LOG_DEBUG("extend_lib_search_path %s", extend_lib_search_path); - if (device >= GGML_QNN_MAX_DEVICES) { - QNN_LOG_ERROR("invalid device %d", device); - return nullptr; - } - std::string path = extend_lib_search_path; // TODO: Fix this for other platforms #if defined(__ANDROID__) || defined(ANDROID) - if (QNN_BACKEND_NPU == device) { + if (QNN_BACKEND_NPU == device_index) { if (0 == setenv("LD_LIBRARY_PATH", (path + ":/vendor/dsp/cdsp:/vendor/lib64:/vendor/dsp/" "dsp:/vendor/dsp/images") @@ -425,17 +407,18 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char *extend_lib_searc } } else { if (0 == setenv("LD_LIBRARY_PATH", path.c_str(), 1)) { - QNN_LOG_INFO("%s backend setenv successfully\n", qnn::get_backend_name(device)); + QNN_LOG_INFO("%s backend setenv successfully\n", qnn::get_backend_name(device_index)); } else { - QNN_LOG_ERROR("%s backend setenv failure\n", qnn::get_backend_name(device)); + QNN_LOG_ERROR("%s backend setenv failure\n", qnn::get_backend_name(device_index)); } } #endif - auto instance = std::make_shared(extend_lib_search_path, g_qnn_mgr[device].lib, ""); - result = instance->qnn_init(nullptr); + auto instance = std::make_shared(path, dev_ctx->lib_name, "ggml"); + auto result = instance->qnn_init(nullptr); if (result != 0) { - QNN_LOG_WARN("init qnn subsystem failed with qnn backend %s, pls check why\n", qnn::get_backend_name(device)); + QNN_LOG_WARN("init qnn subsystem failed with qnn backend %s, pls check why\n", + qnn::get_backend_name(device_index)); return nullptr; } auto qnn_interface = instance->get_qnn_interface(); @@ -444,28 +427,138 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char *extend_lib_searc return nullptr; } - std::string device_name = qnn::get_backend_name(device); + std::string device_name = qnn::get_backend_name(device_index); QNN_LOG_INFO("qnn device name %s", device_name.c_str()); - auto &qnn_device = g_qnn_mgr[device]; - qnn_device.instance = instance; - qnn_device.qnn_interface = qnn_interface; - qnn_device.socinfo = instance->get_soc_info(); + dev_ctx->instance = instance; + dev_ctx->qnn_interface = qnn_interface; + dev_ctx->socinfo = instance->get_soc_info(); - ggml_backend_t 
qnn_backend = new ggml_backend{ /* .guid = */ ggml_backend_qnn_guid(), - /* .iface = */ ggml_backend_qnn_interface, - /* .context = */ &g_qnn_mgr[device] }; - g_qnn_mgr[device].backend = qnn_backend; + ggml_backend_t qnn_backend = new ggml_backend{ + /* .guid = */ ggml_backend_qnn_guid(), + /* .iface = */ ggml_backend_qnn_interface, + /* .device = */ dev, + /* .context = */ nullptr, + }; return qnn_backend; } -extern "C" GGML_CALL void ggml_backend_qnn_reg_devices(); +ggml_backend_t ggml_backend_qnn_device_init(ggml_backend_dev_t dev, const char *params) { + return ggml_backend_qnn_init_with_device_context(dev, params); +} + +ggml_backend_buffer_type_t ggml_backend_qnn_device_get_buffer_type(ggml_backend_dev_t dev) { + return ggml_backend_qnn_buffer_type(dev); +} + +ggml_backend_buffer_t ggml_backend_qnn_device_buffer_from_ptr(ggml_backend_dev_t dev, void *ptr, size_t size, + size_t max_tensor_size) { + // TODO + GGML_UNUSED(dev); + GGML_UNUSED(max_tensor_size); + return ggml_backend_cpu_buffer_from_ptr(ptr, size); +} + +bool ggml_backend_qnn_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor *op) { + GGML_UNUSED(dev); + return qnn::ggml_qnn_supports_op(op); +} + +bool ggml_backend_qnn_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { + GGML_UNUSED(dev); + return ggml_backend_buft_is_host(buft); +} + +const struct ggml_backend_device_i ggml_backend_qnn_device_interface = { + /* .get_name = */ ggml_backend_qnn_device_get_name, + /* .get_description = */ ggml_backend_qnn_device_get_description, + /* .get_memory = */ ggml_backend_qnn_device_get_memory, + /* .get_type = */ ggml_backend_qnn_device_get_type, + /* .get_props = */ ggml_backend_qnn_device_get_props, + /* .init_backend = */ ggml_backend_qnn_device_init, + /* .get_buffer_type = */ ggml_backend_qnn_device_get_buffer_type, + /* .get_host_buffer_type = */ nullptr, + /* .buffer_from_host_ptr = */ ggml_backend_qnn_device_buffer_from_ptr, + /* .supports_op = */ ggml_backend_qnn_device_supports_op, + /* .supports_buft = */ ggml_backend_qnn_device_supports_buft, + /* .offload_op = */ nullptr, + /* .event_new = */ nullptr, + /* .event_free = */ nullptr, + /* .event_synchronize = */ nullptr, +}; + +/* + * ----------------------------------------------------------------------------------------------- + * qnn backend registry object + * ----------------------------------------------------------------------------------------------- + */ + +struct ggml_backend_qnn_reg_impl : ggml_backend_reg { + std::array, GGML_QNN_MAX_DEVICES> device_contexts; + std::array devices; + + ggml_backend_qnn_reg_impl(ggml_backend_reg_i interface) { + context = this; + iface = interface; + } +}; + +const char *ggml_backend_qnn_reg_get_name(ggml_backend_reg_t reg) { + GGML_UNUSED(reg); + return GGML_QNN_NAME; +} + +size_t ggml_backend_qnn_reg_get_device_count(ggml_backend_reg_t reg) { + auto *ctx = (ggml_backend_qnn_reg_impl *)reg->context; + return ctx->devices.size(); +} + +ggml_backend_dev_t ggml_backend_qnn_reg_get_device(ggml_backend_reg_t reg, size_t index) { + auto *ctx = (ggml_backend_qnn_reg_impl *)reg->context; + GGML_ASSERT(index < ctx->devices.size()); + return &(ctx->devices[index]); +} -GGML_CALL void ggml_backend_qnn_reg_devices() { - for (size_t idx = 0; idx < GGML_QNN_MAX_DEVICES; idx++) { - char name[GGML_MAX_NAME]; - ggml_backend_qnn_get_device_description(idx, name, GGML_MAX_NAME); - ggml_backend_register(name, ggml_backend_qnn_reg_init, ggml_backend_qnn_buffer_type(idx), - (void *)(intptr_t)idx); +const 
ggml_backend_reg_i ggml_backend_qnn_reg_interface = { + /* .get_name = */ ggml_backend_qnn_reg_get_name, + /* .get_device_count = */ ggml_backend_qnn_reg_get_device_count, + /* .get_device_get = */ ggml_backend_qnn_reg_get_device, + /* .get_proc_address = */ nullptr, +}; + +} // namespace + +ggml_backend_reg_t ggml_backend_qnn_reg() { + static ggml_backend_qnn_reg_impl reg{ ggml_backend_qnn_reg_interface }; + static bool initialized = false; + static std::mutex mutex; + + { + std::lock_guard lock(mutex); + if (!initialized) { + for (int i = 0; i < GGML_QNN_MAX_DEVICES; i++) { + reg.device_contexts[i] = std::make_unique( + /* .device = */ (QNNBackend)i, + /* .threads = */ 1, + /* .name = */ qnn::get_backend_name(i), + /* .lib_name = */ kDeviceCaps[i].lib_name); + + auto &device = reg.devices[i]; + device.iface = ggml_backend_qnn_device_interface; + device.reg = ® + device.context = reg.device_contexts[i].get(); + } + initialized = true; + } } + + return ® +} + +int ggml_backend_qnn_get_device_count() { return GGML_QNN_MAX_DEVICES; } + +ggml_backend_t ggml_backend_qnn_init(size_t index, const char *extend_lib_search_path) { + auto *reg = ggml_backend_qnn_reg(); + auto *device = ggml_backend_qnn_reg_get_device(reg, index); + return ggml_backend_qnn_device_init(device, extend_lib_search_path); } diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 6a83f4561807a..9c6e5709c8189 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -13,7 +13,7 @@ namespace { -bool qnn_is_valid_params(ggml_backend_qnn_context *ctx, const ggml_tensor *src, ggml_tensor *dst) { +bool qnn_is_valid_params(ggml_backend_qnn_device_context *ctx, const ggml_tensor *src, ggml_tensor *dst) { if (!ctx || !src || !dst) { QNN_LOG_WARN("invalid params\n"); return false; @@ -28,7 +28,7 @@ bool qnn_is_valid_params(ggml_backend_qnn_context *ctx, const ggml_tensor *src, return true; } -bool qnn_is_valid_params(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, +bool qnn_is_valid_params(ggml_backend_qnn_device_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst) { if (!ctx || !src0 || !src1 || !dst) { QNN_LOG_WARN("invalid params\n"); @@ -78,8 +78,8 @@ void print_ggml_tensor(const ggml_tensor *tensor) { namespace { -typedef bool (*ggml_qnn_unary_op_t)(ggml_backend_qnn_context *ctx, ggml_tensor *src, ggml_tensor *dst); -typedef bool (*ggml_qnn_binary_op_t)(ggml_backend_qnn_context *ctx, ggml_tensor *src0, ggml_tensor *src1, +typedef bool (*ggml_qnn_unary_op_t)(ggml_backend_qnn_device_context *ctx, ggml_tensor *src, ggml_tensor *dst); +typedef bool (*ggml_qnn_binary_op_t)(ggml_backend_qnn_device_context *ctx, ggml_tensor *src0, ggml_tensor *src1, ggml_tensor *dst); typedef const ggml_qnn_unary_op_t (&ggml_qnn_unary_op_array_t)[GGML_OP_COUNT + GGML_UNARY_OP_COUNT]; @@ -161,6 +161,7 @@ constexpr const char *kGgmlOpToQnnOp[] = { nullptr, // GGML_OP_SUM_ROWS nullptr, // GGML_OP_MEAN nullptr, // GGML_OP_ARGMAX + nullptr, // GGML_OP_COUNT_EQUAL nullptr, // GGML_OP_REPEAT nullptr, // GGML_OP_REPEAT_BACK nullptr, // GGML_OP_CONCAT @@ -256,7 +257,7 @@ static_assert(kGgmlOpToQnnOp[GGML_UNARY_OP_GELU + kGgmlUnaryOpStart] != nullptr, "GGML_UNARY_OP_GELU does not correspond to QNN_OP_GELU"); template -qnn::ggml_qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_context *ctx, size_t op, +qnn::ggml_qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_device_context *ctx, size_t op, const std::array &inputs, const 
std::array &outputs) { GGML_ASSERT(op < (GGML_OP_COUNT + GGML_UNARY_OP_COUNT)); @@ -271,8 +272,8 @@ qnn::ggml_qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_context *ctx, siz QNN_LOG_DEBUG("found graph %s in cache\n", graph_key.c_str()); graph_ptr = it->second.get(); } else { - auto graph = std::make_unique(graph_key, (QNNBackend)(ctx->device), ctx->instance, - ctx->socinfo.vtcm_size_in_mb); + auto graph = + std::make_unique(graph_key, ctx->device, ctx->instance, ctx->socinfo.vtcm_size_in_mb); if (!graph->is_valid()) { return nullptr; } @@ -292,7 +293,7 @@ qnn::ggml_qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_context *ctx, siz } template -bool qnn_binary_op_impl(ggml_backend_qnn_context *ctx, ggml_tensor *src0, ggml_tensor *src1, ggml_tensor *dst) { +bool qnn_binary_op_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *src0, ggml_tensor *src1, ggml_tensor *dst) { static_assert(kGgmlOpToQnnOp[_GgmlOp] != nullptr, "GGML_OP does not have a corresponding QNN_OP"); CHECK_PARAMS(ctx, src0, src1, dst); @@ -315,7 +316,7 @@ bool qnn_binary_op_impl(ggml_backend_qnn_context *ctx, ggml_tensor *src0, ggml_t } template -bool qnn_unary_op_impl(ggml_backend_qnn_context *ctx, ggml_tensor *src, ggml_tensor *dst) { +bool qnn_unary_op_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *src, ggml_tensor *dst) { static_assert(kGgmlOpToQnnOp[_GgmlOp] != nullptr, "GGML_OP does not have a corresponding QNN_OP"); CHECK_PARAMS(ctx, src, dst); @@ -353,6 +354,7 @@ constexpr const ggml_qnn_unary_op_t kQnnUnaryOpsTable[] = { nullptr, // GGML_OP_SUM_ROWS nullptr, // GGML_OP_MEAN nullptr, // GGML_OP_ARGMAX + nullptr, // GGML_OP_COUNT_EQUAL nullptr, // GGML_OP_REPEAT nullptr, // GGML_OP_REPEAT_BACK nullptr, // GGML_OP_CONCAT @@ -463,6 +465,7 @@ static constexpr const ggml_qnn_binary_op_t kQnnBinaryOpsTable[] = { nullptr, // GGML_OP_SUM_ROWS nullptr, // GGML_OP_MEAN nullptr, // GGML_OP_ARGMAX + nullptr, // GGML_OP_COUNT_EQUAL nullptr, // GGML_OP_REPEAT nullptr, // GGML_OP_REPEAT_BACK nullptr, // GGML_OP_CONCAT @@ -588,7 +591,7 @@ bool ggml_qnn_supports_op(const ggml_tensor *op) { return true; } -bool ggml_qnn_forward(ggml_backend_qnn_context *ctx, struct ggml_tensor *tensor) { +bool ggml_qnn_forward(ggml_backend_qnn_device_context *ctx, struct ggml_tensor *tensor) { size_t unary_op_idx = tensor->op; if (tensor->op == GGML_OP_UNARY) { unary_op_idx = kGgmlUnaryOpStart + ggml_get_unary_op(tensor); diff --git a/ggml/src/ggml-qnn/backend-ops.hpp b/ggml/src/ggml-qnn/backend-ops.hpp index ed4ce994f787b..86658da118f8b 100644 --- a/ggml/src/ggml-qnn/backend-ops.hpp +++ b/ggml/src/ggml-qnn/backend-ops.hpp @@ -7,6 +7,6 @@ namespace qnn { bool ggml_qnn_supports_op(const ggml_tensor *op); -bool ggml_qnn_forward(ggml_backend_qnn_context *ctx, struct ggml_tensor *tensor); +bool ggml_qnn_forward(ggml_backend_qnn_device_context *ctx, struct ggml_tensor *tensor); } // namespace qnn diff --git a/ggml/src/ggml-qnn/backend.hpp b/ggml/src/ggml-qnn/backend.hpp index b2f93a8f7a9e5..696a883480e9f 100644 --- a/ggml/src/ggml-qnn/backend.hpp +++ b/ggml/src/ggml-qnn/backend.hpp @@ -2,11 +2,13 @@ #pragma once #include +#include #include #include "ggml.h" #include "ggml-backend.h" +#include "ggml-qnn.h" #include "graph.hpp" #include "qnn-lib.hpp" @@ -15,20 +17,21 @@ namespace qnn { typedef std::unordered_map> ggml_qnn_graph_cache_t; } // namespace qnn -struct ggml_backend_qnn_context { - int device; - int threads; - char name[GGML_MAX_NAME]; - char lib[GGML_MAX_NAME]; - ggml_backend *backend = nullptr; +struct 
ggml_backend_qnn_device_context { + // initialize in constructor + QNNBackend device; + size_t threads; + std::string name; + std::string lib_name; + + // initialize in init qnn::qcom_socinfo socinfo = {}; std::shared_ptr instance; std::shared_ptr qnn_interface; + qnn::ggml_qnn_graph_cache_t qnn_graph_cache; - explicit ggml_backend_qnn_context(int device, int threads, const char *name, const char *lib) : - device(device), threads(threads) { - strncpy(this->name, name, GGML_MAX_NAME); - strncpy(this->lib, lib, GGML_MAX_NAME); - } + explicit ggml_backend_qnn_device_context(QNNBackend device, size_t threads, const char *name, + const char *lib_name) : + device(device), threads(threads), name(name), lib_name(lib_name) {} }; diff --git a/src/llama.cpp b/src/llama.cpp index 44fef53b31295..d929d74e567e8 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -3430,8 +3430,6 @@ static int llama_get_device_count(const llama_model & model) { count += ggml_backend_vk_get_device_count(); #elif defined(GGML_USE_CANN) count += ggml_backend_cann_get_device_count(); -#elif defined(GGML_USE_QNN) - count = ggml_backend_qnn_get_device_count(); #endif return count; @@ -3465,8 +3463,6 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(const llama_mode if (host_buffer) { buft = ggml_backend_vk_host_buffer_type(); } -#elif defined(GGML_USE_QNN) - buft = ggml_backend_qnn_buffer_type(gpu); #endif if (buft == nullptr) { From f2604982136b43944902366d2b41f5ebb65cc49a Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 11 Oct 2024 12:11:31 +0800 Subject: [PATCH 122/143] remove unused function --- ggml/src/ggml-qnn.cpp | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index 2d2b4745d13f1..bc88ba0f4de45 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -285,26 +285,6 @@ ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backend, ggml_cgraph * return result; } -bool ggml_backend_qnn_supports_op(ggml_backend_t backend, const ggml_tensor *op) { - GGML_UNUSED(backend); - return qnn::ggml_qnn_supports_op(op); -} - -bool ggml_backend_qnn_offload_op(ggml_backend_t backend, const ggml_tensor *op) { - GGML_UNUSED(backend); - - size_t dims = ggml_n_dims(op); - bool can_offload = false; - for (size_t i = 0; i < dims; i++) { - if (op->ne[i] > 1) { - can_offload = true; - break; - } - } - - return can_offload; -} - ggml_backend_i ggml_backend_qnn_interface = { /* .get_name = */ ggml_backend_qnn_name, /* .free = */ ggml_backend_qnn_free, From 4abaf7d87ed55876448394d340e475e5426f29a9 Mon Sep 17 00:00:00 2001 From: nullname Date: Mon, 28 Oct 2024 12:48:16 +0800 Subject: [PATCH 123/143] feat: fix mulmat (#2) * ggml_qnn_op_config now manager the construction of ggml_qnn_tensor * wip * add interface ggml_qnn_op_config * add ggml_qnn_list_op_config * add create_tensor and move tensor bind to execute * wip * rename: ggml_qnn_list_op_config -> ggml_qnn_matmul_op_config * add tensortype to allow native tensor * remove ggml_tensor param at ggml_qnn_tensor::create_tensor * postpone the tensor id allocation to add_node * add ggml_qnn_op_config_base * trival change to reduct the param of function * split bind_tensors into bind_input_tensors and bind_output_tensors * implement ggml_qnn_single_op_config::create_tensors next will set the prameter of transpose * tensor: add bind buffer * add parameter tensor type * implement add_tensor_param * set qnn_instance only at constructor * set transpose tensor param * move create_op_constructor into 
op-config module * create QNN_OP_MAT_MUL from ggml_qnn_matmul_op_config * try fix crash * fix compiling error at older ndk (r23c) * fix crash * fix parameter tensor name * update tensor dimension assignment and add TODO * fix mat_mul graph creating * fix MUL_MAT_256x16x10x1_256x1x10x1_16x1x10x1 * append type to graph cache key * wip * fix supported op * update comment * disable op other than add and mat_mul * add convert op to adapt multi input/output format * disable f16 for cpu backend according to official doc https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/cpu_backend.html#supported-operations * add supported data types flags in each backend * remove unused functions * append output type to graph key * fix gpu backend by disable the different data type op * fix cpu backend support ops * fix duplicated tensor name * append op name * suppress warning * remove unused code --- ggml/src/ggml-qnn.cpp | 36 ++- ggml/src/ggml-qnn/backend-ops.cpp | 114 +++++--- ggml/src/ggml-qnn/backend-ops.hpp | 2 +- ggml/src/ggml-qnn/backend.hpp | 2 + ggml/src/ggml-qnn/buffer.hpp | 4 +- ggml/src/ggml-qnn/graph.hpp | 117 ++------ ggml/src/ggml-qnn/op-config.cpp | 471 ++++++++++++++++++++++++++++++ ggml/src/ggml-qnn/op-config.hpp | 147 ++++++---- ggml/src/ggml-qnn/qnn-lib.hpp | 2 +- ggml/src/ggml-qnn/tensor.hpp | 154 +++++++--- ggml/src/ggml-qnn/utils.cpp | 114 +++++++- ggml/src/ggml-qnn/utils.hpp | 15 +- 12 files changed, 920 insertions(+), 258 deletions(-) create mode 100644 ggml/src/ggml-qnn/op-config.cpp diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index bc88ba0f4de45..21a7dee1c99ef 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -51,12 +51,30 @@ struct qnn_device_caps { const char *description; const char *lib_name; enum ggml_backend_dev_type type; + + // TODO: should get this caps from device + std::unordered_set supported_types; }; const qnn_device_caps kDeviceCaps[GGML_QNN_MAX_DEVICES]{ - { "qnn-cpu", "Qualcomm Kryo CPU", "libQnnCpu.so", GGML_BACKEND_DEVICE_TYPE_CPU }, /* QNN_BACKEND_CPU */ - { "qnn-gpu", "Qualcomm Adreno GPU", "libQnnGpu.so", GGML_BACKEND_DEVICE_TYPE_GPU }, /* QNN_BACKEND_GPU */ - { "qnn-npu", "Qualcomm NPU", "libQnnHtp.so", GGML_BACKEND_DEVICE_TYPE_GPU }, /* QNN_BACKEND_NPU */ + { "qnn-cpu", + "Qualcomm Kryo CPU", + "libQnnCpu.so", + GGML_BACKEND_DEVICE_TYPE_CPU, + { GGML_TYPE_F32, + GGML_TYPE_I8 } }, // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/CpuOpDefSupplement.html#matmul + { "qnn-gpu", + "Qualcomm Adreno GPU", + "libQnnGpu.so", + GGML_BACKEND_DEVICE_TYPE_GPU, + { GGML_TYPE_F32, + GGML_TYPE_F16 } }, // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/GpuOpDefSupplement.html#matmul + { "qnn-npu", + "Qualcomm NPU", + "libQnnHtp.so", + GGML_BACKEND_DEVICE_TYPE_GPU, + { GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_I16, + GGML_TYPE_I8 } }, // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/HtpOpDefSupplement.html#matmul }; class ggml_backend_qnn_buffer_context { @@ -340,9 +358,10 @@ void ggml_backend_qnn_device_get_props(ggml_backend_dev_t dev, struct ggml_backe props->type = ggml_backend_qnn_device_get_type(dev); ggml_backend_qnn_device_get_memory(dev, &props->memory_free, &props->memory_total); props->caps = { - /* async */ false, - /* host_buffer */ false, - /* events */ false, + /* async */ false, + /* host_buffer */ false, + /* buffer_from_host_ptr */ false, + /* events */ false, }; } @@ -412,6 +431,7 @@ ggml_backend_t 
ggml_backend_qnn_init_with_device_context(ggml_backend_dev_t dev, dev_ctx->instance = instance; dev_ctx->qnn_interface = qnn_interface; dev_ctx->socinfo = instance->get_soc_info(); + dev_ctx->supported_types = kDeviceCaps[device_index].supported_types; ggml_backend_t qnn_backend = new ggml_backend{ /* .guid = */ ggml_backend_qnn_guid(), @@ -440,8 +460,8 @@ ggml_backend_buffer_t ggml_backend_qnn_device_buffer_from_ptr(ggml_backend_dev_t } bool ggml_backend_qnn_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor *op) { - GGML_UNUSED(dev); - return qnn::ggml_qnn_supports_op(op); + auto *device_ctx = get_device_context(dev); + return qnn::ggml_qnn_supports_op(device_ctx, op); } bool ggml_backend_qnn_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 9c6e5709c8189..d20069874a7c3 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -108,8 +108,8 @@ std::string get_graph_key(const std::string &op_name, const std::array &outputs) { constexpr static const auto append_dimensions = [](std::string &key, const ggml_tensor *tensor) { char buffer[256] = {}; - snprintf(buffer, sizeof(buffer), "_%ldx%ldx%ldx%ld", (long)tensor->ne[0], (long)tensor->ne[1], - (long)tensor->ne[2], (long)tensor->ne[3]); + snprintf(buffer, sizeof(buffer), "_%ldx%ldx%ldx%ld%s", (long)tensor->ne[0], (long)tensor->ne[1], + (long)tensor->ne[2], (long)tensor->ne[3], qnn::get_ggml_type_name(tensor->type)); key += buffer; }; @@ -117,32 +117,11 @@ std::string get_graph_key(const std::string &op_name, const std::arraytype); return graph_key; } -qnn::ggml_op_constructor_t generate_common_op_constructor(const std::string &op_name) { - if (op_name == QNN_OP_MAT_MUL) { - // For QNN_OP_MAT_MUL, we need to transpose the input tensor - return [](const std::string &name) { - auto config = std::make_unique(name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL); - Qnn_Scalar_t scalar = QNN_SCALAR_INIT; - scalar.dataType = QNN_DATATYPE_BOOL_8; - scalar.bool8Value = true; - config->add_scalar_param(QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN0, scalar); - QNN_LOG_DEBUG("add scalar param %s\n", QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN0); - return config; - }; - } - - return [op_name](const std::string &name) { - return std::make_unique(name, QNN_OP_PACKAGE_NAME_QTI_AISW, op_name); - }; -} - constexpr const char *kGgmlOpToQnnOp[] = { nullptr, // GGML_OP_NONE nullptr, // GGML_OP_DUP @@ -278,7 +257,7 @@ qnn::ggml_qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_device_context *c return nullptr; } - auto op_constructor = generate_common_op_constructor(kGgmlOpToQnnOp[op]); + auto op_constructor = qnn::create_op_constructor(kGgmlOpToQnnOp[op]); if (!graph->build_graph(op_constructor, to_ggml_tensor_array<_InputSize>(inputs), to_ggml_tensor_array<_OutputSize>(outputs))) { QNN_LOG_ERROR("build_graph failed\n"); @@ -542,11 +521,57 @@ static constexpr const ggml_qnn_binary_op_t kQnnBinaryOpsTable[] = { static_assert(sizeof(kQnnBinaryOpsTable) / sizeof(kQnnBinaryOpsTable[0]) == GGML_OP_COUNT, "GGML_OP_COUNT does not match the size of the kQnnBinaryOpsTable table"); +bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context *ctx, const ggml_tensor *tensor) { + switch (tensor->type) { + case GGML_TYPE_F32: + case GGML_TYPE_F16: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q4_0: + if (ctx->supported_types.find(tensor->type) == ctx->supported_types.end()) { + QNN_LOG_DEBUG("unsupported data type GGML_TYPE_F16 for 
cpu backend"); + return false; + } + break; + default: + QNN_LOG_DEBUG("unsupported data type %d", tensor->type); + return false; + } + + return true; +} + +bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor *op) { + GGML_UNUSED(ctx); + + auto *src0 = op->src[0]; + auto *src1 = op->src[1]; + if (src0->type != src1->type || src0->type != op->type) { + // current qnn implementation only supports the same type for src0 and src1 + QNN_LOG_DEBUG("src0 type %d and src1 type %d and op type %d are not equal", src0->type, src1->type, op->type); + return false; + } + + if (src0->ne[2] != src1->ne[2] || src0->ne[3] != src1->ne[3]) { + /* + * TODO: remove the blocker here when qnn backend supports mul_mat like this: + * [ne03, ne02, n, k] * [ne03 * x, ne02 * y, m, k] -> [ne03 * x, ne02 * y, m, n] + */ + QNN_LOG_DEBUG("src0 and src1 dimensions are not equal"); + return false; + } + + return true; +} + } // namespace namespace qnn { -bool ggml_qnn_supports_op(const ggml_tensor *op) { +bool ggml_qnn_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor *op) { + if (op->op == GGML_OP_NONE) { + return true; + } + if (op->op == GGML_OP_UNARY) { if (!kQnnUnaryOpsTable[kGgmlUnaryOpStart + ggml_get_unary_op(op)]) { QNN_LOG_DEBUG("unsupported unary op %d", ggml_get_unary_op(op)); @@ -557,35 +582,38 @@ bool ggml_qnn_supports_op(const ggml_tensor *op) { QNN_LOG_DEBUG("src0 is nullptr"); return false; } - } else if (op->op != GGML_OP_NONE) { + } else { if (!kQnnUnaryOpsTable[op->op] && !kQnnBinaryOpsTable[op->op]) { QNN_LOG_DEBUG("unsupported op %d", op->op); return false; } - if (!op->src[0] || !op->src[1]) { + auto *src0 = op->src[0]; + auto *src1 = op->src[1]; + if (!src0 || !src1) { QNN_LOG_DEBUG("src0 or src1 is nullptr"); return false; } -#ifndef NDEBUG - if (op->op == GGML_OP_ADD && !is_tensor_dimensions_equal(op->src[0], op->src[1])) { - QNN_LOG_DEBUG("src0 and src1 dimensions are not equal"); + if (!ggml_qnn_supports_tensor(ctx, src0) || !ggml_qnn_supports_tensor(ctx, src1) || + !ggml_qnn_supports_tensor(ctx, op)) { return false; } -#endif - } - switch (op->type) { - case GGML_TYPE_F32: - case GGML_TYPE_F16: - case GGML_TYPE_I8: - case GGML_TYPE_Q8_0: - case GGML_TYPE_Q4_0: - break; - default: - QNN_LOG_DEBUG("unsupported src0 type %d", op->src[0]->type); - return false; + switch (op->op) { + case GGML_OP_ADD: + if (!is_tensor_dimensions_equal(src0, src1)) { + QNN_LOG_DEBUG("src0 and src1 dimensions are not equal"); + return false; + } + break; + + case GGML_OP_MUL_MAT: + return ggml_qnn_supports_matmul_op(ctx, op); + + default: + return false; + } } return true; diff --git a/ggml/src/ggml-qnn/backend-ops.hpp b/ggml/src/ggml-qnn/backend-ops.hpp index 86658da118f8b..3df7f4a98a146 100644 --- a/ggml/src/ggml-qnn/backend-ops.hpp +++ b/ggml/src/ggml-qnn/backend-ops.hpp @@ -6,7 +6,7 @@ namespace qnn { -bool ggml_qnn_supports_op(const ggml_tensor *op); +bool ggml_qnn_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor *op); bool ggml_qnn_forward(ggml_backend_qnn_device_context *ctx, struct ggml_tensor *tensor); } // namespace qnn diff --git a/ggml/src/ggml-qnn/backend.hpp b/ggml/src/ggml-qnn/backend.hpp index 696a883480e9f..eb292e89bfd21 100644 --- a/ggml/src/ggml-qnn/backend.hpp +++ b/ggml/src/ggml-qnn/backend.hpp @@ -4,6 +4,7 @@ #include #include #include +#include #include "ggml.h" @@ -26,6 +27,7 @@ struct ggml_backend_qnn_device_context { // initialize in init qnn::qcom_socinfo socinfo = {}; + std::unordered_set supported_types; 
std::shared_ptr instance; std::shared_ptr qnn_interface; diff --git a/ggml/src/ggml-qnn/buffer.hpp b/ggml/src/ggml-qnn/buffer.hpp index 4b4b2daaa75b4..676e88c0454be 100644 --- a/ggml/src/ggml-qnn/buffer.hpp +++ b/ggml/src/ggml-qnn/buffer.hpp @@ -8,8 +8,8 @@ namespace qnn { class ggml_qnn_rpc_buffer { public: - ggml_qnn_rpc_buffer(std::shared_ptr qnn_instance, size_t size, uint32_t rank, uint32_t *dimensions, - Qnn_DataType_t data_type) : + ggml_qnn_rpc_buffer(std::shared_ptr qnn_instance, const size_t size, const uint32_t rank, + uint32_t *dimensions, Qnn_DataType_t data_type) : _qnn_instance(qnn_instance), _size(size) { _qnn_rpc_buffer = static_cast(qnn_instance->alloc_rpcmem(size, alignof(void *))); diff --git a/ggml/src/ggml-qnn/graph.hpp b/ggml/src/ggml-qnn/graph.hpp index 3f1a0ef163208..858a7d3af29a2 100644 --- a/ggml/src/ggml-qnn/graph.hpp +++ b/ggml/src/ggml-qnn/graph.hpp @@ -2,7 +2,6 @@ #pragma once #include -#include #include #include #include @@ -12,19 +11,15 @@ #include "logger.hpp" #include "op-config.hpp" #include "qnn-lib.hpp" -#include "tensor.hpp" namespace qnn { -using ggml_tensor_array_t = std::vector; -using ggml_op_constructor_t = std::function(const std::string &)>; - class ggml_qnn_graph { public: explicit ggml_qnn_graph(const std::string &graph_name, QNNBackend device, std::shared_ptr qnn_instance, size_t vtcm_size_in_mb) : _graph_name(graph_name), _device(device), _qnn_instance(qnn_instance) { - QNN_LOG_INFO("graph name %s", graph_name.c_str()); + QNN_LOG_INFO("[%s]create", graph_name.c_str()); auto qnn_interface = qnn_instance->get_qnn_interface(); auto qnn_context = qnn_instance->get_qnn_context_handle(); @@ -69,19 +64,16 @@ class ggml_qnn_graph { } if (error != QNN_SUCCESS) { - QNN_LOG_INFO( - "can't create qnn graph handle with graph name %s, " - "error = %d\n", - graph_name.c_str(), error); + QNN_LOG_INFO("[%s]can't create qnn graph handle, error = %d\n", graph_name.c_str(), error); return; } - QNN_LOG_INFO("create qnn graph handle with graph name %s ok\n", graph_name.c_str()); + QNN_LOG_INFO("[%s]create succeed\n", graph_name.c_str()); _graph_handle = graph_handle; _qnn_interface = qnn_interface; } - ~ggml_qnn_graph() { QNN_LOG_DEBUG("graph name %s, destroy", _graph_name.c_str()); } + ~ggml_qnn_graph() { QNN_LOG_DEBUG("[%s]destroy", _graph_name.c_str()); } bool build_graph(ggml_op_constructor_t op_constructor, const ggml_tensor_array_t &tensor_inputs, const ggml_tensor_array_t &tensor_outputs) { @@ -91,95 +83,44 @@ class ggml_qnn_graph { return false; } - // get the max tensor rank - for (auto tensor : tensor_inputs) { - _tensor_rank = std::max(_tensor_rank, ggml_n_dims(tensor)); - } - for (auto tensor : tensor_outputs) { - _tensor_rank = std::max(_tensor_rank, ggml_n_dims(tensor)); - } - - QNN_LOG_DEBUG("graph name %s, build_graph start", _graph_name.c_str()); - _tensor_inputs.resize(tensor_inputs.size()); - for (size_t i = 0; i < tensor_inputs.size(); i++) { - char buffer[GGML_MAX_NAME] = {}; - snprintf(buffer, GGML_MAX_NAME, "src%d", (int)i); - auto qnn_tensor = - std::make_shared(std::string(buffer), _device, _graph_handle, _qnn_instance); - auto *ggml_tensor = tensor_inputs[i]; - if (!qnn_tensor->bind_ggml_tensor(ggml_tensor, true, _tensor_rank)) { - QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor)); - return false; - } - - _tensor_inputs[i] = qnn_tensor; - } - - _tensor_outputs.resize(tensor_outputs.size()); - for (size_t i = 0; i < tensor_outputs.size(); i++) { - char buffer[GGML_MAX_NAME] = {}; - snprintf(buffer, GGML_MAX_NAME, 
"dst%d", (int)i); - auto qnn_tensor = - std::make_shared(std::string(buffer), _device, _graph_handle, _qnn_instance); - auto *ggml_tensor = tensor_outputs[i]; - if (!qnn_tensor->bind_ggml_tensor(ggml_tensor, false, _tensor_rank)) { - QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor)); - return false; - } - - _tensor_outputs[i] = qnn_tensor; + QNN_LOG_DEBUG("[%s]build_graph start", _graph_name.c_str()); + _op_config = op_constructor(_graph_name, _qnn_instance); + if (!_op_config->create_tensors(_device, _graph_handle, tensor_inputs, tensor_outputs)) { + QNN_LOG_ERROR("[%s]create_tensors failed\n", _graph_name.c_str()); + return false; } - _op_config = op_constructor(_graph_name); - _op_config->set_input_tensors(_tensor_inputs); - _op_config->set_output_tensors(_tensor_outputs); - auto error = _qnn_interface->qnn_graph_add_node(_graph_handle, _op_config->get_op_config()); - if (error != QNN_SUCCESS) { - auto *error_str = get_qnn_error_string(error); - if (error_str) { - QNN_LOG_ERROR("qnn_graph_add_node.error: %s\n", error_str); - } else { - QNN_LOG_ERROR("qnn_graph_add_node.error: %d\n", error); - } + if (!_op_config->add_op_to_graph(_graph_handle)) { + QNN_LOG_ERROR("[%s]add nodes failed\n", _graph_name.c_str()); return false; } - error = _qnn_interface->qnn_graph_finalize(_graph_handle, nullptr, nullptr); + auto error = _qnn_interface->qnn_graph_finalize(_graph_handle, nullptr, nullptr); if (error != QNN_SUCCESS) { auto *error_str = get_qnn_error_string(error); if (error_str) { - QNN_LOG_ERROR("qnn_graph_finalize.error: %s\n", error_str); + QNN_LOG_ERROR("[%s]qnn_graph_finalize.error: %s\n", _graph_name.c_str(), error_str); } else { - QNN_LOG_ERROR("qnn_graph_finalize.error: %d\n", error); + QNN_LOG_ERROR("[%s]qnn_graph_finalize.error: %d\n", _graph_name.c_str(), error); } return false; } - QNN_LOG_DEBUG("graph name %s, build_graph succeed", _graph_name.c_str()); + QNN_LOG_DEBUG("[%s]build_graph succeed", _graph_name.c_str()); return true; } bool execute(const ggml_tensor_array_t &tensor_inputs, const ggml_tensor_array_t &tensor_outputs) { - GGML_ASSERT(tensor_inputs.size() == _tensor_inputs.size()); - GGML_ASSERT(tensor_outputs.size() == _tensor_outputs.size()); - for (size_t i = 0; i < tensor_inputs.size(); i++) { - auto *ggml_tensor = tensor_inputs[i]; - if (!_tensor_inputs[i]->bind_ggml_tensor(ggml_tensor, true, _tensor_rank)) { - QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor)); - return false; - } + if (!_op_config->bind_input_tensors(tensor_inputs)) { + QNN_LOG_ERROR("[%s]bind input tensors failed\n", _graph_name.c_str()); + return false; } - for (size_t i = 0; i < tensor_outputs.size(); i++) { - auto *ggml_tensor = tensor_outputs[i]; - if (!_tensor_outputs[i]->bind_ggml_tensor(ggml_tensor, false, _tensor_rank)) { - QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor)); - return false; - } + if (!_op_config->bind_output_tensors(tensor_outputs)) { + QNN_LOG_ERROR("[%s]bind output tensors failed\n", _graph_name.c_str()); + return false; } - _op_config->set_input_tensors(_tensor_inputs); - _op_config->set_output_tensors(_tensor_outputs); auto &qnn_tensor_inputs = _op_config->get_qnn_input_tensors(); auto &qnn_tensor_outputs = _op_config->get_qnn_output_tensors(); @@ -188,20 +129,15 @@ class ggml_qnn_graph { qnn_tensor_outputs.data(), qnn_tensor_outputs.size(), nullptr, nullptr); if (_device == QNN_BACKEND_NPU) { if (error == QNN_COMMON_ERROR_SYSTEM_COMMUNICATION) { - QNN_LOG_WARN("NPU crashed. SSR detected. 
Caused QNN graph execute error\n"); + QNN_LOG_WARN("[%s]NPU crashed. SSR detected. Caused QNN graph execute error\n", _graph_name.c_str()); } } - for (auto tensor : _tensor_inputs) { - tensor->unbind_ggml_tensor(); - } - - for (auto tensor : _tensor_outputs) { - tensor->unbind_ggml_tensor(); - } + _op_config->unbind_input_tensors(); + _op_config->unbind_output_tensors(); if (error != QNN_SUCCESS) { - QNN_LOG_INFO("error = %d\n", error); + QNN_LOG_INFO("[%s]error = %d\n", _graph_name.c_str(), error); return false; } @@ -220,11 +156,8 @@ class ggml_qnn_graph { Qnn_GraphHandle_t _graph_handle = nullptr; std::shared_ptr _qnn_instance; std::shared_ptr _qnn_interface; - std::vector> _tensor_inputs; - std::vector> _tensor_outputs; std::unique_ptr _op_config; std::vector _param_types; - int _tensor_rank = 0; DISABLE_COPY(ggml_qnn_graph); DISABLE_MOVE(ggml_qnn_graph); diff --git a/ggml/src/ggml-qnn/op-config.cpp b/ggml/src/ggml-qnn/op-config.cpp new file mode 100644 index 0000000000000..07dcba156471b --- /dev/null +++ b/ggml/src/ggml-qnn/op-config.cpp @@ -0,0 +1,471 @@ +#include "op-config.hpp" + +#include + +#include "logger.hpp" + +namespace { + +constexpr const qnn::qnn_dimension_array_t kTransposeParamData[GGML_MAX_DIMS] = { + { 0 }, + { 1, 0 }, + { 0, 2, 1 }, + { 0, 1, 3, 2 }, +}; + +qnn::qnn_dimension_array_t get_transposed_dimensions(const qnn::qnn_dimension_array_t &dimensions, int rank) { + qnn::qnn_dimension_array_t transposed_dims = dimensions; + if (rank >= 2) { + transposed_dims[rank - 1] = dimensions[rank - 2]; + transposed_dims[rank - 2] = dimensions[rank - 1]; + } + + return transposed_dims; +} + +int get_rank(const qnn::ggml_tensor_array_t &tensor_inputs, const qnn::ggml_tensor_array_t &tensor_outputs) { + int tensor_rank = 0; + // get the max tensor rank + for (auto tensor : tensor_inputs) { + tensor_rank = std::max(tensor_rank, ggml_n_dims(tensor)); + } + for (auto tensor : tensor_outputs) { + tensor_rank = std::max(tensor_rank, ggml_n_dims(tensor)); + } + + return tensor_rank; +} + +Qnn_DataType_t get_tensor_type(const qnn::ggml_qnn_tensor_array_t &tensors) { + Qnn_DataType_t type = QNN_DATATYPE_UNDEFINED; + for (auto tensor : tensors) { + auto tensor_type_size = qnn::qnn_datatype_size(tensor->get_data_type()); + GGML_ASSERT(tensor_type_size > 0); + if (tensor_type_size > qnn::qnn_datatype_size(type)) { + type = tensor->get_data_type(); + } + } + + return type; +} + +struct tensor_common_params { + const char *name_prefix; + int tensor_rank; + bool is_input; + QNNBackend device; + Qnn_GraphHandle_t graph_handle; + std::shared_ptr qnn_instance; +}; + +void create_tensors_from_ggml_tensor(const tensor_common_params ¶ms, const qnn::ggml_tensor_array_t &ggml_tensors, + qnn::ggml_qnn_tensor_array_t *tensor_wrappers, + std::vector *qnn_tensors) { + using namespace qnn; + + tensor_wrappers->resize(ggml_tensors.size()); + if (qnn_tensors) { + qnn_tensors->resize(ggml_tensors.size()); + } + char buffer[GGML_MAX_NAME] = {}; + auto tensor_type = params.is_input ? 
ggml_qnn_tensor::INPUT : ggml_qnn_tensor::OUTPUT; + for (size_t i = 0; i < ggml_tensors.size(); i++) { + snprintf(buffer, GGML_MAX_NAME, "%s%d", params.name_prefix, (int)i); + auto *ggml_tensor = ggml_tensors[i]; + (*tensor_wrappers)[i] = std::make_shared(tensor_type, std::string(buffer), ggml_tensor->ne, + ggml_tensor->type, params.tensor_rank, params.device, + params.graph_handle, params.qnn_instance); + } +} + +bool bind_tensors(const qnn::ggml_tensor_array_t &ggml_tensors, qnn::ggml_qnn_tensor_array_t &tensor_wrappers, + std::vector &qnn_tensors) { + for (size_t i = 0; i < ggml_tensors.size(); i++) { + auto *ggml_tensor = ggml_tensors[i]; + if (!tensor_wrappers[i]->bind_ggml_tensor(ggml_tensor)) { + QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor)); + return false; + } + + qnn_tensors[i] = tensor_wrappers[i]->get_qnn_tensor(); + } + + return true; +} + +class ggml_qnn_connectable_op_config : public qnn::ggml_qnn_op_config_base { +public: + explicit ggml_qnn_connectable_op_config(const std::string &name, const std::string &package_name, + const std::string &op_type, + std::shared_ptr qnn_instance) : + ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {} + + bool create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle, + const qnn::ggml_tensor_array_t &tensor_inputs, + const qnn::ggml_tensor_array_t &tensor_outputs) override { + GGML_UNUSED(device); + GGML_UNUSED(graph_handle); + GGML_UNUSED(tensor_inputs); + GGML_UNUSED(tensor_outputs); + return true; + } + + void set_input_tensors(qnn::ggml_qnn_tensor_array_t &tensor_inputs) { + _tensor_inputs = tensor_inputs; + _qnn_tensor_inputs.resize(_tensor_inputs.size()); + } + + void set_input_tensors(qnn::ggml_qnn_tensor_array_t &&tensor_inputs) { + _tensor_inputs = std::move(tensor_inputs); + _qnn_tensor_inputs.resize(_tensor_inputs.size()); + } + + void set_output_tensors(qnn::ggml_qnn_tensor_array_t &tensor_outputs) { + _tensor_outputs = tensor_outputs; + _qnn_tensor_outputs.resize(_tensor_outputs.size()); + } + + void set_output_tensors(qnn::ggml_qnn_tensor_array_t &&tensor_outputs) { + _tensor_outputs = std::move(tensor_outputs); + _qnn_tensor_outputs.resize(_tensor_outputs.size()); + } + + qnn::ggml_qnn_tensor_array_t &get_input_tensors() { return _tensor_inputs; } + qnn::ggml_qnn_tensor_array_t &get_output_tensors() { return _tensor_outputs; } + +private: + DISABLE_COPY(ggml_qnn_connectable_op_config); + DISABLE_MOVE(ggml_qnn_connectable_op_config); +}; + +} // namespace + +namespace qnn { + +void ggml_qnn_op_config_base::add_scalar_param(const std::string &name, const Qnn_Scalar_t scalar) { + _param_names.push_back(name); + Qnn_Param_t param = QNN_PARAM_INIT; + param.paramType = QNN_PARAMTYPE_SCALAR; + param.name = _param_names.back().c_str(); + param.scalarParam = scalar; + _qnn_parameters.push_back(param); +} + +bool ggml_qnn_op_config_base::add_tensor_param(const std::string &name, const qnn_dimension_array_t &dimensions, + int rank, const uint8_t *data, const Qnn_DataType_t data_type, + QNNBackend device, Qnn_GraphHandle_t graph_handle) { + std::string tensor_name = _name + name + std::to_string(_tensor_parameters.size()); + auto param_tensor = std::make_shared(ggml_qnn_tensor::PARAMETER, tensor_name, dimensions, + data_type, rank, device, graph_handle, _qnn_instance); + size_t data_size = ggml_type_size(ggml_datatype_from_qnn_datatype(data_type)); + for (int i = 0; i < rank; i++) { + data_size *= dimensions[i]; + } + + GGML_ASSERT(data_size > 0); + if 
(!param_tensor->bind_buffer(const_cast(data), data_size)) { + QNN_LOG_ERROR("parameter tensor bind_buffer failed\n"); + return false; + } + + if (!param_tensor->alloc_qnn_tensor_id()) { + QNN_LOG_ERROR("parameter tensor alloc_qnn_tensor_id failed\n"); + return false; + } + + _tensor_parameters.push_back(param_tensor); + _param_names.push_back(name); + Qnn_Param_t param = QNN_PARAM_INIT; + param.paramType = QNN_PARAMTYPE_TENSOR; + param.name = _param_names.back().c_str(); + param.tensorParam = param_tensor->get_qnn_tensor(); + _qnn_parameters.push_back(param); + return true; +} + +bool ggml_qnn_op_config_base::add_op_to_graph(Qnn_GraphHandle_t graph_handle) { + GGML_ASSERT(_qnn_tensor_inputs.size() == _tensor_inputs.size()); + GGML_ASSERT(_qnn_tensor_outputs.size() == _tensor_outputs.size()); + + auto qnn_interface = _qnn_instance->get_qnn_interface(); + for (size_t i = 0; i < _tensor_inputs.size(); i++) { + auto tensor = _tensor_inputs[i]; + if (!tensor->alloc_qnn_tensor_id()) { + QNN_LOG_ERROR("[%s]input tensor alloc_qnn_tensor_id failed\n", _name.c_str()); + return false; + } + + _qnn_tensor_inputs[i] = tensor->get_qnn_tensor(); + } + + for (size_t i = 0; i < _tensor_outputs.size(); i++) { + auto tensor = _tensor_outputs[i]; + if (!tensor->alloc_qnn_tensor_id()) { + QNN_LOG_ERROR("[%s]output tensor alloc_qnn_tensor_id failed\n", _name.c_str()); + return false; + } + _qnn_tensor_outputs[i] = _tensor_outputs[i]->get_qnn_tensor(); + } + + auto error = qnn_interface->qnn_graph_add_node(graph_handle, get_op_config()); + if (error != QNN_SUCCESS) { + auto *error_str = get_qnn_error_string(error); + if (error_str) { + QNN_LOG_ERROR("[%s]qnn_graph_add_node.error: %s\n", _name.c_str(), error_str); + } else { + QNN_LOG_ERROR("[%s]qnn_graph_add_node.error: %d\n", _name.c_str(), error); + } + return false; + } + + QNN_LOG_DEBUG("[%s]added to graph\n", _name.c_str()); + return true; +} + +bool ggml_qnn_op_config_base::bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) { + GGML_ASSERT(tensor_inputs.size() == _tensor_inputs.size()); + return bind_tensors(tensor_inputs, _tensor_inputs, _qnn_tensor_inputs); +} + +bool ggml_qnn_op_config_base::bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) { + GGML_ASSERT(tensor_outputs.size() == _tensor_outputs.size()); + return bind_tensors(tensor_outputs, _tensor_outputs, _qnn_tensor_outputs); +} + +void ggml_qnn_op_config_base::unbind_input_tensors() { + for (auto &tensor : _tensor_inputs) { + tensor->unbind(); + } +} + +void ggml_qnn_op_config_base::unbind_output_tensors() { + for (auto &tensor : _tensor_outputs) { + tensor->unbind(); + } +} + +Qnn_OpConfig_t ggml_qnn_op_config_base::get_op_config() { + Qnn_OpConfig_t config = QNN_OPCONFIG_INIT; + config.version = QNN_OPCONFIG_VERSION_1; + auto &op_config = config.v1; + op_config.name = _name.c_str(); + op_config.packageName = _package_name.c_str(); + op_config.typeName = _op_type.c_str(); + op_config.numOfParams = (uint32_t)_qnn_parameters.size(); + op_config.params = _qnn_parameters.data(); + op_config.numOfInputs = (uint32_t)_qnn_tensor_inputs.size(); + op_config.inputTensors = _qnn_tensor_inputs.data(); + op_config.numOfOutputs = (uint32_t)_qnn_tensor_outputs.size(); + op_config.outputTensors = _qnn_tensor_outputs.data(); + return config; +} + +bool ggml_qnn_single_op_config::create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle, + const ggml_tensor_array_t &tensor_inputs, + const ggml_tensor_array_t &tensor_outputs) { + const auto tensor_rank = get_rank(tensor_inputs, 
tensor_outputs); + tensor_common_params params = { "src", tensor_rank, true, device, graph_handle, _qnn_instance }; + create_tensors_from_ggml_tensor(params, tensor_inputs, &_tensor_inputs, &_qnn_tensor_inputs); + params.name_prefix = "dst"; + params.is_input = false; + create_tensors_from_ggml_tensor(params, tensor_outputs, &_tensor_outputs, &_qnn_tensor_outputs); + return true; +} + +bool ggml_qnn_matmul_op_config::create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle, + const ggml_tensor_array_t &tensor_inputs, + const ggml_tensor_array_t &tensor_outputs) { + GGML_ASSERT(tensor_inputs.size() == 2); + GGML_ASSERT(tensor_outputs.size() == 1); + const auto tensor_rank = get_rank(tensor_inputs, tensor_outputs); + GGML_ASSERT(tensor_rank >= 2); + + // create input tensors + tensor_common_params params = { "src", tensor_rank, true, device, graph_handle, _qnn_instance }; + create_tensors_from_ggml_tensor(params, tensor_inputs, &_tensor_inputs, &_qnn_tensor_inputs); + + // create output tensor + ggml_qnn_tensor_array_t mat_mul_tensor_outputs; + params.name_prefix = "dst"; + params.is_input = false; + create_tensors_from_ggml_tensor(params, tensor_outputs, &mat_mul_tensor_outputs, nullptr); + + // create mat_mul nodes + return create_mat_mul_nodes(device, graph_handle, tensor_rank, _tensor_inputs, mat_mul_tensor_outputs); +} + +bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, + ggml_qnn_tensor_array_t &tensor_inputs, + ggml_qnn_tensor_array_t &tensor_outputs) { + + /* + * First, both the ggml and qnn tensor in memory are stored as row-major format. (For more details, please also: + * https://pytorch.org/blog/tensor-memory-format-matters/#:~:text=Column%20Major%20Order:%20In%20this%20format,%20the%20matrix) + * But the dimensions of the tensor are stored in different order. + * For example, a 2x3 matrix: + * [ + * [1, 2, 3], + * [4, 5, 6], + * ] + * The ggml tensor will have dimensions [3, 2], while the qnn tensor will have dimensions [2, 3]. + * + * Second, from the ggml introduction here: https://github.com/huggingface/blog/blob/main/introduction-to-ggml.md + * Given 2 matrices A and B, the matrix multiplication C = A * B is defined as: + * ```python + * import torch + * # Create two matrices + * A = torch.tensor([ + * [2, 8], + * [5, 1], + * [4, 2], + * [8, 6], + * ]) + * B = torch.tensor([ + * [10, 5], + * [9, 9], + * [5, 4], + * ]) + * # Perform matrix multiplication + * result = torch.matmul(A, B.T) + * print(result.T) + * ``` + * Here, the B.T is the transpose of B. 
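+     *
+     * (Editor's illustration, added for clarity; not part of the original patch.) Plugging in the
+     * A and B above: ggml stores A as ne = [2, 4] and B as ne = [2, 3], which map to QNN
+     * dimensions [4, 2] and [3, 2]. The transpose0 node below turns B into a [2, 3] tensor (B.T),
+     * MatMul(A, B.T) yields the [4, 3] intermediate dst_trans, and transpose1 flips that into the
+     * [3, 4] output tensor, i.e. the ggml result with ne = [4, 3].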
+ * + * So here we need to create graph like: + * ```mermaid + * graph TD; + * i1>ggml_tensor_in0] --src0--> mat_mul0; + * i2>ggml_tensor_in1] --src1--> transpose0; + * transpose0 --src0_trans--> mat_mul0; + * mat_mul0 --dst_trans--> transpose1; + * transpose1 --dst0--> o1>ggml_tensor_out]; + * ``` + */ + + // create src0_trans tensor + auto src1 = tensor_inputs.back(); + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS does not match the expected value"); + + qnn_dimension_array_t dimensions = get_transposed_dimensions(src1->get_dimensions(), rank); + auto src0_trans = + std::make_shared(ggml_qnn_tensor::INTERMEDIATE, "src0_trans", dimensions, + src1->get_data_type(), rank, device, graph_handle, _qnn_instance); + + // create dst_trans tensor + auto dst = tensor_outputs.front(); + dimensions = get_transposed_dimensions(dst->get_dimensions(), rank); + auto dst_trans = std::make_shared(ggml_qnn_tensor::INTERMEDIATE, "dst_trans", dimensions, + dst->get_data_type(), rank, device, graph_handle, _qnn_instance); + + // create transpose0 + auto transpose0 = std::make_shared(_name + "_trans0", QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_TRANSPOSE, _qnn_instance); + + // create transpose1 + auto transpose1 = std::make_shared(_name + "_trans1", QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_TRANSPOSE, _qnn_instance); + + // create mat_mul + auto mat_mul = std::make_shared(_name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL, + _qnn_instance); + + // set transpose0 parameters + auto *params_data = reinterpret_cast(kTransposeParamData[rank - 1].data()); + const qnn_dimension_array_t param_dims = { (uint32_t)rank, 1, 1, 1 }; + transpose0->add_tensor_param(QNN_OP_TRANSPOSE_PARAM_PERM, param_dims, 1, params_data, QNN_DATATYPE_UINT_32, device, + graph_handle); + + // set transpose1 parameters + transpose1->add_tensor_param(QNN_OP_TRANSPOSE_PARAM_PERM, param_dims, 1, params_data, QNN_DATATYPE_UINT_32, device, + graph_handle); + + // set tensor to transpose0 + ggml_qnn_tensor_array_t tensors = { tensor_inputs.back() }; + transpose0->set_input_tensors(tensors); + tensors = { src0_trans }; + transpose0->set_output_tensors(tensors); + + // set tensor to mat_mul + tensors = { tensor_inputs.front(), src0_trans }; + mat_mul->set_input_tensors(tensors); + tensors = { dst_trans }; + mat_mul->set_output_tensors(tensors); + + // set tensor to transpose1 + tensors = { dst_trans }; + transpose1->set_input_tensors(tensors); + transpose1->set_output_tensors(tensor_outputs); + + _mat_mul = mat_mul; + _transpose0 = transpose0; + _transpose1 = transpose1; + return true; +} + +bool ggml_qnn_matmul_op_config::add_op_to_graph(Qnn_GraphHandle_t graph_handle) { + for (auto &convert : _input_converts) { + if (convert && !convert->add_op_to_graph(graph_handle)) { + return false; + } + } + + return _transpose0->add_op_to_graph(graph_handle) && _mat_mul->add_op_to_graph(graph_handle) && + _transpose1->add_op_to_graph(graph_handle) && + (!_output_convert || _output_convert->add_op_to_graph(graph_handle)); +} + +bool ggml_qnn_matmul_op_config::bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) { + return bind_tensors(tensor_inputs, _tensor_inputs, _qnn_tensor_inputs); +} + +bool ggml_qnn_matmul_op_config::bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) { + if (_output_convert) { + return _output_convert->bind_output_tensors(tensor_outputs); + } else { + return _transpose1->bind_output_tensors(tensor_outputs); + } +} + +void ggml_qnn_matmul_op_config::unbind_input_tensors() { + _mat_mul->unbind_input_tensors(); + 
_transpose0->unbind_input_tensors(); + for (auto &convert : _input_converts) { + if (convert) { + convert->unbind_input_tensors(); + } + } +} + +void ggml_qnn_matmul_op_config::unbind_output_tensors() { + _transpose1->unbind_output_tensors(); + if (_output_convert) { + _output_convert->unbind_output_tensors(); + } +} + +std::vector &ggml_qnn_matmul_op_config::get_qnn_output_tensors() { + if (_output_convert) { + return _output_convert->get_qnn_output_tensors(); + } else { + return _transpose1->get_qnn_output_tensors(); + } +} + +ggml_op_constructor_t create_op_constructor(const std::string &op_name) { + if (op_name == QNN_OP_MAT_MUL) { + // For QNN_OP_MAT_MUL, we need to transpose the input tensor + return [](const std::string &instance_name, + std::shared_ptr qnn_instance) -> std::unique_ptr { + QNN_LOG_DEBUG("create QNN_OP_MAT_MUL, name %s\n", instance_name.c_str()); + return std::make_unique(instance_name, qnn_instance); + }; + } + + return [op_name](const std::string &instance_name, + std::shared_ptr qnn_instance) -> std::unique_ptr { + return std::make_unique(instance_name, QNN_OP_PACKAGE_NAME_QTI_AISW, op_name, + qnn_instance); + }; +} + +} // namespace qnn diff --git a/ggml/src/ggml-qnn/op-config.hpp b/ggml/src/ggml-qnn/op-config.hpp index 7852ee84dc12f..2016cb4ac994d 100644 --- a/ggml/src/ggml-qnn/op-config.hpp +++ b/ggml/src/ggml-qnn/op-config.hpp @@ -1,73 +1,122 @@ #pragma once +#include +#include #include #include #include "ggml-qnn.h" -#include "logger.hpp" #include "qnn-lib.hpp" #include "qnn-types.hpp" #include "tensor.hpp" namespace qnn { + +using ggml_tensor_array_t = std::vector; + class ggml_qnn_op_config { public: - explicit ggml_qnn_op_config(const std::string &name, const std::string &package_name, const std::string &op_type) : - _name(name), _package_name(package_name), _op_type(op_type) {} - - void set_input_tensors(const std::vector> &tensor_inputs) { - _qnn_tensor_inputs.resize(tensor_inputs.size()); - for (size_t i = 0; i < tensor_inputs.size(); i++) { - _qnn_tensor_inputs[i] = tensor_inputs[i]->get_qnn_tensor(); - } - } - - void set_output_tensors(const std::vector> &tensor_outputs) { - _qnn_tensor_outputs.resize(tensor_outputs.size()); - for (size_t i = 0; i < tensor_outputs.size(); i++) { - _qnn_tensor_outputs[i] = tensor_outputs[i]->get_qnn_tensor(); - } - } - - void add_scalar_param(const std::string &name, const Qnn_Scalar_t scalar) { - _param_names.push_back(name); - Qnn_Param_t param = QNN_PARAM_INIT; - param.paramType = QNN_PARAMTYPE_SCALAR; - param.name = _param_names.back().c_str(); - param.scalarParam = scalar; - _parameters.push_back(param); - } - - std::vector &get_qnn_input_tensors() { return _qnn_tensor_inputs; } - std::vector &get_qnn_output_tensors() { return _qnn_tensor_outputs; } - - Qnn_OpConfig_t get_op_config() { - Qnn_OpConfig_t config = QNN_OPCONFIG_INIT; - config.version = QNN_OPCONFIG_VERSION_1; - auto &op_config = config.v1; - op_config.name = _name.c_str(); - op_config.packageName = _package_name.c_str(); - op_config.typeName = _op_type.c_str(); - op_config.numOfParams = (uint32_t)_parameters.size(); - op_config.params = _parameters.data(); - op_config.numOfInputs = (uint32_t)_qnn_tensor_inputs.size(); - op_config.inputTensors = _qnn_tensor_inputs.data(); - op_config.numOfOutputs = (uint32_t)_qnn_tensor_outputs.size(); - op_config.outputTensors = _qnn_tensor_outputs.data(); - return config; - } + virtual ~ggml_qnn_op_config() {} + virtual bool create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle, + const ggml_tensor_array_t 
&tensor_inputs, + const ggml_tensor_array_t &tensor_outputs) = 0; + virtual std::vector &get_qnn_input_tensors() = 0; + virtual std::vector &get_qnn_output_tensors() = 0; + virtual bool add_op_to_graph(Qnn_GraphHandle_t graph_handle) = 0; + virtual bool bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) = 0; + virtual bool bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) = 0; + virtual void unbind_input_tensors() = 0; + virtual void unbind_output_tensors() = 0; +}; + +class ggml_qnn_op_config_base : public ggml_qnn_op_config { +public: + explicit ggml_qnn_op_config_base(const std::string &name, const std::string &package_name, + const std::string &op_type, std::shared_ptr qnn_instance) : + _name(name), _package_name(package_name), _op_type(op_type), _qnn_instance(qnn_instance) {} + + void add_scalar_param(const std::string &name, const Qnn_Scalar_t scalar); + bool add_tensor_param(const std::string &name, const qnn_dimension_array_t &dimensions, int rank, + const uint8_t *data, const Qnn_DataType_t data_type, QNNBackend device, + Qnn_GraphHandle_t graph_handle); + bool add_op_to_graph(Qnn_GraphHandle_t graph_handle) override; + bool bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) override; + bool bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) override; + void unbind_input_tensors() override; + void unbind_output_tensors() override; + std::vector &get_qnn_input_tensors() override { return _qnn_tensor_inputs; } + std::vector &get_qnn_output_tensors() override { return _qnn_tensor_outputs; } + +protected: + Qnn_OpConfig_t get_op_config(); -private: std::string _name; std::string _package_name; std::string _op_type; + std::shared_ptr _qnn_instance; + ggml_qnn_tensor_array_t _tensor_inputs; + ggml_qnn_tensor_array_t _tensor_outputs; + ggml_qnn_tensor_array_t _tensor_parameters; std::vector _qnn_tensor_inputs; std::vector _qnn_tensor_outputs; - std::vector _parameters; + std::vector _qnn_parameters; std::vector _param_names; - DISABLE_COPY(ggml_qnn_op_config); - DISABLE_MOVE(ggml_qnn_op_config); + DISABLE_COPY(ggml_qnn_op_config_base); + DISABLE_MOVE(ggml_qnn_op_config_base); +}; + +class ggml_qnn_single_op_config : public ggml_qnn_op_config_base { +public: + explicit ggml_qnn_single_op_config(const std::string &name, const std::string &package_name, + const std::string &op_type, std::shared_ptr qnn_instance) : + ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {} + + bool create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle, const ggml_tensor_array_t &tensor_inputs, + const ggml_tensor_array_t &tensor_outputs) override; + +private: + DISABLE_COPY(ggml_qnn_single_op_config); + DISABLE_MOVE(ggml_qnn_single_op_config); }; + +class ggml_qnn_matmul_op_config : public ggml_qnn_op_config { +public: + ggml_qnn_matmul_op_config(const std::string &name, std::shared_ptr qnn_instance) : + _name(name), _qnn_instance(qnn_instance) {} + + bool create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle, const ggml_tensor_array_t &tensor_inputs, + const ggml_tensor_array_t &tensor_outputs) override; + bool add_op_to_graph(Qnn_GraphHandle_t graph_handle) override; + bool bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) override; + bool bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) override; + void unbind_input_tensors() override; + void unbind_output_tensors() override; + std::vector &get_qnn_input_tensors() override { return _qnn_tensor_inputs; } + std::vector &get_qnn_output_tensors() override; 
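+    // Editor's note (added for clarity, not part of the patch): the members below realize the
+    // decomposition documented in op-config.cpp -- optional per-input convert nodes, a transpose
+    // of the second input, the MatMul node itself, a final transpose, and an optional output
+    // convert node; add_op_to_graph() adds them to the QNN graph in exactly that order.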
+ +private: + bool create_mat_mul_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, + ggml_qnn_tensor_array_t &tensor_inputs, ggml_qnn_tensor_array_t &tensor_outputs); + + std::string _name; + std::shared_ptr _qnn_instance; + std::shared_ptr _transpose0; + std::shared_ptr _transpose1; + std::shared_ptr _mat_mul; + std::vector> _input_converts; + std::shared_ptr _output_convert; + ggml_qnn_tensor_array_t _tensor_inputs; + std::vector _qnn_tensor_inputs; + + DISABLE_COPY(ggml_qnn_matmul_op_config); + DISABLE_MOVE(ggml_qnn_matmul_op_config); +}; + +using ggml_op_constructor_t = + std::function(const std::string &, std::shared_ptr)>; + +ggml_op_constructor_t create_op_constructor(const std::string &op_name); + } // namespace qnn diff --git a/ggml/src/ggml-qnn/qnn-lib.hpp b/ggml/src/ggml-qnn/qnn-lib.hpp index d55f730f80d84..74bc2b3f95f6b 100644 --- a/ggml/src/ggml-qnn/qnn-lib.hpp +++ b/ggml/src/ggml-qnn/qnn-lib.hpp @@ -637,7 +637,7 @@ class qnn_instance { return mem_fd; } - Qnn_MemHandle_t register_rpcmem(void *p_data, uint32_t rank, uint32_t *dimensions, Qnn_DataType_t data_type) { + Qnn_MemHandle_t register_rpcmem(void *p_data, const uint32_t rank, uint32_t *dimensions, Qnn_DataType_t data_type) { if (!p_data) { QNN_LOG_WARN("invalid param\n"); return nullptr; diff --git a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp index c465d17f25506..faf5b0df5f4e1 100644 --- a/ggml/src/ggml-qnn/tensor.hpp +++ b/ggml/src/ggml-qnn/tensor.hpp @@ -1,8 +1,10 @@ #pragma once +#include #include #include +#include #include #include #include @@ -16,55 +18,81 @@ namespace qnn { +static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS should be 4"); + class ggml_qnn_tensor { public: - explicit ggml_qnn_tensor(const std::string &name, QNNBackend device, Qnn_GraphHandle_t graph_handle, + typedef enum _tensor_type { INPUT, OUTPUT, INTERMEDIATE, PARAMETER } tensor_type_t; + + explicit ggml_qnn_tensor(tensor_type_t tensor_type, const std::string &name, + const qnn_dimension_array_t &dimensions, Qnn_DataType_t data_type, int rank, + QNNBackend device, Qnn_GraphHandle_t graph_handle, std::shared_ptr qnn_instance) : _tensor_name(name), _device(device), _qnn_instance(qnn_instance), _graph_handle(graph_handle) { - QNN_TENSOR_SET_NAME(_qnn_tensor, _tensor_name.c_str()); + if (!_tensor_name.empty()) { + QNN_TENSOR_SET_NAME(_qnn_tensor, _tensor_name.c_str()); + } QNN_TENSOR_SET_DIMENSIONS(_qnn_tensor, _dimensions.data()); - QNN_TENSOR_SET_TYPE(_qnn_tensor, QNN_TENSOR_TYPE_NATIVE); QNN_TENSOR_SET_DATA_FORMAT(_qnn_tensor, QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER); - QNN_LOG_DEBUG("create tensor %s, device: %d", _tensor_name.c_str(), device); + + _dimensions = dimensions; + update_params_from_ggml_tensor(tensor_type, data_type, rank); + QNN_LOG_DEBUG("create tensor %s, rank: %d, dims: [%d, %d, %d, %d], data_type: %d, device: %d", + _tensor_name.c_str(), rank, (int)_dimensions[0], (int)_dimensions[1], (int)_dimensions[2], + (int)_dimensions[3], (int)data_type, (int)device); } + explicit ggml_qnn_tensor(tensor_type_t tensor_type, const std::string &name, + const ggml_dimension_array_t &dimensions, ggml_type data_type, int rank, QNNBackend device, + Qnn_GraphHandle_t graph_handle, std::shared_ptr qnn_instance) : + ggml_qnn_tensor(tensor_type, name, get_internal_dimension(dimensions, rank), + qnn_datatype_from_ggml_datatype(data_type), rank, device, graph_handle, qnn_instance) {} + ~ggml_qnn_tensor() { _qnn_rpc_buffer.reset(); } - bool bind_ggml_tensor(ggml_tensor *tensor, bool is_input, int prev_max_rank) { - 
if (_tensor) { - if (_tensor != tensor) { - QNN_LOG_WARN("tensor %s has been bound to another ggml tensor %s", _tensor_name.c_str(), - ggml_get_name(_tensor)); - return false; - } - QNN_LOG_INFO("tensor %s already bound to same ggml tensor %s", _tensor_name.c_str(), - ggml_get_name(_tensor)); + bool alloc_qnn_tensor_id() { + if (QNN_TENSOR_GET_ID(_qnn_tensor)) { + QNN_LOG_WARN("graph tensor %s already created, id %d", _tensor_name.c_str(), + QNN_TENSOR_GET_ID(_qnn_tensor)); return true; } - update_params_from_ggml_tensor(tensor, prev_max_rank); - Qnn_TensorType_t new_tensor_type = is_input ? QNN_TENSOR_TYPE_APP_WRITE : QNN_TENSOR_TYPE_APP_READ; - QNN_TENSOR_SET_TYPE(_qnn_tensor, new_tensor_type); - QNN_LOG_INFO("tensor %s changed to type %d", _tensor_name.c_str(), new_tensor_type); + Qnn_Tensor_t qnn_tensor = _qnn_tensor; + auto qnn_interface = _qnn_instance->get_qnn_interface(); + auto error = qnn_interface->qnn_tensor_create_graph_tensor(_graph_handle, &qnn_tensor); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("create graph tensor failed, tensor %s, error: %d\n", _tensor_name.c_str(), error); + return false; + } + + QNN_TENSOR_SET_ID(_qnn_tensor, QNN_TENSOR_GET_ID(qnn_tensor)); + QNN_LOG_DEBUG("create graph tensor %s, id: %d, rank: %d", _tensor_name.c_str(), QNN_TENSOR_GET_ID(qnn_tensor), + QNN_TENSOR_GET_RANK(qnn_tensor)); - if (!QNN_TENSOR_GET_ID(_qnn_tensor)) { - Qnn_Tensor_t qnn_tensor = _qnn_tensor; - auto qnn_interface = _qnn_instance->get_qnn_interface(); - auto error = qnn_interface->qnn_tensor_create_graph_tensor(_graph_handle, &qnn_tensor); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("create graph tensor failed, tensor %s, error: %d\n", _tensor_name.c_str(), error); + return true; + } + + bool bind_buffer(uint8_t *buffer, const size_t buffer_size) { + if (_buffer) { + if (_buffer != buffer) { + QNN_LOG_WARN("tensor %s has been bound to another buffer %p", _tensor_name.c_str(), _buffer); return false; } - QNN_TENSOR_SET_ID(_qnn_tensor, QNN_TENSOR_GET_ID(qnn_tensor)); - QNN_LOG_DEBUG("create graph tensor %s, id: %d, rank: %d", _tensor_name.c_str(), - QNN_TENSOR_GET_ID(qnn_tensor), QNN_TENSOR_GET_RANK(qnn_tensor)); + QNN_LOG_INFO("tensor %s already bound to same ggml tensor %p", _tensor_name.c_str(), _buffer); + return true; + } + + if (QNN_TENSOR_GET_TYPE(_qnn_tensor) == QNN_TENSOR_TYPE_NATIVE) { + QNN_LOG_DEBUG("tensor %s type(%d) not READ/WRITE, skipping", _tensor_name.c_str(), + (int)QNN_TENSOR_TYPE_NATIVE); + return true; } if (should_use_mem_handle()) { if (!_qnn_rpc_buffer) { auto qnn_rpc_buffer = std::make_unique( - _qnn_instance, ggml_nbytes(tensor), QNN_TENSOR_GET_RANK(_qnn_tensor), + _qnn_instance, buffer_size, QNN_TENSOR_GET_RANK(_qnn_tensor), QNN_TENSOR_GET_DIMENSIONS(_qnn_tensor), QNN_TENSOR_GET_DATA_TYPE(_qnn_tensor)); if (!qnn_rpc_buffer->is_valid()) { QNN_LOG_WARN("alloc rpc mem failed, tensor %s", _tensor_name.c_str()); @@ -79,30 +107,41 @@ class ggml_qnn_tensor { QNN_LOG_DEBUG("tensor %s, use mem handle %p", _tensor_name.c_str(), QNN_TENSOR_GET_MEM_HANDLE(_qnn_tensor)); } else { QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW); - Qnn_ClientBuffer_t client_buf = { tensor->data, get_ggml_tensor_data_size(tensor) }; + Qnn_ClientBuffer_t client_buf = { buffer, (uint32_t)buffer_size }; QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf); QNN_LOG_DEBUG("tensor %s, use client buffer %p size %d", _tensor_name.c_str(), client_buf.data, (int)client_buf.dataSize); } - _tensor = tensor; + _buffer = buffer; + _buffer_size = buffer_size; if (!write_to_qnn_tensor()) 
{ QNN_LOG_WARN("write to qnn tensor failed, tensor %s", _tensor_name.c_str()); return false; } - QNN_LOG_DEBUG("bind tensor %s to ggml tensor %s", _tensor_name.c_str(), ggml_get_name(tensor)); + QNN_LOG_DEBUG("bind tensor %s to buffer: %p, size: %d", _tensor_name.c_str(), buffer, (int)buffer_size); + return true; + } + + bool bind_ggml_tensor(ggml_tensor *tensor) { + if (!bind_buffer(reinterpret_cast(tensor->data), ggml_nbytes(tensor))) { + QNN_LOG_WARN("Failed to bind tensor: %s to ggml tensor: %s", _tensor_name.c_str(), ggml_get_name(tensor)); + return false; + } + + QNN_LOG_DEBUG("Bind tensor %s to ggml tensor %s", _tensor_name.c_str(), ggml_get_name(tensor)); return true; } - bool unbind_ggml_tensor() { + bool unbind() { if (!_graph_handle) { QNN_LOG_WARN("tensor %s not bound to any graph", _tensor_name.c_str()); return false; } - if (!_tensor) { + if (!_buffer) { QNN_LOG_DEBUG("tensor %s not bound to ggml tensor", _tensor_name.c_str()); return true; } @@ -119,12 +158,15 @@ class ggml_qnn_tensor { QNN_LOG_DEBUG("tensor %s, clear client buffer", _tensor_name.c_str()); } - QNN_LOG_DEBUG("unbind tensor: %s from ggml tensor: %s", _tensor_name.c_str(), ggml_get_name(_tensor)); - _tensor = nullptr; + QNN_LOG_DEBUG("unbind tensor: %s from buffer: %p, size: %d", _tensor_name.c_str(), _buffer, (int)_buffer_size); + _buffer = nullptr; + _buffer_size = 0; return true; } const Qnn_Tensor_t &get_qnn_tensor() const { return _qnn_tensor; } + Qnn_DataType_t get_data_type() const { return QNN_TENSOR_GET_DATA_TYPE(_qnn_tensor); } + const qnn_dimension_array_t &get_dimensions() const { return _dimensions; } private: bool write_to_qnn_tensor() { @@ -136,7 +178,7 @@ class ggml_qnn_tensor { if (should_use_mem_handle()) { if (_qnn_rpc_buffer) { - memcpy(_qnn_rpc_buffer->get_buffer(), _tensor->data, ggml_nbytes(_tensor)); + memcpy(_qnn_rpc_buffer->get_buffer(), _buffer, _buffer_size); } else { QNN_LOG_WARN("tensor %s: can't find rpcmem from qnn mem handle\n", _tensor_name.c_str()); return false; @@ -157,7 +199,7 @@ class ggml_qnn_tensor { if (should_use_mem_handle()) { if (_qnn_rpc_buffer) { - memcpy(_tensor->data, _qnn_rpc_buffer->get_buffer(), ggml_nbytes(_tensor)); + memcpy(_buffer, _qnn_rpc_buffer->get_buffer(), _buffer_size); } else { QNN_LOG_WARN("can't find rpcmem from qnn mem handle\n"); return false; @@ -169,29 +211,45 @@ class ggml_qnn_tensor { return true; } - void update_params_from_ggml_tensor(ggml_tensor *tensor, int prev_max_rank) { - _dimensions[0] = (uint32_t)tensor->ne[0]; - _dimensions[1] = (uint32_t)tensor->ne[1]; - _dimensions[2] = (uint32_t)tensor->ne[2]; - _dimensions[3] = (uint32_t)tensor->ne[3]; - QNN_TENSOR_SET_DATA_TYPE(_qnn_tensor, device_datatype_from_ggml_datatype(tensor->type)); + void update_params_from_ggml_tensor(tensor_type_t tensor_type, Qnn_DataType_t data_type, int rank) { + QNN_TENSOR_SET_DATA_TYPE(_qnn_tensor, data_type); // TODO: set the quantizeParams base on the tensor type - QNN_TENSOR_SET_RANK(_qnn_tensor, (uint32_t)std::max(prev_max_rank, ggml_n_dims(tensor))); - + QNN_TENSOR_SET_RANK(_qnn_tensor, (uint32_t)rank); QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW); Qnn_ClientBuffer_t client_buf = {}; QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf); + + Qnn_TensorType_t new_tensor_type; + switch (tensor_type) { + case INPUT: + new_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; + break; + case OUTPUT: + new_tensor_type = QNN_TENSOR_TYPE_APP_READ; + break; + case PARAMETER: + new_tensor_type = QNN_TENSOR_TYPE_STATIC; + break; + default: + new_tensor_type = 
QNN_TENSOR_TYPE_NATIVE; + break; + } + QNN_TENSOR_SET_TYPE(_qnn_tensor, new_tensor_type); + QNN_LOG_INFO("tensor %s changed to type %d", _tensor_name.c_str(), new_tensor_type); } - bool should_use_mem_handle() const { return _device == QNN_BACKEND_NPU; } + bool should_use_mem_handle() const { + return _device == QNN_BACKEND_NPU && QNN_TENSOR_GET_TYPE(_qnn_tensor) != QNN_TENSOR_TYPE_STATIC; + } std::string _tensor_name; - const ggml_tensor *_tensor; + uint8_t *_buffer = nullptr; + size_t _buffer_size = 0; QNNBackend _device; std::shared_ptr _qnn_instance; Qnn_Tensor_t _qnn_tensor = qnn_tensor_init(kDefaultQnnTensorVersion); - std::array _dimensions = {}; + qnn_dimension_array_t _dimensions = {}; Qnn_GraphHandle_t _graph_handle = nullptr; std::unique_ptr _qnn_rpc_buffer; @@ -199,4 +257,6 @@ class ggml_qnn_tensor { DISABLE_MOVE(ggml_qnn_tensor); }; +using ggml_qnn_tensor_array_t = std::vector>; + } // namespace qnn diff --git a/ggml/src/ggml-qnn/utils.cpp b/ggml/src/ggml-qnn/utils.cpp index e44d6dbccee42..0de9d203ebee9 100644 --- a/ggml/src/ggml-qnn/utils.cpp +++ b/ggml/src/ggml-qnn/utils.cpp @@ -9,14 +9,40 @@ namespace qnn { +qnn_dimension_array_t get_internal_dimension(const ggml_dimension_array_t &dims, uint32_t rank) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS should be 4"); + GGML_ASSERT(rank <= GGML_MAX_DIMS && rank > 0); + + qnn_dimension_array_t internal_dims = {}; + /* + * Both the ggml and qnn tensor in memory are stored as row-major format. + * But the dimensions of the tensor are stored in different order. + * For example, a 2x3 matrix: + * [ + * [1, 2, 3], + * [4, 5, 6], + * ] + * The ggml tensor will have dimensions [3, 2], while the qnn tensor will have dimensions [2, 3]. + */ + for (uint32_t i = 0; i < rank; i++) { + internal_dims[i] = std::max(dims[rank - 1 - i], 1); + } + + return internal_dims; +} + // TODO: mapping more ggml data type to QNN data type // ref:explanation of k-quants, https://github.com/ggerganov/llama.cpp/pull/1684 -Qnn_DataType_t device_datatype_from_ggml_datatype(ggml_type ggml_type) { +Qnn_DataType_t qnn_datatype_from_ggml_datatype(ggml_type ggml_type) { switch (ggml_type) { - case GGML_TYPE_F16: - return QNN_DATATYPE_FLOAT_16; case GGML_TYPE_F32: return QNN_DATATYPE_FLOAT_32; + case GGML_TYPE_F16: + return QNN_DATATYPE_FLOAT_16; + case GGML_TYPE_I32: + return QNN_DATATYPE_INT_32; + case GGML_TYPE_I16: + return QNN_DATATYPE_INT_16; case GGML_TYPE_I8: return QNN_DATATYPE_INT_8; case GGML_TYPE_Q8_0: @@ -29,16 +55,75 @@ Qnn_DataType_t device_datatype_from_ggml_datatype(ggml_type ggml_type) { return QNN_DATATYPE_UNDEFINED; } -Qnn_TensorType_t device_tensortype_from_ggml_tensor(ggml_tensor *ggml_tensor) { - Qnn_TensorType_t qnn_tensor_type = QNN_TENSOR_TYPE_NATIVE; +ggml_type ggml_datatype_from_qnn_datatype(Qnn_DataType_t qnn_type) { + switch (qnn_type) { + case QNN_DATATYPE_FLOAT_32: + return GGML_TYPE_F32; + case QNN_DATATYPE_FLOAT_16: + return GGML_TYPE_F16; + case QNN_DATATYPE_UINT_32: + case QNN_DATATYPE_INT_32: + return GGML_TYPE_I32; + case QNN_DATATYPE_INT_16: + return GGML_TYPE_I16; + case QNN_DATATYPE_INT_8: + return GGML_TYPE_I8; + case QNN_DATATYPE_SFIXED_POINT_8: + return GGML_TYPE_Q8_0; + case QNN_DATATYPE_SFIXED_POINT_4: + return GGML_TYPE_Q4_0; + default: + break; + } + return GGML_TYPE_COUNT; +} - if (ggml_tensor->flags & GGML_TENSOR_FLAG_INPUT) { - qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; - } else if (ggml_tensor->flags & GGML_TENSOR_FLAG_OUTPUT) { - qnn_tensor_type = QNN_TENSOR_TYPE_APP_READ; +size_t 
qnn_datatype_size(Qnn_DataType_t qnn_type) { + switch (qnn_type) { + case QNN_DATATYPE_FLOAT_32: + return sizeof(float); + case QNN_DATATYPE_FLOAT_16: + return sizeof(uint16_t); + case QNN_DATATYPE_UINT_32: + case QNN_DATATYPE_INT_32: + return sizeof(int32_t); + case QNN_DATATYPE_INT_16: + return sizeof(int16_t); + case QNN_DATATYPE_INT_8: + return sizeof(int8_t); + case QNN_DATATYPE_SFIXED_POINT_8: + return sizeof(int8_t); + case QNN_DATATYPE_SFIXED_POINT_4: + return sizeof(int8_t); + default: + break; } + return 0; +} - return qnn_tensor_type; +const char *qnn_datatype_to_string(Qnn_DataType_t qnn_type) { + switch (qnn_type) { + case QNN_DATATYPE_FLOAT_32: + return "QNN_DATATYPE_FLOAT_32"; + case QNN_DATATYPE_FLOAT_16: + return "QNN_DATATYPE_FLOAT_16"; + case QNN_DATATYPE_UINT_32: + return "QNN_DATATYPE_UINT_32"; + case QNN_DATATYPE_INT_32: + return "QNN_DATATYPE_INT_32"; + case QNN_DATATYPE_INT_16: + return "QNN_DATATYPE_INT_16"; + case QNN_DATATYPE_INT_8: + return "QNN_DATATYPE_INT_8"; + case QNN_DATATYPE_SFIXED_POINT_8: + return "QNN_DATATYPE_SFIXED_POINT_8"; + case QNN_DATATYPE_SFIXED_POINT_4: + return "QNN_DATATYPE_SFIXED_POINT_4"; + default: + break; + } + + return "QNN_DATATYPE_UNDEFINED"; } uint32_t get_ggml_tensor_rank(const ggml_tensor *tensor) { @@ -51,8 +136,13 @@ uint32_t get_ggml_tensor_rank(const ggml_tensor *tensor) { return rank; } -const char *get_backend_name(int n_backend_type) { - switch (n_backend_type) { +const char *get_ggml_type_name(ggml_type type) { + const auto *traits = ggml_get_type_traits(type); + return traits->type_name; +} + +const char *get_backend_name(size_t device_index) { + switch (device_index) { case QNN_BACKEND_CPU: return "QNN-CPU"; case QNN_BACKEND_GPU: diff --git a/ggml/src/ggml-qnn/utils.hpp b/ggml/src/ggml-qnn/utils.hpp index b7f29bdaa5663..2c58d037982f6 100644 --- a/ggml/src/ggml-qnn/utils.hpp +++ b/ggml/src/ggml-qnn/utils.hpp @@ -6,6 +6,7 @@ #include #include +#include #include #include "ggml.h" @@ -17,8 +18,14 @@ namespace qnn { +using ggml_dimension_array_t = int64_t[GGML_MAX_DIMS]; +using qnn_dimension_array_t = std::array; + +qnn_dimension_array_t get_internal_dimension(const ggml_dimension_array_t &dims, uint32_t rank); + uint32_t get_ggml_tensor_rank(const ggml_tensor *tensor); -const char *get_backend_name(int n_backend_type); +const char *get_ggml_type_name(ggml_type type); +const char *get_backend_name(size_t device_index); const char *get_chipset_desc(uint32_t chipset_id); const char *get_htparch_desc(size_t htp_arch); intptr_t align_to(size_t alignment, intptr_t offset); @@ -187,8 +194,10 @@ inline void set_qnn_tensor_dyn_dimensions(Qnn_Tensor_t &tensor, uint8_t *isDynam } } -Qnn_DataType_t device_datatype_from_ggml_datatype(ggml_type ggml_type); -Qnn_TensorType_t device_tensortype_from_ggml_tensor(ggml_tensor *ggml_tensor); +Qnn_DataType_t qnn_datatype_from_ggml_datatype(ggml_type ggml_type); +ggml_type ggml_datatype_from_qnn_datatype(Qnn_DataType_t qnn_type); +size_t qnn_datatype_size(Qnn_DataType_t qnn_type); +const char *qnn_datatype_to_string(Qnn_DataType_t qnn_type); #if ENABLE_QNNBACKEND_PERF class qnn_perf { From 5c1e6d4905c4e7e6023caa5c1ca12fd9aafcd70c Mon Sep 17 00:00:00 2001 From: hongruichen Date: Tue, 29 Oct 2024 00:54:08 +0800 Subject: [PATCH 124/143] disable gelu in NPU --- ggml/src/ggml-qnn/backend-ops.cpp | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index d20069874a7c3..c4207e62a36f7 100644 --- 
a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -573,8 +573,15 @@ bool ggml_qnn_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tenso } if (op->op == GGML_OP_UNARY) { - if (!kQnnUnaryOpsTable[kGgmlUnaryOpStart + ggml_get_unary_op(op)]) { - QNN_LOG_DEBUG("unsupported unary op %d", ggml_get_unary_op(op)); + const auto unary_op = ggml_get_unary_op(op); + if (unary_op == GGML_UNARY_OP_GELU && ctx->device == QNN_BACKEND_NPU) { + // TODO: fix this when NPU supports GELU + QNN_LOG_DEBUG("unsupported unary op GGML_UNARY_OP_GELU for NPU"); + return false; + } + + if (!kQnnUnaryOpsTable[kGgmlUnaryOpStart + unary_op]) { + QNN_LOG_DEBUG("unsupported unary op %d", unary_op); return false; } From fe565cfd9f7b81e1afd5f8d1c8a82ea72b4ea69b Mon Sep 17 00:00:00 2001 From: nullname Date: Tue, 29 Oct 2024 15:47:07 +0800 Subject: [PATCH 125/143] fix compiling error in release --- ggml/src/ggml-qnn/backend-ops.cpp | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index c4207e62a36f7..3e24ca32ed35f 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -44,21 +44,6 @@ bool qnn_is_valid_params(ggml_backend_qnn_device_context *ctx, const ggml_tensor return true; } -bool is_tensor_dimensions_equal(const ggml_tensor *l, const ggml_tensor *r) { - const auto dim_l = ggml_n_dims(l); - if (dim_l != ggml_n_dims(r)) { - return false; - } - - for (int i = 0; i < dim_l; i++) { - if (l->ne[i] != r->ne[i]) { - return false; - } - } - - return true; -} - void print_ggml_tensor(const ggml_tensor *tensor) { QNN_LOG_DEBUG("%s: type:%s ne: %ldx%ldx%ldx%ld, nb: %ldx%ldx%ldx%ld\n", tensor->name, ggml_type_name(tensor->type), (long)tensor->ne[0], (long)tensor->ne[1], (long)tensor->ne[2], (long)tensor->ne[3], @@ -78,6 +63,21 @@ void print_ggml_tensor(const ggml_tensor *tensor) { namespace { +bool is_tensor_dimensions_equal(const ggml_tensor *l, const ggml_tensor *r) { + const auto dim_l = ggml_n_dims(l); + if (dim_l != ggml_n_dims(r)) { + return false; + } + + for (int i = 0; i < dim_l; i++) { + if (l->ne[i] != r->ne[i]) { + return false; + } + } + + return true; +} + typedef bool (*ggml_qnn_unary_op_t)(ggml_backend_qnn_device_context *ctx, ggml_tensor *src, ggml_tensor *dst); typedef bool (*ggml_qnn_binary_op_t)(ggml_backend_qnn_device_context *ctx, ggml_tensor *src0, ggml_tensor *src1, ggml_tensor *dst); From 0fec56fd57f3051534defd40b19372498ffd5c68 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Mon, 4 Nov 2024 22:44:04 +0800 Subject: [PATCH 126/143] fix compiling error --- ggml/src/ggml-qnn.cpp | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index 21a7dee1c99ef..4da991916c0b7 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -126,15 +126,6 @@ ggml_backend_qnn_device_context *get_device_context(ggml_backend_dev_t dev) { * qnn backend buffer object * ----------------------------------------------------------------------------------------------- */ -const char *ggml_backend_qnn_buffer_get_name(ggml_backend_buffer_t buffer) { - GGML_UNUSED(buffer); - return GGML_QNN_NAME; -} - -bool ggml_backend_buffer_is_qnn(ggml_backend_buffer_t buffer) { - return buffer->iface.get_name == ggml_backend_qnn_buffer_get_name; -} - void ggml_backend_qnn_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_backend_qnn_buffer_context *ctx = (ggml_backend_qnn_buffer_context 
*)buffer->context; @@ -184,7 +175,6 @@ void ggml_backend_qnn_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) } ggml_backend_buffer_i ggml_backend_qnn_buffer_interface = { - /* .get_name = */ ggml_backend_qnn_buffer_get_name, /* .free_buffer = */ ggml_backend_qnn_buffer_free_buffer, /* .get_base = */ ggml_backend_qnn_buffer_get_base, /* .init_tensor = */ ggml_backend_qnn_buffer_init_tensor, @@ -281,10 +271,6 @@ ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(ggml_backend_dev_t dev) return &ggml_backend_qnn_buffer_types[dev_ctx->device]; } -ggml_backend_buffer_type_t ggml_backend_qnn_get_default_buffer_type(ggml_backend_t backend) { - return ggml_backend_qnn_buffer_type(backend->device); -} - ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backend, ggml_cgraph *cgraph) { enum ggml_status result = GGML_STATUS_SUCCESS; auto *device_ctx = get_device_context(backend->device); @@ -306,7 +292,6 @@ ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backend, ggml_cgraph * ggml_backend_i ggml_backend_qnn_interface = { /* .get_name = */ ggml_backend_qnn_name, /* .free = */ ggml_backend_qnn_free, - /* .get_default_buffer_type = */ ggml_backend_qnn_get_default_buffer_type, /* .set_tensor_async = */ nullptr, /* .get_tensor_async = */ nullptr, /* .cpy_tensor_async = */ nullptr, @@ -316,9 +301,6 @@ ggml_backend_i ggml_backend_qnn_interface = { /* .graph_plan_update = */ nullptr, /* .graph_plan_compute = */ nullptr, /* .graph_compute = */ ggml_backend_qnn_graph_compute, - /* .supports_op = */ nullptr, // moved to device - /* .supports_buft = */ nullptr, // moved to device - /* .offload_op = */ nullptr, // moved to device /* .event_record = */ nullptr, /* .event_wait = */ nullptr, };
From 8ad86dc703fd091d943b0ea1e07932947b1e2e66 Mon Sep 17 00:00:00 2001
From: nullname
Date: Mon, 4 Nov 2024 23:12:03 +0800
Subject: [PATCH 127/143] feat: add QNN_OP_TRANSPOSE (#6)

* redo: add convert nodes
  This reverts commit 8448acd5ebf8fe86ab9d25313b64a15c811ef96e.
* align clang format with cann
* rename binary_op -> general_op
  because there are some ops that only take 1 param
* Revert "rename binary_op -> general_op"
  This reverts commit 5be63b1a0dc4614457785367dade62158fe46214.
* wip
* add GGML_OP_PERMUTE
* add GGML_OP_VIEW and GGML_OP_GET_ROWS
* wip
* Revert "wip"
  This reverts commit 772462ca6cfa01ea31bde725c2da60076ad9385f.
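Editor's aside before the diff: since this series keeps coming back to how a single ggml node is lowered into a small QNN graph, the hedged sketch below shows how the pieces introduced above fit together for one GGML_OP_MUL_MAT node. It is illustrative only: the graph key and VTCM size are made-up values, `qnn::qnn_instance` is assumed to be the instance type passed around in these files, and `QNN_OP_MAT_MUL` comes from the QNN SDK op definitions.

```cpp
#include <memory>

#include "ggml.h"

#include "graph.hpp"      // qnn::ggml_qnn_graph
#include "op-config.hpp"  // qnn::create_op_constructor

// Hedged sketch: lower one GGML_OP_MUL_MAT node to a QNN graph and run it once.
static bool run_mul_mat_once(QNNBackend device, std::shared_ptr<qnn::qnn_instance> instance,
                             ggml_tensor *src0, ggml_tensor *src1, ggml_tensor *dst) {
    // For QNN_OP_MAT_MUL this returns a ggml_qnn_matmul_op_config, which internally builds
    // transpose0 -> MatMul -> transpose1 (plus the optional convert nodes mentioned above).
    auto op_constructor = qnn::create_op_constructor(QNN_OP_MAT_MUL);

    // The real key comes from get_graph_key() (op name + per-tensor dims/types); this one is made up.
    qnn::ggml_qnn_graph graph("MUL_MAT_example_key", device, instance, /* vtcm_size_in_mb */ 8);

    if (!graph.build_graph(op_constructor, {src0, src1}, {dst})) {
        return false;  // tensor/node creation or qnn_graph_finalize failed
    }

    // execute() binds the ggml buffers to the QNN tensors, runs the graph, then unbinds them.
    return graph.execute({src0, src1}, {dst});
}
```

In the backend itself, get_qnn_graph_from_cache() in backend-ops.cpp does essentially this, keyed by the op/dimension/type string from get_graph_key() so a finalized graph is reused across calls.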
--- ggml/src/ggml-qnn.cpp | 48 ++++---- ggml/src/ggml-qnn/.clang-format | 46 +++++++- ggml/src/ggml-qnn/backend-ops.cpp | 186 ++++++++++++++++-------------- ggml/src/ggml-qnn/op-config.cpp | 90 ++++++++++++--- ggml/src/ggml-qnn/op-config.hpp | 35 ++++-- ggml/src/ggml-qnn/tensor.hpp | 1 + 6 files changed, 265 insertions(+), 141 deletions(-) diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index 4da991916c0b7..d28163dce44bc 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -57,30 +57,30 @@ struct qnn_device_caps { }; const qnn_device_caps kDeviceCaps[GGML_QNN_MAX_DEVICES]{ - { "qnn-cpu", - "Qualcomm Kryo CPU", - "libQnnCpu.so", - GGML_BACKEND_DEVICE_TYPE_CPU, - { GGML_TYPE_F32, - GGML_TYPE_I8 } }, // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/CpuOpDefSupplement.html#matmul - { "qnn-gpu", - "Qualcomm Adreno GPU", - "libQnnGpu.so", - GGML_BACKEND_DEVICE_TYPE_GPU, - { GGML_TYPE_F32, - GGML_TYPE_F16 } }, // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/GpuOpDefSupplement.html#matmul - { "qnn-npu", - "Qualcomm NPU", - "libQnnHtp.so", - GGML_BACKEND_DEVICE_TYPE_GPU, - { GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_I16, - GGML_TYPE_I8 } }, // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/HtpOpDefSupplement.html#matmul + {// https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/CpuOpDefSupplement.html#matmul + "qnn-cpu", + "Qualcomm Kryo CPU", + "libQnnCpu.so", + GGML_BACKEND_DEVICE_TYPE_CPU, + {GGML_TYPE_F32, GGML_TYPE_I8}}, + {// https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/GpuOpDefSupplement.html#matmul + "qnn-gpu", + "Qualcomm Adreno GPU", + "libQnnGpu.so", + GGML_BACKEND_DEVICE_TYPE_GPU, + {GGML_TYPE_F32, GGML_TYPE_F16}}, + {// https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/HtpOpDefSupplement.html#matmul + "qnn-npu", + "Qualcomm NPU", + "libQnnHtp.so", + GGML_BACKEND_DEVICE_TYPE_GPU, + {GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_I16, GGML_TYPE_I8}}, }; class ggml_backend_qnn_buffer_context { public: - ggml_backend_qnn_buffer_context(QNNBackend device, std::shared_ptr instance, size_t size) : - _instance(instance), _name(QNN_BACKEND_NAME + std::to_string(device)) { + ggml_backend_qnn_buffer_context(QNNBackend device, std::shared_ptr instance, size_t size) + : _instance(instance), _name(QNN_BACKEND_NAME + std::to_string(device)) { // TODO: fix this for other platforms size_t size_page = sysconf(_SC_PAGESIZE); @@ -251,7 +251,7 @@ ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(ggml_backend_dev_t dev) if (!ggml_backend_qnn_buffer_type_initialized) { for (size_t i = 0; i < GGML_QNN_MAX_DEVICES; i++) { auto &context = ggml_backend_qnn_buffer_type_contexts[i]; - context = { std::string(QNN_BACKEND_NAME) + std::to_string(i) }; + context = {std::string(QNN_BACKEND_NAME) + std::to_string(i)}; ggml_backend_qnn_buffer_types[i] = { /* .iface = */ { /* .get_name = */ ggml_backend_qnn_buffer_type_name, @@ -348,8 +348,8 @@ void ggml_backend_qnn_device_get_props(ggml_backend_dev_t dev, struct ggml_backe } ggml_guid_t ggml_backend_qnn_guid() { - static ggml_guid guid = { 0x1a, 0x2b, 0x3c, 0x4d, 0x5e, 0x6f, 0x70, 0x81, - 0x92, 0xa3, 0xb4, 0xc5, 0xd6, 0xe7, 0xf8, 0x09 }; + static ggml_guid guid = {0x1a, 0x2b, 0x3c, 0x4d, 0x5e, 0x6f, 0x70, 0x81, + 0x92, 0xa3, 0xb4, 0xc5, 0xd6, 0xe7, 0xf8, 0x09}; return &guid; } @@ -511,7 +511,7 @@ const ggml_backend_reg_i ggml_backend_qnn_reg_interface = { } // namespace ggml_backend_reg_t ggml_backend_qnn_reg() { - static 
ggml_backend_qnn_reg_impl reg{ ggml_backend_qnn_reg_interface }; + static ggml_backend_qnn_reg_impl reg{ggml_backend_qnn_reg_interface}; static bool initialized = false; static std::mutex mutex; diff --git a/ggml/src/ggml-qnn/.clang-format b/ggml/src/ggml-qnn/.clang-format index 3b933ff10db42..0c67c54239623 100644 --- a/ggml/src/ggml-qnn/.clang-format +++ b/ggml/src/ggml-qnn/.clang-format @@ -3,16 +3,50 @@ BasedOnStyle: Google IndentWidth: 4 AccessModifierOffset: -4 AlignAfterOpenBracket: Align -AlignOperands: true +AlignConsecutiveMacros: false +AlignConsecutiveAssignments: false +AlignConsecutiveDeclarations: false +AlignEscapedNewlines: Left +AlignOperands: true AlignTrailingComments: true +AllowAllArgumentsOnNextLine: true +AllowAllConstructorInitializersOnNextLine: true +AllowAllParametersOfDeclarationOnNextLine: true +AllowShortBlocksOnASingleLine: Never +AllowShortCaseLabelsOnASingleLine: false +AllowShortFunctionsOnASingleLine: All +AllowShortLambdasOnASingleLine: All +AllowShortIfStatementsOnASingleLine: WithoutElse +AllowShortLoopsOnASingleLine: true +AlwaysBreakAfterDefinitionReturnType: None +AlwaysBreakAfterReturnType: None +AlwaysBreakBeforeMultilineStrings: true +AlwaysBreakTemplateDeclarations: Yes BinPackArguments: true BinPackParameters: true -BreakBeforeBraces: Custom -BreakConstructorInitializers: AfterColon +BraceWrapping: + AfterCaseLabel: false + AfterClass: false + AfterControlStatement: false + AfterEnum: false + AfterFunction: false + AfterNamespace: false + AfterObjCDeclaration: false + AfterStruct: false + AfterUnion: false + AfterExternBlock: false + BeforeCatch: false + BeforeElse: false + IndentBraces: false + SplitEmptyFunction: true + SplitEmptyRecord: true + SplitEmptyNamespace: true ColumnLimit: 120 -Cpp11BracedListStyle: false +ConstructorInitializerIndentWidth: 4 +ContinuationIndentWidth: 4 +Cpp11BracedListStyle: true DerivePointerAlignment: false -IncludeCategories: +IncludeCategories: - Regex: '^<.*\.h>' Priority: 1 - Regex: '^<.*' @@ -28,4 +62,4 @@ MaxEmptyLinesToKeep: 1 PointerAlignment: Right SortIncludes: true SpacesBeforeTrailingComments: 1 -UseTab: Never \ No newline at end of file +UseTab: Never diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 3e24ca32ed35f..c0e263a640eea 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -92,10 +92,10 @@ qnn::ggml_tensor_array_t to_ggml_tensor_array(const std::array +template bool execute_graph(qnn::ggml_qnn_graph *graph, const std::array &inputs, - const std::array &outputs) { - if (!graph->execute(to_ggml_tensor_array<_InputSize>(inputs), to_ggml_tensor_array<_OutputSize>(outputs))) { + ggml_tensor *output) { + if (!graph->execute(to_ggml_tensor_array<_InputSize>(inputs), to_ggml_tensor_array<1>({output}))) { QNN_LOG_WARN("execute failed\n"); return false; } @@ -154,37 +154,37 @@ constexpr const char *kGgmlOpToQnnOp[] = { nullptr, // GGML_OP_MUL_MAT_ID nullptr, // GGML_OP_OUT_PROD - nullptr, // GGML_OP_SCALE - nullptr, // GGML_OP_SET - nullptr, // GGML_OP_CPY - nullptr, // GGML_OP_CONT - nullptr, // GGML_OP_RESHAPE - nullptr, // GGML_OP_VIEW - nullptr, // GGML_OP_PERMUTE - nullptr, // GGML_OP_TRANSPOSE - nullptr, // GGML_OP_GET_ROWS - nullptr, // GGML_OP_GET_ROWS_BACK - nullptr, // GGML_OP_DIAG - nullptr, // GGML_OP_DIAG_MASK_INF - nullptr, // GGML_OP_DIAG_MASK_ZERO - nullptr, // GGML_OP_SOFT_MAX - nullptr, // GGML_OP_SOFT_MAX_BACK - nullptr, // GGML_OP_ROPE - nullptr, // GGML_OP_ROPE_BACK - nullptr, // GGML_OP_CLAMP - nullptr, // 
GGML_OP_CONV_TRANSPOSE_1D - nullptr, // GGML_OP_IM2COL - nullptr, // GGML_OP_IM2COL_BACK - nullptr, // GGML_OP_CONV_TRANSPOSE_2D - nullptr, // GGML_OP_POOL_1D - nullptr, // GGML_OP_POOL_2D - nullptr, // GGML_OP_POOL_2D_BACK - nullptr, // GGML_OP_UPSCALE - nullptr, // GGML_OP_PAD - nullptr, // GGML_OP_ARANGE - nullptr, // GGML_OP_TIMESTEP_EMBEDDING - nullptr, // GGML_OP_ARGSORT - nullptr, // GGML_OP_LEAKY_RELU + nullptr, // GGML_OP_SCALE + nullptr, // GGML_OP_SET + nullptr, // GGML_OP_CPY + nullptr, // GGML_OP_CONT + nullptr, // GGML_OP_RESHAPE + nullptr, // GGML_OP_VIEW + QNN_OP_TRANSPOSE, // GGML_OP_PERMUTE + nullptr, // GGML_OP_TRANSPOSE + nullptr, // GGML_OP_GET_ROWS + nullptr, // GGML_OP_GET_ROWS_BACK + nullptr, // GGML_OP_DIAG + nullptr, // GGML_OP_DIAG_MASK_INF + nullptr, // GGML_OP_DIAG_MASK_ZERO + nullptr, // GGML_OP_SOFT_MAX + nullptr, // GGML_OP_SOFT_MAX_BACK + nullptr, // GGML_OP_ROPE + nullptr, // GGML_OP_ROPE_BACK + nullptr, // GGML_OP_CLAMP + nullptr, // GGML_OP_CONV_TRANSPOSE_1D + nullptr, // GGML_OP_IM2COL + nullptr, // GGML_OP_IM2COL_BACK + nullptr, // GGML_OP_CONV_TRANSPOSE_2D + nullptr, // GGML_OP_POOL_1D + nullptr, // GGML_OP_POOL_2D + nullptr, // GGML_OP_POOL_2D_BACK + nullptr, // GGML_OP_UPSCALE + nullptr, // GGML_OP_PAD + nullptr, // GGML_OP_ARANGE + nullptr, // GGML_OP_TIMESTEP_EMBEDDING + nullptr, // GGML_OP_ARGSORT + nullptr, // GGML_OP_LEAKY_RELU nullptr, // GGML_OP_FLASH_ATTN_EXT nullptr, // GGML_OP_FLASH_ATTN_BACK @@ -235,16 +235,16 @@ static_assert(sizeof(kGgmlOpToQnnOp) / sizeof(kGgmlOpToQnnOp[0]) == (GGML_OP_COU static_assert(kGgmlOpToQnnOp[GGML_UNARY_OP_GELU + kGgmlUnaryOpStart] != nullptr, "GGML_UNARY_OP_GELU does not correspond to QNN_OP_GELU"); -template +template qnn::ggml_qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_device_context *ctx, size_t op, const std::array &inputs, - const std::array &outputs) { + ggml_tensor *output) { GGML_ASSERT(op < (GGML_OP_COUNT + GGML_UNARY_OP_COUNT)); auto &graph_cache = ctx->qnn_graph_cache; const auto *op_name = op < kGgmlUnaryOpStart ? 
ggml_op_name(ggml_op(op)) : ggml_unary_op_name(ggml_unary_op(op - kGgmlUnaryOpStart)); - auto graph_key = get_graph_key<_InputSize, _OutputSize>(op_name, inputs, outputs); + auto graph_key = get_graph_key<_InputSize, 1>(op_name, inputs, {output}); auto it = graph_cache.find(graph_key); qnn::ggml_qnn_graph *graph_ptr = nullptr; if (it != graph_cache.end()) { @@ -259,7 +259,7 @@ qnn::ggml_qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_device_context *c auto op_constructor = qnn::create_op_constructor(kGgmlOpToQnnOp[op]); if (!graph->build_graph(op_constructor, to_ggml_tensor_array<_InputSize>(inputs), - to_ggml_tensor_array<_OutputSize>(outputs))) { + to_ggml_tensor_array<1>({output}))) { QNN_LOG_ERROR("build_graph failed\n"); return nullptr; } @@ -278,9 +278,9 @@ bool qnn_binary_op_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *src0, CHECK_PARAMS(ctx, src0, src1, dst); bool succeed = false; - auto *graph_ptr = get_qnn_graph_from_cache<2, 1>(ctx, _GgmlOp, { src0, src1 }, { dst }); + auto *graph_ptr = get_qnn_graph_from_cache<2>(ctx, _GgmlOp, {src0, src1}, dst); if (graph_ptr) { - succeed = execute_graph<2, 1>(graph_ptr, { src0, src1 }, { dst }); + succeed = execute_graph<2>(graph_ptr, {src0, src1}, dst); } #ifndef NDEBUG @@ -301,9 +301,9 @@ bool qnn_unary_op_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *src, g CHECK_PARAMS(ctx, src, dst); bool succeed = false; - auto *graph_ptr = get_qnn_graph_from_cache<1, 1>(ctx, _GgmlOp, { src }, { dst }); + auto *graph_ptr = get_qnn_graph_from_cache<1>(ctx, _GgmlOp, {src}, dst); if (graph_ptr) { - succeed = execute_graph<1, 1>(graph_ptr, { src }, { dst }); + succeed = execute_graph<1>(graph_ptr, {src}, dst); } #ifndef NDEBUG @@ -315,6 +315,22 @@ bool qnn_unary_op_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *src, g return succeed; } + +bool qnn_unary_nop_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *src, ggml_tensor *dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(src); + GGML_UNUSED(dst); + return true; +} + +bool qnn_binary_nop_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *src0, ggml_tensor *src1, ggml_tensor *dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(src0); + GGML_UNUSED(src1); + GGML_UNUSED(dst); + return true; +} + constexpr const ggml_qnn_unary_op_t kQnnUnaryOpsTable[] = { nullptr, // GGML_OP_NONE nullptr, // GGML_OP_DUP @@ -347,37 +363,37 @@ constexpr const ggml_qnn_unary_op_t kQnnUnaryOpsTable[] = { nullptr, // GGML_OP_MUL_MAT_ID nullptr, // GGML_OP_OUT_PROD - nullptr, // GGML_OP_SCALE - nullptr, // GGML_OP_SET - nullptr, // GGML_OP_CPY - nullptr, // GGML_OP_CONT - nullptr, // GGML_OP_RESHAPE - nullptr, // GGML_OP_VIEW - nullptr, // GGML_OP_PERMUTE - nullptr, // GGML_OP_TRANSPOSE - nullptr, // GGML_OP_GET_ROWS - nullptr, // GGML_OP_GET_ROWS_BACK - nullptr, // GGML_OP_DIAG - nullptr, // GGML_OP_DIAG_MASK_INF - nullptr, // GGML_OP_DIAG_MASK_ZERO - nullptr, // GGML_OP_SOFT_MAX - nullptr, // GGML_OP_SOFT_MAX_BACK - nullptr, // GGML_OP_ROPE - nullptr, // GGML_OP_ROPE_BACK - nullptr, // GGML_OP_CLAMP - nullptr, // GGML_OP_CONV_TRANSPOSE_1D - nullptr, // GGML_OP_IM2COL - nullptr, // GGML_OP_IM2COL_BACK - nullptr, // GGML_OP_CONV_TRANSPOSE_2D - nullptr, // GGML_OP_POOL_1D - nullptr, // GGML_OP_POOL_2D - nullptr, // GGML_OP_POOL_2D_BACK - nullptr, // GGML_OP_UPSCALE - nullptr, // GGML_OP_PAD - nullptr, // GGML_OP_ARANGE - nullptr, // GGML_OP_TIMESTEP_EMBEDDING - nullptr, // GGML_OP_ARGSORT - nullptr, // GGML_OP_LEAKY_RELU + nullptr, // GGML_OP_SCALE + nullptr, // GGML_OP_SET + nullptr, // GGML_OP_CPY + 
nullptr, // GGML_OP_CONT + nullptr, // GGML_OP_RESHAPE + qnn_unary_nop_impl, // GGML_OP_VIEW + qnn_unary_op_impl, // GGML_OP_PERMUTE + nullptr, // GGML_OP_TRANSPOSE + qnn_unary_nop_impl, // GGML_OP_GET_ROWS + nullptr, // GGML_OP_GET_ROWS_BACK + nullptr, // GGML_OP_DIAG + nullptr, // GGML_OP_DIAG_MASK_INF + nullptr, // GGML_OP_DIAG_MASK_ZERO + nullptr, // GGML_OP_SOFT_MAX + nullptr, // GGML_OP_SOFT_MAX_BACK + nullptr, // GGML_OP_ROPE + nullptr, // GGML_OP_ROPE_BACK + nullptr, // GGML_OP_CLAMP + nullptr, // GGML_OP_CONV_TRANSPOSE_1D + nullptr, // GGML_OP_IM2COL + nullptr, // GGML_OP_IM2COL_BACK + nullptr, // GGML_OP_CONV_TRANSPOSE_2D + nullptr, // GGML_OP_POOL_1D + nullptr, // GGML_OP_POOL_2D + nullptr, // GGML_OP_POOL_2D_BACK + nullptr, // GGML_OP_UPSCALE + nullptr, // GGML_OP_PAD + nullptr, // GGML_OP_ARANGE + nullptr, // GGML_OP_TIMESTEP_EMBEDDING + nullptr, // GGML_OP_ARGSORT + nullptr, // GGML_OP_LEAKY_RELU nullptr, // GGML_OP_FLASH_ATTN_EXT nullptr, // GGML_OP_FLASH_ATTN_BACK @@ -522,18 +538,24 @@ static_assert(sizeof(kQnnBinaryOpsTable) / sizeof(kQnnBinaryOpsTable[0]) == GGML "GGML_OP_COUNT does not match the size of the kQnnBinaryOpsTable table"); bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context *ctx, const ggml_tensor *tensor) { + if (!tensor) { + QNN_LOG_DEBUG("tensor is nullptr"); + return false; + } + + auto *type_name = ggml_get_type_traits(tensor->type)->type_name; switch (tensor->type) { case GGML_TYPE_F32: case GGML_TYPE_F16: case GGML_TYPE_Q8_0: case GGML_TYPE_Q4_0: if (ctx->supported_types.find(tensor->type) == ctx->supported_types.end()) { - QNN_LOG_DEBUG("unsupported data type GGML_TYPE_F16 for cpu backend"); + QNN_LOG_DEBUG("unsupported data type %s for backend %d", type_name, (int)ctx->device); return false; } break; default: - QNN_LOG_DEBUG("unsupported data type %d", tensor->type); + QNN_LOG_DEBUG("unsupported data type %s", type_name); return false; } @@ -591,19 +613,15 @@ bool ggml_qnn_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tenso } } else { if (!kQnnUnaryOpsTable[op->op] && !kQnnBinaryOpsTable[op->op]) { - QNN_LOG_DEBUG("unsupported op %d", op->op); + QNN_LOG_DEBUG("[%s] unsupported op", ggml_op_name(op->op)); return false; } auto *src0 = op->src[0]; auto *src1 = op->src[1]; - if (!src0 || !src1) { - QNN_LOG_DEBUG("src0 or src1 is nullptr"); - return false; - } - - if (!ggml_qnn_supports_tensor(ctx, src0) || !ggml_qnn_supports_tensor(ctx, src1) || - !ggml_qnn_supports_tensor(ctx, op)) { + if (!ggml_qnn_supports_tensor(ctx, src0) || !ggml_qnn_supports_tensor(ctx, op) || + (kQnnBinaryOpsTable[op->op] && !ggml_qnn_supports_tensor(ctx, src1))) { + QNN_LOG_DEBUG("[%s] unsupported tensor", ggml_op_name(op->op)); return false; } @@ -642,7 +660,7 @@ bool ggml_qnn_forward(ggml_backend_qnn_device_context *ctx, struct ggml_tensor * return binary_op(ctx, tensor->src[0], tensor->src[1], tensor); } - QNN_LOG_WARN("unsupported op %s", ggml_op_desc(tensor)); + QNN_LOG_WARN("[forward]unsupported op %s", ggml_op_desc(tensor)); return false; } diff --git a/ggml/src/ggml-qnn/op-config.cpp b/ggml/src/ggml-qnn/op-config.cpp index 07dcba156471b..9b98051adfc8e 100644 --- a/ggml/src/ggml-qnn/op-config.cpp +++ b/ggml/src/ggml-qnn/op-config.cpp @@ -7,10 +7,10 @@ namespace { constexpr const qnn::qnn_dimension_array_t kTransposeParamData[GGML_MAX_DIMS] = { - { 0 }, - { 1, 0 }, - { 0, 2, 1 }, - { 0, 1, 3, 2 }, + {0}, + {1, 0}, + {0, 2, 1}, + {0, 1, 3, 2}, }; qnn::qnn_dimension_array_t get_transposed_dimensions(const qnn::qnn_dimension_array_t &dimensions, int 
rank) { @@ -96,9 +96,8 @@ bool bind_tensors(const qnn::ggml_tensor_array_t &ggml_tensors, qnn::ggml_qnn_te class ggml_qnn_connectable_op_config : public qnn::ggml_qnn_op_config_base { public: explicit ggml_qnn_connectable_op_config(const std::string &name, const std::string &package_name, - const std::string &op_type, - std::shared_ptr qnn_instance) : - ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {} + const std::string &op_type, std::shared_ptr qnn_instance) + : ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {} bool create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle, const qnn::ggml_tensor_array_t &tensor_inputs, @@ -264,11 +263,22 @@ bool ggml_qnn_single_op_config::create_tensors(QNNBackend device, Qnn_GraphHandl const ggml_tensor_array_t &tensor_inputs, const ggml_tensor_array_t &tensor_outputs) { const auto tensor_rank = get_rank(tensor_inputs, tensor_outputs); - tensor_common_params params = { "src", tensor_rank, true, device, graph_handle, _qnn_instance }; + tensor_common_params params = {"src", tensor_rank, true, device, graph_handle, _qnn_instance}; create_tensors_from_ggml_tensor(params, tensor_inputs, &_tensor_inputs, &_qnn_tensor_inputs); params.name_prefix = "dst"; params.is_input = false; create_tensors_from_ggml_tensor(params, tensor_outputs, &_tensor_outputs, &_qnn_tensor_outputs); + + if (_param_buffer.size() > 0) { + // handle parameters in output tensor + auto *params = tensor_outputs.front()->op_params; + memcpy(_param_buffer.data(), params, _param_buffer.size()); + + const uint32_t count = uint32_t(_param_buffer.size() / qnn_datatype_size(_param_type)); + const qnn_dimension_array_t param_dims = {count, 1, 1, 1}; + add_tensor_param(_param_name, param_dims, 1, _param_buffer.data(), _param_type, device, graph_handle); + } + return true; } @@ -281,7 +291,7 @@ bool ggml_qnn_matmul_op_config::create_tensors(QNNBackend device, Qnn_GraphHandl GGML_ASSERT(tensor_rank >= 2); // create input tensors - tensor_common_params params = { "src", tensor_rank, true, device, graph_handle, _qnn_instance }; + tensor_common_params params = {"src", tensor_rank, true, device, graph_handle, _qnn_instance}; create_tensors_from_ggml_tensor(params, tensor_inputs, &_tensor_inputs, &_qnn_tensor_inputs); // create output tensor @@ -290,8 +300,49 @@ bool ggml_qnn_matmul_op_config::create_tensors(QNNBackend device, Qnn_GraphHandl params.is_input = false; create_tensors_from_ggml_tensor(params, tensor_outputs, &mat_mul_tensor_outputs, nullptr); + if (device == QNN_BACKEND_GPU) { + // there's no convert op for GPU, so we should create matmul nodes directl. 
+ return create_mat_mul_nodes(device, graph_handle, tensor_rank, _tensor_inputs, mat_mul_tensor_outputs); + } + + // create tensors for convert node + ggml_qnn_tensor_array_t mat_mul_tensor_inputs = _tensor_inputs; + auto input_tensor_type = get_tensor_type(mat_mul_tensor_inputs); + QNN_LOG_DEBUG("matmul input tensor type: %s\n", qnn_datatype_to_string(input_tensor_type)); + + _input_converts.resize(mat_mul_tensor_inputs.size()); + for (size_t i = 0; i < mat_mul_tensor_inputs.size(); ++i) { + // create input convert nodes + std::string convert_name("convert_src" + std::to_string(i)); + auto convert_in = mat_mul_tensor_inputs[i]; + auto convert_out = std::make_shared(ggml_qnn_tensor::INTERMEDIATE, convert_name + "_out", + convert_in->get_dimensions(), input_tensor_type, + tensor_rank, device, graph_handle, _qnn_instance); + auto convert = std::make_shared(convert_name, QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_CONVERT, _qnn_instance); + convert->set_input_tensors({convert_in}); + convert->set_output_tensors({convert_out}); + mat_mul_tensor_inputs[i] = convert_out; + _input_converts[i] = convert; + } + + { + // create output convert node + std::string convert_name("convert_dst"); + auto convert_out = mat_mul_tensor_outputs.front(); + auto convert_in = std::make_shared(ggml_qnn_tensor::INTERMEDIATE, convert_name + "_in", + convert_out->get_dimensions(), input_tensor_type, + tensor_rank, device, graph_handle, _qnn_instance); + auto output_convert = std::make_shared( + convert_name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_CONVERT, _qnn_instance); + output_convert->set_input_tensors({convert_in}); + output_convert->set_output_tensors({convert_out}); + mat_mul_tensor_outputs[0] = convert_in; + _output_convert = output_convert; + } + // create mat_mul nodes - return create_mat_mul_nodes(device, graph_handle, tensor_rank, _tensor_inputs, mat_mul_tensor_outputs); + return create_mat_mul_nodes(device, graph_handle, tensor_rank, mat_mul_tensor_inputs, mat_mul_tensor_outputs); } bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, @@ -371,7 +422,7 @@ bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_Grap // set transpose0 parameters auto *params_data = reinterpret_cast(kTransposeParamData[rank - 1].data()); - const qnn_dimension_array_t param_dims = { (uint32_t)rank, 1, 1, 1 }; + const qnn_dimension_array_t param_dims = {(uint32_t)rank, 1, 1, 1}; transpose0->add_tensor_param(QNN_OP_TRANSPOSE_PARAM_PERM, param_dims, 1, params_data, QNN_DATATYPE_UINT_32, device, graph_handle); @@ -380,19 +431,19 @@ bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_Grap graph_handle); // set tensor to transpose0 - ggml_qnn_tensor_array_t tensors = { tensor_inputs.back() }; + ggml_qnn_tensor_array_t tensors = {tensor_inputs.back()}; transpose0->set_input_tensors(tensors); - tensors = { src0_trans }; + tensors = {src0_trans}; transpose0->set_output_tensors(tensors); // set tensor to mat_mul - tensors = { tensor_inputs.front(), src0_trans }; + tensors = {tensor_inputs.front(), src0_trans}; mat_mul->set_input_tensors(tensors); - tensors = { dst_trans }; + tensors = {dst_trans}; mat_mul->set_output_tensors(tensors); // set tensor to transpose1 - tensors = { dst_trans }; + tensors = {dst_trans}; transpose1->set_input_tensors(tensors); transpose1->set_output_tensors(tensor_outputs); @@ -459,6 +510,13 @@ ggml_op_constructor_t create_op_constructor(const std::string &op_name) { QNN_LOG_DEBUG("create QNN_OP_MAT_MUL, name 
%s\n", instance_name.c_str()); return std::make_unique(instance_name, qnn_instance); }; + } else if (op_name == QNN_OP_TRANSPOSE) { + return [](const std::string &instance_name, + std::shared_ptr qnn_instance) -> std::unique_ptr { + return std::make_unique(instance_name, QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_TRANSPOSE, QNN_OP_TRANSPOSE_PARAM_PERM, + QNN_DATATYPE_UINT_32, 4 * sizeof(uint32_t), qnn_instance); + }; } return [op_name](const std::string &instance_name, diff --git a/ggml/src/ggml-qnn/op-config.hpp b/ggml/src/ggml-qnn/op-config.hpp index 2016cb4ac994d..4ec7aac9b256e 100644 --- a/ggml/src/ggml-qnn/op-config.hpp +++ b/ggml/src/ggml-qnn/op-config.hpp @@ -30,11 +30,16 @@ class ggml_qnn_op_config { virtual void unbind_output_tensors() = 0; }; +using ggml_op_constructor_t = + std::function(const std::string &, std::shared_ptr)>; + +ggml_op_constructor_t create_op_constructor(const std::string &op_name); + class ggml_qnn_op_config_base : public ggml_qnn_op_config { public: explicit ggml_qnn_op_config_base(const std::string &name, const std::string &package_name, - const std::string &op_type, std::shared_ptr qnn_instance) : - _name(name), _package_name(package_name), _op_type(op_type), _qnn_instance(qnn_instance) {} + const std::string &op_type, std::shared_ptr qnn_instance) + : _name(name), _package_name(package_name), _op_type(op_type), _qnn_instance(qnn_instance) {} void add_scalar_param(const std::string &name, const Qnn_Scalar_t scalar); bool add_tensor_param(const std::string &name, const qnn_dimension_array_t &dimensions, int rank, @@ -70,21 +75,34 @@ class ggml_qnn_op_config_base : public ggml_qnn_op_config { class ggml_qnn_single_op_config : public ggml_qnn_op_config_base { public: explicit ggml_qnn_single_op_config(const std::string &name, const std::string &package_name, - const std::string &op_type, std::shared_ptr qnn_instance) : - ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {} + const std::string &op_type, std::shared_ptr qnn_instance) + : ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {} + + explicit ggml_qnn_single_op_config(const std::string &name, const std::string &package_name, + const std::string &op_type, const std::string ¶m_name, + const Qnn_DataType_t param_type, const size_t param_size, + std::shared_ptr qnn_instance) + : ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance), + _param_name(param_name), + _param_type(param_type), + _param_buffer(param_size) {} bool create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle, const ggml_tensor_array_t &tensor_inputs, const ggml_tensor_array_t &tensor_outputs) override; private: + const std::string _param_name; + const Qnn_DataType_t _param_type = QNN_DATATYPE_UINT_32; + std::vector _param_buffer; + DISABLE_COPY(ggml_qnn_single_op_config); DISABLE_MOVE(ggml_qnn_single_op_config); }; class ggml_qnn_matmul_op_config : public ggml_qnn_op_config { public: - ggml_qnn_matmul_op_config(const std::string &name, std::shared_ptr qnn_instance) : - _name(name), _qnn_instance(qnn_instance) {} + ggml_qnn_matmul_op_config(const std::string &name, std::shared_ptr qnn_instance) + : _name(name), _qnn_instance(qnn_instance) {} bool create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle, const ggml_tensor_array_t &tensor_inputs, const ggml_tensor_array_t &tensor_outputs) override; @@ -114,9 +132,4 @@ class ggml_qnn_matmul_op_config : public ggml_qnn_op_config { DISABLE_MOVE(ggml_qnn_matmul_op_config); }; -using ggml_op_constructor_t = - std::function(const 
std::string &, std::shared_ptr)>; - -ggml_op_constructor_t create_op_constructor(const std::string &op_name); - } // namespace qnn diff --git a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp index faf5b0df5f4e1..f28fc8e2ca1e2 100644 --- a/ggml/src/ggml-qnn/tensor.hpp +++ b/ggml/src/ggml-qnn/tensor.hpp @@ -257,6 +257,7 @@ class ggml_qnn_tensor { DISABLE_MOVE(ggml_qnn_tensor); }; +using ggml_qnn_tensor_ptr_t = std::shared_ptr; using ggml_qnn_tensor_array_t = std::vector>; } // namespace qnn From e6dbdacc3287ab89c0a21a9bd5972caa2c5338a1 Mon Sep 17 00:00:00 2001 From: nullname Date: Wed, 13 Nov 2024 17:06:46 +0800 Subject: [PATCH 128/143] feat: fix llama-bench (#7) * remove unused functions * wip * init from last devices * move init into constructor * wip * add static assert to device table * make kDeviceCaps as constexpr * get free memory and total memory * add optimize flag for qnn backend --- ggml/include/ggml-qnn.h | 15 +-- ggml/src/CMakeLists.txt | 1 + ggml/src/ggml-qnn.cpp | 172 +++++++++++++++--------------- ggml/src/ggml-qnn/backend-ops.cpp | 2 +- ggml/src/ggml-qnn/backend.hpp | 2 +- ggml/src/ggml-qnn/utils.cpp | 25 ++++- ggml/src/ggml-qnn/utils.hpp | 6 +- 7 files changed, 116 insertions(+), 107 deletions(-) diff --git a/ggml/include/ggml-qnn.h b/ggml/include/ggml-qnn.h index 23835f23cb0ec..2b25ce40d79e5 100644 --- a/ggml/include/ggml-qnn.h +++ b/ggml/include/ggml-qnn.h @@ -9,28 +9,17 @@ extern "C" { #endif #define GGML_QNN_NAME "QNN" -#define GGML_QNN_MAX_DEVICES 3 +#define GGML_QNN_MAX_DEVICES QNN_BACKEND_COUNT enum QNNBackend { QNN_BACKEND_CPU = 0, QNN_BACKEND_GPU, QNN_BACKEND_NPU, - QNN_BACKEND_GGML, //"fake" QNN backend, used for compare performance between - // QNN and original GGML + QNN_BACKEND_COUNT, }; -/** - * - * @param index 0: QNN_BACKEND_CPU 1: QNN_BACKEND_GPU 2:QNN_BACKEND_NPU - * @param extend_lib_search_path extened lib search path for searching QNN backend dynamic libs - * @return - */ -GGML_API ggml_backend_t ggml_backend_qnn_init(size_t index, const char *extend_lib_search_path); - GGML_API bool ggml_backend_is_qnn(ggml_backend_t backend); -GGML_API int ggml_backend_qnn_get_device_count(void); - GGML_API ggml_backend_reg_t ggml_backend_qnn_reg(void); #ifdef __cplusplus diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index b827f9c8f0aba..0d4b388f324f3 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -969,6 +969,7 @@ if (GGML_QNN) message("QNN_SDK_PATH: ${GGML_QNN_SDK_PATH}") file(GLOB GGML_SOURCES_QNN "ggml-qnn/*.cpp") list(APPEND GGML_SOURCES_QNN "ggml-qnn.cpp") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3") set(GGML_HEADERS_QNN ../include/ggml-qnn.h) set(QNN_INC_PATH ${GGML_QNN_SDK_PATH}/include/QNN) set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} ${QNN_INC_PATH} "ggml-qnn") diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index d28163dce44bc..a41fae6bbb368 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -53,39 +53,50 @@ struct qnn_device_caps { enum ggml_backend_dev_type type; // TODO: should get this caps from device - std::unordered_set supported_types; + uint64_t supported_types; }; -const qnn_device_caps kDeviceCaps[GGML_QNN_MAX_DEVICES]{ - {// https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/CpuOpDefSupplement.html#matmul - "qnn-cpu", - "Qualcomm Kryo CPU", - "libQnnCpu.so", - GGML_BACKEND_DEVICE_TYPE_CPU, - {GGML_TYPE_F32, GGML_TYPE_I8}}, - {// https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/GpuOpDefSupplement.html#matmul - 
"qnn-gpu", - "Qualcomm Adreno GPU", - "libQnnGpu.so", - GGML_BACKEND_DEVICE_TYPE_GPU, - {GGML_TYPE_F32, GGML_TYPE_F16}}, - {// https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/HtpOpDefSupplement.html#matmul - "qnn-npu", - "Qualcomm NPU", - "libQnnHtp.so", - GGML_BACKEND_DEVICE_TYPE_GPU, - {GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_I16, GGML_TYPE_I8}}, +constexpr const qnn_device_caps kDeviceCaps[] = { + { + // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/CpuOpDefSupplement.html#matmul + "qnn-cpu", + "Qualcomm Kryo CPU", + "libQnnCpu.so", + GGML_BACKEND_DEVICE_TYPE_CPU, + (1 << GGML_TYPE_I8) | (1 << GGML_TYPE_F32), + }, + { + // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/GpuOpDefSupplement.html#matmul + "qnn-gpu", + "Qualcomm Adreno GPU", + "libQnnGpu.so", + GGML_BACKEND_DEVICE_TYPE_GPU, + (1 << GGML_TYPE_F32) | (1 << GGML_TYPE_F16), + }, + { + // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/HtpOpDefSupplement.html#matmul + "qnn-npu", + "Qualcomm NPU", + "libQnnHtp.so", + GGML_BACKEND_DEVICE_TYPE_ACCEL, + (1 << GGML_TYPE_F32) | (1 << GGML_TYPE_F16) | (1 << GGML_TYPE_I16) | (1 << GGML_TYPE_I8), + }, }; +static_assert(sizeof(kDeviceCaps) / sizeof(kDeviceCaps[0]) == GGML_QNN_MAX_DEVICES, + "The number of qnn devices should be equal to GGML_QNN_MAX_DEVICES"); +static_assert(kDeviceCaps[QNN_BACKEND_NPU].type == GGML_BACKEND_DEVICE_TYPE_ACCEL, + "The NPU device should be an accelerator device"); + class ggml_backend_qnn_buffer_context { public: ggml_backend_qnn_buffer_context(QNNBackend device, std::shared_ptr instance, size_t size) : _instance(instance), _name(QNN_BACKEND_NAME + std::to_string(device)) { - // TODO: fix this for other platforms size_t size_page = sysconf(_SC_PAGESIZE); - // TODO: for qnn npu, a better way here is to reuse the buffer allocated by qnn rpc, will save an extra copy + // TODO: for qnn npu, a better way here is to reuse the buffer allocated by + // qnn rpc, will save an extra copy _buffer = qnn::align_alloc(size_page, size); if (!_buffer) { @@ -192,8 +203,8 @@ ggml_backend_buffer_i ggml_backend_qnn_buffer_interface = { * ----------------------------------------------------------------------------------------------- */ const char *ggml_backend_qnn_buffer_type_name(ggml_backend_buffer_type_t buft) { - GGML_UNUSED(buft); - return GGML_QNN_NAME; + auto *dev_ctx = get_device_context(buft->device); + return qnn::get_backend_name(dev_ctx->device); } ggml_backend_buffer_t ggml_backend_qnn_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { @@ -209,13 +220,14 @@ ggml_backend_buffer_t ggml_backend_qnn_buffer_type_alloc_buffer(ggml_backend_buf size_t ggml_backend_qnn_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { GGML_UNUSED(buft); + // TODO: fix this return 32; } -// TODO: this value is an experimental value, works fine with whisper/llm/minicpm-v inference on Android size_t ggml_backend_qnn_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { GGML_UNUSED(buft); - + // TODO: this value is an experimental value, works fine with + // whisper/llm/minicpm-v inference on Android return (96 * 1024 * 1024); } @@ -255,9 +267,12 @@ ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(ggml_backend_dev_t dev) ggml_backend_qnn_buffer_types[i] = { /* .iface = */ { /* .get_name = */ ggml_backend_qnn_buffer_type_name, - /* .alloc_buffer = */ ggml_backend_qnn_buffer_type_alloc_buffer, - /* .get_alignment = */ ggml_backend_qnn_buffer_type_get_alignment, - /* 
.get_max_size = */ ggml_backend_qnn_buffer_type_get_max_size, + /* .alloc_buffer = */ + ggml_backend_qnn_buffer_type_alloc_buffer, + /* .get_alignment = */ + ggml_backend_qnn_buffer_type_get_alignment, + /* .get_max_size = */ + ggml_backend_qnn_buffer_type_get_max_size, /* .get_alloc_size = */ nullptr, // defaults to ggml_nbytes /* .is_host = */ ggml_backend_qnn_buffer_is_host, }, @@ -321,17 +336,13 @@ const char *ggml_backend_qnn_device_get_description(ggml_backend_dev_t dev) { } void ggml_backend_qnn_device_get_memory(ggml_backend_dev_t dev, size_t *free, size_t *total) { - // TODO: get memory info - *free = 0; - *total = 0; - GGML_UNUSED(dev); + *free = qnn::get_system_free_memory_in_bytes(); + *total = qnn::get_system_total_memory_in_bytes(); } enum ggml_backend_dev_type ggml_backend_qnn_device_get_type(ggml_backend_dev_t dev) { - // TODO: for cpu backend, we should return GGML_BACKEND_DEVICE_TYPE_CPU - GGML_UNUSED(dev); - return GGML_BACKEND_DEVICE_TYPE_GPU; + return kDeviceCaps[get_device_context(dev)->device].type; } void ggml_backend_qnn_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props *props) { @@ -356,41 +367,43 @@ ggml_guid_t ggml_backend_qnn_guid() { ggml_backend_t ggml_backend_qnn_init_with_device_context(ggml_backend_dev_t dev, const char *extend_lib_search_path) { if (!extend_lib_search_path) { extend_lib_search_path = GGML_QNN_DEFAULT_LIB_SEARCH_PATH; - QNN_LOG_WARN("extend_lib_search_path is nullptr, will use " GGML_QNN_DEFAULT_LIB_SEARCH_PATH " as default"); + QNN_LOG_WARN( + "extend_lib_search_path is nullptr, will " + "use " GGML_QNN_DEFAULT_LIB_SEARCH_PATH " as default"); } auto *dev_ctx = get_device_context(dev); - auto device_index = dev_ctx->device; - QNN_LOG_DEBUG("device %d", device_index); + const auto device = dev_ctx->device; + QNN_LOG_DEBUG("device %d", device); QNN_LOG_DEBUG("extend_lib_search_path %s", extend_lib_search_path); std::string path = extend_lib_search_path; // TODO: Fix this for other platforms #if defined(__ANDROID__) || defined(ANDROID) - if (QNN_BACKEND_NPU == device_index) { - if (0 == setenv("LD_LIBRARY_PATH", - (path + ":/vendor/dsp/cdsp:/vendor/lib64:/vendor/dsp/" - "dsp:/vendor/dsp/images") - .c_str(), - 1)) { + if (device == QNN_BACKEND_NPU) { + if (setenv("LD_LIBRARY_PATH", + (path + ":/vendor/dsp/cdsp:/vendor/lib64:/vendor/dsp/" + "dsp:/vendor/dsp/images") + .c_str(), + 1) == 0) { QNN_LOG_INFO("QNN NPU backend setenv successfully"); } else { QNN_LOG_ERROR("QNN NPU backend setenv failure"); } - if (0 == setenv("ADSP_LIBRARY_PATH", - (path + ";/vendor/dsp/cdsp;/vendor/lib/rfsa/adsp;/system/lib/" - "rfsa/adsp;/vendor/dsp/dsp;/vendor/dsp/images;/dsp") - .c_str(), - 1)) { + if (setenv("ADSP_LIBRARY_PATH", + (path + ";/vendor/dsp/cdsp;/vendor/lib/rfsa/adsp;/system/lib/" + "rfsa/adsp;/vendor/dsp/dsp;/vendor/dsp/images;/dsp") + .c_str(), + 1) == 0) { QNN_LOG_INFO("QNN NPU backend setenv successfully"); } else { QNN_LOG_ERROR("QNN NPU backend setenv failure"); } } else { - if (0 == setenv("LD_LIBRARY_PATH", path.c_str(), 1)) { - QNN_LOG_INFO("%s backend setenv successfully\n", qnn::get_backend_name(device_index)); + if (setenv("LD_LIBRARY_PATH", path.c_str(), 1) == 0) { + QNN_LOG_INFO("%s backend setenv successfully\n", qnn::get_backend_name(device)); } else { - QNN_LOG_ERROR("%s backend setenv failure\n", qnn::get_backend_name(device_index)); + QNN_LOG_ERROR("%s backend setenv failure\n", qnn::get_backend_name(device)); } } #endif @@ -398,8 +411,7 @@ ggml_backend_t 
ggml_backend_qnn_init_with_device_context(ggml_backend_dev_t dev, auto instance = std::make_shared(path, dev_ctx->lib_name, "ggml"); auto result = instance->qnn_init(nullptr); if (result != 0) { - QNN_LOG_WARN("init qnn subsystem failed with qnn backend %s, pls check why\n", - qnn::get_backend_name(device_index)); + QNN_LOG_WARN("init qnn subsystem failed with qnn backend %s, pls check why\n", qnn::get_backend_name(device)); return nullptr; } auto qnn_interface = instance->get_qnn_interface(); @@ -408,12 +420,12 @@ ggml_backend_t ggml_backend_qnn_init_with_device_context(ggml_backend_dev_t dev, return nullptr; } - std::string device_name = qnn::get_backend_name(device_index); + std::string device_name = qnn::get_backend_name(device); QNN_LOG_INFO("qnn device name %s", device_name.c_str()); dev_ctx->instance = instance; dev_ctx->qnn_interface = qnn_interface; dev_ctx->socinfo = instance->get_soc_info(); - dev_ctx->supported_types = kDeviceCaps[device_index].supported_types; + dev_ctx->supported_types = kDeviceCaps[device].supported_types; ggml_backend_t qnn_backend = new ggml_backend{ /* .guid = */ ggml_backend_qnn_guid(), @@ -479,9 +491,23 @@ struct ggml_backend_qnn_reg_impl : ggml_backend_reg { std::array, GGML_QNN_MAX_DEVICES> device_contexts; std::array devices; - ggml_backend_qnn_reg_impl(ggml_backend_reg_i interface) { + explicit ggml_backend_qnn_reg_impl(ggml_backend_reg_i interface) { context = this; iface = interface; + + for (int i = 0; i < GGML_QNN_MAX_DEVICES; i++) { + const auto device_enum = (QNNBackend)(GGML_QNN_MAX_DEVICES - 1 - i); // init from the last device, i.e. NPU + device_contexts[i] = std::make_unique( + /* .device = */ device_enum, // init from the last device, i.e. NPU + /* .threads = */ 1, + /* .name = */ qnn::get_backend_name(device_enum), + /* .lib_name = */ kDeviceCaps[device_enum].lib_name); + + auto &device = devices[i]; + device.iface = ggml_backend_qnn_device_interface; + device.reg = this; + device.context = device_contexts[i].get(); + } } }; @@ -512,35 +538,5 @@ const ggml_backend_reg_i ggml_backend_qnn_reg_interface = { ggml_backend_reg_t ggml_backend_qnn_reg() { static ggml_backend_qnn_reg_impl reg{ggml_backend_qnn_reg_interface}; - static bool initialized = false; - static std::mutex mutex; - - { - std::lock_guard lock(mutex); - if (!initialized) { - for (int i = 0; i < GGML_QNN_MAX_DEVICES; i++) { - reg.device_contexts[i] = std::make_unique( - /* .device = */ (QNNBackend)i, - /* .threads = */ 1, - /* .name = */ qnn::get_backend_name(i), - /* .lib_name = */ kDeviceCaps[i].lib_name); - - auto &device = reg.devices[i]; - device.iface = ggml_backend_qnn_device_interface; - device.reg = ® - device.context = reg.device_contexts[i].get(); - } - initialized = true; - } - } - return ® } - -int ggml_backend_qnn_get_device_count() { return GGML_QNN_MAX_DEVICES; } - -ggml_backend_t ggml_backend_qnn_init(size_t index, const char *extend_lib_search_path) { - auto *reg = ggml_backend_qnn_reg(); - auto *device = ggml_backend_qnn_reg_get_device(reg, index); - return ggml_backend_qnn_device_init(device, extend_lib_search_path); -} diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index c0e263a640eea..5643a746313d3 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -549,7 +549,7 @@ bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context *ctx, const ggml_t case GGML_TYPE_F16: case GGML_TYPE_Q8_0: case GGML_TYPE_Q4_0: - if (ctx->supported_types.find(tensor->type) == ctx->supported_types.end()) 
{ + if (!(ctx->supported_types & (1 << tensor->type))) { QNN_LOG_DEBUG("unsupported data type %s for backend %d", type_name, (int)ctx->device); return false; } diff --git a/ggml/src/ggml-qnn/backend.hpp b/ggml/src/ggml-qnn/backend.hpp index eb292e89bfd21..aaced227275c8 100644 --- a/ggml/src/ggml-qnn/backend.hpp +++ b/ggml/src/ggml-qnn/backend.hpp @@ -27,7 +27,7 @@ struct ggml_backend_qnn_device_context { // initialize in init qnn::qcom_socinfo socinfo = {}; - std::unordered_set supported_types; + uint64_t supported_types; std::shared_ptr instance; std::shared_ptr qnn_interface; diff --git a/ggml/src/ggml-qnn/utils.cpp b/ggml/src/ggml-qnn/utils.cpp index 0de9d203ebee9..8ae375ffc8afc 100644 --- a/ggml/src/ggml-qnn/utils.cpp +++ b/ggml/src/ggml-qnn/utils.cpp @@ -7,6 +7,10 @@ #include "qnn-types.hpp" +#ifdef __linux__ +#include +#endif + namespace qnn { qnn_dimension_array_t get_internal_dimension(const ggml_dimension_array_t &dims, uint32_t rank) { @@ -141,7 +145,7 @@ const char *get_ggml_type_name(ggml_type type) { return traits->type_name; } -const char *get_backend_name(size_t device_index) { +const char *get_backend_name(QNNBackend device_index) { switch (device_index) { case QNN_BACKEND_CPU: return "QNN-CPU"; @@ -149,8 +153,7 @@ const char *get_backend_name(size_t device_index) { return "QNN-GPU"; case QNN_BACKEND_NPU: return "QNN-NPU"; - case QNN_BACKEND_GGML: - return "ggml"; //"fake" QNN backend, used for compare performance between QNN backend and original GGML + case QNN_BACKEND_COUNT: default: return "unknown"; } @@ -295,4 +298,20 @@ const char *get_qnn_error_string(Qnn_ErrorHandle_t error) { } } +#ifdef __linux__ + +size_t get_system_total_memory_in_bytes() { + auto pages = (size_t)sysconf(_SC_PHYS_PAGES); + auto page_size = (size_t)sysconf(_SC_PAGE_SIZE); + return pages * page_size; +} + +size_t get_system_free_memory_in_bytes() { + auto avail_pages = (size_t)sysconf(_SC_AVPHYS_PAGES); + auto page_size = (size_t)sysconf(_SC_PAGE_SIZE); + return avail_pages * page_size; +} + +#endif + } // namespace qnn diff --git a/ggml/src/ggml-qnn/utils.hpp b/ggml/src/ggml-qnn/utils.hpp index 2c58d037982f6..40dff321b970e 100644 --- a/ggml/src/ggml-qnn/utils.hpp +++ b/ggml/src/ggml-qnn/utils.hpp @@ -11,6 +11,8 @@ #include "ggml.h" +#include "ggml-qnn.h" + #include "QnnTypes.h" #include "logger.hpp" @@ -25,7 +27,7 @@ qnn_dimension_array_t get_internal_dimension(const ggml_dimension_array_t &dims, uint32_t get_ggml_tensor_rank(const ggml_tensor *tensor); const char *get_ggml_type_name(ggml_type type); -const char *get_backend_name(size_t device_index); +const char *get_backend_name(QNNBackend device_index); const char *get_chipset_desc(uint32_t chipset_id); const char *get_htparch_desc(size_t htp_arch); intptr_t align_to(size_t alignment, intptr_t offset); @@ -198,6 +200,8 @@ Qnn_DataType_t qnn_datatype_from_ggml_datatype(ggml_type ggml_type); ggml_type ggml_datatype_from_qnn_datatype(Qnn_DataType_t qnn_type); size_t qnn_datatype_size(Qnn_DataType_t qnn_type); const char *qnn_datatype_to_string(Qnn_DataType_t qnn_type); +size_t get_system_total_memory_in_bytes(); +size_t get_system_free_memory_in_bytes(); #if ENABLE_QNNBACKEND_PERF class qnn_perf { From a2df09b6afa37cd97df10e3207bdeb10c8c042b9 Mon Sep 17 00:00:00 2001 From: nullname Date: Fri, 29 Nov 2024 00:03:23 +0800 Subject: [PATCH 129/143] [WIP] feat: perf opt (#10) * reduce log * wip * add function to create concat nodes * opt * insert concat node before mulmat * use resize op * wip * add bind_buffer and remov ggml prefix in tensor types * 
use gather node instead * fix tensor type, now succeed in gpu and cpu, failed in npu * add comment * wip * add comment * wip * in destructor, clear internal buffer before unbind * disable gather for npu * wip * count swap memory as free memory * wip * fix supported_types ggml_backend_device_i.supports_op will be invoked before ggml_backend_device_i.init_backend * rename create_tensors -> initialize_op_nodes * move ggml_qnn_op_config to deparated file * wip * add create_convert_nodes * add comment * enable different type in/out for npu and cpu backend * fix npu convert op * enlarge max buffer size * add more error code * check tensor type before create convert node * add log * add log * remove transpose0 and use buildin transpose flag * rename transpose1 -> transpose_out * disable convert for npu * add more logs --- ggml/src/ggml-qnn.cpp | 19 +- ggml/src/ggml-qnn/backend-ops.cpp | 47 +++-- ggml/src/ggml-qnn/backend.hpp | 17 +- ggml/src/ggml-qnn/graph.hpp | 52 +++--- ggml/src/ggml-qnn/logger.cpp | 5 +- ggml/src/ggml-qnn/op-config-base.hpp | 129 ++++++++++++++ ggml/src/ggml-qnn/op-config.cpp | 257 +++++++++++++++++---------- ggml/src/ggml-qnn/op-config.hpp | 55 +++--- ggml/src/ggml-qnn/qnn-lib.hpp | 64 +++---- ggml/src/ggml-qnn/qnn-types.hpp | 2 +- ggml/src/ggml-qnn/tensor.hpp | 184 +++++++++++-------- ggml/src/ggml-qnn/utils.cpp | 69 +++++-- 12 files changed, 590 insertions(+), 310 deletions(-) create mode 100644 ggml/src/ggml-qnn/op-config-base.hpp diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index a41fae6bbb368..a4dace7078d3b 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -226,9 +226,8 @@ size_t ggml_backend_qnn_buffer_type_get_alignment(ggml_backend_buffer_type_t buf size_t ggml_backend_qnn_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { GGML_UNUSED(buft); - // TODO: this value is an experimental value, works fine with - // whisper/llm/minicpm-v inference on Android - return (96 * 1024 * 1024); + // TODO: get the max size from device + return (1024 * 1024 * 1024); } bool ggml_backend_qnn_buffer_is_host(ggml_backend_buffer_type_t buft) { @@ -339,6 +338,7 @@ void ggml_backend_qnn_device_get_memory(ggml_backend_dev_t dev, size_t *free, si GGML_UNUSED(dev); *free = qnn::get_system_free_memory_in_bytes(); *total = qnn::get_system_total_memory_in_bytes(); + QNN_LOG_DEBUG("free memory: %ldMB, total memory: %ldMB", (*free / 1048576), (*total) / 1048576); } enum ggml_backend_dev_type ggml_backend_qnn_device_get_type(ggml_backend_dev_t dev) { @@ -374,7 +374,7 @@ ggml_backend_t ggml_backend_qnn_init_with_device_context(ggml_backend_dev_t dev, auto *dev_ctx = get_device_context(dev); const auto device = dev_ctx->device; - QNN_LOG_DEBUG("device %d", device); + QNN_LOG_DEBUG("device %s", qnn::get_backend_name(device)); QNN_LOG_DEBUG("extend_lib_search_path %s", extend_lib_search_path); std::string path = extend_lib_search_path; @@ -386,7 +386,7 @@ ggml_backend_t ggml_backend_qnn_init_with_device_context(ggml_backend_dev_t dev, "dsp:/vendor/dsp/images") .c_str(), 1) == 0) { - QNN_LOG_INFO("QNN NPU backend setenv successfully"); + QNN_LOG_DEBUG("QNN NPU backend setenv successfully"); } else { QNN_LOG_ERROR("QNN NPU backend setenv failure"); } @@ -395,13 +395,13 @@ ggml_backend_t ggml_backend_qnn_init_with_device_context(ggml_backend_dev_t dev, "rfsa/adsp;/vendor/dsp/dsp;/vendor/dsp/images;/dsp") .c_str(), 1) == 0) { - QNN_LOG_INFO("QNN NPU backend setenv successfully"); + QNN_LOG_DEBUG("QNN NPU backend setenv successfully"); } else { QNN_LOG_ERROR("QNN NPU 
backend setenv failure"); } } else { if (setenv("LD_LIBRARY_PATH", path.c_str(), 1) == 0) { - QNN_LOG_INFO("%s backend setenv successfully\n", qnn::get_backend_name(device)); + QNN_LOG_DEBUG("%s backend setenv successfully\n", qnn::get_backend_name(device)); } else { QNN_LOG_ERROR("%s backend setenv failure\n", qnn::get_backend_name(device)); } @@ -454,6 +454,7 @@ ggml_backend_buffer_t ggml_backend_qnn_device_buffer_from_ptr(ggml_backend_dev_t } bool ggml_backend_qnn_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor *op) { + // Note that this function could be called before the device context is initialized auto *device_ctx = get_device_context(dev); return qnn::ggml_qnn_supports_op(device_ctx, op); } @@ -495,13 +496,15 @@ struct ggml_backend_qnn_reg_impl : ggml_backend_reg { context = this; iface = interface; + QNN_LOG_DEBUG("qnn backend registry init"); for (int i = 0; i < GGML_QNN_MAX_DEVICES; i++) { const auto device_enum = (QNNBackend)(GGML_QNN_MAX_DEVICES - 1 - i); // init from the last device, i.e. NPU device_contexts[i] = std::make_unique( /* .device = */ device_enum, // init from the last device, i.e. NPU /* .threads = */ 1, /* .name = */ qnn::get_backend_name(device_enum), - /* .lib_name = */ kDeviceCaps[device_enum].lib_name); + /* .lib_name = */ kDeviceCaps[device_enum].lib_name, + /* .supported_types = */ kDeviceCaps[device_enum].supported_types); auto &device = devices[i]; device.iface = ggml_backend_qnn_device_interface; diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 5643a746313d3..da0480df7fd9f 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -543,14 +543,17 @@ bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context *ctx, const ggml_t return false; } +#ifndef NDEBUG auto *type_name = ggml_get_type_traits(tensor->type)->type_name; +#endif switch (tensor->type) { case GGML_TYPE_F32: case GGML_TYPE_F16: case GGML_TYPE_Q8_0: case GGML_TYPE_Q4_0: if (!(ctx->supported_types & (1 << tensor->type))) { - QNN_LOG_DEBUG("unsupported data type %s for backend %d", type_name, (int)ctx->device); + QNN_LOG_DEBUG("unsupported data type %s for backend %s, supported_types: 0x%x", type_name, + qnn::get_backend_name(ctx->device), ctx->supported_types); return false; } break; @@ -563,25 +566,42 @@ bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context *ctx, const ggml_t } bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor *op) { - GGML_UNUSED(ctx); - auto *src0 = op->src[0]; auto *src1 = op->src[1]; - if (src0->type != src1->type || src0->type != op->type) { - // current qnn implementation only supports the same type for src0 and src1 - QNN_LOG_DEBUG("src0 type %d and src1 type %d and op type %d are not equal", src0->type, src1->type, op->type); - return false; + switch (ctx->device) { + case QNN_BACKEND_NPU: + if (src1->ne[2] != src0->ne[2] || src1->ne[3] != src0->ne[3]) { + /* + * TODO: remove the blocker here when NPU backend supports mul_mat like this: + * [ne03, ne02, n, k] * [ne03 * x, ne02 * y, m, k] -> [ne03 * x, ne02 * y, m, n] + */ + QNN_LOG_DEBUG("[qnn-npu] src0 and src1 dimensions are not equal, support/unsupported: %d/%d", + ctx->support_op_count.load(), ++(ctx->unsupported_op_count)); + return false; + } + // fall through, from test here, the convert op is super slow on NPU: + // https://github.com/usefulsensors/qc_npu_benchmark + case QNN_BACKEND_GPU: + if (src0->type != src1->type || src0->type != op->type) { + // there's no 
convert op for GPU. + QNN_LOG_DEBUG("[qnn-gpu]type src0(%d), src1(%d) and op(%d) are not equal, support/unsupported: %d/%d", + src0->type, src1->type, op->type, ctx->support_op_count.load(), + ++(ctx->unsupported_op_count)); + return false; + } + break; + default: + break; } - if (src0->ne[2] != src1->ne[2] || src0->ne[3] != src1->ne[3]) { - /* - * TODO: remove the blocker here when qnn backend supports mul_mat like this: - * [ne03, ne02, n, k] * [ne03 * x, ne02 * y, m, k] -> [ne03 * x, ne02 * y, m, n] - */ - QNN_LOG_DEBUG("src0 and src1 dimensions are not equal"); + if ((src1->ne[2] % src0->ne[2]) != 0 || (src1->ne[3] % src0->ne[3]) != 0) { + QNN_LOG_DEBUG("[%s] src0 and src1 dimensions are not equal, support/unsupported: %d/%d", + qnn::get_backend_name(ctx->device), ctx->support_op_count.load(), ++(ctx->unsupported_op_count)); return false; } + QNN_LOG_DEBUG("[%s] supported matmul op, support/unsupported: %d/%d", qnn::get_backend_name(ctx->device), + ++(ctx->support_op_count), ctx->unsupported_op_count.load()); return true; } @@ -590,6 +610,7 @@ bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context *ctx, const ggm namespace qnn { bool ggml_qnn_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor *op) { + // Note that this function could be called before the device context is initialized if (op->op == GGML_OP_NONE) { return true; } diff --git a/ggml/src/ggml-qnn/backend.hpp b/ggml/src/ggml-qnn/backend.hpp index aaced227275c8..17823ed577aaa 100644 --- a/ggml/src/ggml-qnn/backend.hpp +++ b/ggml/src/ggml-qnn/backend.hpp @@ -1,6 +1,10 @@ #pragma once +#ifndef NDEBUG +#include +#endif + #include #include #include @@ -25,7 +29,7 @@ struct ggml_backend_qnn_device_context { std::string name; std::string lib_name; - // initialize in init + // initialize in qnn init qnn::qcom_socinfo socinfo = {}; uint64_t supported_types; std::shared_ptr instance; @@ -33,7 +37,12 @@ struct ggml_backend_qnn_device_context { qnn::ggml_qnn_graph_cache_t qnn_graph_cache; - explicit ggml_backend_qnn_device_context(QNNBackend device, size_t threads, const char *name, - const char *lib_name) : - device(device), threads(threads), name(name), lib_name(lib_name) {} +#ifndef NDEBUG + std::atomic_uint32_t support_op_count = 0; + std::atomic_uint32_t unsupported_op_count = 0; +#endif + + explicit ggml_backend_qnn_device_context(QNNBackend device, size_t threads, const char *name, const char *lib_name, + uint64_t supported_types) + : device(device), threads(threads), name(name), lib_name(lib_name), supported_types(supported_types) {} }; diff --git a/ggml/src/ggml-qnn/graph.hpp b/ggml/src/ggml-qnn/graph.hpp index 858a7d3af29a2..1b0dcd78faa17 100644 --- a/ggml/src/ggml-qnn/graph.hpp +++ b/ggml/src/ggml-qnn/graph.hpp @@ -17,9 +17,9 @@ namespace qnn { class ggml_qnn_graph { public: explicit ggml_qnn_graph(const std::string &graph_name, QNNBackend device, - std::shared_ptr qnn_instance, size_t vtcm_size_in_mb) : - _graph_name(graph_name), _device(device), _qnn_instance(qnn_instance) { - QNN_LOG_INFO("[%s]create", graph_name.c_str()); + std::shared_ptr qnn_instance, size_t vtcm_size_in_mb) + : _graph_name(graph_name), _device(device), _qnn_instance(qnn_instance) { + QNN_LOG_DEBUG("[%s][%s]created", get_backend_name(device), graph_name.c_str()); auto qnn_interface = qnn_instance->get_qnn_interface(); auto qnn_context = qnn_instance->get_qnn_context_handle(); @@ -56,24 +56,25 @@ class ggml_qnn_graph { graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; graph_vtcm_config.customConfig = &vtcm_config; - 
const QnnGraph_Config_t *graph_configs[] = { &graph_hvx_config, &graph_dlbc_config, &graph_vtcm_config, - &graph_opt_config, nullptr }; + const QnnGraph_Config_t *graph_configs[] = {&graph_hvx_config, &graph_dlbc_config, &graph_vtcm_config, + &graph_opt_config, nullptr}; error = qnn_interface->qnn_graph_create(qnn_context, graph_name.c_str(), graph_configs, &graph_handle); } else { error = qnn_interface->qnn_graph_create(qnn_context, graph_name.c_str(), nullptr, &graph_handle); } if (error != QNN_SUCCESS) { - QNN_LOG_INFO("[%s]can't create qnn graph handle, error = %d\n", graph_name.c_str(), error); + QNN_LOG_ERROR("[%s][%s]failed to create qnn graph, error: %s\n", get_backend_name(device), + graph_name.c_str(), get_qnn_error_string(error)); return; } - QNN_LOG_INFO("[%s]create succeed\n", graph_name.c_str()); + QNN_LOG_INFO("[%s][%s]create succeed\n", get_backend_name(device), graph_name.c_str()); _graph_handle = graph_handle; _qnn_interface = qnn_interface; } - ~ggml_qnn_graph() { QNN_LOG_DEBUG("[%s]destroy", _graph_name.c_str()); } + ~ggml_qnn_graph() { QNN_LOG_DEBUG("[%s][%s]destroy", get_backend_name(_device), _graph_name.c_str()); } bool build_graph(ggml_op_constructor_t op_constructor, const ggml_tensor_array_t &tensor_inputs, const ggml_tensor_array_t &tensor_outputs) { @@ -83,10 +84,10 @@ class ggml_qnn_graph { return false; } - QNN_LOG_DEBUG("[%s]build_graph start", _graph_name.c_str()); + QNN_LOG_DEBUG("[%s][%s]build_graph start", get_backend_name(_device), _graph_name.c_str()); _op_config = op_constructor(_graph_name, _qnn_instance); - if (!_op_config->create_tensors(_device, _graph_handle, tensor_inputs, tensor_outputs)) { - QNN_LOG_ERROR("[%s]create_tensors failed\n", _graph_name.c_str()); + if (!_op_config->initialize_op_nodes(_device, _graph_handle, tensor_inputs, tensor_outputs)) { + QNN_LOG_ERROR("[%s][%s]initialize_op_nodes failed", get_backend_name(_device), _graph_name.c_str()); return false; } @@ -97,27 +98,23 @@ class ggml_qnn_graph { auto error = _qnn_interface->qnn_graph_finalize(_graph_handle, nullptr, nullptr); if (error != QNN_SUCCESS) { - auto *error_str = get_qnn_error_string(error); - if (error_str) { - QNN_LOG_ERROR("[%s]qnn_graph_finalize.error: %s\n", _graph_name.c_str(), error_str); - } else { - QNN_LOG_ERROR("[%s]qnn_graph_finalize.error: %d\n", _graph_name.c_str(), error); - } + QNN_LOG_ERROR("[%s][%s]qnn_graph_finalize.error: %s", get_backend_name(_device), _graph_name.c_str(), + get_qnn_error_string(error)); return false; } - QNN_LOG_DEBUG("[%s]build_graph succeed", _graph_name.c_str()); + QNN_LOG_DEBUG("[%s][%s]build_graph succeed", get_backend_name(_device), _graph_name.c_str()); return true; } bool execute(const ggml_tensor_array_t &tensor_inputs, const ggml_tensor_array_t &tensor_outputs) { if (!_op_config->bind_input_tensors(tensor_inputs)) { - QNN_LOG_ERROR("[%s]bind input tensors failed\n", _graph_name.c_str()); + QNN_LOG_ERROR("[%s][%s]bind input tensors failed\n", get_backend_name(_device), _graph_name.c_str()); return false; } if (!_op_config->bind_output_tensors(tensor_outputs)) { - QNN_LOG_ERROR("[%s]bind output tensors failed\n", _graph_name.c_str()); + QNN_LOG_ERROR("[%s][%s]bind output tensors failed\n", get_backend_name(_device), _graph_name.c_str()); return false; } @@ -127,20 +124,21 @@ class ggml_qnn_graph { auto error = _qnn_interface->qnn_graph_execute(_graph_handle, qnn_tensor_inputs.data(), qnn_tensor_inputs.size(), qnn_tensor_outputs.data(), qnn_tensor_outputs.size(), nullptr, nullptr); - if (_device == QNN_BACKEND_NPU) { - if 
(error == QNN_COMMON_ERROR_SYSTEM_COMMUNICATION) { - QNN_LOG_WARN("[%s]NPU crashed. SSR detected. Caused QNN graph execute error\n", _graph_name.c_str()); - } - } - _op_config->unbind_input_tensors(); _op_config->unbind_output_tensors(); if (error != QNN_SUCCESS) { - QNN_LOG_INFO("[%s]error = %d\n", _graph_name.c_str(), error); + if (_device == QNN_BACKEND_NPU && error == QNN_COMMON_ERROR_SYSTEM_COMMUNICATION) { + QNN_LOG_WARN("[%s][%s]NPU crashed. SSR detected. Caused QNN graph execute error.", + get_backend_name(_device), _graph_name.c_str()); + } else { + QNN_LOG_ERROR("[%s][%s]error: %s", get_backend_name(_device), _graph_name.c_str(), + get_qnn_error_string(error)); + } return false; } + QNN_LOG_DEBUG("[%s][%s]execute succeed", get_backend_name(_device), _graph_name.c_str()); return true; } diff --git a/ggml/src/ggml-qnn/logger.cpp b/ggml/src/ggml-qnn/logger.cpp index 187e9088c779c..1e781721d629c 100644 --- a/ggml/src/ggml-qnn/logger.cpp +++ b/ggml/src/ggml-qnn/logger.cpp @@ -34,7 +34,7 @@ void qnn::internal_log(ggml_log_level level, const char * /*file*/, const char * } #if ENABLE_QNNSDK_LOG -void qnn::sdk_logcallback(const char *fmt, QnnLog_Level_t level, uint64_t timestamp, va_list argp) { +void qnn::sdk_logcallback(const char *fmt, QnnLog_Level_t level, uint64_t /*timestamp*/, va_list argp) { static std::mutex log_mutex; static unsigned char s_ggml_qnn_logbuf[QNN_LOGBUF_LEN]; @@ -60,13 +60,12 @@ void qnn::sdk_logcallback(const char *fmt, QnnLog_Level_t level, uint64_t timest break; } - double ms = (double)timestamp / 1000000.0; { std::lock_guard lock(log_mutex); memset(s_ggml_qnn_logbuf, 0, QNN_LOGBUF_LEN); vsnprintf(reinterpret_cast(s_ggml_qnn_logbuf), QNN_LOGBUF_LEN, fmt, argp); - QNN_LOG_INFO("%8.1fms [%-7s] %s", ms, log_level_desc, s_ggml_qnn_logbuf); + QNN_LOG_INFO("[%s]%s", log_level_desc, s_ggml_qnn_logbuf); } } #else diff --git a/ggml/src/ggml-qnn/op-config-base.hpp b/ggml/src/ggml-qnn/op-config-base.hpp new file mode 100644 index 0000000000000..159944a7d7f60 --- /dev/null +++ b/ggml/src/ggml-qnn/op-config-base.hpp @@ -0,0 +1,129 @@ +#pragma once + +#include +#include + +#include "ggml-qnn.h" + +#include "qnn-types.hpp" +#include "tensor.hpp" + +namespace qnn { + +using ggml_tensor_array_t = std::vector; + +/** + * @class ggml_qnn_op_config + * @brief Abstract base class for configuring QNN operations. + * + * This class provides an interface for creating and managing tensors, + * adding operations to a graph, and binding/unbinding input and output tensors. + */ +class ggml_qnn_op_config { +public: + virtual ~ggml_qnn_op_config() {} + + /** + * @brief Creates tensors and internal nodes for constructing the calculation graph. + * + * This pure virtual function is responsible for creating tensors on the given + * backend device, associating them with the provided graph handle, and creating + * the internal nodes necessary for constructing the calculation graph. It takes + * input and output tensor arrays as parameters. + * + * @param device The backend device where tensors will be created. + * @param graph_handle The handle to the graph where tensors and nodes will be associated. + * @param tensor_inputs An array of input tensors. + * @param tensor_outputs An array of output tensors. + * @return true if tensors and nodes are successfully created, false otherwise. 
+ */ + virtual bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, + const ggml_tensor_array_t &tensor_inputs, + const ggml_tensor_array_t &tensor_outputs) = 0; + + /** + * @brief Pure virtual function to retrieve the input tensors for QNN (Quantized Neural Network). + * + * This function must be overridden by derived classes to provide the specific implementation + * for retrieving the input tensors used in QNN operations. + * + * @return A reference to a vector of Qnn_Tensor_t objects representing the input tensors. + */ + virtual std::vector &get_qnn_input_tensors() = 0; + + /** + * @brief Pure virtual function to retrieve the output tensors of a QNN (Quantized Neural Network). + * + * This function must be overridden by any derived class to provide access to the + * output tensors of the QNN. The function returns a reference to a vector of + * Qnn_Tensor_t objects, which represent the output tensors. + * + * @return std::vector& Reference to a vector of Qnn_Tensor_t objects. + */ + virtual std::vector &get_qnn_output_tensors() = 0; + + /** + * @brief Adds an operation to the given graph. + * + * This pure virtual function must be implemented by derived classes to add + * a specific operation to the provided graph handle. + * + * This function will be called after `initialize_op_nodes` during initialization. + * + * @param graph_handle The handle to the graph where the operation will be added. + * @return true if the operation was successfully added to the graph, false otherwise. + */ + virtual bool add_op_to_graph(Qnn_GraphHandle_t graph_handle) = 0; + + /** + * @brief Binds the input tensors to the operation. + * + * This pure virtual function must be implemented by derived classes to bind + * the provided input tensors to the operation. The function takes a constant + * reference to a ggml_tensor_array_t object, which contains the input tensors + * to be bound. + * + * @param tensor_inputs A constant reference to a ggml_tensor_array_t object + * containing the input tensors. + * @return true if the input tensors were successfully bound, false otherwise. + */ + virtual bool bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) = 0; + + /** + * @brief Binds the output tensors to the given tensor array. + * + * This pure virtual function must be implemented by derived classes to bind + * the output tensors to the provided array of tensors. The function is expected + * to establish the necessary connections or mappings between the output tensors + * and the elements of the given tensor array. + * + * @param tensor_outputs A constant reference to an array of ggml tensors that + * represent the output tensors to be bound. + * @return true if the binding is successful, false otherwise. + */ + virtual bool bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) = 0; + + /** + * @brief Unbinds the input tensors from the operation. + * + * This pure virtual function is intended to be overridden by derived classes + * to implement the logic for unbinding or detaching input tensors that were + * previously bound to the operation. This is typically used to release resources + * or reset the state of the operation. + */ + virtual void unbind_input_tensors() = 0; + + /** + * @brief Unbinds the output tensors. + * + * This pure virtual function is responsible for unbinding or detaching + * the output tensors from their current bindings. 
Implementations of this + * function should ensure that any resources or references held by the + * output tensors are properly released or reset. + */ + virtual void unbind_output_tensors() = 0; +}; + +using qnn_op_config_ptr_t = std::shared_ptr; + +} // namespace qnn diff --git a/ggml/src/ggml-qnn/op-config.cpp b/ggml/src/ggml-qnn/op-config.cpp index 9b98051adfc8e..df70d548a44e0 100644 --- a/ggml/src/ggml-qnn/op-config.cpp +++ b/ggml/src/ggml-qnn/op-config.cpp @@ -36,7 +36,7 @@ int get_rank(const qnn::ggml_tensor_array_t &tensor_inputs, const qnn::ggml_tens return tensor_rank; } -Qnn_DataType_t get_tensor_type(const qnn::ggml_qnn_tensor_array_t &tensors) { +Qnn_DataType_t get_tensor_type(const qnn::qnn_tensor_array_t &tensors) { Qnn_DataType_t type = QNN_DATATYPE_UNDEFINED; for (auto tensor : tensors) { auto tensor_type_size = qnn::qnn_datatype_size(tensor->get_data_type()); @@ -59,8 +59,7 @@ struct tensor_common_params { }; void create_tensors_from_ggml_tensor(const tensor_common_params ¶ms, const qnn::ggml_tensor_array_t &ggml_tensors, - qnn::ggml_qnn_tensor_array_t *tensor_wrappers, - std::vector *qnn_tensors) { + qnn::qnn_tensor_array_t *tensor_wrappers, std::vector *qnn_tensors) { using namespace qnn; tensor_wrappers->resize(ggml_tensors.size()); @@ -78,7 +77,7 @@ void create_tensors_from_ggml_tensor(const tensor_common_params ¶ms, const q } } -bool bind_tensors(const qnn::ggml_tensor_array_t &ggml_tensors, qnn::ggml_qnn_tensor_array_t &tensor_wrappers, +bool bind_tensors(const qnn::ggml_tensor_array_t &ggml_tensors, qnn::qnn_tensor_array_t &tensor_wrappers, std::vector &qnn_tensors) { for (size_t i = 0; i < ggml_tensors.size(); i++) { auto *ggml_tensor = ggml_tensors[i]; @@ -99,9 +98,9 @@ class ggml_qnn_connectable_op_config : public qnn::ggml_qnn_op_config_base { const std::string &op_type, std::shared_ptr qnn_instance) : ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {} - bool create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle, - const qnn::ggml_tensor_array_t &tensor_inputs, - const qnn::ggml_tensor_array_t &tensor_outputs) override { + bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, + const qnn::ggml_tensor_array_t &tensor_inputs, + const qnn::ggml_tensor_array_t &tensor_outputs) override { GGML_UNUSED(device); GGML_UNUSED(graph_handle); GGML_UNUSED(tensor_inputs); @@ -109,28 +108,28 @@ class ggml_qnn_connectable_op_config : public qnn::ggml_qnn_op_config_base { return true; } - void set_input_tensors(qnn::ggml_qnn_tensor_array_t &tensor_inputs) { + void set_input_tensors(qnn::qnn_tensor_array_t &tensor_inputs) { _tensor_inputs = tensor_inputs; _qnn_tensor_inputs.resize(_tensor_inputs.size()); } - void set_input_tensors(qnn::ggml_qnn_tensor_array_t &&tensor_inputs) { + void set_input_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) { _tensor_inputs = std::move(tensor_inputs); _qnn_tensor_inputs.resize(_tensor_inputs.size()); } - void set_output_tensors(qnn::ggml_qnn_tensor_array_t &tensor_outputs) { + void set_output_tensors(qnn::qnn_tensor_array_t &tensor_outputs) { _tensor_outputs = tensor_outputs; _qnn_tensor_outputs.resize(_tensor_outputs.size()); } - void set_output_tensors(qnn::ggml_qnn_tensor_array_t &&tensor_outputs) { + void set_output_tensors(qnn::qnn_tensor_array_t &&tensor_outputs) { _tensor_outputs = std::move(tensor_outputs); _qnn_tensor_outputs.resize(_tensor_outputs.size()); } - qnn::ggml_qnn_tensor_array_t &get_input_tensors() { return _tensor_inputs; } - qnn::ggml_qnn_tensor_array_t 
&get_output_tensors() { return _tensor_outputs; } + qnn::qnn_tensor_array_t &get_input_tensors() { return _tensor_inputs; } + qnn::qnn_tensor_array_t &get_output_tensors() { return _tensor_outputs; } private: DISABLE_COPY(ggml_qnn_connectable_op_config); @@ -186,7 +185,7 @@ bool ggml_qnn_op_config_base::add_op_to_graph(Qnn_GraphHandle_t graph_handle) { GGML_ASSERT(_qnn_tensor_inputs.size() == _tensor_inputs.size()); GGML_ASSERT(_qnn_tensor_outputs.size() == _tensor_outputs.size()); - auto qnn_interface = _qnn_instance->get_qnn_interface(); + QNN_LOG_DEBUG("[%s]add to graph start\n", _name.c_str()); for (size_t i = 0; i < _tensor_inputs.size(); i++) { auto tensor = _tensor_inputs[i]; if (!tensor->alloc_qnn_tensor_id()) { @@ -194,6 +193,7 @@ bool ggml_qnn_op_config_base::add_op_to_graph(Qnn_GraphHandle_t graph_handle) { return false; } + QNN_LOG_DEBUG("[%s]input tensor id: %d\n", _name.c_str(), tensor->get_qnn_tensor_id()); _qnn_tensor_inputs[i] = tensor->get_qnn_tensor(); } @@ -203,21 +203,19 @@ bool ggml_qnn_op_config_base::add_op_to_graph(Qnn_GraphHandle_t graph_handle) { QNN_LOG_ERROR("[%s]output tensor alloc_qnn_tensor_id failed\n", _name.c_str()); return false; } - _qnn_tensor_outputs[i] = _tensor_outputs[i]->get_qnn_tensor(); + + QNN_LOG_DEBUG("[%s]output tensor id: %d\n", _name.c_str(), tensor->get_qnn_tensor_id()); + _qnn_tensor_outputs[i] = tensor->get_qnn_tensor(); } + auto qnn_interface = _qnn_instance->get_qnn_interface(); auto error = qnn_interface->qnn_graph_add_node(graph_handle, get_op_config()); if (error != QNN_SUCCESS) { - auto *error_str = get_qnn_error_string(error); - if (error_str) { - QNN_LOG_ERROR("[%s]qnn_graph_add_node.error: %s\n", _name.c_str(), error_str); - } else { - QNN_LOG_ERROR("[%s]qnn_graph_add_node.error: %d\n", _name.c_str(), error); - } + QNN_LOG_ERROR("[%s]qnn_graph_add_node.error: %s", _name.c_str(), get_qnn_error_string(error)); return false; } - QNN_LOG_DEBUG("[%s]added to graph\n", _name.c_str()); + QNN_LOG_DEBUG("[%s]added to graph succeed\n", _name.c_str()); return true; } @@ -259,9 +257,9 @@ Qnn_OpConfig_t ggml_qnn_op_config_base::get_op_config() { return config; } -bool ggml_qnn_single_op_config::create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle, - const ggml_tensor_array_t &tensor_inputs, - const ggml_tensor_array_t &tensor_outputs) { +bool ggml_qnn_single_op_config::initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, + const ggml_tensor_array_t &tensor_inputs, + const ggml_tensor_array_t &tensor_outputs) { const auto tensor_rank = get_rank(tensor_inputs, tensor_outputs); tensor_common_params params = {"src", tensor_rank, true, device, graph_handle, _qnn_instance}; create_tensors_from_ggml_tensor(params, tensor_inputs, &_tensor_inputs, &_qnn_tensor_inputs); @@ -282,9 +280,9 @@ bool ggml_qnn_single_op_config::create_tensors(QNNBackend device, Qnn_GraphHandl return true; } -bool ggml_qnn_matmul_op_config::create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle, - const ggml_tensor_array_t &tensor_inputs, - const ggml_tensor_array_t &tensor_outputs) { +bool ggml_qnn_matmul_op_config::initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, + const ggml_tensor_array_t &tensor_inputs, + const ggml_tensor_array_t &tensor_outputs) { GGML_ASSERT(tensor_inputs.size() == 2); GGML_ASSERT(tensor_outputs.size() == 1); const auto tensor_rank = get_rank(tensor_inputs, tensor_outputs); @@ -295,59 +293,143 @@ bool ggml_qnn_matmul_op_config::create_tensors(QNNBackend device, Qnn_GraphHandl 
create_tensors_from_ggml_tensor(params, tensor_inputs, &_tensor_inputs, &_qnn_tensor_inputs); // create output tensor - ggml_qnn_tensor_array_t mat_mul_tensor_outputs; + qnn_tensor_array_t mat_mul_tensor_outputs; params.name_prefix = "dst"; params.is_input = false; create_tensors_from_ggml_tensor(params, tensor_outputs, &mat_mul_tensor_outputs, nullptr); + // create convert nodes + qnn_tensor_array_t mat_mul_tensor_inputs = _tensor_inputs; + if (!create_convert_nodes(device, graph_handle, tensor_rank, mat_mul_tensor_inputs, mat_mul_tensor_outputs)) { + QNN_LOG_ERROR("create convert nodes failed\n"); + return false; + } + + mat_mul_tensor_inputs.front() = create_gather_nodes(device, graph_handle, tensor_rank, _tensor_inputs.front(), + _tensor_inputs.back()->get_dimensions()); + return create_mat_mul_nodes(device, graph_handle, tensor_rank, mat_mul_tensor_inputs, mat_mul_tensor_outputs); +} + +qnn_tensor_ptr_t ggml_qnn_matmul_op_config::create_gather_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, + const int rank, qnn_tensor_ptr_t tensor_input, + qnn_dimension_array_t output_dimensions) { + if (rank <= 2) { + return tensor_input; + } + + const auto &input_dimensions = tensor_input->get_dimensions(); + output_dimensions[rank - 1] = input_dimensions[rank - 1]; + output_dimensions[rank - 2] = input_dimensions[rank - 2]; + + const auto y = output_dimensions[rank - 3] / input_dimensions[rank - 3]; + if (y == 1 && (rank == 3 || (rank == 4 && output_dimensions[rank - 4] == input_dimensions[rank - 4]))) { + return tensor_input; + } + + // create concat nodes, to convert tensor shape from [ne03, ne02, n, k] to [ne03 * x, ne02 * y, n, k] + constexpr const auto create_node = + [](const std::string &name, const int rank, const int axis, const qnn_dimension_array_t &dimensions, + qnn_tensor_ptr_t tensor_input, QNNBackend device, Qnn_GraphHandle_t graph_handle, + std::shared_ptr qnn_instance, qnn_tensor_ptr_t &tensor_output) -> qnn_op_config_ptr_t { + auto gather_out = + std::make_shared(ggml_qnn_tensor::INTERMEDIATE, name + "_out", dimensions, + tensor_input->get_data_type(), rank, device, graph_handle, qnn_instance); + auto gather_op = std::make_shared(name, QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_GATHER, qnn_instance); + + Qnn_Scalar_t scalar = QNN_SCALAR_INIT; + scalar.dataType = QNN_DATATYPE_INT_32; + scalar.int32Value = axis; + gather_op->add_scalar_param(QNN_OP_GATHER_PARAM_AXIS, scalar); + gather_op->set_output_tensors({gather_out}); + + // here we calculate the index mapping, will generate a 1d tensor like [0, 0, 0, 1, 1, 1, 2, 2, 2, ...], + // by repeating each index [scale] times. 
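+        // for example, with an output dimension of 6 along this axis and an input dimension of 2,
+        // scale == 3 and the index buffer becomes [0, 0, 0, 1, 1, 1]: element i holds i / scale.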
+ const auto scale = dimensions[axis] / tensor_input->get_dimensions()[axis]; + std::vector index_buffer(dimensions[axis] * sizeof(uint32_t)); + for (uint32_t *curr = reinterpret_cast(index_buffer.data()), *end = curr + dimensions[axis]; + curr < end; curr++) { + *curr = (curr - reinterpret_cast(index_buffer.data())) / scale; + } + + auto gather_index = std::make_shared( + ggml_qnn_tensor::PARAMETER, name + "_index", qnn_dimension_array_t{dimensions[axis]}, QNN_DATATYPE_UINT_32, + 1, device, graph_handle, qnn_instance); + gather_index->set_data_buffer(std::move(index_buffer)); + gather_op->set_input_tensors({tensor_input, gather_index}); + + tensor_output = gather_out; + return gather_op; + }; + + qnn_dimension_array_t intermediate_dimensions = input_dimensions; + intermediate_dimensions[rank - 3] = output_dimensions[rank - 3]; + qnn_tensor_ptr_t gather0_out; + _gather0 = create_node(_name + "_gather0", rank, rank - 3, intermediate_dimensions, tensor_input, device, + graph_handle, _qnn_instance, gather0_out); + if (rank == 3) { + return gather0_out; + } + + qnn_tensor_ptr_t gather1_out; + _gather1 = create_node(_name + "_gather1", rank, rank - 4, output_dimensions, gather0_out, device, graph_handle, + _qnn_instance, gather1_out); + return gather1_out; +} + +bool ggml_qnn_matmul_op_config::create_convert_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, + qnn_tensor_array_t &tensor_inputs, + qnn_tensor_array_t &tensor_outputs) { if (device == QNN_BACKEND_GPU) { - // there's no convert op for GPU, so we should create matmul nodes directl. - return create_mat_mul_nodes(device, graph_handle, tensor_rank, _tensor_inputs, mat_mul_tensor_outputs); + // there's no convert op for GPU, so we should create matmul nodes directly. + return true; } // create tensors for convert node - ggml_qnn_tensor_array_t mat_mul_tensor_inputs = _tensor_inputs; - auto input_tensor_type = get_tensor_type(mat_mul_tensor_inputs); - QNN_LOG_DEBUG("matmul input tensor type: %s\n", qnn_datatype_to_string(input_tensor_type)); + auto tensor_type = get_tensor_type(tensor_inputs); + QNN_LOG_DEBUG("input tensor type: %s\n", qnn_datatype_to_string(tensor_type)); - _input_converts.resize(mat_mul_tensor_inputs.size()); - for (size_t i = 0; i < mat_mul_tensor_inputs.size(); ++i) { + _input_converts.resize(tensor_inputs.size()); + for (size_t i = 0; i < tensor_inputs.size(); ++i) { // create input convert nodes + auto convert_in = tensor_inputs[i]; + if (convert_in->get_data_type() == tensor_type) { + continue; + } + std::string convert_name("convert_src" + std::to_string(i)); - auto convert_in = mat_mul_tensor_inputs[i]; auto convert_out = std::make_shared(ggml_qnn_tensor::INTERMEDIATE, convert_name + "_out", - convert_in->get_dimensions(), input_tensor_type, - tensor_rank, device, graph_handle, _qnn_instance); + convert_in->get_dimensions(), tensor_type, rank, device, + graph_handle, _qnn_instance); auto convert = std::make_shared(convert_name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_CONVERT, _qnn_instance); convert->set_input_tensors({convert_in}); convert->set_output_tensors({convert_out}); - mat_mul_tensor_inputs[i] = convert_out; + tensor_inputs[i] = convert_out; _input_converts[i] = convert; } - { + if (tensor_outputs.front()->get_data_type() != tensor_type) { // create output convert node std::string convert_name("convert_dst"); - auto convert_out = mat_mul_tensor_outputs.front(); + auto convert_out = tensor_outputs.front(); auto convert_in = std::make_shared(ggml_qnn_tensor::INTERMEDIATE, convert_name + 
"_in", - convert_out->get_dimensions(), input_tensor_type, - tensor_rank, device, graph_handle, _qnn_instance); + convert_out->get_dimensions(), tensor_type, rank, device, + graph_handle, _qnn_instance); auto output_convert = std::make_shared( convert_name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_CONVERT, _qnn_instance); output_convert->set_input_tensors({convert_in}); output_convert->set_output_tensors({convert_out}); - mat_mul_tensor_outputs[0] = convert_in; + tensor_outputs.front() = convert_in; _output_convert = output_convert; } - // create mat_mul nodes - return create_mat_mul_nodes(device, graph_handle, tensor_rank, mat_mul_tensor_inputs, mat_mul_tensor_outputs); + return true; } bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, - ggml_qnn_tensor_array_t &tensor_inputs, - ggml_qnn_tensor_array_t &tensor_outputs) { + qnn_tensor_array_t &tensor_inputs, + qnn_tensor_array_t &tensor_outputs) { /* * First, both the ggml and qnn tensor in memory are stored as row-major format. (For more details, please also: @@ -386,9 +468,8 @@ bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_Grap * ```mermaid * graph TD; * i1>ggml_tensor_in0] --src0--> mat_mul0; - * i2>ggml_tensor_in1] --src1--> transpose0; - * transpose0 --src0_trans--> mat_mul0; - * mat_mul0 --dst_trans--> transpose1; + * i2>ggml_tensor_in1] --src1--> mat_mul0; + * mat_mul0 --dst_trans--> transpose_out; * transpose1 --dst0--> o1>ggml_tensor_out]; * ``` */ @@ -398,9 +479,6 @@ bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_Grap static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS does not match the expected value"); qnn_dimension_array_t dimensions = get_transposed_dimensions(src1->get_dimensions(), rank); - auto src0_trans = - std::make_shared(ggml_qnn_tensor::INTERMEDIATE, "src0_trans", dimensions, - src1->get_data_type(), rank, device, graph_handle, _qnn_instance); // create dst_trans tensor auto dst = tensor_outputs.front(); @@ -408,48 +486,37 @@ bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_Grap auto dst_trans = std::make_shared(ggml_qnn_tensor::INTERMEDIATE, "dst_trans", dimensions, dst->get_data_type(), rank, device, graph_handle, _qnn_instance); - // create transpose0 - auto transpose0 = std::make_shared(_name + "_trans0", QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_TRANSPOSE, _qnn_instance); - - // create transpose1 - auto transpose1 = std::make_shared(_name + "_trans1", QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_TRANSPOSE, _qnn_instance); + // create transpose_out + auto transpose_out = std::make_shared( + _name + "_trans1", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_TRANSPOSE, _qnn_instance); // create mat_mul auto mat_mul = std::make_shared(_name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL, _qnn_instance); - // set transpose0 parameters + Qnn_Scalar_t scalar = QNN_SCALAR_INIT; + scalar.dataType = QNN_DATATYPE_BOOL_8; + scalar.bool8Value = 1; + mat_mul->add_scalar_param(QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN1, scalar); + + // set transpose_out parameters auto *params_data = reinterpret_cast(kTransposeParamData[rank - 1].data()); const qnn_dimension_array_t param_dims = {(uint32_t)rank, 1, 1, 1}; - transpose0->add_tensor_param(QNN_OP_TRANSPOSE_PARAM_PERM, param_dims, 1, params_data, QNN_DATATYPE_UINT_32, device, - graph_handle); - - // set transpose1 parameters - transpose1->add_tensor_param(QNN_OP_TRANSPOSE_PARAM_PERM, param_dims, 1, params_data, QNN_DATATYPE_UINT_32, device, - graph_handle); - - 
// set tensor to transpose0 - ggml_qnn_tensor_array_t tensors = {tensor_inputs.back()}; - transpose0->set_input_tensors(tensors); - tensors = {src0_trans}; - transpose0->set_output_tensors(tensors); + transpose_out->add_tensor_param(QNN_OP_TRANSPOSE_PARAM_PERM, param_dims, 1, params_data, QNN_DATATYPE_UINT_32, + device, graph_handle); // set tensor to mat_mul - tensors = {tensor_inputs.front(), src0_trans}; - mat_mul->set_input_tensors(tensors); - tensors = {dst_trans}; + mat_mul->set_input_tensors(tensor_inputs); + qnn_tensor_array_t tensors = {dst_trans}; mat_mul->set_output_tensors(tensors); - // set tensor to transpose1 + // set tensor to transpose_out tensors = {dst_trans}; - transpose1->set_input_tensors(tensors); - transpose1->set_output_tensors(tensor_outputs); + transpose_out->set_input_tensors(tensors); + transpose_out->set_output_tensors(tensor_outputs); _mat_mul = mat_mul; - _transpose0 = transpose0; - _transpose1 = transpose1; + _transpose_out = transpose_out; return true; } @@ -460,8 +527,15 @@ bool ggml_qnn_matmul_op_config::add_op_to_graph(Qnn_GraphHandle_t graph_handle) } } - return _transpose0->add_op_to_graph(graph_handle) && _mat_mul->add_op_to_graph(graph_handle) && - _transpose1->add_op_to_graph(graph_handle) && + if (_gather0 && !_gather0->add_op_to_graph(graph_handle)) { + return false; + } + + if (_gather1 && !_gather1->add_op_to_graph(graph_handle)) { + return false; + } + + return _mat_mul->add_op_to_graph(graph_handle) && _transpose_out->add_op_to_graph(graph_handle) && (!_output_convert || _output_convert->add_op_to_graph(graph_handle)); } @@ -473,13 +547,12 @@ bool ggml_qnn_matmul_op_config::bind_output_tensors(const ggml_tensor_array_t &t if (_output_convert) { return _output_convert->bind_output_tensors(tensor_outputs); } else { - return _transpose1->bind_output_tensors(tensor_outputs); + return _transpose_out->bind_output_tensors(tensor_outputs); } } void ggml_qnn_matmul_op_config::unbind_input_tensors() { _mat_mul->unbind_input_tensors(); - _transpose0->unbind_input_tensors(); for (auto &convert : _input_converts) { if (convert) { convert->unbind_input_tensors(); @@ -488,7 +561,7 @@ void ggml_qnn_matmul_op_config::unbind_input_tensors() { } void ggml_qnn_matmul_op_config::unbind_output_tensors() { - _transpose1->unbind_output_tensors(); + _transpose_out->unbind_output_tensors(); if (_output_convert) { _output_convert->unbind_output_tensors(); } @@ -498,7 +571,7 @@ std::vector &ggml_qnn_matmul_op_config::get_qnn_output_tensors() { if (_output_convert) { return _output_convert->get_qnn_output_tensors(); } else { - return _transpose1->get_qnn_output_tensors(); + return _transpose_out->get_qnn_output_tensors(); } } @@ -513,9 +586,9 @@ ggml_op_constructor_t create_op_constructor(const std::string &op_name) { } else if (op_name == QNN_OP_TRANSPOSE) { return [](const std::string &instance_name, std::shared_ptr qnn_instance) -> std::unique_ptr { - return std::make_unique(instance_name, QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_TRANSPOSE, QNN_OP_TRANSPOSE_PARAM_PERM, - QNN_DATATYPE_UINT_32, 4 * sizeof(uint32_t), qnn_instance); + return std::make_unique( + instance_name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_TRANSPOSE, QNN_OP_TRANSPOSE_PARAM_PERM, + QNN_DATATYPE_UINT_32, 4 * sizeof(uint32_t), qnn_instance); }; } diff --git a/ggml/src/ggml-qnn/op-config.hpp b/ggml/src/ggml-qnn/op-config.hpp index 4ec7aac9b256e..27571563309a8 100644 --- a/ggml/src/ggml-qnn/op-config.hpp +++ b/ggml/src/ggml-qnn/op-config.hpp @@ -5,31 +5,13 @@ #include #include -#include "ggml-qnn.h" - 
+#include "op-config-base.hpp" #include "qnn-lib.hpp" #include "qnn-types.hpp" #include "tensor.hpp" namespace qnn { -using ggml_tensor_array_t = std::vector; - -class ggml_qnn_op_config { -public: - virtual ~ggml_qnn_op_config() {} - virtual bool create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle, - const ggml_tensor_array_t &tensor_inputs, - const ggml_tensor_array_t &tensor_outputs) = 0; - virtual std::vector &get_qnn_input_tensors() = 0; - virtual std::vector &get_qnn_output_tensors() = 0; - virtual bool add_op_to_graph(Qnn_GraphHandle_t graph_handle) = 0; - virtual bool bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) = 0; - virtual bool bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) = 0; - virtual void unbind_input_tensors() = 0; - virtual void unbind_output_tensors() = 0; -}; - using ggml_op_constructor_t = std::function(const std::string &, std::shared_ptr)>; @@ -60,9 +42,9 @@ class ggml_qnn_op_config_base : public ggml_qnn_op_config { std::string _package_name; std::string _op_type; std::shared_ptr _qnn_instance; - ggml_qnn_tensor_array_t _tensor_inputs; - ggml_qnn_tensor_array_t _tensor_outputs; - ggml_qnn_tensor_array_t _tensor_parameters; + qnn_tensor_array_t _tensor_inputs; + qnn_tensor_array_t _tensor_outputs; + qnn_tensor_array_t _tensor_parameters; std::vector _qnn_tensor_inputs; std::vector _qnn_tensor_outputs; std::vector _qnn_parameters; @@ -87,8 +69,9 @@ class ggml_qnn_single_op_config : public ggml_qnn_op_config_base { _param_type(param_type), _param_buffer(param_size) {} - bool create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle, const ggml_tensor_array_t &tensor_inputs, - const ggml_tensor_array_t &tensor_outputs) override; + bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, + const ggml_tensor_array_t &tensor_inputs, + const ggml_tensor_array_t &tensor_outputs) override; private: const std::string _param_name; @@ -104,8 +87,9 @@ class ggml_qnn_matmul_op_config : public ggml_qnn_op_config { ggml_qnn_matmul_op_config(const std::string &name, std::shared_ptr qnn_instance) : _name(name), _qnn_instance(qnn_instance) {} - bool create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle, const ggml_tensor_array_t &tensor_inputs, - const ggml_tensor_array_t &tensor_outputs) override; + bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, + const ggml_tensor_array_t &tensor_inputs, + const ggml_tensor_array_t &tensor_outputs) override; bool add_op_to_graph(Qnn_GraphHandle_t graph_handle) override; bool bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) override; bool bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) override; @@ -115,17 +99,22 @@ class ggml_qnn_matmul_op_config : public ggml_qnn_op_config { std::vector &get_qnn_output_tensors() override; private: + qnn_tensor_ptr_t create_gather_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, + qnn_tensor_ptr_t tensor_input, qnn_dimension_array_t output_dimensions); + bool create_convert_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, + qnn_tensor_array_t &tensor_inputs, qnn_tensor_array_t &tensor_outputs); bool create_mat_mul_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, - ggml_qnn_tensor_array_t &tensor_inputs, ggml_qnn_tensor_array_t &tensor_outputs); + qnn_tensor_array_t &tensor_inputs, qnn_tensor_array_t &tensor_outputs); std::string _name; std::shared_ptr _qnn_instance; - std::shared_ptr _transpose0; - 
std::shared_ptr _transpose1; - std::shared_ptr _mat_mul; - std::vector> _input_converts; - std::shared_ptr _output_convert; - ggml_qnn_tensor_array_t _tensor_inputs; + qnn_op_config_ptr_t _transpose_out; + qnn_op_config_ptr_t _mat_mul; + qnn_op_config_ptr_t _gather0; + qnn_op_config_ptr_t _gather1; + std::vector _input_converts; + qnn_op_config_ptr_t _output_convert; + qnn_tensor_array_t _tensor_inputs; std::vector _qnn_tensor_inputs; DISABLE_COPY(ggml_qnn_matmul_op_config); diff --git a/ggml/src/ggml-qnn/qnn-lib.hpp b/ggml/src/ggml-qnn/qnn-lib.hpp index 74bc2b3f95f6b..c6801b7771ee9 100644 --- a/ggml/src/ggml-qnn/qnn-lib.hpp +++ b/ggml/src/ggml-qnn/qnn-lib.hpp @@ -188,8 +188,8 @@ class qnn_instance { public: using BackendIdType = decltype(QnnInterface_t{}.backendId); - explicit qnn_instance(const std::string &lib_path, const std::string &backend_name, const std::string &model_name) : - _lib_path(std::move(lib_path)), _backend_name(std::move(backend_name)), _model_name(std::move(model_name)) {} + explicit qnn_instance(const std::string &lib_path, const std::string &backend_name, const std::string &model_name) + : _lib_path(std::move(lib_path)), _backend_name(std::move(backend_name)), _model_name(std::move(model_name)) {} ~qnn_instance() {} @@ -269,7 +269,7 @@ class qnn_instance { QNN_LOG_INFO("qualcomm soc_model:%d(%s), htp_arch:%d(%s), vtcm_size:%d MB", chipinfo.socModel, qnn::get_chipset_desc(chipinfo.socModel), htp_arch, qnn::get_htparch_desc(htp_arch), chipinfo.vtcmSize); - _soc_info = { chipinfo.socModel, htp_arch, chipinfo.vtcmSize }; + _soc_info = {chipinfo.socModel, htp_arch, chipinfo.vtcmSize}; } _qnn_interface->qnn_device_free_platform_info(nullptr, p_info); @@ -288,7 +288,7 @@ class qnn_instance { arch_devconfig.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; arch_devconfig.customConfig = &arch_customconfig; - const QnnDevice_Config_t *p_deviceconfig[] = { &soc_devconfig, &arch_devconfig, nullptr }; + const QnnDevice_Config_t *p_deviceconfig[] = {&soc_devconfig, &arch_devconfig, nullptr}; qnn_status = _qnn_interface->qnn_device_create(_qnn_log_handle, p_deviceconfig, &_qnn_device_handle); } else { qnn_status = _qnn_interface->qnn_device_create(_qnn_log_handle, nullptr, &_qnn_device_handle); @@ -299,27 +299,17 @@ class qnn_instance { QNN_LOG_INFO("create QNN device successfully\n"); } - if (qnn::sdk_profile_level::profile_off != _profile_level) { + if (_profile_level != sdk_profile_level::profile_off) { QNN_LOG_INFO("profiling turned on; level = %d", _profile_level); - if (qnn::sdk_profile_level::profile_basic == _profile_level) { - QNN_LOG_INFO("basic profiling requested. creating Qnn Profile object\n"); - if (QNN_PROFILE_NO_ERROR != _qnn_interface->qnn_profile_create( - _qnn_backend_handle, QNN_PROFILE_LEVEL_BASIC, &_qnn_profile_handle)) { - QNN_LOG_WARN("unable to create profile handle in the backend\n"); - return 6; - } else { - QNN_LOG_DEBUG("initialize qnn profile successfully\n"); - } - } else if (qnn::sdk_profile_level::profile_detail == _profile_level) { - QNN_LOG_INFO("detailed profiling requested. Creating Qnn Profile object\n"); - if (QNN_PROFILE_NO_ERROR != _qnn_interface->qnn_profile_create(_qnn_backend_handle, - QNN_PROFILE_LEVEL_DETAILED, - &_qnn_profile_handle)) { - QNN_LOG_WARN("unable to create profile handle in the backend\n"); - return 7; - } else { - QNN_LOG_DEBUG("initialize qnn profile successfully\n"); - } + auto profile_level = _profile_level == sdk_profile_level::profile_detail ? 
QNN_PROFILE_LEVEL_DETAILED + : QNN_PROFILE_LEVEL_BASIC; + + if (QNN_PROFILE_NO_ERROR != + _qnn_interface->qnn_profile_create(_qnn_backend_handle, profile_level, &_qnn_profile_handle)) { + QNN_LOG_WARN("unable to create profile handle in the backend\n"); + return 6; + } else { + QNN_LOG_DEBUG("initialize qnn profile successfully\n"); } } @@ -364,7 +354,7 @@ class qnn_instance { size_t candidate_size = 0; uint8_t *rpc_buffer = nullptr; const int size_in_mb = (1 << 20); - size_t probe_slots[] = { 1024, 1536, 2048 - 48, 2048 }; + size_t probe_slots[] = {1024, 1536, 2048 - 48, 2048}; size_t probe_counts = sizeof(probe_slots) / sizeof(size_t); for (size_t idx = 0; idx < probe_counts; idx++) { rpc_buffer = static_cast(alloc_rpcmem(probe_slots[idx] * size_in_mb, sizeof(void *))); @@ -526,13 +516,13 @@ class qnn_instance { // use rpc control latency recommended 100 us, refer hexagon sdk rpc_control_latency.rpcControlLatencyConfig = 100; - const QnnHtpPerfInfrastructure_PowerConfig_t *power_configs[] = { &rpc_polling_time, &rpc_control_latency, - nullptr }; + const QnnHtpPerfInfrastructure_PowerConfig_t *power_configs[] = {&rpc_polling_time, &rpc_control_latency, + nullptr}; Qnn_ErrorHandle_t qnn_status = _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, power_configs); if (qnn_status != QNN_SUCCESS) { QNN_LOG_WARN("set htp perf failed\n"); } else { - QNN_LOG_INFO("set htp perf ok\n"); + QNN_LOG_DEBUG("set htp perf ok\n"); } } else { QNN_LOG_WARN("can't set htp perf\n"); @@ -572,13 +562,13 @@ class qnn_instance { power_config.dcvsV3Config.coreVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; // set power config with different performance parameters - const QnnHtpPerfInfrastructure_PowerConfig_t *power_configs[] = { &power_config, nullptr }; + const QnnHtpPerfInfrastructure_PowerConfig_t *power_configs[] = {&power_config, nullptr}; Qnn_ErrorHandle_t qnn_status = QNN_SUCCESS; qnn_status = _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, power_configs); if (qnn_status != QNN_SUCCESS) { QNN_LOG_WARN("set htp high performance mode failed\n"); } else { - QNN_LOG_INFO("set htp high performance mode ok\n"); + QNN_LOG_DEBUG("set htp high performance mode ok\n"); } return 0; @@ -659,8 +649,8 @@ class qnn_instance { return nullptr; } - QNN_LOG_INFO("mem_fd %d\n", mem_fd); - Qnn_MemDescriptor_t descriptor = { { rank, dimensions, nullptr }, data_type, QNN_MEM_TYPE_ION, { { mem_fd } } }; + QNN_LOG_DEBUG("mem_fd %d\n", mem_fd); + Qnn_MemDescriptor_t descriptor = {{rank, dimensions, nullptr}, data_type, QNN_MEM_TYPE_ION, {{mem_fd}}}; Qnn_MemHandle_t handle = nullptr; auto error = _qnn_interface->qnn_mem_register(_qnn_context_handle, &descriptor, /*numDescriptors=*/1, &handle); @@ -670,8 +660,8 @@ class qnn_instance { return nullptr; } - _qnn_rpc_buffer_to_handles.insert({ p_data, handle }); - QNN_LOG_INFO("successfully register shared memory handler: %p\n", handle); + _qnn_rpc_buffer_to_handles.insert({p_data, handle}); + QNN_LOG_DEBUG("successfully register shared memory handler: %p\n", handle); return handle; } @@ -748,7 +738,7 @@ class qnn_instance { QNN_LOG_WARN("unable to find a valid qnn system interface\n"); return 6; } else { - QNN_LOG_INFO("find a valid qnn system interface\n"); + QNN_LOG_DEBUG("find a valid qnn system interface\n"); } auto qnn_sys_interface = std::make_shared(*provider_list[0], system_lib_handle); @@ -810,7 +800,7 @@ class qnn_instance { QNN_LOG_WARN("unable to find a valid qnn interface\n"); return 6; } else { - QNN_LOG_INFO("find a valid qnn interface\n"); + 
QNN_LOG_DEBUG("find a valid qnn interface\n"); } BackendIdType backend_id = provider_list[0]->backendId; @@ -890,7 +880,7 @@ class qnn_instance { std::unordered_map _loaded_backend; dl_handler_t _rpc_lib_handle = nullptr; - std::atomic_bool _rpcmem_initialized{ false }; + std::atomic_bool _rpcmem_initialized{false}; qnn::pfn_rpc_mem_alloc _pfn_rpc_mem_alloc; qnn::pfn_rpc_mem_free _pfn_rpc_mem_free; qnn::pfn_rpc_mem_to_fd _pfn_rpc_mem_to_fd; diff --git a/ggml/src/ggml-qnn/qnn-types.hpp b/ggml/src/ggml-qnn/qnn-types.hpp index 8fce790defb61..7461ac3012755 100644 --- a/ggml/src/ggml-qnn/qnn-types.hpp +++ b/ggml/src/ggml-qnn/qnn-types.hpp @@ -14,7 +14,7 @@ namespace qnn { // Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK // ref:https://github.com/pytorch/executorch/tree/main/backends/qualcomm // ================================================================================================= -enum sdk_profile_level { profile_off = 0, profile_basic = 1, profile_detail = 2 }; +enum sdk_profile_level { profile_off = 0, profile_basic, profile_detail }; enum qcom_htp_arch { NONE = 0, diff --git a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp index f28fc8e2ca1e2..0a9a367015127 100644 --- a/ggml/src/ggml-qnn/tensor.hpp +++ b/ggml/src/ggml-qnn/tensor.hpp @@ -27,8 +27,8 @@ class ggml_qnn_tensor { explicit ggml_qnn_tensor(tensor_type_t tensor_type, const std::string &name, const qnn_dimension_array_t &dimensions, Qnn_DataType_t data_type, int rank, QNNBackend device, Qnn_GraphHandle_t graph_handle, - std::shared_ptr qnn_instance) : - _tensor_name(name), _device(device), _qnn_instance(qnn_instance), _graph_handle(graph_handle) { + std::shared_ptr qnn_instance) + : _tensor_name(name), _device(device), _qnn_instance(qnn_instance), _graph_handle(graph_handle) { if (!_tensor_name.empty()) { QNN_TENSOR_SET_NAME(_qnn_tensor, _tensor_name.c_str()); } @@ -37,23 +37,35 @@ class ggml_qnn_tensor { _dimensions = dimensions; update_params_from_ggml_tensor(tensor_type, data_type, rank); - QNN_LOG_DEBUG("create tensor %s, rank: %d, dims: [%d, %d, %d, %d], data_type: %d, device: %d", + QNN_LOG_DEBUG("[%s][%s]created, rank: %d, dims: [%d, %d, %d, %d], type: %s", get_backend_name(device), _tensor_name.c_str(), rank, (int)_dimensions[0], (int)_dimensions[1], (int)_dimensions[2], - (int)_dimensions[3], (int)data_type, (int)device); + (int)_dimensions[3], qnn_datatype_to_string(data_type)); } explicit ggml_qnn_tensor(tensor_type_t tensor_type, const std::string &name, const ggml_dimension_array_t &dimensions, ggml_type data_type, int rank, QNNBackend device, - Qnn_GraphHandle_t graph_handle, std::shared_ptr qnn_instance) : - ggml_qnn_tensor(tensor_type, name, get_internal_dimension(dimensions, rank), - qnn_datatype_from_ggml_datatype(data_type), rank, device, graph_handle, qnn_instance) {} + Qnn_GraphHandle_t graph_handle, std::shared_ptr qnn_instance) + : ggml_qnn_tensor(tensor_type, name, get_internal_dimension(dimensions, rank), + qnn_datatype_from_ggml_datatype(data_type), rank, device, graph_handle, qnn_instance) {} + + ~ggml_qnn_tensor() { + _buffer_storage.clear(); + unbind(); + _qnn_rpc_buffer.reset(); + } + + bool set_data_buffer(std::vector &&buffer) { + if (!bind_buffer_impl(buffer.data(), buffer.size())) { + return false; + } - ~ggml_qnn_tensor() { _qnn_rpc_buffer.reset(); } + _buffer_storage = std::move(buffer); + return true; + } bool alloc_qnn_tensor_id() { if (QNN_TENSOR_GET_ID(_qnn_tensor)) { - QNN_LOG_WARN("graph tensor %s already created, id %d", 
_tensor_name.c_str(), - QNN_TENSOR_GET_ID(_qnn_tensor)); + QNN_LOG_DEBUG("[%s]tensor already has a id: %d", _tensor_name.c_str(), QNN_TENSOR_GET_ID(_qnn_tensor)); return true; } @@ -61,30 +73,90 @@ class ggml_qnn_tensor { auto qnn_interface = _qnn_instance->get_qnn_interface(); auto error = qnn_interface->qnn_tensor_create_graph_tensor(_graph_handle, &qnn_tensor); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("create graph tensor failed, tensor %s, error: %d\n", _tensor_name.c_str(), error); + QNN_LOG_WARN("[%s]allocate id failed , error: %d\n", _tensor_name.c_str(), error); return false; } QNN_TENSOR_SET_ID(_qnn_tensor, QNN_TENSOR_GET_ID(qnn_tensor)); - QNN_LOG_DEBUG("create graph tensor %s, id: %d, rank: %d", _tensor_name.c_str(), QNN_TENSOR_GET_ID(qnn_tensor), - QNN_TENSOR_GET_RANK(qnn_tensor)); - + QNN_LOG_DEBUG("[%s][%s]allocated id: %d, rank: %d", get_backend_name(_device), _tensor_name.c_str(), + QNN_TENSOR_GET_ID(qnn_tensor), QNN_TENSOR_GET_RANK(qnn_tensor)); return true; } bool bind_buffer(uint8_t *buffer, const size_t buffer_size) { + if (!_buffer_storage.empty()) { + QNN_LOG_DEBUG("[%s]already has buffer storage, skip bind", _tensor_name.c_str()); + return true; + } + + return bind_buffer_impl(buffer, buffer_size); + } + + bool bind_ggml_tensor(ggml_tensor *tensor) { + if (!bind_buffer(reinterpret_cast(tensor->data), ggml_nbytes(tensor))) { + QNN_LOG_WARN("[%s]failed to bind ggml tensor(%s)", _tensor_name.c_str(), ggml_get_name(tensor)); + return false; + } + + QNN_LOG_DEBUG("[%s][%s]bind to ggml tensor(%s)", get_backend_name(_device), _tensor_name.c_str(), + ggml_get_name(tensor)); + return true; + } + + bool unbind() { + if (!_graph_handle) { + QNN_LOG_WARN("[%s]not bound to any graph", _tensor_name.c_str()); + return false; + } + + if (!_buffer) { + QNN_LOG_DEBUG("[%s]bound to ggml tensor", _tensor_name.c_str()); + return true; + } + + if (!read_from_qnn_tensor()) { + QNN_LOG_WARN("[%s]read from qnn tensor failed", _tensor_name.c_str()); + return false; + } + + if (!_buffer_storage.empty()) { + QNN_LOG_DEBUG("[%s]already has buffer storage, stop unbind", _tensor_name.c_str()); + return true; + } + + if (!should_use_mem_handle()) { + QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW); + Qnn_ClientBuffer_t client_buf = {}; + QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf); + QNN_LOG_DEBUG("[%s]clear client buffer", _tensor_name.c_str()); + } + + QNN_LOG_DEBUG("[%s][%s]unbind from buffer: %p, size: %d", get_backend_name(_device), _tensor_name.c_str(), + _buffer, (int)_buffer_size); + _buffer = nullptr; + _buffer_size = 0; + return true; + } + + const Qnn_Tensor_t &get_qnn_tensor() const { return _qnn_tensor; } + Qnn_DataType_t get_data_type() const { return QNN_TENSOR_GET_DATA_TYPE(_qnn_tensor); } + const qnn_dimension_array_t &get_dimensions() const { return _dimensions; } + uint32_t get_qnn_tensor_id() const { return QNN_TENSOR_GET_ID(_qnn_tensor); } + +private: + bool bind_buffer_impl(uint8_t *buffer, const size_t buffer_size) { if (_buffer) { if (_buffer != buffer) { - QNN_LOG_WARN("tensor %s has been bound to another buffer %p", _tensor_name.c_str(), _buffer); + QNN_LOG_WARN("[%s]has been bound to another buffer %p", _tensor_name.c_str(), _buffer); return false; } - QNN_LOG_INFO("tensor %s already bound to same ggml tensor %p", _tensor_name.c_str(), _buffer); + QNN_LOG_DEBUG("[%s]already bound to same ggml tensor %p", _tensor_name.c_str(), _buffer); return true; } if (QNN_TENSOR_GET_TYPE(_qnn_tensor) == QNN_TENSOR_TYPE_NATIVE) { - QNN_LOG_DEBUG("tensor %s type(%d) 
not READ/WRITE, skipping", _tensor_name.c_str(), + QNN_LOG_DEBUG("[%s]tensor type(%d) not READ/WRITE, skipping", _tensor_name.c_str(), (int)QNN_TENSOR_TYPE_NATIVE); return true; } @@ -95,7 +167,7 @@ class ggml_qnn_tensor { _qnn_instance, buffer_size, QNN_TENSOR_GET_RANK(_qnn_tensor), QNN_TENSOR_GET_DIMENSIONS(_qnn_tensor), QNN_TENSOR_GET_DATA_TYPE(_qnn_tensor)); if (!qnn_rpc_buffer->is_valid()) { - QNN_LOG_WARN("alloc rpc mem failed, tensor %s", _tensor_name.c_str()); + QNN_LOG_WARN("[%s]alloc rpc mem failed", _tensor_name.c_str()); return false; } @@ -104,12 +176,12 @@ class ggml_qnn_tensor { QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_MEMHANDLE); QNN_TENSOR_SET_MEM_HANDLE(_qnn_tensor, _qnn_rpc_buffer->get_mem_handle()); - QNN_LOG_DEBUG("tensor %s, use mem handle %p", _tensor_name.c_str(), QNN_TENSOR_GET_MEM_HANDLE(_qnn_tensor)); + QNN_LOG_DEBUG("[%s]use mem handle %p", _tensor_name.c_str(), QNN_TENSOR_GET_MEM_HANDLE(_qnn_tensor)); } else { QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW); - Qnn_ClientBuffer_t client_buf = { buffer, (uint32_t)buffer_size }; + Qnn_ClientBuffer_t client_buf = {buffer, (uint32_t)buffer_size}; QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf); - QNN_LOG_DEBUG("tensor %s, use client buffer %p size %d", _tensor_name.c_str(), client_buf.data, + QNN_LOG_DEBUG("[%s]use client buffer %p size %d", _tensor_name.c_str(), client_buf.data, (int)client_buf.dataSize); } @@ -117,62 +189,19 @@ class ggml_qnn_tensor { _buffer_size = buffer_size; if (!write_to_qnn_tensor()) { - QNN_LOG_WARN("write to qnn tensor failed, tensor %s", _tensor_name.c_str()); - return false; - } - - QNN_LOG_DEBUG("bind tensor %s to buffer: %p, size: %d", _tensor_name.c_str(), buffer, (int)buffer_size); - return true; - } - - bool bind_ggml_tensor(ggml_tensor *tensor) { - if (!bind_buffer(reinterpret_cast(tensor->data), ggml_nbytes(tensor))) { - QNN_LOG_WARN("Failed to bind tensor: %s to ggml tensor: %s", _tensor_name.c_str(), ggml_get_name(tensor)); - return false; - } - - QNN_LOG_DEBUG("Bind tensor %s to ggml tensor %s", _tensor_name.c_str(), ggml_get_name(tensor)); - return true; - } - - bool unbind() { - if (!_graph_handle) { - QNN_LOG_WARN("tensor %s not bound to any graph", _tensor_name.c_str()); + QNN_LOG_WARN("[%s]write to qnn tensor failed", _tensor_name.c_str()); return false; } - if (!_buffer) { - QNN_LOG_DEBUG("tensor %s not bound to ggml tensor", _tensor_name.c_str()); - return true; - } - - if (!read_from_qnn_tensor()) { - QNN_LOG_WARN("read from qnn tensor failed, tensor %s", _tensor_name.c_str()); - return false; - } - - if (!should_use_mem_handle()) { - QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW); - Qnn_ClientBuffer_t client_buf = {}; - QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf); - QNN_LOG_DEBUG("tensor %s, clear client buffer", _tensor_name.c_str()); - } - - QNN_LOG_DEBUG("unbind tensor: %s from buffer: %p, size: %d", _tensor_name.c_str(), _buffer, (int)_buffer_size); - _buffer = nullptr; - _buffer_size = 0; + QNN_LOG_DEBUG("[%s][%s]bind to buffer: %p, size: %d", get_backend_name(_device), _tensor_name.c_str(), buffer, + (int)buffer_size); return true; } - const Qnn_Tensor_t &get_qnn_tensor() const { return _qnn_tensor; } - Qnn_DataType_t get_data_type() const { return QNN_TENSOR_GET_DATA_TYPE(_qnn_tensor); } - const qnn_dimension_array_t &get_dimensions() const { return _dimensions; } - -private: bool write_to_qnn_tensor() { auto tensor_type = QNN_TENSOR_GET_TYPE(_qnn_tensor); if (tensor_type != QNN_TENSOR_TYPE_APP_WRITE && 
tensor_type != QNN_TENSOR_TYPE_APP_READWRITE) { - QNN_LOG_DEBUG("tensor %s type(%d) not WRITE", _tensor_name.c_str(), (int)tensor_type); + QNN_LOG_DEBUG("[%s]tensor type(%d) not WRITE", _tensor_name.c_str(), (int)tensor_type); return true; } @@ -180,20 +209,20 @@ class ggml_qnn_tensor { if (_qnn_rpc_buffer) { memcpy(_qnn_rpc_buffer->get_buffer(), _buffer, _buffer_size); } else { - QNN_LOG_WARN("tensor %s: can't find rpcmem from qnn mem handle\n", _tensor_name.c_str()); + QNN_LOG_WARN("[%s]can't find rpcmem from qnn mem handle\n", _tensor_name.c_str()); return false; } } // For CPU and GPU, the data is already in the tensor. - QNN_LOG_DEBUG("write tensor %s to qnn", _tensor_name.c_str()); + QNN_LOG_DEBUG("[%s][%s]write tensor to qnn", get_backend_name(_device), _tensor_name.c_str()); return true; } bool read_from_qnn_tensor() { auto tensor_type = QNN_TENSOR_GET_TYPE(_qnn_tensor); if (tensor_type != QNN_TENSOR_TYPE_APP_READ && tensor_type != QNN_TENSOR_TYPE_APP_READWRITE) { - QNN_LOG_DEBUG("tensor %s type(%d) not READ", _tensor_name.c_str(), (int)tensor_type); + QNN_LOG_DEBUG("[%s]tensor type(%d) not READ", _tensor_name.c_str(), (int)tensor_type); return true; } @@ -201,13 +230,13 @@ class ggml_qnn_tensor { if (_qnn_rpc_buffer) { memcpy(_buffer, _qnn_rpc_buffer->get_buffer(), _buffer_size); } else { - QNN_LOG_WARN("can't find rpcmem from qnn mem handle\n"); + QNN_LOG_WARN("[%s]can't find rpcmem from qnn mem handle", _tensor_name.c_str()); return false; } } // For CPU and GPU, the data is already in the tensor. - QNN_LOG_DEBUG("read tensor %s from qnn", _tensor_name.c_str()); + QNN_LOG_DEBUG("[%s][%s]read tensor from qnn", get_backend_name(_device), _tensor_name.c_str()); return true; } @@ -231,12 +260,14 @@ class ggml_qnn_tensor { case PARAMETER: new_tensor_type = QNN_TENSOR_TYPE_STATIC; break; + case INTERMEDIATE: default: new_tensor_type = QNN_TENSOR_TYPE_NATIVE; break; } QNN_TENSOR_SET_TYPE(_qnn_tensor, new_tensor_type); - QNN_LOG_INFO("tensor %s changed to type %d", _tensor_name.c_str(), new_tensor_type); + QNN_LOG_DEBUG("[%s][%s]tensor changed to type %d", get_backend_name(_device), _tensor_name.c_str(), + new_tensor_type); } bool should_use_mem_handle() const { @@ -246,6 +277,7 @@ class ggml_qnn_tensor { std::string _tensor_name; uint8_t *_buffer = nullptr; size_t _buffer_size = 0; + std::vector _buffer_storage; QNNBackend _device; std::shared_ptr _qnn_instance; Qnn_Tensor_t _qnn_tensor = qnn_tensor_init(kDefaultQnnTensorVersion); @@ -257,7 +289,7 @@ class ggml_qnn_tensor { DISABLE_MOVE(ggml_qnn_tensor); }; -using ggml_qnn_tensor_ptr_t = std::shared_ptr; -using ggml_qnn_tensor_array_t = std::vector>; +using qnn_tensor_ptr_t = std::shared_ptr; +using qnn_tensor_array_t = std::vector; } // namespace qnn diff --git a/ggml/src/ggml-qnn/utils.cpp b/ggml/src/ggml-qnn/utils.cpp index 8ae375ffc8afc..ebfc0372375fd 100644 --- a/ggml/src/ggml-qnn/utils.cpp +++ b/ggml/src/ggml-qnn/utils.cpp @@ -5,9 +5,11 @@ #include "ggml-qnn.h" +#include "QnnGraph.h" #include "qnn-types.hpp" #ifdef __linux__ +#include #include #endif @@ -148,11 +150,11 @@ const char *get_ggml_type_name(ggml_type type) { const char *get_backend_name(QNNBackend device_index) { switch (device_index) { case QNN_BACKEND_CPU: - return "QNN-CPU"; + return "qnn-cpu"; case QNN_BACKEND_GPU: - return "QNN-GPU"; + return "qnn-gpu"; case QNN_BACKEND_NPU: - return "QNN-NPU"; + return "qnn-npu"; case QNN_BACKEND_COUNT: default: return "unknown"; @@ -195,18 +197,7 @@ intptr_t align_to(size_t alignment, intptr_t offset) { : offset + 
(static_cast(alignment) - (offset % static_cast(alignment))); } -uint32_t get_ggml_tensor_data_size(const ggml_tensor *tensor) { - /* - size_t data_size = ggml_row_size(tensor->type, tensor->ne[0]); - size_t n_dims = qnn_get_ggml_tensor_rank(tensor); - for (int i = 1; i < n_dims; i++) { - data_size *= tensor->ne[i]; - } - - return data_size; - */ - return ggml_nbytes(tensor); -} +uint32_t get_ggml_tensor_data_size(const ggml_tensor *tensor) { return ggml_nbytes(tensor); } void *align_alloc(size_t alignment, size_t size) { size_t size_aligned = size; @@ -248,6 +239,7 @@ const char *opname_from_ggmlop(enum ggml_op ggmlop) { const char *get_qnn_error_string(Qnn_ErrorHandle_t error) { // A complete list of error codes can be found at here: // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/api_error_codes.html + thread_local static char error_code[128] = {}; switch (error) { case QNN_SUCCESS: return "QNN_SUCCESS"; @@ -277,6 +269,36 @@ const char *get_qnn_error_string(Qnn_ErrorHandle_t error) { return "QNN_GRAPH_ERROR_UNCONNECTED_NODE"; case QNN_GRAPH_ERROR_CREATE_FAILED: return "QNN_GRAPH_ERROR_CREATE_FAILED"; + case QNN_GRAPH_ERROR_OPTIMIZATION_FAILED: + return "QNN_GRAPH_ERROR_OPTIMIZATION_FAILED"; + case QNN_GRAPH_ERROR_FINALIZE_FAILED: + return "QNN_GRAPH_ERROR_FINALIZE_FAILED"; + case QNN_GRAPH_ERROR_GRAPH_NOT_FINALIZED: + return "QNN_GRAPH_ERROR_GRAPH_NOT_FINALIZED"; + case QNN_GRAPH_ERROR_GRAPH_FINALIZED: + return "QNN_GRAPH_ERROR_GRAPH_FINALIZED"; + case QNN_GRAPH_ERROR_EXECUTION_ASYNC_FIFO_FULL: + return "QNN_GRAPH_ERROR_EXECUTION_ASYNC_FIFO_FULL"; + case QNN_GRAPH_ERROR_SIGNAL_IN_USE: + return "QNN_GRAPH_ERROR_SIGNAL_IN_USE"; + case QNN_GRAPH_ERROR_ABORTED: + return "QNN_GRAPH_ERROR_ABORTED"; + case QNN_GRAPH_ERROR_PROFILE_IN_USE: + return "QNN_GRAPH_ERROR_PROFILE_IN_USE"; + case QNN_GRAPH_ERROR_TIMED_OUT: + return "QNN_GRAPH_ERROR_TIMED_OUT"; + case QNN_GRAPH_ERROR_SUBGRAPH: + return "QNN_GRAPH_ERROR_SUBGRAPH"; + case QNN_GRAPH_ERROR_DISABLED: + return "QNN_GRAPH_ERROR_DISABLED"; + case QNN_GRAPH_ERROR_DYNAMIC_TENSOR_SHAPE: + return "QNN_GRAPH_ERROR_DYNAMIC_TENSOR_SHAPE"; + case QNN_GRAPH_ERROR_TENSOR_SPARSITY: + return "QNN_GRAPH_ERROR_TENSOR_SPARSITY"; + case QNN_GRAPH_ERROR_EARLY_TERMINATION: + return "QNN_GRAPH_ERROR_EARLY_TERMINATION"; + case QNN_GRAPH_ERROR_INVALID_CONTEXT: + return "QNN_GRAPH_ERROR_INVALID_CONTEXT"; // QnnOpPackage_Error_t case QNN_OP_PACKAGE_ERROR_LIBRARY_ALREADY_INITIALIZED: @@ -294,19 +316,34 @@ const char *get_qnn_error_string(Qnn_ErrorHandle_t error) { case QNN_OP_PACKAGE_ERROR_INVALID_ARGUMENT: return "QNN_OP_PACKAGE_ERROR_INVALID_ARGUMENT"; default: - return nullptr; + if (error >= QNN_GRAPH_MIN_ERROR && error < QNN_GRAPH_MAX_ERROR) { + snprintf(error_code, sizeof(error_code), "UNKNOWN_GRAPH_ERROR_%d", int(error - QNN_GRAPH_MIN_ERROR)); + } else { + snprintf(error_code, sizeof(error_code), "%d", int(error)); + } + return error_code; } } #ifdef __linux__ size_t get_system_total_memory_in_bytes() { + struct sysinfo info = {}; + if (sysinfo(&info) == 0) { + return (info.totalram + info.totalswap) * info.mem_unit; + } + auto pages = (size_t)sysconf(_SC_PHYS_PAGES); auto page_size = (size_t)sysconf(_SC_PAGE_SIZE); return pages * page_size; } size_t get_system_free_memory_in_bytes() { + struct sysinfo info = {}; + if (sysinfo(&info) == 0) { + return (info.freeram + info.freeswap) * info.mem_unit; + } + auto avail_pages = (size_t)sysconf(_SC_AVPHYS_PAGES); auto page_size = (size_t)sysconf(_SC_PAGE_SIZE); return avail_pages * page_size; From 
5103b166badb497fbddaf9de8b07cf1bacd83ff7 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 29 Nov 2024 14:19:34 +0800 Subject: [PATCH 130/143] bugfix: block large tensor calc in npu --- ggml/src/ggml-qnn/backend-ops.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index da0480df7fd9f..30930be422496 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -566,6 +566,10 @@ bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context *ctx, const ggml_t } bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor *op) { + constexpr const auto get_tensor_size = [](const ggml_tensor *tensor) -> size_t { + return tensor->ne[0] * tensor->ne[1] * tensor->ne[2] * tensor->ne[3]; + }; + auto *src0 = op->src[0]; auto *src1 = op->src[1]; switch (ctx->device) { @@ -578,6 +582,11 @@ bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context *ctx, const ggm QNN_LOG_DEBUG("[qnn-npu] src0 and src1 dimensions are not equal, support/unsupported: %d/%d", ctx->support_op_count.load(), ++(ctx->unsupported_op_count)); return false; + } else if (get_tensor_size(src0) + get_tensor_size(src1) + get_tensor_size(op) >= + (8192 * 2048 + 8192 * 512 + 2048 * 512)) { + QNN_LOG_DEBUG("[qnn-npu] tensor size is too large, support/unsupported: %d/%d", + ctx->support_op_count.load(), ++(ctx->unsupported_op_count)); + return false; } // fall through, from test here, the convert op is super slow on NPU: // https://github.com/usefulsensors/qc_npu_benchmark From 6d4feae5791038d9415a8538bc8083f11e72875e Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 29 Nov 2024 16:51:06 +0800 Subject: [PATCH 131/143] redo conflict changes --- ggml/CMakeLists.txt | 3 ++- ggml/src/CMakeLists.txt | 1 + ggml/src/ggml-backend-reg.cpp | 11 +++++++++ ggml/src/ggml-qnn/CMakeLists.txt | 34 ++++++++++++++++++++++++++++ ggml/src/{ => ggml-qnn}/ggml-qnn.cpp | 0 5 files changed, 48 insertions(+), 1 deletion(-) create mode 100644 ggml/src/ggml-qnn/CMakeLists.txt rename ggml/src/{ => ggml-qnn}/ggml-qnn.cpp (100%) diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index d2c377dcb4fd5..b0dca348f7ed6 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -232,7 +232,8 @@ set(GGML_PUBLIC_HEADERS include/ggml-metal.h include/ggml-rpc.h include/ggml-sycl.h - include/ggml-vulkan.h) + include/ggml-vulkan.h + include/ggml-qnn.h) set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}") #if (GGML_METAL) diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 9022aa3ae197d..f6db35571bb0b 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -286,6 +286,7 @@ ggml_add_backend(MUSA) ggml_add_backend(RPC) ggml_add_backend(SYCL) ggml_add_backend(Vulkan) +ggml_add_backend(QNN) foreach (target ggml-base ggml) target_include_directories(${target} PUBLIC $ $) diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp index a0e0e2c5852f7..8dc267c2dbaad 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp @@ -61,6 +61,14 @@ #include "ggml-kompute.h" #endif +#ifdef GGML_USE_KOMPUTE +#include "ggml-kompute.h" +#endif + +#ifdef GGML_USE_QNN +#include "ggml-qnn.h" +#endif + struct ggml_backend_reg_entry { ggml_backend_reg_t reg; void * handle; @@ -98,6 +106,9 @@ struct ggml_backend_registry { #ifdef GGML_USE_KOMPUTE register_backend(ggml_backend_kompute_reg()); #endif +#ifdef GGML_USE_QNN + 
register_backend(ggml_backend_qnn_reg()); +#endif #ifdef GGML_USE_CPU register_backend(ggml_backend_cpu_reg()); #endif diff --git a/ggml/src/ggml-qnn/CMakeLists.txt b/ggml/src/ggml-qnn/CMakeLists.txt new file mode 100644 index 0000000000000..af60de67d38e0 --- /dev/null +++ b/ggml/src/ggml-qnn/CMakeLists.txt @@ -0,0 +1,34 @@ +message(STATUS "Using QNN backend") + +if(CMAKE_SYSTEM_NAME STREQUAL "Android") + find_library(LOG_LIB log) + find_library(ANDROID_LIB android) + set(QNN_LINK_LIBRARIES ${LOG_LIB} ${ANDROID_LIB}) + set(QNN_DEFAULT_LIB_SEARCH_PATH "/data/local/tmp/" CACHE STRING "customized library search path for QNN backend") +else() + message(FATAL_ERROR "QNN now only available on Android") +endif() + +if(NOT DEFINED GGML_QNN_SDK_PATH) + # try read from environment variable + if(DEFINED ENV{QNN_SDK_PATH}) + set(GGML_QNN_SDK_PATH $ENV{QNN_SDK_PATH}) + else() + message(FATAL_ERROR "GGML_QNN_SDK_PATH not defined") + endif() +endif() + +message("QNN_SDK_PATH: ${GGML_QNN_SDK_PATH}") + +string(REGEX REPLACE "/$" "" GGML_QNN_DEFAULT_LIB_SEARCH_PATH "${QNN_DEFAULT_LIB_SEARCH_PATH}") +add_compile_definitions(GGML_QNN_DEFAULT_LIB_SEARCH_PATH="${QNN_DEFAULT_LIB_SEARCH_PATH}/") + +set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3") + +file(GLOB QNN_SOURCES "${CMAKE_CURRENT_LIST_DIR}/*.cpp") +ggml_add_backend_library(ggml-qnn + ${QNN_SOURCES} +) + +target_include_directories(ggml-qnn PRIVATE ${GGML_QNN_SDK_PATH}/include/QNN ${CMAKE_CURRENT_LIST_DIR}) +target_link_libraries(ggml-qnn PRIVATE ${QNN_LINK_LIBRARIES}) diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp similarity index 100% rename from ggml/src/ggml-qnn.cpp rename to ggml/src/ggml-qnn/ggml-qnn.cpp From 09efaa389e3525e4a972b4390c7f2c5ec36ae5e2 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 29 Nov 2024 17:24:05 +0800 Subject: [PATCH 132/143] define compile flag as module private --- ggml/src/ggml-qnn/CMakeLists.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-qnn/CMakeLists.txt b/ggml/src/ggml-qnn/CMakeLists.txt index af60de67d38e0..b8d84d078e082 100644 --- a/ggml/src/ggml-qnn/CMakeLists.txt +++ b/ggml/src/ggml-qnn/CMakeLists.txt @@ -20,9 +20,6 @@ endif() message("QNN_SDK_PATH: ${GGML_QNN_SDK_PATH}") -string(REGEX REPLACE "/$" "" GGML_QNN_DEFAULT_LIB_SEARCH_PATH "${QNN_DEFAULT_LIB_SEARCH_PATH}") -add_compile_definitions(GGML_QNN_DEFAULT_LIB_SEARCH_PATH="${QNN_DEFAULT_LIB_SEARCH_PATH}/") - set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3") file(GLOB QNN_SOURCES "${CMAKE_CURRENT_LIST_DIR}/*.cpp") @@ -32,3 +29,6 @@ ggml_add_backend_library(ggml-qnn target_include_directories(ggml-qnn PRIVATE ${GGML_QNN_SDK_PATH}/include/QNN ${CMAKE_CURRENT_LIST_DIR}) target_link_libraries(ggml-qnn PRIVATE ${QNN_LINK_LIBRARIES}) + +string(REGEX REPLACE "/$" "" GGML_QNN_DEFAULT_LIB_SEARCH_PATH "${QNN_DEFAULT_LIB_SEARCH_PATH}") +target_compile_definitions(ggml-qnn PRIVATE GGML_QNN_DEFAULT_LIB_SEARCH_PATH="${QNN_DEFAULT_LIB_SEARCH_PATH}/") From c5e65493318cdcbe03b726723fa9b4cd86c74d35 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 29 Nov 2024 23:37:52 +0800 Subject: [PATCH 133/143] fix: fix assertion --- ggml/src/ggml-qnn/backend-ops.cpp | 2 ++ ggml/src/ggml-qnn/ggml-qnn.cpp | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 30930be422496..0e73cce668e83 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -666,6 +666,8 @@ bool 
ggml_qnn_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tenso case GGML_OP_MUL_MAT: return ggml_qnn_supports_matmul_op(ctx, op); + case GGML_OP_VIEW: + return true; default: return false; } diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index a4dace7078d3b..3bc91a061212f 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -227,7 +227,7 @@ size_t ggml_backend_qnn_buffer_type_get_alignment(ggml_backend_buffer_type_t buf size_t ggml_backend_qnn_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { GGML_UNUSED(buft); // TODO: get the max size from device - return (1024 * 1024 * 1024); + return (2 * 1024 * 1024 * 1024); } bool ggml_backend_qnn_buffer_is_host(ggml_backend_buffer_type_t buft) { From 0d02ee09edafacbffa6630b67d68f1f27664e37d Mon Sep 17 00:00:00 2001 From: hongruichen Date: Tue, 3 Dec 2024 10:52:49 +0800 Subject: [PATCH 134/143] fix int overflow and remove view op to pass unit test --- ggml/src/ggml-qnn/backend-ops.cpp | 2 -- ggml/src/ggml-qnn/ggml-qnn.cpp | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 0e73cce668e83..30930be422496 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -666,8 +666,6 @@ bool ggml_qnn_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tenso case GGML_OP_MUL_MAT: return ggml_qnn_supports_matmul_op(ctx, op); - case GGML_OP_VIEW: - return true; default: return false; } diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index 3bc91a061212f..c57692b867bc7 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -227,7 +227,7 @@ size_t ggml_backend_qnn_buffer_type_get_alignment(ggml_backend_buffer_type_t buf size_t ggml_backend_qnn_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { GGML_UNUSED(buft); // TODO: get the max size from device - return (2 * 1024 * 1024 * 1024); + return 1024 * 1024 * 1024; } bool ggml_backend_qnn_buffer_is_host(ggml_backend_buffer_type_t buft) { From e36ad89528a0276331e3c22f153d6837c353c5cf Mon Sep 17 00:00:00 2001 From: nullname Date: Wed, 11 Dec 2024 10:42:00 +0800 Subject: [PATCH 135/143] bugfix: error pre-allocated tensor (k_cache_view-0) (#12) * fix device binding at ggml_backend_qnn_buffer_type * merge ggml_backend_qnn_buffer_context and qnn_mem_buffer * wip * add log * wip * add qnn_buffer_ptr * remove tailing `\n` at log * add log * enable GGML_OP_NONE * wip * wip * disable tensor with view * wip * wip * more log for view tensor * re-enable view * wip * remove link android lib * set dimension at bind function * move graph traversal to backend-ops * wip * add get_view_internal_dimension to obtain the tensor view source dimension * use _view_source_dimensions to allocate qnn tensor * add place holder function ggml_backend_qnn_cpy_tensor_async * add ggml_qnn_aggregate_op_config * make matmul based on ggml_qnn_aggregate_op_config * wip * manually specify the order of op destruct * skip register qnn-cpu backend * disable view op again * remove _view_source_dimensions * add nop for reshape and view ops * add log * add comment --- ggml/src/ggml-qnn/CMakeLists.txt | 3 +- ggml/src/ggml-qnn/backend-ops.cpp | 151 ++++++++++++---------- ggml/src/ggml-qnn/backend-ops.hpp | 4 +- ggml/src/ggml-qnn/buffer.hpp | 84 ++++++++++--- ggml/src/ggml-qnn/ggml-qnn.cpp | 202 +++++++++++++----------------- ggml/src/ggml-qnn/graph.hpp | 14 +-- 
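// A short illustration of the integer overflow the "fix int overflow" commit above
// addresses: every operand of 2 * 1024 * 1024 * 1024 is a 32-bit int, so the product
// (2^31) overflows before the implicit conversion to size_t, whereas the 1 GiB value
// (2^30) still fits; a later hunk in this series adds an L suffix so the arithmetic is
// carried out in a wider type from the start. Assumes a platform with 32-bit int.
#include <cstddef>
constexpr size_t kOneGiB = 1024L * 1024 * 1024;        // fine: long arithmetic, 1073741824
// constexpr size_t kTwoGiB = 2 * 1024 * 1024 * 1024;  // signed overflow: rejected as a constant expression
constexpr size_t kTwoGiB = 2LL * 1024 * 1024 * 1024;   // fine: 64-bit arithmetic, 2147483648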
ggml/src/ggml-qnn/op-config.cpp | 122 +++++------------- ggml/src/ggml-qnn/op-config.hpp | 78 +++++++++--- ggml/src/ggml-qnn/qnn-lib.hpp | 145 +++++++++++---------- ggml/src/ggml-qnn/tensor.hpp | 50 ++++---- ggml/src/ggml-qnn/utils.cpp | 30 +++++ ggml/src/ggml-qnn/utils.hpp | 3 + 12 files changed, 469 insertions(+), 417 deletions(-) diff --git a/ggml/src/ggml-qnn/CMakeLists.txt b/ggml/src/ggml-qnn/CMakeLists.txt index b8d84d078e082..7bbb9be76b4f6 100644 --- a/ggml/src/ggml-qnn/CMakeLists.txt +++ b/ggml/src/ggml-qnn/CMakeLists.txt @@ -2,8 +2,7 @@ message(STATUS "Using QNN backend") if(CMAKE_SYSTEM_NAME STREQUAL "Android") find_library(LOG_LIB log) - find_library(ANDROID_LIB android) - set(QNN_LINK_LIBRARIES ${LOG_LIB} ${ANDROID_LIB}) + set(QNN_LINK_LIBRARIES ${LOG_LIB}) set(QNN_DEFAULT_LIB_SEARCH_PATH "/data/local/tmp/" CACHE STRING "customized library search path for QNN backend") else() message(FATAL_ERROR "QNN now only available on Android") diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 30930be422496..990338c953524 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -3,6 +3,8 @@ #include +#include "ggml-impl.h" + #include "graph.hpp" #include "logger.hpp" #include "op-config.hpp" @@ -15,13 +17,13 @@ namespace { bool qnn_is_valid_params(ggml_backend_qnn_device_context *ctx, const ggml_tensor *src, ggml_tensor *dst) { if (!ctx || !src || !dst) { - QNN_LOG_WARN("invalid params\n"); + QNN_LOG_WARN("invalid params"); return false; } auto instance = ctx->instance; if (!instance) { - QNN_LOG_WARN("invalid instance\n"); + QNN_LOG_WARN("invalid instance"); return false; } @@ -31,13 +33,13 @@ bool qnn_is_valid_params(ggml_backend_qnn_device_context *ctx, const ggml_tensor bool qnn_is_valid_params(ggml_backend_qnn_device_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst) { if (!ctx || !src0 || !src1 || !dst) { - QNN_LOG_WARN("invalid params\n"); + QNN_LOG_WARN("invalid params"); return false; } auto instance = ctx->instance; if (!instance) { - QNN_LOG_WARN("invalid instance\n"); + QNN_LOG_WARN("invalid instance"); return false; } @@ -45,7 +47,7 @@ bool qnn_is_valid_params(ggml_backend_qnn_device_context *ctx, const ggml_tensor } void print_ggml_tensor(const ggml_tensor *tensor) { - QNN_LOG_DEBUG("%s: type:%s ne: %ldx%ldx%ldx%ld, nb: %ldx%ldx%ldx%ld\n", tensor->name, ggml_type_name(tensor->type), + QNN_LOG_DEBUG("%s: type:%s ne: %ldx%ldx%ldx%ld, nb: %ldx%ldx%ldx%ld", tensor->name, ggml_type_name(tensor->type), (long)tensor->ne[0], (long)tensor->ne[1], (long)tensor->ne[2], (long)tensor->ne[3], (long)tensor->nb[0], (long)tensor->nb[1], (long)tensor->nb[2], (long)tensor->nb[3]); } @@ -96,7 +98,7 @@ template bool execute_graph(qnn::ggml_qnn_graph *graph, const std::array &inputs, ggml_tensor *output) { if (!graph->execute(to_ggml_tensor_array<_InputSize>(inputs), to_ggml_tensor_array<1>({output}))) { - QNN_LOG_WARN("execute failed\n"); + QNN_LOG_WARN("execute failed"); return false; } @@ -248,7 +250,7 @@ qnn::ggml_qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_device_context *c auto it = graph_cache.find(graph_key); qnn::ggml_qnn_graph *graph_ptr = nullptr; if (it != graph_cache.end()) { - QNN_LOG_DEBUG("found graph %s in cache\n", graph_key.c_str()); + QNN_LOG_DEBUG("[%s]found graph %s in cache", qnn::get_backend_name(ctx->device), graph_key.c_str()); graph_ptr = it->second.get(); } else { auto graph = @@ -260,7 +262,7 @@ qnn::ggml_qnn_graph 
*get_qnn_graph_from_cache(ggml_backend_qnn_device_context *c auto op_constructor = qnn::create_op_constructor(kGgmlOpToQnnOp[op]); if (!graph->build_graph(op_constructor, to_ggml_tensor_array<_InputSize>(inputs), to_ggml_tensor_array<1>({output}))) { - QNN_LOG_ERROR("build_graph failed\n"); + QNN_LOG_ERROR("[%s]build_graph failed", qnn::get_backend_name(ctx->device)); return nullptr; } @@ -332,7 +334,7 @@ bool qnn_binary_nop_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *src0 } constexpr const ggml_qnn_unary_op_t kQnnUnaryOpsTable[] = { - nullptr, // GGML_OP_NONE + qnn_unary_nop_impl, // GGML_OP_NONE nullptr, // GGML_OP_DUP nullptr, // GGML_OP_ADD nullptr, // GGML_OP_ADD1 @@ -363,37 +365,37 @@ constexpr const ggml_qnn_unary_op_t kQnnUnaryOpsTable[] = { nullptr, // GGML_OP_MUL_MAT_ID nullptr, // GGML_OP_OUT_PROD - nullptr, // GGML_OP_SCALE - nullptr, // GGML_OP_SET - nullptr, // GGML_OP_CPY - nullptr, // GGML_OP_CONT - nullptr, // GGML_OP_RESHAPE - qnn_unary_nop_impl, // GGML_OP_VIEW - qnn_unary_op_impl, // GGML_OP_PERMUTE - nullptr, // GGML_OP_TRANSPOSE - qnn_unary_nop_impl, // GGML_OP_GET_ROWS - nullptr, // GGML_OP_GET_ROWS_BACK - nullptr, // GGML_OP_DIAG - nullptr, // GGML_OP_DIAG_MASK_INF - nullptr, // GGML_OP_DIAG_MASK_ZERO - nullptr, // GGML_OP_SOFT_MAX - nullptr, // GGML_OP_SOFT_MAX_BACK - nullptr, // GGML_OP_ROPE - nullptr, // GGML_OP_ROPE_BACK - nullptr, // GGML_OP_CLAMP - nullptr, // GGML_OP_CONV_TRANSPOSE_1D - nullptr, // GGML_OP_IM2COL - nullptr, // GGML_OP_IM2COL_BACK - nullptr, // GGML_OP_CONV_TRANSPOSE_2D - nullptr, // GGML_OP_POOL_1D - nullptr, // GGML_OP_POOL_2D - nullptr, // GGML_OP_POOL_2D_BACK - nullptr, // GGML_OP_UPSCALE - nullptr, // GGML_OP_PAD - nullptr, // GGML_OP_ARANGE - nullptr, // GGML_OP_TIMESTEP_EMBEDDING - nullptr, // GGML_OP_ARGSORT - nullptr, // GGML_OP_LEAKY_RELU + nullptr, // GGML_OP_SCALE + nullptr, // GGML_OP_SET + nullptr, // GGML_OP_CPY + nullptr, // GGML_OP_CONT + qnn_unary_nop_impl, // GGML_OP_RESHAPE + qnn_unary_nop_impl, // GGML_OP_VIEW + qnn_unary_nop_impl, // GGML_OP_PERMUTE + qnn_unary_nop_impl, // GGML_OP_TRANSPOSE + qnn_unary_nop_impl, // GGML_OP_GET_ROWS + nullptr, // GGML_OP_GET_ROWS_BACK + nullptr, // GGML_OP_DIAG + nullptr, // GGML_OP_DIAG_MASK_INF + nullptr, // GGML_OP_DIAG_MASK_ZERO + nullptr, // GGML_OP_SOFT_MAX + nullptr, // GGML_OP_SOFT_MAX_BACK + nullptr, // GGML_OP_ROPE + nullptr, // GGML_OP_ROPE_BACK + nullptr, // GGML_OP_CLAMP + nullptr, // GGML_OP_CONV_TRANSPOSE_1D + nullptr, // GGML_OP_IM2COL + nullptr, // GGML_OP_IM2COL_BACK + nullptr, // GGML_OP_CONV_TRANSPOSE_2D + nullptr, // GGML_OP_POOL_1D + nullptr, // GGML_OP_POOL_2D + nullptr, // GGML_OP_POOL_2D_BACK + nullptr, // GGML_OP_UPSCALE + nullptr, // GGML_OP_PAD + nullptr, // GGML_OP_ARANGE + nullptr, // GGML_OP_TIMESTEP_EMBEDDING + nullptr, // GGML_OP_ARGSORT + nullptr, // GGML_OP_LEAKY_RELU nullptr, // GGML_OP_FLASH_ATTN_EXT nullptr, // GGML_OP_FLASH_ATTN_BACK @@ -442,7 +444,7 @@ constexpr const ggml_qnn_unary_op_t kQnnUnaryOpsTable[] = { static_assert(sizeof(kQnnUnaryOpsTable) / sizeof(kQnnUnaryOpsTable[0]) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT), "GGML_OP_COUNT does not match the size of the kQnnUnaryOpsTable table"); -static constexpr const ggml_qnn_binary_op_t kQnnBinaryOpsTable[] = { +constexpr const ggml_qnn_binary_op_t kQnnBinaryOpsTable[] = { nullptr, // GGML_OP_NONE nullptr, // GGML_OP_DUP qnn_binary_op_impl, // GGML_OP_ADD @@ -543,22 +545,28 @@ bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context *ctx, const ggml_t return false; } -#ifndef 
NDEBUG - auto *type_name = ggml_get_type_traits(tensor->type)->type_name; -#endif + if (tensor->view_src) { + auto *src_tensor = tensor->view_src; + QNN_LOG_DEBUG("[%s]tensor(%s_%dx%dx%dx%d) is a view, src: %s_%dx%dx%dx%d", qnn::get_backend_name(ctx->device), + ggml_get_name(tensor), tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], + ggml_get_name(src_tensor), src_tensor->ne[0], src_tensor->ne[1], src_tensor->ne[2], + src_tensor->ne[3]); + } + switch (tensor->type) { case GGML_TYPE_F32: case GGML_TYPE_F16: case GGML_TYPE_Q8_0: case GGML_TYPE_Q4_0: if (!(ctx->supported_types & (1 << tensor->type))) { - QNN_LOG_DEBUG("unsupported data type %s for backend %s, supported_types: 0x%x", type_name, - qnn::get_backend_name(ctx->device), ctx->supported_types); + QNN_LOG_DEBUG("[%s]unsupported data type %s, supported_types: 0x%x", qnn::get_backend_name(ctx->device), + ggml_type_name(tensor->type), ctx->supported_types); return false; } break; default: - QNN_LOG_DEBUG("unsupported data type %s", type_name); + QNN_LOG_DEBUG("[%s]unsupported data type %s", qnn::get_backend_name(ctx->device), + ggml_type_name(tensor->type)); return false; } @@ -566,6 +574,7 @@ bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context *ctx, const ggml_t } bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor *op) { + constexpr const size_t kMaxNpuTensorSize = 8192L * 2048 + 8192 * 512 + 2048 * 512; constexpr const auto get_tensor_size = [](const ggml_tensor *tensor) -> size_t { return tensor->ne[0] * tensor->ne[1] * tensor->ne[2] * tensor->ne[3]; }; @@ -582,8 +591,7 @@ bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context *ctx, const ggm QNN_LOG_DEBUG("[qnn-npu] src0 and src1 dimensions are not equal, support/unsupported: %d/%d", ctx->support_op_count.load(), ++(ctx->unsupported_op_count)); return false; - } else if (get_tensor_size(src0) + get_tensor_size(src1) + get_tensor_size(op) >= - (8192 * 2048 + 8192 * 512 + 2048 * 512)) { + } else if (get_tensor_size(src0) + get_tensor_size(src1) + get_tensor_size(op) >= kMaxNpuTensorSize) { QNN_LOG_DEBUG("[qnn-npu] tensor size is too large, support/unsupported: %d/%d", ctx->support_op_count.load(), ++(ctx->unsupported_op_count)); return false; @@ -618,12 +626,13 @@ bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context *ctx, const ggm namespace qnn { -bool ggml_qnn_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor *op) { +bool device_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor *op) { // Note that this function could be called before the device context is initialized if (op->op == GGML_OP_NONE) { return true; } + auto *src0 = op->src[0]; if (op->op == GGML_OP_UNARY) { const auto unary_op = ggml_get_unary_op(op); if (unary_op == GGML_UNARY_OP_GELU && ctx->device == QNN_BACKEND_NPU) { @@ -637,7 +646,7 @@ bool ggml_qnn_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tenso return false; } - if (!op->src[0]) { + if (!ggml_qnn_supports_tensor(ctx, src0) || !ggml_qnn_supports_tensor(ctx, op)) { QNN_LOG_DEBUG("src0 is nullptr"); return false; } @@ -647,7 +656,6 @@ bool ggml_qnn_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tenso return false; } - auto *src0 = op->src[0]; auto *src1 = op->src[1]; if (!ggml_qnn_supports_tensor(ctx, src0) || !ggml_qnn_supports_tensor(ctx, op) || (kQnnBinaryOpsTable[op->op] && !ggml_qnn_supports_tensor(ctx, src1))) { @@ -674,24 +682,35 @@ bool ggml_qnn_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tenso 
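// A compact restatement of the NPU size gate added above, with illustrative helper names:
// get_tensor_size counts elements (ne[0]*ne[1]*ne[2]*ne[3]), not bytes, and the cutoff is
// the literal sum 8192*2048 + 8192*512 + 2048*512, i.e. the combined element count of a
// large reference mul_mat (inputs plus output); anything at or above it is reported as
// unsupported so the op falls back to another backend.
#include "ggml.h"
#include <cstddef>

constexpr size_t kMaxNpuTensorSize = 8192L * 2048 + 8192 * 512 + 2048 * 512;

static size_t element_count(const ggml_tensor *t) {
    return (size_t)t->ne[0] * t->ne[1] * t->ne[2] * t->ne[3];
}

static bool npu_matmul_within_limit(const ggml_tensor *src0, const ggml_tensor *src1, const ggml_tensor *dst) {
    // mirrors the check above: reject once the three tensors together reach the threshold
    return element_count(src0) + element_count(src1) + element_count(dst) < kMaxNpuTensorSize;
}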
return true; } -bool ggml_qnn_forward(ggml_backend_qnn_device_context *ctx, struct ggml_tensor *tensor) { - size_t unary_op_idx = tensor->op; - if (tensor->op == GGML_OP_UNARY) { - unary_op_idx = kGgmlUnaryOpStart + ggml_get_unary_op(tensor); - } +bool device_compute_graph(ggml_backend_qnn_device_context *ctx, ggml_cgraph *cgraph) { + QNN_LOG_DEBUG("[%s]compute graph, nodes count: %d", qnn::get_backend_name(ctx->device), (int)cgraph->n_nodes); + for (int i = 0; i < cgraph->n_nodes; i++) { + ggml_tensor *tensor = cgraph->nodes[i]; + if (ggml_is_empty(tensor)) { + continue; + } - auto unary_op = kQnnUnaryOpsTable[unary_op_idx]; - if (unary_op) { - return unary_op(ctx, tensor->src[0], tensor); - } + size_t unary_op_idx = tensor->op; + if (tensor->op == GGML_OP_UNARY) { + unary_op_idx = kGgmlUnaryOpStart + ggml_get_unary_op(tensor); + } + + bool ok = false; + auto unary_op = kQnnUnaryOpsTable[unary_op_idx]; + auto binary_op = kQnnBinaryOpsTable[tensor->op]; + if (unary_op) { + ok = unary_op(ctx, tensor->src[0], tensor); + } else if (binary_op) { + ok = binary_op(ctx, tensor->src[0], tensor->src[1], tensor); + } - auto binary_op = kQnnBinaryOpsTable[tensor->op]; - if (binary_op) { - return binary_op(ctx, tensor->src[0], tensor->src[1], tensor); + if (!ok) { + QNN_LOG_WARN("[%s]unsupported op %s", qnn::get_backend_name(ctx->device), ggml_op_desc(tensor)); + return false; + } } - QNN_LOG_WARN("[forward]unsupported op %s", ggml_op_desc(tensor)); - return false; + return true; } } // namespace qnn diff --git a/ggml/src/ggml-qnn/backend-ops.hpp b/ggml/src/ggml-qnn/backend-ops.hpp index 3df7f4a98a146..c49c4d6dc19d7 100644 --- a/ggml/src/ggml-qnn/backend-ops.hpp +++ b/ggml/src/ggml-qnn/backend-ops.hpp @@ -6,7 +6,7 @@ namespace qnn { -bool ggml_qnn_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor *op); -bool ggml_qnn_forward(ggml_backend_qnn_device_context *ctx, struct ggml_tensor *tensor); +bool device_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor *op); +bool device_compute_graph(ggml_backend_qnn_device_context *ctx, ggml_cgraph *cgraph); } // namespace qnn diff --git a/ggml/src/ggml-qnn/buffer.hpp b/ggml/src/ggml-qnn/buffer.hpp index 676e88c0454be..9573e160b4176 100644 --- a/ggml/src/ggml-qnn/buffer.hpp +++ b/ggml/src/ggml-qnn/buffer.hpp @@ -1,28 +1,42 @@ #pragma once #include +#include #include "logger.hpp" #include "qnn-lib.hpp" namespace qnn { -class ggml_qnn_rpc_buffer { + +class qnn_buffer_interface { public: - ggml_qnn_rpc_buffer(std::shared_ptr qnn_instance, const size_t size, const uint32_t rank, - uint32_t *dimensions, Qnn_DataType_t data_type) : - _qnn_instance(qnn_instance), _size(size) { + virtual ~qnn_buffer_interface() = default; + + virtual bool is_valid() const = 0; + virtual uint8_t *get_buffer() = 0; + virtual size_t get_size() const = 0; + virtual Qnn_MemHandle_t get_mem_handle() const = 0; +}; + +using qnn_buffer_ptr = std::shared_ptr; - _qnn_rpc_buffer = static_cast(qnn_instance->alloc_rpcmem(size, alignof(void *))); +class qnn_rpc_buffer : public qnn_buffer_interface { +public: + qnn_rpc_buffer(std::shared_ptr qnn_instance, const size_t size, const uint32_t rank, + uint32_t *dimensions, Qnn_DataType_t data_type) + : _size(size), _qnn_instance(qnn_instance) { + + _qnn_rpc_buffer = static_cast(qnn_instance->alloc_rpcmem(size, alignof(uint8_t *))); _qnn_rpc_mem_handle = qnn_instance->register_rpcmem(_qnn_rpc_buffer, rank, dimensions, data_type); if (!_qnn_rpc_buffer || !_qnn_rpc_mem_handle) { - QNN_LOG_WARN("register rpc mem failure\n"); + 
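// One detail of the dispatch in device_compute_graph above that is easy to miss: the unary
// table has GGML_OP_COUNT + GGML_UNARY_OP_COUNT slots (see the static_assert), so ordinary
// ops index it directly by their ggml_op value while GGML_OP_UNARY nodes are remapped past
// the regular ops via kGgmlUnaryOpStart (presumably GGML_OP_COUNT, given that assert).
// A minimal sketch of that index computation, with an illustrative function name:
static size_t qnn_op_table_index(const ggml_tensor *tensor) {
    size_t idx = tensor->op;
    if (tensor->op == GGML_OP_UNARY) {
        idx = kGgmlUnaryOpStart + ggml_get_unary_op(tensor);  // e.g. GELU/SILU land after the regular ops
    }
    return idx;
}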
QNN_LOG_WARN("register rpc mem failure"); // let the destructor free the buffer return; } - QNN_LOG_DEBUG("alloc rpcmem(%p) successfully, size %d\n", _qnn_rpc_buffer, (int)size); + QNN_LOG_DEBUG("alloc rpcmem(%p) successfully, size %d", _qnn_rpc_buffer, (int)size); } - ~ggml_qnn_rpc_buffer() { + ~qnn_rpc_buffer() { if (_qnn_instance) { if (_qnn_rpc_mem_handle) { _qnn_instance->unregister_rpcmem(_qnn_rpc_mem_handle); @@ -34,22 +48,58 @@ class ggml_qnn_rpc_buffer { } } - bool is_valid() const { return _qnn_rpc_buffer && _qnn_rpc_mem_handle; } + bool is_valid() const override { return _qnn_rpc_buffer && _qnn_rpc_mem_handle; } - uint8_t *get_buffer() const { return _qnn_rpc_buffer; } - size_t get_size() const { return _size; } - Qnn_MemHandle_t get_mem_handle() const { return _qnn_rpc_mem_handle; } + uint8_t *get_buffer() override { return _qnn_rpc_buffer; } + size_t get_size() const override { return _size; } + Qnn_MemHandle_t get_mem_handle() const override { return _qnn_rpc_mem_handle; } private: - std::shared_ptr _qnn_instance; size_t _size = 0; uint8_t *_qnn_rpc_buffer = nullptr; Qnn_MemHandle_t _qnn_rpc_mem_handle = nullptr; + std::shared_ptr _qnn_instance; + + DISABLE_COPY(qnn_rpc_buffer); + DISABLE_MOVE(qnn_rpc_buffer); +}; + +class qnn_mem_buffer : public qnn_buffer_interface { +public: + explicit qnn_mem_buffer(const uint8_t *data, size_t size) { + _buffer = reinterpret_cast(qnn::page_align_alloc(size)); + + if (!_buffer) { + QNN_LOG_WARN("failed to allocate %.2f MiB", float(size / (1 << 20))); + return; + } + + _size = size; + + if (data) { + memcpy(_buffer, data, size); + } + } + + explicit qnn_mem_buffer(size_t size) : qnn_mem_buffer(nullptr, size) {} + + ~qnn_mem_buffer() { + // the free will do nothing if the _buffer is nullptr + qnn::align_free(_buffer); + } + + bool is_valid() const override { return _buffer != nullptr; } + + uint8_t *get_buffer() override { return _buffer; } + size_t get_size() const override { return _size; } + Qnn_MemHandle_t get_mem_handle() const override { return nullptr; } + +private: + size_t _size = 0; + uint8_t *_buffer = nullptr; - ggml_qnn_rpc_buffer(const ggml_qnn_rpc_buffer &) = delete; - void operator=(const ggml_qnn_rpc_buffer &) = delete; - ggml_qnn_rpc_buffer(ggml_qnn_rpc_buffer &&) = delete; - void operator=(ggml_qnn_rpc_buffer &&) = delete; + DISABLE_COPY(qnn_mem_buffer); + DISABLE_MOVE(qnn_mem_buffer); }; } // namespace qnn diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index c57692b867bc7..933016a62878e 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -1,7 +1,5 @@ #include "ggml-qnn.h" -#include - #include #include #include @@ -87,78 +85,44 @@ static_assert(sizeof(kDeviceCaps) / sizeof(kDeviceCaps[0]) == GGML_QNN_MAX_DEVIC "The number of qnn devices should be equal to GGML_QNN_MAX_DEVICES"); static_assert(kDeviceCaps[QNN_BACKEND_NPU].type == GGML_BACKEND_DEVICE_TYPE_ACCEL, "The NPU device should be an accelerator device"); +static_assert(kDeviceCaps[QNN_BACKEND_GPU].type == GGML_BACKEND_DEVICE_TYPE_GPU, + "The NPU device should be an accelerator device"); -class ggml_backend_qnn_buffer_context { -public: - ggml_backend_qnn_buffer_context(QNNBackend device, std::shared_ptr instance, size_t size) - : _instance(instance), _name(QNN_BACKEND_NAME + std::to_string(device)) { - // TODO: fix this for other platforms - size_t size_page = sysconf(_SC_PAGESIZE); - - // TODO: for qnn npu, a better way here is to reuse the buffer allocated by - // qnn rpc, will save an extra copy - 
_buffer = qnn::align_alloc(size_page, size); - - if (!_buffer) { - QNN_LOG_WARN("failed to allocate %.2f MiB\n", float(size / (1 << 20))); - return; - } - - _buffer_size = size; - } - - ~ggml_backend_qnn_buffer_context() { - // the free will do nothing if the _buffer is nullptr - qnn::align_free(_buffer); - } - - bool is_valid() const { return _buffer != nullptr; } - - void *get_buffer() { return _buffer; } - size_t get_buffer_size() { return _buffer_size; } - -private: - std::shared_ptr _instance; - std::string _name; - void *_buffer = nullptr; - size_t _buffer_size = 0; -}; - -struct ggml_backend_qnn_buffer_type_context { - std::string name; -}; +static_assert(kDeviceCaps[QNN_BACKEND_CPU].type == GGML_BACKEND_DEVICE_TYPE_CPU, + "The NPU device should be an accelerator device"); ggml_backend_qnn_device_context *get_device_context(ggml_backend_dev_t dev) { return reinterpret_cast(dev->context); } +qnn::qnn_buffer_interface *get_buffer_context(ggml_backend_buffer_t buffer) { + return reinterpret_cast(buffer->context); +} + /* * ----------------------------------------------------------------------------------------------- * qnn backend buffer object * ----------------------------------------------------------------------------------------------- */ void ggml_backend_qnn_buffer_free_buffer(ggml_backend_buffer_t buffer) { - ggml_backend_qnn_buffer_context *ctx = (ggml_backend_qnn_buffer_context *)buffer->context; - + auto *ctx = get_buffer_context(buffer); delete ctx; } void *ggml_backend_qnn_buffer_get_base(ggml_backend_buffer_t buffer) { - ggml_backend_qnn_buffer_context *ctx = (ggml_backend_qnn_buffer_context *)buffer->context; - + auto *ctx = get_buffer_context(buffer); return ctx->get_buffer(); } void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor) { - // Do nothing here, the qnn tensor will be create along with the graph. 
GGML_UNUSED(buffer); GGML_UNUSED(tensor); + // TODO: we should create the qnn tensor along with the ggml tensor } void ggml_backend_qnn_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor, const void *data, size_t offset, size_t size) { GGML_UNUSED(buffer); - memcpy((char *)tensor->data + offset, data, size); } @@ -168,8 +132,7 @@ void ggml_backend_qnn_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml memcpy(data, (const char *)tensor->data + offset, size); } -bool ggml_backend_qnn_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor *src, - struct ggml_tensor *dst) { +bool ggml_backend_qnn_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor *src, ggml_tensor *dst) { GGML_UNUSED(buffer); if (ggml_backend_buffer_is_host(src->buffer)) { memcpy(dst->data, src->data, ggml_nbytes(src)); @@ -180,12 +143,11 @@ bool ggml_backend_qnn_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const stru } void ggml_backend_qnn_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { - ggml_backend_qnn_buffer_context *ctx = (ggml_backend_qnn_buffer_context *)buffer->context; - - memset(ctx->get_buffer(), value, ctx->get_buffer_size()); + auto *ctx = get_buffer_context(buffer); + memset(ctx->get_buffer(), value, ctx->get_size()); } -ggml_backend_buffer_i ggml_backend_qnn_buffer_interface = { +constexpr const ggml_backend_buffer_i ggml_backend_qnn_buffer_interface = { /* .free_buffer = */ ggml_backend_qnn_buffer_free_buffer, /* .get_base = */ ggml_backend_qnn_buffer_get_base, /* .init_tensor = */ ggml_backend_qnn_buffer_init_tensor, @@ -208,13 +170,13 @@ const char *ggml_backend_qnn_buffer_type_name(ggml_backend_buffer_type_t buft) { } ggml_backend_buffer_t ggml_backend_qnn_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { - auto *dev_ctx = get_device_context(buft->device); - ggml_backend_qnn_buffer_context *ctx = - new ggml_backend_qnn_buffer_context((QNNBackend)dev_ctx->device, dev_ctx->instance, size); + qnn::qnn_buffer_interface *ctx = new qnn::qnn_mem_buffer(size); if (!ctx->is_valid()) { return nullptr; } + QNN_LOG_DEBUG("[%s]alloc buffer: %p, size: %ld", qnn::get_backend_name(get_device_context(buft->device)->device), + ctx->get_buffer(), size); return ggml_backend_buffer_init(buft, ggml_backend_qnn_buffer_interface, ctx, size); } @@ -227,7 +189,7 @@ size_t ggml_backend_qnn_buffer_type_get_alignment(ggml_backend_buffer_type_t buf size_t ggml_backend_qnn_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { GGML_UNUSED(buft); // TODO: get the max size from device - return 1024 * 1024 * 1024; + return 1024L * 1024 * 1024; } bool ggml_backend_qnn_buffer_is_host(ggml_backend_buffer_type_t buft) { @@ -254,61 +216,52 @@ void ggml_backend_qnn_free(ggml_backend_t backend) { } } +bool ggml_backend_qnn_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor *src, + ggml_tensor *dst) { + GGML_UNUSED(backend_src); + GGML_UNUSED(backend_dst); + GGML_UNUSED(src); + GGML_UNUSED(dst); + return false; +} + ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(ggml_backend_dev_t dev) { - static ggml_backend_qnn_buffer_type_context ggml_backend_qnn_buffer_type_contexts[GGML_QNN_MAX_DEVICES]; static ggml_backend_buffer_type ggml_backend_qnn_buffer_types[GGML_QNN_MAX_DEVICES]; - static bool ggml_backend_qnn_buffer_type_initialized = false; auto *dev_ctx = get_device_context(dev); - if (!ggml_backend_qnn_buffer_type_initialized) { - for (size_t i = 0; i < GGML_QNN_MAX_DEVICES; i++) { - auto &context = 
ggml_backend_qnn_buffer_type_contexts[i]; - context = {std::string(QNN_BACKEND_NAME) + std::to_string(i)}; - ggml_backend_qnn_buffer_types[i] = { - /* .iface = */ { - /* .get_name = */ ggml_backend_qnn_buffer_type_name, - /* .alloc_buffer = */ - ggml_backend_qnn_buffer_type_alloc_buffer, - /* .get_alignment = */ - ggml_backend_qnn_buffer_type_get_alignment, - /* .get_max_size = */ - ggml_backend_qnn_buffer_type_get_max_size, - /* .get_alloc_size = */ nullptr, // defaults to ggml_nbytes - /* .is_host = */ ggml_backend_qnn_buffer_is_host, - }, - /* .device */ dev, - /* .context = */ &context, - }; - } - ggml_backend_qnn_buffer_type_initialized = true; + if (!ggml_backend_qnn_buffer_types[dev_ctx->device].device) { + ggml_backend_qnn_buffer_types[dev_ctx->device] = { + /* .iface = */ { + /* .get_name = */ ggml_backend_qnn_buffer_type_name, + /* .alloc_buffer = */ + ggml_backend_qnn_buffer_type_alloc_buffer, + /* .get_alignment = */ + ggml_backend_qnn_buffer_type_get_alignment, + /* .get_max_size = */ + ggml_backend_qnn_buffer_type_get_max_size, + /* .get_alloc_size = */ nullptr, // defaults to ggml_nbytes + /* .is_host = */ ggml_backend_qnn_buffer_is_host, + }, + /* .device */ dev, + /* .context = */ nullptr, + }; + } else { + GGML_ASSERT(ggml_backend_qnn_buffer_types[dev_ctx->device].device == dev); } return &ggml_backend_qnn_buffer_types[dev_ctx->device]; } ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backend, ggml_cgraph *cgraph) { - enum ggml_status result = GGML_STATUS_SUCCESS; - auto *device_ctx = get_device_context(backend->device); - for (int i = 0; i < cgraph->n_nodes; i++) { - ggml_tensor *node = cgraph->nodes[i]; - if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || - node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { - continue; - } - bool ok = qnn::ggml_qnn_forward(device_ctx, node); - if (!ok) { - QNN_LOG_DEBUG("error: op not supported %s (%s)\n", node->name, ggml_op_name(node->op)); - } - } - - return result; + return qnn::device_compute_graph(get_device_context(backend->device), cgraph) ? 
GGML_STATUS_SUCCESS + : GGML_STATUS_FAILED; } -ggml_backend_i ggml_backend_qnn_interface = { +constexpr const ggml_backend_i ggml_backend_qnn_interface = { /* .get_name = */ ggml_backend_qnn_name, /* .free = */ ggml_backend_qnn_free, /* .set_tensor_async = */ nullptr, /* .get_tensor_async = */ nullptr, - /* .cpy_tensor_async = */ nullptr, + /* .cpy_tensor_async = */ ggml_backend_qnn_cpy_tensor_async, /* .synchronize = */ nullptr, /* .graph_plan_create = */ nullptr, /* .graph_plan_free = */ nullptr, @@ -345,7 +298,7 @@ enum ggml_backend_dev_type ggml_backend_qnn_device_get_type(ggml_backend_dev_t d return kDeviceCaps[get_device_context(dev)->device].type; } -void ggml_backend_qnn_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props *props) { +void ggml_backend_qnn_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props *props) { props->name = ggml_backend_qnn_device_get_name(dev); props->description = ggml_backend_qnn_device_get_description(dev); props->type = ggml_backend_qnn_device_get_type(dev); @@ -364,6 +317,8 @@ ggml_guid_t ggml_backend_qnn_guid() { return &guid; } +bool ggml_backend_is_qnn(ggml_backend_t backend) { return ggml_guid_matches(backend->guid, ggml_backend_qnn_guid()); } + ggml_backend_t ggml_backend_qnn_init_with_device_context(ggml_backend_dev_t dev, const char *extend_lib_search_path) { if (!extend_lib_search_path) { extend_lib_search_path = GGML_QNN_DEFAULT_LIB_SEARCH_PATH; @@ -401,9 +356,9 @@ ggml_backend_t ggml_backend_qnn_init_with_device_context(ggml_backend_dev_t dev, } } else { if (setenv("LD_LIBRARY_PATH", path.c_str(), 1) == 0) { - QNN_LOG_DEBUG("%s backend setenv successfully\n", qnn::get_backend_name(device)); + QNN_LOG_DEBUG("%s backend setenv successfully", qnn::get_backend_name(device)); } else { - QNN_LOG_ERROR("%s backend setenv failure\n", qnn::get_backend_name(device)); + QNN_LOG_ERROR("%s backend setenv failure", qnn::get_backend_name(device)); } } #endif @@ -411,12 +366,12 @@ ggml_backend_t ggml_backend_qnn_init_with_device_context(ggml_backend_dev_t dev, auto instance = std::make_shared(path, dev_ctx->lib_name, "ggml"); auto result = instance->qnn_init(nullptr); if (result != 0) { - QNN_LOG_WARN("init qnn subsystem failed with qnn backend %s, pls check why\n", qnn::get_backend_name(device)); + QNN_LOG_WARN("init qnn subsystem failed with qnn backend %s, pls check why", qnn::get_backend_name(device)); return nullptr; } auto qnn_interface = instance->get_qnn_interface(); if (!qnn_interface) { - QNN_LOG_WARN("qnn subsystem failure\n"); + QNN_LOG_WARN("qnn subsystem failure"); return nullptr; } @@ -453,10 +408,10 @@ ggml_backend_buffer_t ggml_backend_qnn_device_buffer_from_ptr(ggml_backend_dev_t return ggml_backend_cpu_buffer_from_ptr(ptr, size); } -bool ggml_backend_qnn_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor *op) { +bool ggml_backend_qnn_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor *op) { // Note that this function could be called before the device context is initialized auto *device_ctx = get_device_context(dev); - return qnn::ggml_qnn_supports_op(device_ctx, op); + return qnn::device_supports_op(device_ctx, op); } bool ggml_backend_qnn_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { @@ -464,7 +419,13 @@ bool ggml_backend_qnn_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_ return ggml_backend_buft_is_host(buft); } -const struct ggml_backend_device_i ggml_backend_qnn_device_interface = { +bool 
ggml_backend_qnn_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor *op) { + auto *device_ctx = get_device_context(dev); + QNN_LOG_DEBUG("[%s][%s]offload op", qnn::get_backend_name(device_ctx->device), ggml_op_name(op->op)); + return false; +} + +constexpr const ggml_backend_device_i ggml_backend_qnn_device_interface = { /* .get_name = */ ggml_backend_qnn_device_get_name, /* .get_description = */ ggml_backend_qnn_device_get_description, /* .get_memory = */ ggml_backend_qnn_device_get_memory, @@ -476,7 +437,7 @@ const struct ggml_backend_device_i ggml_backend_qnn_device_interface = { /* .buffer_from_host_ptr = */ ggml_backend_qnn_device_buffer_from_ptr, /* .supports_op = */ ggml_backend_qnn_device_supports_op, /* .supports_buft = */ ggml_backend_qnn_device_supports_buft, - /* .offload_op = */ nullptr, + /* .offload_op = */ ggml_backend_qnn_device_offload_op, /* .event_new = */ nullptr, /* .event_free = */ nullptr, /* .event_synchronize = */ nullptr, @@ -489,27 +450,36 @@ const struct ggml_backend_device_i ggml_backend_qnn_device_interface = { */ struct ggml_backend_qnn_reg_impl : ggml_backend_reg { - std::array, GGML_QNN_MAX_DEVICES> device_contexts; - std::array devices; + std::vector> device_contexts; + std::vector devices; explicit ggml_backend_qnn_reg_impl(ggml_backend_reg_i interface) { context = this; iface = interface; QNN_LOG_DEBUG("qnn backend registry init"); - for (int i = 0; i < GGML_QNN_MAX_DEVICES; i++) { - const auto device_enum = (QNNBackend)(GGML_QNN_MAX_DEVICES - 1 - i); // init from the last device, i.e. NPU - device_contexts[i] = std::make_unique( + for (size_t i = 0; i < QNN_BACKEND_COUNT; i++) { + const auto device_enum = (QNNBackend)(QNN_BACKEND_COUNT - 1 - i); // init from the last device, i.e. NPU + if (device_enum == QNN_BACKEND_CPU) { + /* + * here we skip the initialization of CPU device, + * cause it'll block unsupported ops fallback to ggml cpu backend + */ + continue; + } + + device_contexts.emplace_back(std::make_unique( /* .device = */ device_enum, // init from the last device, i.e. 
NPU /* .threads = */ 1, /* .name = */ qnn::get_backend_name(device_enum), /* .lib_name = */ kDeviceCaps[device_enum].lib_name, - /* .supported_types = */ kDeviceCaps[device_enum].supported_types); + /* .supported_types = */ kDeviceCaps[device_enum].supported_types)); - auto &device = devices[i]; - device.iface = ggml_backend_qnn_device_interface; - device.reg = this; - device.context = device_contexts[i].get(); + devices.emplace_back(ggml_backend_device{ + /* iface = */ ggml_backend_qnn_device_interface, + /* reg = */ this, + /* context = */ device_contexts.back().get(), + }); } } }; diff --git a/ggml/src/ggml-qnn/graph.hpp b/ggml/src/ggml-qnn/graph.hpp index 1b0dcd78faa17..1806f41126f3c 100644 --- a/ggml/src/ggml-qnn/graph.hpp +++ b/ggml/src/ggml-qnn/graph.hpp @@ -64,12 +64,12 @@ class ggml_qnn_graph { } if (error != QNN_SUCCESS) { - QNN_LOG_ERROR("[%s][%s]failed to create qnn graph, error: %s\n", get_backend_name(device), - graph_name.c_str(), get_qnn_error_string(error)); + QNN_LOG_ERROR("[%s][%s]failed to create qnn graph, error: %s", get_backend_name(device), graph_name.c_str(), + get_qnn_error_string(error)); return; } - QNN_LOG_INFO("[%s][%s]create succeed\n", get_backend_name(device), graph_name.c_str()); + QNN_LOG_INFO("[%s][%s]create succeed", get_backend_name(device), graph_name.c_str()); _graph_handle = graph_handle; _qnn_interface = qnn_interface; } @@ -80,7 +80,7 @@ class ggml_qnn_graph { const ggml_tensor_array_t &tensor_outputs) { GGML_ASSERT(op_constructor); if (!is_valid()) { - QNN_LOG_ERROR("Invalid graph\n"); + QNN_LOG_ERROR("Invalid graph"); return false; } @@ -92,7 +92,7 @@ class ggml_qnn_graph { } if (!_op_config->add_op_to_graph(_graph_handle)) { - QNN_LOG_ERROR("[%s]add nodes failed\n", _graph_name.c_str()); + QNN_LOG_ERROR("[%s]add nodes failed", _graph_name.c_str()); return false; } @@ -109,12 +109,12 @@ class ggml_qnn_graph { bool execute(const ggml_tensor_array_t &tensor_inputs, const ggml_tensor_array_t &tensor_outputs) { if (!_op_config->bind_input_tensors(tensor_inputs)) { - QNN_LOG_ERROR("[%s][%s]bind input tensors failed\n", get_backend_name(_device), _graph_name.c_str()); + QNN_LOG_ERROR("[%s][%s]bind input tensors failed", get_backend_name(_device), _graph_name.c_str()); return false; } if (!_op_config->bind_output_tensors(tensor_outputs)) { - QNN_LOG_ERROR("[%s][%s]bind output tensors failed\n", get_backend_name(_device), _graph_name.c_str()); + QNN_LOG_ERROR("[%s][%s]bind output tensors failed", get_backend_name(_device), _graph_name.c_str()); return false; } diff --git a/ggml/src/ggml-qnn/op-config.cpp b/ggml/src/ggml-qnn/op-config.cpp index df70d548a44e0..b3c84b5435095 100644 --- a/ggml/src/ggml-qnn/op-config.cpp +++ b/ggml/src/ggml-qnn/op-config.cpp @@ -82,7 +82,7 @@ bool bind_tensors(const qnn::ggml_tensor_array_t &ggml_tensors, qnn::qnn_tensor_ for (size_t i = 0; i < ggml_tensors.size(); i++) { auto *ggml_tensor = ggml_tensors[i]; if (!tensor_wrappers[i]->bind_ggml_tensor(ggml_tensor)) { - QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor)); + QNN_LOG_ERROR("bind tensor %s failed", ggml_get_name(ggml_tensor)); return false; } @@ -162,12 +162,12 @@ bool ggml_qnn_op_config_base::add_tensor_param(const std::string &name, const qn GGML_ASSERT(data_size > 0); if (!param_tensor->bind_buffer(const_cast(data), data_size)) { - QNN_LOG_ERROR("parameter tensor bind_buffer failed\n"); + QNN_LOG_ERROR("parameter tensor bind_buffer failed"); return false; } if (!param_tensor->alloc_qnn_tensor_id()) { - QNN_LOG_ERROR("parameter tensor 
alloc_qnn_tensor_id failed\n"); + QNN_LOG_ERROR("parameter tensor alloc_qnn_tensor_id failed"); return false; } @@ -185,26 +185,26 @@ bool ggml_qnn_op_config_base::add_op_to_graph(Qnn_GraphHandle_t graph_handle) { GGML_ASSERT(_qnn_tensor_inputs.size() == _tensor_inputs.size()); GGML_ASSERT(_qnn_tensor_outputs.size() == _tensor_outputs.size()); - QNN_LOG_DEBUG("[%s]add to graph start\n", _name.c_str()); + QNN_LOG_DEBUG("[%s]add to graph start", _name.c_str()); for (size_t i = 0; i < _tensor_inputs.size(); i++) { auto tensor = _tensor_inputs[i]; if (!tensor->alloc_qnn_tensor_id()) { - QNN_LOG_ERROR("[%s]input tensor alloc_qnn_tensor_id failed\n", _name.c_str()); + QNN_LOG_ERROR("[%s]input tensor alloc_qnn_tensor_id failed", _name.c_str()); return false; } - QNN_LOG_DEBUG("[%s]input tensor id: %d\n", _name.c_str(), tensor->get_qnn_tensor_id()); + QNN_LOG_DEBUG("[%s]input tensor id: %d", _name.c_str(), tensor->get_qnn_tensor_id()); _qnn_tensor_inputs[i] = tensor->get_qnn_tensor(); } for (size_t i = 0; i < _tensor_outputs.size(); i++) { auto tensor = _tensor_outputs[i]; if (!tensor->alloc_qnn_tensor_id()) { - QNN_LOG_ERROR("[%s]output tensor alloc_qnn_tensor_id failed\n", _name.c_str()); + QNN_LOG_ERROR("[%s]output tensor alloc_qnn_tensor_id failed", _name.c_str()); return false; } - QNN_LOG_DEBUG("[%s]output tensor id: %d\n", _name.c_str(), tensor->get_qnn_tensor_id()); + QNN_LOG_DEBUG("[%s]output tensor id: %d", _name.c_str(), tensor->get_qnn_tensor_id()); _qnn_tensor_outputs[i] = tensor->get_qnn_tensor(); } @@ -215,7 +215,7 @@ bool ggml_qnn_op_config_base::add_op_to_graph(Qnn_GraphHandle_t graph_handle) { return false; } - QNN_LOG_DEBUG("[%s]added to graph succeed\n", _name.c_str()); + QNN_LOG_DEBUG("[%s]added to graph succeed", _name.c_str()); return true; } @@ -280,6 +280,14 @@ bool ggml_qnn_single_op_config::initialize_op_nodes(QNNBackend device, Qnn_Graph return true; } +bool ggml_qnn_aggregate_op_config::bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) { + return bind_tensors(tensor_inputs, _tensor_inputs, _qnn_tensor_inputs); +} + +bool ggml_qnn_aggregate_op_config::bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) { + return bind_tensors(tensor_outputs, _tensor_outputs, _qnn_tensor_outputs); +} + bool ggml_qnn_matmul_op_config::initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const ggml_tensor_array_t &tensor_inputs, const ggml_tensor_array_t &tensor_outputs) { @@ -293,20 +301,21 @@ bool ggml_qnn_matmul_op_config::initialize_op_nodes(QNNBackend device, Qnn_Graph create_tensors_from_ggml_tensor(params, tensor_inputs, &_tensor_inputs, &_qnn_tensor_inputs); // create output tensor - qnn_tensor_array_t mat_mul_tensor_outputs; params.name_prefix = "dst"; params.is_input = false; - create_tensors_from_ggml_tensor(params, tensor_outputs, &mat_mul_tensor_outputs, nullptr); + create_tensors_from_ggml_tensor(params, tensor_outputs, &_tensor_outputs, &_qnn_tensor_outputs); // create convert nodes qnn_tensor_array_t mat_mul_tensor_inputs = _tensor_inputs; + qnn_tensor_array_t mat_mul_tensor_outputs = _tensor_outputs; if (!create_convert_nodes(device, graph_handle, tensor_rank, mat_mul_tensor_inputs, mat_mul_tensor_outputs)) { - QNN_LOG_ERROR("create convert nodes failed\n"); + QNN_LOG_ERROR("create convert nodes failed"); return false; } - mat_mul_tensor_inputs.front() = create_gather_nodes(device, graph_handle, tensor_rank, _tensor_inputs.front(), - _tensor_inputs.back()->get_dimensions()); + mat_mul_tensor_inputs.front() = + 
create_gather_nodes(device, graph_handle, tensor_rank, mat_mul_tensor_inputs.front(), + mat_mul_tensor_inputs.back()->get_dimensions()); return create_mat_mul_nodes(device, graph_handle, tensor_rank, mat_mul_tensor_inputs, mat_mul_tensor_outputs); } @@ -365,15 +374,15 @@ qnn_tensor_ptr_t ggml_qnn_matmul_op_config::create_gather_nodes(QNNBackend devic qnn_dimension_array_t intermediate_dimensions = input_dimensions; intermediate_dimensions[rank - 3] = output_dimensions[rank - 3]; qnn_tensor_ptr_t gather0_out; - _gather0 = create_node(_name + "_gather0", rank, rank - 3, intermediate_dimensions, tensor_input, device, - graph_handle, _qnn_instance, gather0_out); + _operations.push_back(create_node(_name + "_gather0", rank, rank - 3, intermediate_dimensions, tensor_input, device, + graph_handle, _qnn_instance, gather0_out)); if (rank == 3) { return gather0_out; } qnn_tensor_ptr_t gather1_out; - _gather1 = create_node(_name + "_gather1", rank, rank - 4, output_dimensions, gather0_out, device, graph_handle, - _qnn_instance, gather1_out); + _operations.push_back(create_node(_name + "_gather1", rank, rank - 4, output_dimensions, gather0_out, device, + graph_handle, _qnn_instance, gather1_out)); return gather1_out; } @@ -387,9 +396,8 @@ bool ggml_qnn_matmul_op_config::create_convert_nodes(QNNBackend device, Qnn_Grap // create tensors for convert node auto tensor_type = get_tensor_type(tensor_inputs); - QNN_LOG_DEBUG("input tensor type: %s\n", qnn_datatype_to_string(tensor_type)); + QNN_LOG_DEBUG("input tensor type: %s", qnn_datatype_to_string(tensor_type)); - _input_converts.resize(tensor_inputs.size()); for (size_t i = 0; i < tensor_inputs.size(); ++i) { // create input convert nodes auto convert_in = tensor_inputs[i]; @@ -406,7 +414,7 @@ bool ggml_qnn_matmul_op_config::create_convert_nodes(QNNBackend device, Qnn_Grap convert->set_input_tensors({convert_in}); convert->set_output_tensors({convert_out}); tensor_inputs[i] = convert_out; - _input_converts[i] = convert; + _operations.push_back(convert); } if (tensor_outputs.front()->get_data_type() != tensor_type) { @@ -421,7 +429,7 @@ bool ggml_qnn_matmul_op_config::create_convert_nodes(QNNBackend device, Qnn_Grap output_convert->set_input_tensors({convert_in}); output_convert->set_output_tensors({convert_out}); tensor_outputs.front() = convert_in; - _output_convert = output_convert; + _operations.push_back(output_convert); } return true; @@ -432,7 +440,7 @@ bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_Grap qnn_tensor_array_t &tensor_outputs) { /* - * First, both the ggml and qnn tensor in memory are stored as row-major format. (For more details, please also: + * First, both the ggml and qnn tensor in memory are stored as row-major format. (For more details, please refer to: * https://pytorch.org/blog/tensor-memory-format-matters/#:~:text=Column%20Major%20Order:%20In%20this%20format,%20the%20matrix) * But the dimensions of the tensor are stored in different order. 
* For example, a 2x3 matrix: @@ -515,81 +523,19 @@ bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_Grap transpose_out->set_input_tensors(tensors); transpose_out->set_output_tensors(tensor_outputs); - _mat_mul = mat_mul; - _transpose_out = transpose_out; + _operations.push_back(mat_mul); + _operations.push_back(transpose_out); return true; } -bool ggml_qnn_matmul_op_config::add_op_to_graph(Qnn_GraphHandle_t graph_handle) { - for (auto &convert : _input_converts) { - if (convert && !convert->add_op_to_graph(graph_handle)) { - return false; - } - } - - if (_gather0 && !_gather0->add_op_to_graph(graph_handle)) { - return false; - } - - if (_gather1 && !_gather1->add_op_to_graph(graph_handle)) { - return false; - } - - return _mat_mul->add_op_to_graph(graph_handle) && _transpose_out->add_op_to_graph(graph_handle) && - (!_output_convert || _output_convert->add_op_to_graph(graph_handle)); -} - -bool ggml_qnn_matmul_op_config::bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) { - return bind_tensors(tensor_inputs, _tensor_inputs, _qnn_tensor_inputs); -} - -bool ggml_qnn_matmul_op_config::bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) { - if (_output_convert) { - return _output_convert->bind_output_tensors(tensor_outputs); - } else { - return _transpose_out->bind_output_tensors(tensor_outputs); - } -} - -void ggml_qnn_matmul_op_config::unbind_input_tensors() { - _mat_mul->unbind_input_tensors(); - for (auto &convert : _input_converts) { - if (convert) { - convert->unbind_input_tensors(); - } - } -} - -void ggml_qnn_matmul_op_config::unbind_output_tensors() { - _transpose_out->unbind_output_tensors(); - if (_output_convert) { - _output_convert->unbind_output_tensors(); - } -} - -std::vector &ggml_qnn_matmul_op_config::get_qnn_output_tensors() { - if (_output_convert) { - return _output_convert->get_qnn_output_tensors(); - } else { - return _transpose_out->get_qnn_output_tensors(); - } -} - ggml_op_constructor_t create_op_constructor(const std::string &op_name) { if (op_name == QNN_OP_MAT_MUL) { // For QNN_OP_MAT_MUL, we need to transpose the input tensor return [](const std::string &instance_name, std::shared_ptr qnn_instance) -> std::unique_ptr { - QNN_LOG_DEBUG("create QNN_OP_MAT_MUL, name %s\n", instance_name.c_str()); + QNN_LOG_DEBUG("create QNN_OP_MAT_MUL, name %s", instance_name.c_str()); return std::make_unique(instance_name, qnn_instance); }; - } else if (op_name == QNN_OP_TRANSPOSE) { - return [](const std::string &instance_name, - std::shared_ptr qnn_instance) -> std::unique_ptr { - return std::make_unique( - instance_name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_TRANSPOSE, QNN_OP_TRANSPOSE_PARAM_PERM, - QNN_DATATYPE_UINT_32, 4 * sizeof(uint32_t), qnn_instance); - }; } return [op_name](const std::string &instance_name, diff --git a/ggml/src/ggml-qnn/op-config.hpp b/ggml/src/ggml-qnn/op-config.hpp index 27571563309a8..a05b75ade7e6a 100644 --- a/ggml/src/ggml-qnn/op-config.hpp +++ b/ggml/src/ggml-qnn/op-config.hpp @@ -82,21 +82,70 @@ class ggml_qnn_single_op_config : public ggml_qnn_op_config_base { DISABLE_MOVE(ggml_qnn_single_op_config); }; -class ggml_qnn_matmul_op_config : public ggml_qnn_op_config { +class ggml_qnn_aggregate_op_config : public ggml_qnn_op_config { public: - ggml_qnn_matmul_op_config(const std::string &name, std::shared_ptr qnn_instance) + explicit ggml_qnn_aggregate_op_config(const std::string &name, std::shared_ptr qnn_instance) : _name(name), _qnn_instance(qnn_instance) {} - bool initialize_op_nodes(QNNBackend 
device, Qnn_GraphHandle_t graph_handle, - const ggml_tensor_array_t &tensor_inputs, - const ggml_tensor_array_t &tensor_outputs) override; - bool add_op_to_graph(Qnn_GraphHandle_t graph_handle) override; + ~ggml_qnn_aggregate_op_config() { + _qnn_tensor_inputs.clear(); + _qnn_tensor_outputs.clear(); + _tensor_inputs.clear(); + _tensor_outputs.clear(); + _operations.clear(); + } + + bool add_op_to_graph(Qnn_GraphHandle_t graph_handle) override { + for (auto &op : _operations) { + if (!op->add_op_to_graph(graph_handle)) { + return false; + } + } + return true; + } + bool bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) override; + bool bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) override; - void unbind_input_tensors() override; - void unbind_output_tensors() override; + + void unbind_input_tensors() override { + for (auto &tensor : _tensor_inputs) { + tensor->unbind(); + } + } + + void unbind_output_tensors() override { + for (auto &tensor : _tensor_outputs) { + tensor->unbind(); + } + } + std::vector &get_qnn_input_tensors() override { return _qnn_tensor_inputs; } - std::vector &get_qnn_output_tensors() override; + std::vector &get_qnn_output_tensors() override { return _qnn_tensor_outputs; } + +protected: + std::string _name; + std::shared_ptr _qnn_instance; + + std::vector _operations; + qnn_tensor_array_t _tensor_inputs; + qnn_tensor_array_t _tensor_outputs; + std::vector _qnn_tensor_inputs; + std::vector _qnn_tensor_outputs; + +private: + DISABLE_COPY(ggml_qnn_aggregate_op_config); + DISABLE_MOVE(ggml_qnn_aggregate_op_config); +}; + +class ggml_qnn_matmul_op_config : public ggml_qnn_aggregate_op_config { +public: + ggml_qnn_matmul_op_config(const std::string &name, std::shared_ptr qnn_instance) + : ggml_qnn_aggregate_op_config(name, qnn_instance) {} + + bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, + const ggml_tensor_array_t &tensor_inputs, + const ggml_tensor_array_t &tensor_outputs) override; private: qnn_tensor_ptr_t create_gather_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, @@ -106,17 +155,6 @@ class ggml_qnn_matmul_op_config : public ggml_qnn_op_config { bool create_mat_mul_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, qnn_tensor_array_t &tensor_inputs, qnn_tensor_array_t &tensor_outputs); - std::string _name; - std::shared_ptr _qnn_instance; - qnn_op_config_ptr_t _transpose_out; - qnn_op_config_ptr_t _mat_mul; - qnn_op_config_ptr_t _gather0; - qnn_op_config_ptr_t _gather1; - std::vector _input_converts; - qnn_op_config_ptr_t _output_convert; - qnn_tensor_array_t _tensor_inputs; - std::vector _qnn_tensor_inputs; - DISABLE_COPY(ggml_qnn_matmul_op_config); DISABLE_MOVE(ggml_qnn_matmul_op_config); }; diff --git a/ggml/src/ggml-qnn/qnn-lib.hpp b/ggml/src/ggml-qnn/qnn-lib.hpp index c6801b7771ee9..454c0c6aa32c5 100644 --- a/ggml/src/ggml-qnn/qnn-lib.hpp +++ b/ggml/src/ggml-qnn/qnn-lib.hpp @@ -195,21 +195,21 @@ class qnn_instance { int qnn_init(const QnnSaver_Config_t **saver_config) { BackendIdType backend_id = QNN_BACKEND_ID_NULL; - QNN_LOG_DEBUG("enter qni_init\n"); + QNN_LOG_DEBUG("enter qnn_init"); std::lock_guard lock(_init_mutex); if (load_system() != 0) { - QNN_LOG_WARN("can not load QNN system lib, pls check why?\n"); + QNN_LOG_WARN("can not load QNN system lib, pls check why?"); return 1; } else { - QNN_LOG_DEBUG("load QNN system lib successfully\n"); + QNN_LOG_DEBUG("load QNN system lib successfully"); } std::string backend_lib_path = _lib_path + _backend_name; 
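// Summary of the stages qnn_init walks through and the distinct error codes they return,
// as visible in the surrounding hunks (codes not shown in these hunks are omitted):
//   1..3  - QNN system library / backend library could not be loaded or registered
//   4, 5  - qnn_log_create or qnn_backend_create returned a null handle
//   6     - profile handle creation failed (only when profiling is enabled)
//   8     - libcdsprpc.so (the rpcmem shared-memory library) could not be loaded
//   10    - qnn_context_create returned a null handle
// so a non-zero return from qnn_init pinpoints the failing stage without decoding QNN errors.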
if (_lib_path_to_backend_id.count(backend_lib_path) == 0) { int is_load_ok = load_backend(backend_lib_path, saver_config); if (is_load_ok != 0) { - QNN_LOG_WARN("failed to load QNN backend\n"); + QNN_LOG_WARN("failed to load QNN backend"); return 2; } } @@ -218,7 +218,7 @@ class qnn_instance { if (_loaded_backend.count(backend_id) == 0 || _loaded_lib_handle.count(backend_id) == 0) { QNN_LOG_WARN( "library %s is loaded but loaded backend count=%zu, " - "loaded lib_handle count=%zu\n", + "loaded lib_handle count=%zu", backend_lib_path.c_str(), _loaded_backend.count(backend_id), _loaded_lib_handle.count(backend_id)); return 3; } @@ -227,28 +227,28 @@ class qnn_instance { _qnn_interface->qnn_log_create(qnn::sdk_logcallback, _qnn_log_level, &_qnn_log_handle); if (nullptr == _qnn_log_handle) { // NPU backend not work on Qualcomm SoC equipped low-end phone - QNN_LOG_WARN("why failed to initialize qnn log\n"); + QNN_LOG_WARN("why failed to initialize qnn log"); return 4; } else { - QNN_LOG_DEBUG("initialize qnn log successfully\n"); + QNN_LOG_DEBUG("initialize qnn log successfully"); } std::vector temp_backend_config; _qnn_interface->qnn_backend_create( _qnn_log_handle, temp_backend_config.empty() ? nullptr : temp_backend_config.data(), &_qnn_backend_handle); if (nullptr == _qnn_backend_handle) { - QNN_LOG_WARN("why failed to initialize qnn backend\n"); + QNN_LOG_WARN("why failed to initialize qnn backend"); return 5; } else { - QNN_LOG_DEBUG("initialize qnn backend successfully\n"); + QNN_LOG_DEBUG("initialize qnn backend successfully"); } Qnn_ErrorHandle_t qnn_status = _qnn_interface->qnn_property_has_capability(QNN_PROPERTY_GROUP_DEVICE); if (QNN_PROPERTY_NOT_SUPPORTED == qnn_status) { - QNN_LOG_WARN("device property is not supported\n"); + QNN_LOG_WARN("device property is not supported"); } if (QNN_PROPERTY_ERROR_UNKNOWN_KEY == qnn_status) { - QNN_LOG_WARN("device property is not known to backend\n"); + QNN_LOG_WARN("device property is not known to backend"); } qnn_status = QNN_SUCCESS; @@ -294,9 +294,9 @@ class qnn_instance { qnn_status = _qnn_interface->qnn_device_create(_qnn_log_handle, nullptr, &_qnn_device_handle); } if (QNN_SUCCESS != qnn_status && QNN_DEVICE_ERROR_UNSUPPORTED_FEATURE != qnn_status) { - QNN_LOG_WARN("failed to create QNN device\n"); + QNN_LOG_WARN("failed to create QNN device"); } else { - QNN_LOG_INFO("create QNN device successfully\n"); + QNN_LOG_INFO("create QNN device successfully"); } if (_profile_level != sdk_profile_level::profile_off) { @@ -306,19 +306,19 @@ class qnn_instance { if (QNN_PROFILE_NO_ERROR != _qnn_interface->qnn_profile_create(_qnn_backend_handle, profile_level, &_qnn_profile_handle)) { - QNN_LOG_WARN("unable to create profile handle in the backend\n"); + QNN_LOG_WARN("unable to create profile handle in the backend"); return 6; } else { - QNN_LOG_DEBUG("initialize qnn profile successfully\n"); + QNN_LOG_DEBUG("initialize qnn profile successfully"); } } _rpc_lib_handle = dl_load("libcdsprpc.so"); if (nullptr == _rpc_lib_handle) { - QNN_LOG_WARN("failed to load qualcomm's rpc lib, error:%s\n", dl_error()); + QNN_LOG_WARN("failed to load qualcomm's rpc lib, error:%s", dl_error()); return 8; } else { - QNN_LOG_DEBUG("load rpcmem lib successfully\n"); + QNN_LOG_DEBUG("load rpcmem lib successfully"); set_rpcmem_initialized(true); } _pfn_rpc_mem_init = reinterpret_cast(dl_sym(_rpc_lib_handle, "rpcmem_init")); @@ -343,10 +343,10 @@ class qnn_instance { */ _qnn_interface->qnn_context_create(_qnn_backend_handle, _qnn_device_handle, nullptr, 
&_qnn_context_handle); if (nullptr == _qnn_context_handle) { - QNN_LOG_WARN("why failed to initialize qnn context\n"); + QNN_LOG_WARN("why failed to initialize qnn context"); return 10; } else { - QNN_LOG_DEBUG("initialize qnn context successfully\n"); + QNN_LOG_DEBUG("initialize qnn context successfully"); } if (_backend_name.find("Htp") != _backend_name.npos) { @@ -359,7 +359,7 @@ class qnn_instance { for (size_t idx = 0; idx < probe_counts; idx++) { rpc_buffer = static_cast(alloc_rpcmem(probe_slots[idx] * size_in_mb, sizeof(void *))); if (!rpc_buffer) { - QNN_LOG_DEBUG("alloc rpcmem %d (MB) failure, %s\n", probe_slots[idx], strerror(errno)); + QNN_LOG_DEBUG("alloc rpcmem %d (MB) failure, %s", probe_slots[idx], strerror(errno)); break; } else { candidate_size = probe_slots[idx]; @@ -369,7 +369,7 @@ class qnn_instance { } _rpcmem_capacity = std::max(candidate_size, _rpcmem_capacity); - QNN_LOG_INFO("capacity of QNN rpc ion memory is about %d MB\n", _rpcmem_capacity); + QNN_LOG_INFO("capacity of QNN rpc ion memory is about %d MB", _rpcmem_capacity); if (0 != init_htp_perfinfra()) { QNN_LOG_WARN("initialize HTP performance failure"); @@ -382,7 +382,7 @@ class qnn_instance { } } - QNN_LOG_DEBUG("leave qni_init\n"); + QNN_LOG_DEBUG("leave qnn_init"); return 0; } @@ -395,9 +395,9 @@ class qnn_instance { _pfn_rpc_mem_deinit(); if (dl_unload(_rpc_lib_handle) != 0) { - QNN_LOG_WARN("failed to unload qualcomm's rpc lib, error:%s\n", dl_error()); + QNN_LOG_WARN("failed to unload qualcomm's rpc lib, error:%s", dl_error()); } else { - QNN_LOG_DEBUG("succeed to close rpcmem lib\n"); + QNN_LOG_DEBUG("succeed to close rpcmem lib"); } if (_backend_name.find("Htp") != _backend_name.npos) { @@ -407,7 +407,7 @@ class qnn_instance { if (nullptr != _qnn_context_handle) { error = _qnn_interface->qnn_context_free(_qnn_context_handle, _qnn_profile_handle); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN context_handle: ID %u, error %d\n", _qnn_interface->get_backend_id(), + QNN_LOG_WARN("failed to free QNN context_handle: ID %u, error %d", _qnn_interface->get_backend_id(), QNN_GET_ERROR_CODE(error)); } _qnn_context_handle = nullptr; @@ -416,7 +416,7 @@ class qnn_instance { if (nullptr != _qnn_profile_handle) { error = _qnn_interface->qnn_profile_free(_qnn_profile_handle); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN profile_handle: ID %u, error %d\n", _qnn_interface->get_backend_id(), + QNN_LOG_WARN("failed to free QNN profile_handle: ID %u, error %d", _qnn_interface->get_backend_id(), QNN_GET_ERROR_CODE(error)); } _qnn_profile_handle = nullptr; @@ -425,7 +425,7 @@ class qnn_instance { if (nullptr != _qnn_device_handle) { error = _qnn_interface->qnn_device_free(_qnn_device_handle); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN device_handle: ID %u, error %d\n", _qnn_interface->get_backend_id(), + QNN_LOG_WARN("failed to free QNN device_handle: ID %u, error %d", _qnn_interface->get_backend_id(), QNN_GET_ERROR_CODE(error)); } _qnn_device_handle = nullptr; @@ -434,7 +434,7 @@ class qnn_instance { if (nullptr != _qnn_backend_handle) { error = _qnn_interface->qnn_backend_free(_qnn_backend_handle); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN backend_handle: ID %u, error %d\n", _qnn_interface->get_backend_id(), + QNN_LOG_WARN("failed to free QNN backend_handle: ID %u, error %d", _qnn_interface->get_backend_id(), QNN_GET_ERROR_CODE(error)); } _qnn_backend_handle = nullptr; @@ -443,7 +443,7 @@ class qnn_instance { if (nullptr != _qnn_log_handle) { error 
= _qnn_interface->qnn_log_free(_qnn_log_handle); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN log_handle: ID %u, error %d\n", _qnn_interface->get_backend_id(), + QNN_LOG_WARN("failed to free QNN log_handle: ID %u, error %d", _qnn_interface->get_backend_id(), QNN_GET_ERROR_CODE(error)); } _qnn_log_handle = nullptr; @@ -458,7 +458,7 @@ class qnn_instance { std::shared_ptr get_qnn_interface() { if (!_qnn_interface) { - QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + QNN_LOG_WARN("pls check why _qnn_interface is not loaded"); } return _qnn_interface; } @@ -479,10 +479,10 @@ class qnn_instance { QnnDevice_Infrastructure_t device_infra = nullptr; int error = _qnn_interface->qnn_device_get_infrastructure(&device_infra); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to get qnn device infra\n"); + QNN_LOG_WARN("failed to get qnn device infra"); return 1; } else { - QNN_LOG_INFO("HTP backend perf_infrastructure creation ok\n"); + QNN_LOG_INFO("HTP backend perf_infrastructure creation ok"); } QnnHtpDevice_Infrastructure_t *htp_infra = static_cast(device_infra); @@ -494,7 +494,7 @@ class qnn_instance { if (htp_infra->infraType != QNN_HTP_DEVICE_INFRASTRUCTURE_TYPE_PERF) { QNN_LOG_INFO("HTP infra type = %d, which is not perf infra type", htp_infra->infraType); } else { - QNN_LOG_INFO("HTP infra type = %d, which is perf infra type\n", htp_infra->infraType); + QNN_LOG_INFO("HTP infra type = %d, which is perf infra type", htp_infra->infraType); } _qnn_htp_perfinfra = htp_perfinfra; _qnn_power_configid = power_configid; @@ -520,12 +520,12 @@ class qnn_instance { nullptr}; Qnn_ErrorHandle_t qnn_status = _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, power_configs); if (qnn_status != QNN_SUCCESS) { - QNN_LOG_WARN("set htp perf failed\n"); + QNN_LOG_WARN("set htp perf failed"); } else { - QNN_LOG_DEBUG("set htp perf ok\n"); + QNN_LOG_DEBUG("set htp perf ok"); } } else { - QNN_LOG_WARN("can't set htp perf\n"); + QNN_LOG_WARN("can't set htp perf"); } return 0; @@ -533,7 +533,7 @@ class qnn_instance { int set_high_performance_mode() { if (nullptr == _qnn_htp_perfinfra) { - QNN_LOG_WARN("perf intra is null\n"); + QNN_LOG_WARN("perf intra is null"); return 1; } @@ -566,9 +566,9 @@ class qnn_instance { Qnn_ErrorHandle_t qnn_status = QNN_SUCCESS; qnn_status = _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, power_configs); if (qnn_status != QNN_SUCCESS) { - QNN_LOG_WARN("set htp high performance mode failed\n"); + QNN_LOG_WARN("set htp high performance mode failed"); } else { - QNN_LOG_DEBUG("set htp high performance mode ok\n"); + QNN_LOG_DEBUG("set htp high performance mode ok"); } return 0; @@ -584,21 +584,21 @@ class qnn_instance { void *alloc_rpcmem(size_t bytes, size_t alignment) { if (!_rpcmem_initialized) { - QNN_LOG_WARN("rpc memory not initialized\n"); + QNN_LOG_WARN("rpc memory not initialized"); return nullptr; } auto allocate_bytes = static_cast(bytes + alignment); void *buf = _pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, (int)allocate_bytes); if (!buf) { - QNN_LOG_WARN("failed to allocate rpc memory, size: %d MB\n", (int)(allocate_bytes / (1 << 20))); + QNN_LOG_WARN("failed to allocate rpc memory, size: %d MB", (int)(allocate_bytes / (1 << 20))); return nullptr; } auto aligned_buf = reinterpret_cast(qnn::align_to(alignment, reinterpret_cast(buf))); bool status = _rpcmem_store_map.insert(std::pair(aligned_buf, buf)).second; if (!status) { - QNN_LOG_WARN("failed to allocate rpc memory\n"); + QNN_LOG_WARN("failed to allocate rpc 
memory"); _pfn_rpc_mem_free(buf); } @@ -607,9 +607,9 @@ class qnn_instance { void free_rpcmem(void *buf) { if (!_rpcmem_initialized) { - QNN_LOG_WARN("rpc memory not initialized\n"); + QNN_LOG_WARN("rpc memory not initialized"); } else if (_rpcmem_store_map.count(buf) == 0) { - QNN_LOG_WARN("no allocated tensor\n"); + QNN_LOG_WARN("no allocated tensor"); } else { _pfn_rpc_mem_free(_rpcmem_store_map[buf]); _rpcmem_store_map.erase(buf); @@ -619,7 +619,7 @@ class qnn_instance { int32_t rpcmem_to_fd(void *buf) { int32_t mem_fd = -1; if (!is_rpcmem_initialized()) { - QNN_LOG_WARN("rpc memory not initialized\n"); + QNN_LOG_WARN("rpc memory not initialized"); } else { mem_fd = _pfn_rpc_mem_to_fd(buf); } @@ -629,52 +629,51 @@ class qnn_instance { Qnn_MemHandle_t register_rpcmem(void *p_data, const uint32_t rank, uint32_t *dimensions, Qnn_DataType_t data_type) { if (!p_data) { - QNN_LOG_WARN("invalid param\n"); + QNN_LOG_WARN("invalid param"); return nullptr; } if (!is_rpcmem_initialized()) { - QNN_LOG_WARN("rpc memory not initialized\n"); + QNN_LOG_WARN("rpc memory not initialized"); return nullptr; } if (is_rpcmem_registered(p_data)) { - QNN_LOG_WARN("rpc memory already registered\n"); + QNN_LOG_WARN("rpc memory already registered"); return _qnn_rpc_buffer_to_handles[p_data]; } auto mem_fd = rpcmem_to_fd(p_data); if (mem_fd == -1) { - QNN_LOG_WARN("failed to get file descriptor\n"); + QNN_LOG_WARN("failed to get file descriptor"); return nullptr; } - QNN_LOG_DEBUG("mem_fd %d\n", mem_fd); + QNN_LOG_DEBUG("mem_fd %d", mem_fd); Qnn_MemDescriptor_t descriptor = {{rank, dimensions, nullptr}, data_type, QNN_MEM_TYPE_ION, {{mem_fd}}}; Qnn_MemHandle_t handle = nullptr; auto error = _qnn_interface->qnn_mem_register(_qnn_context_handle, &descriptor, /*numDescriptors=*/1, &handle); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to register shared memory, error %d, %s\n", QNN_GET_ERROR_CODE(error), - strerror(error)); + QNN_LOG_WARN("failed to register shared memory, error %d, %s", QNN_GET_ERROR_CODE(error), strerror(error)); return nullptr; } _qnn_rpc_buffer_to_handles.insert({p_data, handle}); - QNN_LOG_DEBUG("successfully register shared memory handler: %p\n", handle); + QNN_LOG_DEBUG("successfully register shared memory handler: %p", handle); return handle; } void unregister_rpcmem(Qnn_MemHandle_t mem_handle) { Qnn_ErrorHandle_t error = _qnn_interface->qnn_mem_de_register(&mem_handle, 1); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to unregister shared memory, error %d\n", QNN_GET_ERROR_CODE(error)); + QNN_LOG_WARN("failed to unregister shared memory, error %d", QNN_GET_ERROR_CODE(error)); } auto it = std::find_if(_qnn_rpc_buffer_to_handles.begin(), _qnn_rpc_buffer_to_handles.end(), [mem_handle](const auto &kv) { return kv.second == mem_handle; }); if (it == _qnn_rpc_buffer_to_handles.end()) { - QNN_LOG_WARN("failed to find shared memory handler: %p\n", mem_handle); + QNN_LOG_WARN("failed to find shared memory handler: %p", mem_handle); return; } @@ -691,18 +690,18 @@ class qnn_instance { Qnn_ErrorHandle_t error = QNN_SUCCESS; std::string system_lib_path = _lib_path + "libQnnSystem.so"; - QNN_LOG_DEBUG("system_lib_path:%s\n", system_lib_path.c_str()); + QNN_LOG_DEBUG("system_lib_path:%s", system_lib_path.c_str()); auto system_lib_handle = dl_load(system_lib_path); if (!system_lib_handle) { - QNN_LOG_WARN("can not load QNN library %s, error: %s\n", system_lib_path.c_str(), dl_error()); + QNN_LOG_WARN("can not load QNN library %s, error: %s", system_lib_path.c_str(), dl_error()); return 1; } 
auto *get_providers = dl_sym_typed( system_lib_handle, "QnnSystemInterface_getProviders"); if (!get_providers) { - QNN_LOG_WARN("can not load QNN symbol QnnSystemInterface_getProviders: %s\n", dl_error()); + QNN_LOG_WARN("can not load QNN symbol QnnSystemInterface_getProviders: %s", dl_error()); return 2; } @@ -710,17 +709,17 @@ class qnn_instance { const QnnSystemInterface_t **provider_list = nullptr; error = get_providers(&provider_list, &num_providers); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to get providers, error %d\n", QNN_GET_ERROR_CODE(error)); + QNN_LOG_WARN("failed to get providers, error %d", QNN_GET_ERROR_CODE(error)); return 3; } if (num_providers != _required_num_providers) { - QNN_LOG_WARN("providers is %d instead of required %d\n", num_providers, _required_num_providers); + QNN_LOG_WARN("providers is %d instead of required %d", num_providers, _required_num_providers); return 4; } if (!provider_list) { - QNN_LOG_WARN("can not get providers\n"); + QNN_LOG_WARN("can not get providers"); return 5; } @@ -735,15 +734,15 @@ class qnn_instance { } } if (!found_valid_system_interface) { - QNN_LOG_WARN("unable to find a valid qnn system interface\n"); + QNN_LOG_WARN("unable to find a valid qnn system interface"); return 6; } else { - QNN_LOG_DEBUG("find a valid qnn system interface\n"); + QNN_LOG_DEBUG("find a valid qnn system interface"); } auto qnn_sys_interface = std::make_shared(*provider_list[0], system_lib_handle); if (!qnn_sys_interface->is_valid()) { - QNN_LOG_WARN("failed to create QNN system interface\n"); + QNN_LOG_WARN("failed to create QNN system interface"); return 7; } @@ -753,7 +752,7 @@ class qnn_instance { int load_backend(std::string &lib_path, const QnnSaver_Config_t ** /*saver_config*/) { Qnn_ErrorHandle_t error = QNN_SUCCESS; - QNN_LOG_DEBUG("lib_path:%s\n", lib_path.c_str()); + QNN_LOG_DEBUG("lib_path:%s", lib_path.c_str()); auto lib_handle = dl_load(lib_path.c_str()); if (!lib_handle) { @@ -775,14 +774,14 @@ class qnn_instance { QNN_LOG_WARN("failed to get providers, error %d", QNN_GET_ERROR_CODE(error)); return 3; } - QNN_LOG_DEBUG("num_providers=%d\n", num_providers); + QNN_LOG_DEBUG("num_providers=%d", num_providers); if (num_providers != _required_num_providers) { QNN_LOG_WARN("providers is %d instead of required %d", num_providers, _required_num_providers); return 4; } if (!provider_list) { - QNN_LOG_WARN("failed to get qnn interface providers\n"); + QNN_LOG_WARN("failed to get qnn interface providers"); return 5; } bool found_valid_interface = false; @@ -797,23 +796,23 @@ class qnn_instance { } if (!found_valid_interface) { - QNN_LOG_WARN("unable to find a valid qnn interface\n"); + QNN_LOG_WARN("unable to find a valid qnn interface"); return 6; } else { - QNN_LOG_DEBUG("find a valid qnn interface\n"); + QNN_LOG_DEBUG("find a valid qnn interface"); } BackendIdType backend_id = provider_list[0]->backendId; _lib_path_to_backend_id[lib_path] = backend_id; if (_loaded_backend.count(backend_id) > 0) { - QNN_LOG_WARN("lib_path %s is loaded, but backend %d already exists\n", lib_path.c_str(), backend_id); + QNN_LOG_WARN("lib_path %s is loaded, but backend %d already exists", lib_path.c_str(), backend_id); } _loaded_backend[backend_id] = provider_list[0]; if (_loaded_lib_handle.count(backend_id) > 0) { - QNN_LOG_WARN("closing %p\n", _loaded_lib_handle[backend_id]); + QNN_LOG_WARN("closing %p", _loaded_lib_handle[backend_id]); int dlclose_error = dl_unload(_loaded_lib_handle[backend_id]); if (dlclose_error != 0) { - QNN_LOG_WARN("fail to close %p with 
error %s\n", _loaded_lib_handle[backend_id], dl_error()); + QNN_LOG_WARN("fail to close %p with error %s", _loaded_lib_handle[backend_id], dl_error()); } } _loaded_lib_handle[backend_id] = lib_handle; @@ -827,7 +826,7 @@ class qnn_instance { for (auto &it : _loaded_lib_handle) { dlclose_error = dl_unload(it.second); if (dlclose_error != 0) { - QNN_LOG_WARN("failed to close QNN backend %d, error %s\n", it.first, dl_error()); + QNN_LOG_WARN("failed to close QNN backend %d, error %s", it.first, dl_error()); } } diff --git a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp index 0a9a367015127..833c620971e0d 100644 --- a/ggml/src/ggml-qnn/tensor.hpp +++ b/ggml/src/ggml-qnn/tensor.hpp @@ -32,10 +32,10 @@ class ggml_qnn_tensor { if (!_tensor_name.empty()) { QNN_TENSOR_SET_NAME(_qnn_tensor, _tensor_name.c_str()); } - QNN_TENSOR_SET_DIMENSIONS(_qnn_tensor, _dimensions.data()); - QNN_TENSOR_SET_DATA_FORMAT(_qnn_tensor, QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER); _dimensions = dimensions; + QNN_TENSOR_SET_DIMENSIONS(_qnn_tensor, _dimensions.data()); + QNN_TENSOR_SET_DATA_FORMAT(_qnn_tensor, QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER); update_params_from_ggml_tensor(tensor_type, data_type, rank); QNN_LOG_DEBUG("[%s][%s]created, rank: %d, dims: [%d, %d, %d, %d], type: %s", get_backend_name(device), _tensor_name.c_str(), rank, (int)_dimensions[0], (int)_dimensions[1], (int)_dimensions[2], @@ -51,7 +51,7 @@ class ggml_qnn_tensor { ~ggml_qnn_tensor() { _buffer_storage.clear(); unbind(); - _qnn_rpc_buffer.reset(); + _rpc_buffer.reset(); } bool set_data_buffer(std::vector &&buffer) { @@ -73,7 +73,7 @@ class ggml_qnn_tensor { auto qnn_interface = _qnn_instance->get_qnn_interface(); auto error = qnn_interface->qnn_tensor_create_graph_tensor(_graph_handle, &qnn_tensor); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("[%s]allocate id failed , error: %d\n", _tensor_name.c_str(), error); + QNN_LOG_WARN("[%s]allocate id failed , error: %d", _tensor_name.c_str(), error); return false; } @@ -162,21 +162,29 @@ class ggml_qnn_tensor { } if (should_use_mem_handle()) { - if (!_qnn_rpc_buffer) { - auto qnn_rpc_buffer = std::make_unique( + if (!_rpc_buffer) { + auto rpc_buffer = std::make_shared( _qnn_instance, buffer_size, QNN_TENSOR_GET_RANK(_qnn_tensor), QNN_TENSOR_GET_DIMENSIONS(_qnn_tensor), QNN_TENSOR_GET_DATA_TYPE(_qnn_tensor)); - if (!qnn_rpc_buffer->is_valid()) { - QNN_LOG_WARN("[%s]alloc rpc mem failed", _tensor_name.c_str()); + if (!rpc_buffer->is_valid()) { + QNN_LOG_WARN("[%s][%s]alloc rpc mem failed", get_backend_name(_device), _tensor_name.c_str()); return false; } - _qnn_rpc_buffer = std::move(qnn_rpc_buffer); + _rpc_buffer = std::move(rpc_buffer); } QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_MEMHANDLE); - QNN_TENSOR_SET_MEM_HANDLE(_qnn_tensor, _qnn_rpc_buffer->get_mem_handle()); - QNN_LOG_DEBUG("[%s]use mem handle %p", _tensor_name.c_str(), QNN_TENSOR_GET_MEM_HANDLE(_qnn_tensor)); + auto mem_handle = _rpc_buffer->get_mem_handle(); + if (!mem_handle) { + QNN_LOG_WARN("[%s][%s]can't find rpcmem from qnn mem handle", get_backend_name(_device), + _tensor_name.c_str()); + return false; + } + + QNN_TENSOR_SET_MEM_HANDLE(_qnn_tensor, mem_handle); + QNN_LOG_DEBUG("[%s][%s]use mem handle %p", get_backend_name(_device), _tensor_name.c_str(), + QNN_TENSOR_GET_MEM_HANDLE(_qnn_tensor)); } else { QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW); Qnn_ClientBuffer_t client_buf = {buffer, (uint32_t)buffer_size}; @@ -205,13 +213,8 @@ class ggml_qnn_tensor { return true; } - if (should_use_mem_handle()) { - 
if (_qnn_rpc_buffer) { - memcpy(_qnn_rpc_buffer->get_buffer(), _buffer, _buffer_size); - } else { - QNN_LOG_WARN("[%s]can't find rpcmem from qnn mem handle\n", _tensor_name.c_str()); - return false; - } + if (_rpc_buffer) { + memcpy(_rpc_buffer->get_buffer(), _buffer, _buffer_size); } // For CPU and GPU, the data is already in the tensor. @@ -226,13 +229,8 @@ class ggml_qnn_tensor { return true; } - if (should_use_mem_handle()) { - if (_qnn_rpc_buffer) { - memcpy(_buffer, _qnn_rpc_buffer->get_buffer(), _buffer_size); - } else { - QNN_LOG_WARN("[%s]can't find rpcmem from qnn mem handle", _tensor_name.c_str()); - return false; - } + if (_rpc_buffer) { + memcpy(_buffer, _rpc_buffer->get_buffer(), _buffer_size); } // For CPU and GPU, the data is already in the tensor. @@ -283,7 +281,7 @@ class ggml_qnn_tensor { Qnn_Tensor_t _qnn_tensor = qnn_tensor_init(kDefaultQnnTensorVersion); qnn_dimension_array_t _dimensions = {}; Qnn_GraphHandle_t _graph_handle = nullptr; - std::unique_ptr _qnn_rpc_buffer; + qnn_buffer_ptr _rpc_buffer; DISABLE_COPY(ggml_qnn_tensor); DISABLE_MOVE(ggml_qnn_tensor); diff --git a/ggml/src/ggml-qnn/utils.cpp b/ggml/src/ggml-qnn/utils.cpp index ebfc0372375fd..eaabe60cdb262 100644 --- a/ggml/src/ggml-qnn/utils.cpp +++ b/ggml/src/ggml-qnn/utils.cpp @@ -1,6 +1,8 @@ #include "utils.hpp" +#include + #include #include "ggml-qnn.h" @@ -37,6 +39,28 @@ qnn_dimension_array_t get_internal_dimension(const ggml_dimension_array_t &dims, return internal_dims; } +qnn_dimension_array_t get_view_internal_dimension(const ggml_tensor *tensor, size_t &element_offset_out) { + + element_offset_out = 0; + + auto *parent_tensor = tensor; + while (parent_tensor->view_src) { + element_offset_out += parent_tensor->view_offs; + parent_tensor = parent_tensor->view_src; + } + + const auto rank = get_ggml_tensor_rank(tensor); + const auto parent_rank = get_ggml_tensor_rank(parent_tensor); + GGML_ASSERT(parent_tensor->type == tensor->type); + GGML_ASSERT(parent_rank == rank); + + const auto block_size = ggml_blck_size(tensor->type); + element_offset_out = + element_offset_out * block_size / tensor->nb[0]; // calculate the element offset in the view tensor + + return get_internal_dimension(parent_tensor->ne, parent_rank); +} + // TODO: mapping more ggml data type to QNN data type // ref:explanation of k-quants, https://github.com/ggerganov/llama.cpp/pull/1684 Qnn_DataType_t qnn_datatype_from_ggml_datatype(ggml_type ggml_type) { @@ -199,6 +223,12 @@ intptr_t align_to(size_t alignment, intptr_t offset) { uint32_t get_ggml_tensor_data_size(const ggml_tensor *tensor) { return ggml_nbytes(tensor); } +void *page_align_alloc(size_t size) { + // TODO: fix this for other platforms + const size_t alignment = sysconf(_SC_PAGESIZE); + return align_alloc(alignment, size); +} + void *align_alloc(size_t alignment, size_t size) { size_t size_aligned = size; if ((size_aligned % alignment) != 0) { diff --git a/ggml/src/ggml-qnn/utils.hpp b/ggml/src/ggml-qnn/utils.hpp index 40dff321b970e..1ec0af4c96f77 100644 --- a/ggml/src/ggml-qnn/utils.hpp +++ b/ggml/src/ggml-qnn/utils.hpp @@ -21,9 +21,11 @@ namespace qnn { using ggml_dimension_array_t = int64_t[GGML_MAX_DIMS]; +using ggml_stride_array_t = size_t[GGML_MAX_DIMS]; using qnn_dimension_array_t = std::array; qnn_dimension_array_t get_internal_dimension(const ggml_dimension_array_t &dims, uint32_t rank); +qnn_dimension_array_t get_view_internal_dimension(const ggml_tensor *tensor, size_t &element_offser_out); uint32_t get_ggml_tensor_rank(const ggml_tensor *tensor); const char 
*get_ggml_type_name(ggml_type type); @@ -33,6 +35,7 @@ const char *get_htparch_desc(size_t htp_arch); intptr_t align_to(size_t alignment, intptr_t offset); uint32_t get_ggml_tensor_data_size(const ggml_tensor *tensor); +void *page_align_alloc(size_t size); void *align_alloc(size_t alignment, size_t size); void align_free(void *ptr); From 79f124a6999b5931a301c7cbdecd52142c2f737a Mon Sep 17 00:00:00 2001 From: hongruichen Date: Sat, 14 Dec 2024 15:49:44 +0800 Subject: [PATCH 136/143] add missing op --- ggml/src/ggml-qnn/backend-ops.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 990338c953524..6bd9006851cc1 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -183,6 +183,7 @@ constexpr const char *kGgmlOpToQnnOp[] = { nullptr, // GGML_OP_POOL_2D_BACK nullptr, // GGML_OP_UPSCALE nullptr, // GGML_OP_PAD + nullptr, // GGML_OP_PAD_REFLECT_1D nullptr, // GGML_OP_ARANGE nullptr, // GGML_OP_TIMESTEP_EMBEDDING nullptr, // GGML_OP_ARGSORT @@ -196,7 +197,7 @@ constexpr const char *kGgmlOpToQnnOp[] = { nullptr, // GGML_OP_WIN_UNPART nullptr, // GGML_OP_GET_REL_POS nullptr, // GGML_OP_ADD_REL_POS - nullptr, // GGML_OP_RWKV_WKV + nullptr, // GGML_OP_RWKV_WKV6 nullptr, // GGML_OP_UNARY @@ -392,6 +393,7 @@ constexpr const ggml_qnn_unary_op_t kQnnUnaryOpsTable[] = { nullptr, // GGML_OP_POOL_2D_BACK nullptr, // GGML_OP_UPSCALE nullptr, // GGML_OP_PAD + nullptr, // GGML_OP_PAD_REFLECT_1D nullptr, // GGML_OP_ARANGE nullptr, // GGML_OP_TIMESTEP_EMBEDDING nullptr, // GGML_OP_ARGSORT @@ -405,7 +407,7 @@ constexpr const ggml_qnn_unary_op_t kQnnUnaryOpsTable[] = { nullptr, // GGML_OP_WIN_UNPART nullptr, // GGML_OP_GET_REL_POS nullptr, // GGML_OP_ADD_REL_POS - nullptr, // GGML_OP_RWKV_WKV + nullptr, // GGML_OP_RWKV_WKV6 nullptr, // GGML_OP_UNARY @@ -503,6 +505,7 @@ constexpr const ggml_qnn_binary_op_t kQnnBinaryOpsTable[] = { nullptr, // GGML_OP_POOL_2D_BACK nullptr, // GGML_OP_UPSCALE nullptr, // GGML_OP_PAD + nullptr, // GGML_OP_PAD_REFLECT_1D nullptr, // GGML_OP_ARANGE nullptr, // GGML_OP_TIMESTEP_EMBEDDING nullptr, // GGML_OP_ARGSORT From f2d8d017da4d69cb8af2faa43b8f59f828d34b10 Mon Sep 17 00:00:00 2001 From: nullname Date: Fri, 10 Jan 2025 11:13:25 +0800 Subject: [PATCH 137/143] [feat] Port ggml graph to QNN graph (#16) * more log * split graph implementation into cpp file * rename: ggml_qnn_graph -> qnn_graph * add input/output tensor to graph * fix assert * wip * add _ggml_tensor field in qnn tensor * add comments * add set_data_buffer with raw memory buffer * use set_data_buffer * op param buffer use qnn_buffer_ptr * add qnn_mem_buffer_slice * use qnn_buffer_ptr as tensor buffer * use new set_data_buffer to reduce copy * ggml_qnn_op_config: add function to set input/output tensor before init node * remove ggml_qnn_connectable_op_config and use ggml_qnn_single_op_config instead * wip * add initialize_op_nodes without tensor params * wip * add op caps table * merge kGgmlOpToQnnOp and kOpCaps tables * wip * add cache parameter to create_tensors * add init_from_ggml_graph * disable gelu for all backend * wip * move op index calc to op config module * use the ggml_tensor as parameter of build_graph * add log * use create_operation_from_op_tensor in old build_graph function * remove unused constructors * fix parameter count * remove unused member func/var * make init_from_ggml_graph as a class member: build_graph_from_ggml_graph * move graph finalize into member function
`finalize()` * get graph key from ggml op tensor directly * append output type * reduce tensor key length * add function to generate key from ggml_cgraph * simplify graph cache insert and delete * remove template param at get_qnn_graph_from_cache * wip * merge kQnnUnaryOpsTable and kQnnBinaryOpsTable * refactor device_supports_op * add log * wip * use framework function to check same shape * wip * extract some logic into separated function * wip * add execution function that runs graph * add function to create qnn graph from ggml_cgraph with cache * execute graph directly * return null graph key for empty graph * add more qualcomm chipset enums * add cap for reshape * disable some ops * try to skip GGML_OP_VIEW * more log for view tensor * append param tensor into intermediate tensor key * use 'ordered' set * fix warning in release * wip --- ggml/src/ggml-qnn/backend-ops.cpp | 752 ++++++++++----------------- ggml/src/ggml-qnn/backend.hpp | 4 +- ggml/src/ggml-qnn/buffer.hpp | 76 ++- ggml/src/ggml-qnn/ggml-qnn.cpp | 12 +- ggml/src/ggml-qnn/graph.cpp | 386 ++++++++++++++ ggml/src/ggml-qnn/graph.hpp | 155 +----- ggml/src/ggml-qnn/op-config-base.hpp | 55 +- ggml/src/ggml-qnn/op-config-caps.cpp | 223 ++++++++ ggml/src/ggml-qnn/op-config.cpp | 226 +++----- ggml/src/ggml-qnn/op-config.hpp | 76 +-- ggml/src/ggml-qnn/qnn-types.hpp | 16 +- ggml/src/ggml-qnn/tensor.hpp | 179 +++++-- ggml/src/ggml-qnn/utils.cpp | 12 +- 13 files changed, 1310 insertions(+), 862 deletions(-) create mode 100644 ggml/src/ggml-qnn/graph.cpp create mode 100644 ggml/src/ggml-qnn/op-config-caps.cpp diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 6bd9006851cc1..1ed01bfd6851d 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -11,12 +11,10 @@ #include "tensor.hpp" #include "utils.hpp" -#ifndef NDEBUG - namespace { -bool qnn_is_valid_params(ggml_backend_qnn_device_context *ctx, const ggml_tensor *src, ggml_tensor *dst) { - if (!ctx || !src || !dst) { +bool qnn_is_op_valid(ggml_backend_qnn_device_context *ctx, const ggml_tensor *dst) { + if (!ctx || !dst) { QNN_LOG_WARN("invalid params"); return false; } @@ -27,243 +25,151 @@ bool qnn_is_valid_params(ggml_backend_qnn_device_context *ctx, const ggml_tensor return false; } - return true; -} - -bool qnn_is_valid_params(ggml_backend_qnn_device_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) { - if (!ctx || !src0 || !src1 || !dst) { - QNN_LOG_WARN("invalid params"); - return false; - } - - auto instance = ctx->instance; - if (!instance) { - QNN_LOG_WARN("invalid instance"); - return false; + const auto param_count = qnn::get_qnn_op_input_param_count(qnn::get_qnn_op_index(dst)); + switch (param_count) { + case 1: + return dst->src[0]; + case 2: + return dst->src[0] && dst->src[1]; + default: + QNN_LOG_WARN("invalid op param count %d", (int)param_count); + break; } - return true; + return false; } +#ifndef NDEBUG void print_ggml_tensor(const ggml_tensor *tensor) { QNN_LOG_DEBUG("%s: type:%s ne: %ldx%ldx%ldx%ld, nb: %ldx%ldx%ldx%ld", tensor->name, ggml_type_name(tensor->type), (long)tensor->ne[0], (long)tensor->ne[1], (long)tensor->ne[2], (long)tensor->ne[3], (long)tensor->nb[0], (long)tensor->nb[1], (long)tensor->nb[2], (long)tensor->nb[3]); } +#endif } // namespace -#define CHECK_PARAMS(ctx, ...) \ if (!qnn_is_valid_params((ctx), __VA_ARGS__)) { \ return false; \ } - -#else -#define CHECK_PARAMS(ctx, ...) 
-#endif - namespace { -bool is_tensor_dimensions_equal(const ggml_tensor *l, const ggml_tensor *r) { - const auto dim_l = ggml_n_dims(l); - if (dim_l != ggml_n_dims(r)) { - return false; - } +typedef bool (*ggml_qnn_op_t)(ggml_backend_qnn_device_context *ctx, ggml_tensor *dst); - for (int i = 0; i < dim_l; i++) { - if (l->ne[i] != r->ne[i]) { - return false; - } +bool execute_graph(qnn::qnn_graph *graph, ggml_tensor *output) { + if (!graph->execute(output)) { + QNN_LOG_WARN("execute failed"); + return false; } return true; } -typedef bool (*ggml_qnn_unary_op_t)(ggml_backend_qnn_device_context *ctx, ggml_tensor *src, ggml_tensor *dst); -typedef bool (*ggml_qnn_binary_op_t)(ggml_backend_qnn_device_context *ctx, ggml_tensor *src0, ggml_tensor *src1, - ggml_tensor *dst); - -typedef const ggml_qnn_unary_op_t (&ggml_qnn_unary_op_array_t)[GGML_OP_COUNT + GGML_UNARY_OP_COUNT]; -typedef const ggml_qnn_binary_op_t (&ggml_qnn_binary_op_array_t)[GGML_OP_COUNT]; - -constexpr const size_t kGgmlUnaryOpStart = GGML_OP_COUNT; - -template -qnn::ggml_tensor_array_t to_ggml_tensor_array(const std::array &array) { - return qnn::ggml_tensor_array_t(array.data(), array.data() + _Size); +void append_tensor_dimensions(const ggml_tensor *tensor, std::string &output) { + char buffer[256] = {}; + const auto *type_name = qnn::get_ggml_type_name(tensor->type); + int len = 0; + switch (ggml_n_dims(tensor)) { + case 1: + len = snprintf(buffer, sizeof(buffer), "%ld%s", (long)tensor->ne[0], type_name); + break; + case 2: + len = snprintf(buffer, sizeof(buffer), "%ldx%ld%s", (long)tensor->ne[0], (long)tensor->ne[1], type_name); + break; + case 3: + len = snprintf(buffer, sizeof(buffer), "%ldx%ldx%ld%s", (long)tensor->ne[0], (long)tensor->ne[1], + (long)tensor->ne[2], type_name); + break; + case 4: + default: + len = snprintf(buffer, sizeof(buffer), "%ldx%ldx%ldx%ld%s", (long)tensor->ne[0], (long)tensor->ne[1], + (long)tensor->ne[2], (long)tensor->ne[3], type_name); + break; + } + GGML_ASSERT(len > 0 && len < (int)sizeof(buffer)); + output.append(buffer, len); } -template -bool execute_graph(qnn::ggml_qnn_graph *graph, const std::array &inputs, - ggml_tensor *output) { - if (!graph->execute(to_ggml_tensor_array<_InputSize>(inputs), to_ggml_tensor_array<1>({output}))) { - QNN_LOG_WARN("execute failed"); - return false; +void get_graph_key_from_op(const ggml_tensor *op, std::string &output) { + GGML_ASSERT(op->op != GGML_OP_NONE); + output += ggml_op_desc(op); + output += qnn::get_ggml_type_name(op->type); + const auto param_count = qnn::get_qnn_op_input_param_count(qnn::get_qnn_op_index(op)); + for (size_t i = 0; i < param_count; ++i) { + auto *input = op->src[i]; + output += '_'; + append_tensor_dimensions(input, output); } - - return true; } -template -std::string get_graph_key(const std::string &op_name, const std::array &inputs, - const std::array &outputs) { - constexpr static const auto append_dimensions = [](std::string &key, const ggml_tensor *tensor) { - char buffer[256] = {}; - snprintf(buffer, sizeof(buffer), "_%ldx%ldx%ldx%ld%s", (long)tensor->ne[0], (long)tensor->ne[1], - (long)tensor->ne[2], (long)tensor->ne[3], qnn::get_ggml_type_name(tensor->type)); - key += buffer; - }; - - std::string graph_key(op_name); - for (auto &input : inputs) { - append_dimensions(graph_key, input); +void get_op_key_with_src_op_desc(const ggml_tensor *op, std::string &output) { + output += ggml_op_desc(op); + output += '('; + if (op->src[0]) { + output += ggml_op_desc(op->src[0]); } - - graph_key += 
qnn::get_ggml_type_name(outputs.front()->type); - return graph_key; + for (size_t i = 1; i < GGML_MAX_DIMS && op->src[i]; ++i) { + output += ','; + output += ggml_op_desc(op->src[i]); + } + output += ')'; } -constexpr const char *kGgmlOpToQnnOp[] = { - nullptr, // GGML_OP_NONE - nullptr, // GGML_OP_DUP - QNN_OP_ELEMENT_WISE_ADD, // GGML_OP_ADD - nullptr, // GGML_OP_ADD1 - nullptr, // GGML_OP_ACC - QNN_OP_ELEMENT_WISE_SUBTRACT, // GGML_OP_SUB - QNN_OP_ELEMENT_WISE_MULTIPLY, // GGML_OP_MUL - QNN_OP_ELEMENT_WISE_DIVIDE, // GGML_OP_DIV - nullptr, // GGML_OP_SQR - QNN_OP_ELEMENT_WISE_SQUARE_ROOT, // GGML_OP_SQRT - QNN_OP_ELEMENT_WISE_LOG, // GGML_OP_LOG - nullptr, // GGML_OP_SIN - nullptr, // GGML_OP_COS - nullptr, // GGML_OP_SUM - nullptr, // GGML_OP_SUM_ROWS - nullptr, // GGML_OP_MEAN - nullptr, // GGML_OP_ARGMAX - nullptr, // GGML_OP_COUNT_EQUAL - nullptr, // GGML_OP_REPEAT - nullptr, // GGML_OP_REPEAT_BACK - nullptr, // GGML_OP_CONCAT - nullptr, // GGML_OP_SILU_BACK - nullptr, // GGML_OP_NORM - nullptr, // GGML_OP_RMS_NORM - nullptr, // GGML_OP_RMS_NORM_BACK - nullptr, // GGML_OP_GROUP_NORM - - QNN_OP_MAT_MUL, // GGML_OP_MUL_MAT - nullptr, // GGML_OP_MUL_MAT_ID - nullptr, // GGML_OP_OUT_PROD - - nullptr, // GGML_OP_SCALE - nullptr, // GGML_OP_SET - nullptr, // GGML_OP_CPY - nullptr, // GGML_OP_CONT - nullptr, // GGML_OP_RESHAPE - nullptr, // GGML_OP_VIEW - QNN_OP_TRANSPOSE, // GGML_OP_PERMUTE - nullptr, // GGML_OP_TRANSPOSE - nullptr, // GGML_OP_GET_ROWS - nullptr, // GGML_OP_GET_ROWS_BACK - nullptr, // GGML_OP_DIAG - nullptr, // GGML_OP_DIAG_MASK_INF - nullptr, // GGML_OP_DIAG_MASK_ZERO - nullptr, // GGML_OP_SOFT_MAX - nullptr, // GGML_OP_SOFT_MAX_BACK - nullptr, // GGML_OP_ROPE - nullptr, // GGML_OP_ROPE_BACK - nullptr, // GGML_OP_CLAMP - nullptr, // GGML_OP_CONV_TRANSPOSE_1D - nullptr, // GGML_OP_IM2COL - nullptr, // GGML_OP_IM2COL_BACK - nullptr, // GGML_OP_CONV_TRANSPOSE_2D - nullptr, // GGML_OP_POOL_1D - nullptr, // GGML_OP_POOL_2D - nullptr, // GGML_OP_POOL_2D_BACK - nullptr, // GGML_OP_UPSCALE - nullptr, // GGML_OP_PAD - nullptr, // GGML_OP_PAD_REFLECT_1D - nullptr, // GGML_OP_ARANGE - nullptr, // GGML_OP_TIMESTEP_EMBEDDING - nullptr, // GGML_OP_ARGSORT - nullptr, // GGML_OP_LEAKY_RELU - - nullptr, // GGML_OP_FLASH_ATTN_EXT - nullptr, // GGML_OP_FLASH_ATTN_BACK - nullptr, // GGML_OP_SSM_CONV - nullptr, // GGML_OP_SSM_SCAN - nullptr, // GGML_OP_WIN_PART - nullptr, // GGML_OP_WIN_UNPART - nullptr, // GGML_OP_GET_REL_POS - nullptr, // GGML_OP_ADD_REL_POS - nullptr, // GGML_OP_RWKV_WKV6 - - nullptr, // GGML_OP_UNARY - - nullptr, // GGML_OP_MAP_UNARY - nullptr, // GGML_OP_MAP_BINARY - - nullptr, // GGML_OP_MAP_CUSTOM1_F32 - nullptr, // GGML_OP_MAP_CUSTOM2_F32 - nullptr, // GGML_OP_MAP_CUSTOM3_F32 - - nullptr, // GGML_OP_MAP_CUSTOM1 - nullptr, // GGML_OP_MAP_CUSTOM2 - nullptr, // GGML_OP_MAP_CUSTOM3 +void get_graph_key_from_cgraph(const ggml_cgraph *cgraph, std::string &output) { + // generate key from the graph, the key is used to cache the graph, like: + // "MUL_MATf32_256x16x10f32_256x1x10f32#LOG#ADD#ADDf32_16x1x10f32" + if (cgraph->n_nodes == 0) { + QNN_LOG_DEBUG("empty cgraph"); + return; + } - nullptr, // GGML_OP_CROSS_ENTROPY_LOSS - nullptr, // GGML_OP_CROSS_ENTROPY_LOSS_BACK - nullptr, // GGML_OP_OPT_STEP_ADAMW + { + bool is_start = true; + for (int i = 0; i < cgraph->n_nodes; ++i) { + auto *op = cgraph->nodes[i]; + if (ggml_is_empty(op)) { + QNN_LOG_DEBUG("empty op in graph, skipping"); + continue; + } - // ggml_unary_op - nullptr, // GGML_UNARY_OP_ABS - nullptr, // 
GGML_UNARY_OP_SGN - nullptr, // GGML_UNARY_OP_NEG - nullptr, // GGML_UNARY_OP_STEP - nullptr, // GGML_UNARY_OP_TANH - nullptr, // GGML_UNARY_OP_ELU - nullptr, // GGML_UNARY_OP_RELU - nullptr, // GGML_UNARY_OP_SIGMOID - QNN_OP_GELU, // GGML_UNARY_OP_GELU - nullptr, // GGML_UNARY_OP_GELU_QUICK - nullptr, // GGML_UNARY_OP_SILU - nullptr, // GGML_UNARY_OP_HARDSWISH - nullptr, // GGML_UNARY_OP_HARDSIGMOID - nullptr, // GGML_UNARY_OP_EXP -}; + if (op->op == GGML_OP_NONE) { + QNN_LOG_DEBUG("GGML_OP_NONE in graph, skipping"); + continue; + } -static_assert(sizeof(kGgmlOpToQnnOp) / sizeof(kGgmlOpToQnnOp[0]) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT), - "GGML_OP_COUNT does not match the size of the kGgmlOpToQnnOp table"); -static_assert(kGgmlOpToQnnOp[GGML_UNARY_OP_GELU + kGgmlUnaryOpStart] != nullptr, - "GGML_UNARY_OP_GELU does not correspond to QNN_OP_GELU"); + if (is_start) { + get_graph_key_from_op(cgraph->nodes[0], output); + is_start = false; + } else { + output += '#'; + get_op_key_with_src_op_desc(op, output); + } + } + } -template -qnn::ggml_qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_device_context *ctx, size_t op, - const std::array &inputs, - ggml_tensor *output) { - GGML_ASSERT(op < (GGML_OP_COUNT + GGML_UNARY_OP_COUNT)); + if (cgraph->n_nodes > 1) { + auto *last_op = cgraph->nodes[cgraph->n_nodes - 1]; + output += qnn::get_ggml_type_name(last_op->type); + output += '_'; + append_tensor_dimensions(last_op, output); + } +} +qnn::qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_device_context *ctx, ggml_tensor *output) { auto &graph_cache = ctx->qnn_graph_cache; - const auto *op_name = - op < kGgmlUnaryOpStart ? ggml_op_name(ggml_op(op)) : ggml_unary_op_name(ggml_unary_op(op - kGgmlUnaryOpStart)); - auto graph_key = get_graph_key<_InputSize, 1>(op_name, inputs, {output}); + std::string graph_key; + get_graph_key_from_op(output, graph_key); auto it = graph_cache.find(graph_key); - qnn::ggml_qnn_graph *graph_ptr = nullptr; + qnn::qnn_graph *graph_ptr = nullptr; if (it != graph_cache.end()) { QNN_LOG_DEBUG("[%s]found graph %s in cache", qnn::get_backend_name(ctx->device), graph_key.c_str()); graph_ptr = it->second.get(); } else { auto graph = - std::make_unique(graph_key, ctx->device, ctx->instance, ctx->socinfo.vtcm_size_in_mb); + std::make_unique(graph_key, ctx->device, ctx->instance, ctx->socinfo.vtcm_size_in_mb); if (!graph->is_valid()) { return nullptr; } - auto op_constructor = qnn::create_op_constructor(kGgmlOpToQnnOp[op]); - if (!graph->build_graph(op_constructor, to_ggml_tensor_array<_InputSize>(inputs), - to_ggml_tensor_array<1>({output}))) { - QNN_LOG_ERROR("[%s]build_graph failed", qnn::get_backend_name(ctx->device)); + if (!graph->build_graph_from_op(output)) { + QNN_LOG_ERROR("[%s]build_graph_from_op failed", qnn::get_backend_name(ctx->device)); return nullptr; } @@ -274,44 +180,54 @@ qnn::ggml_qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_device_context *c return graph_ptr; } -template -bool qnn_binary_op_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *src0, ggml_tensor *src1, ggml_tensor *dst) { - static_assert(kGgmlOpToQnnOp[_GgmlOp] != nullptr, "GGML_OP does not have a corresponding QNN_OP"); +qnn::qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_device_context *ctx, const ggml_cgraph *cgraph) { + auto &graph_cache = ctx->qnn_graph_cache; + std::string graph_key; + get_graph_key_from_cgraph(cgraph, graph_key); + if (graph_key.empty()) { + QNN_LOG_DEBUG("[%s]empty graph key for cgraph: %p, size: %d", qnn::get_backend_name(ctx->device), cgraph, + 
(int)cgraph->n_nodes); + return nullptr; + } - CHECK_PARAMS(ctx, src0, src1, dst); + auto it = graph_cache.find(graph_key); + qnn::qnn_graph *graph_ptr = nullptr; + if (it != graph_cache.end()) { + QNN_LOG_DEBUG("[%s]found graph %s in cache", qnn::get_backend_name(ctx->device), graph_key.c_str()); + graph_ptr = it->second.get(); + } else { + auto graph = + std::make_unique(graph_key, ctx->device, ctx->instance, ctx->socinfo.vtcm_size_in_mb); + if (!graph->is_valid()) { + return nullptr; + } - bool succeed = false; - auto *graph_ptr = get_qnn_graph_from_cache<2>(ctx, _GgmlOp, {src0, src1}, dst); - if (graph_ptr) { - succeed = execute_graph<2>(graph_ptr, {src0, src1}, dst); - } + if (!graph->build_graph_from_ggml_graph(cgraph)) { + QNN_LOG_ERROR("[%s]build_graph_from_op failed", qnn::get_backend_name(ctx->device)); + return nullptr; + } -#ifndef NDEBUG - if (!succeed) { - print_ggml_tensor(src0); - print_ggml_tensor(src1); - print_ggml_tensor(dst); + graph_ptr = graph.get(); + graph_cache[graph_key] = std::move(graph); } -#endif - return succeed; + return graph_ptr; } -template -bool qnn_unary_op_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *src, ggml_tensor *dst) { - static_assert(kGgmlOpToQnnOp[_GgmlOp] != nullptr, "GGML_OP does not have a corresponding QNN_OP"); - - CHECK_PARAMS(ctx, src, dst); - - bool succeed = false; - auto *graph_ptr = get_qnn_graph_from_cache<1>(ctx, _GgmlOp, {src}, dst); - if (graph_ptr) { - succeed = execute_graph<1>(graph_ptr, {src}, dst); +bool qnn_generic_op_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *dst) { + if (!qnn_is_op_valid(ctx, dst)) { + return false; } + auto *graph_ptr = get_qnn_graph_from_cache(ctx, dst); + bool succeed = graph_ptr && execute_graph(graph_ptr, dst); + #ifndef NDEBUG if (!succeed) { - print_ggml_tensor(src); + const auto param_count = qnn::get_qnn_op_input_param_count(qnn::get_qnn_op_index(dst)); + for (size_t i = 0; i < param_count; ++i) { + print_ggml_tensor(dst->src[i]); + } print_ggml_tensor(dst); } #endif @@ -319,85 +235,76 @@ bool qnn_unary_op_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *src, g return succeed; } -bool qnn_unary_nop_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *src, ggml_tensor *dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(src); - GGML_UNUSED(dst); - return true; -} - -bool qnn_binary_nop_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *src0, ggml_tensor *src1, ggml_tensor *dst) { +bool qnn_nop_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *dst) { GGML_UNUSED(ctx); - GGML_UNUSED(src0); - GGML_UNUSED(src1); GGML_UNUSED(dst); return true; } -constexpr const ggml_qnn_unary_op_t kQnnUnaryOpsTable[] = { - qnn_unary_nop_impl, // GGML_OP_NONE - nullptr, // GGML_OP_DUP - nullptr, // GGML_OP_ADD - nullptr, // GGML_OP_ADD1 - nullptr, // GGML_OP_ACC - nullptr, // GGML_OP_SUB - nullptr, // GGML_OP_MUL - nullptr, // GGML_OP_DIV - nullptr, // GGML_OP_SQR - qnn_unary_op_impl, // GGML_OP_SQRT - qnn_unary_op_impl, // GGML_OP_LOG - nullptr, // GGML_OP_SIN - nullptr, // GGML_OP_COS - nullptr, // GGML_OP_SUM - nullptr, // GGML_OP_SUM_ROWS - nullptr, // GGML_OP_MEAN - nullptr, // GGML_OP_ARGMAX - nullptr, // GGML_OP_COUNT_EQUAL - nullptr, // GGML_OP_REPEAT - nullptr, // GGML_OP_REPEAT_BACK - nullptr, // GGML_OP_CONCAT - nullptr, // GGML_OP_SILU_BACK - nullptr, // GGML_OP_NORM - nullptr, // GGML_OP_RMS_NORM - nullptr, // GGML_OP_RMS_NORM_BACK - nullptr, // GGML_OP_GROUP_NORM - - nullptr, // GGML_OP_MUL_MAT - nullptr, // GGML_OP_MUL_MAT_ID - nullptr, // GGML_OP_OUT_PROD - - nullptr, 
// GGML_OP_SCALE - nullptr, // GGML_OP_SET - nullptr, // GGML_OP_CPY - nullptr, // GGML_OP_CONT - qnn_unary_nop_impl, // GGML_OP_RESHAPE - qnn_unary_nop_impl, // GGML_OP_VIEW - qnn_unary_nop_impl, // GGML_OP_PERMUTE - qnn_unary_nop_impl, // GGML_OP_TRANSPOSE - qnn_unary_nop_impl, // GGML_OP_GET_ROWS - nullptr, // GGML_OP_GET_ROWS_BACK - nullptr, // GGML_OP_DIAG - nullptr, // GGML_OP_DIAG_MASK_INF - nullptr, // GGML_OP_DIAG_MASK_ZERO - nullptr, // GGML_OP_SOFT_MAX - nullptr, // GGML_OP_SOFT_MAX_BACK - nullptr, // GGML_OP_ROPE - nullptr, // GGML_OP_ROPE_BACK - nullptr, // GGML_OP_CLAMP - nullptr, // GGML_OP_CONV_TRANSPOSE_1D - nullptr, // GGML_OP_IM2COL - nullptr, // GGML_OP_IM2COL_BACK - nullptr, // GGML_OP_CONV_TRANSPOSE_2D - nullptr, // GGML_OP_POOL_1D - nullptr, // GGML_OP_POOL_2D - nullptr, // GGML_OP_POOL_2D_BACK - nullptr, // GGML_OP_UPSCALE - nullptr, // GGML_OP_PAD - nullptr, // GGML_OP_PAD_REFLECT_1D - nullptr, // GGML_OP_ARANGE - nullptr, // GGML_OP_TIMESTEP_EMBEDDING - nullptr, // GGML_OP_ARGSORT - nullptr, // GGML_OP_LEAKY_RELU +constexpr const ggml_qnn_op_t kQnnOpsTable[] = { + qnn_nop_impl, // GGML_OP_NONE + nullptr, // GGML_OP_DUP + qnn_generic_op_impl, // GGML_OP_ADD + nullptr, // GGML_OP_ADD1 + nullptr, // GGML_OP_ACC + qnn_generic_op_impl, // GGML_OP_SUB + qnn_generic_op_impl, // GGML_OP_MUL + qnn_generic_op_impl, // GGML_OP_DIV + nullptr, // GGML_OP_SQR + qnn_generic_op_impl, // GGML_OP_SQRT + qnn_generic_op_impl, // GGML_OP_LOG + nullptr, // GGML_OP_SIN + nullptr, // GGML_OP_COS + nullptr, // GGML_OP_SUM + nullptr, // GGML_OP_SUM_ROWS + nullptr, // GGML_OP_MEAN + nullptr, // GGML_OP_ARGMAX + nullptr, // GGML_OP_COUNT_EQUAL + nullptr, // GGML_OP_REPEAT + nullptr, // GGML_OP_REPEAT_BACK + nullptr, // GGML_OP_CONCAT + nullptr, // GGML_OP_SILU_BACK + nullptr, // GGML_OP_NORM + nullptr, // GGML_OP_RMS_NORM + nullptr, // GGML_OP_RMS_NORM_BACK + nullptr, // GGML_OP_GROUP_NORM + + qnn_generic_op_impl, // GGML_OP_MUL_MAT + nullptr, // GGML_OP_MUL_MAT_ID + nullptr, // GGML_OP_OUT_PROD + + nullptr, // GGML_OP_SCALE + nullptr, // GGML_OP_SET + nullptr, // GGML_OP_CPY + nullptr, // GGML_OP_CONT + qnn_nop_impl, // GGML_OP_RESHAPE + nullptr, // GGML_OP_VIEW + nullptr, // GGML_OP_PERMUTE + nullptr, // GGML_OP_TRANSPOSE + nullptr, // GGML_OP_GET_ROWS + nullptr, // GGML_OP_GET_ROWS_BACK + nullptr, // GGML_OP_DIAG + nullptr, // GGML_OP_DIAG_MASK_INF + nullptr, // GGML_OP_DIAG_MASK_ZERO + nullptr, // GGML_OP_SOFT_MAX + nullptr, // GGML_OP_SOFT_MAX_BACK + nullptr, // GGML_OP_ROPE + nullptr, // GGML_OP_ROPE_BACK + nullptr, // GGML_OP_CLAMP + nullptr, // GGML_OP_CONV_TRANSPOSE_1D + nullptr, // GGML_OP_IM2COL + nullptr, // GGML_OP_IM2COL_BACK + nullptr, // GGML_OP_CONV_TRANSPOSE_2D + nullptr, // GGML_OP_POOL_1D + nullptr, // GGML_OP_POOL_2D + nullptr, // GGML_OP_POOL_2D_BACK + nullptr, // GGML_OP_UPSCALE + nullptr, // GGML_OP_PAD + nullptr, // GGML_OP_PAD_REFLECT_1D + nullptr, // GGML_OP_ARANGE + nullptr, // GGML_OP_TIMESTEP_EMBEDDING + nullptr, // GGML_OP_ARGSORT + nullptr, // GGML_OP_LEAKY_RELU nullptr, // GGML_OP_FLASH_ATTN_EXT nullptr, // GGML_OP_FLASH_ATTN_BACK @@ -407,7 +314,7 @@ constexpr const ggml_qnn_unary_op_t kQnnUnaryOpsTable[] = { nullptr, // GGML_OP_WIN_UNPART nullptr, // GGML_OP_GET_REL_POS nullptr, // GGML_OP_ADD_REL_POS - nullptr, // GGML_OP_RWKV_WKV6 + nullptr, // GGML_OP_RWKV_WKV nullptr, // GGML_OP_UNARY @@ -427,120 +334,34 @@ constexpr const ggml_qnn_unary_op_t kQnnUnaryOpsTable[] = { nullptr, // GGML_OP_OPT_STEP_ADAMW // ggml_unary_op - nullptr, // GGML_UNARY_OP_ABS - 
nullptr, // GGML_UNARY_OP_SGN - nullptr, // GGML_UNARY_OP_NEG - nullptr, // GGML_UNARY_OP_STEP - nullptr, // GGML_UNARY_OP_TANH - nullptr, // GGML_UNARY_OP_ELU - nullptr, // GGML_UNARY_OP_RELU - nullptr, // GGML_UNARY_OP_SIGMOID - qnn_unary_op_impl, // GGML_UNARY_OP_GELU - nullptr, // GGML_UNARY_OP_GELU_QUICK - nullptr, // GGML_UNARY_OP_SILU - nullptr, // GGML_UNARY_OP_HARDSWISH - nullptr, // GGML_UNARY_OP_HARDSIGMOID - nullptr, // GGML_UNARY_OP_EXP + nullptr, // GGML_UNARY_OP_ABS + nullptr, // GGML_UNARY_OP_SGN + nullptr, // GGML_UNARY_OP_NEG + nullptr, // GGML_UNARY_OP_STEP + nullptr, // GGML_UNARY_OP_TANH + nullptr, // GGML_UNARY_OP_ELU + nullptr, // GGML_UNARY_OP_RELU + nullptr, // GGML_UNARY_OP_SIGMOID + qnn_generic_op_impl, // GGML_UNARY_OP_GELU + nullptr, // GGML_UNARY_OP_GELU_QUICK + nullptr, // GGML_UNARY_OP_SILU + nullptr, // GGML_UNARY_OP_HARDSWISH + nullptr, // GGML_UNARY_OP_HARDSIGMOID + nullptr, // GGML_UNARY_OP_EXP }; -static_assert(sizeof(kQnnUnaryOpsTable) / sizeof(kQnnUnaryOpsTable[0]) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT), - "GGML_OP_COUNT does not match the size of the kQnnUnaryOpsTable table"); - -constexpr const ggml_qnn_binary_op_t kQnnBinaryOpsTable[] = { - nullptr, // GGML_OP_NONE - nullptr, // GGML_OP_DUP - qnn_binary_op_impl, // GGML_OP_ADD - nullptr, // GGML_OP_ADD1 - nullptr, // GGML_OP_ACC - qnn_binary_op_impl, // GGML_OP_SUB - qnn_binary_op_impl, // GGML_OP_MUL - qnn_binary_op_impl, // GGML_OP_DIV - nullptr, // GGML_OP_SQR - nullptr, // GGML_OP_SQRT - nullptr, // GGML_OP_LOG - nullptr, // GGML_OP_SIN - nullptr, // GGML_OP_COS - nullptr, // GGML_OP_SUM - nullptr, // GGML_OP_SUM_ROWS - nullptr, // GGML_OP_MEAN - nullptr, // GGML_OP_ARGMAX - nullptr, // GGML_OP_COUNT_EQUAL - nullptr, // GGML_OP_REPEAT - nullptr, // GGML_OP_REPEAT_BACK - nullptr, // GGML_OP_CONCAT - nullptr, // GGML_OP_SILU_BACK - nullptr, // GGML_OP_NORM - nullptr, // GGML_OP_RMS_NORM - nullptr, // GGML_OP_RMS_NORM_BACK - nullptr, // GGML_OP_GROUP_NORM - - qnn_binary_op_impl, // GGML_OP_MUL_MAT - nullptr, // GGML_OP_MUL_MAT_ID - nullptr, // GGML_OP_OUT_PROD - - nullptr, // GGML_OP_SCALE - nullptr, // GGML_OP_SET - nullptr, // GGML_OP_CPY - nullptr, // GGML_OP_CONT - nullptr, // GGML_OP_RESHAPE - nullptr, // GGML_OP_VIEW - nullptr, // GGML_OP_PERMUTE - nullptr, // GGML_OP_TRANSPOSE - nullptr, // GGML_OP_GET_ROWS - nullptr, // GGML_OP_GET_ROWS_BACK - nullptr, // GGML_OP_DIAG - nullptr, // GGML_OP_DIAG_MASK_INF - nullptr, // GGML_OP_DIAG_MASK_ZERO - nullptr, // GGML_OP_SOFT_MAX - nullptr, // GGML_OP_SOFT_MAX_BACK - nullptr, // GGML_OP_ROPE - nullptr, // GGML_OP_ROPE_BACK - nullptr, // GGML_OP_CLAMP - nullptr, // GGML_OP_CONV_TRANSPOSE_1D - nullptr, // GGML_OP_IM2COL - nullptr, // GGML_OP_IM2COL_BACK - nullptr, // GGML_OP_CONV_TRANSPOSE_2D - nullptr, // GGML_OP_POOL_1D - nullptr, // GGML_OP_POOL_2D - nullptr, // GGML_OP_POOL_2D_BACK - nullptr, // GGML_OP_UPSCALE - nullptr, // GGML_OP_PAD - nullptr, // GGML_OP_PAD_REFLECT_1D - nullptr, // GGML_OP_ARANGE - nullptr, // GGML_OP_TIMESTEP_EMBEDDING - nullptr, // GGML_OP_ARGSORT - nullptr, // GGML_OP_LEAKY_RELU - - nullptr, // GGML_OP_FLASH_ATTN_EXT - nullptr, // GGML_OP_FLASH_ATTN_BACK - nullptr, // GGML_OP_SSM_CONV - nullptr, // GGML_OP_SSM_SCAN - nullptr, // GGML_OP_WIN_PART - nullptr, // GGML_OP_WIN_UNPART - nullptr, // GGML_OP_GET_REL_POS - nullptr, // GGML_OP_ADD_REL_POS - nullptr, // GGML_OP_RWKV_WKV - - nullptr, // GGML_OP_UNARY - - nullptr, // GGML_OP_MAP_UNARY - nullptr, // GGML_OP_MAP_BINARY - - nullptr, // GGML_OP_MAP_CUSTOM1_F32 - nullptr, 
// GGML_OP_MAP_CUSTOM2_F32 - nullptr, // GGML_OP_MAP_CUSTOM3_F32 - - nullptr, // GGML_OP_MAP_CUSTOM1 - nullptr, // GGML_OP_MAP_CUSTOM2 - nullptr, // GGML_OP_MAP_CUSTOM3 - - nullptr, // GGML_OP_CROSS_ENTROPY_LOSS - nullptr, // GGML_OP_CROSS_ENTROPY_LOSS_BACK - nullptr, // GGML_OP_OPT_STEP_ADAMW -}; - -static_assert(sizeof(kQnnBinaryOpsTable) / sizeof(kQnnBinaryOpsTable[0]) == GGML_OP_COUNT, - "GGML_OP_COUNT does not match the size of the kQnnBinaryOpsTable table"); +static_assert(kQnnOpsTable[GGML_OP_NONE] == qnn_nop_impl, "GGML_OP_NONE does not match the qnn_nop_impl function"); +static_assert(kQnnOpsTable[GGML_OP_ADD] == qnn_generic_op_impl, + "GGML_OP_ADD does not match the qnn_generic_op_impl function"); +static_assert(kQnnOpsTable[GGML_OP_MUL] == qnn_generic_op_impl, + "GGML_OP_MUL does not match the qnn_generic_op_impl function"); +static_assert(kQnnOpsTable[GGML_OP_MUL_MAT] == qnn_generic_op_impl, + "GGML_OP_MUL_MAT does not match the qnn_generic_op_impl function"); +static_assert(kQnnOpsTable[GGML_OP_RESHAPE] == qnn_nop_impl, + "GGML_OP_RESHAPE does not match the qnn_nop_impl function"); +static_assert(kQnnOpsTable[GGML_OP_VIEW] == nullptr, "GGML_OP_VIEW is not nullptr"); +static_assert(std::size(kQnnOpsTable) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT), + "GGML_OP_COUNT does not match the size of the kQnnOpsTable table"); bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context *ctx, const ggml_tensor *tensor) { if (!tensor) { @@ -548,6 +369,7 @@ bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context *ctx, const ggml_t return false; } +#ifndef NDEBUG if (tensor->view_src) { auto *src_tensor = tensor->view_src; QNN_LOG_DEBUG("[%s]tensor(%s_%dx%dx%dx%d) is a view, src: %s_%dx%dx%dx%d", qnn::get_backend_name(ctx->device), @@ -555,6 +377,7 @@ bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context *ctx, const ggml_t ggml_get_name(src_tensor), src_tensor->ne[0], src_tensor->ne[1], src_tensor->ne[2], src_tensor->ne[3]); } +#endif switch (tensor->type) { case GGML_TYPE_F32: @@ -576,6 +399,25 @@ bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context *ctx, const ggml_t return true; } +bool ggnl_qnn_supports_op_tensor(ggml_backend_qnn_device_context *ctx, const ggml_tensor *op) { + if (op->op == GGML_OP_NONE) { + return true; + } + + if (!ggml_qnn_supports_tensor(ctx, op)) { + return false; + } + + const auto param_count = qnn::get_qnn_op_input_param_count(qnn::get_qnn_op_index(op)); + for (size_t i = 0; i < param_count; ++i) { + if (!ggml_qnn_supports_tensor(ctx, op->src[i])) { + return false; + } + } + + return true; +} + bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor *op) { constexpr const size_t kMaxNpuTensorSize = 8192L * 2048 + 8192 * 512 + 2048 * 512; constexpr const auto get_tensor_size = [](const ggml_tensor *tensor) -> size_t { @@ -591,11 +433,11 @@ bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context *ctx, const ggm * TODO: remove the blocker here when NPU backend supports mul_mat like this: * [ne03, ne02, n, k] * [ne03 * x, ne02 * y, m, k] -> [ne03 * x, ne02 * y, m, n] */ - QNN_LOG_DEBUG("[qnn-npu] src0 and src1 dimensions are not equal, support/unsupported: %d/%d", + QNN_LOG_DEBUG("[qnn-npu][MUL_MAT]src0 and src1 dimensions are not equal, support/unsupported: %d/%d", ctx->support_op_count.load(), ++(ctx->unsupported_op_count)); return false; } else if (get_tensor_size(src0) + get_tensor_size(src1) + get_tensor_size(op) >= kMaxNpuTensorSize) { - QNN_LOG_DEBUG("[qnn-npu] tensor size is too large, 
support/unsupported: %d/%d", + QNN_LOG_DEBUG("[qnn-npu][MUL_MAT]tensor size is too large, support/unsupported: %d/%d", ctx->support_op_count.load(), ++(ctx->unsupported_op_count)); return false; } @@ -604,9 +446,9 @@ bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context *ctx, const ggm case QNN_BACKEND_GPU: if (src0->type != src1->type || src0->type != op->type) { // there's no convert op for GPU. - QNN_LOG_DEBUG("[qnn-gpu]type src0(%d), src1(%d) and op(%d) are not equal, support/unsupported: %d/%d", - src0->type, src1->type, op->type, ctx->support_op_count.load(), - ++(ctx->unsupported_op_count)); + QNN_LOG_DEBUG( + "[qnn-gpu][MUL_MAT]type src0(%d), src1(%d) and op(%d) are not equal, support/unsupported: %d/%d", + src0->type, src1->type, op->type, ctx->support_op_count.load(), ++(ctx->unsupported_op_count)); return false; } break; @@ -615,12 +457,12 @@ bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context *ctx, const ggm } if ((src1->ne[2] % src0->ne[2]) != 0 || (src1->ne[3] % src0->ne[3]) != 0) { - QNN_LOG_DEBUG("[%s] src0 and src1 dimensions are not equal, support/unsupported: %d/%d", + QNN_LOG_DEBUG("[%s][MUL_MAT]src0 and src1 dimensions are not equal, support/unsupported: %d/%d", qnn::get_backend_name(ctx->device), ctx->support_op_count.load(), ++(ctx->unsupported_op_count)); return false; } - QNN_LOG_DEBUG("[%s] supported matmul op, support/unsupported: %d/%d", qnn::get_backend_name(ctx->device), + QNN_LOG_DEBUG("[%s][MUL_MAT]supported matmul op, support/unsupported: %d/%d", qnn::get_backend_name(ctx->device), ++(ctx->support_op_count), ctx->unsupported_op_count.load()); return true; } @@ -635,41 +477,30 @@ bool device_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor return true; } - auto *src0 = op->src[0]; - if (op->op == GGML_OP_UNARY) { - const auto unary_op = ggml_get_unary_op(op); - if (unary_op == GGML_UNARY_OP_GELU && ctx->device == QNN_BACKEND_NPU) { - // TODO: fix this when NPU supports GELU - QNN_LOG_DEBUG("unsupported unary op GGML_UNARY_OP_GELU for NPU"); - return false; - } + if (!kQnnOpsTable[qnn::get_qnn_op_index(op)]) { + QNN_LOG_DEBUG("[%s]unsupported op", ggml_op_name(op->op)); + return false; + } - if (!kQnnUnaryOpsTable[kGgmlUnaryOpStart + unary_op]) { - QNN_LOG_DEBUG("unsupported unary op %d", unary_op); - return false; - } + if (!ggnl_qnn_supports_op_tensor(ctx, op)) { + QNN_LOG_DEBUG("[%s]unsupported tensor", ggml_op_name(op->op)); + return false; + } - if (!ggml_qnn_supports_tensor(ctx, src0) || !ggml_qnn_supports_tensor(ctx, op)) { - QNN_LOG_DEBUG("src0 is nullptr"); + if (op->op == GGML_OP_UNARY) { + const auto unary_op = ggml_get_unary_op(op); + if (unary_op == GGML_UNARY_OP_GELU) { + // TODO: fix this + QNN_LOG_DEBUG("[GELU]unsupported unary op GGML_UNARY_OP_GELU for NPU"); return false; } } else { - if (!kQnnUnaryOpsTable[op->op] && !kQnnBinaryOpsTable[op->op]) { - QNN_LOG_DEBUG("[%s] unsupported op", ggml_op_name(op->op)); - return false; - } - + auto *src0 = op->src[0]; auto *src1 = op->src[1]; - if (!ggml_qnn_supports_tensor(ctx, src0) || !ggml_qnn_supports_tensor(ctx, op) || - (kQnnBinaryOpsTable[op->op] && !ggml_qnn_supports_tensor(ctx, src1))) { - QNN_LOG_DEBUG("[%s] unsupported tensor", ggml_op_name(op->op)); - return false; - } - switch (op->op) { case GGML_OP_ADD: - if (!is_tensor_dimensions_equal(src0, src1)) { - QNN_LOG_DEBUG("src0 and src1 dimensions are not equal"); + if (!ggml_are_same_shape(src0, src1)) { + QNN_LOG_DEBUG("[ADD] src0 and src1 dimensions are not equal"); return false; } break; @@ 
-686,34 +517,13 @@ bool device_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor } bool device_compute_graph(ggml_backend_qnn_device_context *ctx, ggml_cgraph *cgraph) { - QNN_LOG_DEBUG("[%s]compute graph, nodes count: %d", qnn::get_backend_name(ctx->device), (int)cgraph->n_nodes); - for (int i = 0; i < cgraph->n_nodes; i++) { - ggml_tensor *tensor = cgraph->nodes[i]; - if (ggml_is_empty(tensor)) { - continue; - } + QNN_LOG_DEBUG("[%s]compute graph start, nodes count: %d", qnn::get_backend_name(ctx->device), (int)cgraph->n_nodes); - size_t unary_op_idx = tensor->op; - if (tensor->op == GGML_OP_UNARY) { - unary_op_idx = kGgmlUnaryOpStart + ggml_get_unary_op(tensor); - } - - bool ok = false; - auto unary_op = kQnnUnaryOpsTable[unary_op_idx]; - auto binary_op = kQnnBinaryOpsTable[tensor->op]; - if (unary_op) { - ok = unary_op(ctx, tensor->src[0], tensor); - } else if (binary_op) { - ok = binary_op(ctx, tensor->src[0], tensor->src[1], tensor); - } + auto qnn_graph = get_qnn_graph_from_cache(ctx, cgraph); + bool success = qnn_graph && qnn_graph->execute(cgraph); - if (!ok) { - QNN_LOG_WARN("[%s]unsupported op %s", qnn::get_backend_name(ctx->device), ggml_op_desc(tensor)); - return false; - } - } - - return true; + QNN_LOG_DEBUG("[%s]compute graph, success: %d", qnn::get_backend_name(ctx->device), (int)success); + return success; } } // namespace qnn diff --git a/ggml/src/ggml-qnn/backend.hpp b/ggml/src/ggml-qnn/backend.hpp index 17823ed577aaa..df5e2eb08fb8f 100644 --- a/ggml/src/ggml-qnn/backend.hpp +++ b/ggml/src/ggml-qnn/backend.hpp @@ -19,7 +19,7 @@ #include "qnn-lib.hpp" namespace qnn { -typedef std::unordered_map> ggml_qnn_graph_cache_t; +typedef std::unordered_map> qnn_graph_cache_t; } // namespace qnn struct ggml_backend_qnn_device_context { @@ -35,7 +35,7 @@ struct ggml_backend_qnn_device_context { std::shared_ptr instance; std::shared_ptr qnn_interface; - qnn::ggml_qnn_graph_cache_t qnn_graph_cache; + qnn::qnn_graph_cache_t qnn_graph_cache; #ifndef NDEBUG std::atomic_uint32_t support_op_count = 0; diff --git a/ggml/src/ggml-qnn/buffer.hpp b/ggml/src/ggml-qnn/buffer.hpp index 9573e160b4176..af165b394eefb 100644 --- a/ggml/src/ggml-qnn/buffer.hpp +++ b/ggml/src/ggml-qnn/buffer.hpp @@ -8,18 +8,65 @@ namespace qnn { +/** + * @brief An interface for managing generic QNN buffers. + * + * This abstract class defines the interface for managing generic memory buffers in a QNN context. + */ class qnn_buffer_interface { public: virtual ~qnn_buffer_interface() = default; + /** + * @brief Checks if the buffer is valid. + * + * This pure virtual function must be implemented by derived classes to check + * the validity of the buffer. + * + * @return true if the buffer is valid, false otherwise. + */ virtual bool is_valid() const = 0; + + /** + * @brief Gets the buffer pointer. + * + * This pure virtual function must be implemented by derived classes to return + * a pointer to the buffer. + * + * @return A pointer to the buffer. + */ virtual uint8_t *get_buffer() = 0; + + /** + * @brief Gets the buffer pointer. + * + * This pure virtual function must be implemented by derived classes to return + * a pointer to the buffer. + * + * @return A pointer to the buffer. + */ virtual size_t get_size() const = 0; + + /** + * @brief Gets the QNN memory handle associated with the buffer. + * + * This pure virtual function must be implemented by derived classes to return + * the memory handle associated with the buffer. 
+ * + * @return The memory handle, or null if no valid QNN memory handle is attached. + */ virtual Qnn_MemHandle_t get_mem_handle() const = 0; }; using qnn_buffer_ptr = std::shared_ptr; +/** + * @brief A class for managing QNN RPC memory buffers. + * + * This class is responsible for allocating, registering, and managing a buffer in RPC memory. + * It ensures that the buffer is properly allocated and registered with the QNN instance, and + * handles cleanup of the buffer and its associated memory handle upon destruction. + */ class qnn_rpc_buffer : public qnn_buffer_interface { public: qnn_rpc_buffer(std::shared_ptr qnn_instance, const size_t size, const uint32_t rank, @@ -29,7 +76,7 @@ class qnn_rpc_buffer : public qnn_buffer_interface { _qnn_rpc_buffer = static_cast(qnn_instance->alloc_rpcmem(size, alignof(uint8_t *))); _qnn_rpc_mem_handle = qnn_instance->register_rpcmem(_qnn_rpc_buffer, rank, dimensions, data_type); if (!_qnn_rpc_buffer || !_qnn_rpc_mem_handle) { - QNN_LOG_WARN("register rpc mem failure"); + QNN_LOG_WARN("Failed to register RPC memory: buffer or memory handle is null"); // let the destructor free the buffer return; } @@ -64,6 +111,13 @@ class qnn_rpc_buffer : public qnn_buffer_interface { DISABLE_MOVE(qnn_rpc_buffer); }; +/** + * @brief A class for managing QNN memory buffers allocated in regular memory. + * + * This class is responsible for allocating, managing, and freeing memory buffers + * in regular (non-RPC) memory. It implements the qnn_buffer_interface to provide + * a consistent interface for buffer management. + */ class qnn_mem_buffer : public qnn_buffer_interface { public: explicit qnn_mem_buffer(const uint8_t *data, size_t size) { @@ -102,4 +156,24 @@ class qnn_mem_buffer : public qnn_buffer_interface { DISABLE_MOVE(qnn_mem_buffer); }; +class qnn_mem_buffer_slice : public qnn_buffer_interface { +public: + qnn_mem_buffer_slice(const uint8_t *buffer, size_t size) : _buffer(const_cast(buffer)), _size(size) {} + + bool is_valid() const override { return _buffer && _size; } + + uint8_t *get_buffer() override { return _buffer; } + + size_t get_size() const override { return _size; } + + Qnn_MemHandle_t get_mem_handle() const override { return nullptr; } + +private: + uint8_t *_buffer = nullptr; + size_t _size = 0; + + DISABLE_COPY(qnn_mem_buffer_slice); + DISABLE_MOVE(qnn_mem_buffer_slice); +}; + } // namespace qnn diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index 933016a62878e..b3673eb35a5f3 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -222,6 +222,9 @@ bool ggml_backend_qnn_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_ GGML_UNUSED(backend_dst); GGML_UNUSED(src); GGML_UNUSED(dst); + + QNN_LOG_DEBUG("copy from %s to %s, src_is_qnn: %d, dst_is_qnn: %d", ggml_get_name(src), ggml_get_name(dst), + (int)ggml_backend_is_qnn(backend_src), (int)ggml_backend_is_qnn(backend_dst)); return false; } @@ -317,8 +320,6 @@ ggml_guid_t ggml_backend_qnn_guid() { return &guid; } -bool ggml_backend_is_qnn(ggml_backend_t backend) { return ggml_guid_matches(backend->guid, ggml_backend_qnn_guid()); } - ggml_backend_t ggml_backend_qnn_init_with_device_context(ggml_backend_dev_t dev, const char *extend_lib_search_path) { if (!extend_lib_search_path) { extend_lib_search_path = GGML_QNN_DEFAULT_LIB_SEARCH_PATH; @@ -420,8 +421,13 @@ bool ggml_backend_qnn_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_ } bool ggml_backend_qnn_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor *op) { +#ifdef 
NDEBUG + GGML_UNUSED(dev); + GGML_UNUSED(op); +#else auto *device_ctx = get_device_context(dev); QNN_LOG_DEBUG("[%s][%s]offload op", qnn::get_backend_name(device_ctx->device), ggml_op_name(op->op)); +#endif return false; } @@ -509,6 +515,8 @@ const ggml_backend_reg_i ggml_backend_qnn_reg_interface = { } // namespace +bool ggml_backend_is_qnn(ggml_backend_t backend) { return ggml_guid_matches(backend->guid, ggml_backend_qnn_guid()); } + ggml_backend_reg_t ggml_backend_qnn_reg() { static ggml_backend_qnn_reg_impl reg{ggml_backend_qnn_reg_interface}; return ® diff --git a/ggml/src/ggml-qnn/graph.cpp b/ggml/src/ggml-qnn/graph.cpp new file mode 100644 index 0000000000000..0210e1554a8ab --- /dev/null +++ b/ggml/src/ggml-qnn/graph.cpp @@ -0,0 +1,386 @@ + +#include "graph.hpp" + +#include +#include + +#include "ggml-impl.h" + +#include "logger.hpp" +#include "op-config.hpp" +#include "tensor.hpp" + +namespace { +using qnn_tensor_cache_t = std::unordered_map; + +int get_op_max_rank(const ggml_tensor *op) { + int max_rank = ggml_n_dims(op); + const int count = (int)qnn::get_qnn_op_input_param_count(qnn::get_qnn_op_index(op)); + for (int i = 0; i < count; ++i) { + max_rank = std::max(max_rank, ggml_n_dims(op->src[i])); + } + + return max_rank; +} + +qnn::qnn_tensor_ptr_t create_tensor_with_cache(ggml_tensor *tensor, qnn::ggml_qnn_tensor::tensor_type_t type, int rank, + QNNBackend device, Qnn_GraphHandle_t graph_handle, + std::shared_ptr qnn_instance, + qnn_tensor_cache_t &tensor_cache) { + GGML_ASSERT(tensor); + if (tensor_cache.count(tensor)) { + return tensor_cache[tensor]; + } + + auto qnn_tensor = std::make_shared(type, tensor->name, tensor->ne, tensor->type, rank, device, + graph_handle, qnn_instance); + tensor_cache[tensor] = qnn_tensor; + return qnn_tensor; +} + +qnn::qnn_tensor_array_t create_tensors_with_cache(const qnn::ggml_tensor_array_t &ggml_tensors, + qnn::ggml_qnn_tensor::tensor_type_t type, int rank, QNNBackend device, + Qnn_GraphHandle_t graph_handle, + std::shared_ptr qnn_instance, + qnn_tensor_cache_t &tensor_cache) { + qnn::qnn_tensor_array_t tensors; + for (auto *tensor : ggml_tensors) { + tensors.push_back( + create_tensor_with_cache(tensor, type, rank, device, graph_handle, qnn_instance, tensor_cache)); + } + + return tensors; +} + +qnn::qnn_op_config_ptr_t create_operation_from_op_tensor(ggml_tensor *dst, const std::string &name, int rank, + QNNBackend device, Qnn_GraphHandle_t graph_handle, + std::shared_ptr qnn_instance, + bool is_intermediate, qnn_tensor_cache_t &tensor_cache) { + const auto op_index = qnn::get_qnn_op_index(dst); + auto qnn_op = qnn::create_op_constructor(op_index); + auto operation = qnn_op(name, qnn_instance); + + // input tensors + qnn::qnn_tensor_array_t input_qnn_tensors; + auto tensor_type = is_intermediate ? qnn::ggml_qnn_tensor::INTERMEDIATE : qnn::ggml_qnn_tensor::INPUT; + for (size_t i = 0; i < qnn::get_qnn_op_input_param_count(op_index); ++i) { + auto input_qnn_tensor = + create_tensor_with_cache(dst->src[i], tensor_type, rank, device, graph_handle, qnn_instance, tensor_cache); + input_qnn_tensors.push_back(input_qnn_tensor); + } + operation->set_input_tensors(input_qnn_tensors); + + // output tensor + tensor_type = is_intermediate ? 
qnn::ggml_qnn_tensor::INTERMEDIATE : qnn::ggml_qnn_tensor::OUTPUT; + qnn::qnn_tensor_array_t output_qnn_tensors = + create_tensors_with_cache({dst}, tensor_type, rank, device, graph_handle, qnn_instance, tensor_cache); + operation->set_output_tensors(output_qnn_tensors); + + // initialize operation + if (!operation->initialize_op_nodes(device, graph_handle)) { + QNN_LOG_ERROR("[%s][%s]initialize_op_nodes failed", qnn::get_backend_name(device), name.c_str()); + return nullptr; + } + + return operation; +} + +bool bind_src_tensors(ggml_tensor *op, qnn::qnn_tensor_array_t &tensor_wrappers, + std::vector &qnn_tensors) { + if (op->op == GGML_OP_NONE) { + QNN_LOG_DEBUG("op %s is not a valid op", ggml_get_name(op)); + return false; + } + + const auto param_count = qnn::get_qnn_op_input_param_count(qnn::get_qnn_op_index(op)); + GGML_ASSERT(tensor_wrappers.size() == param_count); + qnn_tensors.resize(param_count); + for (size_t i = 0; i < param_count; ++i) { + auto *ggml_tensor = op->src[i]; + if (!tensor_wrappers[i]->bind_ggml_tensor(ggml_tensor)) { + QNN_LOG_ERROR("bind tensor %s failed", ggml_get_name(ggml_tensor)); + return false; + } + + qnn_tensors[i] = tensor_wrappers[i]->get_qnn_tensor(); + } + + return true; +} + +int get_io_tensors_from_graph(const ggml_cgraph *cgraph, qnn::ggml_tensor_array_t &inputs, + qnn::ggml_tensor_array_t &outputs) { + using ggml_tensor_set_t = std::set; + + ggml_tensor_set_t input_set; + ggml_tensor_set_t output_set; + ggml_tensor_set_t visited_set; + int rank = 0; + for (int i = 0; i < cgraph->n_nodes; i++) { + ggml_tensor *dst = cgraph->nodes[i]; + if (ggml_is_empty(dst)) { + continue; + } + + if (dst->op == GGML_OP_NONE || dst->op == GGML_OP_VIEW) { + // TODO: remove GGML_OP_VIEW after view op is supported + continue; + } + + rank = std::max(rank, ggml_n_dims(dst)); + input_set.erase(dst); + if (!visited_set.count(dst)) { + output_set.insert(dst); + visited_set.insert(dst); + } + + for (size_t i = 0; i < GGML_MAX_DIMS && dst->src[i]; ++i) { + auto *src = dst->src[i]; + rank = std::max(rank, ggml_n_dims(src)); + output_set.erase(src); + if (!visited_set.count(src)) { + input_set.insert(src); + visited_set.insert(src); + } + } + } + + inputs.assign(input_set.begin(), input_set.end()); + outputs.assign(output_set.begin(), output_set.end()); + return rank; +} + +} // namespace + +namespace qnn { + +qnn_graph::qnn_graph(const std::string &graph_name, QNNBackend device, std::shared_ptr qnn_instance, + size_t vtcm_size_in_mb) + : _graph_name(graph_name), _device(device), _qnn_instance(qnn_instance) { + QNN_LOG_DEBUG("[%s][%s]created", get_backend_name(device), graph_name.c_str()); + + auto qnn_interface = qnn_instance->get_qnn_interface(); + auto qnn_context = qnn_instance->get_qnn_context_handle(); + Qnn_ErrorHandle_t error = QNN_SUCCESS; + Qnn_GraphHandle_t graph_handle = nullptr; + if (device == QNN_BACKEND_NPU) { + // TODO: fix graph config here for NPU + QnnHtpGraph_CustomConfig_t hvx_config; + hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS; + hvx_config.numHvxThreads = 8; + QnnGraph_Config_t graph_hvx_config; + graph_hvx_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_hvx_config.customConfig = &hvx_config; + + QnnHtpGraph_CustomConfig_t dlbc_config; + dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; + dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC; + dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC + QnnGraph_Config_t graph_dlbc_config; + 
graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_dlbc_config.customConfig = &dlbc_config; + + QnnHtpGraph_CustomConfig_t opt_config; + opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; + opt_config.optimizationOption.floatValue = 1; // 1 / 3 + QnnGraph_Config_t graph_opt_config; + graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_opt_config.customConfig = &opt_config; + + QnnHtpGraph_CustomConfig_t vtcm_config; + vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE; + vtcm_config.vtcmSizeInMB = vtcm_size_in_mb; + QnnGraph_Config_t graph_vtcm_config; + graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_vtcm_config.customConfig = &vtcm_config; + + const QnnGraph_Config_t *graph_configs[] = {&graph_hvx_config, &graph_dlbc_config, &graph_vtcm_config, + &graph_opt_config, nullptr}; + error = qnn_interface->qnn_graph_create(qnn_context, graph_name.c_str(), graph_configs, &graph_handle); + } else { + error = qnn_interface->qnn_graph_create(qnn_context, graph_name.c_str(), nullptr, &graph_handle); + } + + if (error != QNN_SUCCESS) { + QNN_LOG_ERROR("[%s][%s]failed to create qnn graph, error: %s", get_backend_name(device), graph_name.c_str(), + get_qnn_error_string(error)); + return; + } + + QNN_LOG_INFO("[%s][%s]create succeed", get_backend_name(device), graph_name.c_str()); + _graph_handle = graph_handle; + _qnn_interface = qnn_interface; +} + +qnn_graph::~qnn_graph() { QNN_LOG_DEBUG("[%s][%s]destroy", get_backend_name(_device), _graph_name.c_str()); } + +bool qnn_graph::build_graph_from_op(ggml_tensor *op) { + if (!is_valid()) { + QNN_LOG_ERROR("Invalid graph"); + return false; + } + + QNN_LOG_DEBUG("[%s][%s]build start", get_backend_name(_device), _graph_name.c_str()); + qnn_tensor_cache_t tensor_cache; + const auto rank = get_op_max_rank(op); + auto operation = create_operation_from_op_tensor(op, _graph_name, rank, _device, _graph_handle, _qnn_instance, + false, tensor_cache); + if (!operation) { + QNN_LOG_ERROR("[%s][%s]create_operation_from_op_tensor failed", get_backend_name(_device), _graph_name.c_str()); + return false; + } + + _tensor_inputs = operation->get_input_tensors(); + _tensor_outputs = operation->get_output_tensors(); + _operations.push_back(std::move(operation)); + if (!finalize()) { + return false; + } + + QNN_LOG_DEBUG("[%s][%s]build succeed", get_backend_name(_device), _graph_name.c_str()); + return true; +} + +bool qnn_graph::build_graph_from_ggml_graph(const ggml_cgraph *cgraph) { + QNN_LOG_DEBUG("[%s][%s]build start", get_backend_name(_device), _graph_name.c_str()); + + ggml_tensor_array_t inputs; + ggml_tensor_array_t outputs; + int rank = get_io_tensors_from_graph(cgraph, inputs, outputs); + QNN_LOG_DEBUG("[%s]rank: %d, input_set: %d, output_set: %d", get_backend_name(_device), rank, int(inputs.size()), + int(outputs.size())); + + { + qnn_tensor_cache_t tensor_cache; + auto input_tensors = create_tensors_with_cache(inputs, ggml_qnn_tensor::INPUT, rank, _device, _graph_handle, + _qnn_instance, tensor_cache); + auto output_tensors = create_tensors_with_cache(outputs, ggml_qnn_tensor::OUTPUT, rank, _device, _graph_handle, + _qnn_instance, tensor_cache); + qnn_op_config_array_t operations; + for (int i = 0; i < cgraph->n_nodes; i++) { + ggml_tensor *dst = cgraph->nodes[i]; + if (ggml_is_empty(dst)) { + continue; + } + + if (dst->op == GGML_OP_NONE || dst->op == GGML_OP_VIEW) { + // TODO: remove GGML_OP_VIEW after view op is supported + continue; + } + + 
QNN_LOG_DEBUG("[%s]create op: %s", get_backend_name(_device), get_qnn_op_name(dst->op)); + auto operation = create_operation_from_op_tensor(dst, dst->name, rank, _device, _graph_handle, + _qnn_instance, true, tensor_cache); // TODO: fix op name + operations.push_back(operation); + } + + _tensor_inputs = std::move(input_tensors); + _tensor_outputs = std::move(output_tensors); + _operations = std::move(operations); + if (!finalize()) { + return false; + } + } + + QNN_LOG_DEBUG("[%s][%s]build succeed", get_backend_name(_device), _graph_name.c_str()); + return true; +} + +bool qnn_graph::execute(ggml_tensor *op) { + if (!bind_src_tensors(op, _tensor_inputs, _qnn_tensor_inputs)) { + QNN_LOG_ERROR("[%s][%s]bind input tensors failed", get_backend_name(_device), _graph_name.c_str()); + return false; + } + + if (!qnn::bind_tensors({op}, _tensor_outputs, _qnn_tensor_outputs)) { + QNN_LOG_ERROR("[%s][%s]bind output tensors failed", get_backend_name(_device), _graph_name.c_str()); + return false; + } + + auto &qnn_tensor_inputs = _qnn_tensor_inputs; + auto &qnn_tensor_outputs = _qnn_tensor_outputs; + auto error = + _qnn_interface->qnn_graph_execute(_graph_handle, qnn_tensor_inputs.data(), qnn_tensor_inputs.size(), + qnn_tensor_outputs.data(), qnn_tensor_outputs.size(), nullptr, nullptr); + unbind_tensors(_tensor_inputs); + unbind_tensors(_tensor_outputs); + + if (error != QNN_SUCCESS) { + if (_device == QNN_BACKEND_NPU && error == QNN_COMMON_ERROR_SYSTEM_COMMUNICATION) { + QNN_LOG_WARN("[%s][%s]NPU crashed. SSR detected. Caused QNN graph execute error.", + get_backend_name(_device), _graph_name.c_str()); + } else { + QNN_LOG_ERROR("[%s][%s]error: %s", get_backend_name(_device), _graph_name.c_str(), + get_qnn_error_string(error)); + } + return false; + } + + QNN_LOG_DEBUG("[%s][%s]execute succeed", get_backend_name(_device), _graph_name.c_str()); + return true; +} + +bool qnn_graph::execute(const ggml_cgraph *cgraph) { + ggml_tensor_array_t inputs; + ggml_tensor_array_t outputs; +#ifdef NDEBUG + get_io_tensors_from_graph(cgraph, inputs, outputs); +#else + int rank = get_io_tensors_from_graph(cgraph, inputs, outputs); + QNN_LOG_DEBUG("[%s]rank: %d, input_set: %d, output_set: %d", get_backend_name(_device), rank, int(inputs.size()), + int(outputs.size())); +#endif + + { + if (!qnn::bind_tensors(inputs, _tensor_inputs, _qnn_tensor_inputs)) { + QNN_LOG_ERROR("[%s][%s]bind input tensors failed", get_backend_name(_device), _graph_name.c_str()); + return false; + } + + if (!qnn::bind_tensors(outputs, _tensor_outputs, _qnn_tensor_outputs)) { + QNN_LOG_ERROR("[%s][%s]bind output tensors failed", get_backend_name(_device), _graph_name.c_str()); + return false; + } + + auto &qnn_tensor_inputs = _qnn_tensor_inputs; + auto &qnn_tensor_outputs = _qnn_tensor_outputs; + auto error = + _qnn_interface->qnn_graph_execute(_graph_handle, qnn_tensor_inputs.data(), qnn_tensor_inputs.size(), + qnn_tensor_outputs.data(), qnn_tensor_outputs.size(), nullptr, nullptr); + unbind_tensors(_tensor_inputs); + unbind_tensors(_tensor_outputs); + + if (error != QNN_SUCCESS) { + if (_device == QNN_BACKEND_NPU && error == QNN_COMMON_ERROR_SYSTEM_COMMUNICATION) { + QNN_LOG_WARN("[%s][%s]NPU crashed. SSR detected. 
Caused QNN graph execute error.", + get_backend_name(_device), _graph_name.c_str()); + } else { + QNN_LOG_ERROR("[%s][%s]error: %s", get_backend_name(_device), _graph_name.c_str(), + get_qnn_error_string(error)); + } + return false; + } + + QNN_LOG_DEBUG("[%s][%s]execute succeed", get_backend_name(_device), _graph_name.c_str()); + return true; + } +} + +bool qnn_graph::finalize() { + if (!qnn::add_op_to_graph(_graph_handle, _operations)) { + QNN_LOG_ERROR("[%s]add nodes failed", _graph_name.c_str()); + return false; + } + + auto error = _qnn_interface->qnn_graph_finalize(_graph_handle, nullptr, nullptr); + if (error != QNN_SUCCESS) { + QNN_LOG_ERROR("[%s][%s]qnn_graph_finalize.error: %s", get_backend_name(_device), _graph_name.c_str(), + get_qnn_error_string(error)); + return false; + } + + QNN_LOG_DEBUG("[%s][%s]finalize succeed", get_backend_name(_device), _graph_name.c_str()); + return true; +} + +} // namespace qnn diff --git a/ggml/src/ggml-qnn/graph.hpp b/ggml/src/ggml-qnn/graph.hpp index 1806f41126f3c..521186f790ee5 100644 --- a/ggml/src/ggml-qnn/graph.hpp +++ b/ggml/src/ggml-qnn/graph.hpp @@ -1,164 +1,53 @@ #pragma once -#include #include #include #include #include "ggml-qnn.h" -#include "logger.hpp" #include "op-config.hpp" #include "qnn-lib.hpp" namespace qnn { -class ggml_qnn_graph { +class qnn_graph { public: - explicit ggml_qnn_graph(const std::string &graph_name, QNNBackend device, - std::shared_ptr qnn_instance, size_t vtcm_size_in_mb) - : _graph_name(graph_name), _device(device), _qnn_instance(qnn_instance) { - QNN_LOG_DEBUG("[%s][%s]created", get_backend_name(device), graph_name.c_str()); + explicit qnn_graph(const std::string &graph_name, QNNBackend device, std::shared_ptr qnn_instance, + size_t vtcm_size_in_mb); + ~qnn_graph(); - auto qnn_interface = qnn_instance->get_qnn_interface(); - auto qnn_context = qnn_instance->get_qnn_context_handle(); - Qnn_ErrorHandle_t error = QNN_SUCCESS; - Qnn_GraphHandle_t graph_handle = nullptr; - if (device == QNN_BACKEND_NPU) { - // TODO: fix graph config here for NPU - QnnHtpGraph_CustomConfig_t hvx_config; - hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS; - hvx_config.numHvxThreads = 8; - QnnGraph_Config_t graph_hvx_config; - graph_hvx_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_hvx_config.customConfig = &hvx_config; - - QnnHtpGraph_CustomConfig_t dlbc_config; - dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; - dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC; - dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC - QnnGraph_Config_t graph_dlbc_config; - graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_dlbc_config.customConfig = &dlbc_config; - - QnnHtpGraph_CustomConfig_t opt_config; - opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; - opt_config.optimizationOption.floatValue = 1; // 1 / 3 - QnnGraph_Config_t graph_opt_config; - graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_opt_config.customConfig = &opt_config; - - QnnHtpGraph_CustomConfig_t vtcm_config; - vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE; - vtcm_config.vtcmSizeInMB = vtcm_size_in_mb; - QnnGraph_Config_t graph_vtcm_config; - graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_vtcm_config.customConfig = &vtcm_config; - - const QnnGraph_Config_t *graph_configs[] = {&graph_hvx_config, &graph_dlbc_config, &graph_vtcm_config, - &graph_opt_config, nullptr}; - 
error = qnn_interface->qnn_graph_create(qnn_context, graph_name.c_str(), graph_configs, &graph_handle); - } else { - error = qnn_interface->qnn_graph_create(qnn_context, graph_name.c_str(), nullptr, &graph_handle); - } - - if (error != QNN_SUCCESS) { - QNN_LOG_ERROR("[%s][%s]failed to create qnn graph, error: %s", get_backend_name(device), graph_name.c_str(), - get_qnn_error_string(error)); - return; - } - - QNN_LOG_INFO("[%s][%s]create succeed", get_backend_name(device), graph_name.c_str()); - _graph_handle = graph_handle; - _qnn_interface = qnn_interface; - } - - ~ggml_qnn_graph() { QNN_LOG_DEBUG("[%s][%s]destroy", get_backend_name(_device), _graph_name.c_str()); } - - bool build_graph(ggml_op_constructor_t op_constructor, const ggml_tensor_array_t &tensor_inputs, - const ggml_tensor_array_t &tensor_outputs) { - GGML_ASSERT(op_constructor); - if (!is_valid()) { - QNN_LOG_ERROR("Invalid graph"); - return false; - } - - QNN_LOG_DEBUG("[%s][%s]build_graph start", get_backend_name(_device), _graph_name.c_str()); - _op_config = op_constructor(_graph_name, _qnn_instance); - if (!_op_config->initialize_op_nodes(_device, _graph_handle, tensor_inputs, tensor_outputs)) { - QNN_LOG_ERROR("[%s][%s]initialize_op_nodes failed", get_backend_name(_device), _graph_name.c_str()); - return false; - } - - if (!_op_config->add_op_to_graph(_graph_handle)) { - QNN_LOG_ERROR("[%s]add nodes failed", _graph_name.c_str()); - return false; - } - - auto error = _qnn_interface->qnn_graph_finalize(_graph_handle, nullptr, nullptr); - if (error != QNN_SUCCESS) { - QNN_LOG_ERROR("[%s][%s]qnn_graph_finalize.error: %s", get_backend_name(_device), _graph_name.c_str(), - get_qnn_error_string(error)); - return false; - } - - QNN_LOG_DEBUG("[%s][%s]build_graph succeed", get_backend_name(_device), _graph_name.c_str()); - return true; - } - - bool execute(const ggml_tensor_array_t &tensor_inputs, const ggml_tensor_array_t &tensor_outputs) { - if (!_op_config->bind_input_tensors(tensor_inputs)) { - QNN_LOG_ERROR("[%s][%s]bind input tensors failed", get_backend_name(_device), _graph_name.c_str()); - return false; - } - - if (!_op_config->bind_output_tensors(tensor_outputs)) { - QNN_LOG_ERROR("[%s][%s]bind output tensors failed", get_backend_name(_device), _graph_name.c_str()); - return false; - } - - auto &qnn_tensor_inputs = _op_config->get_qnn_input_tensors(); - auto &qnn_tensor_outputs = _op_config->get_qnn_output_tensors(); - - auto error = - _qnn_interface->qnn_graph_execute(_graph_handle, qnn_tensor_inputs.data(), qnn_tensor_inputs.size(), - qnn_tensor_outputs.data(), qnn_tensor_outputs.size(), nullptr, nullptr); - _op_config->unbind_input_tensors(); - _op_config->unbind_output_tensors(); - - if (error != QNN_SUCCESS) { - if (_device == QNN_BACKEND_NPU && error == QNN_COMMON_ERROR_SYSTEM_COMMUNICATION) { - QNN_LOG_WARN("[%s][%s]NPU crashed. SSR detected. 
Caused QNN graph execute error.", - get_backend_name(_device), _graph_name.c_str()); - } else { - QNN_LOG_ERROR("[%s][%s]error: %s", get_backend_name(_device), _graph_name.c_str(), - get_qnn_error_string(error)); - } - return false; - } - - QNN_LOG_DEBUG("[%s][%s]execute succeed", get_backend_name(_device), _graph_name.c_str()); - return true; - } + bool build_graph_from_op(ggml_tensor *op); + bool build_graph_from_ggml_graph(const ggml_cgraph *cgraph); + bool execute(ggml_tensor *op); + bool execute(const ggml_cgraph *cgraph); bool is_valid() const { return _graph_handle != nullptr; } - Qnn_GraphHandle_t get_graph_handler() const { return _graph_handle; } - + std::shared_ptr get_qnn_instance() { return _qnn_instance; } const std::string &get_name() const { return _graph_name; } + QNNBackend get_device() const { return _device; } private: + bool finalize(); + const std::string _graph_name; const QNNBackend _device; Qnn_GraphHandle_t _graph_handle = nullptr; std::shared_ptr _qnn_instance; std::shared_ptr _qnn_interface; - std::unique_ptr _op_config; - std::vector _param_types; + qnn_op_config_array_t _operations; - DISABLE_COPY(ggml_qnn_graph); - DISABLE_MOVE(ggml_qnn_graph); + qnn_tensor_array_t _tensor_inputs; + qnn_tensor_array_t _tensor_outputs; + std::vector _qnn_tensor_inputs; + std::vector _qnn_tensor_outputs; + + DISABLE_COPY(qnn_graph); + DISABLE_MOVE(qnn_graph); }; +using qnn_graph_ptr_t = std::shared_ptr; + } // namespace qnn diff --git a/ggml/src/ggml-qnn/op-config-base.hpp b/ggml/src/ggml-qnn/op-config-base.hpp index 159944a7d7f60..274bb8318ff99 100644 --- a/ggml/src/ggml-qnn/op-config-base.hpp +++ b/ggml/src/ggml-qnn/op-config-base.hpp @@ -10,8 +10,6 @@ namespace qnn { -using ggml_tensor_array_t = std::vector; - /** * @class ggml_qnn_op_config * @brief Abstract base class for configuring QNN operations. @@ -23,6 +21,34 @@ class ggml_qnn_op_config { public: virtual ~ggml_qnn_op_config() {} + /** + * @brief Sets custom input tensors for the operation. This method should be called before `initialize_op_nodes`. + * If no custom input tensors are provided, the input tensors will be automatically created from the input ggml + * tensors. + * + * This pure virtual function must be overridden by derived classes to set + * the input tensors for the operation. The function takes a reference to a + * vector of qnn_tensor_ptr_t objects, which represent the input tensors. + * + * @param tensor_inputs A reference to a vector of qnn_tensor_ptr_t objects representing the input tensors. + */ + virtual void set_input_tensors(qnn::qnn_tensor_array_t &tensor_inputs) = 0; + virtual void set_input_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) = 0; + + /** + * @brief Sets custom output tensors for the operation. This method should be called before `initialize_op_nodes`. + * If no custom output tensors are provided, the output tensors will be automatically created from the output ggml + * tensors. + * + * This pure virtual function must be overridden by derived classes to set + * the output tensors for the operation. The function takes a reference to a + * vector of qnn_tensor_ptr_t objects, which represent the output tensors. + * + * @param tensor_outputs A reference to a vector of qnn_tensor_ptr_t objects representing the output tensors. + */ + virtual void set_output_tensors(qnn::qnn_tensor_array_t &tensor_outputs) = 0; + virtual void set_output_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) = 0; + /** * @brief Creates tensors and internal nodes for constructing the calculation graph. 
* @@ -31,36 +57,32 @@ class ggml_qnn_op_config { * the internal nodes necessary for constructing the calculation graph. It takes * input and output tensor arrays as parameters. * - * @param device The backend device where tensors will be created. - * @param graph_handle The handle to the graph where tensors and nodes will be associated. - * @param tensor_inputs An array of input tensors. - * @param tensor_outputs An array of output tensors. + * @param device + * @param graph_handle * @return true if tensors and nodes are successfully created, false otherwise. */ - virtual bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, - const ggml_tensor_array_t &tensor_inputs, - const ggml_tensor_array_t &tensor_outputs) = 0; + virtual bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle) = 0; /** - * @brief Pure virtual function to retrieve the input tensors for QNN (Quantized Neural Network). + * @brief Pure virtual function to retrieve the input tensors. * * This function must be overridden by derived classes to provide the specific implementation * for retrieving the input tensors used in QNN operations. * - * @return A reference to a vector of Qnn_Tensor_t objects representing the input tensors. + * @return A reference to a vector of qnn_tensor_ptr_t objects representing the input tensors. */ - virtual std::vector &get_qnn_input_tensors() = 0; + virtual const qnn_tensor_array_t &get_input_tensors() = 0; /** - * @brief Pure virtual function to retrieve the output tensors of a QNN (Quantized Neural Network). + * @brief Pure virtual function to retrieve the output tensors of a QNN. * * This function must be overridden by any derived class to provide access to the * output tensors of the QNN. The function returns a reference to a vector of - * Qnn_Tensor_t objects, which represent the output tensors. + * qnn_tensor_ptr_t objects, which represent the output tensors. * - * @return std::vector& Reference to a vector of Qnn_Tensor_t objects. + * @return A reference to a vector of qnn_tensor_ptr_t objects representing the output tensors. */ - virtual std::vector &get_qnn_output_tensors() = 0; + virtual const qnn_tensor_array_t &get_output_tensors() = 0; /** * @brief Adds an operation to the given graph. 
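
The reworked ggml_qnn_op_config interface above is driven by the graph builder in graph.cpp (see create_operation_from_op_tensor): tensors are attached first, then the op nodes are created and added to the QNN graph. A minimal usage sketch follows; it is not part of the patch, and the surrounding variables (dst, name, qnn_instance, device, graph_handle and the tensor arrays) are assumed to come from the caller's context with illustrative names only.

// Hedged sketch of the expected call order for ggml_qnn_op_config (illustrative, not from the patch):
const auto op_index  = qnn::get_qnn_op_index(dst);              // ggml op (or unary op) -> caps-table index
auto       operation = qnn::create_op_constructor(op_index)(name, qnn_instance);
operation->set_input_tensors(input_qnn_tensors);                // must be set before initialize_op_nodes()
operation->set_output_tensors(output_qnn_tensors);
if (!operation->initialize_op_nodes(device, graph_handle)) {    // creates the internal tensors/nodes
    /* handle the error */
}
operation->add_op_to_graph(graph_handle);                       // emit the QNN op node(s) into the graph
// per execution: bind the live ggml buffers, run the graph, then unbind
operation->bind_input_tensors(ggml_inputs);
operation->bind_output_tensors(ggml_outputs);
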
@@ -125,5 +147,6 @@ class ggml_qnn_op_config { }; using qnn_op_config_ptr_t = std::shared_ptr; +using qnn_op_config_array_t = std::vector; } // namespace qnn diff --git a/ggml/src/ggml-qnn/op-config-caps.cpp b/ggml/src/ggml-qnn/op-config-caps.cpp new file mode 100644 index 0000000000000..aab8f65958bf1 --- /dev/null +++ b/ggml/src/ggml-qnn/op-config-caps.cpp @@ -0,0 +1,223 @@ + +#include "op-config.hpp" + +namespace { + +using op_dims_calc_func_t = void (*)(const std::vector &input_dims, + qnn::ggml_dimension_array_t &output_dims); + +void element_wise_op_dims(const std::vector &input_dims, + qnn::ggml_dimension_array_t &output_dims) { + for (size_t i = 1; i < std::size(output_dims); i++) { + output_dims[i] = input_dims.front()[i]; + } +} + +void mat_mul_op_dims(const std::vector &input_dims, + qnn::ggml_dimension_array_t &output_dims) { + GGML_ASSERT(input_dims.size() == 2); + output_dims[0] = input_dims.front()[1]; + output_dims[1] = input_dims.back()[1]; +} + +struct qnn_op_caps_t { + const char *qnn_op_name = nullptr; + const size_t input_param_count = 0; + op_dims_calc_func_t calc_dims_func = nullptr; +}; + +constexpr const qnn_op_caps_t kOpCaps[] = { + {}, // GGML_OP_NONE + {}, // GGML_OP_DUP + { + // GGML_OP_ADD + QNN_OP_ELEMENT_WISE_ADD, // qnn_op_name + 2, // input_param_count + element_wise_op_dims, // calc_dims_func + }, + {}, // GGML_OP_ADD1 + {}, // GGML_OP_ACC + { + // GGML_OP_SUB + QNN_OP_ELEMENT_WISE_SUBTRACT, // qnn_op_name + 2, // input_param_count + element_wise_op_dims, // calc_dims_func + }, + { + // GGML_OP_MUL + QNN_OP_ELEMENT_WISE_MULTIPLY, // qnn_op_name + 2, // input_param_count + element_wise_op_dims, // calc_dims_func + }, + { + // GGML_OP_DIV + QNN_OP_ELEMENT_WISE_DIVIDE, // qnn_op_name + 2, // input_param_count + element_wise_op_dims, // calc_dims_func + }, + {}, // GGML_OP_SQR + { + // GGML_OP_SQRT + QNN_OP_ELEMENT_WISE_SQUARE_ROOT, // qnn_op_name + 1, // input_param_count + element_wise_op_dims, // calc_dims_func + }, + { + // GGML_OP_LOG + QNN_OP_ELEMENT_WISE_LOG, // qnn_op_name + 1, // input_param_count + element_wise_op_dims, // calc_dims_func + }, + {}, // GGML_OP_SIN + {}, // GGML_OP_COS + {}, // GGML_OP_SUM + {}, // GGML_OP_SUM_ROWS + {}, // GGML_OP_MEAN + {}, // GGML_OP_ARGMAX + {}, // GGML_OP_COUNT_EQUAL + {}, // GGML_OP_REPEAT + {}, // GGML_OP_REPEAT_BACK + {}, // GGML_OP_CONCAT + {}, // GGML_OP_SILU_BACK + {}, // GGML_OP_NORM + {}, // GGML_OP_RMS_NORM + {}, // GGML_OP_RMS_NORM_BACK + {}, // GGML_OP_GROUP_NORM + { + // GGML_OP_MUL_MAT + QNN_OP_MAT_MUL, // qnn_op_name + 2, // input_param_count + mat_mul_op_dims, // calc_dims_func + }, + {}, // GGML_OP_MUL_MAT_ID + {}, // GGML_OP_OUT_PROD + {}, // GGML_OP_SCALE + {}, // GGML_OP_SET + {}, // GGML_OP_CPY + {}, // GGML_OP_CONT + { + // GGML_OP_RESHAPE + QNN_OP_RESHAPE, // qnn_op_name + 1, // input_param_count + nullptr, // TODO: calc_dims_func + }, + {}, // GGML_OP_VIEW + {}, // GGML_OP_PERMUTE + {}, // GGML_OP_TRANSPOSE + {}, // GGML_OP_GET_ROWS + {}, // GGML_OP_GET_ROWS_BACK + {}, // GGML_OP_DIAG + {}, // GGML_OP_DIAG_MASK_INF + {}, // GGML_OP_DIAG_MASK_ZERO + {}, // GGML_OP_SOFT_MAX + {}, // GGML_OP_SOFT_MAX_BACK + {}, // GGML_OP_ROPE + {}, // GGML_OP_ROPE_BACK + {}, // GGML_OP_CLAMP + {}, // GGML_OP_CONV_TRANSPOSE_1D + {}, // GGML_OP_IM2COL + {}, // GGML_OP_IM2COL_BACK + {}, // GGML_OP_CONV_TRANSPOSE_2D + {}, // GGML_OP_POOL_1D + {}, // GGML_OP_POOL_2D + {}, // GGML_OP_POOL_2D_BACK + {}, // GGML_OP_UPSCALE + {}, // GGML_OP_PAD + {}, // GGML_OP_PAD_REFLECT_1D + {}, // GGML_OP_ARANGE + + {}, // 
GGML_OP_TIMESTEP_EMBEDDING + {}, // GGML_OP_ARGSORT + {}, // GGML_OP_LEAKY_RELU + + {}, // GGML_OP_FLASH_ATTN_EXT + {}, // GGML_OP_FLASH_ATTN_BACK + {}, // GGML_OP_SSM_CONV + {}, // GGML_OP_SSM_SCAN + {}, // GGML_OP_WIN_PART + {}, // GGML_OP_WIN_UNPART + {}, // GGML_OP_GET_REL_POS + {}, // GGML_OP_ADD_REL_POS + {}, // GGML_OP_RWKV_WKV6 + + {}, // GGML_OP_UNARY + + {}, // GGML_OP_MAP_UNARY + {}, // GGML_OP_MAP_BINARY + + {}, // GGML_OP_MAP_CUSTOM1_F32 + {}, // GGML_OP_MAP_CUSTOM2_F32 + {}, // GGML_OP_MAP_CUSTOM3_F32 + + {}, // GGML_OP_MAP_CUSTOM1 + {}, // GGML_OP_MAP_CUSTOM2 + {}, // GGML_OP_MAP_CUSTOM3 + + {}, // GGML_OP_CROSS_ENTROPY_LOSS + {}, // GGML_OP_CROSS_ENTROPY_LOSS_BACK + {}, // GGML_OP_OPT_STEP_ADAMW + + // ggml_unary_op + {}, // GGML_UNARY_OP_ABS + {}, // GGML_UNARY_OP_SGN + {}, // GGML_UNARY_OP_NEG + {}, // GGML_UNARY_OP_STEP + {}, // GGML_UNARY_OP_TANH + {}, // GGML_UNARY_OP_ELU + {}, // GGML_UNARY_OP_RELU + {}, // GGML_UNARY_OP_SIGMOID + { + // GGML_UNARY_OP_GELU + QNN_OP_GELU, // qnn_op_name + 1, // input_param_count + nullptr, // TODO: calc_dims_func + }, + {}, // GGML_UNARY_OP_GELU_QUICK + {}, // GGML_UNARY_OP_SILU + {}, // GGML_UNARY_OP_HARDSWISH + {}, // GGML_UNARY_OP_HARDSIGMOID + {}, // GGML_UNARY_OP_EXP +}; + +static_assert(kOpCaps[GGML_OP_NONE].calc_dims_func == nullptr, "GGML_OP_NONE should not have calc_dims_func function"); +static_assert(kOpCaps[GGML_OP_ADD].calc_dims_func == element_wise_op_dims, + "GGML_OP_ADD does not have element_wise_op_dims function"); +static_assert(kOpCaps[GGML_OP_MUL_MAT].calc_dims_func == mat_mul_op_dims, + "GGML_OP_ADD does not have element_wise_op_dims function"); +static_assert(kOpCaps[GGML_OP_LOG].calc_dims_func == element_wise_op_dims, + "GGML_OP_LOG does not have element_wise_op_dims function"); +static_assert(std::size(kOpCaps) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT), + "GGML_OP_COUNT does not match the size of the kOpCaps table"); + +} // namespace + +namespace qnn { + +size_t get_qnn_op_index(const ggml_tensor *tensor) { + if (tensor->op == GGML_OP_UNARY) { + return kGgmlUnaryOpStart + ggml_get_unary_op(tensor); + } + + return tensor->op; +} + +void get_ggml_op_output_dimensions(const std::vector &input_dims, size_t op, + ggml_dimension_array_t &output_dims) { + GGML_ASSERT(op < std::size(kOpCaps)); + auto get_dims = kOpCaps[op].calc_dims_func; + GGML_ASSERT(get_dims); + get_dims(input_dims, output_dims); +} + +const char *get_qnn_op_name(size_t op) { + GGML_ASSERT(op < std::size(kOpCaps)); + GGML_ASSERT(kOpCaps[op].qnn_op_name); + return kOpCaps[op].qnn_op_name; +} + +size_t get_qnn_op_input_param_count(size_t op) { + GGML_ASSERT(op < std::size(kOpCaps)); + return kOpCaps[op].input_param_count; +} + +} // namespace qnn diff --git a/ggml/src/ggml-qnn/op-config.cpp b/ggml/src/ggml-qnn/op-config.cpp index b3c84b5435095..7edb4078a57df 100644 --- a/ggml/src/ggml-qnn/op-config.cpp +++ b/ggml/src/ggml-qnn/op-config.cpp @@ -24,16 +24,7 @@ qnn::qnn_dimension_array_t get_transposed_dimensions(const qnn::qnn_dimension_ar } int get_rank(const qnn::ggml_tensor_array_t &tensor_inputs, const qnn::ggml_tensor_array_t &tensor_outputs) { - int tensor_rank = 0; - // get the max tensor rank - for (auto tensor : tensor_inputs) { - tensor_rank = std::max(tensor_rank, ggml_n_dims(tensor)); - } - for (auto tensor : tensor_outputs) { - tensor_rank = std::max(tensor_rank, ggml_n_dims(tensor)); - } - - return tensor_rank; + return std::max(qnn::get_ggml_tensors_max_rank(tensor_inputs), qnn::get_ggml_tensors_max_rank(tensor_outputs)); } Qnn_DataType_t 
get_tensor_type(const qnn::qnn_tensor_array_t &tensors) { @@ -49,93 +40,6 @@ Qnn_DataType_t get_tensor_type(const qnn::qnn_tensor_array_t &tensors) { return type; } -struct tensor_common_params { - const char *name_prefix; - int tensor_rank; - bool is_input; - QNNBackend device; - Qnn_GraphHandle_t graph_handle; - std::shared_ptr qnn_instance; -}; - -void create_tensors_from_ggml_tensor(const tensor_common_params ¶ms, const qnn::ggml_tensor_array_t &ggml_tensors, - qnn::qnn_tensor_array_t *tensor_wrappers, std::vector *qnn_tensors) { - using namespace qnn; - - tensor_wrappers->resize(ggml_tensors.size()); - if (qnn_tensors) { - qnn_tensors->resize(ggml_tensors.size()); - } - char buffer[GGML_MAX_NAME] = {}; - auto tensor_type = params.is_input ? ggml_qnn_tensor::INPUT : ggml_qnn_tensor::OUTPUT; - for (size_t i = 0; i < ggml_tensors.size(); i++) { - snprintf(buffer, GGML_MAX_NAME, "%s%d", params.name_prefix, (int)i); - auto *ggml_tensor = ggml_tensors[i]; - (*tensor_wrappers)[i] = std::make_shared(tensor_type, std::string(buffer), ggml_tensor->ne, - ggml_tensor->type, params.tensor_rank, params.device, - params.graph_handle, params.qnn_instance); - } -} - -bool bind_tensors(const qnn::ggml_tensor_array_t &ggml_tensors, qnn::qnn_tensor_array_t &tensor_wrappers, - std::vector &qnn_tensors) { - for (size_t i = 0; i < ggml_tensors.size(); i++) { - auto *ggml_tensor = ggml_tensors[i]; - if (!tensor_wrappers[i]->bind_ggml_tensor(ggml_tensor)) { - QNN_LOG_ERROR("bind tensor %s failed", ggml_get_name(ggml_tensor)); - return false; - } - - qnn_tensors[i] = tensor_wrappers[i]->get_qnn_tensor(); - } - - return true; -} - -class ggml_qnn_connectable_op_config : public qnn::ggml_qnn_op_config_base { -public: - explicit ggml_qnn_connectable_op_config(const std::string &name, const std::string &package_name, - const std::string &op_type, std::shared_ptr qnn_instance) - : ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {} - - bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, - const qnn::ggml_tensor_array_t &tensor_inputs, - const qnn::ggml_tensor_array_t &tensor_outputs) override { - GGML_UNUSED(device); - GGML_UNUSED(graph_handle); - GGML_UNUSED(tensor_inputs); - GGML_UNUSED(tensor_outputs); - return true; - } - - void set_input_tensors(qnn::qnn_tensor_array_t &tensor_inputs) { - _tensor_inputs = tensor_inputs; - _qnn_tensor_inputs.resize(_tensor_inputs.size()); - } - - void set_input_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) { - _tensor_inputs = std::move(tensor_inputs); - _qnn_tensor_inputs.resize(_tensor_inputs.size()); - } - - void set_output_tensors(qnn::qnn_tensor_array_t &tensor_outputs) { - _tensor_outputs = tensor_outputs; - _qnn_tensor_outputs.resize(_tensor_outputs.size()); - } - - void set_output_tensors(qnn::qnn_tensor_array_t &&tensor_outputs) { - _tensor_outputs = std::move(tensor_outputs); - _qnn_tensor_outputs.resize(_tensor_outputs.size()); - } - - qnn::qnn_tensor_array_t &get_input_tensors() { return _tensor_inputs; } - qnn::qnn_tensor_array_t &get_output_tensors() { return _tensor_outputs; } - -private: - DISABLE_COPY(ggml_qnn_connectable_op_config); - DISABLE_MOVE(ggml_qnn_connectable_op_config); -}; - } // namespace namespace qnn { @@ -161,7 +65,7 @@ bool ggml_qnn_op_config_base::add_tensor_param(const std::string &name, const qn } GGML_ASSERT(data_size > 0); - if (!param_tensor->bind_buffer(const_cast(data), data_size)) { + if (!param_tensor->set_data_buffer(data, data_size)) { QNN_LOG_ERROR("parameter tensor bind_buffer failed"); 
return false; } @@ -181,6 +85,26 @@ bool ggml_qnn_op_config_base::add_tensor_param(const std::string &name, const qn return true; } +void ggml_qnn_op_config_base::set_input_tensors(qnn::qnn_tensor_array_t &tensor_inputs) { + _tensor_inputs = tensor_inputs; + _qnn_tensor_inputs.resize(_tensor_inputs.size()); +} + +void ggml_qnn_op_config_base::set_input_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) { + _tensor_inputs = tensor_inputs; + _qnn_tensor_inputs.resize(_tensor_inputs.size()); +} + +void ggml_qnn_op_config_base::set_output_tensors(qnn::qnn_tensor_array_t &tensor_outputs) { + _tensor_outputs = std::move(tensor_outputs); + _qnn_tensor_outputs.resize(_tensor_outputs.size()); +} + +void ggml_qnn_op_config_base::set_output_tensors(qnn::qnn_tensor_array_t &&tensor_outputs) { + _tensor_outputs = std::move(tensor_outputs); + _qnn_tensor_outputs.resize(_tensor_outputs.size()); +} + bool ggml_qnn_op_config_base::add_op_to_graph(Qnn_GraphHandle_t graph_handle) { GGML_ASSERT(_qnn_tensor_inputs.size() == _tensor_inputs.size()); GGML_ASSERT(_qnn_tensor_outputs.size() == _tensor_outputs.size()); @@ -221,12 +145,12 @@ bool ggml_qnn_op_config_base::add_op_to_graph(Qnn_GraphHandle_t graph_handle) { bool ggml_qnn_op_config_base::bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) { GGML_ASSERT(tensor_inputs.size() == _tensor_inputs.size()); - return bind_tensors(tensor_inputs, _tensor_inputs, _qnn_tensor_inputs); + return qnn::bind_tensors(tensor_inputs, _tensor_inputs, _qnn_tensor_inputs); } bool ggml_qnn_op_config_base::bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) { GGML_ASSERT(tensor_outputs.size() == _tensor_outputs.size()); - return bind_tensors(tensor_outputs, _tensor_outputs, _qnn_tensor_outputs); + return qnn::bind_tensors(tensor_outputs, _tensor_outputs, _qnn_tensor_outputs); } void ggml_qnn_op_config_base::unbind_input_tensors() { @@ -257,55 +181,42 @@ Qnn_OpConfig_t ggml_qnn_op_config_base::get_op_config() { return config; } -bool ggml_qnn_single_op_config::initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, - const ggml_tensor_array_t &tensor_inputs, - const ggml_tensor_array_t &tensor_outputs) { - const auto tensor_rank = get_rank(tensor_inputs, tensor_outputs); - tensor_common_params params = {"src", tensor_rank, true, device, graph_handle, _qnn_instance}; - create_tensors_from_ggml_tensor(params, tensor_inputs, &_tensor_inputs, &_qnn_tensor_inputs); - params.name_prefix = "dst"; - params.is_input = false; - create_tensors_from_ggml_tensor(params, tensor_outputs, &_tensor_outputs, &_qnn_tensor_outputs); - - if (_param_buffer.size() > 0) { - // handle parameters in output tensor - auto *params = tensor_outputs.front()->op_params; - memcpy(_param_buffer.data(), params, _param_buffer.size()); - - const uint32_t count = uint32_t(_param_buffer.size() / qnn_datatype_size(_param_type)); - const qnn_dimension_array_t param_dims = {count, 1, 1, 1}; - add_tensor_param(_param_name, param_dims, 1, _param_buffer.data(), _param_type, device, graph_handle); - } - +bool ggml_qnn_single_op_config::initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle) { + GGML_UNUSED(device); + GGML_UNUSED(graph_handle); return true; } -bool ggml_qnn_aggregate_op_config::bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) { - return bind_tensors(tensor_inputs, _tensor_inputs, _qnn_tensor_inputs); +void ggml_qnn_aggregate_op_config::set_input_tensors(qnn::qnn_tensor_array_t &tensor_inputs) { + _tensor_inputs = tensor_inputs; } -bool 
ggml_qnn_aggregate_op_config::bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) { - return bind_tensors(tensor_outputs, _tensor_outputs, _qnn_tensor_outputs); +void ggml_qnn_aggregate_op_config::set_input_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) { + _tensor_inputs = std::move(tensor_inputs); } -bool ggml_qnn_matmul_op_config::initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, - const ggml_tensor_array_t &tensor_inputs, - const ggml_tensor_array_t &tensor_outputs) { - GGML_ASSERT(tensor_inputs.size() == 2); - GGML_ASSERT(tensor_outputs.size() == 1); - const auto tensor_rank = get_rank(tensor_inputs, tensor_outputs); - GGML_ASSERT(tensor_rank >= 2); +void ggml_qnn_aggregate_op_config::set_output_tensors(qnn::qnn_tensor_array_t &tensor_outputs) { + _tensor_outputs = tensor_outputs; +} - // create input tensors - tensor_common_params params = {"src", tensor_rank, true, device, graph_handle, _qnn_instance}; - create_tensors_from_ggml_tensor(params, tensor_inputs, &_tensor_inputs, &_qnn_tensor_inputs); +void ggml_qnn_aggregate_op_config::set_output_tensors(qnn::qnn_tensor_array_t &&tensor_outputs) { + _tensor_outputs = std::move(tensor_outputs); +} + +bool ggml_qnn_aggregate_op_config::bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) { + return qnn::bind_tensors(tensor_inputs, _tensor_inputs); +} + +bool ggml_qnn_aggregate_op_config::bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) { + return qnn::bind_tensors(tensor_outputs, _tensor_outputs); +} - // create output tensor - params.name_prefix = "dst"; - params.is_input = false; - create_tensors_from_ggml_tensor(params, tensor_outputs, &_tensor_outputs, &_qnn_tensor_outputs); +bool ggml_qnn_matmul_op_config::initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle) { + GGML_ASSERT(_tensor_inputs.size() == 2); + GGML_ASSERT(_tensor_outputs.size() == 1); // create convert nodes + const auto tensor_rank = _tensor_inputs.front()->get_rank(); qnn_tensor_array_t mat_mul_tensor_inputs = _tensor_inputs; qnn_tensor_array_t mat_mul_tensor_outputs = _tensor_outputs; if (!create_convert_nodes(device, graph_handle, tensor_rank, mat_mul_tensor_inputs, mat_mul_tensor_outputs)) { @@ -343,8 +254,8 @@ qnn_tensor_ptr_t ggml_qnn_matmul_op_config::create_gather_nodes(QNNBackend devic auto gather_out = std::make_shared(ggml_qnn_tensor::INTERMEDIATE, name + "_out", dimensions, tensor_input->get_data_type(), rank, device, graph_handle, qnn_instance); - auto gather_op = std::make_shared(name, QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_GATHER, qnn_instance); + auto gather_op = std::make_shared(name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_GATHER, + qnn_instance); Qnn_Scalar_t scalar = QNN_SCALAR_INIT; scalar.dataType = QNN_DATATYPE_INT_32; @@ -355,16 +266,16 @@ qnn_tensor_ptr_t ggml_qnn_matmul_op_config::create_gather_nodes(QNNBackend devic // here we calculate the index mapping, will generate a 1d tensor like [0, 0, 0, 1, 1, 1, 2, 2, 2, ...], // by repeating each index [scale] times. 
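// for example: with 2 input slices along this axis broadcast to dimensions[axis] = 6, scale = 6 / 2 = 3
// and the generated index tensor is [0, 0, 0, 1, 1, 1], i.e. each input slice is gathered 3 times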
const auto scale = dimensions[axis] / tensor_input->get_dimensions()[axis]; - std::vector index_buffer(dimensions[axis] * sizeof(uint32_t)); - for (uint32_t *curr = reinterpret_cast(index_buffer.data()), *end = curr + dimensions[axis]; + auto index_buffer = std::make_shared(dimensions[axis] * sizeof(uint32_t)); + for (uint32_t *curr = reinterpret_cast(index_buffer->get_buffer()), *end = curr + dimensions[axis]; curr < end; curr++) { - *curr = (curr - reinterpret_cast(index_buffer.data())) / scale; + *curr = (curr - reinterpret_cast(index_buffer->get_buffer())) / scale; } auto gather_index = std::make_shared( ggml_qnn_tensor::PARAMETER, name + "_index", qnn_dimension_array_t{dimensions[axis]}, QNN_DATATYPE_UINT_32, 1, device, graph_handle, qnn_instance); - gather_index->set_data_buffer(std::move(index_buffer)); + gather_index->set_data_buffer(index_buffer); gather_op->set_input_tensors({tensor_input, gather_index}); tensor_output = gather_out; @@ -409,8 +320,8 @@ bool ggml_qnn_matmul_op_config::create_convert_nodes(QNNBackend device, Qnn_Grap auto convert_out = std::make_shared(ggml_qnn_tensor::INTERMEDIATE, convert_name + "_out", convert_in->get_dimensions(), tensor_type, rank, device, graph_handle, _qnn_instance); - auto convert = std::make_shared(convert_name, QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_CONVERT, _qnn_instance); + auto convert = std::make_shared(convert_name, QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_CONVERT, _qnn_instance); convert->set_input_tensors({convert_in}); convert->set_output_tensors({convert_out}); tensor_inputs[i] = convert_out; @@ -424,8 +335,8 @@ bool ggml_qnn_matmul_op_config::create_convert_nodes(QNNBackend device, Qnn_Grap auto convert_in = std::make_shared(ggml_qnn_tensor::INTERMEDIATE, convert_name + "_in", convert_out->get_dimensions(), tensor_type, rank, device, graph_handle, _qnn_instance); - auto output_convert = std::make_shared( - convert_name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_CONVERT, _qnn_instance); + auto output_convert = std::make_shared(convert_name, QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_CONVERT, _qnn_instance); output_convert->set_input_tensors({convert_in}); output_convert->set_output_tensors({convert_out}); tensor_outputs.front() = convert_in; @@ -495,12 +406,12 @@ bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_Grap dst->get_data_type(), rank, device, graph_handle, _qnn_instance); // create transpose_out - auto transpose_out = std::make_shared( - _name + "_trans1", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_TRANSPOSE, _qnn_instance); + auto transpose_out = std::make_shared(_name + "_trans1", QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_TRANSPOSE, _qnn_instance); // create mat_mul - auto mat_mul = std::make_shared(_name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL, - _qnn_instance); + auto mat_mul = + std::make_shared(_name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL, _qnn_instance); Qnn_Scalar_t scalar = QNN_SCALAR_INIT; scalar.dataType = QNN_DATATYPE_BOOL_8; @@ -528,19 +439,20 @@ bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_Grap return true; } -ggml_op_constructor_t create_op_constructor(const std::string &op_name) { +ggml_op_constructor_t create_op_constructor(size_t op) { + std::string op_name = get_qnn_op_name(op); if (op_name == QNN_OP_MAT_MUL) { // For QNN_OP_MAT_MUL, we need to transpose the input tensor return [](const std::string &instance_name, - std::shared_ptr qnn_instance) -> std::unique_ptr { + std::shared_ptr qnn_instance) -> std::shared_ptr { QNN_LOG_DEBUG("create 
QNN_OP_MAT_MUL, name %s", instance_name.c_str()); - return std::make_unique(instance_name, qnn_instance); + return std::make_shared(instance_name, qnn_instance); }; } return [op_name](const std::string &instance_name, - std::shared_ptr qnn_instance) -> std::unique_ptr { - return std::make_unique(instance_name, QNN_OP_PACKAGE_NAME_QTI_AISW, op_name, + std::shared_ptr qnn_instance) -> std::shared_ptr { + return std::make_shared(instance_name, QNN_OP_PACKAGE_NAME_QTI_AISW, op_name, qnn_instance); }; } diff --git a/ggml/src/ggml-qnn/op-config.hpp b/ggml/src/ggml-qnn/op-config.hpp index a05b75ade7e6a..ca066520bc171 100644 --- a/ggml/src/ggml-qnn/op-config.hpp +++ b/ggml/src/ggml-qnn/op-config.hpp @@ -1,7 +1,7 @@ #pragma once -#include #include +#include #include #include @@ -13,9 +13,28 @@ namespace qnn { using ggml_op_constructor_t = - std::function(const std::string &, std::shared_ptr)>; + std::function(const std::string &, std::shared_ptr)>; -ggml_op_constructor_t create_op_constructor(const std::string &op_name); +constexpr const size_t kGgmlUnaryOpStart = GGML_OP_COUNT; + +size_t get_qnn_op_index(const ggml_tensor *tensor); +void get_ggml_op_output_dimensions(const std::vector &input_dims, size_t op, + ggml_dimension_array_t &output_dims); + +const char *get_qnn_op_name(size_t op); +size_t get_qnn_op_input_param_count(size_t op); + +ggml_op_constructor_t create_op_constructor(size_t op); + +inline bool add_op_to_graph(Qnn_GraphHandle_t graph_handle, std::vector &operations) { + for (auto &op : operations) { + if (!op->add_op_to_graph(graph_handle)) { + return false; + } + } + + return true; +} class ggml_qnn_op_config_base : public ggml_qnn_op_config { public: @@ -27,13 +46,18 @@ class ggml_qnn_op_config_base : public ggml_qnn_op_config { bool add_tensor_param(const std::string &name, const qnn_dimension_array_t &dimensions, int rank, const uint8_t *data, const Qnn_DataType_t data_type, QNNBackend device, Qnn_GraphHandle_t graph_handle); + + void set_input_tensors(qnn::qnn_tensor_array_t &tensor_inputs) override; + void set_input_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) override; + void set_output_tensors(qnn::qnn_tensor_array_t &tensor_inputs) override; + void set_output_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) override; bool add_op_to_graph(Qnn_GraphHandle_t graph_handle) override; bool bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) override; bool bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) override; void unbind_input_tensors() override; void unbind_output_tensors() override; - std::vector &get_qnn_input_tensors() override { return _qnn_tensor_inputs; } - std::vector &get_qnn_output_tensors() override { return _qnn_tensor_outputs; } + const qnn_tensor_array_t &get_input_tensors() override { return _tensor_inputs; } + const qnn_tensor_array_t &get_output_tensors() override { return _tensor_outputs; } protected: Qnn_OpConfig_t get_op_config(); @@ -60,24 +84,9 @@ class ggml_qnn_single_op_config : public ggml_qnn_op_config_base { const std::string &op_type, std::shared_ptr qnn_instance) : ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {} - explicit ggml_qnn_single_op_config(const std::string &name, const std::string &package_name, - const std::string &op_type, const std::string ¶m_name, - const Qnn_DataType_t param_type, const size_t param_size, - std::shared_ptr qnn_instance) - : ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance), - _param_name(param_name), - _param_type(param_type), - 
_param_buffer(param_size) {} - - bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, - const ggml_tensor_array_t &tensor_inputs, - const ggml_tensor_array_t &tensor_outputs) override; + bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle) override; private: - const std::string _param_name; - const Qnn_DataType_t _param_type = QNN_DATATYPE_UINT_32; - std::vector _param_buffer; - DISABLE_COPY(ggml_qnn_single_op_config); DISABLE_MOVE(ggml_qnn_single_op_config); }; @@ -88,26 +97,21 @@ class ggml_qnn_aggregate_op_config : public ggml_qnn_op_config { : _name(name), _qnn_instance(qnn_instance) {} ~ggml_qnn_aggregate_op_config() { - _qnn_tensor_inputs.clear(); - _qnn_tensor_outputs.clear(); _tensor_inputs.clear(); _tensor_outputs.clear(); _operations.clear(); } + void set_input_tensors(qnn::qnn_tensor_array_t &tensor_inputs) override; + void set_input_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) override; + void set_output_tensors(qnn::qnn_tensor_array_t &tensor_inputs) override; + void set_output_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) override; bool add_op_to_graph(Qnn_GraphHandle_t graph_handle) override { - for (auto &op : _operations) { - if (!op->add_op_to_graph(graph_handle)) { - return false; - } - } - return true; + return qnn::add_op_to_graph(graph_handle, _operations); } bool bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) override; - bool bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) override; - void unbind_input_tensors() override { for (auto &tensor : _tensor_inputs) { tensor->unbind(); @@ -120,8 +124,8 @@ class ggml_qnn_aggregate_op_config : public ggml_qnn_op_config { } } - std::vector &get_qnn_input_tensors() override { return _qnn_tensor_inputs; } - std::vector &get_qnn_output_tensors() override { return _qnn_tensor_outputs; } + const qnn_tensor_array_t &get_input_tensors() override { return _tensor_inputs; } + const qnn_tensor_array_t &get_output_tensors() override { return _tensor_outputs; } protected: std::string _name; @@ -130,8 +134,6 @@ class ggml_qnn_aggregate_op_config : public ggml_qnn_op_config { std::vector _operations; qnn_tensor_array_t _tensor_inputs; qnn_tensor_array_t _tensor_outputs; - std::vector _qnn_tensor_inputs; - std::vector _qnn_tensor_outputs; private: DISABLE_COPY(ggml_qnn_aggregate_op_config); @@ -143,9 +145,7 @@ class ggml_qnn_matmul_op_config : public ggml_qnn_aggregate_op_config { ggml_qnn_matmul_op_config(const std::string &name, std::shared_ptr qnn_instance) : ggml_qnn_aggregate_op_config(name, qnn_instance) {} - bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, - const ggml_tensor_array_t &tensor_inputs, - const ggml_tensor_array_t &tensor_outputs) override; + bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle) override; private: qnn_tensor_ptr_t create_gather_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, diff --git a/ggml/src/ggml-qnn/qnn-types.hpp b/ggml/src/ggml-qnn/qnn-types.hpp index 7461ac3012755..ec30602843301 100644 --- a/ggml/src/ggml-qnn/qnn-types.hpp +++ b/ggml/src/ggml-qnn/qnn-types.hpp @@ -12,7 +12,9 @@ namespace qnn { // // helper data type / data structure / macros / functions of // Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK -// ref:https://github.com/pytorch/executorch/tree/main/backends/qualcomm +// ref: +// 
https://github.com/pytorch/executorch/blob/ae3d558d5e6aa04fc52a3065399fe6a773702f52/backends/qualcomm/serialization/qc_schema.py#L53 +// https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/overview.html#supported-snapdragon-devices // ================================================================================================= enum sdk_profile_level { profile_off = 0, profile_basic, profile_detail }; @@ -22,14 +24,18 @@ enum qcom_htp_arch { V69 = 69, V73 = 73, V75 = 75, + V79 = 79, // SD 8 Gen 4 (SM8750) }; enum qcom_chipset { UNKNOWN_SM = 0, - SM8450 = 36, // v69 - SM8475 = 42, // v69 - SM8550 = 43, // v73 - SM8650 = 57, // v75 + SM8450 = 36, // v69, SD 8 Gen 1 + SM8475 = 42, // v69, SD 8+ Gen 1 + SM8550 = 43, // v73, SD 8 Gen 2 + SSG2115P = 46, // v73 + SM8650 = 57, // v75, SD 8 Gen 3 + SA8295 = 39, // v68 + SM8750 = 69, // v79, SD 8 Gen 4 }; struct qcom_socinfo { diff --git a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp index 833c620971e0d..3bd86891cb18f 100644 --- a/ggml/src/ggml-qnn/tensor.hpp +++ b/ggml/src/ggml-qnn/tensor.hpp @@ -20,9 +20,9 @@ namespace qnn { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS should be 4"); -class ggml_qnn_tensor { +class ggml_qnn_tensor : public std::enable_shared_from_this { public: - typedef enum _tensor_type { INPUT, OUTPUT, INTERMEDIATE, PARAMETER } tensor_type_t; + typedef enum _tensor_type { INPUT, OUTPUT, INTERMEDIATE, PARAMETER, BIDIRECTION } tensor_type_t; explicit ggml_qnn_tensor(tensor_type_t tensor_type, const std::string &name, const qnn_dimension_array_t &dimensions, Qnn_DataType_t data_type, int rank, @@ -49,18 +49,27 @@ class ggml_qnn_tensor { qnn_datatype_from_ggml_datatype(data_type), rank, device, graph_handle, qnn_instance) {} ~ggml_qnn_tensor() { - _buffer_storage.clear(); - unbind(); _rpc_buffer.reset(); + unbind(); } - bool set_data_buffer(std::vector &&buffer) { - if (!bind_buffer_impl(buffer.data(), buffer.size())) { - return false; + bool set_data_buffer(const uint8_t *buffer, const size_t buffer_size) { + auto qnn_buffer = std::make_shared(buffer, buffer_size); + if (bind_buffer_impl(qnn_buffer)) { + return true; } - _buffer_storage = std::move(buffer); - return true; + can_unbind = false; + return false; + } + + bool set_data_buffer(qnn_buffer_ptr buffer) { + if (bind_buffer_impl(buffer)) { + return true; + } + + can_unbind = false; + return false; } bool alloc_qnn_tensor_id() { @@ -83,23 +92,32 @@ class ggml_qnn_tensor { return true; } - bool bind_buffer(uint8_t *buffer, const size_t buffer_size) { - if (!_buffer_storage.empty()) { + bool bind_ggml_tensor(ggml_tensor *tensor) { + if (!can_unbind) { QNN_LOG_DEBUG("[%s]already has buffer storage, skip bind", _tensor_name.c_str()); return true; } - return bind_buffer_impl(buffer, buffer_size); - } +#ifndef NDEBUG + if (tensor->view_src) { + auto *src = tensor->view_src; + QNN_LOG_DEBUG("[%s]tensor(%s_%dx%dx%dx%d) is a view, src: %s_%dx%dx%dx%d", get_backend_name(_device), + tensor->name, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], src->name, + src->ne[0], src->ne[1], src->ne[2], src->ne[3]); + } +#endif - bool bind_ggml_tensor(ggml_tensor *tensor) { - if (!bind_buffer(reinterpret_cast(tensor->data), ggml_nbytes(tensor))) { + auto buffer = + std::make_shared(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); + if (!bind_buffer_impl(buffer)) { QNN_LOG_WARN("[%s]failed to bind ggml tensor(%s)", _tensor_name.c_str(), ggml_get_name(tensor)); return false; } QNN_LOG_DEBUG("[%s][%s]bind to ggml tensor(%s)", 
get_backend_name(_device), _tensor_name.c_str(), ggml_get_name(tensor)); + tensor->extra = this; + _ggml_tensor = tensor; return true; } @@ -110,7 +128,7 @@ class ggml_qnn_tensor { } if (!_buffer) { - QNN_LOG_DEBUG("[%s]bound to ggml tensor", _tensor_name.c_str()); + QNN_LOG_DEBUG("[%s]unbind to ggml tensor", _tensor_name.c_str()); return true; } @@ -119,7 +137,7 @@ class ggml_qnn_tensor { return false; } - if (!_buffer_storage.empty()) { + if (!can_unbind) { QNN_LOG_DEBUG("[%s]already has buffer storage, stop unbind", _tensor_name.c_str()); return true; } @@ -132,26 +150,32 @@ class ggml_qnn_tensor { } QNN_LOG_DEBUG("[%s][%s]unbind from buffer: %p, size: %d", get_backend_name(_device), _tensor_name.c_str(), - _buffer, (int)_buffer_size); - _buffer = nullptr; - _buffer_size = 0; + _buffer.get(), (int)_buffer->get_size()); + _buffer.reset(); + + if (_ggml_tensor) { + _ggml_tensor->extra = nullptr; + _ggml_tensor = nullptr; + } + return true; } const Qnn_Tensor_t &get_qnn_tensor() const { return _qnn_tensor; } Qnn_DataType_t get_data_type() const { return QNN_TENSOR_GET_DATA_TYPE(_qnn_tensor); } const qnn_dimension_array_t &get_dimensions() const { return _dimensions; } + uint32_t get_rank() const { return QNN_TENSOR_GET_RANK(_qnn_tensor); } uint32_t get_qnn_tensor_id() const { return QNN_TENSOR_GET_ID(_qnn_tensor); } private: - bool bind_buffer_impl(uint8_t *buffer, const size_t buffer_size) { + bool bind_buffer_impl(qnn_buffer_ptr buffer) { if (_buffer) { if (_buffer != buffer) { - QNN_LOG_WARN("[%s]has been bound to another buffer %p", _tensor_name.c_str(), _buffer); + QNN_LOG_WARN("[%s]has been bound to another buffer %p", _tensor_name.c_str(), _buffer.get()); return false; } - QNN_LOG_DEBUG("[%s]already bound to same ggml tensor %p", _tensor_name.c_str(), _buffer); + QNN_LOG_DEBUG("[%s]already bound to same ggml tensor %p", _tensor_name.c_str(), _buffer.get()); return true; } @@ -164,7 +188,7 @@ class ggml_qnn_tensor { if (should_use_mem_handle()) { if (!_rpc_buffer) { auto rpc_buffer = std::make_shared( - _qnn_instance, buffer_size, QNN_TENSOR_GET_RANK(_qnn_tensor), + _qnn_instance, buffer->get_size(), QNN_TENSOR_GET_RANK(_qnn_tensor), QNN_TENSOR_GET_DIMENSIONS(_qnn_tensor), QNN_TENSOR_GET_DATA_TYPE(_qnn_tensor)); if (!rpc_buffer->is_valid()) { QNN_LOG_WARN("[%s][%s]alloc rpc mem failed", get_backend_name(_device), _tensor_name.c_str()); @@ -187,22 +211,21 @@ class ggml_qnn_tensor { QNN_TENSOR_GET_MEM_HANDLE(_qnn_tensor)); } else { QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW); - Qnn_ClientBuffer_t client_buf = {buffer, (uint32_t)buffer_size}; + Qnn_ClientBuffer_t client_buf = {buffer->get_buffer(), (uint32_t)buffer->get_size()}; QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf); QNN_LOG_DEBUG("[%s]use client buffer %p size %d", _tensor_name.c_str(), client_buf.data, (int)client_buf.dataSize); } _buffer = buffer; - _buffer_size = buffer_size; if (!write_to_qnn_tensor()) { QNN_LOG_WARN("[%s]write to qnn tensor failed", _tensor_name.c_str()); return false; } - QNN_LOG_DEBUG("[%s][%s]bind to buffer: %p, size: %d", get_backend_name(_device), _tensor_name.c_str(), buffer, - (int)buffer_size); + QNN_LOG_DEBUG("[%s][%s]bind to buffer: %p, size: %d", get_backend_name(_device), _tensor_name.c_str(), + buffer.get(), (int)buffer->get_size()); return true; } @@ -214,7 +237,7 @@ class ggml_qnn_tensor { } if (_rpc_buffer) { - memcpy(_rpc_buffer->get_buffer(), _buffer, _buffer_size); + memcpy(_rpc_buffer->get_buffer(), _buffer->get_buffer(), _buffer->get_size()); } // For CPU and GPU, 
the data is already in the tensor. @@ -230,7 +253,7 @@ class ggml_qnn_tensor { } if (_rpc_buffer) { - memcpy(_buffer, _rpc_buffer->get_buffer(), _buffer_size); + memcpy(_buffer->get_buffer(), _rpc_buffer->get_buffer(), _buffer->get_size()); } // For CPU and GPU, the data is already in the tensor. @@ -258,6 +281,9 @@ class ggml_qnn_tensor { case PARAMETER: new_tensor_type = QNN_TENSOR_TYPE_STATIC; break; + case BIDIRECTION: + new_tensor_type = QNN_TENSOR_TYPE_APP_READWRITE; + break; case INTERMEDIATE: default: new_tensor_type = QNN_TENSOR_TYPE_NATIVE; @@ -273,15 +299,15 @@ class ggml_qnn_tensor { } std::string _tensor_name; - uint8_t *_buffer = nullptr; - size_t _buffer_size = 0; - std::vector _buffer_storage; + qnn_buffer_ptr _buffer; + bool can_unbind = true; QNNBackend _device; std::shared_ptr _qnn_instance; Qnn_Tensor_t _qnn_tensor = qnn_tensor_init(kDefaultQnnTensorVersion); qnn_dimension_array_t _dimensions = {}; Qnn_GraphHandle_t _graph_handle = nullptr; qnn_buffer_ptr _rpc_buffer; + ggml_tensor *_ggml_tensor = nullptr; DISABLE_COPY(ggml_qnn_tensor); DISABLE_MOVE(ggml_qnn_tensor); @@ -289,5 +315,92 @@ class ggml_qnn_tensor { using qnn_tensor_ptr_t = std::shared_ptr; using qnn_tensor_array_t = std::vector; +using ggml_tensor_array_t = std::vector; + +inline qnn_tensor_ptr_t get_qnn_tensor_ptr(ggml_tensor *ggml_tensor) { + return ggml_tensor->extra ? reinterpret_cast(ggml_tensor->extra)->shared_from_this() + : qnn_tensor_ptr_t(); +} + +inline int get_ggml_tensors_max_rank(const qnn::ggml_tensor_array_t &tensors) { + int max_rank = 0; + for (auto tensor : tensors) { + max_rank = std::max(max_rank, ggml_n_dims(tensor)); + } + + return max_rank; +} + +inline bool bind_tensors(const ggml_tensor_array_t &ggml_tensors, qnn_tensor_array_t &tensor_wrappers, + std::vector &qnn_tensors) { + GGML_ASSERT(tensor_wrappers.size() == ggml_tensors.size()); + qnn_tensors.resize(ggml_tensors.size()); + for (size_t i = 0; i < ggml_tensors.size(); i++) { + auto *ggml_tensor = ggml_tensors[i]; + if (!tensor_wrappers[i]->bind_ggml_tensor(ggml_tensor)) { + QNN_LOG_ERROR("bind tensor %s failed", ggml_get_name(ggml_tensor)); + return false; + } + + qnn_tensors[i] = tensor_wrappers[i]->get_qnn_tensor(); + } + + return true; +} + +inline bool bind_tensors(const ggml_tensor_array_t &ggml_tensors, qnn_tensor_array_t &tensor_wrappers) { + GGML_ASSERT(tensor_wrappers.size() == ggml_tensors.size()); + for (size_t i = 0; i < ggml_tensors.size(); i++) { + auto *ggml_tensor = ggml_tensors[i]; + if (!tensor_wrappers[i]->bind_ggml_tensor(ggml_tensor)) { + QNN_LOG_ERROR("bind tensor %s failed", ggml_get_name(ggml_tensor)); + return false; + } + } + + return true; +} + +inline void unbind_tensors(qnn_tensor_array_t &tensor_wrappers) { + for (auto &tensor : tensor_wrappers) { + tensor->unbind(); + } +} + +struct tensor_create_common_params { + const char *name_prefix; + int tensor_rank; + bool is_input; + QNNBackend device; + Qnn_GraphHandle_t graph_handle; + std::shared_ptr qnn_instance; +}; + +inline void create_tensors_from_ggml_tensor(const tensor_create_common_params ¶ms, + const ggml_tensor_array_t &ggml_tensors, + qnn_tensor_array_t *tensor_wrappers, + std::vector *qnn_tensors) { + if (qnn_tensors) { + qnn_tensors->resize(ggml_tensors.size()); + } + + if (!tensor_wrappers->empty()) { + QNN_LOG_DEBUG("tensor_wrappers is not empty, skip create tensors"); + GGML_ASSERT(tensor_wrappers->size() == ggml_tensors.size()); + return; + } + + tensor_wrappers->resize(ggml_tensors.size()); + + char buffer[GGML_MAX_NAME] = {}; + 
auto tensor_type = params.is_input ? ggml_qnn_tensor::INPUT : ggml_qnn_tensor::OUTPUT; + for (size_t i = 0; i < ggml_tensors.size(); i++) { + snprintf(buffer, GGML_MAX_NAME, "%s%d", params.name_prefix, (int)i); + auto *ggml_tensor = ggml_tensors[i]; + (*tensor_wrappers)[i] = std::make_shared(tensor_type, std::string(buffer), ggml_tensor->ne, + ggml_tensor->type, params.tensor_rank, params.device, + params.graph_handle, params.qnn_instance); + } +} } // namespace qnn diff --git a/ggml/src/ggml-qnn/utils.cpp b/ggml/src/ggml-qnn/utils.cpp index eaabe60cdb262..6e77ee5f5f287 100644 --- a/ggml/src/ggml-qnn/utils.cpp +++ b/ggml/src/ggml-qnn/utils.cpp @@ -188,13 +188,15 @@ const char *get_backend_name(QNNBackend device_index) { const char *get_chipset_desc(uint32_t chipset_id) { switch (chipset_id) { case SM8450: - return "SM8450"; + return "SD 8 Gen 1 (SM8450)"; case SM8475: - return "SM8475"; + return "SD 8+ Gen 1 (SM8475)"; case SM8550: - return "SM8550"; + return "SD 8 Gen 2 (SM8550)"; case SM8650: - return "SM8650"; + return "SD 8 Gen 3 (SM8650)"; + case SM8750: + return "SD 8 Gen 4 (SM8750)"; default: return "unknown"; } @@ -210,6 +212,8 @@ const char *get_htparch_desc(size_t htp_arch) { return "QCOM_HTP_V73"; case V75: return "QCOM_HTP_V75"; + case V79: + return "QCOM_HTP_V79"; default: return "unknown"; } From 5f93376f6703829aa15b068be52a84748507fca2 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 10 Jan 2025 11:30:00 +0800 Subject: [PATCH 138/143] fix compiling error after merged --- ggml/src/ggml-qnn/backend-ops.cpp | 3 ++- ggml/src/ggml-qnn/op-config-caps.cpp | 1 + src/llama.cpp | 4 ---- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 1ed01bfd6851d..75c90e235bbc2 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -314,7 +314,8 @@ constexpr const ggml_qnn_op_t kQnnOpsTable[] = { nullptr, // GGML_OP_WIN_UNPART nullptr, // GGML_OP_GET_REL_POS nullptr, // GGML_OP_ADD_REL_POS - nullptr, // GGML_OP_RWKV_WKV + nullptr, // GGML_OP_RWKV_WKV6 + nullptr, // GGML_OP_GATED_LINEAR_ATTN nullptr, // GGML_OP_UNARY diff --git a/ggml/src/ggml-qnn/op-config-caps.cpp b/ggml/src/ggml-qnn/op-config-caps.cpp index aab8f65958bf1..7fa3d11affc18 100644 --- a/ggml/src/ggml-qnn/op-config-caps.cpp +++ b/ggml/src/ggml-qnn/op-config-caps.cpp @@ -139,6 +139,7 @@ constexpr const qnn_op_caps_t kOpCaps[] = { {}, // GGML_OP_GET_REL_POS {}, // GGML_OP_ADD_REL_POS {}, // GGML_OP_RWKV_WKV6 + {}, // GGML_OP_GATED_LINEAR_ATTN {}, // GGML_OP_UNARY diff --git a/src/llama.cpp b/src/llama.cpp index 2a3409eacbfc7..a364861d3c803 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -11848,10 +11848,6 @@ struct llama_sampler_chain_params llama_sampler_chain_default_params() { size_t llama_max_devices(void) { return 16; } -#if defined(GGML_USE_QNN) - return GGML_QNN_MAX_DEVICES; -#else -#endif bool llama_supports_mmap(void) { return llama_mmap::SUPPORTED; From 10bd671c08f7094c97316edbd12a59e207e0da34 Mon Sep 17 00:00:00 2001 From: nullname Date: Sat, 18 Jan 2025 22:15:27 +0800 Subject: [PATCH 139/143] [feat]add more op support (#18) * disable rpc buffer for npu * append input/output tensor size into unsupported op log * log dimensions for unsupported tensor * wip * split op config classes into separated file * fix reshape * wip * add op_constructor_with_type_param * set parameter for op_constructor_with_type_param func --- ggml/src/ggml-qnn/backend-ops.cpp | 24 +- ggml/src/ggml-qnn/graph.cpp | 12 +- 
ggml/src/ggml-qnn/op-config-caps.cpp | 208 +++++++++++++++++- .../{op-config.cpp => op-config-impl.cpp} | 27 +-- ggml/src/ggml-qnn/op-config-impl.hpp | 151 +++++++++++++ ggml/src/ggml-qnn/op-config.hpp | 136 +----------- ggml/src/ggml-qnn/tensor.hpp | 4 +- 7 files changed, 384 insertions(+), 178 deletions(-) rename ggml/src/ggml-qnn/{op-config.cpp => op-config-impl.cpp} (95%) create mode 100644 ggml/src/ggml-qnn/op-config-impl.hpp diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 75c90e235bbc2..8bbf26da5275e 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -25,7 +25,7 @@ bool qnn_is_op_valid(ggml_backend_qnn_device_context *ctx, const ggml_tensor *ds return false; } - const auto param_count = qnn::get_qnn_op_input_param_count(qnn::get_qnn_op_index(dst)); + const auto param_count = qnn::get_qnn_op_input_param_count(dst); switch (param_count) { case 1: return dst->src[0]; @@ -91,9 +91,13 @@ void get_graph_key_from_op(const ggml_tensor *op, std::string &output) { GGML_ASSERT(op->op != GGML_OP_NONE); output += ggml_op_desc(op); output += qnn::get_ggml_type_name(op->type); - const auto param_count = qnn::get_qnn_op_input_param_count(qnn::get_qnn_op_index(op)); + const auto param_count = qnn::get_qnn_op_input_param_count(op); for (size_t i = 0; i < param_count; ++i) { auto *input = op->src[i]; + if (!input) { + break; + } + output += '_'; append_tensor_dimensions(input, output); } @@ -224,7 +228,7 @@ bool qnn_generic_op_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *dst) #ifndef NDEBUG if (!succeed) { - const auto param_count = qnn::get_qnn_op_input_param_count(qnn::get_qnn_op_index(dst)); + const auto param_count = qnn::get_qnn_op_input_param_count(dst); for (size_t i = 0; i < param_count; ++i) { print_ggml_tensor(dst->src[i]); } @@ -409,7 +413,7 @@ bool ggnl_qnn_supports_op_tensor(ggml_backend_qnn_device_context *ctx, const ggm return false; } - const auto param_count = qnn::get_qnn_op_input_param_count(qnn::get_qnn_op_index(op)); + const auto param_count = qnn::get_qnn_op_input_param_count(op); for (size_t i = 0; i < param_count; ++i) { if (!ggml_qnn_supports_tensor(ctx, op->src[i])) { return false; @@ -479,12 +483,20 @@ bool device_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor } if (!kQnnOpsTable[qnn::get_qnn_op_index(op)]) { - QNN_LOG_DEBUG("[%s]unsupported op", ggml_op_name(op->op)); +#ifndef NDEBUG + std::string op_key; + get_graph_key_from_op(op, op_key); + QNN_LOG_DEBUG("[%s]unsupported op", op_key.c_str()); +#endif return false; } if (!ggnl_qnn_supports_op_tensor(ctx, op)) { - QNN_LOG_DEBUG("[%s]unsupported tensor", ggml_op_name(op->op)); +#ifndef NDEBUG + std::string tensor_dims; + append_tensor_dimensions(op, tensor_dims); + QNN_LOG_DEBUG("[%s]unsupported tensor(%s)", ggml_op_name(op->op), tensor_dims.c_str()); +#endif return false; } diff --git a/ggml/src/ggml-qnn/graph.cpp b/ggml/src/ggml-qnn/graph.cpp index 0210e1554a8ab..680f5e23bd9f3 100644 --- a/ggml/src/ggml-qnn/graph.cpp +++ b/ggml/src/ggml-qnn/graph.cpp @@ -15,7 +15,7 @@ using qnn_tensor_cache_t = std::unordered_mapsrc[i])); } @@ -56,14 +56,12 @@ qnn::qnn_op_config_ptr_t create_operation_from_op_tensor(ggml_tensor *dst, const QNNBackend device, Qnn_GraphHandle_t graph_handle, std::shared_ptr qnn_instance, bool is_intermediate, qnn_tensor_cache_t &tensor_cache) { - const auto op_index = qnn::get_qnn_op_index(dst); - auto qnn_op = qnn::create_op_constructor(op_index); - auto operation = qnn_op(name, qnn_instance); + 
auto operation = qnn::create_op(dst, name, qnn_instance); // input tensors qnn::qnn_tensor_array_t input_qnn_tensors; auto tensor_type = is_intermediate ? qnn::ggml_qnn_tensor::INTERMEDIATE : qnn::ggml_qnn_tensor::INPUT; - for (size_t i = 0; i < qnn::get_qnn_op_input_param_count(op_index); ++i) { + for (size_t i = 0; i < qnn::get_qnn_op_input_param_count(dst); ++i) { auto input_qnn_tensor = create_tensor_with_cache(dst->src[i], tensor_type, rank, device, graph_handle, qnn_instance, tensor_cache); input_qnn_tensors.push_back(input_qnn_tensor); @@ -92,7 +90,7 @@ bool bind_src_tensors(ggml_tensor *op, qnn::qnn_tensor_array_t &tensor_wrappers, return false; } - const auto param_count = qnn::get_qnn_op_input_param_count(qnn::get_qnn_op_index(op)); + const auto param_count = qnn::get_qnn_op_input_param_count(op); GGML_ASSERT(tensor_wrappers.size() == param_count); qnn_tensors.resize(param_count); for (size_t i = 0; i < param_count; ++i) { @@ -268,7 +266,7 @@ bool qnn_graph::build_graph_from_ggml_graph(const ggml_cgraph *cgraph) { continue; } - QNN_LOG_DEBUG("[%s]create op: %s", get_backend_name(_device), get_qnn_op_name(dst->op)); + QNN_LOG_DEBUG("[%s]create op: %s", get_backend_name(_device), get_qnn_op_name(dst)); auto operation = create_operation_from_op_tensor(dst, dst->name, rank, _device, _graph_handle, _qnn_instance, true, tensor_cache); // TODO: fix op name operations.push_back(operation); diff --git a/ggml/src/ggml-qnn/op-config-caps.cpp b/ggml/src/ggml-qnn/op-config-caps.cpp index 7fa3d11affc18..9b28a76dd1dcf 100644 --- a/ggml/src/ggml-qnn/op-config-caps.cpp +++ b/ggml/src/ggml-qnn/op-config-caps.cpp @@ -1,8 +1,10 @@ -#include "op-config.hpp" +#include "op-config-impl.hpp" namespace { +using op_constructor_t = std::shared_ptr (*)(const ggml_tensor *, const std::string &, + std::shared_ptr); using op_dims_calc_func_t = void (*)(const std::vector &input_dims, qnn::ggml_dimension_array_t &output_dims); @@ -24,6 +26,7 @@ struct qnn_op_caps_t { const char *qnn_op_name = nullptr; const size_t input_param_count = 0; op_dims_calc_func_t calc_dims_func = nullptr; + const char *qnn_param_name = nullptr; }; constexpr const qnn_op_caps_t kOpCaps[] = { @@ -80,7 +83,13 @@ constexpr const qnn_op_caps_t kOpCaps[] = { {}, // GGML_OP_CONCAT {}, // GGML_OP_SILU_BACK {}, // GGML_OP_NORM - {}, // GGML_OP_RMS_NORM + { + // GGML_OP_RMS_NORM + QNN_OP_RMS_NORM, // qnn_op_name + 1, // input_param_count + nullptr, // TODO: calc_dims_func + QNN_OP_RMS_NORM_PARAM_EPSILON, // qnn_param_name + }, {}, // GGML_OP_RMS_NORM_BACK {}, // GGML_OP_GROUP_NORM { @@ -187,9 +196,172 @@ static_assert(kOpCaps[GGML_OP_MUL_MAT].calc_dims_func == mat_mul_op_dims, "GGML_OP_ADD does not have element_wise_op_dims function"); static_assert(kOpCaps[GGML_OP_LOG].calc_dims_func == element_wise_op_dims, "GGML_OP_LOG does not have element_wise_op_dims function"); +static_assert(kOpCaps[GGML_OP_COUNT + GGML_UNARY_OP_GELU].input_param_count == 1, + "GGML_UNARY_OP_GELU does not have 1 input parameter"); static_assert(std::size(kOpCaps) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT), "GGML_OP_COUNT does not match the size of the kOpCaps table"); +std::shared_ptr mat_mul_op_constructor(const ggml_tensor *op, const std::string &instance_name, + std::shared_ptr qnn_instance) { + GGML_UNUSED(op); + QNN_LOG_DEBUG("create QNN_OP_MAT_MUL, name %s", instance_name.c_str()); + return std::make_shared(instance_name, qnn_instance); +} + +template +std::shared_ptr generic_op_constructor(const ggml_tensor *op, const std::string &instance_name, + std::shared_ptr 
qnn_instance) { + GGML_UNUSED(op); + static_assert(_op < std::size(kOpCaps)); + static_assert(kOpCaps[_op].qnn_op_name != nullptr); + return std::make_shared(instance_name, QNN_OP_PACKAGE_NAME_QTI_AISW, + kOpCaps[_op].qnn_op_name, qnn_instance); +} + +void add_type_parameters(std::shared_ptr op, const char *name, float value) { + Qnn_Scalar_t scalar = QNN_SCALAR_INIT; + scalar.dataType = QNN_DATATYPE_FLOAT_32; + scalar.floatValue = value; + op->add_scalar_param(name, scalar); +} + +template +std::shared_ptr op_constructor_with_type_param( + const ggml_tensor *op, const std::string &instance_name, std::shared_ptr qnn_instance) { + static_assert(std::is_base_of::value); + static_assert(_op < std::size(kOpCaps)); + + constexpr auto &op_caps = kOpCaps[_op]; + static_assert(op_caps.qnn_op_name != nullptr); + + _ggml_op_param_type op_param; + memcpy(&op_param, op->op_params, sizeof(op_param)); + auto qnn_op = std::make_shared<_qnn_op_type_name>(instance_name, QNN_OP_PACKAGE_NAME_QTI_AISW, op_caps.qnn_op_name, + qnn_instance); + if (op_caps.qnn_param_name) { + add_type_parameters(qnn_op, op_caps.qnn_param_name, op_param); + } + return qnn_op; +} + +constexpr const op_constructor_t kOpConstructors[] = { + nullptr, // GGML_OP_NONE + nullptr, // GGML_OP_DUP + generic_op_constructor, // GGML_OP_ADD + nullptr, // GGML_OP_ADD1 + nullptr, // GGML_OP_ACC + generic_op_constructor, // GGML_OP_SUB + generic_op_constructor, // GGML_OP_MUL + generic_op_constructor, // GGML_OP_DIV + nullptr, // GGML_OP_SQR + generic_op_constructor, // GGML_OP_SQRT + generic_op_constructor, // GGML_OP_LOG + nullptr, // GGML_OP_SIN + nullptr, // GGML_OP_COS + nullptr, // GGML_OP_SUM + nullptr, // GGML_OP_SUM_ROWS + nullptr, // GGML_OP_MEAN + nullptr, // GGML_OP_ARGMAX + nullptr, // GGML_OP_COUNT_EQUAL + nullptr, // GGML_OP_REPEAT + nullptr, // GGML_OP_REPEAT_BACK + nullptr, // GGML_OP_CONCAT + nullptr, // GGML_OP_SILU_BACK + nullptr, // GGML_OP_NORM + op_constructor_with_type_param, // GGML_OP_RMS_NORM + nullptr, // GGML_OP_RMS_NORM_BACK + nullptr, // GGML_OP_GROUP_NORM + + mat_mul_op_constructor, // GGML_OP_MUL_MAT + nullptr, // GGML_OP_MUL_MAT_ID + nullptr, // GGML_OP_OUT_PROD + + nullptr, // GGML_OP_SCALE + nullptr, // GGML_OP_SET + nullptr, // GGML_OP_CPY + nullptr, // GGML_OP_CONT + generic_op_constructor, // GGML_OP_RESHAPE + nullptr, // GGML_OP_VIEW + nullptr, // GGML_OP_PERMUTE + nullptr, // GGML_OP_TRANSPOSE + nullptr, // GGML_OP_GET_ROWS + nullptr, // GGML_OP_GET_ROWS_BACK + nullptr, // GGML_OP_DIAG + nullptr, // GGML_OP_DIAG_MASK_INF + nullptr, // GGML_OP_DIAG_MASK_ZERO + nullptr, // GGML_OP_SOFT_MAX + nullptr, // GGML_OP_SOFT_MAX_BACK + nullptr, // GGML_OP_ROPE + nullptr, // GGML_OP_ROPE_BACK + nullptr, // GGML_OP_CLAMP + nullptr, // GGML_OP_CONV_TRANSPOSE_1D + nullptr, // GGML_OP_IM2COL + nullptr, // GGML_OP_IM2COL_BACK + nullptr, // GGML_OP_CONV_TRANSPOSE_2D + nullptr, // GGML_OP_POOL_1D + nullptr, // GGML_OP_POOL_2D + nullptr, // GGML_OP_POOL_2D_BACK + nullptr, // GGML_OP_UPSCALE + nullptr, // GGML_OP_PAD + nullptr, // GGML_OP_PAD_REFLECT_1D + nullptr, // GGML_OP_ARANGE + nullptr, // GGML_OP_TIMESTEP_EMBEDDING + nullptr, // GGML_OP_ARGSORT + nullptr, // GGML_OP_LEAKY_RELU + + nullptr, // GGML_OP_FLASH_ATTN_EXT + nullptr, // GGML_OP_FLASH_ATTN_BACK + nullptr, // GGML_OP_SSM_CONV + nullptr, // GGML_OP_SSM_SCAN + nullptr, // GGML_OP_WIN_PART + nullptr, // GGML_OP_WIN_UNPART + nullptr, // GGML_OP_GET_REL_POS + nullptr, // GGML_OP_ADD_REL_POS + nullptr, // GGML_OP_RWKV_WKV6 + nullptr, // GGML_OP_GATED_LINEAR_ATTN + + 
nullptr, // GGML_OP_UNARY + + nullptr, // GGML_OP_MAP_UNARY + nullptr, // GGML_OP_MAP_BINARY + + nullptr, // GGML_OP_MAP_CUSTOM1_F32 + nullptr, // GGML_OP_MAP_CUSTOM2_F32 + nullptr, // GGML_OP_MAP_CUSTOM3_F32 + + nullptr, // GGML_OP_MAP_CUSTOM1 + nullptr, // GGML_OP_MAP_CUSTOM2 + nullptr, // GGML_OP_MAP_CUSTOM3 + + nullptr, // GGML_OP_CROSS_ENTROPY_LOSS + nullptr, // GGML_OP_CROSS_ENTROPY_LOSS_BACK + nullptr, // GGML_OP_OPT_STEP_ADAMW + + // ggml_unary_op + nullptr, // GGML_UNARY_OP_ABS + nullptr, // GGML_UNARY_OP_SGN + nullptr, // GGML_UNARY_OP_NEG + nullptr, // GGML_UNARY_OP_STEP + nullptr, // GGML_UNARY_OP_TANH + nullptr, // GGML_UNARY_OP_ELU + nullptr, // GGML_UNARY_OP_RELU + nullptr, // GGML_UNARY_OP_SIGMOID + nullptr, // GGML_UNARY_OP_GELU + nullptr, // GGML_UNARY_OP_GELU_QUICK + nullptr, // GGML_UNARY_OP_SILU + nullptr, // GGML_UNARY_OP_HARDSWISH + nullptr, // GGML_UNARY_OP_HARDSIGMOID + nullptr, // GGML_UNARY_OP_EXP +}; + +static_assert(kOpConstructors[GGML_OP_NONE] == nullptr, "GGML_OP_NONE does not match the nullptr function"); +static_assert(kOpConstructors[GGML_OP_ADD] == generic_op_constructor, + "GGML_OP_ADD does not match the generic_op_constructor function"); +static_assert(kOpConstructors[GGML_OP_MUL_MAT] == mat_mul_op_constructor, + "GGML_OP_MUL_MAT does not match the mat_mul_op_constructor function"); +static_assert(std::size(kOpConstructors) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT), + "GGML_OP_COUNT does not match the size of the kOpConstructors table"); + } // namespace namespace qnn { @@ -202,23 +374,35 @@ size_t get_qnn_op_index(const ggml_tensor *tensor) { return tensor->op; } -void get_ggml_op_output_dimensions(const std::vector &input_dims, size_t op, +void get_ggml_op_output_dimensions(const std::vector &input_dims, const ggml_tensor *op, ggml_dimension_array_t &output_dims) { - GGML_ASSERT(op < std::size(kOpCaps)); - auto get_dims = kOpCaps[op].calc_dims_func; + auto op_index = get_qnn_op_index(op); + GGML_ASSERT(op_index < std::size(kOpCaps)); + auto get_dims = kOpCaps[op_index].calc_dims_func; GGML_ASSERT(get_dims); get_dims(input_dims, output_dims); } -const char *get_qnn_op_name(size_t op) { - GGML_ASSERT(op < std::size(kOpCaps)); - GGML_ASSERT(kOpCaps[op].qnn_op_name); - return kOpCaps[op].qnn_op_name; +const char *get_qnn_op_name(const ggml_tensor *op) { + auto op_index = get_qnn_op_index(op); + GGML_ASSERT(op_index < std::size(kOpCaps)); + GGML_ASSERT(kOpCaps[op_index].qnn_op_name); + return kOpCaps[op_index].qnn_op_name; +} + +size_t get_qnn_op_input_param_count(const ggml_tensor *op) { + auto op_index = get_qnn_op_index(op); + GGML_ASSERT(op_index < std::size(kOpCaps)); + return kOpCaps[op_index].input_param_count; } -size_t get_qnn_op_input_param_count(size_t op) { - GGML_ASSERT(op < std::size(kOpCaps)); - return kOpCaps[op].input_param_count; +std::shared_ptr create_op(const ggml_tensor *op, const std::string &name, + std::shared_ptr qnn_instance) { + auto op_index = get_qnn_op_index(op); + GGML_ASSERT(op_index < std::size(kOpCaps)); + auto op_constructor = kOpConstructors[op_index]; + GGML_ASSERT(op_constructor); + return op_constructor(op, name, qnn_instance); } } // namespace qnn diff --git a/ggml/src/ggml-qnn/op-config.cpp b/ggml/src/ggml-qnn/op-config-impl.cpp similarity index 95% rename from ggml/src/ggml-qnn/op-config.cpp rename to ggml/src/ggml-qnn/op-config-impl.cpp index 7edb4078a57df..19a1bf46ee9dc 100644 --- a/ggml/src/ggml-qnn/op-config.cpp +++ b/ggml/src/ggml-qnn/op-config-impl.cpp @@ -1,4 +1,4 @@ -#include "op-config.hpp" +#include 
"op-config-impl.hpp" #include @@ -187,6 +187,13 @@ bool ggml_qnn_single_op_config::initialize_op_nodes(QNNBackend device, Qnn_Graph return true; } +bool ggml_qnn_rmsnorm_op_config::initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle) { + constexpr const uint32_t kAxes[] = {0}; + add_tensor_param(QNN_OP_RMS_NORM_PARAM_AXES, {1}, 1, reinterpret_cast(kAxes), QNN_DATATYPE_UINT_32, + device, graph_handle); + return true; +} + void ggml_qnn_aggregate_op_config::set_input_tensors(qnn::qnn_tensor_array_t &tensor_inputs) { _tensor_inputs = tensor_inputs; } @@ -439,22 +446,4 @@ bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_Grap return true; } -ggml_op_constructor_t create_op_constructor(size_t op) { - std::string op_name = get_qnn_op_name(op); - if (op_name == QNN_OP_MAT_MUL) { - // For QNN_OP_MAT_MUL, we need to transpose the input tensor - return [](const std::string &instance_name, - std::shared_ptr qnn_instance) -> std::shared_ptr { - QNN_LOG_DEBUG("create QNN_OP_MAT_MUL, name %s", instance_name.c_str()); - return std::make_shared(instance_name, qnn_instance); - }; - } - - return [op_name](const std::string &instance_name, - std::shared_ptr qnn_instance) -> std::shared_ptr { - return std::make_shared(instance_name, QNN_OP_PACKAGE_NAME_QTI_AISW, op_name, - qnn_instance); - }; -} - } // namespace qnn diff --git a/ggml/src/ggml-qnn/op-config-impl.hpp b/ggml/src/ggml-qnn/op-config-impl.hpp new file mode 100644 index 0000000000000..4a00ed2cc7ac3 --- /dev/null +++ b/ggml/src/ggml-qnn/op-config-impl.hpp @@ -0,0 +1,151 @@ +#pragma once + +#include +#include +#include +#include + +#include "op-config.hpp" +#include "qnn-lib.hpp" +#include "qnn-types.hpp" +#include "tensor.hpp" + +namespace qnn { + +class ggml_qnn_op_config_base : public ggml_qnn_op_config { +public: + explicit ggml_qnn_op_config_base(const std::string &name, const std::string &package_name, + const std::string &op_type, std::shared_ptr qnn_instance) + : _name(name), _package_name(package_name), _op_type(op_type), _qnn_instance(qnn_instance) {} + + void add_scalar_param(const std::string &name, const Qnn_Scalar_t scalar); + bool add_tensor_param(const std::string &name, const qnn_dimension_array_t &dimensions, int rank, + const uint8_t *data, const Qnn_DataType_t data_type, QNNBackend device, + Qnn_GraphHandle_t graph_handle); + + void set_input_tensors(qnn::qnn_tensor_array_t &tensor_inputs) override; + void set_input_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) override; + void set_output_tensors(qnn::qnn_tensor_array_t &tensor_inputs) override; + void set_output_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) override; + bool add_op_to_graph(Qnn_GraphHandle_t graph_handle) override; + bool bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) override; + bool bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) override; + void unbind_input_tensors() override; + void unbind_output_tensors() override; + const qnn_tensor_array_t &get_input_tensors() override { return _tensor_inputs; } + const qnn_tensor_array_t &get_output_tensors() override { return _tensor_outputs; } + +protected: + Qnn_OpConfig_t get_op_config(); + + std::string _name; + std::string _package_name; + std::string _op_type; + std::shared_ptr _qnn_instance; + qnn_tensor_array_t _tensor_inputs; + qnn_tensor_array_t _tensor_outputs; + qnn_tensor_array_t _tensor_parameters; + std::vector _qnn_tensor_inputs; + std::vector _qnn_tensor_outputs; + std::vector _qnn_parameters; + std::vector _param_names; + + 
DISABLE_COPY(ggml_qnn_op_config_base); + DISABLE_MOVE(ggml_qnn_op_config_base); +}; + +class ggml_qnn_single_op_config : public ggml_qnn_op_config_base { +public: + explicit ggml_qnn_single_op_config(const std::string &name, const std::string &package_name, + const std::string &op_type, std::shared_ptr qnn_instance) + : ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {} + + bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle) override; + +private: + DISABLE_COPY(ggml_qnn_single_op_config); + DISABLE_MOVE(ggml_qnn_single_op_config); +}; + +class ggml_qnn_rmsnorm_op_config : public ggml_qnn_op_config_base { +public: + explicit ggml_qnn_rmsnorm_op_config(const std::string &name, const std::string &package_name, + const std::string &op_type, std::shared_ptr qnn_instance) + : ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {} + + bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle) override; + +private: + DISABLE_COPY(ggml_qnn_rmsnorm_op_config); + DISABLE_MOVE(ggml_qnn_rmsnorm_op_config); +}; + +class ggml_qnn_aggregate_op_config : public ggml_qnn_op_config { +public: + explicit ggml_qnn_aggregate_op_config(const std::string &name, std::shared_ptr qnn_instance) + : _name(name), _qnn_instance(qnn_instance) {} + + ~ggml_qnn_aggregate_op_config() { + _tensor_inputs.clear(); + _tensor_outputs.clear(); + _operations.clear(); + } + + void set_input_tensors(qnn::qnn_tensor_array_t &tensor_inputs) override; + void set_input_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) override; + void set_output_tensors(qnn::qnn_tensor_array_t &tensor_inputs) override; + void set_output_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) override; + bool add_op_to_graph(Qnn_GraphHandle_t graph_handle) override { + return qnn::add_op_to_graph(graph_handle, _operations); + } + + bool bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) override; + bool bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) override; + void unbind_input_tensors() override { + for (auto &tensor : _tensor_inputs) { + tensor->unbind(); + } + } + + void unbind_output_tensors() override { + for (auto &tensor : _tensor_outputs) { + tensor->unbind(); + } + } + + const qnn_tensor_array_t &get_input_tensors() override { return _tensor_inputs; } + const qnn_tensor_array_t &get_output_tensors() override { return _tensor_outputs; } + +protected: + std::string _name; + std::shared_ptr _qnn_instance; + + std::vector _operations; + qnn_tensor_array_t _tensor_inputs; + qnn_tensor_array_t _tensor_outputs; + +private: + DISABLE_COPY(ggml_qnn_aggregate_op_config); + DISABLE_MOVE(ggml_qnn_aggregate_op_config); +}; + +class ggml_qnn_matmul_op_config : public ggml_qnn_aggregate_op_config { +public: + ggml_qnn_matmul_op_config(const std::string &name, std::shared_ptr qnn_instance) + : ggml_qnn_aggregate_op_config(name, qnn_instance) {} + + bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle) override; + +private: + qnn_tensor_ptr_t create_gather_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, + qnn_tensor_ptr_t tensor_input, qnn_dimension_array_t output_dimensions); + bool create_convert_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, + qnn_tensor_array_t &tensor_inputs, qnn_tensor_array_t &tensor_outputs); + bool create_mat_mul_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, + qnn_tensor_array_t &tensor_inputs, qnn_tensor_array_t &tensor_outputs); + + 
DISABLE_COPY(ggml_qnn_matmul_op_config); + DISABLE_MOVE(ggml_qnn_matmul_op_config); +}; + +} // namespace qnn diff --git a/ggml/src/ggml-qnn/op-config.hpp b/ggml/src/ggml-qnn/op-config.hpp index ca066520bc171..075c56fed6e13 100644 --- a/ggml/src/ggml-qnn/op-config.hpp +++ b/ggml/src/ggml-qnn/op-config.hpp @@ -12,19 +12,16 @@ namespace qnn { -using ggml_op_constructor_t = - std::function(const std::string &, std::shared_ptr)>; - constexpr const size_t kGgmlUnaryOpStart = GGML_OP_COUNT; size_t get_qnn_op_index(const ggml_tensor *tensor); -void get_ggml_op_output_dimensions(const std::vector &input_dims, size_t op, +void get_ggml_op_output_dimensions(const std::vector &input_dims, const ggml_tensor *op, ggml_dimension_array_t &output_dims); -const char *get_qnn_op_name(size_t op); -size_t get_qnn_op_input_param_count(size_t op); - -ggml_op_constructor_t create_op_constructor(size_t op); +const char *get_qnn_op_name(const ggml_tensor *op); +size_t get_qnn_op_input_param_count(const ggml_tensor *op); +std::shared_ptr create_op(const ggml_tensor *op, const std::string &name, + std::shared_ptr qnn_instance); inline bool add_op_to_graph(Qnn_GraphHandle_t graph_handle, std::vector &operations) { for (auto &op : operations) { @@ -36,127 +33,4 @@ inline bool add_op_to_graph(Qnn_GraphHandle_t graph_handle, std::vector qnn_instance) - : _name(name), _package_name(package_name), _op_type(op_type), _qnn_instance(qnn_instance) {} - - void add_scalar_param(const std::string &name, const Qnn_Scalar_t scalar); - bool add_tensor_param(const std::string &name, const qnn_dimension_array_t &dimensions, int rank, - const uint8_t *data, const Qnn_DataType_t data_type, QNNBackend device, - Qnn_GraphHandle_t graph_handle); - - void set_input_tensors(qnn::qnn_tensor_array_t &tensor_inputs) override; - void set_input_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) override; - void set_output_tensors(qnn::qnn_tensor_array_t &tensor_inputs) override; - void set_output_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) override; - bool add_op_to_graph(Qnn_GraphHandle_t graph_handle) override; - bool bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) override; - bool bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) override; - void unbind_input_tensors() override; - void unbind_output_tensors() override; - const qnn_tensor_array_t &get_input_tensors() override { return _tensor_inputs; } - const qnn_tensor_array_t &get_output_tensors() override { return _tensor_outputs; } - -protected: - Qnn_OpConfig_t get_op_config(); - - std::string _name; - std::string _package_name; - std::string _op_type; - std::shared_ptr _qnn_instance; - qnn_tensor_array_t _tensor_inputs; - qnn_tensor_array_t _tensor_outputs; - qnn_tensor_array_t _tensor_parameters; - std::vector _qnn_tensor_inputs; - std::vector _qnn_tensor_outputs; - std::vector _qnn_parameters; - std::vector _param_names; - - DISABLE_COPY(ggml_qnn_op_config_base); - DISABLE_MOVE(ggml_qnn_op_config_base); -}; - -class ggml_qnn_single_op_config : public ggml_qnn_op_config_base { -public: - explicit ggml_qnn_single_op_config(const std::string &name, const std::string &package_name, - const std::string &op_type, std::shared_ptr qnn_instance) - : ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {} - - bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle) override; - -private: - DISABLE_COPY(ggml_qnn_single_op_config); - DISABLE_MOVE(ggml_qnn_single_op_config); -}; - -class ggml_qnn_aggregate_op_config : public 
ggml_qnn_op_config { -public: - explicit ggml_qnn_aggregate_op_config(const std::string &name, std::shared_ptr qnn_instance) - : _name(name), _qnn_instance(qnn_instance) {} - - ~ggml_qnn_aggregate_op_config() { - _tensor_inputs.clear(); - _tensor_outputs.clear(); - _operations.clear(); - } - - void set_input_tensors(qnn::qnn_tensor_array_t &tensor_inputs) override; - void set_input_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) override; - void set_output_tensors(qnn::qnn_tensor_array_t &tensor_inputs) override; - void set_output_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) override; - bool add_op_to_graph(Qnn_GraphHandle_t graph_handle) override { - return qnn::add_op_to_graph(graph_handle, _operations); - } - - bool bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) override; - bool bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) override; - void unbind_input_tensors() override { - for (auto &tensor : _tensor_inputs) { - tensor->unbind(); - } - } - - void unbind_output_tensors() override { - for (auto &tensor : _tensor_outputs) { - tensor->unbind(); - } - } - - const qnn_tensor_array_t &get_input_tensors() override { return _tensor_inputs; } - const qnn_tensor_array_t &get_output_tensors() override { return _tensor_outputs; } - -protected: - std::string _name; - std::shared_ptr _qnn_instance; - - std::vector _operations; - qnn_tensor_array_t _tensor_inputs; - qnn_tensor_array_t _tensor_outputs; - -private: - DISABLE_COPY(ggml_qnn_aggregate_op_config); - DISABLE_MOVE(ggml_qnn_aggregate_op_config); -}; - -class ggml_qnn_matmul_op_config : public ggml_qnn_aggregate_op_config { -public: - ggml_qnn_matmul_op_config(const std::string &name, std::shared_ptr qnn_instance) - : ggml_qnn_aggregate_op_config(name, qnn_instance) {} - - bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle) override; - -private: - qnn_tensor_ptr_t create_gather_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, - qnn_tensor_ptr_t tensor_input, qnn_dimension_array_t output_dimensions); - bool create_convert_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, - qnn_tensor_array_t &tensor_inputs, qnn_tensor_array_t &tensor_outputs); - bool create_mat_mul_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, - qnn_tensor_array_t &tensor_inputs, qnn_tensor_array_t &tensor_outputs); - - DISABLE_COPY(ggml_qnn_matmul_op_config); - DISABLE_MOVE(ggml_qnn_matmul_op_config); -}; - } // namespace qnn diff --git a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp index 3bd86891cb18f..9720e682c81d2 100644 --- a/ggml/src/ggml-qnn/tensor.hpp +++ b/ggml/src/ggml-qnn/tensor.hpp @@ -294,9 +294,7 @@ class ggml_qnn_tensor : public std::enable_shared_from_this { new_tensor_type); } - bool should_use_mem_handle() const { - return _device == QNN_BACKEND_NPU && QNN_TENSOR_GET_TYPE(_qnn_tensor) != QNN_TENSOR_TYPE_STATIC; - } + bool should_use_mem_handle() const { return false; } std::string _tensor_name; qnn_buffer_ptr _buffer; From a822d0075392defba5a83524f3a3564dc71c7f72 Mon Sep 17 00:00:00 2001 From: nullname Date: Mon, 24 Feb 2025 10:47:47 +0800 Subject: [PATCH 140/143] feat: run on win (#24) * move qnn_instance function implementation into cpp * wip * wip * move dl related function into separated file * use cast op for gpu * Revert "use cast op for gpu" This reverts commit 05df7362a15c022d05940d682e84cf480a082c6a. * Reapply "use cast op for gpu" This reverts commit 2520e5922a216faceb6d7efcde23dafe6947a4b3. 
* fix compiling error in win * fix align_alloc in win * fix compiling error * add get sys free/total mem for win * wip * suppress warning in win * add missing chrono header * set the correct qnn lib name for windows * add flag to control cpu backend * wip * wip * Revert "Reapply "use cast op for gpu"" This reverts commit f56519c374a7d46faac706cf214de48ff5fc5139. * fix compiling error for linux build * fix cdsprpc dynamic library name * wip * skip rpc load fail * fix page_align_alloc * suppress some warning in gcc * wip * reuse align to function * more log * add log and fix warning * wip * fix asan errors and memory leaks * fix the get_io_tensors_from_graph * improve comment * print GGML_QNN_DEFAULT_LIB_SEARCH_PATH * revert some unused changes * move library search path setter into qnn module * fix android library loading * skip qnn_device_get_platform_info for npu emulator --- ggml/src/ggml-qnn/CMakeLists.txt | 18 +- ggml/src/ggml-qnn/backend-ops.cpp | 2 +- ggml/src/ggml-qnn/buffer.hpp | 3 + ggml/src/ggml-qnn/dl_loader.hpp | 71 ++++ ggml/src/ggml-qnn/ggml-qnn.cpp | 72 +--- ggml/src/ggml-qnn/graph.cpp | 73 +++- ggml/src/ggml-qnn/logger.cpp | 15 +- ggml/src/ggml-qnn/op-config-caps.cpp | 15 +- ggml/src/ggml-qnn/op-config-impl.cpp | 2 +- ggml/src/ggml-qnn/op-config.hpp | 3 - ggml/src/ggml-qnn/qnn-lib.cpp | 521 ++++++++++++++++++++++++++- ggml/src/ggml-qnn/qnn-lib.hpp | 469 ++---------------------- ggml/src/ggml-qnn/tensor.hpp | 15 +- ggml/src/ggml-qnn/utils.cpp | 84 +++-- ggml/src/ggml-qnn/utils.hpp | 9 +- 15 files changed, 781 insertions(+), 591 deletions(-) create mode 100644 ggml/src/ggml-qnn/dl_loader.hpp diff --git a/ggml/src/ggml-qnn/CMakeLists.txt b/ggml/src/ggml-qnn/CMakeLists.txt index 7bbb9be76b4f6..ccf51e1a55a07 100644 --- a/ggml/src/ggml-qnn/CMakeLists.txt +++ b/ggml/src/ggml-qnn/CMakeLists.txt @@ -4,12 +4,15 @@ if(CMAKE_SYSTEM_NAME STREQUAL "Android") find_library(LOG_LIB log) set(QNN_LINK_LIBRARIES ${LOG_LIB}) set(QNN_DEFAULT_LIB_SEARCH_PATH "/data/local/tmp/" CACHE STRING "customized library search path for QNN backend") +elseif(CMAKE_SYSTEM_NAME STREQUAL "Windows" OR CMAKE_SYSTEM_NAME STREQUAL "Linux") + set(QNN_DEFAULT_LIB_SEARCH_PATH "" CACHE STRING "customized library search path for QNN backend") else() - message(FATAL_ERROR "QNN now only available on Android") + message(FATAL_ERROR "QNN now only available on Android, Windows and Linux") endif() if(NOT DEFINED GGML_QNN_SDK_PATH) # try read from environment variable + # TODO: create a function to search for the SDK path if(DEFINED ENV{QNN_SDK_PATH}) set(GGML_QNN_SDK_PATH $ENV{QNN_SDK_PATH}) else() @@ -29,5 +32,14 @@ ggml_add_backend_library(ggml-qnn target_include_directories(ggml-qnn PRIVATE ${GGML_QNN_SDK_PATH}/include/QNN ${CMAKE_CURRENT_LIST_DIR}) target_link_libraries(ggml-qnn PRIVATE ${QNN_LINK_LIBRARIES}) -string(REGEX REPLACE "/$" "" GGML_QNN_DEFAULT_LIB_SEARCH_PATH "${QNN_DEFAULT_LIB_SEARCH_PATH}") -target_compile_definitions(ggml-qnn PRIVATE GGML_QNN_DEFAULT_LIB_SEARCH_PATH="${QNN_DEFAULT_LIB_SEARCH_PATH}/") +if(NOT "${QNN_DEFAULT_LIB_SEARCH_PATH}" STREQUAL "") + string(REGEX REPLACE "/$" "" QNN_DEFAULT_LIB_SEARCH_PATH "${QNN_DEFAULT_LIB_SEARCH_PATH}") +endif() + +message("GGML_QNN_DEFAULT_LIB_SEARCH_PATH: ${QNN_DEFAULT_LIB_SEARCH_PATH}") +target_compile_definitions(ggml-qnn PRIVATE GGML_QNN_DEFAULT_LIB_SEARCH_PATH="${QNN_DEFAULT_LIB_SEARCH_PATH}") + +if(GGML_QNN_ENABLE_CPU_BACKEND) + message("GGML_QNN_ENABLE_CPU_BACKEND is enabled") + target_compile_definitions(ggml-qnn PRIVATE GGML_QNN_ENABLE_CPU_BACKEND) 
+endif() diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 8bbf26da5275e..f62fc60d5c055 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -389,7 +389,7 @@ bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context *ctx, const ggml_t case GGML_TYPE_F16: case GGML_TYPE_Q8_0: case GGML_TYPE_Q4_0: - if (!(ctx->supported_types & (1 << tensor->type))) { + if (!(ctx->supported_types & (uint64_t(1) << tensor->type))) { QNN_LOG_DEBUG("[%s]unsupported data type %s, supported_types: 0x%x", qnn::get_backend_name(ctx->device), ggml_type_name(tensor->type), ctx->supported_types); return false; diff --git a/ggml/src/ggml-qnn/buffer.hpp b/ggml/src/ggml-qnn/buffer.hpp index af165b394eefb..ce796cbe4df08 100644 --- a/ggml/src/ggml-qnn/buffer.hpp +++ b/ggml/src/ggml-qnn/buffer.hpp @@ -133,11 +133,14 @@ class qnn_mem_buffer : public qnn_buffer_interface { if (data) { memcpy(_buffer, data, size); } + + QNN_LOG_DEBUG("alloc buffer: %p, size: %ld", _buffer, size); } explicit qnn_mem_buffer(size_t size) : qnn_mem_buffer(nullptr, size) {} ~qnn_mem_buffer() { + QNN_LOG_DEBUG("free buffer: %p, size: %ld", _buffer, _size); // the free will do nothing if the _buffer is nullptr qnn::align_free(_buffer); } diff --git a/ggml/src/ggml-qnn/dl_loader.hpp b/ggml/src/ggml-qnn/dl_loader.hpp new file mode 100644 index 0000000000000..1beec8866ba4c --- /dev/null +++ b/ggml/src/ggml-qnn/dl_loader.hpp @@ -0,0 +1,71 @@ +#pragma once + +#ifdef __linux__ +#include +#include +#elif defined(_WIN32) +#define WIN32_LEAN_AND_MEAN +#ifndef NOMINMAX +#define NOMINMAX +#endif +#include +#endif + +#include + +namespace qnn { + +#ifdef __linux__ +typedef void *dl_handler_t; + +inline qnn::dl_handler_t dl_load(const std::string &lib_path) { + return dlopen(lib_path.c_str(), RTLD_NOW | RTLD_LOCAL); +} + +inline void *dl_sym(qnn::dl_handler_t handle, const std::string &symbol) { return dlsym(handle, symbol.c_str()); } + +inline bool dl_unload(qnn::dl_handler_t handle) { return dlclose(handle) == 0; } + +inline const char *dl_error() { return dlerror(); } +#elif defined(_WIN32) +using dl_handler_t = HMODULE; + +inline qnn::dl_handler_t dl_load(const std::string &lib_path) { + // suppress error dialogs for missing DLLs + auto old_mode = SetErrorMode(SEM_FAILCRITICALERRORS); + SetErrorMode(old_mode | SEM_FAILCRITICALERRORS); + + auto handle = LoadLibraryA(lib_path.c_str()); // TODO: use wstring version for unicode paths + + SetErrorMode(old_mode); + return handle; +} + +inline void *dl_sym(qnn::dl_handler_t handle, const std::string &symbol) { + auto old_mode = SetErrorMode(SEM_FAILCRITICALERRORS); + SetErrorMode(old_mode | SEM_FAILCRITICALERRORS); + + void *p = (void *)GetProcAddress(handle, symbol.c_str()); + + SetErrorMode(old_mode); + return p; +} + +inline bool dl_unload(qnn::dl_handler_t handle) { + FreeLibrary(handle); + return true; +} + +inline const char *dl_error() { + // TODO: implement dl_error for Windows + return nullptr; +} + +#endif + +template +Fn dl_sym_typed(qnn::dl_handler_t handle, const std::string &function_name) { + return reinterpret_cast(dl_sym(handle, function_name)); +} + +} // namespace qnn diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index b3673eb35a5f3..8150dcb9ea240 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -1,23 +1,7 @@ #include "ggml-qnn.h" -#include -#include -#include -#include #include -#include -#include #include -#include -#include -#include -#include 
-#include -#include -#include -#include -#include -#include #include #include "ggml-backend-impl.h" @@ -44,6 +28,16 @@ namespace { +#ifdef _WIN32 +constexpr const char *kQnnCpuLibName = "QnnCpu.dll"; +constexpr const char *kQnnGpuLibName = "QnnGpu.dll"; +constexpr const char *kQnnNpuLibName = "QnnHtp.dll"; +#else +constexpr const char *kQnnCpuLibName = "libQnnCpu.so"; +constexpr const char *kQnnGpuLibName = "libQnnGpu.so"; +constexpr const char *kQnnNpuLibName = "libQnnHtp.so"; +#endif + struct qnn_device_caps { const char *name; const char *description; @@ -59,7 +53,7 @@ constexpr const qnn_device_caps kDeviceCaps[] = { // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/CpuOpDefSupplement.html#matmul "qnn-cpu", "Qualcomm Kryo CPU", - "libQnnCpu.so", + kQnnCpuLibName, GGML_BACKEND_DEVICE_TYPE_CPU, (1 << GGML_TYPE_I8) | (1 << GGML_TYPE_F32), }, @@ -67,7 +61,7 @@ constexpr const qnn_device_caps kDeviceCaps[] = { // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/GpuOpDefSupplement.html#matmul "qnn-gpu", "Qualcomm Adreno GPU", - "libQnnGpu.so", + kQnnGpuLibName, GGML_BACKEND_DEVICE_TYPE_GPU, (1 << GGML_TYPE_F32) | (1 << GGML_TYPE_F16), }, @@ -75,7 +69,7 @@ constexpr const qnn_device_caps kDeviceCaps[] = { // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/HtpOpDefSupplement.html#matmul "qnn-npu", "Qualcomm NPU", - "libQnnHtp.so", + kQnnNpuLibName, GGML_BACKEND_DEVICE_TYPE_ACCEL, (1 << GGML_TYPE_F32) | (1 << GGML_TYPE_F16) | (1 << GGML_TYPE_I16) | (1 << GGML_TYPE_I8), }, @@ -214,6 +208,8 @@ void ggml_backend_qnn_free(ggml_backend_t backend) { instance->qnn_finalize(); instance.reset(); } + + delete backend; } bool ggml_backend_qnn_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor *src, @@ -332,42 +328,10 @@ ggml_backend_t ggml_backend_qnn_init_with_device_context(ggml_backend_dev_t dev, const auto device = dev_ctx->device; QNN_LOG_DEBUG("device %s", qnn::get_backend_name(device)); QNN_LOG_DEBUG("extend_lib_search_path %s", extend_lib_search_path); - std::string path = extend_lib_search_path; - -// TODO: Fix this for other platforms -#if defined(__ANDROID__) || defined(ANDROID) - if (device == QNN_BACKEND_NPU) { - if (setenv("LD_LIBRARY_PATH", - (path + ":/vendor/dsp/cdsp:/vendor/lib64:/vendor/dsp/" - "dsp:/vendor/dsp/images") - .c_str(), - 1) == 0) { - QNN_LOG_DEBUG("QNN NPU backend setenv successfully"); - } else { - QNN_LOG_ERROR("QNN NPU backend setenv failure"); - } - if (setenv("ADSP_LIBRARY_PATH", - (path + ";/vendor/dsp/cdsp;/vendor/lib/rfsa/adsp;/system/lib/" - "rfsa/adsp;/vendor/dsp/dsp;/vendor/dsp/images;/dsp") - .c_str(), - 1) == 0) { - QNN_LOG_DEBUG("QNN NPU backend setenv successfully"); - } else { - QNN_LOG_ERROR("QNN NPU backend setenv failure"); - } - } else { - if (setenv("LD_LIBRARY_PATH", path.c_str(), 1) == 0) { - QNN_LOG_DEBUG("%s backend setenv successfully", qnn::get_backend_name(device)); - } else { - QNN_LOG_ERROR("%s backend setenv failure", qnn::get_backend_name(device)); - } - } -#endif - - auto instance = std::make_shared(path, dev_ctx->lib_name, "ggml"); + auto instance = std::make_shared(extend_lib_search_path, dev_ctx->lib_name); auto result = instance->qnn_init(nullptr); if (result != 0) { - QNN_LOG_WARN("init qnn subsystem failed with qnn backend %s, pls check why", qnn::get_backend_name(device)); + QNN_LOG_WARN("failed to init qnn backend %s", qnn::get_backend_name(device)); return nullptr; } auto qnn_interface = instance->get_qnn_interface(); @@ -466,6 +430,7 @@ 
struct ggml_backend_qnn_reg_impl : ggml_backend_reg { QNN_LOG_DEBUG("qnn backend registry init"); for (size_t i = 0; i < QNN_BACKEND_COUNT; i++) { const auto device_enum = (QNNBackend)(QNN_BACKEND_COUNT - 1 - i); // init from the last device, i.e. NPU +#ifndef GGML_QNN_ENABLE_CPU_BACKEND if (device_enum == QNN_BACKEND_CPU) { /* * here we skip the initialization of CPU device, @@ -473,6 +438,7 @@ struct ggml_backend_qnn_reg_impl : ggml_backend_reg { */ continue; } +#endif device_contexts.emplace_back(std::make_unique( /* .device = */ device_enum, // init from the last device, i.e. NPU diff --git a/ggml/src/ggml-qnn/graph.cpp b/ggml/src/ggml-qnn/graph.cpp index 680f5e23bd9f3..25ce5b8fb2754 100644 --- a/ggml/src/ggml-qnn/graph.cpp +++ b/ggml/src/ggml-qnn/graph.cpp @@ -1,7 +1,7 @@ #include "graph.hpp" -#include +#include #include #include "ggml-impl.h" @@ -106,13 +106,29 @@ bool bind_src_tensors(ggml_tensor *op, qnn::qnn_tensor_array_t &tensor_wrappers, return true; } +/** + * @brief Extracts input and output tensors from a computational graph. + * + * This function identifies the input and output tensors of a computational graph by analyzing the connectivity between + * tensor nodes. It does this by iterating over each node in the graph, using a connectivity map that associates every + * tensor with its number of incoming connections (in_degree), outgoing connections (out_degree), and an insertion index + * that preserves order. The insertion index is used later to sort the tensors in their original discovery order. + * + * TODO: this algorithm is not perfect and may not work for all cases. It assumes that the tensors are + * connected in a way that allows for unambiguous categorization. + * It also assumes that the tensors are connected in a way that allows for unambiguous categorization. 
+ */ int get_io_tensors_from_graph(const ggml_cgraph *cgraph, qnn::ggml_tensor_array_t &inputs, qnn::ggml_tensor_array_t &outputs) { - using ggml_tensor_set_t = std::set; + struct _tensor_connectivity_info { + size_t in_degree = 0; + size_t out_degree = 0; + size_t insert_index = 0; + }; - ggml_tensor_set_t input_set; - ggml_tensor_set_t output_set; - ggml_tensor_set_t visited_set; + using ggml_tensor_connectivity_map_t = std::unordered_map; + + ggml_tensor_connectivity_map_t connectivity_map; int rank = 0; for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor *dst = cgraph->nodes[i]; @@ -126,25 +142,50 @@ int get_io_tensors_from_graph(const ggml_cgraph *cgraph, qnn::ggml_tensor_array_ } rank = std::max(rank, ggml_n_dims(dst)); - input_set.erase(dst); - if (!visited_set.count(dst)) { - output_set.insert(dst); - visited_set.insert(dst); + if (connectivity_map.count(dst) == 0) { + connectivity_map[dst] = { + 1, // in-degree, at least 1 + 0, + connectivity_map.size(), + }; + } else { + ++(connectivity_map[dst].in_degree); } for (size_t i = 0; i < GGML_MAX_DIMS && dst->src[i]; ++i) { auto *src = dst->src[i]; rank = std::max(rank, ggml_n_dims(src)); - output_set.erase(src); - if (!visited_set.count(src)) { - input_set.insert(src); - visited_set.insert(src); + + if (connectivity_map.count(src) == 0) { + connectivity_map[src] = { + 0, + 1, // out-degree, at least 1 + connectivity_map.size(), + }; + } else { + ++(connectivity_map[src].out_degree); } } } - inputs.assign(input_set.begin(), input_set.end()); - outputs.assign(output_set.begin(), output_set.end()); + for (const auto &kv : connectivity_map) { + if (kv.second.in_degree == 0) { + inputs.push_back(kv.first); + } + + if (kv.second.out_degree == 0) { + outputs.push_back(kv.first); + } + } + + std::sort(inputs.begin(), inputs.end(), [&connectivity_map](ggml_tensor *lhs, ggml_tensor *rhs) { + return connectivity_map[lhs].insert_index < connectivity_map[rhs].insert_index; + }); + + std::sort(outputs.begin(), outputs.end(), [&connectivity_map](ggml_tensor *lhs, ggml_tensor *rhs) { + return connectivity_map[lhs].insert_index < connectivity_map[rhs].insert_index; + }); + return rank; } @@ -187,7 +228,7 @@ qnn_graph::qnn_graph(const std::string &graph_name, QNNBackend device, std::shar QnnHtpGraph_CustomConfig_t vtcm_config; vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE; - vtcm_config.vtcmSizeInMB = vtcm_size_in_mb; + vtcm_config.vtcmSizeInMB = (uint32_t)vtcm_size_in_mb; QnnGraph_Config_t graph_vtcm_config; graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; graph_vtcm_config.customConfig = &vtcm_config; diff --git a/ggml/src/ggml-qnn/logger.cpp b/ggml/src/ggml-qnn/logger.cpp index 1e781721d629c..23a3f305c060f 100644 --- a/ggml/src/ggml-qnn/logger.cpp +++ b/ggml/src/ggml-qnn/logger.cpp @@ -1,8 +1,7 @@ #include "logger.hpp" -#include - +#include #include #if defined(__ANDROID__) || defined(ANDROID) @@ -23,10 +22,12 @@ void qnn::internal_log(ggml_log_level level, const char * /*file*/, const char * int len = vsnprintf(s_qnn_internal_log_buf + len_prefix, QNN_LOGBUF_LEN - len_prefix, format, args); if (len < (QNN_LOGBUF_LEN - len_prefix)) { #if defined(__ANDROID__) || defined(ANDROID) - // for Android APK + // print to android logcat __android_log_print(level, "ggml-qnn", "%s\n", s_qnn_internal_log_buf); +#else + (void)level; #endif - // for Android command line application or WoA(Windows on ARM) + // print to stdout printf("%s\n", s_qnn_internal_log_buf); } va_end(args); @@ -36,7 +37,7 @@ void qnn::internal_log(ggml_log_level 
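// A minimal, self-contained sketch of the connectivity-map classification described in the
// comment above. The names `node`, `connectivity_info` and `classify_io` are illustrative
// stand-ins for ggml_tensor and the real get_io_tensors_from_graph(); they are not part of this
// patch. Only the in-degree / out-degree / insertion-index bookkeeping is meant to mirror the
// approach: tensors never produced inside the graph become inputs, tensors never consumed
// become outputs, and both lists keep their first-seen order.
#include <algorithm>
#include <cstdio>
#include <string>
#include <unordered_map>
#include <vector>

struct node {
    std::string name;
    std::vector<node *> src;  // operands consumed by this node
};

struct connectivity_info {
    size_t in_degree = 0;     // how many ops produce this tensor
    size_t out_degree = 0;    // how many ops consume this tensor
    size_t insert_index = 0;  // first-seen order, used for stable sorting
};

static void classify_io(const std::vector<node *> &graph, std::vector<node *> &inputs,
                        std::vector<node *> &outputs) {
    std::unordered_map<node *, connectivity_info> conn;
    auto touch = [&conn](node *t) -> connectivity_info & {
        auto it = conn.find(t);
        if (it == conn.end()) {
            it = conn.emplace(t, connectivity_info{0, 0, conn.size()}).first;
        }
        return it->second;
    };

    for (node *dst : graph) {
        ++touch(dst).in_degree;     // dst is produced by this op
        for (node *s : dst->src) {
            ++touch(s).out_degree;  // s is consumed by this op
        }
    }

    // in-degree 0: never produced inside the graph -> graph input
    // out-degree 0: never consumed inside the graph -> graph output
    for (auto &kv : conn) {
        if (kv.second.in_degree == 0) {
            inputs.push_back(kv.first);
        }
        if (kv.second.out_degree == 0) {
            outputs.push_back(kv.first);
        }
    }

    auto by_first_seen = [&conn](node *lhs, node *rhs) {
        return conn.at(lhs).insert_index < conn.at(rhs).insert_index;
    };
    std::sort(inputs.begin(), inputs.end(), by_first_seen);
    std::sort(outputs.begin(), outputs.end(), by_first_seen);
}

int main() {
    // add(mul(a, b), b): a and b are graph inputs, add is the only graph output,
    // mul stays intermediate because it is both produced and consumed.
    node a{"a", {}}, b{"b", {}};
    node mul{"mul", {&a, &b}};
    node add{"add", {&mul, &b}};
    std::vector<node *> graph = {&mul, &add};

    std::vector<node *> inputs, outputs;
    classify_io(graph, inputs, outputs);
    for (node *t : inputs) {
        std::printf("input:  %s\n", t->name.c_str());
    }
    for (node *t : outputs) {
        std::printf("output: %s\n", t->name.c_str());
    }
    return 0;
}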
level, const char * /*file*/, const char * #if ENABLE_QNNSDK_LOG void qnn::sdk_logcallback(const char *fmt, QnnLog_Level_t level, uint64_t /*timestamp*/, va_list argp) { static std::mutex log_mutex; - static unsigned char s_ggml_qnn_logbuf[QNN_LOGBUF_LEN]; + static char s_ggml_qnn_logbuf[QNN_LOGBUF_LEN]; const char *log_level_desc = ""; switch (level) { @@ -62,9 +63,7 @@ void qnn::sdk_logcallback(const char *fmt, QnnLog_Level_t level, uint64_t /*time { std::lock_guard lock(log_mutex); - - memset(s_ggml_qnn_logbuf, 0, QNN_LOGBUF_LEN); - vsnprintf(reinterpret_cast(s_ggml_qnn_logbuf), QNN_LOGBUF_LEN, fmt, argp); + vsnprintf(s_ggml_qnn_logbuf, QNN_LOGBUF_LEN, fmt, argp); QNN_LOG_INFO("[%s]%s", log_level_desc, s_ggml_qnn_logbuf); } } diff --git a/ggml/src/ggml-qnn/op-config-caps.cpp b/ggml/src/ggml-qnn/op-config-caps.cpp index 9b28a76dd1dcf..b250c214a3ad9 100644 --- a/ggml/src/ggml-qnn/op-config-caps.cpp +++ b/ggml/src/ggml-qnn/op-config-caps.cpp @@ -5,17 +5,17 @@ namespace { using op_constructor_t = std::shared_ptr (*)(const ggml_tensor *, const std::string &, std::shared_ptr); -using op_dims_calc_func_t = void (*)(const std::vector &input_dims, +using op_dims_calc_func_t = void (*)(const std::vector &input_dims, qnn::ggml_dimension_array_t &output_dims); -void element_wise_op_dims(const std::vector &input_dims, +void element_wise_op_dims(const std::vector &input_dims, qnn::ggml_dimension_array_t &output_dims) { for (size_t i = 1; i < std::size(output_dims); i++) { output_dims[i] = input_dims.front()[i]; } } -void mat_mul_op_dims(const std::vector &input_dims, +void mat_mul_op_dims(const std::vector &input_dims, qnn::ggml_dimension_array_t &output_dims) { GGML_ASSERT(input_dims.size() == 2); output_dims[0] = input_dims.front()[1]; @@ -374,15 +374,6 @@ size_t get_qnn_op_index(const ggml_tensor *tensor) { return tensor->op; } -void get_ggml_op_output_dimensions(const std::vector &input_dims, const ggml_tensor *op, - ggml_dimension_array_t &output_dims) { - auto op_index = get_qnn_op_index(op); - GGML_ASSERT(op_index < std::size(kOpCaps)); - auto get_dims = kOpCaps[op_index].calc_dims_func; - GGML_ASSERT(get_dims); - get_dims(input_dims, output_dims); -} - const char *get_qnn_op_name(const ggml_tensor *op) { auto op_index = get_qnn_op_index(op); GGML_ASSERT(op_index < std::size(kOpCaps)); diff --git a/ggml/src/ggml-qnn/op-config-impl.cpp b/ggml/src/ggml-qnn/op-config-impl.cpp index 19a1bf46ee9dc..934dbadfdcaf8 100644 --- a/ggml/src/ggml-qnn/op-config-impl.cpp +++ b/ggml/src/ggml-qnn/op-config-impl.cpp @@ -276,7 +276,7 @@ qnn_tensor_ptr_t ggml_qnn_matmul_op_config::create_gather_nodes(QNNBackend devic auto index_buffer = std::make_shared(dimensions[axis] * sizeof(uint32_t)); for (uint32_t *curr = reinterpret_cast(index_buffer->get_buffer()), *end = curr + dimensions[axis]; curr < end; curr++) { - *curr = (curr - reinterpret_cast(index_buffer->get_buffer())) / scale; + *curr = uint32_t((curr - reinterpret_cast(index_buffer->get_buffer())) / scale); } auto gather_index = std::make_shared( diff --git a/ggml/src/ggml-qnn/op-config.hpp b/ggml/src/ggml-qnn/op-config.hpp index 075c56fed6e13..6b8c6946b8e86 100644 --- a/ggml/src/ggml-qnn/op-config.hpp +++ b/ggml/src/ggml-qnn/op-config.hpp @@ -15,9 +15,6 @@ namespace qnn { constexpr const size_t kGgmlUnaryOpStart = GGML_OP_COUNT; size_t get_qnn_op_index(const ggml_tensor *tensor); -void get_ggml_op_output_dimensions(const std::vector &input_dims, const ggml_tensor *op, - ggml_dimension_array_t &output_dims); - const char *get_qnn_op_name(const ggml_tensor 
*op); size_t get_qnn_op_input_param_count(const ggml_tensor *op); std::shared_ptr create_op(const ggml_tensor *op, const std::string &name, diff --git a/ggml/src/ggml-qnn/qnn-lib.cpp b/ggml/src/ggml-qnn/qnn-lib.cpp index a7553c4ac2b75..1f9a68333c05b 100644 --- a/ggml/src/ggml-qnn/qnn-lib.cpp +++ b/ggml/src/ggml-qnn/qnn-lib.cpp @@ -1,35 +1,536 @@ #include "qnn-lib.hpp" +#include + +#if defined(__linux__) +#include +#endif + +namespace { + +#ifdef _WIN32 +constexpr const char *kQnnSystemLibName = "QnnSystem.dll"; +constexpr const char *kQnnRpcLibName = "libcdsprpc.dll"; +#else +constexpr const char *kQnnSystemLibName = "libQnnSystem.so"; +constexpr const char *kQnnRpcLibName = "libcdsprpc.so"; + +#endif + +void insert_path(std::string &path, std::string insert_path, const char separator = ':') { + if (!insert_path.empty() && !path.empty()) { + insert_path += separator; + } + + path.insert(0, insert_path); +} + +// TODO: Fix this for other platforms, or use a more portable way to set the library search path +bool set_qnn_lib_search_path(const std::string &custom_lib_search_path) { +#if defined(__linux__) + { + auto *original = getenv("LD_LIBRARY_PATH"); + std::string lib_search_path = original ? original : ""; + insert_path(lib_search_path, + "/vendor/dsp/cdsp:/vendor/lib64:" + "/vendor/dsp/dsp:/vendor/dsp/images"); + insert_path(lib_search_path, custom_lib_search_path); + if (setenv("LD_LIBRARY_PATH", lib_search_path.c_str(), 1)) { + return false; + } + } + +#if defined(__ANDROID__) || defined(ANDROID) + { + // See also: https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-2/dsp_runtime.html + std::string adsp_lib_search_path = custom_lib_search_path + + ";/vendor/dsp/cdsp;/vendor/lib/rfsa/adsp;/system/lib/" + "rfsa/adsp;/vendor/dsp/dsp;/vendor/dsp/images;/dsp"; + if (setenv("ADSP_LIBRARY_PATH", adsp_lib_search_path.c_str(), 1)) { + return false; + } + + QNN_LOG_DEBUG("ADSP_LIBRARY_PATH=%s", getenv("ADSP_LIBRARY_PATH")); + } +#endif + + QNN_LOG_DEBUG("LD_LIBRARY_PATH=%s", getenv("LD_LIBRARY_PATH")); +#else + (void)custom_lib_search_path; +#endif + + return true; +} + +qnn::dl_handler_t load_lib_with_fallback(const std::string &lib_path, const std::string &load_directory) { + std::filesystem::path full_path(load_directory); + full_path /= std::filesystem::path(lib_path).filename(); + auto handle = qnn::dl_load(full_path.string()); + if (!handle) { + QNN_LOG_WARN("failed to load %s, fallback to %s", full_path.c_str(), lib_path.c_str()); + handle = qnn::dl_load(lib_path); + } + + return handle; +} + +} // namespace + namespace qnn { -qnn_system_interface::qnn_system_interface(const QnnSystemInterface_t &qnn_sys_interface, dl_handler_t lib_handle) : - _qnn_sys_interface(qnn_sys_interface), _lib_handle(lib_handle) { +qnn_system_interface::qnn_system_interface(const QnnSystemInterface_t &qnn_sys_interface, dl_handler_t lib_handle) + : _qnn_sys_interface(qnn_sys_interface), _lib_handle(lib_handle) { qnn_system_context_create(&_qnn_system_handle); if (_qnn_system_handle) { - QNN_LOG_INFO("initialize qnn system successfully\n"); + QNN_LOG_INFO("initialize qnn system successfully"); } else { - QNN_LOG_WARN("can not create QNN system contenxt\n"); + QNN_LOG_WARN("can not create QNN system contenxt"); } } qnn_system_interface::~qnn_system_interface() { if (_qnn_system_handle) { if (qnn_system_context_free(_qnn_system_handle) != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN system context\n"); + QNN_LOG_WARN("failed to free QNN system context"); } } else { - QNN_LOG_WARN("system handle is 
null\n"); + QNN_LOG_WARN("system handle is null"); } if (_lib_handle) { - int dlclose_error = dl_unload(_lib_handle); - if (dlclose_error != 0) { - QNN_LOG_WARN("failed to close QnnSystem library, error %s\n", dl_error()); + if (!dl_unload(_lib_handle)) { + QNN_LOG_WARN("failed to close QnnSystem library, error %s", dl_error()); + } + } else { + QNN_LOG_WARN("system lib handle is null"); + } +} + +qnn_instance::qnn_instance(const std::string &lib_path, const std::string &backend_lib_name) + : _additional_lib_load_path(lib_path), _backend_lib_name(std::move(backend_lib_name)) { + if (set_qnn_lib_search_path(lib_path)) { + QNN_LOG_DEBUG("[%s] set_qnn_lib_search_path succeed", _backend_lib_name.c_str()); + } else { + QNN_LOG_ERROR("[%s] set_qnn_lib_search_path failed", _backend_lib_name.c_str()); + } +} + +int qnn_instance::qnn_init(const QnnSaver_Config_t **saver_config) { + BackendIdType backend_id = QNN_BACKEND_ID_NULL; + QNN_LOG_DEBUG("enter qnn_init"); + + std::lock_guard lock(_init_mutex); + if (load_system() != 0) { + QNN_LOG_WARN("failed to load QNN system lib"); + return 1; + } else { + QNN_LOG_DEBUG("load QNN system lib successfully"); + } + + std::string backend_lib_path = _backend_lib_name; + if (_lib_path_to_backend_id.count(backend_lib_path) == 0) { + if (load_backend(backend_lib_path, saver_config) != 0) { + QNN_LOG_WARN("failed to load QNN backend"); + return 2; + } + } + + backend_id = _lib_path_to_backend_id[backend_lib_path]; + if (_loaded_backend.count(backend_id) == 0 || _loaded_lib_handle.count(backend_id) == 0) { + QNN_LOG_WARN( + "library %s is loaded but loaded backend count=%zu, " + "loaded lib_handle count=%zu", + backend_lib_path.c_str(), _loaded_backend.count(backend_id), _loaded_lib_handle.count(backend_id)); + return 3; + } + + _qnn_interface = std::make_shared(*_loaded_backend[backend_id]); + _qnn_interface->qnn_log_create(qnn::sdk_logcallback, _qnn_log_level, &_qnn_log_handle); + if (!_qnn_log_handle) { + // NPU backend not work on Qualcomm SoC equipped low-end phone + QNN_LOG_WARN("why failed to initialize qnn log"); + return 4; + } else { + QNN_LOG_DEBUG("initialize qnn log successfully"); + } + + std::vector temp_backend_config; + _qnn_interface->qnn_backend_create( + _qnn_log_handle, temp_backend_config.empty() ? 
nullptr : temp_backend_config.data(), &_qnn_backend_handle); + if (!_qnn_backend_handle) { + QNN_LOG_WARN("why failed to initialize qnn backend"); + return 5; + } else { + QNN_LOG_DEBUG("initialize qnn backend successfully"); + } + + auto qnn_status = _qnn_interface->qnn_property_has_capability(QNN_PROPERTY_GROUP_DEVICE); + if (QNN_PROPERTY_NOT_SUPPORTED == qnn_status) { + QNN_LOG_WARN("device property is not supported"); + } + if (QNN_PROPERTY_ERROR_UNKNOWN_KEY == qnn_status) { + QNN_LOG_WARN("device property is not known to backend"); + } + + qnn_status = QNN_SUCCESS; + if (_backend_lib_name.find("Htp") != _backend_lib_name.npos) { + const QnnDevice_PlatformInfo_t *p_info = nullptr; + qnn_status = _qnn_interface->qnn_device_get_platform_info(nullptr, &p_info); + if (qnn_status == QNN_SUCCESS) { + QNN_LOG_INFO("device counts %d", p_info->v1.numHwDevices); + QnnDevice_HardwareDeviceInfo_t *infos = p_info->v1.hwDevices; + QnnHtpDevice_OnChipDeviceInfoExtension_t chipinfo = {}; + for (uint32_t i = 0; i < p_info->v1.numHwDevices; i++) { + QNN_LOG_INFO("deviceID:%d, deviceType:%d, numCores %d", infos[i].v1.deviceId, infos[i].v1.deviceType, + infos[i].v1.numCores); + QnnDevice_DeviceInfoExtension_t devinfo = infos[i].v1.deviceInfoExtension; + chipinfo = devinfo->onChipDevice; + size_t htp_arch = (size_t)chipinfo.arch; + QNN_LOG_INFO("htp_type:%d(%s)", devinfo->devType, + (devinfo->devType == QNN_HTP_DEVICE_TYPE_ON_CHIP) ? "ON_CHIP" : ""); + QNN_LOG_INFO("qualcomm soc_model:%d(%s), htp_arch:%d(%s), vtcm_size:%d MB", chipinfo.socModel, + qnn::get_chipset_desc(chipinfo.socModel), htp_arch, qnn::get_htparch_desc(htp_arch), + chipinfo.vtcmSize); + _soc_info = {chipinfo.socModel, htp_arch, chipinfo.vtcmSize}; + } + _qnn_interface->qnn_device_free_platform_info(nullptr, p_info); + } else { + // For emulator, we can't get platform info + QNN_LOG_WARN("failed to get platform info, are we in emulator?"); + _soc_info = {NONE, UNKNOWN_SM, 0}; + } + + QnnHtpDevice_CustomConfig_t soc_customconfig; + soc_customconfig.option = QNN_HTP_DEVICE_CONFIG_OPTION_SOC; + soc_customconfig.socModel = _soc_info.soc_model; + QnnDevice_Config_t soc_devconfig; + soc_devconfig.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; + soc_devconfig.customConfig = &soc_customconfig; + + QnnHtpDevice_CustomConfig_t arch_customconfig; + arch_customconfig.option = QNN_HTP_DEVICE_CONFIG_OPTION_ARCH; + arch_customconfig.arch.arch = (QnnHtpDevice_Arch_t)_soc_info.htp_arch; + arch_customconfig.arch.deviceId = 0; // Id of device to be used. 0 will use by default. + QnnDevice_Config_t arch_devconfig; + arch_devconfig.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; + arch_devconfig.customConfig = &arch_customconfig; + + const QnnDevice_Config_t *p_deviceconfig[] = {&soc_devconfig, &arch_devconfig, nullptr}; + qnn_status = _qnn_interface->qnn_device_create(_qnn_log_handle, p_deviceconfig, &_qnn_device_handle); + } else { + qnn_status = _qnn_interface->qnn_device_create(_qnn_log_handle, nullptr, &_qnn_device_handle); + } + if (QNN_SUCCESS != qnn_status && QNN_DEVICE_ERROR_UNSUPPORTED_FEATURE != qnn_status) { + QNN_LOG_WARN("failed to create QNN device"); + } else { + QNN_LOG_INFO("create QNN device successfully"); + } + + if (_profile_level != sdk_profile_level::profile_off) { + QNN_LOG_INFO("profiling turned on; level = %d", _profile_level); + auto profile_level = + _profile_level == sdk_profile_level::profile_detail ? 
QNN_PROFILE_LEVEL_DETAILED : QNN_PROFILE_LEVEL_BASIC; + + if (QNN_PROFILE_NO_ERROR != + _qnn_interface->qnn_profile_create(_qnn_backend_handle, profile_level, &_qnn_profile_handle)) { + QNN_LOG_WARN("unable to create profile handle in the backend"); + return 6; + } else { + QNN_LOG_DEBUG("initialize qnn profile successfully"); + } + } + + _rpc_lib_handle = load_lib_with_fallback(kQnnRpcLibName, _additional_lib_load_path); + if (_rpc_lib_handle) { + _pfn_rpc_mem_alloc = reinterpret_cast(dl_sym(_rpc_lib_handle, "rpcmem_alloc")); + _pfn_rpc_mem_free = reinterpret_cast(dl_sym(_rpc_lib_handle, "rpcmem_free")); + _pfn_rpc_mem_to_fd = reinterpret_cast(dl_sym(_rpc_lib_handle, "rpcmem_to_fd")); + if (!_pfn_rpc_mem_alloc || !_pfn_rpc_mem_free || !_pfn_rpc_mem_to_fd) { + QNN_LOG_WARN("unable to access symbols in QNN RPC lib. error: %s", dl_error()); + dl_unload(_rpc_lib_handle); + return 9; + } + + _pfn_rpc_mem_init = reinterpret_cast(dl_sym(_rpc_lib_handle, "rpcmem_init")); + _pfn_rpc_mem_deinit = reinterpret_cast(dl_sym(_rpc_lib_handle, "rpcmem_deinit")); + if (_pfn_rpc_mem_init) { + _pfn_rpc_mem_init(); + } + + _rpcmem_initialized = true; + QNN_LOG_DEBUG("load rpcmem lib successfully"); + } else { + QNN_LOG_WARN("failed to load qualcomm rpc lib, skipping, error:%s", dl_error()); + } + + /* TODO: not used, keep it for further usage + QnnContext_Config_t qnn_context_config = QNN_CONTEXT_CONFIG_INIT; + qnn_context_config.priority = QNN_PRIORITY_DEFAULT; + const QnnContext_Config_t * context_configs[] = {&qnn_context_config, nullptr}; + */ + _qnn_interface->qnn_context_create(_qnn_backend_handle, _qnn_device_handle, nullptr, &_qnn_context_handle); + if (nullptr == _qnn_context_handle) { + QNN_LOG_WARN("why failed to initialize qnn context"); + return 10; + } else { + QNN_LOG_DEBUG("initialize qnn context successfully"); + } + + if (_backend_lib_name.find("Htp") != _backend_lib_name.npos) { + // TODO: faster approach to probe the accurate capacity of rpc ion memory + size_t candidate_size = 0; + uint8_t *rpc_buffer = nullptr; + const int size_in_mb = (1 << 20); + size_t probe_slots[] = {1024, 1536, 2048 - 48, 2048}; + size_t probe_counts = sizeof(probe_slots) / sizeof(size_t); + for (size_t idx = 0; idx < probe_counts; idx++) { + rpc_buffer = static_cast(alloc_rpcmem(probe_slots[idx] * size_in_mb, sizeof(void *))); + if (!rpc_buffer) { + QNN_LOG_DEBUG("alloc rpcmem %d (MB) failure, %s", probe_slots[idx], strerror(errno)); + break; + } else { + candidate_size = probe_slots[idx]; + free_rpcmem(rpc_buffer); + rpc_buffer = nullptr; + } + } + + _rpcmem_capacity = std::max(candidate_size, _rpcmem_capacity); + QNN_LOG_INFO("capacity of QNN rpc ion memory is about %d MB", _rpcmem_capacity); + + if (init_htp_perfinfra() != 0) { + QNN_LOG_WARN("initialize HTP performance failure"); } + if (set_rpc_polling() != 0) { + QNN_LOG_WARN("set RPC polling failure"); + } + if (set_high_performance_mode() != 0) { + QNN_LOG_WARN("set HTP high performance mode failure"); + } + } + + QNN_LOG_DEBUG("leave qnn_init"); + + return 0; +} + +int qnn_instance::qnn_finalize() { + int ret_status = 0; + Qnn_ErrorHandle_t error = QNN_SUCCESS; + + if (_rpc_lib_handle) { + if (_pfn_rpc_mem_deinit) { + _pfn_rpc_mem_deinit(); + _pfn_rpc_mem_deinit = nullptr; + } + + if (dl_unload(_rpc_lib_handle)) { + QNN_LOG_DEBUG("succeed to close rpcmem lib"); + } else { + QNN_LOG_WARN("failed to unload qualcomm's rpc lib, error:%s", dl_error()); + } + } + + if (_backend_lib_name.find("Htp") != _backend_lib_name.npos) { + 
_qnn_htp_perfinfra->destroyPowerConfigId(_qnn_power_configid); + } + + if (_qnn_context_handle) { + error = _qnn_interface->qnn_context_free(_qnn_context_handle, _qnn_profile_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN context_handle: ID %u, error %d", _qnn_interface->get_backend_id(), + QNN_GET_ERROR_CODE(error)); + } + _qnn_context_handle = nullptr; + } + + if (_qnn_profile_handle) { + error = _qnn_interface->qnn_profile_free(_qnn_profile_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN profile_handle: ID %u, error %d", _qnn_interface->get_backend_id(), + QNN_GET_ERROR_CODE(error)); + } + _qnn_profile_handle = nullptr; + } + + if (_qnn_device_handle) { + error = _qnn_interface->qnn_device_free(_qnn_device_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN device_handle: ID %u, error %d", _qnn_interface->get_backend_id(), + QNN_GET_ERROR_CODE(error)); + } + _qnn_device_handle = nullptr; + } + + if (_qnn_backend_handle) { + error = _qnn_interface->qnn_backend_free(_qnn_backend_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN backend_handle: ID %u, error %d", _qnn_interface->get_backend_id(), + QNN_GET_ERROR_CODE(error)); + } + _qnn_backend_handle = nullptr; + } + + if (nullptr != _qnn_log_handle) { + error = _qnn_interface->qnn_log_free(_qnn_log_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN log_handle: ID %u, error %d", _qnn_interface->get_backend_id(), + QNN_GET_ERROR_CODE(error)); + } + _qnn_log_handle = nullptr; + } + + unload_backend(); + + _qnn_sys_interface.reset(); + + return ret_status; +} + +int qnn_instance::load_system() { + QNN_LOG_DEBUG("[%s]lib: %s", _backend_lib_name.c_str(), kQnnSystemLibName); + auto system_lib_handle = load_lib_with_fallback(kQnnSystemLibName, _additional_lib_load_path); + if (!system_lib_handle) { + QNN_LOG_WARN("can not load QNN library %s, error: %s", kQnnSystemLibName, dl_error()); + return 1; + } + + auto *get_providers = + dl_sym_typed(system_lib_handle, "QnnSystemInterface_getProviders"); + if (!get_providers) { + QNN_LOG_WARN("can not load QNN symbol QnnSystemInterface_getProviders: %s", dl_error()); + return 2; + } + + uint32_t num_providers = 0; + const QnnSystemInterface_t **provider_list = nullptr; + Qnn_ErrorHandle_t error = get_providers(&provider_list, &num_providers); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to get providers, error %d", QNN_GET_ERROR_CODE(error)); + return 3; + } + + QNN_LOG_DEBUG("num_providers: %d", num_providers); + if (num_providers != _required_num_providers) { + QNN_LOG_WARN("providers is %d instead of required %d", num_providers, _required_num_providers); + return 4; + } + + if (!provider_list) { + QNN_LOG_WARN("can not get providers"); + return 5; + } + + QNN_SYSTEM_INTERFACE_VER_TYPE qnn_system_interface; + bool found_valid_system_interface = false; + for (size_t idx = 0; idx < num_providers; idx++) { + if (QNN_SYSTEM_API_VERSION_MAJOR == provider_list[idx]->systemApiVersion.major && + QNN_SYSTEM_API_VERSION_MINOR <= provider_list[idx]->systemApiVersion.minor) { + found_valid_system_interface = true; + qnn_system_interface = provider_list[idx]->QNN_SYSTEM_INTERFACE_VER_NAME; + break; + } + } + + if (!found_valid_system_interface) { + QNN_LOG_WARN("unable to find a valid qnn system interface"); + return 6; + } else { + QNN_LOG_DEBUG("find a valid qnn system interface"); + } + + auto qnn_sys_interface = std::make_shared(*provider_list[0], system_lib_handle); + if 
(!qnn_sys_interface->is_valid()) { + QNN_LOG_WARN("failed to create QNN system interface"); + return 7; + } + + _qnn_sys_interface = qnn_sys_interface; + return 0; +} + +int qnn_instance::load_backend(std::string &lib_path, const QnnSaver_Config_t ** /*saver_config*/) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + QNN_LOG_DEBUG("lib_path:%s", lib_path.c_str()); + + auto lib_handle = load_lib_with_fallback(lib_path, _additional_lib_load_path); + if (!lib_handle) { + QNN_LOG_WARN("can not open QNN library %s, with error: %s", lib_path.c_str(), dl_error()); + return 1; + } + + auto get_providers = dl_sym_typed(lib_handle, "QnnInterface_getProviders"); + if (!get_providers) { + QNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s", dl_error()); + return 2; + } + + std::uint32_t num_providers = 0; + const QnnInterface_t **provider_list = nullptr; + error = get_providers(&provider_list, &num_providers); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to get providers, error %d", QNN_GET_ERROR_CODE(error)); + return 3; + } + QNN_LOG_DEBUG("num_providers=%d", num_providers); + if (num_providers != _required_num_providers) { + QNN_LOG_WARN("providers is %d instead of required %d", num_providers, _required_num_providers); + return 4; + } + + if (!provider_list) { + QNN_LOG_WARN("failed to get qnn interface providers"); + return 5; + } + bool found_valid_interface = false; + QNN_INTERFACE_VER_TYPE qnn_interface; + for (size_t idx = 0; idx < num_providers; idx++) { + if (QNN_API_VERSION_MAJOR == provider_list[idx]->apiVersion.coreApiVersion.major && + QNN_API_VERSION_MINOR <= provider_list[idx]->apiVersion.coreApiVersion.minor) { + found_valid_interface = true; + qnn_interface = provider_list[idx]->QNN_INTERFACE_VER_NAME; + break; + } + } + + if (!found_valid_interface) { + QNN_LOG_WARN("unable to find a valid qnn interface"); + return 6; } else { - QNN_LOG_WARN("system lib handle is null\n"); + QNN_LOG_DEBUG("find a valid qnn interface"); + } + + BackendIdType backend_id = provider_list[0]->backendId; + _lib_path_to_backend_id[lib_path] = backend_id; + if (_loaded_backend.count(backend_id) > 0) { + QNN_LOG_WARN("lib_path %s is loaded, but backend %d already exists", lib_path.c_str(), backend_id); } + _loaded_backend[backend_id] = provider_list[0]; + if (_loaded_lib_handle.count(backend_id) > 0) { + QNN_LOG_WARN("closing %p", _loaded_lib_handle[backend_id]); + if (!dl_unload(_loaded_lib_handle[backend_id])) { + QNN_LOG_WARN("fail to close %p with error %s", _loaded_lib_handle[backend_id], dl_error()); + } + } + _loaded_lib_handle[backend_id] = lib_handle; + _backend_id = backend_id; + + return 0; +} + +int qnn_instance::unload_backend() { + for (auto &it : _loaded_lib_handle) { + if (!dl_unload(it.second)) { + QNN_LOG_WARN("failed to close QNN backend %d, error %s", it.first, dl_error()); + } + } + + _loaded_lib_handle.clear(); + _lib_path_to_backend_id.clear(); + _loaded_backend.clear(); + + return 0; } } // namespace qnn diff --git a/ggml/src/ggml-qnn/qnn-lib.hpp b/ggml/src/ggml-qnn/qnn-lib.hpp index 454c0c6aa32c5..968df5bcf297d 100644 --- a/ggml/src/ggml-qnn/qnn-lib.hpp +++ b/ggml/src/ggml-qnn/qnn-lib.hpp @@ -1,8 +1,10 @@ #pragma once -#include - #include +#include +#include +#include +#include #include #include #include @@ -22,27 +24,12 @@ #include #include +#include "dl_loader.hpp" #include "qnn-types.hpp" #include "utils.hpp" namespace qnn { -// TODO: those function should be moved to a separate file, and have separate implementation for each platform -typedef void *dl_handler_t; 
- -inline dl_handler_t dl_load(const std::string &lib_path) { return dlopen(lib_path.c_str(), RTLD_NOW | RTLD_LOCAL); } - -inline void *dl_sym(dl_handler_t handle, const std::string &symbol) { return dlsym(handle, symbol.c_str()); } - -inline int dl_unload(dl_handler_t handle) { return dlclose(handle); } - -inline const char *dl_error() { return dlerror(); } - -template -Fn dl_sym_typed(dl_handler_t handle, const std::string &function_name) { - return reinterpret_cast(dl_sym(handle, function_name)); -} - // ================================================================================================= // // wrapper class of Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK @@ -52,6 +39,7 @@ Fn dl_sym_typed(dl_handler_t handle, const std::string &function_name) { // TODO: fix this for other compilers #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wextra-semi" +#pragma GCC diagnostic ignored "-Wpedantic" class qnn_system_interface { @@ -188,273 +176,10 @@ class qnn_instance { public: using BackendIdType = decltype(QnnInterface_t{}.backendId); - explicit qnn_instance(const std::string &lib_path, const std::string &backend_name, const std::string &model_name) - : _lib_path(std::move(lib_path)), _backend_name(std::move(backend_name)), _model_name(std::move(model_name)) {} - + explicit qnn_instance(const std::string &lib_path, const std::string &backend_lib_name); ~qnn_instance() {} - - int qnn_init(const QnnSaver_Config_t **saver_config) { - BackendIdType backend_id = QNN_BACKEND_ID_NULL; - QNN_LOG_DEBUG("enter qnn_init"); - - std::lock_guard lock(_init_mutex); - if (load_system() != 0) { - QNN_LOG_WARN("can not load QNN system lib, pls check why?"); - return 1; - } else { - QNN_LOG_DEBUG("load QNN system lib successfully"); - } - - std::string backend_lib_path = _lib_path + _backend_name; - if (_lib_path_to_backend_id.count(backend_lib_path) == 0) { - int is_load_ok = load_backend(backend_lib_path, saver_config); - if (is_load_ok != 0) { - QNN_LOG_WARN("failed to load QNN backend"); - return 2; - } - } - - backend_id = _lib_path_to_backend_id[backend_lib_path]; - if (_loaded_backend.count(backend_id) == 0 || _loaded_lib_handle.count(backend_id) == 0) { - QNN_LOG_WARN( - "library %s is loaded but loaded backend count=%zu, " - "loaded lib_handle count=%zu", - backend_lib_path.c_str(), _loaded_backend.count(backend_id), _loaded_lib_handle.count(backend_id)); - return 3; - } - - _qnn_interface = std::make_shared(*_loaded_backend[backend_id]); - _qnn_interface->qnn_log_create(qnn::sdk_logcallback, _qnn_log_level, &_qnn_log_handle); - if (nullptr == _qnn_log_handle) { - // NPU backend not work on Qualcomm SoC equipped low-end phone - QNN_LOG_WARN("why failed to initialize qnn log"); - return 4; - } else { - QNN_LOG_DEBUG("initialize qnn log successfully"); - } - - std::vector temp_backend_config; - _qnn_interface->qnn_backend_create( - _qnn_log_handle, temp_backend_config.empty() ? 
nullptr : temp_backend_config.data(), &_qnn_backend_handle); - if (nullptr == _qnn_backend_handle) { - QNN_LOG_WARN("why failed to initialize qnn backend"); - return 5; - } else { - QNN_LOG_DEBUG("initialize qnn backend successfully"); - } - - Qnn_ErrorHandle_t qnn_status = _qnn_interface->qnn_property_has_capability(QNN_PROPERTY_GROUP_DEVICE); - if (QNN_PROPERTY_NOT_SUPPORTED == qnn_status) { - QNN_LOG_WARN("device property is not supported"); - } - if (QNN_PROPERTY_ERROR_UNKNOWN_KEY == qnn_status) { - QNN_LOG_WARN("device property is not known to backend"); - } - - qnn_status = QNN_SUCCESS; - if (_backend_name.find("Htp") != _backend_name.npos) { - const QnnDevice_PlatformInfo_t *p_info = nullptr; - _qnn_interface->qnn_device_get_platform_info(nullptr, &p_info); - QNN_LOG_INFO("device counts %d", p_info->v1.numHwDevices); - QnnDevice_HardwareDeviceInfo_t *infos = p_info->v1.hwDevices; - QnnHtpDevice_OnChipDeviceInfoExtension_t chipinfo = {}; - for (uint32_t i = 0; i < p_info->v1.numHwDevices; i++) { - QNN_LOG_INFO("deviceID:%d, deviceType:%d, numCores %d", infos[i].v1.deviceId, infos[i].v1.deviceType, - infos[i].v1.numCores); - QnnDevice_DeviceInfoExtension_t devinfo = infos[i].v1.deviceInfoExtension; - chipinfo = devinfo->onChipDevice; - QnnHtpDevice_Arch_t htp_arch = chipinfo.arch; - QNN_LOG_INFO("htp_type:%d(%s)", devinfo->devType, - (devinfo->devType == QNN_HTP_DEVICE_TYPE_ON_CHIP) ? "ON_CHIP" : ""); - QNN_LOG_INFO("qualcomm soc_model:%d(%s), htp_arch:%d(%s), vtcm_size:%d MB", chipinfo.socModel, - qnn::get_chipset_desc(chipinfo.socModel), htp_arch, qnn::get_htparch_desc(htp_arch), - chipinfo.vtcmSize); - _soc_info = {chipinfo.socModel, htp_arch, chipinfo.vtcmSize}; - } - _qnn_interface->qnn_device_free_platform_info(nullptr, p_info); - - QnnHtpDevice_CustomConfig_t soc_customconfig; - soc_customconfig.option = QNN_HTP_DEVICE_CONFIG_OPTION_SOC; - soc_customconfig.socModel = chipinfo.socModel; - QnnDevice_Config_t soc_devconfig; - soc_devconfig.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; - soc_devconfig.customConfig = &soc_customconfig; - - QnnHtpDevice_CustomConfig_t arch_customconfig; - arch_customconfig.option = QNN_HTP_DEVICE_CONFIG_OPTION_ARCH; - arch_customconfig.arch.arch = chipinfo.arch; - arch_customconfig.arch.deviceId = 0; // Id of device to be used. If single device is used by default 0. - QnnDevice_Config_t arch_devconfig; - arch_devconfig.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; - arch_devconfig.customConfig = &arch_customconfig; - - const QnnDevice_Config_t *p_deviceconfig[] = {&soc_devconfig, &arch_devconfig, nullptr}; - qnn_status = _qnn_interface->qnn_device_create(_qnn_log_handle, p_deviceconfig, &_qnn_device_handle); - } else { - qnn_status = _qnn_interface->qnn_device_create(_qnn_log_handle, nullptr, &_qnn_device_handle); - } - if (QNN_SUCCESS != qnn_status && QNN_DEVICE_ERROR_UNSUPPORTED_FEATURE != qnn_status) { - QNN_LOG_WARN("failed to create QNN device"); - } else { - QNN_LOG_INFO("create QNN device successfully"); - } - - if (_profile_level != sdk_profile_level::profile_off) { - QNN_LOG_INFO("profiling turned on; level = %d", _profile_level); - auto profile_level = _profile_level == sdk_profile_level::profile_detail ? 
QNN_PROFILE_LEVEL_DETAILED - : QNN_PROFILE_LEVEL_BASIC; - - if (QNN_PROFILE_NO_ERROR != - _qnn_interface->qnn_profile_create(_qnn_backend_handle, profile_level, &_qnn_profile_handle)) { - QNN_LOG_WARN("unable to create profile handle in the backend"); - return 6; - } else { - QNN_LOG_DEBUG("initialize qnn profile successfully"); - } - } - - _rpc_lib_handle = dl_load("libcdsprpc.so"); - if (nullptr == _rpc_lib_handle) { - QNN_LOG_WARN("failed to load qualcomm's rpc lib, error:%s", dl_error()); - return 8; - } else { - QNN_LOG_DEBUG("load rpcmem lib successfully"); - set_rpcmem_initialized(true); - } - _pfn_rpc_mem_init = reinterpret_cast(dl_sym(_rpc_lib_handle, "rpcmem_init")); - _pfn_rpc_mem_deinit = reinterpret_cast(dl_sym(_rpc_lib_handle, "rpcmem_deinit")); - _pfn_rpc_mem_alloc = reinterpret_cast(dl_sym(_rpc_lib_handle, "rpcmem_alloc")); - _pfn_rpc_mem_free = reinterpret_cast(dl_sym(_rpc_lib_handle, "rpcmem_free")); - _pfn_rpc_mem_to_fd = reinterpret_cast(dl_sym(_rpc_lib_handle, "rpcmem_to_fd")); - if (nullptr == _pfn_rpc_mem_alloc || nullptr == _pfn_rpc_mem_free || nullptr == _pfn_rpc_mem_to_fd) { - QNN_LOG_WARN("unable to access symbols in QNN RPC lib. error: %s", dl_error()); - dl_unload(_rpc_lib_handle); - return 9; - } - - if (nullptr != _pfn_rpc_mem_init) { // make Qualcomm's SoC equipped low-end phone happy - _pfn_rpc_mem_init(); - } - - /* TODO: not used, keep it for further usage - QnnContext_Config_t qnn_context_config = QNN_CONTEXT_CONFIG_INIT; - qnn_context_config.priority = QNN_PRIORITY_DEFAULT; - const QnnContext_Config_t * context_configs[] = {&qnn_context_config, nullptr}; - */ - _qnn_interface->qnn_context_create(_qnn_backend_handle, _qnn_device_handle, nullptr, &_qnn_context_handle); - if (nullptr == _qnn_context_handle) { - QNN_LOG_WARN("why failed to initialize qnn context"); - return 10; - } else { - QNN_LOG_DEBUG("initialize qnn context successfully"); - } - - if (_backend_name.find("Htp") != _backend_name.npos) { - // TODO: faster approach to probe the accurate capacity of rpc ion memory - size_t candidate_size = 0; - uint8_t *rpc_buffer = nullptr; - const int size_in_mb = (1 << 20); - size_t probe_slots[] = {1024, 1536, 2048 - 48, 2048}; - size_t probe_counts = sizeof(probe_slots) / sizeof(size_t); - for (size_t idx = 0; idx < probe_counts; idx++) { - rpc_buffer = static_cast(alloc_rpcmem(probe_slots[idx] * size_in_mb, sizeof(void *))); - if (!rpc_buffer) { - QNN_LOG_DEBUG("alloc rpcmem %d (MB) failure, %s", probe_slots[idx], strerror(errno)); - break; - } else { - candidate_size = probe_slots[idx]; - free_rpcmem(rpc_buffer); - rpc_buffer = nullptr; - } - } - - _rpcmem_capacity = std::max(candidate_size, _rpcmem_capacity); - QNN_LOG_INFO("capacity of QNN rpc ion memory is about %d MB", _rpcmem_capacity); - - if (0 != init_htp_perfinfra()) { - QNN_LOG_WARN("initialize HTP performance failure"); - } - if (0 != set_rpc_polling()) { - QNN_LOG_WARN("set RPC polling failure"); - } - if (0 != set_high_performance_mode()) { - QNN_LOG_WARN("set HTP high performance mode failure"); - } - } - - QNN_LOG_DEBUG("leave qnn_init"); - - return 0; - } - - int qnn_finalize() { - int ret_status = 0; - Qnn_ErrorHandle_t error = QNN_SUCCESS; - - if (nullptr != _pfn_rpc_mem_deinit) // make Qualcomm's SoC equipped low-end phone happy - _pfn_rpc_mem_deinit(); - - if (dl_unload(_rpc_lib_handle) != 0) { - QNN_LOG_WARN("failed to unload qualcomm's rpc lib, error:%s", dl_error()); - } else { - QNN_LOG_DEBUG("succeed to close rpcmem lib"); - } - - if (_backend_name.find("Htp") != 
_backend_name.npos) { - _qnn_htp_perfinfra->destroyPowerConfigId(_qnn_power_configid); - } - - if (nullptr != _qnn_context_handle) { - error = _qnn_interface->qnn_context_free(_qnn_context_handle, _qnn_profile_handle); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN context_handle: ID %u, error %d", _qnn_interface->get_backend_id(), - QNN_GET_ERROR_CODE(error)); - } - _qnn_context_handle = nullptr; - } - - if (nullptr != _qnn_profile_handle) { - error = _qnn_interface->qnn_profile_free(_qnn_profile_handle); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN profile_handle: ID %u, error %d", _qnn_interface->get_backend_id(), - QNN_GET_ERROR_CODE(error)); - } - _qnn_profile_handle = nullptr; - } - - if (nullptr != _qnn_device_handle) { - error = _qnn_interface->qnn_device_free(_qnn_device_handle); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN device_handle: ID %u, error %d", _qnn_interface->get_backend_id(), - QNN_GET_ERROR_CODE(error)); - } - _qnn_device_handle = nullptr; - } - - if (nullptr != _qnn_backend_handle) { - error = _qnn_interface->qnn_backend_free(_qnn_backend_handle); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN backend_handle: ID %u, error %d", _qnn_interface->get_backend_id(), - QNN_GET_ERROR_CODE(error)); - } - _qnn_backend_handle = nullptr; - } - - if (nullptr != _qnn_log_handle) { - error = _qnn_interface->qnn_log_free(_qnn_log_handle); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN log_handle: ID %u, error %d", _qnn_interface->get_backend_id(), - QNN_GET_ERROR_CODE(error)); - } - _qnn_log_handle = nullptr; - } - - unload_backend(); - - _qnn_sys_interface.reset(); - - return ret_status; - } + int qnn_init(const QnnSaver_Config_t **saver_config); + int qnn_finalize(); std::shared_ptr get_qnn_interface() { if (!_qnn_interface) { @@ -477,7 +202,7 @@ class qnn_instance { int init_htp_perfinfra() { QnnDevice_Infrastructure_t device_infra = nullptr; - int error = _qnn_interface->qnn_device_get_infrastructure(&device_infra); + auto error = _qnn_interface->qnn_device_get_infrastructure(&device_infra); if (error != QNN_SUCCESS) { QNN_LOG_WARN("failed to get qnn device infra"); return 1; @@ -578,8 +303,6 @@ class qnn_instance { bool is_rpcmem_initialized() { return _rpcmem_initialized; } - void set_rpcmem_initialized(bool initialized) { _rpcmem_initialized = initialized; } - size_t get_rpcmem_capacity() { return _rpcmem_capacity; } void *alloc_rpcmem(size_t bytes, size_t alignment) { @@ -665,7 +388,7 @@ class qnn_instance { } void unregister_rpcmem(Qnn_MemHandle_t mem_handle) { - Qnn_ErrorHandle_t error = _qnn_interface->qnn_mem_de_register(&mem_handle, 1); + auto error = _qnn_interface->qnn_mem_de_register(&mem_handle, 1); if (error != QNN_SUCCESS) { QNN_LOG_WARN("failed to unregister shared memory, error %d", QNN_GET_ERROR_CODE(error)); } @@ -686,163 +409,15 @@ class qnn_instance { const qnn::qcom_socinfo &get_soc_info() { return _soc_info; } private: - int load_system() { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - - std::string system_lib_path = _lib_path + "libQnnSystem.so"; - QNN_LOG_DEBUG("system_lib_path:%s", system_lib_path.c_str()); - - auto system_lib_handle = dl_load(system_lib_path); - if (!system_lib_handle) { - QNN_LOG_WARN("can not load QNN library %s, error: %s", system_lib_path.c_str(), dl_error()); - return 1; - } - - auto *get_providers = dl_sym_typed( - system_lib_handle, "QnnSystemInterface_getProviders"); - if (!get_providers) { - QNN_LOG_WARN("can not load QNN symbol 
QnnSystemInterface_getProviders: %s", dl_error()); - return 2; - } - - uint32_t num_providers = 0; - const QnnSystemInterface_t **provider_list = nullptr; - error = get_providers(&provider_list, &num_providers); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to get providers, error %d", QNN_GET_ERROR_CODE(error)); - return 3; - } - - if (num_providers != _required_num_providers) { - QNN_LOG_WARN("providers is %d instead of required %d", num_providers, _required_num_providers); - return 4; - } - - if (!provider_list) { - QNN_LOG_WARN("can not get providers"); - return 5; - } - - QNN_SYSTEM_INTERFACE_VER_TYPE qnn_system_interface; - bool found_valid_system_interface = false; - for (size_t idx = 0; idx < num_providers; idx++) { - if (QNN_SYSTEM_API_VERSION_MAJOR == provider_list[idx]->systemApiVersion.major && - QNN_SYSTEM_API_VERSION_MINOR <= provider_list[idx]->systemApiVersion.minor) { - found_valid_system_interface = true; - qnn_system_interface = provider_list[idx]->QNN_SYSTEM_INTERFACE_VER_NAME; - break; - } - } - if (!found_valid_system_interface) { - QNN_LOG_WARN("unable to find a valid qnn system interface"); - return 6; - } else { - QNN_LOG_DEBUG("find a valid qnn system interface"); - } - - auto qnn_sys_interface = std::make_shared(*provider_list[0], system_lib_handle); - if (!qnn_sys_interface->is_valid()) { - QNN_LOG_WARN("failed to create QNN system interface"); - return 7; - } - - _qnn_sys_interface = qnn_sys_interface; - return 0; - } - - int load_backend(std::string &lib_path, const QnnSaver_Config_t ** /*saver_config*/) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - QNN_LOG_DEBUG("lib_path:%s", lib_path.c_str()); - - auto lib_handle = dl_load(lib_path.c_str()); - if (!lib_handle) { - QNN_LOG_WARN("can not open QNN library %s, with error: %s", lib_path.c_str(), dl_error()); - return 1; - } - - auto get_providers = - qnn::dl_sym_typed(lib_handle, "QnnInterface_getProviders"); - if (!get_providers) { - QNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s", dl_error()); - return 2; - } - - std::uint32_t num_providers = 0; - const QnnInterface_t **provider_list = nullptr; - error = get_providers(&provider_list, &num_providers); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to get providers, error %d", QNN_GET_ERROR_CODE(error)); - return 3; - } - QNN_LOG_DEBUG("num_providers=%d", num_providers); - if (num_providers != _required_num_providers) { - QNN_LOG_WARN("providers is %d instead of required %d", num_providers, _required_num_providers); - return 4; - } - - if (!provider_list) { - QNN_LOG_WARN("failed to get qnn interface providers"); - return 5; - } - bool found_valid_interface = false; - QNN_INTERFACE_VER_TYPE qnn_interface; - for (size_t idx = 0; idx < num_providers; idx++) { - if (QNN_API_VERSION_MAJOR == provider_list[idx]->apiVersion.coreApiVersion.major && - QNN_API_VERSION_MINOR <= provider_list[idx]->apiVersion.coreApiVersion.minor) { - found_valid_interface = true; - qnn_interface = provider_list[idx]->QNN_INTERFACE_VER_NAME; - break; - } - } - - if (!found_valid_interface) { - QNN_LOG_WARN("unable to find a valid qnn interface"); - return 6; - } else { - QNN_LOG_DEBUG("find a valid qnn interface"); - } - - BackendIdType backend_id = provider_list[0]->backendId; - _lib_path_to_backend_id[lib_path] = backend_id; - if (_loaded_backend.count(backend_id) > 0) { - QNN_LOG_WARN("lib_path %s is loaded, but backend %d already exists", lib_path.c_str(), backend_id); - } - _loaded_backend[backend_id] = provider_list[0]; - if 
(_loaded_lib_handle.count(backend_id) > 0) { - QNN_LOG_WARN("closing %p", _loaded_lib_handle[backend_id]); - int dlclose_error = dl_unload(_loaded_lib_handle[backend_id]); - if (dlclose_error != 0) { - QNN_LOG_WARN("fail to close %p with error %s", _loaded_lib_handle[backend_id], dl_error()); - } - } - _loaded_lib_handle[backend_id] = lib_handle; - _backend_id = backend_id; - - return 0; - } - - int unload_backend() { - int dlclose_error = 0; - for (auto &it : _loaded_lib_handle) { - dlclose_error = dl_unload(it.second); - if (dlclose_error != 0) { - QNN_LOG_WARN("failed to close QNN backend %d, error %s", it.first, dl_error()); - } - } - - _loaded_lib_handle.clear(); - _lib_path_to_backend_id.clear(); - _loaded_backend.clear(); - - return 0; - } + int load_system(); + int load_backend(std::string &lib_path, const QnnSaver_Config_t ** /*saver_config*/); + int unload_backend(); private: static constexpr const int _required_num_providers = 1; - std::string _lib_path; - std::string _backend_name; - std::string _model_name; // Qualcomm's dedicated prebuilt model name, keep it for further usage + std::string _additional_lib_load_path; + std::string _backend_lib_name; BackendIdType _backend_id; QnnLog_Level_t _qnn_log_level = QNN_LOG_LEVEL_DEBUG; @@ -874,17 +449,17 @@ class qnn_instance { std::unordered_map _qnn_rpc_buffer_to_handles; std::mutex _init_mutex; - std::unordered_map _loaded_lib_handle; + std::unordered_map _loaded_lib_handle; std::unordered_map _lib_path_to_backend_id; std::unordered_map _loaded_backend; dl_handler_t _rpc_lib_handle = nullptr; std::atomic_bool _rpcmem_initialized{false}; - qnn::pfn_rpc_mem_alloc _pfn_rpc_mem_alloc; - qnn::pfn_rpc_mem_free _pfn_rpc_mem_free; - qnn::pfn_rpc_mem_to_fd _pfn_rpc_mem_to_fd; - qnn::pfn_rpc_mem_init _pfn_rpc_mem_init; - qnn::pfn_rpc_mem_deinit _pfn_rpc_mem_deinit; + qnn::pfn_rpc_mem_alloc _pfn_rpc_mem_alloc = nullptr; + qnn::pfn_rpc_mem_free _pfn_rpc_mem_free = nullptr; + qnn::pfn_rpc_mem_to_fd _pfn_rpc_mem_to_fd = nullptr; + qnn::pfn_rpc_mem_init _pfn_rpc_mem_init = nullptr; + qnn::pfn_rpc_mem_deinit _pfn_rpc_mem_deinit = nullptr; std::unordered_map _rpcmem_store_map; size_t _rpcmem_capacity = 512; diff --git a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp index 9720e682c81d2..423c3ba7fa8c1 100644 --- a/ggml/src/ggml-qnn/tensor.hpp +++ b/ggml/src/ggml-qnn/tensor.hpp @@ -59,7 +59,7 @@ class ggml_qnn_tensor : public std::enable_shared_from_this { return true; } - can_unbind = false; + _can_unbind = false; return false; } @@ -68,7 +68,7 @@ class ggml_qnn_tensor : public std::enable_shared_from_this { return true; } - can_unbind = false; + _can_unbind = false; return false; } @@ -93,7 +93,7 @@ class ggml_qnn_tensor : public std::enable_shared_from_this { } bool bind_ggml_tensor(ggml_tensor *tensor) { - if (!can_unbind) { + if (!_can_unbind) { QNN_LOG_DEBUG("[%s]already has buffer storage, skip bind", _tensor_name.c_str()); return true; } @@ -137,7 +137,7 @@ class ggml_qnn_tensor : public std::enable_shared_from_this { return false; } - if (!can_unbind) { + if (!_can_unbind) { QNN_LOG_DEBUG("[%s]already has buffer storage, stop unbind", _tensor_name.c_str()); return true; } @@ -294,11 +294,14 @@ class ggml_qnn_tensor : public std::enable_shared_from_this { new_tensor_type); } - bool should_use_mem_handle() const { return false; } + bool should_use_mem_handle() const { + // TODO: figure out how to set rpc mem to multiple tensor + return false; + } std::string _tensor_name; qnn_buffer_ptr _buffer; - bool can_unbind = true; + bool 
_can_unbind = true; QNNBackend _device; std::shared_ptr _qnn_instance; Qnn_Tensor_t _qnn_tensor = qnn_tensor_init(kDefaultQnnTensorVersion); diff --git a/ggml/src/ggml-qnn/utils.cpp b/ggml/src/ggml-qnn/utils.cpp index 6e77ee5f5f287..e9aa4d37374a6 100644 --- a/ggml/src/ggml-qnn/utils.cpp +++ b/ggml/src/ggml-qnn/utils.cpp @@ -1,8 +1,6 @@ #include "utils.hpp" -#include - #include #include "ggml-qnn.h" @@ -10,11 +8,23 @@ #include "QnnGraph.h" #include "qnn-types.hpp" -#ifdef __linux__ +#ifdef _WIN32 +#include +#else #include #include #endif +namespace { + +template +_Ty align_to_generic(size_t alignment, _Ty offset) { + return offset % alignment == 0 ? offset + : offset + (static_cast<_Ty>(alignment) - (offset % static_cast<_Ty>(alignment))); +} + +} // namespace + namespace qnn { qnn_dimension_array_t get_internal_dimension(const ggml_dimension_array_t &dims, uint32_t rank) { @@ -33,7 +43,7 @@ qnn_dimension_array_t get_internal_dimension(const ggml_dimension_array_t &dims, * The ggml tensor will have dimensions [3, 2], while the qnn tensor will have dimensions [2, 3]. */ for (uint32_t i = 0; i < rank; i++) { - internal_dims[i] = std::max(dims[rank - 1 - i], 1); + internal_dims[i] = std::max((uint32_t)dims[rank - 1 - i], 1); } return internal_dims; @@ -219,37 +229,41 @@ const char *get_htparch_desc(size_t htp_arch) { } } -intptr_t align_to(size_t alignment, intptr_t offset) { - return offset % alignment == 0 - ? offset - : offset + (static_cast(alignment) - (offset % static_cast(alignment))); -} +intptr_t align_to(size_t alignment, intptr_t offset) { return align_to_generic(alignment, offset); } -uint32_t get_ggml_tensor_data_size(const ggml_tensor *tensor) { return ggml_nbytes(tensor); } +uint32_t get_ggml_tensor_data_size(const ggml_tensor *tensor) { return (uint32_t)ggml_nbytes(tensor); } -void *page_align_alloc(size_t size) { - // TODO: fix this for other platforms - const size_t alignment = sysconf(_SC_PAGESIZE); - return align_alloc(alignment, size); +#ifdef _WIN32 +static void *_align_alloc(size_t alignment, size_t size) { return _aligned_malloc(size, alignment); } + +static size_t _get_page_size() { + SYSTEM_INFO si; + GetSystemInfo(&si); + return si.dwPageSize; } -void *align_alloc(size_t alignment, size_t size) { - size_t size_aligned = size; - if ((size_aligned % alignment) != 0) { - size_aligned += (alignment - (size_aligned % alignment)); - } +void align_free(void *ptr) { _aligned_free(ptr); } +#else +static void *_align_alloc(size_t alignment, size_t size) { return std::aligned_alloc(alignment, size); } - void *data = std::aligned_alloc(alignment, size_aligned); +static size_t _get_page_size() { return sysconf(_SC_PAGESIZE); } + +void align_free(void *ptr) { std::free(ptr); } +#endif + +void *page_align_alloc(size_t size) { + const size_t alignment = _get_page_size(); + size_t size_aligned = align_to_generic(alignment, size); + QNN_LOG_DEBUG("_align_alloc success, alignment: %ld, size: %ld, size_aligned: %ld", alignment, size, size_aligned); + void *data = _align_alloc(alignment, size_aligned); if (!data) { - QNN_LOG_WARN("aligned_alloc failed\n"); + QNN_LOG_WARN("_align_alloc failed, alignment: %ld, size: %ld, size_aligned: %ld", alignment, size, size_aligned); return nullptr; } return data; } -void align_free(void *ptr) { std::free(ptr); } - // ================================================================================================= // // QNN backend internal helper functions @@ -359,7 +373,29 @@ const char *get_qnn_error_string(Qnn_ErrorHandle_t error) { } } -#ifdef 
__linux__ +#ifdef _WIN32 + +size_t get_system_total_memory_in_bytes() { + MEMORYSTATUSEX mem = {}; + mem.dwLength = sizeof(mem); + if (GlobalMemoryStatusEx(&mem)) { + return mem.ullTotalPhys; + } + + return 0; +} + +size_t get_system_free_memory_in_bytes() { + MEMORYSTATUSEX mem = {}; + mem.dwLength = sizeof(mem); + if (GlobalMemoryStatusEx(&mem)) { + return mem.ullAvailPhys; + } + + return 0; +} + +#else size_t get_system_total_memory_in_bytes() { struct sysinfo info = {}; diff --git a/ggml/src/ggml-qnn/utils.hpp b/ggml/src/ggml-qnn/utils.hpp index 1ec0af4c96f77..cdff53e77314d 100644 --- a/ggml/src/ggml-qnn/utils.hpp +++ b/ggml/src/ggml-qnn/utils.hpp @@ -1,12 +1,8 @@ #pragma once -#include -#include -#include -#include -#include - #include +#include +#include #include #include "ggml.h" @@ -36,7 +32,6 @@ intptr_t align_to(size_t alignment, intptr_t offset); uint32_t get_ggml_tensor_data_size(const ggml_tensor *tensor); void *page_align_alloc(size_t size); -void *align_alloc(size_t alignment, size_t size); void align_free(void *ptr); const char *opname_from_ggmlop(enum ggml_op ggmlop); From ff033e1e23d91f332d5ef3ec29fcdfaa9c8a6051 Mon Sep 17 00:00:00 2001 From: nullname Date: Tue, 25 Feb 2025 19:46:48 +0800 Subject: [PATCH 141/143] opt mulmat base on official doc (#25) https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md --- ggml/src/ggml-qnn/op-config-impl.cpp | 44 +++++++--------------------- 1 file changed, 11 insertions(+), 33 deletions(-) diff --git a/ggml/src/ggml-qnn/op-config-impl.cpp b/ggml/src/ggml-qnn/op-config-impl.cpp index 934dbadfdcaf8..1b05b3581a419 100644 --- a/ggml/src/ggml-qnn/op-config-impl.cpp +++ b/ggml/src/ggml-qnn/op-config-impl.cpp @@ -385,36 +385,26 @@ bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_Grap * [5, 4], * ]) * # Perform matrix multiplication - * result = torch.matmul(A, B.T) - * print(result.T) + * C = torch.matmul(A, B.T) + * print(C.T) * ``` * Here, the B.T is the transpose of B. + * So C.T = A * B.T which is equivalent to C = B * A.T. 
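 + * (This follows from the transpose identity (X*Y).T = Y.T * X.T, so C = (A * B.T).T = B * A.T.)
 + * In practice that means we can swap the two ggml sources and ask QNN MatMul to transpose its
 + * second input (QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN1) to produce the ggml dst directly, instead of
 + * running MatMul and then an extra Transpose node on the output.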
+ * See: https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md * * So here we need to create graph like: * ```mermaid * graph TD; - * i1>ggml_tensor_in0] --src0--> mat_mul0; - * i2>ggml_tensor_in1] --src1--> mat_mul0; - * mat_mul0 --dst_trans--> transpose_out; - * transpose1 --dst0--> o1>ggml_tensor_out]; + * i1>ggml_tensor_in0] --src1--> mat_mul0; + * i2>ggml_tensor_in1] --src0.T--> mat_mul0; + * mat_mul0 --dst0--> o1>ggml_tensor_out]; * ``` */ // create src0_trans tensor - auto src1 = tensor_inputs.back(); static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS does not match the expected value"); - - qnn_dimension_array_t dimensions = get_transposed_dimensions(src1->get_dimensions(), rank); - - // create dst_trans tensor - auto dst = tensor_outputs.front(); - dimensions = get_transposed_dimensions(dst->get_dimensions(), rank); - auto dst_trans = std::make_shared(ggml_qnn_tensor::INTERMEDIATE, "dst_trans", dimensions, - dst->get_data_type(), rank, device, graph_handle, _qnn_instance); - - // create transpose_out - auto transpose_out = std::make_shared(_name + "_trans1", QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_TRANSPOSE, _qnn_instance); + GGML_ASSERT(tensor_inputs.size() == 2); + GGML_ASSERT(tensor_outputs.size() == 1); // create mat_mul auto mat_mul = @@ -425,24 +415,12 @@ bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_Grap scalar.bool8Value = 1; mat_mul->add_scalar_param(QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN1, scalar); - // set transpose_out parameters - auto *params_data = reinterpret_cast(kTransposeParamData[rank - 1].data()); - const qnn_dimension_array_t param_dims = {(uint32_t)rank, 1, 1, 1}; - transpose_out->add_tensor_param(QNN_OP_TRANSPOSE_PARAM_PERM, param_dims, 1, params_data, QNN_DATATYPE_UINT_32, - device, graph_handle); - // set tensor to mat_mul + std::swap(tensor_inputs[0], tensor_inputs[1]); mat_mul->set_input_tensors(tensor_inputs); - qnn_tensor_array_t tensors = {dst_trans}; - mat_mul->set_output_tensors(tensors); - - // set tensor to transpose_out - tensors = {dst_trans}; - transpose_out->set_input_tensors(tensors); - transpose_out->set_output_tensors(tensor_outputs); + mat_mul->set_output_tensors(tensor_outputs); _operations.push_back(mat_mul); - _operations.push_back(transpose_out); return true; } From c8676412228f932b087775d69f82312ee370be13 Mon Sep 17 00:00:00 2001 From: nullname Date: Thu, 27 Feb 2025 23:16:08 +0800 Subject: [PATCH 142/143] feat: fix some TODO item in upstream PR #26 (#27) * fix warning * wip * add todo for graph key generate * rename some file to meet upstream guideline * remove local .clang-format * expend supported/unsupported counter to all ops * append device name to log * port to ggml logger * fix warning after adapt to ggml logger * append \n to all log * use case op instead of convert * Revert "use case op instead of convert" This reverts commit e662fc2dfee41719aaf7bc9d75e03e8d0f7ded0f. 
* fix op that needs same shape * opt kQnnOpsTable * refresh params name field when getting op config * opt npu log print * remove unused functions --- ggml/src/ggml-qnn/.clang-format | 65 ---- ggml/src/ggml-qnn/backend-ops.cpp | 506 +++++++++++++-------------- ggml/src/ggml-qnn/backend-ops.hpp | 9 +- ggml/src/ggml-qnn/backend.hpp | 30 +- ggml/src/ggml-qnn/buffer.hpp | 65 ++-- ggml/src/ggml-qnn/dl-loader.hpp | 76 ++++ ggml/src/ggml-qnn/dl_loader.hpp | 71 ---- ggml/src/ggml-qnn/ggml-qnn.cpp | 221 ++++++------ ggml/src/ggml-qnn/graph.cpp | 235 +++++-------- ggml/src/ggml-qnn/graph.hpp | 44 ++- ggml/src/ggml-qnn/logger.cpp | 63 +--- ggml/src/ggml-qnn/logger.hpp | 45 +-- ggml/src/ggml-qnn/op-config-base.hpp | 23 +- ggml/src/ggml-qnn/op-config-caps.cpp | 344 +++++++++--------- ggml/src/ggml-qnn/op-config-impl.cpp | 197 +++++------ ggml/src/ggml-qnn/op-config-impl.hpp | 136 +++---- ggml/src/ggml-qnn/op-config.hpp | 14 +- ggml/src/ggml-qnn/qnn-lib.cpp | 278 +++++++-------- ggml/src/ggml-qnn/qnn-lib.hpp | 231 ++++++------ ggml/src/ggml-qnn/qnn-types.hpp | 52 +-- ggml/src/ggml-qnn/tensor.hpp | 206 +++++------ ggml/src/ggml-qnn/utils.cpp | 104 +++--- ggml/src/ggml-qnn/utils.hpp | 165 ++++----- 23 files changed, 1508 insertions(+), 1672 deletions(-) delete mode 100644 ggml/src/ggml-qnn/.clang-format create mode 100644 ggml/src/ggml-qnn/dl-loader.hpp delete mode 100644 ggml/src/ggml-qnn/dl_loader.hpp diff --git a/ggml/src/ggml-qnn/.clang-format b/ggml/src/ggml-qnn/.clang-format deleted file mode 100644 index 0c67c54239623..0000000000000 --- a/ggml/src/ggml-qnn/.clang-format +++ /dev/null @@ -1,65 +0,0 @@ ---- -BasedOnStyle: Google -IndentWidth: 4 -AccessModifierOffset: -4 -AlignAfterOpenBracket: Align -AlignConsecutiveMacros: false -AlignConsecutiveAssignments: false -AlignConsecutiveDeclarations: false -AlignEscapedNewlines: Left -AlignOperands: true -AlignTrailingComments: true -AllowAllArgumentsOnNextLine: true -AllowAllConstructorInitializersOnNextLine: true -AllowAllParametersOfDeclarationOnNextLine: true -AllowShortBlocksOnASingleLine: Never -AllowShortCaseLabelsOnASingleLine: false -AllowShortFunctionsOnASingleLine: All -AllowShortLambdasOnASingleLine: All -AllowShortIfStatementsOnASingleLine: WithoutElse -AllowShortLoopsOnASingleLine: true -AlwaysBreakAfterDefinitionReturnType: None -AlwaysBreakAfterReturnType: None -AlwaysBreakBeforeMultilineStrings: true -AlwaysBreakTemplateDeclarations: Yes -BinPackArguments: true -BinPackParameters: true -BraceWrapping: - AfterCaseLabel: false - AfterClass: false - AfterControlStatement: false - AfterEnum: false - AfterFunction: false - AfterNamespace: false - AfterObjCDeclaration: false - AfterStruct: false - AfterUnion: false - AfterExternBlock: false - BeforeCatch: false - BeforeElse: false - IndentBraces: false - SplitEmptyFunction: true - SplitEmptyRecord: true - SplitEmptyNamespace: true -ColumnLimit: 120 -ConstructorInitializerIndentWidth: 4 -ContinuationIndentWidth: 4 -Cpp11BracedListStyle: true -DerivePointerAlignment: false -IncludeCategories: - - Regex: '^<.*\.h>' - Priority: 1 - - Regex: '^<.*' - Priority: 2 - - Regex: '^"ggml\.h"' - Priority: 3 - - Regex: '^"ggml-.+\.h"' - Priority: 4 - - Regex: '.*' - Priority: 5 -KeepEmptyLinesAtTheStartOfBlocks: true -MaxEmptyLinesToKeep: 1 -PointerAlignment: Right -SortIncludes: true -SpacesBeforeTrailingComments: 1 -UseTab: Never diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index f62fc60d5c055..3a401dd037b97 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ 
b/ggml/src/ggml-qnn/backend-ops.cpp @@ -4,7 +4,6 @@ #include #include "ggml-impl.h" - #include "graph.hpp" #include "logger.hpp" #include "op-config.hpp" @@ -13,15 +12,15 @@ namespace { -bool qnn_is_op_valid(ggml_backend_qnn_device_context *ctx, const ggml_tensor *dst) { +bool qnn_is_op_valid(ggml_backend_qnn_device_context * ctx, const ggml_tensor * dst) { if (!ctx || !dst) { - QNN_LOG_WARN("invalid params"); + QNN_LOG_WARN("invalid params\n"); return false; } auto instance = ctx->instance; if (!instance) { - QNN_LOG_WARN("invalid instance"); + QNN_LOG_WARN("invalid instance\n"); return false; } @@ -32,7 +31,7 @@ bool qnn_is_op_valid(ggml_backend_qnn_device_context *ctx, const ggml_tensor *ds case 2: return dst->src[0] && dst->src[1]; default: - QNN_LOG_WARN("invalid op param count %d", (int)param_count); + QNN_LOG_WARN("invalid op param count %d\n", (int) param_count); break; } @@ -40,60 +39,51 @@ bool qnn_is_op_valid(ggml_backend_qnn_device_context *ctx, const ggml_tensor *ds } #ifndef NDEBUG -void print_ggml_tensor(const ggml_tensor *tensor) { - QNN_LOG_DEBUG("%s: type:%s ne: %ldx%ldx%ldx%ld, nb: %ldx%ldx%ldx%ld", tensor->name, ggml_type_name(tensor->type), - (long)tensor->ne[0], (long)tensor->ne[1], (long)tensor->ne[2], (long)tensor->ne[3], - (long)tensor->nb[0], (long)tensor->nb[1], (long)tensor->nb[2], (long)tensor->nb[3]); +void print_ggml_tensor(const ggml_tensor * tensor) { + QNN_LOG_DEBUG("%s: type:%s ne: %ldx%ldx%ldx%ld, nb: %ldx%ldx%ldx%ld\n", tensor->name, ggml_type_name(tensor->type), + (long) tensor->ne[0], (long) tensor->ne[1], (long) tensor->ne[2], (long) tensor->ne[3], + (long) tensor->nb[0], (long) tensor->nb[1], (long) tensor->nb[2], (long) tensor->nb[3]); } #endif -} // namespace +} // namespace namespace { -typedef bool (*ggml_qnn_op_t)(ggml_backend_qnn_device_context *ctx, ggml_tensor *dst); - -bool execute_graph(qnn::qnn_graph *graph, ggml_tensor *output) { - if (!graph->execute(output)) { - QNN_LOG_WARN("execute failed"); - return false; - } - - return true; -} +typedef bool (*ggml_qnn_op_t)(ggml_backend_qnn_device_context * ctx, ggml_tensor * dst); -void append_tensor_dimensions(const ggml_tensor *tensor, std::string &output) { - char buffer[256] = {}; - const auto *type_name = qnn::get_ggml_type_name(tensor->type); - int len = 0; +void append_tensor_dimensions(const ggml_tensor * tensor, std::string & output) { + char buffer[256] = {}; + const auto * type_name = qnn::get_ggml_type_name(tensor->type); + int len = 0; switch (ggml_n_dims(tensor)) { case 1: - len = snprintf(buffer, sizeof(buffer), "%ld%s", (long)tensor->ne[0], type_name); + len = snprintf(buffer, sizeof(buffer), "%ld%s", (long) tensor->ne[0], type_name); break; case 2: - len = snprintf(buffer, sizeof(buffer), "%ldx%ld%s", (long)tensor->ne[0], (long)tensor->ne[1], type_name); + len = snprintf(buffer, sizeof(buffer), "%ldx%ld%s", (long) tensor->ne[0], (long) tensor->ne[1], type_name); break; case 3: - len = snprintf(buffer, sizeof(buffer), "%ldx%ldx%ld%s", (long)tensor->ne[0], (long)tensor->ne[1], - (long)tensor->ne[2], type_name); + len = snprintf(buffer, sizeof(buffer), "%ldx%ldx%ld%s", (long) tensor->ne[0], (long) tensor->ne[1], + (long) tensor->ne[2], type_name); break; case 4: default: - len = snprintf(buffer, sizeof(buffer), "%ldx%ldx%ldx%ld%s", (long)tensor->ne[0], (long)tensor->ne[1], - (long)tensor->ne[2], (long)tensor->ne[3], type_name); + len = snprintf(buffer, sizeof(buffer), "%ldx%ldx%ldx%ld%s", (long) tensor->ne[0], (long) tensor->ne[1], + (long) tensor->ne[2], (long) tensor->ne[3], 
type_name); break; } - GGML_ASSERT(len > 0 && len < (int)sizeof(buffer)); + GGML_ASSERT(len > 0 && len < (int) sizeof(buffer)); output.append(buffer, len); } -void get_graph_key_from_op(const ggml_tensor *op, std::string &output) { +void get_graph_key_from_op(const ggml_tensor * op, std::string & output) { GGML_ASSERT(op->op != GGML_OP_NONE); output += ggml_op_desc(op); output += qnn::get_ggml_type_name(op->type); const auto param_count = qnn::get_qnn_op_input_param_count(op); for (size_t i = 0; i < param_count; ++i) { - auto *input = op->src[i]; + auto * input = op->src[i]; if (!input) { break; } @@ -103,7 +93,7 @@ void get_graph_key_from_op(const ggml_tensor *op, std::string &output) { } } -void get_op_key_with_src_op_desc(const ggml_tensor *op, std::string &output) { +void get_op_key_with_src_op_desc(const ggml_tensor * op, std::string & output) { output += ggml_op_desc(op); output += '('; if (op->src[0]) { @@ -116,25 +106,37 @@ void get_op_key_with_src_op_desc(const ggml_tensor *op, std::string &output) { output += ')'; } -void get_graph_key_from_cgraph(const ggml_cgraph *cgraph, std::string &output) { - // generate key from the graph, the key is used to cache the graph, like: - // "MUL_MATf32_256x16x10f32_256x1x10f32#LOG#ADD#ADDf32_16x1x10f32" +/** + * @brief Generates a unique key for a given computation graph (cgraph). + * + * This key is used to cache the graph, enabling efficient reuse of previously + * compiled graphs. The key is constructed by concatenating the descriptions + * of the operations and their associated tensor dimensions within the graph. + * + * Example key format: "MUL_MATf32_256x16x10f32_256x1x10f32#LOG#ADD#ADDf32_16x1x10f32" + * + * @param cgraph The computation graph for which the key is generated. + * @param output The string where the generated key will be stored. + * + * TODO: Improve the key generation logic to handle more complex graph structures and edge cases. 
+ */ +void get_graph_key_from_cgraph(const ggml_cgraph * cgraph, std::string & output) { if (cgraph->n_nodes == 0) { - QNN_LOG_DEBUG("empty cgraph"); + QNN_LOG_DEBUG("empty cgraph\n"); return; } { bool is_start = true; for (int i = 0; i < cgraph->n_nodes; ++i) { - auto *op = cgraph->nodes[i]; + auto * op = cgraph->nodes[i]; if (ggml_is_empty(op)) { - QNN_LOG_DEBUG("empty op in graph, skipping"); + QNN_LOG_DEBUG("empty op in graph, skipping\n"); continue; } if (op->op == GGML_OP_NONE) { - QNN_LOG_DEBUG("GGML_OP_NONE in graph, skipping"); + QNN_LOG_DEBUG("GGML_OP_NONE in graph, skipping\n"); continue; } @@ -149,55 +151,27 @@ void get_graph_key_from_cgraph(const ggml_cgraph *cgraph, std::string &output) { } if (cgraph->n_nodes > 1) { - auto *last_op = cgraph->nodes[cgraph->n_nodes - 1]; + auto * last_op = cgraph->nodes[cgraph->n_nodes - 1]; output += qnn::get_ggml_type_name(last_op->type); output += '_'; append_tensor_dimensions(last_op, output); } } -qnn::qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_device_context *ctx, ggml_tensor *output) { - auto &graph_cache = ctx->qnn_graph_cache; - std::string graph_key; - get_graph_key_from_op(output, graph_key); - auto it = graph_cache.find(graph_key); - qnn::qnn_graph *graph_ptr = nullptr; - if (it != graph_cache.end()) { - QNN_LOG_DEBUG("[%s]found graph %s in cache", qnn::get_backend_name(ctx->device), graph_key.c_str()); - graph_ptr = it->second.get(); - } else { - auto graph = - std::make_unique(graph_key, ctx->device, ctx->instance, ctx->socinfo.vtcm_size_in_mb); - if (!graph->is_valid()) { - return nullptr; - } - - if (!graph->build_graph_from_op(output)) { - QNN_LOG_ERROR("[%s]build_graph_from_op failed", qnn::get_backend_name(ctx->device)); - return nullptr; - } - - graph_ptr = graph.get(); - graph_cache[graph_key] = std::move(graph); - } - - return graph_ptr; -} - -qnn::qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_device_context *ctx, const ggml_cgraph *cgraph) { - auto &graph_cache = ctx->qnn_graph_cache; +qnn::qnn_graph * get_qnn_graph_from_cache(ggml_backend_qnn_device_context * ctx, const ggml_cgraph * cgraph) { + auto & graph_cache = ctx->qnn_graph_cache; std::string graph_key; get_graph_key_from_cgraph(cgraph, graph_key); if (graph_key.empty()) { - QNN_LOG_DEBUG("[%s]empty graph key for cgraph: %p, size: %d", qnn::get_backend_name(ctx->device), cgraph, - (int)cgraph->n_nodes); + QNN_LOG_DEBUG("[%s]empty graph key for cgraph: %p, size: %d\n", qnn::get_backend_name(ctx->device), + (const void *) cgraph, (int) cgraph->n_nodes); return nullptr; } - auto it = graph_cache.find(graph_key); - qnn::qnn_graph *graph_ptr = nullptr; + auto it = graph_cache.find(graph_key); + qnn::qnn_graph * graph_ptr = nullptr; if (it != graph_cache.end()) { - QNN_LOG_DEBUG("[%s]found graph %s in cache", qnn::get_backend_name(ctx->device), graph_key.c_str()); + QNN_LOG_DEBUG("[%s]found graph %s in cache\n", qnn::get_backend_name(ctx->device), graph_key.c_str()); graph_ptr = it->second.get(); } else { auto graph = @@ -207,180 +181,151 @@ qnn::qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_device_context *ctx, c } if (!graph->build_graph_from_ggml_graph(cgraph)) { - QNN_LOG_ERROR("[%s]build_graph_from_op failed", qnn::get_backend_name(ctx->device)); + QNN_LOG_ERROR("[%s]build_graph_from_op failed\n", qnn::get_backend_name(ctx->device)); return nullptr; } - graph_ptr = graph.get(); + graph_ptr = graph.get(); graph_cache[graph_key] = std::move(graph); } return graph_ptr; } -bool qnn_generic_op_impl(ggml_backend_qnn_device_context *ctx, 
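[editor's aside] A quick note on the caching scheme above: the key produced by get_graph_key_from_cgraph is just the concatenation of per-node op descriptors plus the final node's type and dimensions, and the cache itself is the string-keyed map held in the device context. A minimal standalone sketch of the same lookup-or-build flow, with a hypothetical stand-in for qnn::qnn_graph (not the backend's real class):

    #include <iostream>
    #include <memory>
    #include <string>
    #include <unordered_map>
    #include <utility>

    struct fake_graph {                       // stand-in for qnn::qnn_graph
        std::string key;
    };

    using graph_cache_t = std::unordered_map<std::string, std::unique_ptr<fake_graph>>;

    fake_graph * get_or_create(graph_cache_t & cache, const std::string & key) {
        auto it = cache.find(key);
        if (it != cache.end()) {
            return it->second.get();          // cache hit: reuse the finalized graph
        }
        auto   graph = std::make_unique<fake_graph>();
        graph->key   = key;                   // building/finalizing the QNN graph would happen here
        auto * ptr   = graph.get();
        cache[key]   = std::move(graph);
        return ptr;
    }

    int main() {
        graph_cache_t cache;
        // key format mirrors the example in the comment: per-node descriptors joined with '#'
        const std::string key = "MUL_MATf32_256x16x10f32_256x1x10f32#ADDf32_16x1x10f32";
        fake_graph * a = get_or_create(cache, key);
        fake_graph * b = get_or_create(cache, key);
        std::cout << (a == b) << std::endl;   // 1: the second call hits the cache
    }

Keying on shapes as well as op types matters here, presumably because a finalized QNN graph is tied to fixed tensor dimensions.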
ggml_tensor *dst) { - if (!qnn_is_op_valid(ctx, dst)) { - return false; - } - - auto *graph_ptr = get_qnn_graph_from_cache(ctx, dst); - bool succeed = graph_ptr && execute_graph(graph_ptr, dst); - -#ifndef NDEBUG - if (!succeed) { - const auto param_count = qnn::get_qnn_op_input_param_count(dst); - for (size_t i = 0; i < param_count; ++i) { - print_ggml_tensor(dst->src[i]); - } - print_ggml_tensor(dst); - } -#endif - - return succeed; -} - -bool qnn_nop_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); - return true; -} - -constexpr const ggml_qnn_op_t kQnnOpsTable[] = { - qnn_nop_impl, // GGML_OP_NONE - nullptr, // GGML_OP_DUP - qnn_generic_op_impl, // GGML_OP_ADD - nullptr, // GGML_OP_ADD1 - nullptr, // GGML_OP_ACC - qnn_generic_op_impl, // GGML_OP_SUB - qnn_generic_op_impl, // GGML_OP_MUL - qnn_generic_op_impl, // GGML_OP_DIV - nullptr, // GGML_OP_SQR - qnn_generic_op_impl, // GGML_OP_SQRT - qnn_generic_op_impl, // GGML_OP_LOG - nullptr, // GGML_OP_SIN - nullptr, // GGML_OP_COS - nullptr, // GGML_OP_SUM - nullptr, // GGML_OP_SUM_ROWS - nullptr, // GGML_OP_MEAN - nullptr, // GGML_OP_ARGMAX - nullptr, // GGML_OP_COUNT_EQUAL - nullptr, // GGML_OP_REPEAT - nullptr, // GGML_OP_REPEAT_BACK - nullptr, // GGML_OP_CONCAT - nullptr, // GGML_OP_SILU_BACK - nullptr, // GGML_OP_NORM - nullptr, // GGML_OP_RMS_NORM - nullptr, // GGML_OP_RMS_NORM_BACK - nullptr, // GGML_OP_GROUP_NORM - - qnn_generic_op_impl, // GGML_OP_MUL_MAT - nullptr, // GGML_OP_MUL_MAT_ID - nullptr, // GGML_OP_OUT_PROD - - nullptr, // GGML_OP_SCALE - nullptr, // GGML_OP_SET - nullptr, // GGML_OP_CPY - nullptr, // GGML_OP_CONT - qnn_nop_impl, // GGML_OP_RESHAPE - nullptr, // GGML_OP_VIEW - nullptr, // GGML_OP_PERMUTE - nullptr, // GGML_OP_TRANSPOSE - nullptr, // GGML_OP_GET_ROWS - nullptr, // GGML_OP_GET_ROWS_BACK - nullptr, // GGML_OP_DIAG - nullptr, // GGML_OP_DIAG_MASK_INF - nullptr, // GGML_OP_DIAG_MASK_ZERO - nullptr, // GGML_OP_SOFT_MAX - nullptr, // GGML_OP_SOFT_MAX_BACK - nullptr, // GGML_OP_ROPE - nullptr, // GGML_OP_ROPE_BACK - nullptr, // GGML_OP_CLAMP - nullptr, // GGML_OP_CONV_TRANSPOSE_1D - nullptr, // GGML_OP_IM2COL - nullptr, // GGML_OP_IM2COL_BACK - nullptr, // GGML_OP_CONV_TRANSPOSE_2D - nullptr, // GGML_OP_POOL_1D - nullptr, // GGML_OP_POOL_2D - nullptr, // GGML_OP_POOL_2D_BACK - nullptr, // GGML_OP_UPSCALE - nullptr, // GGML_OP_PAD - nullptr, // GGML_OP_PAD_REFLECT_1D - nullptr, // GGML_OP_ARANGE - nullptr, // GGML_OP_TIMESTEP_EMBEDDING - nullptr, // GGML_OP_ARGSORT - nullptr, // GGML_OP_LEAKY_RELU - - nullptr, // GGML_OP_FLASH_ATTN_EXT - nullptr, // GGML_OP_FLASH_ATTN_BACK - nullptr, // GGML_OP_SSM_CONV - nullptr, // GGML_OP_SSM_SCAN - nullptr, // GGML_OP_WIN_PART - nullptr, // GGML_OP_WIN_UNPART - nullptr, // GGML_OP_GET_REL_POS - nullptr, // GGML_OP_ADD_REL_POS - nullptr, // GGML_OP_RWKV_WKV6 - nullptr, // GGML_OP_GATED_LINEAR_ATTN - - nullptr, // GGML_OP_UNARY - - nullptr, // GGML_OP_MAP_UNARY - nullptr, // GGML_OP_MAP_BINARY - - nullptr, // GGML_OP_MAP_CUSTOM1_F32 - nullptr, // GGML_OP_MAP_CUSTOM2_F32 - nullptr, // GGML_OP_MAP_CUSTOM3_F32 - - nullptr, // GGML_OP_MAP_CUSTOM1 - nullptr, // GGML_OP_MAP_CUSTOM2 - nullptr, // GGML_OP_MAP_CUSTOM3 - - nullptr, // GGML_OP_CROSS_ENTROPY_LOSS - nullptr, // GGML_OP_CROSS_ENTROPY_LOSS_BACK - nullptr, // GGML_OP_OPT_STEP_ADAMW +// TODO: could be merge into op caps array +constexpr const bool kQnnSupportedOps[] = { + true, // GGML_OP_NONE + false, // GGML_OP_DUP + true, // GGML_OP_ADD + false, // GGML_OP_ADD1 + false, 
// GGML_OP_ACC + true, // GGML_OP_SUB + true, // GGML_OP_MUL + true, // GGML_OP_DIV + false, // GGML_OP_SQR + true, // GGML_OP_SQRT + true, // GGML_OP_LOG + false, // GGML_OP_SIN + false, // GGML_OP_COS + false, // GGML_OP_SUM + false, // GGML_OP_SUM_ROWS + false, // GGML_OP_MEAN + false, // GGML_OP_ARGMAX + false, // GGML_OP_COUNT_EQUAL + false, // GGML_OP_REPEAT + false, // GGML_OP_REPEAT_BACK + false, // GGML_OP_CONCAT + false, // GGML_OP_SILU_BACK + false, // GGML_OP_NORM + false, // GGML_OP_RMS_NORM + false, // GGML_OP_RMS_NORM_BACK + false, // GGML_OP_GROUP_NORM + + true, // GGML_OP_MUL_MAT + false, // GGML_OP_MUL_MAT_ID + false, // GGML_OP_OUT_PROD + + false, // GGML_OP_SCALE + false, // GGML_OP_SET + false, // GGML_OP_CPY + false, // GGML_OP_CONT + true, // GGML_OP_RESHAPE + false, // GGML_OP_VIEW + false, // GGML_OP_PERMUTE + false, // GGML_OP_TRANSPOSE + false, // GGML_OP_GET_ROWS + false, // GGML_OP_GET_ROWS_BACK + false, // GGML_OP_DIAG + false, // GGML_OP_DIAG_MASK_INF + false, // GGML_OP_DIAG_MASK_ZERO + false, // GGML_OP_SOFT_MAX + false, // GGML_OP_SOFT_MAX_BACK + false, // GGML_OP_ROPE + false, // GGML_OP_ROPE_BACK + false, // GGML_OP_CLAMP + false, // GGML_OP_CONV_TRANSPOSE_1D + false, // GGML_OP_IM2COL + false, // GGML_OP_IM2COL_BACK + false, // GGML_OP_CONV_TRANSPOSE_2D + false, // GGML_OP_POOL_1D + false, // GGML_OP_POOL_2D + false, // GGML_OP_POOL_2D_BACK + false, // GGML_OP_UPSCALE + false, // GGML_OP_PAD + false, // GGML_OP_PAD_REFLECT_1D + false, // GGML_OP_ARANGE + false, // GGML_OP_TIMESTEP_EMBEDDING + false, // GGML_OP_ARGSORT + false, // GGML_OP_LEAKY_RELU + + false, // GGML_OP_FLASH_ATTN_EXT + false, // GGML_OP_FLASH_ATTN_BACK + false, // GGML_OP_SSM_CONV + false, // GGML_OP_SSM_SCAN + false, // GGML_OP_WIN_PART + false, // GGML_OP_WIN_UNPART + false, // GGML_OP_GET_REL_POS + false, // GGML_OP_ADD_REL_POS + false, // GGML_OP_RWKV_WKV6 + false, // GGML_OP_GATED_LINEAR_ATTN + + false, // GGML_OP_UNARY + + false, // GGML_OP_MAP_UNARY + false, // GGML_OP_MAP_BINARY + + false, // GGML_OP_MAP_CUSTOM1_F32 + false, // GGML_OP_MAP_CUSTOM2_F32 + false, // GGML_OP_MAP_CUSTOM3_F32 + + false, // GGML_OP_MAP_CUSTOM1 + false, // GGML_OP_MAP_CUSTOM2 + false, // GGML_OP_MAP_CUSTOM3 + + false, // GGML_OP_CROSS_ENTROPY_LOSS + false, // GGML_OP_CROSS_ENTROPY_LOSS_BACK + false, // GGML_OP_OPT_STEP_ADAMW // ggml_unary_op - nullptr, // GGML_UNARY_OP_ABS - nullptr, // GGML_UNARY_OP_SGN - nullptr, // GGML_UNARY_OP_NEG - nullptr, // GGML_UNARY_OP_STEP - nullptr, // GGML_UNARY_OP_TANH - nullptr, // GGML_UNARY_OP_ELU - nullptr, // GGML_UNARY_OP_RELU - nullptr, // GGML_UNARY_OP_SIGMOID - qnn_generic_op_impl, // GGML_UNARY_OP_GELU - nullptr, // GGML_UNARY_OP_GELU_QUICK - nullptr, // GGML_UNARY_OP_SILU - nullptr, // GGML_UNARY_OP_HARDSWISH - nullptr, // GGML_UNARY_OP_HARDSIGMOID - nullptr, // GGML_UNARY_OP_EXP + false, // GGML_UNARY_OP_ABS + false, // GGML_UNARY_OP_SGN + false, // GGML_UNARY_OP_NEG + false, // GGML_UNARY_OP_STEP + false, // GGML_UNARY_OP_TANH + false, // GGML_UNARY_OP_ELU + false, // GGML_UNARY_OP_RELU + false, // GGML_UNARY_OP_SIGMOID + true, // GGML_UNARY_OP_GELU + false, // GGML_UNARY_OP_GELU_QUICK + false, // GGML_UNARY_OP_SILU + false, // GGML_UNARY_OP_HARDSWISH + false, // GGML_UNARY_OP_HARDSIGMOID + false, // GGML_UNARY_OP_EXP }; -static_assert(kQnnOpsTable[GGML_OP_NONE] == qnn_nop_impl, "GGML_OP_NONE does not match the qnn_nop_impl function"); -static_assert(kQnnOpsTable[GGML_OP_ADD] == qnn_generic_op_impl, - "GGML_OP_ADD does not match the qnn_generic_op_impl 
function"); -static_assert(kQnnOpsTable[GGML_OP_MUL] == qnn_generic_op_impl, - "GGML_OP_MUL does not match the qnn_generic_op_impl function"); -static_assert(kQnnOpsTable[GGML_OP_MUL_MAT] == qnn_generic_op_impl, - "GGML_OP_MUL_MAT does not match the qnn_generic_op_impl function"); -static_assert(kQnnOpsTable[GGML_OP_RESHAPE] == qnn_nop_impl, - "GGML_OP_RESHAPE does not match the qnn_nop_impl function"); -static_assert(kQnnOpsTable[GGML_OP_VIEW] == nullptr, "GGML_OP_VIEW is not nullptr"); -static_assert(std::size(kQnnOpsTable) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT), - "GGML_OP_COUNT does not match the size of the kQnnOpsTable table"); - -bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context *ctx, const ggml_tensor *tensor) { +static_assert(kQnnSupportedOps[GGML_OP_NONE], "GGML_OP_NONE is not true"); +static_assert(kQnnSupportedOps[GGML_OP_ADD], "GGML_OP_ADD is not true"); +static_assert(kQnnSupportedOps[GGML_OP_MUL], "GGML_OP_MUL is not true"); +static_assert(kQnnSupportedOps[GGML_OP_MUL_MAT], + "GGML_OP_MUL_MAT is not true, please check the kQnnSupportedOps table in the backend-ops.cpp file"); +static_assert(kQnnSupportedOps[GGML_OP_RESHAPE], "GGML_OP_RESHAPE is not true"); +static_assert(!kQnnSupportedOps[GGML_OP_VIEW], "GGML_OP_VIEW is not false"); +static_assert(std::size(kQnnSupportedOps) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT), + "GGML_OP_COUNT does not match the size of the kQnnSupportedOps table"); + +bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context * ctx, const ggml_tensor * tensor) { if (!tensor) { - QNN_LOG_DEBUG("tensor is nullptr"); + QNN_LOG_DEBUG("tensor is nullptr\n"); return false; } #ifndef NDEBUG if (tensor->view_src) { - auto *src_tensor = tensor->view_src; - QNN_LOG_DEBUG("[%s]tensor(%s_%dx%dx%dx%d) is a view, src: %s_%dx%dx%dx%d", qnn::get_backend_name(ctx->device), - ggml_get_name(tensor), tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], - ggml_get_name(src_tensor), src_tensor->ne[0], src_tensor->ne[1], src_tensor->ne[2], - src_tensor->ne[3]); + auto * src_tensor = tensor->view_src; + QNN_LOG_DEBUG("[%s]tensor(%s_%dx%dx%dx%d) is a view, src: %s_%dx%dx%dx%d\n", qnn::get_backend_name(ctx->device), + ggml_get_name(tensor), (int) tensor->ne[0], (int) tensor->ne[1], (int) tensor->ne[2], + (int) tensor->ne[3], ggml_get_name(src_tensor), (int) src_tensor->ne[0], (int) src_tensor->ne[1], + (int) src_tensor->ne[2], (int) src_tensor->ne[3]); } #endif @@ -390,13 +335,14 @@ bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context *ctx, const ggml_t case GGML_TYPE_Q8_0: case GGML_TYPE_Q4_0: if (!(ctx->supported_types & (uint64_t(1) << tensor->type))) { - QNN_LOG_DEBUG("[%s]unsupported data type %s, supported_types: 0x%x", qnn::get_backend_name(ctx->device), - ggml_type_name(tensor->type), ctx->supported_types); + QNN_LOG_DEBUG("[%s]unsupported data type %s, supported_types: 0x%x\n", + qnn::get_backend_name(ctx->device), ggml_type_name(tensor->type), + (unsigned int) ctx->supported_types); return false; } break; default: - QNN_LOG_DEBUG("[%s]unsupported data type %s", qnn::get_backend_name(ctx->device), + QNN_LOG_DEBUG("[%s]unsupported data type %s\n", qnn::get_backend_name(ctx->device), ggml_type_name(tensor->type)); return false; } @@ -404,7 +350,7 @@ bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context *ctx, const ggml_t return true; } -bool ggnl_qnn_supports_op_tensor(ggml_backend_qnn_device_context *ctx, const ggml_tensor *op) { +bool ggnl_qnn_supports_op_tensor(ggml_backend_qnn_device_context * ctx, const ggml_tensor * op) { if 
(op->op == GGML_OP_NONE) { return true; } @@ -423,14 +369,14 @@ bool ggnl_qnn_supports_op_tensor(ggml_backend_qnn_device_context *ctx, const ggm return true; } -bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor *op) { +bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context * ctx, const ggml_tensor * op) { constexpr const size_t kMaxNpuTensorSize = 8192L * 2048 + 8192 * 512 + 2048 * 512; - constexpr const auto get_tensor_size = [](const ggml_tensor *tensor) -> size_t { + constexpr const auto get_tensor_size = [](const ggml_tensor * tensor) -> size_t { return tensor->ne[0] * tensor->ne[1] * tensor->ne[2] * tensor->ne[3]; }; - auto *src0 = op->src[0]; - auto *src1 = op->src[1]; + auto * src0 = op->src[0]; + auto * src1 = op->src[1]; switch (ctx->device) { case QNN_BACKEND_NPU: if (src1->ne[2] != src0->ne[2] || src1->ne[3] != src0->ne[3]) { @@ -438,12 +384,10 @@ bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context *ctx, const ggm * TODO: remove the blocker here when NPU backend supports mul_mat like this: * [ne03, ne02, n, k] * [ne03 * x, ne02 * y, m, k] -> [ne03 * x, ne02 * y, m, n] */ - QNN_LOG_DEBUG("[qnn-npu][MUL_MAT]src0 and src1 dimensions are not equal, support/unsupported: %d/%d", - ctx->support_op_count.load(), ++(ctx->unsupported_op_count)); + QNN_LOG_DEBUG("[qnn-npu][MUL_MAT]src0 and src1 dimensions are not equal\n"); return false; } else if (get_tensor_size(src0) + get_tensor_size(src1) + get_tensor_size(op) >= kMaxNpuTensorSize) { - QNN_LOG_DEBUG("[qnn-npu][MUL_MAT]tensor size is too large, support/unsupported: %d/%d", - ctx->support_op_count.load(), ++(ctx->unsupported_op_count)); + QNN_LOG_DEBUG("[qnn-npu][MUL_MAT]tensor size is too large\n"); return false; } // fall through, from test here, the convert op is super slow on NPU: @@ -451,9 +395,8 @@ bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context *ctx, const ggm case QNN_BACKEND_GPU: if (src0->type != src1->type || src0->type != op->type) { // there's no convert op for GPU. 
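[editor's aside] The NPU-specific MUL_MAT restriction above boils down to an element-count budget: the three tensors together must stay under kMaxNpuTensorSize (8192*2048 + 8192*512 + 2048*512 elements) and src0/src1 must agree in the two outer dimensions. A rough standalone check along the same lines (element counts only, ignoring types and strides):

    #include <cstddef>
    #include <cstdio>

    // same budget as in the patch: 8192x2048 + 8192x512 + 2048x512 elements
    constexpr size_t kMaxNpuTensorSize = 8192L * 2048 + 8192 * 512 + 2048 * 512;

    struct dims4 { long ne[4]; };

    size_t element_count(const dims4 & t) {
        return (size_t) t.ne[0] * t.ne[1] * t.ne[2] * t.ne[3];
    }

    bool npu_matmul_fits(const dims4 & src0, const dims4 & src1, const dims4 & dst) {
        if (src1.ne[2] != src0.ne[2] || src1.ne[3] != src0.ne[3]) {
            return false;  // broadcast over the outer dims is not offloaded yet
        }
        return element_count(src0) + element_count(src1) + element_count(dst) < kMaxNpuTensorSize;
    }

    int main() {
        dims4 a = { { 4096, 4096, 1, 1 } };
        dims4 b = { { 4096,  512, 1, 1 } };
        dims4 c = { { 4096,  512, 1, 1 } };
        std::printf("%d\n", npu_matmul_fits(a, b, c));  // 1: well under the budget
    }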
- QNN_LOG_DEBUG( - "[qnn-gpu][MUL_MAT]type src0(%d), src1(%d) and op(%d) are not equal, support/unsupported: %d/%d", - src0->type, src1->type, op->type, ctx->support_op_count.load(), ++(ctx->unsupported_op_count)); + QNN_LOG_DEBUG("[qnn-gpu][MUL_MAT]type src0(%s), src1(%s) and op(%s) are not equal\n", + ggml_type_name(src0->type), ggml_type_name(src1->type), ggml_type_name(op->type)); return false; } break; @@ -462,31 +405,31 @@ bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context *ctx, const ggm } if ((src1->ne[2] % src0->ne[2]) != 0 || (src1->ne[3] % src0->ne[3]) != 0) { - QNN_LOG_DEBUG("[%s][MUL_MAT]src0 and src1 dimensions are not equal, support/unsupported: %d/%d", - qnn::get_backend_name(ctx->device), ctx->support_op_count.load(), ++(ctx->unsupported_op_count)); + QNN_LOG_DEBUG("[%s][MUL_MAT]src0 and src1 dimensions are not equal\n", qnn::get_backend_name(ctx->device)); return false; } - QNN_LOG_DEBUG("[%s][MUL_MAT]supported matmul op, support/unsupported: %d/%d", qnn::get_backend_name(ctx->device), - ++(ctx->support_op_count), ctx->unsupported_op_count.load()); + QNN_LOG_DEBUG("[%s][MUL_MAT]supported matmul op\n", qnn::get_backend_name(ctx->device)); return true; } -} // namespace +} // namespace namespace qnn { -bool device_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor *op) { +bool device_supports_op(ggml_backend_qnn_device_context * ctx, const ggml_tensor * op) { // Note that this function could be called before the device context is initialized if (op->op == GGML_OP_NONE) { return true; } - if (!kQnnOpsTable[qnn::get_qnn_op_index(op)]) { + if (!kQnnSupportedOps[qnn::get_qnn_op_index(op)]) { #ifndef NDEBUG std::string op_key; get_graph_key_from_op(op, op_key); - QNN_LOG_DEBUG("[%s]unsupported op", op_key.c_str()); + ctx->unsupported_op_count++; + QNN_LOG_DEBUG("[%s][%s]op was unsupported, support/unsupported: %d/%d\n", qnn::get_backend_name(ctx->device), + op_key.c_str(), ctx->supported_op_count.load(), ctx->unsupported_op_count.load()); #endif return false; } @@ -495,48 +438,69 @@ bool device_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor #ifndef NDEBUG std::string tensor_dims; append_tensor_dimensions(op, tensor_dims); - QNN_LOG_DEBUG("[%s]unsupported tensor(%s)", ggml_op_name(op->op), tensor_dims.c_str()); + QNN_LOG_DEBUG("[%s][%s]unsupported tensor(%s), support/unsupported: %d/%d\n", + qnn::get_backend_name(ctx->device), ggml_op_name(op->op), tensor_dims.c_str(), + ctx->supported_op_count.load(), ctx->unsupported_op_count.load()); #endif return false; } + bool is_op_supported = true; if (op->op == GGML_OP_UNARY) { const auto unary_op = ggml_get_unary_op(op); if (unary_op == GGML_UNARY_OP_GELU) { // TODO: fix this - QNN_LOG_DEBUG("[GELU]unsupported unary op GGML_UNARY_OP_GELU for NPU"); - return false; + QNN_LOG_DEBUG("[GELU]unsupported unary op GGML_UNARY_OP_GELU for NPU\n"); + is_op_supported = false; } } else { - auto *src0 = op->src[0]; - auto *src1 = op->src[1]; + auto * src0 = op->src[0]; + auto * src1 = op->src[1]; switch (op->op) { case GGML_OP_ADD: + case GGML_OP_SUB: + case GGML_OP_MUL: + case GGML_OP_DIV: if (!ggml_are_same_shape(src0, src1)) { - QNN_LOG_DEBUG("[ADD] src0 and src1 dimensions are not equal"); - return false; + QNN_LOG_DEBUG("[%s][%s] src0 and src1 dimensions are not equal\n", + qnn::get_backend_name(ctx->device), ggml_op_name(op->op)); + is_op_supported = false; } break; - case GGML_OP_MUL_MAT: - return ggml_qnn_supports_matmul_op(ctx, op); + is_op_supported = ggml_qnn_supports_matmul_op(ctx, op); 
+ break; default: - return false; + // default to supported + break; } } - return true; +#ifndef NDEBUG + if (is_op_supported) { + ctx->supported_op_count++; + QNN_LOG_DEBUG("[%s][%s]op was supported, support/unsupported: %d/%d\n", qnn::get_backend_name(ctx->device), + ggml_op_name(op->op), ctx->supported_op_count.load(), ctx->unsupported_op_count.load()); + } else { + ctx->unsupported_op_count++; + QNN_LOG_DEBUG("[%s][%s]op was unsupported, support/unsupported: %d/%d\n", qnn::get_backend_name(ctx->device), + ggml_op_name(op->op), ctx->supported_op_count.load(), ctx->unsupported_op_count.load()); + } +#endif + + return is_op_supported; } -bool device_compute_graph(ggml_backend_qnn_device_context *ctx, ggml_cgraph *cgraph) { - QNN_LOG_DEBUG("[%s]compute graph start, nodes count: %d", qnn::get_backend_name(ctx->device), (int)cgraph->n_nodes); +bool device_compute_graph(ggml_backend_qnn_device_context * ctx, ggml_cgraph * cgraph) { + QNN_LOG_DEBUG("[%s]compute graph start, nodes count: %d\n", qnn::get_backend_name(ctx->device), + (int) cgraph->n_nodes); auto qnn_graph = get_qnn_graph_from_cache(ctx, cgraph); - bool success = qnn_graph && qnn_graph->execute(cgraph); + bool success = qnn_graph && qnn_graph->execute(cgraph); - QNN_LOG_DEBUG("[%s]compute graph, success: %d", qnn::get_backend_name(ctx->device), (int)success); + QNN_LOG_DEBUG("[%s]compute graph, success: %d\n", qnn::get_backend_name(ctx->device), (int) success); return success; } -} // namespace qnn +} // namespace qnn diff --git a/ggml/src/ggml-qnn/backend-ops.hpp b/ggml/src/ggml-qnn/backend-ops.hpp index c49c4d6dc19d7..64fb10f00ddfe 100644 --- a/ggml/src/ggml-qnn/backend-ops.hpp +++ b/ggml/src/ggml-qnn/backend-ops.hpp @@ -1,12 +1,11 @@ #pragma once -#include "ggml.h" - #include "backend.hpp" +#include "ggml.h" namespace qnn { -bool device_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor *op); -bool device_compute_graph(ggml_backend_qnn_device_context *ctx, ggml_cgraph *cgraph); +bool device_supports_op(ggml_backend_qnn_device_context * ctx, const ggml_tensor * op); +bool device_compute_graph(ggml_backend_qnn_device_context * ctx, ggml_cgraph * cgraph); -} // namespace qnn +} // namespace qnn diff --git a/ggml/src/ggml-qnn/backend.hpp b/ggml/src/ggml-qnn/backend.hpp index df5e2eb08fb8f..253b0b672383d 100644 --- a/ggml/src/ggml-qnn/backend.hpp +++ b/ggml/src/ggml-qnn/backend.hpp @@ -2,7 +2,7 @@ #pragma once #ifndef NDEBUG -#include +# include #endif #include @@ -10,39 +10,41 @@ #include #include -#include "ggml.h" - #include "ggml-backend.h" #include "ggml-qnn.h" - +#include "ggml.h" #include "graph.hpp" #include "qnn-lib.hpp" namespace qnn { typedef std::unordered_map> qnn_graph_cache_t; -} // namespace qnn +} // namespace qnn struct ggml_backend_qnn_device_context { // initialize in constructor - QNNBackend device; - size_t threads; + QNNBackend device; + size_t threads; std::string name; std::string lib_name; // initialize in qnn init - qnn::qcom_socinfo socinfo = {}; - uint64_t supported_types; - std::shared_ptr instance; + qnn::qcom_socinfo socinfo = {}; + uint64_t supported_types; + std::shared_ptr instance; std::shared_ptr qnn_interface; qnn::qnn_graph_cache_t qnn_graph_cache; #ifndef NDEBUG - std::atomic_uint32_t support_op_count = 0; + std::atomic_uint32_t supported_op_count = 0; std::atomic_uint32_t unsupported_op_count = 0; #endif - explicit ggml_backend_qnn_device_context(QNNBackend device, size_t threads, const char *name, const char *lib_name, - uint64_t supported_types) - : device(device), 
threads(threads), name(name), lib_name(lib_name), supported_types(supported_types) {} + explicit ggml_backend_qnn_device_context(QNNBackend device, size_t threads, const char * name, + const char * lib_name, uint64_t supported_types) : + device(device), + threads(threads), + name(name), + lib_name(lib_name), + supported_types(supported_types) {} }; diff --git a/ggml/src/ggml-qnn/buffer.hpp b/ggml/src/ggml-qnn/buffer.hpp index ce796cbe4df08..43c4666dd15b1 100644 --- a/ggml/src/ggml-qnn/buffer.hpp +++ b/ggml/src/ggml-qnn/buffer.hpp @@ -14,7 +14,7 @@ namespace qnn { * This abstract class defines the interface for managing generic memory buffers in a QNN context. */ class qnn_buffer_interface { -public: + public: virtual ~qnn_buffer_interface() = default; /** @@ -35,7 +35,7 @@ class qnn_buffer_interface { * * @return A pointer to the buffer. */ - virtual uint8_t *get_buffer() = 0; + virtual uint8_t * get_buffer() = 0; /** * @brief Gets the buffer pointer. @@ -68,21 +68,22 @@ using qnn_buffer_ptr = std::shared_ptr; * handles cleanup of the buffer and its associated memory handle upon destruction. */ class qnn_rpc_buffer : public qnn_buffer_interface { -public: + public: qnn_rpc_buffer(std::shared_ptr qnn_instance, const size_t size, const uint32_t rank, - uint32_t *dimensions, Qnn_DataType_t data_type) - : _size(size), _qnn_instance(qnn_instance) { - - _qnn_rpc_buffer = static_cast(qnn_instance->alloc_rpcmem(size, alignof(uint8_t *))); + uint32_t * dimensions, Qnn_DataType_t data_type) : + _size(size), + _qnn_instance(qnn_instance) { + _qnn_rpc_buffer = static_cast(qnn_instance->alloc_rpcmem(size, alignof(uint8_t *))); _qnn_rpc_mem_handle = qnn_instance->register_rpcmem(_qnn_rpc_buffer, rank, dimensions, data_type); if (!_qnn_rpc_buffer || !_qnn_rpc_mem_handle) { - QNN_LOG_WARN("Failed to register RPC memory: buffer or memory handle is null"); + QNN_LOG_WARN("Failed to register RPC memory: buffer or memory handle is null\n"); // let the destructor free the buffer return; } - QNN_LOG_DEBUG("alloc rpcmem(%p) successfully, size %d", _qnn_rpc_buffer, (int)size); + QNN_LOG_DEBUG("alloc rpcmem(%p) successfully, size %d\n", (void *) _qnn_rpc_buffer, (int) size); } + ~qnn_rpc_buffer() { if (_qnn_instance) { if (_qnn_rpc_mem_handle) { @@ -97,14 +98,16 @@ class qnn_rpc_buffer : public qnn_buffer_interface { bool is_valid() const override { return _qnn_rpc_buffer && _qnn_rpc_mem_handle; } - uint8_t *get_buffer() override { return _qnn_rpc_buffer; } + uint8_t * get_buffer() override { return _qnn_rpc_buffer; } + size_t get_size() const override { return _size; } + Qnn_MemHandle_t get_mem_handle() const override { return _qnn_rpc_mem_handle; } -private: - size_t _size = 0; - uint8_t *_qnn_rpc_buffer = nullptr; - Qnn_MemHandle_t _qnn_rpc_mem_handle = nullptr; + private: + size_t _size = 0; + uint8_t * _qnn_rpc_buffer = nullptr; + Qnn_MemHandle_t _qnn_rpc_mem_handle = nullptr; std::shared_ptr _qnn_instance; DISABLE_COPY(qnn_rpc_buffer); @@ -119,12 +122,12 @@ class qnn_rpc_buffer : public qnn_buffer_interface { * a consistent interface for buffer management. 
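[editor's aside] The RPC-backed buffer above and the plain host buffer defined right after it both sit behind qnn_buffer_interface, so consuming code only ever handles a qnn_buffer_ptr and does not care whether the memory is ordinary heap memory or RPC-registered shared memory. A trimmed-down standalone illustration of that shape (stand-in types, not the backend's actual classes):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>
    #include <memory>
    #include <vector>

    // trimmed-down stand-in for qnn::qnn_buffer_interface
    struct buffer_interface {
        virtual ~buffer_interface() = default;
        virtual bool      is_valid() const = 0;
        virtual uint8_t * get_buffer()     = 0;
        virtual size_t    get_size() const = 0;
    };
    using buffer_ptr = std::shared_ptr<buffer_interface>;

    struct host_buffer : buffer_interface {
        explicit host_buffer(size_t size) : _data(size) {}
        bool      is_valid() const override { return !_data.empty(); }
        uint8_t * get_buffer()     override { return _data.data(); }
        size_t    get_size() const override { return _data.size(); }
        std::vector<uint8_t> _data;
    };

    // consumer code only sees the interface, exactly like the ggml buffer callbacks do
    void clear(buffer_ptr buf, uint8_t value) {
        if (buf && buf->is_valid()) {
            memset(buf->get_buffer(), value, buf->get_size());
        }
    }

    int main() {
        buffer_ptr buf = std::make_shared<host_buffer>(64);
        clear(buf, 0xAA);
        std::printf("%02x\n", buf->get_buffer()[0]);  // aa
    }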
*/ class qnn_mem_buffer : public qnn_buffer_interface { -public: - explicit qnn_mem_buffer(const uint8_t *data, size_t size) { + public: + explicit qnn_mem_buffer(const uint8_t * data, size_t size) { _buffer = reinterpret_cast(qnn::page_align_alloc(size)); if (!_buffer) { - QNN_LOG_WARN("failed to allocate %.2f MiB", float(size / (1 << 20))); + QNN_LOG_WARN("failed to allocate %.2f MiB\n", float(size / (1 << 20))); return; } @@ -134,49 +137,51 @@ class qnn_mem_buffer : public qnn_buffer_interface { memcpy(_buffer, data, size); } - QNN_LOG_DEBUG("alloc buffer: %p, size: %ld", _buffer, size); + QNN_LOG_DEBUG("alloc buffer: %p, size: %ld\n", (void *) _buffer, (long) size); } explicit qnn_mem_buffer(size_t size) : qnn_mem_buffer(nullptr, size) {} ~qnn_mem_buffer() { - QNN_LOG_DEBUG("free buffer: %p, size: %ld", _buffer, _size); + QNN_LOG_DEBUG("free buffer: %p, size: %ld\n", (void *) _buffer, (long) _size); // the free will do nothing if the _buffer is nullptr qnn::align_free(_buffer); } bool is_valid() const override { return _buffer != nullptr; } - uint8_t *get_buffer() override { return _buffer; } + uint8_t * get_buffer() override { return _buffer; } + size_t get_size() const override { return _size; } + Qnn_MemHandle_t get_mem_handle() const override { return nullptr; } -private: - size_t _size = 0; - uint8_t *_buffer = nullptr; + private: + size_t _size = 0; + uint8_t * _buffer = nullptr; DISABLE_COPY(qnn_mem_buffer); DISABLE_MOVE(qnn_mem_buffer); }; class qnn_mem_buffer_slice : public qnn_buffer_interface { -public: - qnn_mem_buffer_slice(const uint8_t *buffer, size_t size) : _buffer(const_cast(buffer)), _size(size) {} + public: + qnn_mem_buffer_slice(const uint8_t * buffer, size_t size) : _buffer(const_cast(buffer)), _size(size) {} bool is_valid() const override { return _buffer && _size; } - uint8_t *get_buffer() override { return _buffer; } + uint8_t * get_buffer() override { return _buffer; } size_t get_size() const override { return _size; } Qnn_MemHandle_t get_mem_handle() const override { return nullptr; } -private: - uint8_t *_buffer = nullptr; - size_t _size = 0; + private: + uint8_t * _buffer = nullptr; + size_t _size = 0; DISABLE_COPY(qnn_mem_buffer_slice); DISABLE_MOVE(qnn_mem_buffer_slice); }; -} // namespace qnn +} // namespace qnn diff --git a/ggml/src/ggml-qnn/dl-loader.hpp b/ggml/src/ggml-qnn/dl-loader.hpp new file mode 100644 index 0000000000000..e183d190ce18f --- /dev/null +++ b/ggml/src/ggml-qnn/dl-loader.hpp @@ -0,0 +1,76 @@ +#pragma once + +#ifdef __linux__ +# include +# include +#elif defined(_WIN32) +# define WIN32_LEAN_AND_MEAN +# ifndef NOMINMAX +# define NOMINMAX +# endif +# include +#endif + +#include + +namespace qnn { + +#ifdef __linux__ +typedef void * dl_handler_t; + +inline qnn::dl_handler_t dl_load(const std::string & lib_path) { + return dlopen(lib_path.c_str(), RTLD_NOW | RTLD_LOCAL); +} + +inline void * dl_sym(qnn::dl_handler_t handle, const std::string & symbol) { + return dlsym(handle, symbol.c_str()); +} + +inline bool dl_unload(qnn::dl_handler_t handle) { + return dlclose(handle) == 0; +} + +inline const char * dl_error() { + return dlerror(); +} +#elif defined(_WIN32) +using dl_handler_t = HMODULE; + +inline qnn::dl_handler_t dl_load(const std::string & lib_path) { + // suppress error dialogs for missing DLLs + auto old_mode = SetErrorMode(SEM_FAILCRITICALERRORS); + SetErrorMode(old_mode | SEM_FAILCRITICALERRORS); + + auto handle = LoadLibraryA(lib_path.c_str()); // TODO: use wstring version for unicode paths + + SetErrorMode(old_mode); + return 
handle; +} + +inline void * dl_sym(qnn::dl_handler_t handle, const std::string & symbol) { + auto old_mode = SetErrorMode(SEM_FAILCRITICALERRORS); + SetErrorMode(old_mode | SEM_FAILCRITICALERRORS); + + void * p = (void *) GetProcAddress(handle, symbol.c_str()); + + SetErrorMode(old_mode); + return p; +} + +inline bool dl_unload(qnn::dl_handler_t handle) { + FreeLibrary(handle); + return true; +} + +inline const char * dl_error() { + // TODO: implement dl_error for Windows + return nullptr; +} + +#endif + +template Fn dl_sym_typed(qnn::dl_handler_t handle, const std::string & function_name) { + return reinterpret_cast(dl_sym(handle, function_name)); +} + +} // namespace qnn diff --git a/ggml/src/ggml-qnn/dl_loader.hpp b/ggml/src/ggml-qnn/dl_loader.hpp deleted file mode 100644 index 1beec8866ba4c..0000000000000 --- a/ggml/src/ggml-qnn/dl_loader.hpp +++ /dev/null @@ -1,71 +0,0 @@ -#pragma once - -#ifdef __linux__ -#include -#include -#elif defined(_WIN32) -#define WIN32_LEAN_AND_MEAN -#ifndef NOMINMAX -#define NOMINMAX -#endif -#include -#endif - -#include - -namespace qnn { - -#ifdef __linux__ -typedef void *dl_handler_t; - -inline qnn::dl_handler_t dl_load(const std::string &lib_path) { - return dlopen(lib_path.c_str(), RTLD_NOW | RTLD_LOCAL); -} - -inline void *dl_sym(qnn::dl_handler_t handle, const std::string &symbol) { return dlsym(handle, symbol.c_str()); } - -inline bool dl_unload(qnn::dl_handler_t handle) { return dlclose(handle) == 0; } - -inline const char *dl_error() { return dlerror(); } -#elif defined(_WIN32) -using dl_handler_t = HMODULE; - -inline qnn::dl_handler_t dl_load(const std::string &lib_path) { - // suppress error dialogs for missing DLLs - auto old_mode = SetErrorMode(SEM_FAILCRITICALERRORS); - SetErrorMode(old_mode | SEM_FAILCRITICALERRORS); - - auto handle = LoadLibraryA(lib_path.c_str()); // TODO: use wstring version for unicode paths - - SetErrorMode(old_mode); - return handle; -} - -inline void *dl_sym(qnn::dl_handler_t handle, const std::string &symbol) { - auto old_mode = SetErrorMode(SEM_FAILCRITICALERRORS); - SetErrorMode(old_mode | SEM_FAILCRITICALERRORS); - - void *p = (void *)GetProcAddress(handle, symbol.c_str()); - - SetErrorMode(old_mode); - return p; -} - -inline bool dl_unload(qnn::dl_handler_t handle) { - FreeLibrary(handle); - return true; -} - -inline const char *dl_error() { - // TODO: implement dl_error for Windows - return nullptr; -} - -#endif - -template -Fn dl_sym_typed(qnn::dl_handler_t handle, const std::string &function_name) { - return reinterpret_cast(dl_sym(handle, function_name)); -} - -} // namespace qnn diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index 8150dcb9ea240..626ba2cce9520 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -6,7 +6,6 @@ #include "ggml-backend-impl.h" #include "ggml-impl.h" - #include "ggml-qnn/backend-ops.hpp" #include "ggml-qnn/backend.hpp" #include "ggml-qnn/logger.hpp" @@ -19,9 +18,9 @@ // // ================================================================================================= #ifdef NDEBUG -#define ENABLE_QNNBACKEND_PERF 0 // enable/disable op's perf info +# define ENABLE_QNNBACKEND_PERF 0 // enable/disable op's perf info #else -#define ENABLE_QNNBACKEND_PERF 1 // enable/disable op's perf info +# define ENABLE_QNNBACKEND_PERF 1 // enable/disable op's perf info #endif #define QNN_BACKEND_NAME "qnn" @@ -29,50 +28,42 @@ namespace { #ifdef _WIN32 -constexpr const char *kQnnCpuLibName = "QnnCpu.dll"; -constexpr const char 
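[editor's aside] The renamed dl-loader header keeps one tiny cross-platform surface: dl_load / dl_sym / dl_unload / dl_error plus the dl_sym_typed convenience template that casts a raw symbol to a typed function pointer. A hedged usage sketch, assuming it is built inside the backend's source tree; the entry-point name and its signature are illustrative placeholders, not taken from this patch:

    #include <cstdio>
    #include <string>
    #include "dl-loader.hpp"   // the header added above

    // hypothetical signature; check the QNN SDK headers for the real provider entry point
    typedef int (*provider_fn_t)(void ** providers, unsigned * count);

    bool probe(const std::string & lib_path, const std::string & symbol) {
        auto handle = qnn::dl_load(lib_path);
        if (!handle) {
            std::printf("dl_load failed: %s\n", qnn::dl_error() ? qnn::dl_error() : "unknown");
            return false;
        }
        auto fn = qnn::dl_sym_typed<provider_fn_t>(handle, symbol);
        bool ok = (fn != nullptr);
        qnn::dl_unload(handle);
        return ok;
    }

    int main() {
        // both strings are placeholders; the backend resolves the real library from kDeviceCaps
        std::printf("%d\n", probe("libQnnCpu.so", "QnnInterface_getProviders"));
    }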
*kQnnGpuLibName = "QnnGpu.dll"; -constexpr const char *kQnnNpuLibName = "QnnHtp.dll"; +constexpr const char * kQnnCpuLibName = "QnnCpu.dll"; +constexpr const char * kQnnGpuLibName = "QnnGpu.dll"; +constexpr const char * kQnnNpuLibName = "QnnHtp.dll"; #else -constexpr const char *kQnnCpuLibName = "libQnnCpu.so"; -constexpr const char *kQnnGpuLibName = "libQnnGpu.so"; -constexpr const char *kQnnNpuLibName = "libQnnHtp.so"; +constexpr const char * kQnnCpuLibName = "libQnnCpu.so"; +constexpr const char * kQnnGpuLibName = "libQnnGpu.so"; +constexpr const char * kQnnNpuLibName = "libQnnHtp.so"; #endif struct qnn_device_caps { - const char *name; - const char *description; - const char *lib_name; + const char * name; + const char * description; + const char * lib_name; enum ggml_backend_dev_type type; // TODO: should get this caps from device uint64_t supported_types; }; +// TODO: should move this to qnn-lib.cpp constexpr const qnn_device_caps kDeviceCaps[] = { { - // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/CpuOpDefSupplement.html#matmul - "qnn-cpu", - "Qualcomm Kryo CPU", - kQnnCpuLibName, - GGML_BACKEND_DEVICE_TYPE_CPU, - (1 << GGML_TYPE_I8) | (1 << GGML_TYPE_F32), - }, + "qnn-cpu", "Qualcomm Kryo CPU", + kQnnCpuLibName, GGML_BACKEND_DEVICE_TYPE_CPU, + (1 << GGML_TYPE_I8) | (1 << GGML_TYPE_F32), + }, // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/CpuOpDefSupplement.html#matmul { - // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/GpuOpDefSupplement.html#matmul - "qnn-gpu", - "Qualcomm Adreno GPU", - kQnnGpuLibName, - GGML_BACKEND_DEVICE_TYPE_GPU, - (1 << GGML_TYPE_F32) | (1 << GGML_TYPE_F16), - }, + "qnn-gpu", "Qualcomm Adreno GPU", + kQnnGpuLibName, GGML_BACKEND_DEVICE_TYPE_GPU, + (1 << GGML_TYPE_F32) | (1 << GGML_TYPE_F16), + }, // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/GpuOpDefSupplement.html#matmul { - // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/HtpOpDefSupplement.html#matmul - "qnn-npu", - "Qualcomm NPU", - kQnnNpuLibName, - GGML_BACKEND_DEVICE_TYPE_ACCEL, - (1 << GGML_TYPE_F32) | (1 << GGML_TYPE_F16) | (1 << GGML_TYPE_I16) | (1 << GGML_TYPE_I8), - }, + "qnn-npu", "Qualcomm NPU", + kQnnNpuLibName, GGML_BACKEND_DEVICE_TYPE_ACCEL, + (1 << GGML_TYPE_F32) | (1 << GGML_TYPE_F16) | (1 << GGML_TYPE_I16) | (1 << GGML_TYPE_I8), + }, // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/HtpOpDefSupplement.html#matmul }; static_assert(sizeof(kDeviceCaps) / sizeof(kDeviceCaps[0]) == GGML_QNN_MAX_DEVICES, @@ -85,11 +76,11 @@ static_assert(kDeviceCaps[QNN_BACKEND_GPU].type == GGML_BACKEND_DEVICE_TYPE_GPU, static_assert(kDeviceCaps[QNN_BACKEND_CPU].type == GGML_BACKEND_DEVICE_TYPE_CPU, "The NPU device should be an accelerator device"); -ggml_backend_qnn_device_context *get_device_context(ggml_backend_dev_t dev) { +ggml_backend_qnn_device_context * get_device_context(ggml_backend_dev_t dev) { return reinterpret_cast(dev->context); } -qnn::qnn_buffer_interface *get_buffer_context(ggml_backend_buffer_t buffer) { +qnn::qnn_buffer_interface * get_buffer_context(ggml_backend_buffer_t buffer) { return reinterpret_cast(buffer->context); } @@ -99,34 +90,34 @@ qnn::qnn_buffer_interface *get_buffer_context(ggml_backend_buffer_t buffer) { * ----------------------------------------------------------------------------------------------- */ void ggml_backend_qnn_buffer_free_buffer(ggml_backend_buffer_t buffer) { - auto *ctx = get_buffer_context(buffer); + auto * ctx = 
get_buffer_context(buffer); delete ctx; } -void *ggml_backend_qnn_buffer_get_base(ggml_backend_buffer_t buffer) { - auto *ctx = get_buffer_context(buffer); +void * ggml_backend_qnn_buffer_get_base(ggml_backend_buffer_t buffer) { + auto * ctx = get_buffer_context(buffer); return ctx->get_buffer(); } -void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor) { +void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { GGML_UNUSED(buffer); GGML_UNUSED(tensor); // TODO: we should create the qnn tensor along with the ggml tensor } -void ggml_backend_qnn_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor, const void *data, +void ggml_backend_qnn_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { GGML_UNUSED(buffer); - memcpy((char *)tensor->data + offset, data, size); + memcpy((char *) tensor->data + offset, data, size); } -void ggml_backend_qnn_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor *tensor, void *data, +void ggml_backend_qnn_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { GGML_UNUSED(buffer); - memcpy(data, (const char *)tensor->data + offset, size); + memcpy(data, (const char *) tensor->data + offset, size); } -bool ggml_backend_qnn_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor *src, ggml_tensor *dst) { +bool ggml_backend_qnn_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) { GGML_UNUSED(buffer); if (ggml_backend_buffer_is_host(src->buffer)) { memcpy(dst->data, src->data, ggml_nbytes(src)); @@ -137,7 +128,7 @@ bool ggml_backend_qnn_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml } void ggml_backend_qnn_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { - auto *ctx = get_buffer_context(buffer); + auto * ctx = get_buffer_context(buffer); memset(ctx->get_buffer(), value, ctx->get_size()); } @@ -158,19 +149,19 @@ constexpr const ggml_backend_buffer_i ggml_backend_qnn_buffer_interface = { * qnn backend object * ----------------------------------------------------------------------------------------------- */ -const char *ggml_backend_qnn_buffer_type_name(ggml_backend_buffer_type_t buft) { - auto *dev_ctx = get_device_context(buft->device); +const char * ggml_backend_qnn_buffer_type_name(ggml_backend_buffer_type_t buft) { + auto * dev_ctx = get_device_context(buft->device); return qnn::get_backend_name(dev_ctx->device); } ggml_backend_buffer_t ggml_backend_qnn_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { - qnn::qnn_buffer_interface *ctx = new qnn::qnn_mem_buffer(size); + qnn::qnn_buffer_interface * ctx = new qnn::qnn_mem_buffer(size); if (!ctx->is_valid()) { return nullptr; } - QNN_LOG_DEBUG("[%s]alloc buffer: %p, size: %ld", qnn::get_backend_name(get_device_context(buft->device)->device), - ctx->get_buffer(), size); + QNN_LOG_DEBUG("[%s]alloc buffer: %p, size: %ld\n", qnn::get_backend_name(get_device_context(buft->device)->device), + (void *) ctx->get_buffer(), (long) size); return ggml_backend_buffer_init(buft, ggml_backend_qnn_buffer_interface, ctx, size); } @@ -192,16 +183,16 @@ bool ggml_backend_qnn_buffer_is_host(ggml_backend_buffer_type_t buft) { return true; } -const char *ggml_backend_qnn_name(ggml_backend_t backend) { - auto *device_ctx = get_device_context(backend->device); +const char * ggml_backend_qnn_name(ggml_backend_t 
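[editor's aside] Since this buffer type is host-visible, the set/get tensor callbacks above reduce to memcpy against tensor->data plus an offset, and cpy_tensor can short-circuit whenever the source also lives in a host buffer. A toy equivalent of that data path:

    #include <cstddef>
    #include <cstdio>
    #include <cstring>
    #include <vector>

    // stand-in for a tensor whose data lives in a host-visible backend buffer
    struct fake_tensor { std::vector<float> data; };

    void set_tensor(fake_tensor & t, const void * src, size_t offset, size_t size) {
        memcpy((char *) t.data.data() + offset, src, size);          // same as the backend callback
    }

    void get_tensor(const fake_tensor & t, void * dst, size_t offset, size_t size) {
        memcpy(dst, (const char *) t.data.data() + offset, size);
    }

    int main() {
        fake_tensor t { std::vector<float>(8, 0.0f) };
        float v = 3.5f;
        set_tensor(t, &v, 2 * sizeof(float), sizeof(float));
        float out = 0.0f;
        get_tensor(t, &out, 2 * sizeof(float), sizeof(float));
        std::printf("%g\n", out);                                    // 3.5
    }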
backend) { + auto * device_ctx = get_device_context(backend->device); return device_ctx->name.c_str(); } void ggml_backend_qnn_free(ggml_backend_t backend) { - auto *device_ctx = get_device_context(backend->device); - QNN_LOG_INFO("idx %d, name:%s", device_ctx->device, device_ctx->name.c_str()); + auto * device_ctx = get_device_context(backend->device); + QNN_LOG_INFO("idx %d, name:%s\n", device_ctx->device, device_ctx->name.c_str()); - auto &instance = device_ctx->instance; + auto & instance = device_ctx->instance; if (instance) { device_ctx->qnn_graph_cache.clear(); device_ctx->qnn_interface.reset(); @@ -212,35 +203,33 @@ void ggml_backend_qnn_free(ggml_backend_t backend) { delete backend; } -bool ggml_backend_qnn_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor *src, - ggml_tensor *dst) { +bool ggml_backend_qnn_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, + ggml_tensor * dst) { GGML_UNUSED(backend_src); GGML_UNUSED(backend_dst); GGML_UNUSED(src); GGML_UNUSED(dst); - QNN_LOG_DEBUG("opy form %s to %s, src_is_qnn: %d, dst_is_qnn: %d", ggml_get_name(src), ggml_get_name(dst), - (int)ggml_backend_is_qnn(backend_src), (int)ggml_backend_is_qnn(backend_dst)); + QNN_LOG_DEBUG("opy form %s to %s, src_is_qnn: %d, dst_is_qnn: %d\n", ggml_get_name(src), ggml_get_name(dst), + (int) ggml_backend_is_qnn(backend_src), (int) ggml_backend_is_qnn(backend_dst)); return false; } ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(ggml_backend_dev_t dev) { static ggml_backend_buffer_type ggml_backend_qnn_buffer_types[GGML_QNN_MAX_DEVICES]; - auto *dev_ctx = get_device_context(dev); + auto * dev_ctx = get_device_context(dev); if (!ggml_backend_qnn_buffer_types[dev_ctx->device].device) { ggml_backend_qnn_buffer_types[dev_ctx->device] = { /* .iface = */ { - /* .get_name = */ ggml_backend_qnn_buffer_type_name, - /* .alloc_buffer = */ - ggml_backend_qnn_buffer_type_alloc_buffer, - /* .get_alignment = */ - ggml_backend_qnn_buffer_type_get_alignment, - /* .get_max_size = */ - ggml_backend_qnn_buffer_type_get_max_size, - /* .get_alloc_size = */ nullptr, // defaults to ggml_nbytes + /* .get_name = */ ggml_backend_qnn_buffer_type_name, + /* .alloc_buffer = */ + ggml_backend_qnn_buffer_type_alloc_buffer, /* .get_alignment = */ + ggml_backend_qnn_buffer_type_get_alignment, /* .get_max_size = */ + ggml_backend_qnn_buffer_type_get_max_size, /* .get_alloc_size = */ nullptr, // defaults to ggml_nbytes /* .is_host = */ ggml_backend_qnn_buffer_is_host, - }, - /* .device */ dev, + }, + /* .device */ + dev, /* .context = */ nullptr, }; } else { @@ -250,9 +239,9 @@ ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(ggml_backend_dev_t dev) return &ggml_backend_qnn_buffer_types[dev_ctx->device]; } -ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backend, ggml_cgraph *cgraph) { - return qnn::device_compute_graph(get_device_context(backend->device), cgraph) ? GGML_STATUS_SUCCESS - : GGML_STATUS_FAILED; +ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { + return qnn::device_compute_graph(get_device_context(backend->device), cgraph) ? 
GGML_STATUS_SUCCESS : + GGML_STATUS_FAILED; } constexpr const ggml_backend_i ggml_backend_qnn_interface = { @@ -276,31 +265,31 @@ constexpr const ggml_backend_i ggml_backend_qnn_interface = { * qnn backend device object * ----------------------------------------------------------------------------------------------- */ -const char *ggml_backend_qnn_device_get_name(ggml_backend_dev_t dev) { - const auto &caps = kDeviceCaps[get_device_context(dev)->device]; +const char * ggml_backend_qnn_device_get_name(ggml_backend_dev_t dev) { + const auto & caps = kDeviceCaps[get_device_context(dev)->device]; return caps.name; } -const char *ggml_backend_qnn_device_get_description(ggml_backend_dev_t dev) { - const auto &caps = kDeviceCaps[get_device_context(dev)->device]; +const char * ggml_backend_qnn_device_get_description(ggml_backend_dev_t dev) { + const auto & caps = kDeviceCaps[get_device_context(dev)->device]; return caps.description; } -void ggml_backend_qnn_device_get_memory(ggml_backend_dev_t dev, size_t *free, size_t *total) { +void ggml_backend_qnn_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { GGML_UNUSED(dev); - *free = qnn::get_system_free_memory_in_bytes(); + *free = qnn::get_system_free_memory_in_bytes(); *total = qnn::get_system_total_memory_in_bytes(); - QNN_LOG_DEBUG("free memory: %ldMB, total memory: %ldMB", (*free / 1048576), (*total) / 1048576); + QNN_LOG_DEBUG("free memory: %ldMB, total memory: %ldMB\n", (*free / 1048576), (*total) / 1048576); } enum ggml_backend_dev_type ggml_backend_qnn_device_get_type(ggml_backend_dev_t dev) { return kDeviceCaps[get_device_context(dev)->device].type; } -void ggml_backend_qnn_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props *props) { - props->name = ggml_backend_qnn_device_get_name(dev); +void ggml_backend_qnn_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) { + props->name = ggml_backend_qnn_device_get_name(dev); props->description = ggml_backend_qnn_device_get_description(dev); - props->type = ggml_backend_qnn_device_get_type(dev); + props->type = ggml_backend_qnn_device_get_type(dev); ggml_backend_qnn_device_get_memory(dev, &props->memory_free, &props->memory_total); props->caps = { /* async */ false, @@ -311,12 +300,12 @@ void ggml_backend_qnn_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_ } ggml_guid_t ggml_backend_qnn_guid() { - static ggml_guid guid = {0x1a, 0x2b, 0x3c, 0x4d, 0x5e, 0x6f, 0x70, 0x81, - 0x92, 0xa3, 0xb4, 0xc5, 0xd6, 0xe7, 0xf8, 0x09}; + static ggml_guid guid = { 0x1a, 0x2b, 0x3c, 0x4d, 0x5e, 0x6f, 0x70, 0x81, + 0x92, 0xa3, 0xb4, 0xc5, 0xd6, 0xe7, 0xf8, 0x09 }; return &guid; } -ggml_backend_t ggml_backend_qnn_init_with_device_context(ggml_backend_dev_t dev, const char *extend_lib_search_path) { +ggml_backend_t ggml_backend_qnn_init_with_device_context(ggml_backend_dev_t dev, const char * extend_lib_search_path) { if (!extend_lib_search_path) { extend_lib_search_path = GGML_QNN_DEFAULT_LIB_SEARCH_PATH; QNN_LOG_WARN( @@ -324,27 +313,27 @@ ggml_backend_t ggml_backend_qnn_init_with_device_context(ggml_backend_dev_t dev, "use " GGML_QNN_DEFAULT_LIB_SEARCH_PATH " as default"); } - auto *dev_ctx = get_device_context(dev); - const auto device = dev_ctx->device; - QNN_LOG_DEBUG("device %s", qnn::get_backend_name(device)); - QNN_LOG_DEBUG("extend_lib_search_path %s", extend_lib_search_path); + auto * dev_ctx = get_device_context(dev); + const auto device = dev_ctx->device; + QNN_LOG_DEBUG("device %s\n", qnn::get_backend_name(device)); + 
QNN_LOG_DEBUG("extend_lib_search_path %s\n", extend_lib_search_path); auto instance = std::make_shared(extend_lib_search_path, dev_ctx->lib_name); - auto result = instance->qnn_init(nullptr); + auto result = instance->qnn_init(nullptr); if (result != 0) { - QNN_LOG_WARN("failed to init qnn backend %s", qnn::get_backend_name(device)); + QNN_LOG_WARN("failed to init qnn backend %s\n", qnn::get_backend_name(device)); return nullptr; } auto qnn_interface = instance->get_qnn_interface(); if (!qnn_interface) { - QNN_LOG_WARN("qnn subsystem failure"); + QNN_LOG_WARN("qnn subsystem failure\n"); return nullptr; } std::string device_name = qnn::get_backend_name(device); - QNN_LOG_INFO("qnn device name %s", device_name.c_str()); - dev_ctx->instance = instance; - dev_ctx->qnn_interface = qnn_interface; - dev_ctx->socinfo = instance->get_soc_info(); + QNN_LOG_INFO("qnn device name %s\n", device_name.c_str()); + dev_ctx->instance = instance; + dev_ctx->qnn_interface = qnn_interface; + dev_ctx->socinfo = instance->get_soc_info(); dev_ctx->supported_types = kDeviceCaps[device].supported_types; ggml_backend_t qnn_backend = new ggml_backend{ @@ -357,7 +346,7 @@ ggml_backend_t ggml_backend_qnn_init_with_device_context(ggml_backend_dev_t dev, return qnn_backend; } -ggml_backend_t ggml_backend_qnn_device_init(ggml_backend_dev_t dev, const char *params) { +ggml_backend_t ggml_backend_qnn_device_init(ggml_backend_dev_t dev, const char * params) { return ggml_backend_qnn_init_with_device_context(dev, params); } @@ -365,7 +354,7 @@ ggml_backend_buffer_type_t ggml_backend_qnn_device_get_buffer_type(ggml_backend_ return ggml_backend_qnn_buffer_type(dev); } -ggml_backend_buffer_t ggml_backend_qnn_device_buffer_from_ptr(ggml_backend_dev_t dev, void *ptr, size_t size, +ggml_backend_buffer_t ggml_backend_qnn_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) { // TODO GGML_UNUSED(dev); @@ -373,9 +362,9 @@ ggml_backend_buffer_t ggml_backend_qnn_device_buffer_from_ptr(ggml_backend_dev_t return ggml_backend_cpu_buffer_from_ptr(ptr, size); } -bool ggml_backend_qnn_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor *op) { +bool ggml_backend_qnn_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { // Note that this function could be called before the device context is initialized - auto *device_ctx = get_device_context(dev); + auto * device_ctx = get_device_context(dev); return qnn::device_supports_op(device_ctx, op); } @@ -384,13 +373,13 @@ bool ggml_backend_qnn_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_ return ggml_backend_buft_is_host(buft); } -bool ggml_backend_qnn_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor *op) { +bool ggml_backend_qnn_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) { #ifdef NDEBUG GGML_UNUSED(dev); GGML_UNUSED(op); #else - auto *device_ctx = get_device_context(dev); - QNN_LOG_DEBUG("[%s][%s]offload op", qnn::get_backend_name(device_ctx->device), ggml_op_name(op->op)); + auto * device_ctx = get_device_context(dev); + QNN_LOG_DEBUG("[%s][%s]offload op\n", qnn::get_backend_name(device_ctx->device), ggml_op_name(op->op)); #endif return false; } @@ -421,15 +410,15 @@ constexpr const ggml_backend_device_i ggml_backend_qnn_device_interface = { struct ggml_backend_qnn_reg_impl : ggml_backend_reg { std::vector> device_contexts; - std::vector devices; + std::vector devices; explicit ggml_backend_qnn_reg_impl(ggml_backend_reg_i interface) { context = this; - iface = interface; + iface = 
interface; - QNN_LOG_DEBUG("qnn backend registry init"); + QNN_LOG_DEBUG("qnn backend registry init\n"); for (size_t i = 0; i < QNN_BACKEND_COUNT; i++) { - const auto device_enum = (QNNBackend)(QNN_BACKEND_COUNT - 1 - i); // init from the last device, i.e. NPU + const auto device_enum = (QNNBackend) (QNN_BACKEND_COUNT - 1 - i); // init from the last device, i.e. NPU #ifndef GGML_QNN_ENABLE_CPU_BACKEND if (device_enum == QNN_BACKEND_CPU) { /* @@ -441,7 +430,7 @@ struct ggml_backend_qnn_reg_impl : ggml_backend_reg { #endif device_contexts.emplace_back(std::make_unique( - /* .device = */ device_enum, // init from the last device, i.e. NPU + /* .device = */ device_enum, // init from the last device, i.e. NPU /* .threads = */ 1, /* .name = */ qnn::get_backend_name(device_enum), /* .lib_name = */ kDeviceCaps[device_enum].lib_name, @@ -456,18 +445,18 @@ struct ggml_backend_qnn_reg_impl : ggml_backend_reg { } }; -const char *ggml_backend_qnn_reg_get_name(ggml_backend_reg_t reg) { +const char * ggml_backend_qnn_reg_get_name(ggml_backend_reg_t reg) { GGML_UNUSED(reg); return GGML_QNN_NAME; } size_t ggml_backend_qnn_reg_get_device_count(ggml_backend_reg_t reg) { - auto *ctx = (ggml_backend_qnn_reg_impl *)reg->context; + auto * ctx = (ggml_backend_qnn_reg_impl *) reg->context; return ctx->devices.size(); } ggml_backend_dev_t ggml_backend_qnn_reg_get_device(ggml_backend_reg_t reg, size_t index) { - auto *ctx = (ggml_backend_qnn_reg_impl *)reg->context; + auto * ctx = (ggml_backend_qnn_reg_impl *) reg->context; GGML_ASSERT(index < ctx->devices.size()); return &(ctx->devices[index]); } @@ -479,11 +468,13 @@ const ggml_backend_reg_i ggml_backend_qnn_reg_interface = { /* .get_proc_address = */ nullptr, }; -} // namespace +} // namespace -bool ggml_backend_is_qnn(ggml_backend_t backend) { return ggml_guid_matches(backend->guid, ggml_backend_qnn_guid()); } +bool ggml_backend_is_qnn(ggml_backend_t backend) { + return ggml_guid_matches(backend->guid, ggml_backend_qnn_guid()); +} ggml_backend_reg_t ggml_backend_qnn_reg() { - static ggml_backend_qnn_reg_impl reg{ggml_backend_qnn_reg_interface}; + static ggml_backend_qnn_reg_impl reg{ ggml_backend_qnn_reg_interface }; return ® } diff --git a/ggml/src/ggml-qnn/graph.cpp b/ggml/src/ggml-qnn/graph.cpp index 25ce5b8fb2754..b3ab161e9f6ca 100644 --- a/ggml/src/ggml-qnn/graph.cpp +++ b/ggml/src/ggml-qnn/graph.cpp @@ -5,7 +5,6 @@ #include #include "ggml-impl.h" - #include "logger.hpp" #include "op-config.hpp" #include "tensor.hpp" @@ -13,9 +12,9 @@ namespace { using qnn_tensor_cache_t = std::unordered_map; -int get_op_max_rank(const ggml_tensor *op) { - int max_rank = ggml_n_dims(op); - const int count = (int)qnn::get_qnn_op_input_param_count(op); +int get_op_max_rank(const ggml_tensor * op) { + int max_rank = ggml_n_dims(op); + const int count = (int) qnn::get_qnn_op_input_param_count(op); for (int i = 0; i < count; ++i) { max_rank = std::max(max_rank, ggml_n_dims(op->src[i])); } @@ -23,10 +22,10 @@ int get_op_max_rank(const ggml_tensor *op) { return max_rank; } -qnn::qnn_tensor_ptr_t create_tensor_with_cache(ggml_tensor *tensor, qnn::ggml_qnn_tensor::tensor_type_t type, int rank, +qnn::qnn_tensor_ptr_t create_tensor_with_cache(ggml_tensor * tensor, qnn::ggml_qnn_tensor::tensor_type_t type, int rank, QNNBackend device, Qnn_GraphHandle_t graph_handle, std::shared_ptr qnn_instance, - qnn_tensor_cache_t &tensor_cache) { + qnn_tensor_cache_t & tensor_cache) { GGML_ASSERT(tensor); if (tensor_cache.count(tensor)) { return tensor_cache[tensor]; @@ -38,13 +37,13 @@ 
qnn::qnn_tensor_ptr_t create_tensor_with_cache(ggml_tensor *tensor, qnn::ggml_qn return qnn_tensor; } -qnn::qnn_tensor_array_t create_tensors_with_cache(const qnn::ggml_tensor_array_t &ggml_tensors, +qnn::qnn_tensor_array_t create_tensors_with_cache(const qnn::ggml_tensor_array_t & ggml_tensors, qnn::ggml_qnn_tensor::tensor_type_t type, int rank, QNNBackend device, - Qnn_GraphHandle_t graph_handle, + Qnn_GraphHandle_t graph_handle, std::shared_ptr qnn_instance, - qnn_tensor_cache_t &tensor_cache) { + qnn_tensor_cache_t & tensor_cache) { qnn::qnn_tensor_array_t tensors; - for (auto *tensor : ggml_tensors) { + for (auto * tensor : ggml_tensors) { tensors.push_back( create_tensor_with_cache(tensor, type, rank, device, graph_handle, qnn_instance, tensor_cache)); } @@ -52,10 +51,10 @@ qnn::qnn_tensor_array_t create_tensors_with_cache(const qnn::ggml_tensor_array_t return tensors; } -qnn::qnn_op_config_ptr_t create_operation_from_op_tensor(ggml_tensor *dst, const std::string &name, int rank, +qnn::qnn_op_config_ptr_t create_operation_from_op_tensor(ggml_tensor * dst, const std::string & name, int rank, QNNBackend device, Qnn_GraphHandle_t graph_handle, std::shared_ptr qnn_instance, - bool is_intermediate, qnn_tensor_cache_t &tensor_cache) { + bool is_intermediate, qnn_tensor_cache_t & tensor_cache) { auto operation = qnn::create_op(dst, name, qnn_instance); // input tensors @@ -71,22 +70,22 @@ qnn::qnn_op_config_ptr_t create_operation_from_op_tensor(ggml_tensor *dst, const // output tensor tensor_type = is_intermediate ? qnn::ggml_qnn_tensor::INTERMEDIATE : qnn::ggml_qnn_tensor::OUTPUT; qnn::qnn_tensor_array_t output_qnn_tensors = - create_tensors_with_cache({dst}, tensor_type, rank, device, graph_handle, qnn_instance, tensor_cache); + create_tensors_with_cache({ dst }, tensor_type, rank, device, graph_handle, qnn_instance, tensor_cache); operation->set_output_tensors(output_qnn_tensors); // initialize operation if (!operation->initialize_op_nodes(device, graph_handle)) { - QNN_LOG_ERROR("[%s][%s]initialize_op_nodes failed", qnn::get_backend_name(device), name.c_str()); + QNN_LOG_ERROR("[%s][%s]initialize_op_nodes failed\n", qnn::get_backend_name(device), name.c_str()); return nullptr; } return operation; } -bool bind_src_tensors(ggml_tensor *op, qnn::qnn_tensor_array_t &tensor_wrappers, - std::vector &qnn_tensors) { +bool bind_src_tensors(ggml_tensor * op, qnn::qnn_tensor_array_t & tensor_wrappers, + std::vector & qnn_tensors) { if (op->op == GGML_OP_NONE) { - QNN_LOG_DEBUG("op %s is not a valid op", ggml_get_name(op)); + QNN_LOG_DEBUG("op %s is not a valid op\n", ggml_get_name(op)); return false; } @@ -94,9 +93,9 @@ bool bind_src_tensors(ggml_tensor *op, qnn::qnn_tensor_array_t &tensor_wrappers, GGML_ASSERT(tensor_wrappers.size() == param_count); qnn_tensors.resize(param_count); for (size_t i = 0; i < param_count; ++i) { - auto *ggml_tensor = op->src[i]; + auto * ggml_tensor = op->src[i]; if (!tensor_wrappers[i]->bind_ggml_tensor(ggml_tensor)) { - QNN_LOG_ERROR("bind tensor %s failed", ggml_get_name(ggml_tensor)); + QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor)); return false; } @@ -116,22 +115,21 @@ bool bind_src_tensors(ggml_tensor *op, qnn::qnn_tensor_array_t &tensor_wrappers, * * TODO: this algorithm is not perfect and may not work for all cases. It assumes that the tensors are * connected in a way that allows for unambiguous categorization. - * It also assumes that the tensors are connected in a way that allows for unambiguous categorization. 
*/ -int get_io_tensors_from_graph(const ggml_cgraph *cgraph, qnn::ggml_tensor_array_t &inputs, - qnn::ggml_tensor_array_t &outputs) { +int get_io_tensors_from_graph(const ggml_cgraph * cgraph, qnn::ggml_tensor_array_t & inputs, + qnn::ggml_tensor_array_t & outputs) { struct _tensor_connectivity_info { - size_t in_degree = 0; - size_t out_degree = 0; + size_t in_degree = 0; + size_t out_degree = 0; size_t insert_index = 0; }; using ggml_tensor_connectivity_map_t = std::unordered_map; ggml_tensor_connectivity_map_t connectivity_map; - int rank = 0; + int rank = 0; for (int i = 0; i < cgraph->n_nodes; i++) { - ggml_tensor *dst = cgraph->nodes[i]; + ggml_tensor * dst = cgraph->nodes[i]; if (ggml_is_empty(dst)) { continue; } @@ -144,7 +142,7 @@ int get_io_tensors_from_graph(const ggml_cgraph *cgraph, qnn::ggml_tensor_array_ rank = std::max(rank, ggml_n_dims(dst)); if (connectivity_map.count(dst) == 0) { connectivity_map[dst] = { - 1, // in-degree, at least 1 + 1, // in-degree, at least 1 0, connectivity_map.size(), }; @@ -153,13 +151,13 @@ int get_io_tensors_from_graph(const ggml_cgraph *cgraph, qnn::ggml_tensor_array_ } for (size_t i = 0; i < GGML_MAX_DIMS && dst->src[i]; ++i) { - auto *src = dst->src[i]; - rank = std::max(rank, ggml_n_dims(src)); + auto * src = dst->src[i]; + rank = std::max(rank, ggml_n_dims(src)); if (connectivity_map.count(src) == 0) { connectivity_map[src] = { 0, - 1, // out-degree, at least 1 + 1, // out-degree, at least 1 connectivity_map.size(), }; } else { @@ -168,7 +166,7 @@ int get_io_tensors_from_graph(const ggml_cgraph *cgraph, qnn::ggml_tensor_array_ } } - for (const auto &kv : connectivity_map) { + for (const auto & kv : connectivity_map) { if (kv.second.in_degree == 0) { inputs.push_back(kv.first); } @@ -178,126 +176,103 @@ int get_io_tensors_from_graph(const ggml_cgraph *cgraph, qnn::ggml_tensor_array_ } } - std::sort(inputs.begin(), inputs.end(), [&connectivity_map](ggml_tensor *lhs, ggml_tensor *rhs) { + std::sort(inputs.begin(), inputs.end(), [&connectivity_map](ggml_tensor * lhs, ggml_tensor * rhs) { return connectivity_map[lhs].insert_index < connectivity_map[rhs].insert_index; }); - std::sort(outputs.begin(), outputs.end(), [&connectivity_map](ggml_tensor *lhs, ggml_tensor *rhs) { + std::sort(outputs.begin(), outputs.end(), [&connectivity_map](ggml_tensor * lhs, ggml_tensor * rhs) { return connectivity_map[lhs].insert_index < connectivity_map[rhs].insert_index; }); return rank; } -} // namespace +} // namespace namespace qnn { -qnn_graph::qnn_graph(const std::string &graph_name, QNNBackend device, std::shared_ptr qnn_instance, - size_t vtcm_size_in_mb) - : _graph_name(graph_name), _device(device), _qnn_instance(qnn_instance) { - QNN_LOG_DEBUG("[%s][%s]created", get_backend_name(device), graph_name.c_str()); - - auto qnn_interface = qnn_instance->get_qnn_interface(); - auto qnn_context = qnn_instance->get_qnn_context_handle(); - Qnn_ErrorHandle_t error = QNN_SUCCESS; - Qnn_GraphHandle_t graph_handle = nullptr; +qnn_graph::qnn_graph(const std::string & graph_name, QNNBackend device, std::shared_ptr qnn_instance, + size_t vtcm_size_in_mb) : + _graph_name(graph_name), + _device(device), + _qnn_instance(qnn_instance) { + QNN_LOG_DEBUG("[%s][%s]created\n", get_backend_name(device), graph_name.c_str()); + + auto qnn_interface = qnn_instance->get_qnn_interface(); + auto qnn_context = qnn_instance->get_qnn_context_handle(); + Qnn_ErrorHandle_t error = QNN_SUCCESS; + Qnn_GraphHandle_t graph_handle = nullptr; if (device == QNN_BACKEND_NPU) { // TODO: fix graph 
config here for NPU QnnHtpGraph_CustomConfig_t hvx_config; - hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS; + hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS; hvx_config.numHvxThreads = 8; QnnGraph_Config_t graph_hvx_config; - graph_hvx_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_hvx_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; graph_hvx_config.customConfig = &hvx_config; QnnHtpGraph_CustomConfig_t dlbc_config; - dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; - dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC; - dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC + dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; + dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC; + dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC QnnGraph_Config_t graph_dlbc_config; - graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; graph_dlbc_config.customConfig = &dlbc_config; QnnHtpGraph_CustomConfig_t opt_config; - opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; - opt_config.optimizationOption.floatValue = 1; // 1 / 3 + opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; + opt_config.optimizationOption.floatValue = 1; // 1 / 3 QnnGraph_Config_t graph_opt_config; - graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; graph_opt_config.customConfig = &opt_config; QnnHtpGraph_CustomConfig_t vtcm_config; - vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE; - vtcm_config.vtcmSizeInMB = (uint32_t)vtcm_size_in_mb; + vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE; + vtcm_config.vtcmSizeInMB = (uint32_t) vtcm_size_in_mb; QnnGraph_Config_t graph_vtcm_config; - graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; graph_vtcm_config.customConfig = &vtcm_config; - const QnnGraph_Config_t *graph_configs[] = {&graph_hvx_config, &graph_dlbc_config, &graph_vtcm_config, - &graph_opt_config, nullptr}; + const QnnGraph_Config_t * graph_configs[] = { &graph_hvx_config, &graph_dlbc_config, &graph_vtcm_config, + &graph_opt_config, nullptr }; error = qnn_interface->qnn_graph_create(qnn_context, graph_name.c_str(), graph_configs, &graph_handle); } else { error = qnn_interface->qnn_graph_create(qnn_context, graph_name.c_str(), nullptr, &graph_handle); } if (error != QNN_SUCCESS) { - QNN_LOG_ERROR("[%s][%s]failed to create qnn graph, error: %s", get_backend_name(device), graph_name.c_str(), + QNN_LOG_ERROR("[%s][%s]failed to create qnn graph, error: %s\n", get_backend_name(device), graph_name.c_str(), get_qnn_error_string(error)); return; } - QNN_LOG_INFO("[%s][%s]create succeed", get_backend_name(device), graph_name.c_str()); - _graph_handle = graph_handle; + QNN_LOG_INFO("[%s][%s]create succeed\n", get_backend_name(device), graph_name.c_str()); + _graph_handle = graph_handle; _qnn_interface = qnn_interface; } -qnn_graph::~qnn_graph() { QNN_LOG_DEBUG("[%s][%s]destroy", get_backend_name(_device), _graph_name.c_str()); } - -bool qnn_graph::build_graph_from_op(ggml_tensor *op) { - if (!is_valid()) { - QNN_LOG_ERROR("Invalid graph"); - return false; - } - - QNN_LOG_DEBUG("[%s][%s]build start", get_backend_name(_device), 
_graph_name.c_str()); - qnn_tensor_cache_t tensor_cache; - const auto rank = get_op_max_rank(op); - auto operation = create_operation_from_op_tensor(op, _graph_name, rank, _device, _graph_handle, _qnn_instance, - false, tensor_cache); - if (!operation) { - QNN_LOG_ERROR("[%s][%s]create_operation_from_op_tensor failed", get_backend_name(_device), _graph_name.c_str()); - return false; - } - - _tensor_inputs = operation->get_input_tensors(); - _tensor_outputs = operation->get_output_tensors(); - _operations.push_back(std::move(operation)); - if (!finalize()) { - return false; - } - - QNN_LOG_DEBUG("[%s][%s]build succeed", get_backend_name(_device), _graph_name.c_str()); - return true; +qnn_graph::~qnn_graph() { + QNN_LOG_DEBUG("[%s][%s]destroy\n", get_backend_name(_device), _graph_name.c_str()); } -bool qnn_graph::build_graph_from_ggml_graph(const ggml_cgraph *cgraph) { - QNN_LOG_DEBUG("[%s][%s]build start", get_backend_name(_device), _graph_name.c_str()); +bool qnn_graph::build_graph_from_ggml_graph(const ggml_cgraph * cgraph) { + QNN_LOG_DEBUG("[%s][%s]build start\n", get_backend_name(_device), _graph_name.c_str()); ggml_tensor_array_t inputs; ggml_tensor_array_t outputs; - int rank = get_io_tensors_from_graph(cgraph, inputs, outputs); - QNN_LOG_DEBUG("[%s]rank: %d, input_set: %d, output_set: %d", get_backend_name(_device), rank, int(inputs.size()), + int rank = get_io_tensors_from_graph(cgraph, inputs, outputs); + QNN_LOG_DEBUG("[%s]rank: %d, input_set: %d, output_set: %d\n", get_backend_name(_device), rank, int(inputs.size()), int(outputs.size())); { qnn_tensor_cache_t tensor_cache; - auto input_tensors = create_tensors_with_cache(inputs, ggml_qnn_tensor::INPUT, rank, _device, _graph_handle, - _qnn_instance, tensor_cache); + auto input_tensors = create_tensors_with_cache(inputs, ggml_qnn_tensor::INPUT, rank, _device, _graph_handle, + _qnn_instance, tensor_cache); auto output_tensors = create_tensors_with_cache(outputs, ggml_qnn_tensor::OUTPUT, rank, _device, _graph_handle, _qnn_instance, tensor_cache); qnn_op_config_array_t operations; for (int i = 0; i < cgraph->n_nodes; i++) { - ggml_tensor *dst = cgraph->nodes[i]; + ggml_tensor * dst = cgraph->nodes[i]; if (ggml_is_empty(dst)) { continue; } @@ -307,83 +282,49 @@ bool qnn_graph::build_graph_from_ggml_graph(const ggml_cgraph *cgraph) { continue; } - QNN_LOG_DEBUG("[%s]create op: %s", get_backend_name(_device), get_qnn_op_name(dst)); + QNN_LOG_DEBUG("[%s]create op: %s\n", get_backend_name(_device), get_qnn_op_name(dst)); auto operation = create_operation_from_op_tensor(dst, dst->name, rank, _device, _graph_handle, - _qnn_instance, true, tensor_cache); // TODO: fix op name + _qnn_instance, true, tensor_cache); // TODO: fix op name operations.push_back(operation); } - _tensor_inputs = std::move(input_tensors); + _tensor_inputs = std::move(input_tensors); _tensor_outputs = std::move(output_tensors); - _operations = std::move(operations); + _operations = std::move(operations); if (!finalize()) { return false; } } - QNN_LOG_DEBUG("[%s][%s]build succeed", get_backend_name(_device), _graph_name.c_str()); - return true; -} - -bool qnn_graph::execute(ggml_tensor *op) { - if (!bind_src_tensors(op, _tensor_inputs, _qnn_tensor_inputs)) { - QNN_LOG_ERROR("[%s][%s]bind input tensors failed", get_backend_name(_device), _graph_name.c_str()); - return false; - } - - if (!qnn::bind_tensors({op}, _tensor_outputs, _qnn_tensor_outputs)) { - QNN_LOG_ERROR("[%s][%s]bind output tensors failed", get_backend_name(_device), _graph_name.c_str()); - return false; - } 
- - auto &qnn_tensor_inputs = _qnn_tensor_inputs; - auto &qnn_tensor_outputs = _qnn_tensor_outputs; - auto error = - _qnn_interface->qnn_graph_execute(_graph_handle, qnn_tensor_inputs.data(), qnn_tensor_inputs.size(), - qnn_tensor_outputs.data(), qnn_tensor_outputs.size(), nullptr, nullptr); - unbind_tensors(_tensor_inputs); - unbind_tensors(_tensor_outputs); - - if (error != QNN_SUCCESS) { - if (_device == QNN_BACKEND_NPU && error == QNN_COMMON_ERROR_SYSTEM_COMMUNICATION) { - QNN_LOG_WARN("[%s][%s]NPU crashed. SSR detected. Caused QNN graph execute error.", - get_backend_name(_device), _graph_name.c_str()); - } else { - QNN_LOG_ERROR("[%s][%s]error: %s", get_backend_name(_device), _graph_name.c_str(), - get_qnn_error_string(error)); - } - return false; - } - - QNN_LOG_DEBUG("[%s][%s]execute succeed", get_backend_name(_device), _graph_name.c_str()); + QNN_LOG_DEBUG("[%s][%s]build succeed\n", get_backend_name(_device), _graph_name.c_str()); return true; } -bool qnn_graph::execute(const ggml_cgraph *cgraph) { +bool qnn_graph::execute(const ggml_cgraph * cgraph) { ggml_tensor_array_t inputs; ggml_tensor_array_t outputs; #ifdef NDEBUG get_io_tensors_from_graph(cgraph, inputs, outputs); #else int rank = get_io_tensors_from_graph(cgraph, inputs, outputs); - QNN_LOG_DEBUG("[%s]rank: %d, input_set: %d, output_set: %d", get_backend_name(_device), rank, int(inputs.size()), + QNN_LOG_DEBUG("[%s]rank: %d, input_set: %d, output_set: %d\n", get_backend_name(_device), rank, int(inputs.size()), int(outputs.size())); #endif { if (!qnn::bind_tensors(inputs, _tensor_inputs, _qnn_tensor_inputs)) { - QNN_LOG_ERROR("[%s][%s]bind input tensors failed", get_backend_name(_device), _graph_name.c_str()); + QNN_LOG_ERROR("[%s][%s]bind input tensors failed\n", get_backend_name(_device), _graph_name.c_str()); return false; } if (!qnn::bind_tensors(outputs, _tensor_outputs, _qnn_tensor_outputs)) { - QNN_LOG_ERROR("[%s][%s]bind output tensors failed", get_backend_name(_device), _graph_name.c_str()); + QNN_LOG_ERROR("[%s][%s]bind output tensors failed\n", get_backend_name(_device), _graph_name.c_str()); return false; } - auto &qnn_tensor_inputs = _qnn_tensor_inputs; - auto &qnn_tensor_outputs = _qnn_tensor_outputs; - auto error = + auto & qnn_tensor_inputs = _qnn_tensor_inputs; + auto & qnn_tensor_outputs = _qnn_tensor_outputs; + auto error = _qnn_interface->qnn_graph_execute(_graph_handle, qnn_tensor_inputs.data(), qnn_tensor_inputs.size(), qnn_tensor_outputs.data(), qnn_tensor_outputs.size(), nullptr, nullptr); unbind_tensors(_tensor_inputs); @@ -391,35 +332,35 @@ bool qnn_graph::execute(const ggml_cgraph *cgraph) { if (error != QNN_SUCCESS) { if (_device == QNN_BACKEND_NPU && error == QNN_COMMON_ERROR_SYSTEM_COMMUNICATION) { - QNN_LOG_WARN("[%s][%s]NPU crashed. SSR detected. Caused QNN graph execute error.", + QNN_LOG_WARN("[%s][%s]NPU crashed. SSR detected. 
Caused QNN graph execute error.\n", get_backend_name(_device), _graph_name.c_str()); } else { - QNN_LOG_ERROR("[%s][%s]error: %s", get_backend_name(_device), _graph_name.c_str(), + QNN_LOG_ERROR("[%s][%s]error: %s\n", get_backend_name(_device), _graph_name.c_str(), get_qnn_error_string(error)); } return false; } - QNN_LOG_DEBUG("[%s][%s]execute succeed", get_backend_name(_device), _graph_name.c_str()); + QNN_LOG_DEBUG("[%s][%s]execute succeed\n", get_backend_name(_device), _graph_name.c_str()); return true; } } bool qnn_graph::finalize() { if (!qnn::add_op_to_graph(_graph_handle, _operations)) { - QNN_LOG_ERROR("[%s]add nodes failed", _graph_name.c_str()); + QNN_LOG_ERROR("[%s]add nodes failed\n", _graph_name.c_str()); return false; } auto error = _qnn_interface->qnn_graph_finalize(_graph_handle, nullptr, nullptr); if (error != QNN_SUCCESS) { - QNN_LOG_ERROR("[%s][%s]qnn_graph_finalize.error: %s", get_backend_name(_device), _graph_name.c_str(), + QNN_LOG_ERROR("[%s][%s]qnn_graph_finalize.error: %s\n", get_backend_name(_device), _graph_name.c_str(), get_qnn_error_string(error)); return false; } - QNN_LOG_DEBUG("[%s][%s]finalize succeed", get_backend_name(_device), _graph_name.c_str()); + QNN_LOG_DEBUG("[%s][%s]finalize succeed\n", get_backend_name(_device), _graph_name.c_str()); return true; } -} // namespace qnn +} // namespace qnn diff --git a/ggml/src/ggml-qnn/graph.hpp b/ggml/src/ggml-qnn/graph.hpp index 521186f790ee5..dc1ed0b3f8896 100644 --- a/ggml/src/ggml-qnn/graph.hpp +++ b/ggml/src/ggml-qnn/graph.hpp @@ -6,41 +6,51 @@ #include #include "ggml-qnn.h" - #include "op-config.hpp" #include "qnn-lib.hpp" namespace qnn { +/** + * @class qnn_graph + * @brief Manages a QNN graph, converting a GGML graph to QNN format and handling its execution. + * + * This class is responsible for building a QNN graph from a given GGML graph, + * determining its input/output tensors, finalizing the configuration, and + * executing the graph on the specified backend device. 
+ */ class qnn_graph { -public: - explicit qnn_graph(const std::string &graph_name, QNNBackend device, std::shared_ptr qnn_instance, + public: + explicit qnn_graph(const std::string & graph_name, QNNBackend device, std::shared_ptr qnn_instance, size_t vtcm_size_in_mb); ~qnn_graph(); - bool build_graph_from_op(ggml_tensor *op); - bool build_graph_from_ggml_graph(const ggml_cgraph *cgraph); + bool build_graph_from_ggml_graph(const ggml_cgraph * cgraph); + + bool execute(const ggml_cgraph * cgraph); - bool execute(ggml_tensor *op); - bool execute(const ggml_cgraph *cgraph); bool is_valid() const { return _graph_handle != nullptr; } + Qnn_GraphHandle_t get_graph_handler() const { return _graph_handle; } + std::shared_ptr get_qnn_instance() { return _qnn_instance; } - const std::string &get_name() const { return _graph_name; } + + const std::string & get_name() const { return _graph_name; } + QNNBackend get_device() const { return _device; } -private: + private: bool finalize(); - const std::string _graph_name; - const QNNBackend _device; - Qnn_GraphHandle_t _graph_handle = nullptr; - std::shared_ptr _qnn_instance; + const std::string _graph_name; + const QNNBackend _device; + Qnn_GraphHandle_t _graph_handle = nullptr; + std::shared_ptr _qnn_instance; std::shared_ptr _qnn_interface; - qnn_op_config_array_t _operations; + qnn_op_config_array_t _operations; - qnn_tensor_array_t _tensor_inputs; - qnn_tensor_array_t _tensor_outputs; + qnn_tensor_array_t _tensor_inputs; + qnn_tensor_array_t _tensor_outputs; std::vector _qnn_tensor_inputs; std::vector _qnn_tensor_outputs; @@ -50,4 +60,4 @@ class qnn_graph { using qnn_graph_ptr_t = std::shared_ptr; -} // namespace qnn +} // namespace qnn diff --git a/ggml/src/ggml-qnn/logger.cpp b/ggml/src/ggml-qnn/logger.cpp index 23a3f305c060f..5418d03be45a4 100644 --- a/ggml/src/ggml-qnn/logger.cpp +++ b/ggml/src/ggml-qnn/logger.cpp @@ -1,70 +1,45 @@ #include "logger.hpp" -#include -#include +#ifndef NDEBUG -#if defined(__ANDROID__) || defined(ANDROID) -#include -#endif - -void qnn::internal_log(ggml_log_level level, const char * /*file*/, const char *func, int line, const char *format, - ...) 
{ - static std::mutex qnn_internal_log_mutex; - static char s_qnn_internal_log_buf[QNN_LOGBUF_LEN]; +# include - { - std::lock_guard lock(qnn_internal_log_mutex); - va_list args; +# include "QnnInterface.h" +# include "QnnTypes.h" +# include "System/QnnSystemInterface.h" - va_start(args, format); - int len_prefix = snprintf(s_qnn_internal_log_buf, QNN_LOGBUF_LEN, "[%s, %d]: ", func, line); - int len = vsnprintf(s_qnn_internal_log_buf + len_prefix, QNN_LOGBUF_LEN - len_prefix, format, args); - if (len < (QNN_LOGBUF_LEN - len_prefix)) { -#if defined(__ANDROID__) || defined(ANDROID) - // print to android logcat - __android_log_print(level, "ggml-qnn", "%s\n", s_qnn_internal_log_buf); -#else - (void)level; -#endif - // print to stdout - printf("%s\n", s_qnn_internal_log_buf); - } - va_end(args); - } -} - -#if ENABLE_QNNSDK_LOG -void qnn::sdk_logcallback(const char *fmt, QnnLog_Level_t level, uint64_t /*timestamp*/, va_list argp) { +void qnn::sdk_logcallback(const char * fmt, QnnLog_Level_t level, uint64_t /*timestamp*/, va_list argp) { static std::mutex log_mutex; - static char s_ggml_qnn_logbuf[QNN_LOGBUF_LEN]; + static char s_ggml_qnn_logbuf[4096]; - const char *log_level_desc = ""; + char log_level_desc = 'U'; switch (level) { case QNN_LOG_LEVEL_ERROR: - log_level_desc = "ERROR"; + log_level_desc = 'E'; break; case QNN_LOG_LEVEL_WARN: - log_level_desc = "WARNING"; + log_level_desc = 'W'; break; case QNN_LOG_LEVEL_INFO: - log_level_desc = "INFO"; + log_level_desc = 'I'; break; case QNN_LOG_LEVEL_DEBUG: - log_level_desc = "DEBUG"; + log_level_desc = 'D'; break; case QNN_LOG_LEVEL_VERBOSE: - log_level_desc = "VERBOSE"; - break; - case QNN_LOG_LEVEL_MAX: - log_level_desc = "UNKNOWN"; + log_level_desc = 'V'; break; } { std::lock_guard lock(log_mutex); - vsnprintf(s_ggml_qnn_logbuf, QNN_LOGBUF_LEN, fmt, argp); - QNN_LOG_INFO("[%s]%s", log_level_desc, s_ggml_qnn_logbuf); + int size = vsnprintf(s_ggml_qnn_logbuf, sizeof(s_ggml_qnn_logbuf), fmt, argp); + if (size > 0 && s_ggml_qnn_logbuf[size - 1] != '\n') { + QNN_LOG_INFO("[%c]%s\n", log_level_desc, s_ggml_qnn_logbuf); + } else { + QNN_LOG_INFO("[%c]%s", log_level_desc, s_ggml_qnn_logbuf); + } } } #else diff --git a/ggml/src/ggml-qnn/logger.hpp b/ggml/src/ggml-qnn/logger.hpp index b4bab0c006691..cf94ce22174b6 100644 --- a/ggml/src/ggml-qnn/logger.hpp +++ b/ggml/src/ggml-qnn/logger.hpp @@ -1,43 +1,16 @@ #pragma once -#include +#include +#include "ggml-impl.h" #include "ggml.h" - -#include "QnnCommon.h" -#include "QnnInterface.h" -#include "QnnTypes.h" -#include "System/QnnSystemInterface.h" - -#define QNN_LOGBUF_LEN 4096 +#include "QnnLog.h" namespace qnn { -void internal_log(ggml_log_level level, const char *file, const char *func, int line, const char *format, ...); - -void sdk_logcallback(const char *fmt, QnnLog_Level_t level, uint64_t timestamp, va_list argp); -} // namespace qnn - -// ================================================================================================= -// -// QNN backend internal log function -// -// ================================================================================================= -#define QNN_LOG_ERROR(...) qnn::internal_log(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) - -#define QNN_LOG_WARN(...) qnn::internal_log(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) - -#define QNN_LOG_INFO(...) 
qnn::internal_log(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) - -#ifdef NDEBUG -#define ENABLE_QNNBACKEND_DEBUG 0 // for troubleshooting QNN backend -#define ENABLE_QNNSDK_LOG 0 // enable/disable QNN SDK's internal log -#else -#define ENABLE_QNNBACKEND_DEBUG 1 // for troubleshooting QNN backend -#define ENABLE_QNNSDK_LOG 1 // enable/disable QNN SDK's internal log -#endif +void sdk_logcallback(const char * fmt, QnnLog_Level_t level, uint64_t timestamp, va_list argp); +} // namespace qnn -#if ENABLE_QNNBACKEND_DEBUG -#define QNN_LOG_DEBUG(...) qnn::internal_log(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) -#else -#define QNN_LOG_DEBUG(...) -#endif +#define QNN_LOG_ERROR(...) (GGML_LOG_ERROR(__VA_ARGS__)) +#define QNN_LOG_WARN(...) (GGML_LOG_WARN(__VA_ARGS__)) +#define QNN_LOG_INFO(...) (GGML_LOG_INFO(__VA_ARGS__)) +#define QNN_LOG_DEBUG(...) (GGML_LOG_DEBUG(__VA_ARGS__)) diff --git a/ggml/src/ggml-qnn/op-config-base.hpp b/ggml/src/ggml-qnn/op-config-base.hpp index 274bb8318ff99..b24b53bf2a3b6 100644 --- a/ggml/src/ggml-qnn/op-config-base.hpp +++ b/ggml/src/ggml-qnn/op-config-base.hpp @@ -4,7 +4,6 @@ #include #include "ggml-qnn.h" - #include "qnn-types.hpp" #include "tensor.hpp" @@ -18,7 +17,7 @@ namespace qnn { * adding operations to a graph, and binding/unbinding input and output tensors. */ class ggml_qnn_op_config { -public: + public: virtual ~ggml_qnn_op_config() {} /** @@ -32,8 +31,8 @@ class ggml_qnn_op_config { * * @param tensor_inputs A reference to a vector of qnn_tensor_ptr_t objects representing the input tensors. */ - virtual void set_input_tensors(qnn::qnn_tensor_array_t &tensor_inputs) = 0; - virtual void set_input_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) = 0; + virtual void set_input_tensors(qnn::qnn_tensor_array_t & tensor_inputs) = 0; + virtual void set_input_tensors(qnn::qnn_tensor_array_t && tensor_inputs) = 0; /** * @brief Sets custom output tensors for the operation. This method should be called before `initialize_op_nodes`. @@ -46,8 +45,8 @@ class ggml_qnn_op_config { * * @param tensor_outputs A reference to a vector of qnn_tensor_ptr_t objects representing the output tensors. */ - virtual void set_output_tensors(qnn::qnn_tensor_array_t &tensor_outputs) = 0; - virtual void set_output_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) = 0; + virtual void set_output_tensors(qnn::qnn_tensor_array_t & tensor_outputs) = 0; + virtual void set_output_tensors(qnn::qnn_tensor_array_t && tensor_inputs) = 0; /** * @brief Creates tensors and internal nodes for constructing the calculation graph. @@ -71,7 +70,7 @@ class ggml_qnn_op_config { * * @return A reference to a vector of qnn_tensor_ptr_t objects representing the input tensors. */ - virtual const qnn_tensor_array_t &get_input_tensors() = 0; + virtual const qnn_tensor_array_t & get_input_tensors() = 0; /** * @brief Pure virtual function to retrieve the output tensors of a QNN. @@ -82,7 +81,7 @@ class ggml_qnn_op_config { * * @return A reference to a vector of qnn_tensor_ptr_t objects representing the output tensors. */ - virtual const qnn_tensor_array_t &get_output_tensors() = 0; + virtual const qnn_tensor_array_t & get_output_tensors() = 0; /** * @brief Adds an operation to the given graph. @@ -109,7 +108,7 @@ class ggml_qnn_op_config { * containing the input tensors. * @return true if the input tensors were successfully bound, false otherwise. 
*/ - virtual bool bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) = 0; + virtual bool bind_input_tensors(const ggml_tensor_array_t & tensor_inputs) = 0; /** * @brief Binds the output tensors to the given tensor array. @@ -123,7 +122,7 @@ class ggml_qnn_op_config { * represent the output tensors to be bound. * @return true if the binding is successful, false otherwise. */ - virtual bool bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) = 0; + virtual bool bind_output_tensors(const ggml_tensor_array_t & tensor_outputs) = 0; /** * @brief Unbinds the input tensors from the operation. @@ -146,7 +145,7 @@ class ggml_qnn_op_config { virtual void unbind_output_tensors() = 0; }; -using qnn_op_config_ptr_t = std::shared_ptr; +using qnn_op_config_ptr_t = std::shared_ptr; using qnn_op_config_array_t = std::vector; -} // namespace qnn +} // namespace qnn diff --git a/ggml/src/ggml-qnn/op-config-caps.cpp b/ggml/src/ggml-qnn/op-config-caps.cpp index b250c214a3ad9..16b50503bea4c 100644 --- a/ggml/src/ggml-qnn/op-config-caps.cpp +++ b/ggml/src/ggml-qnn/op-config-caps.cpp @@ -3,73 +3,73 @@ namespace { -using op_constructor_t = std::shared_ptr (*)(const ggml_tensor *, const std::string &, +using op_constructor_t = std::shared_ptr (*)(const ggml_tensor *, const std::string &, std::shared_ptr); -using op_dims_calc_func_t = void (*)(const std::vector &input_dims, - qnn::ggml_dimension_array_t &output_dims); +using op_dims_calc_func_t = void (*)(const std::vector & input_dims, + qnn::ggml_dimension_array_t & output_dims); -void element_wise_op_dims(const std::vector &input_dims, - qnn::ggml_dimension_array_t &output_dims) { +void element_wise_op_dims(const std::vector & input_dims, + qnn::ggml_dimension_array_t & output_dims) { for (size_t i = 1; i < std::size(output_dims); i++) { output_dims[i] = input_dims.front()[i]; } } -void mat_mul_op_dims(const std::vector &input_dims, - qnn::ggml_dimension_array_t &output_dims) { +void mat_mul_op_dims(const std::vector & input_dims, + qnn::ggml_dimension_array_t & output_dims) { GGML_ASSERT(input_dims.size() == 2); output_dims[0] = input_dims.front()[1]; output_dims[1] = input_dims.back()[1]; } struct qnn_op_caps_t { - const char *qnn_op_name = nullptr; - const size_t input_param_count = 0; - op_dims_calc_func_t calc_dims_func = nullptr; - const char *qnn_param_name = nullptr; + const char * qnn_op_name = nullptr; + const size_t input_param_count = 0; + op_dims_calc_func_t calc_dims_func = nullptr; + const char * qnn_param_name = nullptr; }; constexpr const qnn_op_caps_t kOpCaps[] = { {}, // GGML_OP_NONE {}, // GGML_OP_DUP { - // GGML_OP_ADD - QNN_OP_ELEMENT_WISE_ADD, // qnn_op_name - 2, // input_param_count - element_wise_op_dims, // calc_dims_func + // GGML_OP_ADD + QNN_OP_ELEMENT_WISE_ADD, // qnn_op_name + 2, // input_param_count + element_wise_op_dims, // calc_dims_func }, {}, // GGML_OP_ADD1 {}, // GGML_OP_ACC { - // GGML_OP_SUB - QNN_OP_ELEMENT_WISE_SUBTRACT, // qnn_op_name - 2, // input_param_count - element_wise_op_dims, // calc_dims_func + // GGML_OP_SUB + QNN_OP_ELEMENT_WISE_SUBTRACT, // qnn_op_name + 2, // input_param_count + element_wise_op_dims, // calc_dims_func }, { - // GGML_OP_MUL - QNN_OP_ELEMENT_WISE_MULTIPLY, // qnn_op_name - 2, // input_param_count - element_wise_op_dims, // calc_dims_func + // GGML_OP_MUL + QNN_OP_ELEMENT_WISE_MULTIPLY, // qnn_op_name + 2, // input_param_count + element_wise_op_dims, // calc_dims_func }, { - // GGML_OP_DIV - QNN_OP_ELEMENT_WISE_DIVIDE, // qnn_op_name - 2, // input_param_count - 
element_wise_op_dims, // calc_dims_func + // GGML_OP_DIV + QNN_OP_ELEMENT_WISE_DIVIDE, // qnn_op_name + 2, // input_param_count + element_wise_op_dims, // calc_dims_func }, {}, // GGML_OP_SQR { - // GGML_OP_SQRT - QNN_OP_ELEMENT_WISE_SQUARE_ROOT, // qnn_op_name - 1, // input_param_count - element_wise_op_dims, // calc_dims_func + // GGML_OP_SQRT + QNN_OP_ELEMENT_WISE_SQUARE_ROOT, // qnn_op_name + 1, // input_param_count + element_wise_op_dims, // calc_dims_func }, { - // GGML_OP_LOG - QNN_OP_ELEMENT_WISE_LOG, // qnn_op_name - 1, // input_param_count - element_wise_op_dims, // calc_dims_func + // GGML_OP_LOG + QNN_OP_ELEMENT_WISE_LOG, // qnn_op_name + 1, // input_param_count + element_wise_op_dims, // calc_dims_func }, {}, // GGML_OP_SIN {}, // GGML_OP_COS @@ -84,19 +84,19 @@ constexpr const qnn_op_caps_t kOpCaps[] = { {}, // GGML_OP_SILU_BACK {}, // GGML_OP_NORM { - // GGML_OP_RMS_NORM - QNN_OP_RMS_NORM, // qnn_op_name - 1, // input_param_count - nullptr, // TODO: calc_dims_func - QNN_OP_RMS_NORM_PARAM_EPSILON, // qnn_param_name + // GGML_OP_RMS_NORM + QNN_OP_RMS_NORM, // qnn_op_name + 1, // input_param_count + nullptr, // TODO: calc_dims_func + QNN_OP_RMS_NORM_PARAM_EPSILON, // qnn_param_name }, {}, // GGML_OP_RMS_NORM_BACK {}, // GGML_OP_GROUP_NORM { - // GGML_OP_MUL_MAT - QNN_OP_MAT_MUL, // qnn_op_name - 2, // input_param_count - mat_mul_op_dims, // calc_dims_func + // GGML_OP_MUL_MAT + QNN_OP_MAT_MUL, // qnn_op_name + 2, // input_param_count + mat_mul_op_dims, // calc_dims_func }, {}, // GGML_OP_MUL_MAT_ID {}, // GGML_OP_OUT_PROD @@ -105,10 +105,10 @@ constexpr const qnn_op_caps_t kOpCaps[] = { {}, // GGML_OP_CPY {}, // GGML_OP_CONT { - // GGML_OP_RESHAPE - QNN_OP_RESHAPE, // qnn_op_name - 1, // input_param_count - nullptr, // TODO: calc_dims_func + // GGML_OP_RESHAPE + QNN_OP_RESHAPE, // qnn_op_name + 1, // input_param_count + nullptr, // TODO: calc_dims_func }, {}, // GGML_OP_VIEW {}, // GGML_OP_PERMUTE @@ -177,10 +177,10 @@ constexpr const qnn_op_caps_t kOpCaps[] = { {}, // GGML_UNARY_OP_RELU {}, // GGML_UNARY_OP_SIGMOID { - // GGML_UNARY_OP_GELU - QNN_OP_GELU, // qnn_op_name - 1, // input_param_count - nullptr, // TODO: calc_dims_func + // GGML_UNARY_OP_GELU + QNN_OP_GELU, // qnn_op_name + 1, // input_param_count + nullptr, // TODO: calc_dims_func }, {}, // GGML_UNARY_OP_GELU_QUICK {}, // GGML_UNARY_OP_SILU @@ -201,15 +201,17 @@ static_assert(kOpCaps[GGML_OP_COUNT + GGML_UNARY_OP_GELU].input_param_count == 1 static_assert(std::size(kOpCaps) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT), "GGML_OP_COUNT does not match the size of the kOpCaps table"); -std::shared_ptr mat_mul_op_constructor(const ggml_tensor *op, const std::string &instance_name, +std::shared_ptr mat_mul_op_constructor(const ggml_tensor * op, + const std::string & instance_name, std::shared_ptr qnn_instance) { GGML_UNUSED(op); - QNN_LOG_DEBUG("create QNN_OP_MAT_MUL, name %s", instance_name.c_str()); + QNN_LOG_DEBUG("create QNN_OP_MAT_MUL, name %s\n", instance_name.c_str()); return std::make_shared(instance_name, qnn_instance); } template -std::shared_ptr generic_op_constructor(const ggml_tensor *op, const std::string &instance_name, +std::shared_ptr generic_op_constructor(const ggml_tensor * op, + const std::string & instance_name, std::shared_ptr qnn_instance) { GGML_UNUSED(op); static_assert(_op < std::size(kOpCaps)); @@ -218,20 +220,20 @@ std::shared_ptr generic_op_constructor(const ggml_tenso kOpCaps[_op].qnn_op_name, qnn_instance); } -void add_type_parameters(std::shared_ptr op, const char *name, float value) { +void 
add_type_parameters(std::shared_ptr op, const char * name, float value) { Qnn_Scalar_t scalar = QNN_SCALAR_INIT; - scalar.dataType = QNN_DATATYPE_FLOAT_32; - scalar.floatValue = value; + scalar.dataType = QNN_DATATYPE_FLOAT_32; + scalar.floatValue = value; op->add_scalar_param(name, scalar); } template std::shared_ptr op_constructor_with_type_param( - const ggml_tensor *op, const std::string &instance_name, std::shared_ptr qnn_instance) { + const ggml_tensor * op, const std::string & instance_name, std::shared_ptr qnn_instance) { static_assert(std::is_base_of::value); static_assert(_op < std::size(kOpCaps)); - constexpr auto &op_caps = kOpCaps[_op]; + constexpr auto & op_caps = kOpCaps[_op]; static_assert(op_caps.qnn_op_name != nullptr); _ggml_op_param_type op_param; @@ -245,113 +247,113 @@ std::shared_ptr op_constructor_with_type_param( } constexpr const op_constructor_t kOpConstructors[] = { - nullptr, // GGML_OP_NONE - nullptr, // GGML_OP_DUP - generic_op_constructor, // GGML_OP_ADD - nullptr, // GGML_OP_ADD1 - nullptr, // GGML_OP_ACC - generic_op_constructor, // GGML_OP_SUB - generic_op_constructor, // GGML_OP_MUL - generic_op_constructor, // GGML_OP_DIV - nullptr, // GGML_OP_SQR - generic_op_constructor, // GGML_OP_SQRT - generic_op_constructor, // GGML_OP_LOG - nullptr, // GGML_OP_SIN - nullptr, // GGML_OP_COS - nullptr, // GGML_OP_SUM - nullptr, // GGML_OP_SUM_ROWS - nullptr, // GGML_OP_MEAN - nullptr, // GGML_OP_ARGMAX - nullptr, // GGML_OP_COUNT_EQUAL - nullptr, // GGML_OP_REPEAT - nullptr, // GGML_OP_REPEAT_BACK - nullptr, // GGML_OP_CONCAT - nullptr, // GGML_OP_SILU_BACK - nullptr, // GGML_OP_NORM - op_constructor_with_type_param, // GGML_OP_RMS_NORM - nullptr, // GGML_OP_RMS_NORM_BACK - nullptr, // GGML_OP_GROUP_NORM - - mat_mul_op_constructor, // GGML_OP_MUL_MAT - nullptr, // GGML_OP_MUL_MAT_ID - nullptr, // GGML_OP_OUT_PROD - - nullptr, // GGML_OP_SCALE - nullptr, // GGML_OP_SET - nullptr, // GGML_OP_CPY - nullptr, // GGML_OP_CONT - generic_op_constructor, // GGML_OP_RESHAPE - nullptr, // GGML_OP_VIEW - nullptr, // GGML_OP_PERMUTE - nullptr, // GGML_OP_TRANSPOSE - nullptr, // GGML_OP_GET_ROWS - nullptr, // GGML_OP_GET_ROWS_BACK - nullptr, // GGML_OP_DIAG - nullptr, // GGML_OP_DIAG_MASK_INF - nullptr, // GGML_OP_DIAG_MASK_ZERO - nullptr, // GGML_OP_SOFT_MAX - nullptr, // GGML_OP_SOFT_MAX_BACK - nullptr, // GGML_OP_ROPE - nullptr, // GGML_OP_ROPE_BACK - nullptr, // GGML_OP_CLAMP - nullptr, // GGML_OP_CONV_TRANSPOSE_1D - nullptr, // GGML_OP_IM2COL - nullptr, // GGML_OP_IM2COL_BACK - nullptr, // GGML_OP_CONV_TRANSPOSE_2D - nullptr, // GGML_OP_POOL_1D - nullptr, // GGML_OP_POOL_2D - nullptr, // GGML_OP_POOL_2D_BACK - nullptr, // GGML_OP_UPSCALE - nullptr, // GGML_OP_PAD - nullptr, // GGML_OP_PAD_REFLECT_1D - nullptr, // GGML_OP_ARANGE - nullptr, // GGML_OP_TIMESTEP_EMBEDDING - nullptr, // GGML_OP_ARGSORT - nullptr, // GGML_OP_LEAKY_RELU - - nullptr, // GGML_OP_FLASH_ATTN_EXT - nullptr, // GGML_OP_FLASH_ATTN_BACK - nullptr, // GGML_OP_SSM_CONV - nullptr, // GGML_OP_SSM_SCAN - nullptr, // GGML_OP_WIN_PART - nullptr, // GGML_OP_WIN_UNPART - nullptr, // GGML_OP_GET_REL_POS - nullptr, // GGML_OP_ADD_REL_POS - nullptr, // GGML_OP_RWKV_WKV6 - nullptr, // GGML_OP_GATED_LINEAR_ATTN - - nullptr, // GGML_OP_UNARY - - nullptr, // GGML_OP_MAP_UNARY - nullptr, // GGML_OP_MAP_BINARY - - nullptr, // GGML_OP_MAP_CUSTOM1_F32 - nullptr, // GGML_OP_MAP_CUSTOM2_F32 - nullptr, // GGML_OP_MAP_CUSTOM3_F32 - - nullptr, // GGML_OP_MAP_CUSTOM1 - nullptr, // GGML_OP_MAP_CUSTOM2 - nullptr, // 
GGML_OP_MAP_CUSTOM3 - - nullptr, // GGML_OP_CROSS_ENTROPY_LOSS - nullptr, // GGML_OP_CROSS_ENTROPY_LOSS_BACK - nullptr, // GGML_OP_OPT_STEP_ADAMW + nullptr, // GGML_OP_NONE + nullptr, // GGML_OP_DUP + generic_op_constructor, // GGML_OP_ADD + nullptr, // GGML_OP_ADD1 + nullptr, // GGML_OP_ACC + generic_op_constructor, // GGML_OP_SUB + generic_op_constructor, // GGML_OP_MUL + generic_op_constructor, // GGML_OP_DIV + nullptr, // GGML_OP_SQR + generic_op_constructor, // GGML_OP_SQRT + generic_op_constructor, // GGML_OP_LOG + nullptr, // GGML_OP_SIN + nullptr, // GGML_OP_COS + nullptr, // GGML_OP_SUM + nullptr, // GGML_OP_SUM_ROWS + nullptr, // GGML_OP_MEAN + nullptr, // GGML_OP_ARGMAX + nullptr, // GGML_OP_COUNT_EQUAL + nullptr, // GGML_OP_REPEAT + nullptr, // GGML_OP_REPEAT_BACK + nullptr, // GGML_OP_CONCAT + nullptr, // GGML_OP_SILU_BACK + nullptr, // GGML_OP_NORM + op_constructor_with_type_param, // GGML_OP_RMS_NORM + nullptr, // GGML_OP_RMS_NORM_BACK + nullptr, // GGML_OP_GROUP_NORM + + mat_mul_op_constructor, // GGML_OP_MUL_MAT + nullptr, // GGML_OP_MUL_MAT_ID + nullptr, // GGML_OP_OUT_PROD + + nullptr, // GGML_OP_SCALE + nullptr, // GGML_OP_SET + nullptr, // GGML_OP_CPY + nullptr, // GGML_OP_CONT + generic_op_constructor, // GGML_OP_RESHAPE + nullptr, // GGML_OP_VIEW + nullptr, // GGML_OP_PERMUTE + nullptr, // GGML_OP_TRANSPOSE + nullptr, // GGML_OP_GET_ROWS + nullptr, // GGML_OP_GET_ROWS_BACK + nullptr, // GGML_OP_DIAG + nullptr, // GGML_OP_DIAG_MASK_INF + nullptr, // GGML_OP_DIAG_MASK_ZERO + nullptr, // GGML_OP_SOFT_MAX + nullptr, // GGML_OP_SOFT_MAX_BACK + nullptr, // GGML_OP_ROPE + nullptr, // GGML_OP_ROPE_BACK + nullptr, // GGML_OP_CLAMP + nullptr, // GGML_OP_CONV_TRANSPOSE_1D + nullptr, // GGML_OP_IM2COL + nullptr, // GGML_OP_IM2COL_BACK + nullptr, // GGML_OP_CONV_TRANSPOSE_2D + nullptr, // GGML_OP_POOL_1D + nullptr, // GGML_OP_POOL_2D + nullptr, // GGML_OP_POOL_2D_BACK + nullptr, // GGML_OP_UPSCALE + nullptr, // GGML_OP_PAD + nullptr, // GGML_OP_PAD_REFLECT_1D + nullptr, // GGML_OP_ARANGE + nullptr, // GGML_OP_TIMESTEP_EMBEDDING + nullptr, // GGML_OP_ARGSORT + nullptr, // GGML_OP_LEAKY_RELU + + nullptr, // GGML_OP_FLASH_ATTN_EXT + nullptr, // GGML_OP_FLASH_ATTN_BACK + nullptr, // GGML_OP_SSM_CONV + nullptr, // GGML_OP_SSM_SCAN + nullptr, // GGML_OP_WIN_PART + nullptr, // GGML_OP_WIN_UNPART + nullptr, // GGML_OP_GET_REL_POS + nullptr, // GGML_OP_ADD_REL_POS + nullptr, // GGML_OP_RWKV_WKV6 + nullptr, // GGML_OP_GATED_LINEAR_ATTN + + nullptr, // GGML_OP_UNARY + + nullptr, // GGML_OP_MAP_UNARY + nullptr, // GGML_OP_MAP_BINARY + + nullptr, // GGML_OP_MAP_CUSTOM1_F32 + nullptr, // GGML_OP_MAP_CUSTOM2_F32 + nullptr, // GGML_OP_MAP_CUSTOM3_F32 + + nullptr, // GGML_OP_MAP_CUSTOM1 + nullptr, // GGML_OP_MAP_CUSTOM2 + nullptr, // GGML_OP_MAP_CUSTOM3 + + nullptr, // GGML_OP_CROSS_ENTROPY_LOSS + nullptr, // GGML_OP_CROSS_ENTROPY_LOSS_BACK + nullptr, // GGML_OP_OPT_STEP_ADAMW // ggml_unary_op - nullptr, // GGML_UNARY_OP_ABS - nullptr, // GGML_UNARY_OP_SGN - nullptr, // GGML_UNARY_OP_NEG - nullptr, // GGML_UNARY_OP_STEP - nullptr, // GGML_UNARY_OP_TANH - nullptr, // GGML_UNARY_OP_ELU - nullptr, // GGML_UNARY_OP_RELU - nullptr, // GGML_UNARY_OP_SIGMOID - nullptr, // GGML_UNARY_OP_GELU - nullptr, // GGML_UNARY_OP_GELU_QUICK - nullptr, // GGML_UNARY_OP_SILU - nullptr, // GGML_UNARY_OP_HARDSWISH - nullptr, // GGML_UNARY_OP_HARDSIGMOID - nullptr, // GGML_UNARY_OP_EXP + nullptr, // GGML_UNARY_OP_ABS + nullptr, // GGML_UNARY_OP_SGN + nullptr, // GGML_UNARY_OP_NEG + nullptr, // GGML_UNARY_OP_STEP + 
nullptr, // GGML_UNARY_OP_TANH + nullptr, // GGML_UNARY_OP_ELU + nullptr, // GGML_UNARY_OP_RELU + nullptr, // GGML_UNARY_OP_SIGMOID + nullptr, // GGML_UNARY_OP_GELU + nullptr, // GGML_UNARY_OP_GELU_QUICK + nullptr, // GGML_UNARY_OP_SILU + nullptr, // GGML_UNARY_OP_HARDSWISH + nullptr, // GGML_UNARY_OP_HARDSIGMOID + nullptr, // GGML_UNARY_OP_EXP }; static_assert(kOpConstructors[GGML_OP_NONE] == nullptr, "GGML_OP_NONE does not match the nullptr function"); @@ -362,11 +364,11 @@ static_assert(kOpConstructors[GGML_OP_MUL_MAT] == mat_mul_op_constructor, static_assert(std::size(kOpConstructors) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT), "GGML_OP_COUNT does not match the size of the kOpConstructors table"); -} // namespace +} // namespace namespace qnn { -size_t get_qnn_op_index(const ggml_tensor *tensor) { +size_t get_qnn_op_index(const ggml_tensor * tensor) { if (tensor->op == GGML_OP_UNARY) { return kGgmlUnaryOpStart + ggml_get_unary_op(tensor); } @@ -374,20 +376,20 @@ size_t get_qnn_op_index(const ggml_tensor *tensor) { return tensor->op; } -const char *get_qnn_op_name(const ggml_tensor *op) { +const char * get_qnn_op_name(const ggml_tensor * op) { auto op_index = get_qnn_op_index(op); GGML_ASSERT(op_index < std::size(kOpCaps)); GGML_ASSERT(kOpCaps[op_index].qnn_op_name); return kOpCaps[op_index].qnn_op_name; } -size_t get_qnn_op_input_param_count(const ggml_tensor *op) { +size_t get_qnn_op_input_param_count(const ggml_tensor * op) { auto op_index = get_qnn_op_index(op); GGML_ASSERT(op_index < std::size(kOpCaps)); return kOpCaps[op_index].input_param_count; } -std::shared_ptr create_op(const ggml_tensor *op, const std::string &name, +std::shared_ptr create_op(const ggml_tensor * op, const std::string & name, std::shared_ptr qnn_instance) { auto op_index = get_qnn_op_index(op); GGML_ASSERT(op_index < std::size(kOpCaps)); @@ -396,4 +398,4 @@ std::shared_ptr create_op(const ggml_tensor *op, const std:: return op_constructor(op, name, qnn_instance); } -} // namespace qnn +} // namespace qnn diff --git a/ggml/src/ggml-qnn/op-config-impl.cpp b/ggml/src/ggml-qnn/op-config-impl.cpp index 1b05b3581a419..14638a554e066 100644 --- a/ggml/src/ggml-qnn/op-config-impl.cpp +++ b/ggml/src/ggml-qnn/op-config-impl.cpp @@ -6,14 +6,7 @@ namespace { -constexpr const qnn::qnn_dimension_array_t kTransposeParamData[GGML_MAX_DIMS] = { - {0}, - {1, 0}, - {0, 2, 1}, - {0, 1, 3, 2}, -}; - -qnn::qnn_dimension_array_t get_transposed_dimensions(const qnn::qnn_dimension_array_t &dimensions, int rank) { +qnn::qnn_dimension_array_t get_transposed_dimensions(const qnn::qnn_dimension_array_t & dimensions, int rank) { qnn::qnn_dimension_array_t transposed_dims = dimensions; if (rank >= 2) { transposed_dims[rank - 1] = dimensions[rank - 2]; @@ -23,11 +16,11 @@ qnn::qnn_dimension_array_t get_transposed_dimensions(const qnn::qnn_dimension_ar return transposed_dims; } -int get_rank(const qnn::ggml_tensor_array_t &tensor_inputs, const qnn::ggml_tensor_array_t &tensor_outputs) { +int get_rank(const qnn::ggml_tensor_array_t & tensor_inputs, const qnn::ggml_tensor_array_t & tensor_outputs) { return std::max(qnn::get_ggml_tensors_max_rank(tensor_inputs), qnn::get_ggml_tensors_max_rank(tensor_outputs)); } -Qnn_DataType_t get_tensor_type(const qnn::qnn_tensor_array_t &tensors) { +Qnn_DataType_t get_tensor_type(const qnn::qnn_tensor_array_t & tensors) { Qnn_DataType_t type = QNN_DATATYPE_UNDEFINED; for (auto tensor : tensors) { auto tensor_type_size = qnn::qnn_datatype_size(tensor->get_data_type()); @@ -40,67 +33,67 @@ Qnn_DataType_t 
get_tensor_type(const qnn::qnn_tensor_array_t &tensors) { return type; } -} // namespace +} // namespace namespace qnn { -void ggml_qnn_op_config_base::add_scalar_param(const std::string &name, const Qnn_Scalar_t scalar) { +void ggml_qnn_op_config_base::add_scalar_param(const std::string & name, const Qnn_Scalar_t scalar) { _param_names.push_back(name); Qnn_Param_t param = QNN_PARAM_INIT; - param.paramType = QNN_PARAMTYPE_SCALAR; - param.name = _param_names.back().c_str(); + param.paramType = QNN_PARAMTYPE_SCALAR; + param.name = _param_names.back().c_str(); param.scalarParam = scalar; _qnn_parameters.push_back(param); } -bool ggml_qnn_op_config_base::add_tensor_param(const std::string &name, const qnn_dimension_array_t &dimensions, - int rank, const uint8_t *data, const Qnn_DataType_t data_type, +bool ggml_qnn_op_config_base::add_tensor_param(const std::string & name, const qnn_dimension_array_t & dimensions, + int rank, const uint8_t * data, const Qnn_DataType_t data_type, QNNBackend device, Qnn_GraphHandle_t graph_handle) { - std::string tensor_name = _name + name + std::to_string(_tensor_parameters.size()); - auto param_tensor = std::make_shared(ggml_qnn_tensor::PARAMETER, tensor_name, dimensions, - data_type, rank, device, graph_handle, _qnn_instance); - size_t data_size = ggml_type_size(ggml_datatype_from_qnn_datatype(data_type)); + std::string tensor_name = _name + name + std::to_string(_tensor_parameters.size()); + auto param_tensor = std::make_shared(ggml_qnn_tensor::PARAMETER, tensor_name, dimensions, + data_type, rank, device, graph_handle, _qnn_instance); + size_t data_size = ggml_type_size(ggml_datatype_from_qnn_datatype(data_type)); for (int i = 0; i < rank; i++) { data_size *= dimensions[i]; } GGML_ASSERT(data_size > 0); if (!param_tensor->set_data_buffer(data, data_size)) { - QNN_LOG_ERROR("parameter tensor bind_buffer failed"); + QNN_LOG_ERROR("parameter tensor bind_buffer failed\n"); return false; } if (!param_tensor->alloc_qnn_tensor_id()) { - QNN_LOG_ERROR("parameter tensor alloc_qnn_tensor_id failed"); + QNN_LOG_ERROR("parameter tensor alloc_qnn_tensor_id failed\n"); return false; } _tensor_parameters.push_back(param_tensor); _param_names.push_back(name); Qnn_Param_t param = QNN_PARAM_INIT; - param.paramType = QNN_PARAMTYPE_TENSOR; - param.name = _param_names.back().c_str(); + param.paramType = QNN_PARAMTYPE_TENSOR; + param.name = _param_names.back().c_str(); param.tensorParam = param_tensor->get_qnn_tensor(); _qnn_parameters.push_back(param); return true; } -void ggml_qnn_op_config_base::set_input_tensors(qnn::qnn_tensor_array_t &tensor_inputs) { +void ggml_qnn_op_config_base::set_input_tensors(qnn::qnn_tensor_array_t & tensor_inputs) { _tensor_inputs = tensor_inputs; _qnn_tensor_inputs.resize(_tensor_inputs.size()); } -void ggml_qnn_op_config_base::set_input_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) { +void ggml_qnn_op_config_base::set_input_tensors(qnn::qnn_tensor_array_t && tensor_inputs) { _tensor_inputs = tensor_inputs; _qnn_tensor_inputs.resize(_tensor_inputs.size()); } -void ggml_qnn_op_config_base::set_output_tensors(qnn::qnn_tensor_array_t &tensor_outputs) { +void ggml_qnn_op_config_base::set_output_tensors(qnn::qnn_tensor_array_t & tensor_outputs) { _tensor_outputs = std::move(tensor_outputs); _qnn_tensor_outputs.resize(_tensor_outputs.size()); } -void ggml_qnn_op_config_base::set_output_tensors(qnn::qnn_tensor_array_t &&tensor_outputs) { +void ggml_qnn_op_config_base::set_output_tensors(qnn::qnn_tensor_array_t && tensor_outputs) { _tensor_outputs = 
std::move(tensor_outputs); _qnn_tensor_outputs.resize(_tensor_outputs.size()); } @@ -109,74 +102,80 @@ bool ggml_qnn_op_config_base::add_op_to_graph(Qnn_GraphHandle_t graph_handle) { GGML_ASSERT(_qnn_tensor_inputs.size() == _tensor_inputs.size()); GGML_ASSERT(_qnn_tensor_outputs.size() == _tensor_outputs.size()); - QNN_LOG_DEBUG("[%s]add to graph start", _name.c_str()); + QNN_LOG_DEBUG("[%s]add to graph start\n", _name.c_str()); for (size_t i = 0; i < _tensor_inputs.size(); i++) { auto tensor = _tensor_inputs[i]; if (!tensor->alloc_qnn_tensor_id()) { - QNN_LOG_ERROR("[%s]input tensor alloc_qnn_tensor_id failed", _name.c_str()); + QNN_LOG_ERROR("[%s]input tensor alloc_qnn_tensor_id failed\n", _name.c_str()); return false; } - QNN_LOG_DEBUG("[%s]input tensor id: %d", _name.c_str(), tensor->get_qnn_tensor_id()); + QNN_LOG_DEBUG("[%s]input tensor id: %d\n", _name.c_str(), tensor->get_qnn_tensor_id()); _qnn_tensor_inputs[i] = tensor->get_qnn_tensor(); } for (size_t i = 0; i < _tensor_outputs.size(); i++) { auto tensor = _tensor_outputs[i]; if (!tensor->alloc_qnn_tensor_id()) { - QNN_LOG_ERROR("[%s]output tensor alloc_qnn_tensor_id failed", _name.c_str()); + QNN_LOG_ERROR("[%s]output tensor alloc_qnn_tensor_id failed\n", _name.c_str()); return false; } - QNN_LOG_DEBUG("[%s]output tensor id: %d", _name.c_str(), tensor->get_qnn_tensor_id()); + QNN_LOG_DEBUG("[%s]output tensor id: %d\n", _name.c_str(), tensor->get_qnn_tensor_id()); _qnn_tensor_outputs[i] = tensor->get_qnn_tensor(); } auto qnn_interface = _qnn_instance->get_qnn_interface(); - auto error = qnn_interface->qnn_graph_add_node(graph_handle, get_op_config()); + auto error = qnn_interface->qnn_graph_add_node(graph_handle, get_op_config()); if (error != QNN_SUCCESS) { - QNN_LOG_ERROR("[%s]qnn_graph_add_node.error: %s", _name.c_str(), get_qnn_error_string(error)); + QNN_LOG_ERROR("[%s]qnn_graph_add_node.error: %s\n", _name.c_str(), get_qnn_error_string(error)); return false; } - QNN_LOG_DEBUG("[%s]added to graph succeed", _name.c_str()); + QNN_LOG_DEBUG("[%s]added to graph succeed\n", _name.c_str()); return true; } -bool ggml_qnn_op_config_base::bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) { +bool ggml_qnn_op_config_base::bind_input_tensors(const ggml_tensor_array_t & tensor_inputs) { GGML_ASSERT(tensor_inputs.size() == _tensor_inputs.size()); return qnn::bind_tensors(tensor_inputs, _tensor_inputs, _qnn_tensor_inputs); } -bool ggml_qnn_op_config_base::bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) { +bool ggml_qnn_op_config_base::bind_output_tensors(const ggml_tensor_array_t & tensor_outputs) { GGML_ASSERT(tensor_outputs.size() == _tensor_outputs.size()); return qnn::bind_tensors(tensor_outputs, _tensor_outputs, _qnn_tensor_outputs); } void ggml_qnn_op_config_base::unbind_input_tensors() { - for (auto &tensor : _tensor_inputs) { + for (auto & tensor : _tensor_inputs) { tensor->unbind(); } } void ggml_qnn_op_config_base::unbind_output_tensors() { - for (auto &tensor : _tensor_outputs) { + for (auto & tensor : _tensor_outputs) { tensor->unbind(); } } Qnn_OpConfig_t ggml_qnn_op_config_base::get_op_config() { - Qnn_OpConfig_t config = QNN_OPCONFIG_INIT; - config.version = QNN_OPCONFIG_VERSION_1; - auto &op_config = config.v1; - op_config.name = _name.c_str(); - op_config.packageName = _package_name.c_str(); - op_config.typeName = _op_type.c_str(); - op_config.numOfParams = (uint32_t)_qnn_parameters.size(); - op_config.params = _qnn_parameters.data(); - op_config.numOfInputs = (uint32_t)_qnn_tensor_inputs.size(); - 
op_config.inputTensors = _qnn_tensor_inputs.data(); - op_config.numOfOutputs = (uint32_t)_qnn_tensor_outputs.size(); + GGML_ASSERT(_qnn_parameters.size() == _param_names.size()); + + for (size_t i = 0; i < _qnn_parameters.size(); i++) { + _qnn_parameters[i].name = _param_names[i].c_str(); + } + + Qnn_OpConfig_t config = QNN_OPCONFIG_INIT; + config.version = QNN_OPCONFIG_VERSION_1; + auto & op_config = config.v1; + op_config.name = _name.c_str(); + op_config.packageName = _package_name.c_str(); + op_config.typeName = _op_type.c_str(); + op_config.numOfParams = (uint32_t) _qnn_parameters.size(); + op_config.params = _qnn_parameters.data(); + op_config.numOfInputs = (uint32_t) _qnn_tensor_inputs.size(); + op_config.inputTensors = _qnn_tensor_inputs.data(); + op_config.numOfOutputs = (uint32_t) _qnn_tensor_outputs.size(); op_config.outputTensors = _qnn_tensor_outputs.data(); return config; } @@ -188,33 +187,33 @@ bool ggml_qnn_single_op_config::initialize_op_nodes(QNNBackend device, Qnn_Graph } bool ggml_qnn_rmsnorm_op_config::initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle) { - constexpr const uint32_t kAxes[] = {0}; - add_tensor_param(QNN_OP_RMS_NORM_PARAM_AXES, {1}, 1, reinterpret_cast(kAxes), QNN_DATATYPE_UINT_32, - device, graph_handle); + constexpr const uint32_t kAxes[] = { 0 }; + add_tensor_param(QNN_OP_RMS_NORM_PARAM_AXES, { 1 }, 1, reinterpret_cast(kAxes), + QNN_DATATYPE_UINT_32, device, graph_handle); return true; } -void ggml_qnn_aggregate_op_config::set_input_tensors(qnn::qnn_tensor_array_t &tensor_inputs) { +void ggml_qnn_aggregate_op_config::set_input_tensors(qnn::qnn_tensor_array_t & tensor_inputs) { _tensor_inputs = tensor_inputs; } -void ggml_qnn_aggregate_op_config::set_input_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) { +void ggml_qnn_aggregate_op_config::set_input_tensors(qnn::qnn_tensor_array_t && tensor_inputs) { _tensor_inputs = std::move(tensor_inputs); } -void ggml_qnn_aggregate_op_config::set_output_tensors(qnn::qnn_tensor_array_t &tensor_outputs) { +void ggml_qnn_aggregate_op_config::set_output_tensors(qnn::qnn_tensor_array_t & tensor_outputs) { _tensor_outputs = tensor_outputs; } -void ggml_qnn_aggregate_op_config::set_output_tensors(qnn::qnn_tensor_array_t &&tensor_outputs) { +void ggml_qnn_aggregate_op_config::set_output_tensors(qnn::qnn_tensor_array_t && tensor_outputs) { _tensor_outputs = std::move(tensor_outputs); } -bool ggml_qnn_aggregate_op_config::bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) { +bool ggml_qnn_aggregate_op_config::bind_input_tensors(const ggml_tensor_array_t & tensor_inputs) { return qnn::bind_tensors(tensor_inputs, _tensor_inputs); } -bool ggml_qnn_aggregate_op_config::bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) { +bool ggml_qnn_aggregate_op_config::bind_output_tensors(const ggml_tensor_array_t & tensor_outputs) { return qnn::bind_tensors(tensor_outputs, _tensor_outputs); } @@ -223,18 +222,18 @@ bool ggml_qnn_matmul_op_config::initialize_op_nodes(QNNBackend device, Qnn_Graph GGML_ASSERT(_tensor_outputs.size() == 1); // create convert nodes - const auto tensor_rank = _tensor_inputs.front()->get_rank(); - qnn_tensor_array_t mat_mul_tensor_inputs = _tensor_inputs; + const auto tensor_rank = _tensor_inputs.front()->get_rank(); + qnn_tensor_array_t mat_mul_tensor_inputs = _tensor_inputs; qnn_tensor_array_t mat_mul_tensor_outputs = _tensor_outputs; if (!create_convert_nodes(device, graph_handle, tensor_rank, mat_mul_tensor_inputs, mat_mul_tensor_outputs)) { - QNN_LOG_ERROR("create 
convert nodes failed"); + QNN_LOG_ERROR("create convert nodes failed\n"); return false; } mat_mul_tensor_inputs.front() = create_gather_nodes(device, graph_handle, tensor_rank, mat_mul_tensor_inputs.front(), mat_mul_tensor_inputs.back()->get_dimensions()); - return create_mat_mul_nodes(device, graph_handle, tensor_rank, mat_mul_tensor_inputs, mat_mul_tensor_outputs); + return create_mat_mul_nodes(mat_mul_tensor_inputs, mat_mul_tensor_outputs); } qnn_tensor_ptr_t ggml_qnn_matmul_op_config::create_gather_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, @@ -244,9 +243,9 @@ qnn_tensor_ptr_t ggml_qnn_matmul_op_config::create_gather_nodes(QNNBackend devic return tensor_input; } - const auto &input_dimensions = tensor_input->get_dimensions(); - output_dimensions[rank - 1] = input_dimensions[rank - 1]; - output_dimensions[rank - 2] = input_dimensions[rank - 2]; + const auto & input_dimensions = tensor_input->get_dimensions(); + output_dimensions[rank - 1] = input_dimensions[rank - 1]; + output_dimensions[rank - 2] = input_dimensions[rank - 2]; const auto y = output_dimensions[rank - 3] / input_dimensions[rank - 3]; if (y == 1 && (rank == 3 || (rank == 4 && output_dimensions[rank - 4] == input_dimensions[rank - 4]))) { @@ -255,9 +254,9 @@ qnn_tensor_ptr_t ggml_qnn_matmul_op_config::create_gather_nodes(QNNBackend devic // create concat nodes, to convert tensor shape from [ne03, ne02, n, k] to [ne03 * x, ne02 * y, n, k] constexpr const auto create_node = - [](const std::string &name, const int rank, const int axis, const qnn_dimension_array_t &dimensions, + [](const std::string & name, const int rank, const int axis, const qnn_dimension_array_t & dimensions, qnn_tensor_ptr_t tensor_input, QNNBackend device, Qnn_GraphHandle_t graph_handle, - std::shared_ptr qnn_instance, qnn_tensor_ptr_t &tensor_output) -> qnn_op_config_ptr_t { + std::shared_ptr qnn_instance, qnn_tensor_ptr_t & tensor_output) -> qnn_op_config_ptr_t { auto gather_out = std::make_shared(ggml_qnn_tensor::INTERMEDIATE, name + "_out", dimensions, tensor_input->get_data_type(), rank, device, graph_handle, qnn_instance); @@ -265,32 +264,32 @@ qnn_tensor_ptr_t ggml_qnn_matmul_op_config::create_gather_nodes(QNNBackend devic qnn_instance); Qnn_Scalar_t scalar = QNN_SCALAR_INIT; - scalar.dataType = QNN_DATATYPE_INT_32; - scalar.int32Value = axis; + scalar.dataType = QNN_DATATYPE_INT_32; + scalar.int32Value = axis; gather_op->add_scalar_param(QNN_OP_GATHER_PARAM_AXIS, scalar); - gather_op->set_output_tensors({gather_out}); + gather_op->set_output_tensors({ gather_out }); // here we calculate the index mapping, will generate a 1d tensor like [0, 0, 0, 1, 1, 1, 2, 2, 2, ...], // by repeating each index [scale] times. 
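        // Example with assumed sizes (illustration only): if tensor_input has 2 slices along `axis`
        // and dimensions[axis] is 6, then scale == 3 and the loop below fills index_buffer with
        // [0, 0, 0, 1, 1, 1], so the gather op repeats each source slice 3 times along that axis.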
- const auto scale = dimensions[axis] / tensor_input->get_dimensions()[axis]; - auto index_buffer = std::make_shared(dimensions[axis] * sizeof(uint32_t)); + const auto scale = dimensions[axis] / tensor_input->get_dimensions()[axis]; + auto index_buffer = std::make_shared(dimensions[axis] * sizeof(uint32_t)); for (uint32_t *curr = reinterpret_cast(index_buffer->get_buffer()), *end = curr + dimensions[axis]; curr < end; curr++) { *curr = uint32_t((curr - reinterpret_cast(index_buffer->get_buffer())) / scale); } auto gather_index = std::make_shared( - ggml_qnn_tensor::PARAMETER, name + "_index", qnn_dimension_array_t{dimensions[axis]}, QNN_DATATYPE_UINT_32, - 1, device, graph_handle, qnn_instance); + ggml_qnn_tensor::PARAMETER, name + "_index", qnn_dimension_array_t{ dimensions[axis] }, + QNN_DATATYPE_UINT_32, 1, device, graph_handle, qnn_instance); gather_index->set_data_buffer(index_buffer); - gather_op->set_input_tensors({tensor_input, gather_index}); + gather_op->set_input_tensors({ tensor_input, gather_index }); tensor_output = gather_out; return gather_op; }; qnn_dimension_array_t intermediate_dimensions = input_dimensions; - intermediate_dimensions[rank - 3] = output_dimensions[rank - 3]; + intermediate_dimensions[rank - 3] = output_dimensions[rank - 3]; qnn_tensor_ptr_t gather0_out; _operations.push_back(create_node(_name + "_gather0", rank, rank - 3, intermediate_dimensions, tensor_input, device, graph_handle, _qnn_instance, gather0_out)); @@ -305,8 +304,8 @@ qnn_tensor_ptr_t ggml_qnn_matmul_op_config::create_gather_nodes(QNNBackend devic } bool ggml_qnn_matmul_op_config::create_convert_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, - qnn_tensor_array_t &tensor_inputs, - qnn_tensor_array_t &tensor_outputs) { + qnn_tensor_array_t & tensor_inputs, + qnn_tensor_array_t & tensor_outputs) { if (device == QNN_BACKEND_GPU) { // there's no convert op for GPU, so we should create matmul nodes directly. 
return true; @@ -314,7 +313,7 @@ bool ggml_qnn_matmul_op_config::create_convert_nodes(QNNBackend device, Qnn_Grap // create tensors for convert node auto tensor_type = get_tensor_type(tensor_inputs); - QNN_LOG_DEBUG("input tensor type: %s", qnn_datatype_to_string(tensor_type)); + QNN_LOG_DEBUG("input tensor type: %s\n", qnn_datatype_to_string(tensor_type)); for (size_t i = 0; i < tensor_inputs.size(); ++i) { // create input convert nodes @@ -327,10 +326,10 @@ bool ggml_qnn_matmul_op_config::create_convert_nodes(QNNBackend device, Qnn_Grap auto convert_out = std::make_shared(ggml_qnn_tensor::INTERMEDIATE, convert_name + "_out", convert_in->get_dimensions(), tensor_type, rank, device, graph_handle, _qnn_instance); - auto convert = std::make_shared(convert_name, QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_CONVERT, _qnn_instance); - convert->set_input_tensors({convert_in}); - convert->set_output_tensors({convert_out}); + auto convert = std::make_shared(convert_name, QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_CONVERT, _qnn_instance); + convert->set_input_tensors({ convert_in }); + convert->set_output_tensors({ convert_out }); tensor_inputs[i] = convert_out; _operations.push_back(convert); } @@ -338,14 +337,14 @@ bool ggml_qnn_matmul_op_config::create_convert_nodes(QNNBackend device, Qnn_Grap if (tensor_outputs.front()->get_data_type() != tensor_type) { // create output convert node std::string convert_name("convert_dst"); - auto convert_out = tensor_outputs.front(); - auto convert_in = std::make_shared(ggml_qnn_tensor::INTERMEDIATE, convert_name + "_in", - convert_out->get_dimensions(), tensor_type, rank, device, - graph_handle, _qnn_instance); + auto convert_out = tensor_outputs.front(); + auto convert_in = std::make_shared(ggml_qnn_tensor::INTERMEDIATE, convert_name + "_in", + convert_out->get_dimensions(), tensor_type, rank, device, + graph_handle, _qnn_instance); auto output_convert = std::make_shared(convert_name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_CONVERT, _qnn_instance); - output_convert->set_input_tensors({convert_in}); - output_convert->set_output_tensors({convert_out}); + output_convert->set_input_tensors({ convert_in }); + output_convert->set_output_tensors({ convert_out }); tensor_outputs.front() = convert_in; _operations.push_back(output_convert); } @@ -353,10 +352,8 @@ bool ggml_qnn_matmul_op_config::create_convert_nodes(QNNBackend device, Qnn_Grap return true; } -bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, - qnn_tensor_array_t &tensor_inputs, - qnn_tensor_array_t &tensor_outputs) { - +bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(qnn_tensor_array_t & tensor_inputs, + qnn_tensor_array_t & tensor_outputs) { /* * First, both the ggml and qnn tensor in memory are stored as row-major format. 
(For more details, please refer to: * https://pytorch.org/blog/tensor-memory-format-matters/#:~:text=Column%20Major%20Order:%20In%20this%20format,%20the%20matrix) @@ -395,8 +392,8 @@ bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_Grap * So here we need to create graph like: * ```mermaid * graph TD; - * i1>ggml_tensor_in0] --src1--> mat_mul0; - * i2>ggml_tensor_in1] --src0.T--> mat_mul0; + * i1>ggml_tensor_in1] --src0--> mat_mul0; + * i2>ggml_tensor_in0] --src1.T--> mat_mul0; * mat_mul0 --dst0--> o1>ggml_tensor_out]; * ``` */ @@ -411,8 +408,8 @@ bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_Grap std::make_shared(_name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL, _qnn_instance); Qnn_Scalar_t scalar = QNN_SCALAR_INIT; - scalar.dataType = QNN_DATATYPE_BOOL_8; - scalar.bool8Value = 1; + scalar.dataType = QNN_DATATYPE_BOOL_8; + scalar.bool8Value = 1; mat_mul->add_scalar_param(QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN1, scalar); // set tensor to mat_mul @@ -424,4 +421,4 @@ bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_Grap return true; } -} // namespace qnn +} // namespace qnn diff --git a/ggml/src/ggml-qnn/op-config-impl.hpp b/ggml/src/ggml-qnn/op-config-impl.hpp index 4a00ed2cc7ac3..8e2f107b2dae1 100644 --- a/ggml/src/ggml-qnn/op-config-impl.hpp +++ b/ggml/src/ggml-qnn/op-config-impl.hpp @@ -13,77 +13,83 @@ namespace qnn { class ggml_qnn_op_config_base : public ggml_qnn_op_config { -public: - explicit ggml_qnn_op_config_base(const std::string &name, const std::string &package_name, - const std::string &op_type, std::shared_ptr qnn_instance) - : _name(name), _package_name(package_name), _op_type(op_type), _qnn_instance(qnn_instance) {} - - void add_scalar_param(const std::string &name, const Qnn_Scalar_t scalar); - bool add_tensor_param(const std::string &name, const qnn_dimension_array_t &dimensions, int rank, - const uint8_t *data, const Qnn_DataType_t data_type, QNNBackend device, + public: + explicit ggml_qnn_op_config_base(const std::string & name, const std::string & package_name, + const std::string & op_type, std::shared_ptr qnn_instance) : + _name(name), + _package_name(package_name), + _op_type(op_type), + _qnn_instance(qnn_instance) {} + + void add_scalar_param(const std::string & name, const Qnn_Scalar_t scalar); + bool add_tensor_param(const std::string & name, const qnn_dimension_array_t & dimensions, int rank, + const uint8_t * data, const Qnn_DataType_t data_type, QNNBackend device, Qnn_GraphHandle_t graph_handle); - void set_input_tensors(qnn::qnn_tensor_array_t &tensor_inputs) override; - void set_input_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) override; - void set_output_tensors(qnn::qnn_tensor_array_t &tensor_inputs) override; - void set_output_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) override; + void set_input_tensors(qnn::qnn_tensor_array_t & tensor_inputs) override; + void set_input_tensors(qnn::qnn_tensor_array_t && tensor_inputs) override; + void set_output_tensors(qnn::qnn_tensor_array_t & tensor_inputs) override; + void set_output_tensors(qnn::qnn_tensor_array_t && tensor_inputs) override; bool add_op_to_graph(Qnn_GraphHandle_t graph_handle) override; - bool bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) override; - bool bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) override; + bool bind_input_tensors(const ggml_tensor_array_t & tensor_inputs) override; + bool bind_output_tensors(const ggml_tensor_array_t & tensor_outputs) override; void 
unbind_input_tensors() override; void unbind_output_tensors() override; - const qnn_tensor_array_t &get_input_tensors() override { return _tensor_inputs; } - const qnn_tensor_array_t &get_output_tensors() override { return _tensor_outputs; } -protected: + const qnn_tensor_array_t & get_input_tensors() override { return _tensor_inputs; } + + const qnn_tensor_array_t & get_output_tensors() override { return _tensor_outputs; } + + protected: Qnn_OpConfig_t get_op_config(); - std::string _name; - std::string _package_name; - std::string _op_type; + std::string _name; + std::string _package_name; + std::string _op_type; std::shared_ptr _qnn_instance; - qnn_tensor_array_t _tensor_inputs; - qnn_tensor_array_t _tensor_outputs; - qnn_tensor_array_t _tensor_parameters; - std::vector _qnn_tensor_inputs; - std::vector _qnn_tensor_outputs; - std::vector _qnn_parameters; - std::vector _param_names; + qnn_tensor_array_t _tensor_inputs; + qnn_tensor_array_t _tensor_outputs; + qnn_tensor_array_t _tensor_parameters; + std::vector _qnn_tensor_inputs; + std::vector _qnn_tensor_outputs; + std::vector _qnn_parameters; + std::vector _param_names; DISABLE_COPY(ggml_qnn_op_config_base); DISABLE_MOVE(ggml_qnn_op_config_base); }; class ggml_qnn_single_op_config : public ggml_qnn_op_config_base { -public: - explicit ggml_qnn_single_op_config(const std::string &name, const std::string &package_name, - const std::string &op_type, std::shared_ptr qnn_instance) - : ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {} + public: + explicit ggml_qnn_single_op_config(const std::string & name, const std::string & package_name, + const std::string & op_type, std::shared_ptr qnn_instance) : + ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {} bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle) override; -private: + private: DISABLE_COPY(ggml_qnn_single_op_config); DISABLE_MOVE(ggml_qnn_single_op_config); }; class ggml_qnn_rmsnorm_op_config : public ggml_qnn_op_config_base { -public: - explicit ggml_qnn_rmsnorm_op_config(const std::string &name, const std::string &package_name, - const std::string &op_type, std::shared_ptr qnn_instance) - : ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {} + public: + explicit ggml_qnn_rmsnorm_op_config(const std::string & name, const std::string & package_name, + const std::string & op_type, std::shared_ptr qnn_instance) : + ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {} bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle) override; -private: + private: DISABLE_COPY(ggml_qnn_rmsnorm_op_config); DISABLE_MOVE(ggml_qnn_rmsnorm_op_config); }; class ggml_qnn_aggregate_op_config : public ggml_qnn_op_config { -public: - explicit ggml_qnn_aggregate_op_config(const std::string &name, std::shared_ptr qnn_instance) - : _name(name), _qnn_instance(qnn_instance) {} + public: + explicit ggml_qnn_aggregate_op_config(const std::string & name, std::shared_ptr qnn_instance) : + _name(name), + _qnn_instance(qnn_instance) {} ~ggml_qnn_aggregate_op_config() { _tensor_inputs.clear(); @@ -91,61 +97,63 @@ class ggml_qnn_aggregate_op_config : public ggml_qnn_op_config { _operations.clear(); } - void set_input_tensors(qnn::qnn_tensor_array_t &tensor_inputs) override; - void set_input_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) override; - void set_output_tensors(qnn::qnn_tensor_array_t &tensor_inputs) override; - void set_output_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) override; + 
void set_input_tensors(qnn::qnn_tensor_array_t & tensor_inputs) override; + void set_input_tensors(qnn::qnn_tensor_array_t && tensor_inputs) override; + void set_output_tensors(qnn::qnn_tensor_array_t & tensor_inputs) override; + void set_output_tensors(qnn::qnn_tensor_array_t && tensor_inputs) override; + bool add_op_to_graph(Qnn_GraphHandle_t graph_handle) override { return qnn::add_op_to_graph(graph_handle, _operations); } - bool bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) override; - bool bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) override; + bool bind_input_tensors(const ggml_tensor_array_t & tensor_inputs) override; + bool bind_output_tensors(const ggml_tensor_array_t & tensor_outputs) override; + void unbind_input_tensors() override { - for (auto &tensor : _tensor_inputs) { + for (auto & tensor : _tensor_inputs) { tensor->unbind(); } } void unbind_output_tensors() override { - for (auto &tensor : _tensor_outputs) { + for (auto & tensor : _tensor_outputs) { tensor->unbind(); } } - const qnn_tensor_array_t &get_input_tensors() override { return _tensor_inputs; } - const qnn_tensor_array_t &get_output_tensors() override { return _tensor_outputs; } + const qnn_tensor_array_t & get_input_tensors() override { return _tensor_inputs; } + + const qnn_tensor_array_t & get_output_tensors() override { return _tensor_outputs; } -protected: - std::string _name; + protected: + std::string _name; std::shared_ptr _qnn_instance; std::vector _operations; - qnn_tensor_array_t _tensor_inputs; - qnn_tensor_array_t _tensor_outputs; + qnn_tensor_array_t _tensor_inputs; + qnn_tensor_array_t _tensor_outputs; -private: + private: DISABLE_COPY(ggml_qnn_aggregate_op_config); DISABLE_MOVE(ggml_qnn_aggregate_op_config); }; class ggml_qnn_matmul_op_config : public ggml_qnn_aggregate_op_config { -public: - ggml_qnn_matmul_op_config(const std::string &name, std::shared_ptr qnn_instance) - : ggml_qnn_aggregate_op_config(name, qnn_instance) {} + public: + ggml_qnn_matmul_op_config(const std::string & name, std::shared_ptr qnn_instance) : + ggml_qnn_aggregate_op_config(name, qnn_instance) {} bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle) override; -private: + private: qnn_tensor_ptr_t create_gather_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, qnn_tensor_ptr_t tensor_input, qnn_dimension_array_t output_dimensions); - bool create_convert_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, - qnn_tensor_array_t &tensor_inputs, qnn_tensor_array_t &tensor_outputs); - bool create_mat_mul_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, - qnn_tensor_array_t &tensor_inputs, qnn_tensor_array_t &tensor_outputs); + bool create_convert_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, + qnn_tensor_array_t & tensor_inputs, qnn_tensor_array_t & tensor_outputs); + bool create_mat_mul_nodes(qnn_tensor_array_t & tensor_inputs, qnn_tensor_array_t & tensor_outputs); DISABLE_COPY(ggml_qnn_matmul_op_config); DISABLE_MOVE(ggml_qnn_matmul_op_config); }; -} // namespace qnn +} // namespace qnn diff --git a/ggml/src/ggml-qnn/op-config.hpp b/ggml/src/ggml-qnn/op-config.hpp index 6b8c6946b8e86..d613a2116c04a 100644 --- a/ggml/src/ggml-qnn/op-config.hpp +++ b/ggml/src/ggml-qnn/op-config.hpp @@ -14,14 +14,14 @@ namespace qnn { constexpr const size_t kGgmlUnaryOpStart = GGML_OP_COUNT; -size_t get_qnn_op_index(const ggml_tensor *tensor); -const char *get_qnn_op_name(const ggml_tensor *op); 
-size_t get_qnn_op_input_param_count(const ggml_tensor *op); -std::shared_ptr create_op(const ggml_tensor *op, const std::string &name, +size_t get_qnn_op_index(const ggml_tensor * tensor); +const char * get_qnn_op_name(const ggml_tensor * op); +size_t get_qnn_op_input_param_count(const ggml_tensor * op); +std::shared_ptr create_op(const ggml_tensor * op, const std::string & name, std::shared_ptr qnn_instance); -inline bool add_op_to_graph(Qnn_GraphHandle_t graph_handle, std::vector &operations) { - for (auto &op : operations) { +inline bool add_op_to_graph(Qnn_GraphHandle_t graph_handle, std::vector & operations) { + for (auto & op : operations) { if (!op->add_op_to_graph(graph_handle)) { return false; } @@ -30,4 +30,4 @@ inline bool add_op_to_graph(Qnn_GraphHandle_t graph_handle, std::vector #if defined(__linux__) -#include +# include #endif namespace { #ifdef _WIN32 -constexpr const char *kQnnSystemLibName = "QnnSystem.dll"; -constexpr const char *kQnnRpcLibName = "libcdsprpc.dll"; +constexpr const char * kQnnSystemLibName = "QnnSystem.dll"; +constexpr const char * kQnnRpcLibName = "libcdsprpc.dll"; #else -constexpr const char *kQnnSystemLibName = "libQnnSystem.so"; -constexpr const char *kQnnRpcLibName = "libcdsprpc.so"; +constexpr const char * kQnnSystemLibName = "libQnnSystem.so"; +constexpr const char * kQnnRpcLibName = "libcdsprpc.so"; #endif -void insert_path(std::string &path, std::string insert_path, const char separator = ':') { +void insert_path(std::string & path, std::string insert_path, const char separator = ':') { if (!insert_path.empty() && !path.empty()) { insert_path += separator; } @@ -27,10 +27,10 @@ void insert_path(std::string &path, std::string insert_path, const char separato } // TODO: Fix this for other platforms, or use a more portable way to set the library search path -bool set_qnn_lib_search_path(const std::string &custom_lib_search_path) { +bool set_qnn_lib_search_path(const std::string & custom_lib_search_path) { #if defined(__linux__) { - auto *original = getenv("LD_LIBRARY_PATH"); + auto * original = getenv("LD_LIBRARY_PATH"); std::string lib_search_path = original ? 
original : ""; insert_path(lib_search_path, "/vendor/dsp/cdsp:/vendor/lib64:" @@ -41,7 +41,7 @@ bool set_qnn_lib_search_path(const std::string &custom_lib_search_path) { } } -#if defined(__ANDROID__) || defined(ANDROID) +# if defined(__ANDROID__) || defined(ANDROID) { // See also: https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-2/dsp_runtime.html std::string adsp_lib_search_path = custom_lib_search_path + @@ -51,87 +51,89 @@ bool set_qnn_lib_search_path(const std::string &custom_lib_search_path) { return false; } - QNN_LOG_DEBUG("ADSP_LIBRARY_PATH=%s", getenv("ADSP_LIBRARY_PATH")); + QNN_LOG_DEBUG("ADSP_LIBRARY_PATH=%s", getenv("ADSP_LIBRARY_PATH\n")); } -#endif +# endif - QNN_LOG_DEBUG("LD_LIBRARY_PATH=%s", getenv("LD_LIBRARY_PATH")); + QNN_LOG_DEBUG("LD_LIBRARY_PATH=%s", getenv("LD_LIBRARY_PATH\n")); #else - (void)custom_lib_search_path; + (void) custom_lib_search_path; #endif return true; } -qnn::dl_handler_t load_lib_with_fallback(const std::string &lib_path, const std::string &load_directory) { +qnn::dl_handler_t load_lib_with_fallback(const std::string & lib_path, const std::string & load_directory) { std::filesystem::path full_path(load_directory); full_path /= std::filesystem::path(lib_path).filename(); auto handle = qnn::dl_load(full_path.string()); if (!handle) { - QNN_LOG_WARN("failed to load %s, fallback to %s", full_path.c_str(), lib_path.c_str()); + QNN_LOG_WARN("failed to load %s, fallback to %s\n", full_path.c_str(), lib_path.c_str()); handle = qnn::dl_load(lib_path); } return handle; } -} // namespace +} // namespace namespace qnn { -qnn_system_interface::qnn_system_interface(const QnnSystemInterface_t &qnn_sys_interface, dl_handler_t lib_handle) - : _qnn_sys_interface(qnn_sys_interface), _lib_handle(lib_handle) { +qnn_system_interface::qnn_system_interface(const QnnSystemInterface_t & qnn_sys_interface, dl_handler_t lib_handle) : + _qnn_sys_interface(qnn_sys_interface), + _lib_handle(lib_handle) { qnn_system_context_create(&_qnn_system_handle); if (_qnn_system_handle) { - QNN_LOG_INFO("initialize qnn system successfully"); + QNN_LOG_INFO("initialize qnn system successfully\n"); } else { - QNN_LOG_WARN("can not create QNN system contenxt"); + QNN_LOG_WARN("can not create QNN system contenxt\n"); } } qnn_system_interface::~qnn_system_interface() { if (_qnn_system_handle) { if (qnn_system_context_free(_qnn_system_handle) != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN system context"); + QNN_LOG_WARN("failed to free QNN system context\n"); } } else { - QNN_LOG_WARN("system handle is null"); + QNN_LOG_WARN("system handle is null\n"); } if (_lib_handle) { if (!dl_unload(_lib_handle)) { - QNN_LOG_WARN("failed to close QnnSystem library, error %s", dl_error()); + QNN_LOG_WARN("failed to close QnnSystem library, error %s\n", dl_error()); } } else { - QNN_LOG_WARN("system lib handle is null"); + QNN_LOG_WARN("system lib handle is null\n"); } } -qnn_instance::qnn_instance(const std::string &lib_path, const std::string &backend_lib_name) - : _additional_lib_load_path(lib_path), _backend_lib_name(std::move(backend_lib_name)) { +qnn_instance::qnn_instance(const std::string & lib_path, const std::string & backend_lib_name) : + _additional_lib_load_path(lib_path), + _backend_lib_name(std::move(backend_lib_name)) { if (set_qnn_lib_search_path(lib_path)) { - QNN_LOG_DEBUG("[%s] set_qnn_lib_search_path succeed", _backend_lib_name.c_str()); + QNN_LOG_DEBUG("[%s] set_qnn_lib_search_path succeed\n", _backend_lib_name.c_str()); } else { - QNN_LOG_ERROR("[%s] 
set_qnn_lib_search_path failed", _backend_lib_name.c_str()); + QNN_LOG_ERROR("[%s] set_qnn_lib_search_path failed\n", _backend_lib_name.c_str()); } } -int qnn_instance::qnn_init(const QnnSaver_Config_t **saver_config) { +int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { BackendIdType backend_id = QNN_BACKEND_ID_NULL; - QNN_LOG_DEBUG("enter qnn_init"); + QNN_LOG_DEBUG("enter qnn_init\n"); std::lock_guard lock(_init_mutex); if (load_system() != 0) { - QNN_LOG_WARN("failed to load QNN system lib"); + QNN_LOG_WARN("failed to load QNN system lib\n"); return 1; } else { - QNN_LOG_DEBUG("load QNN system lib successfully"); + QNN_LOG_DEBUG("load QNN system lib successfully\n"); } std::string backend_lib_path = _backend_lib_name; if (_lib_path_to_backend_id.count(backend_lib_path) == 0) { if (load_backend(backend_lib_path, saver_config) != 0) { - QNN_LOG_WARN("failed to load QNN backend"); + QNN_LOG_WARN("failed to load QNN backend\n"); return 2; } } @@ -149,119 +151,119 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t **saver_config) { _qnn_interface->qnn_log_create(qnn::sdk_logcallback, _qnn_log_level, &_qnn_log_handle); if (!_qnn_log_handle) { // NPU backend not work on Qualcomm SoC equipped low-end phone - QNN_LOG_WARN("why failed to initialize qnn log"); + QNN_LOG_WARN("why failed to initialize qnn log\n"); return 4; } else { - QNN_LOG_DEBUG("initialize qnn log successfully"); + QNN_LOG_DEBUG("initialize qnn log successfully\n"); } std::vector temp_backend_config; _qnn_interface->qnn_backend_create( _qnn_log_handle, temp_backend_config.empty() ? nullptr : temp_backend_config.data(), &_qnn_backend_handle); if (!_qnn_backend_handle) { - QNN_LOG_WARN("why failed to initialize qnn backend"); + QNN_LOG_WARN("why failed to initialize qnn backend\n"); return 5; } else { - QNN_LOG_DEBUG("initialize qnn backend successfully"); + QNN_LOG_DEBUG("initialize qnn backend successfully\n"); } auto qnn_status = _qnn_interface->qnn_property_has_capability(QNN_PROPERTY_GROUP_DEVICE); if (QNN_PROPERTY_NOT_SUPPORTED == qnn_status) { - QNN_LOG_WARN("device property is not supported"); + QNN_LOG_WARN("device property is not supported\n"); } if (QNN_PROPERTY_ERROR_UNKNOWN_KEY == qnn_status) { - QNN_LOG_WARN("device property is not known to backend"); + QNN_LOG_WARN("device property is not known to backend\n"); } qnn_status = QNN_SUCCESS; if (_backend_lib_name.find("Htp") != _backend_lib_name.npos) { - const QnnDevice_PlatformInfo_t *p_info = nullptr; - qnn_status = _qnn_interface->qnn_device_get_platform_info(nullptr, &p_info); + const QnnDevice_PlatformInfo_t * p_info = nullptr; + qnn_status = _qnn_interface->qnn_device_get_platform_info(nullptr, &p_info); if (qnn_status == QNN_SUCCESS) { - QNN_LOG_INFO("device counts %d", p_info->v1.numHwDevices); - QnnDevice_HardwareDeviceInfo_t *infos = p_info->v1.hwDevices; + QNN_LOG_INFO("device counts %d\n", p_info->v1.numHwDevices); + QnnDevice_HardwareDeviceInfo_t * infos = p_info->v1.hwDevices; QnnHtpDevice_OnChipDeviceInfoExtension_t chipinfo = {}; for (uint32_t i = 0; i < p_info->v1.numHwDevices; i++) { - QNN_LOG_INFO("deviceID:%d, deviceType:%d, numCores %d", infos[i].v1.deviceId, infos[i].v1.deviceType, - infos[i].v1.numCores); + QNN_LOG_INFO("deviceID:%d, deviceType:%d, numCores %d\n", (int) infos[i].v1.deviceId, + (int) infos[i].v1.deviceType, (int) infos[i].v1.numCores); QnnDevice_DeviceInfoExtension_t devinfo = infos[i].v1.deviceInfoExtension; - chipinfo = devinfo->onChipDevice; - size_t htp_arch = (size_t)chipinfo.arch; - 
QNN_LOG_INFO("htp_type:%d(%s)", devinfo->devType, + chipinfo = devinfo->onChipDevice; + size_t htp_arch = (size_t) chipinfo.arch; + QNN_LOG_INFO("htp_type:%d(%s)\n", devinfo->devType, (devinfo->devType == QNN_HTP_DEVICE_TYPE_ON_CHIP) ? "ON_CHIP" : ""); - QNN_LOG_INFO("qualcomm soc_model:%d(%s), htp_arch:%d(%s), vtcm_size:%d MB", chipinfo.socModel, - qnn::get_chipset_desc(chipinfo.socModel), htp_arch, qnn::get_htparch_desc(htp_arch), - chipinfo.vtcmSize); - _soc_info = {chipinfo.socModel, htp_arch, chipinfo.vtcmSize}; + QNN_LOG_INFO("qualcomm soc_model:%d(%s), htp_arch:%d(%s), vtcm_size:%d MB\n", (int) chipinfo.socModel, + qnn::get_chipset_desc(chipinfo.socModel), (int) htp_arch, qnn::get_htparch_desc(htp_arch), + (int) chipinfo.vtcmSize); + _soc_info = { chipinfo.socModel, htp_arch, chipinfo.vtcmSize }; } _qnn_interface->qnn_device_free_platform_info(nullptr, p_info); } else { // For emulator, we can't get platform info - QNN_LOG_WARN("failed to get platform info, are we in emulator?"); - _soc_info = {NONE, UNKNOWN_SM, 0}; + QNN_LOG_WARN("failed to get platform info, are we in emulator?\n"); + _soc_info = { NONE, UNKNOWN_SM, 0 }; } QnnHtpDevice_CustomConfig_t soc_customconfig; - soc_customconfig.option = QNN_HTP_DEVICE_CONFIG_OPTION_SOC; + soc_customconfig.option = QNN_HTP_DEVICE_CONFIG_OPTION_SOC; soc_customconfig.socModel = _soc_info.soc_model; QnnDevice_Config_t soc_devconfig; - soc_devconfig.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; + soc_devconfig.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; soc_devconfig.customConfig = &soc_customconfig; QnnHtpDevice_CustomConfig_t arch_customconfig; - arch_customconfig.option = QNN_HTP_DEVICE_CONFIG_OPTION_ARCH; - arch_customconfig.arch.arch = (QnnHtpDevice_Arch_t)_soc_info.htp_arch; - arch_customconfig.arch.deviceId = 0; // Id of device to be used. 0 will use by default. + arch_customconfig.option = QNN_HTP_DEVICE_CONFIG_OPTION_ARCH; + arch_customconfig.arch.arch = (QnnHtpDevice_Arch_t) _soc_info.htp_arch; + arch_customconfig.arch.deviceId = 0; // Id of device to be used. 0 will use by default. QnnDevice_Config_t arch_devconfig; - arch_devconfig.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; + arch_devconfig.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; arch_devconfig.customConfig = &arch_customconfig; - const QnnDevice_Config_t *p_deviceconfig[] = {&soc_devconfig, &arch_devconfig, nullptr}; + const QnnDevice_Config_t * p_deviceconfig[] = { &soc_devconfig, &arch_devconfig, nullptr }; qnn_status = _qnn_interface->qnn_device_create(_qnn_log_handle, p_deviceconfig, &_qnn_device_handle); } else { qnn_status = _qnn_interface->qnn_device_create(_qnn_log_handle, nullptr, &_qnn_device_handle); } if (QNN_SUCCESS != qnn_status && QNN_DEVICE_ERROR_UNSUPPORTED_FEATURE != qnn_status) { - QNN_LOG_WARN("failed to create QNN device"); + QNN_LOG_WARN("failed to create QNN device\n"); } else { - QNN_LOG_INFO("create QNN device successfully"); + QNN_LOG_INFO("create QNN device successfully\n"); } if (_profile_level != sdk_profile_level::profile_off) { - QNN_LOG_INFO("profiling turned on; level = %d", _profile_level); + QNN_LOG_INFO("profiling turned on; level = %d\n", _profile_level); auto profile_level = _profile_level == sdk_profile_level::profile_detail ? 
QNN_PROFILE_LEVEL_DETAILED : QNN_PROFILE_LEVEL_BASIC; if (QNN_PROFILE_NO_ERROR != _qnn_interface->qnn_profile_create(_qnn_backend_handle, profile_level, &_qnn_profile_handle)) { - QNN_LOG_WARN("unable to create profile handle in the backend"); + QNN_LOG_WARN("unable to create profile handle in the backend\n"); return 6; } else { - QNN_LOG_DEBUG("initialize qnn profile successfully"); + QNN_LOG_DEBUG("initialize qnn profile successfully\n"); } } _rpc_lib_handle = load_lib_with_fallback(kQnnRpcLibName, _additional_lib_load_path); if (_rpc_lib_handle) { _pfn_rpc_mem_alloc = reinterpret_cast(dl_sym(_rpc_lib_handle, "rpcmem_alloc")); - _pfn_rpc_mem_free = reinterpret_cast(dl_sym(_rpc_lib_handle, "rpcmem_free")); + _pfn_rpc_mem_free = reinterpret_cast(dl_sym(_rpc_lib_handle, "rpcmem_free")); _pfn_rpc_mem_to_fd = reinterpret_cast(dl_sym(_rpc_lib_handle, "rpcmem_to_fd")); if (!_pfn_rpc_mem_alloc || !_pfn_rpc_mem_free || !_pfn_rpc_mem_to_fd) { - QNN_LOG_WARN("unable to access symbols in QNN RPC lib. error: %s", dl_error()); + QNN_LOG_WARN("unable to access symbols in QNN RPC lib. error: %s\n", dl_error()); dl_unload(_rpc_lib_handle); return 9; } - _pfn_rpc_mem_init = reinterpret_cast(dl_sym(_rpc_lib_handle, "rpcmem_init")); + _pfn_rpc_mem_init = reinterpret_cast(dl_sym(_rpc_lib_handle, "rpcmem_init")); _pfn_rpc_mem_deinit = reinterpret_cast(dl_sym(_rpc_lib_handle, "rpcmem_deinit")); if (_pfn_rpc_mem_init) { _pfn_rpc_mem_init(); } _rpcmem_initialized = true; - QNN_LOG_DEBUG("load rpcmem lib successfully"); + QNN_LOG_DEBUG("load rpcmem lib successfully\n"); } else { - QNN_LOG_WARN("failed to load qualcomm rpc lib, skipping, error:%s", dl_error()); + QNN_LOG_WARN("failed to load qualcomm rpc lib, skipping, error:%s\n", dl_error()); } /* TODO: not used, keep it for further usage @@ -271,23 +273,23 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t **saver_config) { */ _qnn_interface->qnn_context_create(_qnn_backend_handle, _qnn_device_handle, nullptr, &_qnn_context_handle); if (nullptr == _qnn_context_handle) { - QNN_LOG_WARN("why failed to initialize qnn context"); + QNN_LOG_WARN("why failed to initialize qnn context\n"); return 10; } else { - QNN_LOG_DEBUG("initialize qnn context successfully"); + QNN_LOG_DEBUG("initialize qnn context successfully\n"); } if (_backend_lib_name.find("Htp") != _backend_lib_name.npos) { // TODO: faster approach to probe the accurate capacity of rpc ion memory - size_t candidate_size = 0; - uint8_t *rpc_buffer = nullptr; - const int size_in_mb = (1 << 20); - size_t probe_slots[] = {1024, 1536, 2048 - 48, 2048}; - size_t probe_counts = sizeof(probe_slots) / sizeof(size_t); + size_t candidate_size = 0; + uint8_t * rpc_buffer = nullptr; + const int size_in_mb = (1 << 20); + size_t probe_slots[] = { 1024, 1536, 2048 - 48, 2048 }; + size_t probe_counts = sizeof(probe_slots) / sizeof(size_t); for (size_t idx = 0; idx < probe_counts; idx++) { rpc_buffer = static_cast(alloc_rpcmem(probe_slots[idx] * size_in_mb, sizeof(void *))); if (!rpc_buffer) { - QNN_LOG_DEBUG("alloc rpcmem %d (MB) failure, %s", probe_slots[idx], strerror(errno)); + QNN_LOG_DEBUG("alloc rpcmem %d (MB) failure, %s\n", (int) probe_slots[idx], strerror(errno)); break; } else { candidate_size = probe_slots[idx]; @@ -297,27 +299,27 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t **saver_config) { } _rpcmem_capacity = std::max(candidate_size, _rpcmem_capacity); - QNN_LOG_INFO("capacity of QNN rpc ion memory is about %d MB", _rpcmem_capacity); + QNN_LOG_INFO("capacity of QNN rpc ion memory is about %d MB\n", 
(int) _rpcmem_capacity); if (init_htp_perfinfra() != 0) { - QNN_LOG_WARN("initialize HTP performance failure"); + QNN_LOG_WARN("initialize HTP performance failure\n"); } if (set_rpc_polling() != 0) { - QNN_LOG_WARN("set RPC polling failure"); + QNN_LOG_WARN("set RPC polling failure\n"); } if (set_high_performance_mode() != 0) { - QNN_LOG_WARN("set HTP high performance mode failure"); + QNN_LOG_WARN("set HTP high performance mode failure\n"); } } - QNN_LOG_DEBUG("leave qnn_init"); + QNN_LOG_DEBUG("leave qnn_init\n"); return 0; } int qnn_instance::qnn_finalize() { - int ret_status = 0; - Qnn_ErrorHandle_t error = QNN_SUCCESS; + int ret_status = 0; + Qnn_ErrorHandle_t error = QNN_SUCCESS; if (_rpc_lib_handle) { if (_pfn_rpc_mem_deinit) { @@ -326,9 +328,9 @@ int qnn_instance::qnn_finalize() { } if (dl_unload(_rpc_lib_handle)) { - QNN_LOG_DEBUG("succeed to close rpcmem lib"); + QNN_LOG_DEBUG("succeed to close rpcmem lib\n"); } else { - QNN_LOG_WARN("failed to unload qualcomm's rpc lib, error:%s", dl_error()); + QNN_LOG_WARN("failed to unload qualcomm's rpc lib, error:%s\n", dl_error()); } } @@ -339,8 +341,8 @@ int qnn_instance::qnn_finalize() { if (_qnn_context_handle) { error = _qnn_interface->qnn_context_free(_qnn_context_handle, _qnn_profile_handle); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN context_handle: ID %u, error %d", _qnn_interface->get_backend_id(), - QNN_GET_ERROR_CODE(error)); + QNN_LOG_WARN("failed to free QNN context_handle: ID %u, error %d\n", _qnn_interface->get_backend_id(), + (int) QNN_GET_ERROR_CODE(error)); } _qnn_context_handle = nullptr; } @@ -348,8 +350,8 @@ int qnn_instance::qnn_finalize() { if (_qnn_profile_handle) { error = _qnn_interface->qnn_profile_free(_qnn_profile_handle); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN profile_handle: ID %u, error %d", _qnn_interface->get_backend_id(), - QNN_GET_ERROR_CODE(error)); + QNN_LOG_WARN("failed to free QNN profile_handle: ID %u, error %d\n", _qnn_interface->get_backend_id(), + (int) QNN_GET_ERROR_CODE(error)); } _qnn_profile_handle = nullptr; } @@ -357,8 +359,8 @@ int qnn_instance::qnn_finalize() { if (_qnn_device_handle) { error = _qnn_interface->qnn_device_free(_qnn_device_handle); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN device_handle: ID %u, error %d", _qnn_interface->get_backend_id(), - QNN_GET_ERROR_CODE(error)); + QNN_LOG_WARN("failed to free QNN device_handle: ID %u, error %d\n", _qnn_interface->get_backend_id(), + (int) QNN_GET_ERROR_CODE(error)); } _qnn_device_handle = nullptr; } @@ -366,17 +368,17 @@ int qnn_instance::qnn_finalize() { if (_qnn_backend_handle) { error = _qnn_interface->qnn_backend_free(_qnn_backend_handle); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN backend_handle: ID %u, error %d", _qnn_interface->get_backend_id(), - QNN_GET_ERROR_CODE(error)); + QNN_LOG_WARN("failed to free QNN backend_handle: ID %u, error %d\n", _qnn_interface->get_backend_id(), + (int) QNN_GET_ERROR_CODE(error)); } _qnn_backend_handle = nullptr; } - if (nullptr != _qnn_log_handle) { + if (_qnn_log_handle) { error = _qnn_interface->qnn_log_free(_qnn_log_handle); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN log_handle: ID %u, error %d", _qnn_interface->get_backend_id(), - QNN_GET_ERROR_CODE(error)); + QNN_LOG_WARN("failed to free QNN log_handle: ID %u, error %d\n", _qnn_interface->get_backend_id(), + (int) QNN_GET_ERROR_CODE(error)); } _qnn_log_handle = nullptr; } @@ -389,60 +391,60 @@ int qnn_instance::qnn_finalize() { } int 
qnn_instance::load_system() { - QNN_LOG_DEBUG("[%s]lib: %s", _backend_lib_name.c_str(), kQnnSystemLibName); + QNN_LOG_DEBUG("[%s]lib: %s\n", _backend_lib_name.c_str(), kQnnSystemLibName); auto system_lib_handle = load_lib_with_fallback(kQnnSystemLibName, _additional_lib_load_path); if (!system_lib_handle) { - QNN_LOG_WARN("can not load QNN library %s, error: %s", kQnnSystemLibName, dl_error()); + QNN_LOG_WARN("can not load QNN library %s, error: %s\n", kQnnSystemLibName, dl_error()); return 1; } - auto *get_providers = + auto * get_providers = dl_sym_typed(system_lib_handle, "QnnSystemInterface_getProviders"); if (!get_providers) { - QNN_LOG_WARN("can not load QNN symbol QnnSystemInterface_getProviders: %s", dl_error()); + QNN_LOG_WARN("can not load QNN symbol QnnSystemInterface_getProviders: %s\n", dl_error()); return 2; } - uint32_t num_providers = 0; - const QnnSystemInterface_t **provider_list = nullptr; - Qnn_ErrorHandle_t error = get_providers(&provider_list, &num_providers); + uint32_t num_providers = 0; + const QnnSystemInterface_t ** provider_list = nullptr; + Qnn_ErrorHandle_t error = get_providers(&provider_list, &num_providers); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to get providers, error %d", QNN_GET_ERROR_CODE(error)); + QNN_LOG_WARN("failed to get providers, error %d\n", (int) QNN_GET_ERROR_CODE(error)); return 3; } - QNN_LOG_DEBUG("num_providers: %d", num_providers); + QNN_LOG_DEBUG("num_providers: %d\n", num_providers); if (num_providers != _required_num_providers) { - QNN_LOG_WARN("providers is %d instead of required %d", num_providers, _required_num_providers); + QNN_LOG_WARN("providers is %d instead of required %d\n", (int) num_providers, (int) _required_num_providers); return 4; } if (!provider_list) { - QNN_LOG_WARN("can not get providers"); + QNN_LOG_WARN("can not get providers\n"); return 5; } QNN_SYSTEM_INTERFACE_VER_TYPE qnn_system_interface; - bool found_valid_system_interface = false; + bool found_valid_system_interface = false; for (size_t idx = 0; idx < num_providers; idx++) { if (QNN_SYSTEM_API_VERSION_MAJOR == provider_list[idx]->systemApiVersion.major && QNN_SYSTEM_API_VERSION_MINOR <= provider_list[idx]->systemApiVersion.minor) { found_valid_system_interface = true; - qnn_system_interface = provider_list[idx]->QNN_SYSTEM_INTERFACE_VER_NAME; + qnn_system_interface = provider_list[idx]->QNN_SYSTEM_INTERFACE_VER_NAME; break; } } if (!found_valid_system_interface) { - QNN_LOG_WARN("unable to find a valid qnn system interface"); + QNN_LOG_WARN("unable to find a valid qnn system interface\n"); return 6; } else { - QNN_LOG_DEBUG("find a valid qnn system interface"); + QNN_LOG_DEBUG("find a valid qnn system interface\n"); } auto qnn_sys_interface = std::make_shared(*provider_list[0], system_lib_handle); if (!qnn_sys_interface->is_valid()) { - QNN_LOG_WARN("failed to create QNN system interface"); + QNN_LOG_WARN("failed to create QNN system interface\n"); return 7; } @@ -450,79 +452,79 @@ int qnn_instance::load_system() { return 0; } -int qnn_instance::load_backend(std::string &lib_path, const QnnSaver_Config_t ** /*saver_config*/) { +int qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t ** /*saver_config*/) { Qnn_ErrorHandle_t error = QNN_SUCCESS; - QNN_LOG_DEBUG("lib_path:%s", lib_path.c_str()); + QNN_LOG_DEBUG("lib_path:%s\n", lib_path.c_str()); auto lib_handle = load_lib_with_fallback(lib_path, _additional_lib_load_path); if (!lib_handle) { - QNN_LOG_WARN("can not open QNN library %s, with error: %s", lib_path.c_str(), 
dl_error()); + QNN_LOG_WARN("can not open QNN library %s, with error: %s\n", lib_path.c_str(), dl_error()); return 1; } auto get_providers = dl_sym_typed(lib_handle, "QnnInterface_getProviders"); if (!get_providers) { - QNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s", dl_error()); + QNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s\n", dl_error()); return 2; } - std::uint32_t num_providers = 0; - const QnnInterface_t **provider_list = nullptr; - error = get_providers(&provider_list, &num_providers); + std::uint32_t num_providers = 0; + const QnnInterface_t ** provider_list = nullptr; + error = get_providers(&provider_list, &num_providers); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to get providers, error %d", QNN_GET_ERROR_CODE(error)); + QNN_LOG_WARN("failed to get providers, error %d\n", (int) QNN_GET_ERROR_CODE(error)); return 3; } - QNN_LOG_DEBUG("num_providers=%d", num_providers); + QNN_LOG_DEBUG("num_providers=%d\n", num_providers); if (num_providers != _required_num_providers) { - QNN_LOG_WARN("providers is %d instead of required %d", num_providers, _required_num_providers); + QNN_LOG_WARN("providers is %d instead of required %d\n", num_providers, _required_num_providers); return 4; } if (!provider_list) { - QNN_LOG_WARN("failed to get qnn interface providers"); + QNN_LOG_WARN("failed to get qnn interface providers\n"); return 5; } - bool found_valid_interface = false; + bool found_valid_interface = false; QNN_INTERFACE_VER_TYPE qnn_interface; for (size_t idx = 0; idx < num_providers; idx++) { if (QNN_API_VERSION_MAJOR == provider_list[idx]->apiVersion.coreApiVersion.major && QNN_API_VERSION_MINOR <= provider_list[idx]->apiVersion.coreApiVersion.minor) { found_valid_interface = true; - qnn_interface = provider_list[idx]->QNN_INTERFACE_VER_NAME; + qnn_interface = provider_list[idx]->QNN_INTERFACE_VER_NAME; break; } } if (!found_valid_interface) { - QNN_LOG_WARN("unable to find a valid qnn interface"); + QNN_LOG_WARN("unable to find a valid qnn interface\n"); return 6; } else { - QNN_LOG_DEBUG("find a valid qnn interface"); + QNN_LOG_DEBUG("find a valid qnn interface\n"); } - BackendIdType backend_id = provider_list[0]->backendId; + BackendIdType backend_id = provider_list[0]->backendId; _lib_path_to_backend_id[lib_path] = backend_id; if (_loaded_backend.count(backend_id) > 0) { - QNN_LOG_WARN("lib_path %s is loaded, but backend %d already exists", lib_path.c_str(), backend_id); + QNN_LOG_WARN("lib_path %s is loaded, but backend %d already exists\n", lib_path.c_str(), backend_id); } _loaded_backend[backend_id] = provider_list[0]; if (_loaded_lib_handle.count(backend_id) > 0) { - QNN_LOG_WARN("closing %p", _loaded_lib_handle[backend_id]); + QNN_LOG_WARN("closing %p\n", _loaded_lib_handle[backend_id]); if (!dl_unload(_loaded_lib_handle[backend_id])) { - QNN_LOG_WARN("fail to close %p with error %s", _loaded_lib_handle[backend_id], dl_error()); + QNN_LOG_WARN("fail to close %p with error %s\n", _loaded_lib_handle[backend_id], dl_error()); } } _loaded_lib_handle[backend_id] = lib_handle; - _backend_id = backend_id; + _backend_id = backend_id; return 0; } int qnn_instance::unload_backend() { - for (auto &it : _loaded_lib_handle) { + for (auto & it : _loaded_lib_handle) { if (!dl_unload(it.second)) { - QNN_LOG_WARN("failed to close QNN backend %d, error %s", it.first, dl_error()); + QNN_LOG_WARN("failed to close QNN backend %d, error %s\n", it.first, dl_error()); } } @@ -533,4 +535,4 @@ int qnn_instance::unload_backend() { return 0; } -} // 
namespace qnn +} // namespace qnn diff --git a/ggml/src/ggml-qnn/qnn-lib.hpp b/ggml/src/ggml-qnn/qnn-lib.hpp index 968df5bcf297d..bb6006acda19c 100644 --- a/ggml/src/ggml-qnn/qnn-lib.hpp +++ b/ggml/src/ggml-qnn/qnn-lib.hpp @@ -24,7 +24,7 @@ #include #include -#include "dl_loader.hpp" +#include "dl-loader.hpp" #include "qnn-types.hpp" #include "utils.hpp" @@ -42,16 +42,15 @@ namespace qnn { #pragma GCC diagnostic ignored "-Wpedantic" class qnn_system_interface { - #define DEFINE_SHIM_FUNCTION_SYS_INTERFACE(F, pointer_name) \ - template \ - inline auto qnn_##F(Args... args) const { \ + template inline auto qnn_##F(Args... args) const { \ return (_qnn_sys_interface.QNN_SYSTEM_INTERFACE_VER_NAME.pointer_name)(std::forward(args)...); \ } -public: - qnn_system_interface(const QnnSystemInterface_t &qnn_sys_interface, dl_handler_t lib_handle); + public: + qnn_system_interface(const QnnSystemInterface_t & qnn_sys_interface, dl_handler_t lib_handle); ~qnn_system_interface(); + bool is_valid() const { return _qnn_system_handle != nullptr; } // QnnSystem @@ -61,27 +60,25 @@ class qnn_system_interface { DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_free, systemContextFree); -private: + private: qnn_system_interface(const qnn_system_interface &) = delete; - void operator=(const qnn_system_interface &) = delete; - qnn_system_interface(qnn_system_interface &&) = delete; - void operator=(qnn_system_interface &&) = delete; + void operator=(const qnn_system_interface &) = delete; + qnn_system_interface(qnn_system_interface &&) = delete; + void operator=(qnn_system_interface &&) = delete; const QnnSystemInterface_t _qnn_sys_interface = {}; - dl_handler_t _lib_handle = nullptr; - QnnSystemContext_Handle_t _qnn_system_handle = nullptr; + dl_handler_t _lib_handle = nullptr; + QnnSystemContext_Handle_t _qnn_system_handle = nullptr; }; class qnn_interface { - #define DEFINE_SHIM_FUNCTION_INTERFACE(F, pointer_name) \ - template \ - inline auto qnn_##F(Args... args) const { \ + template inline auto qnn_##F(Args... 
args) const { \ return (_qnn_interface.QNN_INTERFACE_VER_NAME.pointer_name)(std::forward(args)...); \ } -public: - qnn_interface(const QnnInterface_t &qnn_interface) : _qnn_interface(qnn_interface) {} + public: + qnn_interface(const QnnInterface_t & qnn_interface) : _qnn_interface(qnn_interface) {} // QnnBackend DEFINE_SHIM_FUNCTION_INTERFACE(backend_create, backendCreate); @@ -161,11 +158,11 @@ class qnn_interface { uint32_t get_backend_id() const { return _qnn_interface.backendId; } -private: - qnn_interface(const qnn_interface &) = delete; + private: + qnn_interface(const qnn_interface &) = delete; void operator=(const qnn_interface &) = delete; - qnn_interface(qnn_interface &&) = delete; - void operator=(qnn_interface &&) = delete; + qnn_interface(qnn_interface &&) = delete; + void operator=(qnn_interface &&) = delete; const QnnInterface_t _qnn_interface = {}; }; @@ -173,17 +170,19 @@ class qnn_interface { #pragma GCC diagnostic pop class qnn_instance { -public: + public: using BackendIdType = decltype(QnnInterface_t{}.backendId); - explicit qnn_instance(const std::string &lib_path, const std::string &backend_lib_name); + explicit qnn_instance(const std::string & lib_path, const std::string & backend_lib_name); + ~qnn_instance() {} - int qnn_init(const QnnSaver_Config_t **saver_config); + + int qnn_init(const QnnSaver_Config_t ** saver_config); int qnn_finalize(); std::shared_ptr get_qnn_interface() { if (!_qnn_interface) { - QNN_LOG_WARN("pls check why _qnn_interface is not loaded"); + QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); } return _qnn_interface; } @@ -202,26 +201,26 @@ class qnn_instance { int init_htp_perfinfra() { QnnDevice_Infrastructure_t device_infra = nullptr; - auto error = _qnn_interface->qnn_device_get_infrastructure(&device_infra); + auto error = _qnn_interface->qnn_device_get_infrastructure(&device_infra); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to get qnn device infra"); + QNN_LOG_WARN("failed to get qnn device infra\n"); return 1; } else { - QNN_LOG_INFO("HTP backend perf_infrastructure creation ok"); + QNN_LOG_INFO("HTP backend perf_infrastructure creation ok\n"); } - QnnHtpDevice_Infrastructure_t *htp_infra = static_cast(device_infra); - QnnHtpDevice_PerfInfrastructure_t *htp_perfinfra = &htp_infra->perfInfra; - uint32_t power_configid = 1; - uint32_t device_id = 0; - uint32_t core_id = 0; + QnnHtpDevice_Infrastructure_t * htp_infra = static_cast(device_infra); + QnnHtpDevice_PerfInfrastructure_t * htp_perfinfra = &htp_infra->perfInfra; + uint32_t power_configid = 1; + uint32_t device_id = 0; + uint32_t core_id = 0; htp_perfinfra->createPowerConfigId(device_id, core_id, &power_configid); if (htp_infra->infraType != QNN_HTP_DEVICE_INFRASTRUCTURE_TYPE_PERF) { - QNN_LOG_INFO("HTP infra type = %d, which is not perf infra type", htp_infra->infraType); + QNN_LOG_INFO("HTP infra type = %d, which is not perf infra type\n", htp_infra->infraType); } else { - QNN_LOG_INFO("HTP infra type = %d, which is perf infra type", htp_infra->infraType); + QNN_LOG_INFO("HTP infra type = %d, which is perf infra type\n", htp_infra->infraType); } - _qnn_htp_perfinfra = htp_perfinfra; + _qnn_htp_perfinfra = htp_perfinfra; _qnn_power_configid = power_configid; return 0; @@ -231,7 +230,7 @@ class qnn_instance { if (_qnn_htp_perfinfra) { QnnHtpPerfInfrastructure_PowerConfig_t rpc_polling_time; memset(&rpc_polling_time, 0, sizeof(rpc_polling_time)); - rpc_polling_time.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME; + 
rpc_polling_time.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME; // use rpc polling time recommended 0-10000 us rpc_polling_time.rpcPollingTimeConfig = 9999; @@ -241,16 +240,16 @@ class qnn_instance { // use rpc control latency recommended 100 us, refer hexagon sdk rpc_control_latency.rpcControlLatencyConfig = 100; - const QnnHtpPerfInfrastructure_PowerConfig_t *power_configs[] = {&rpc_polling_time, &rpc_control_latency, - nullptr}; + const QnnHtpPerfInfrastructure_PowerConfig_t * power_configs[] = { &rpc_polling_time, &rpc_control_latency, + nullptr }; Qnn_ErrorHandle_t qnn_status = _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, power_configs); if (qnn_status != QNN_SUCCESS) { - QNN_LOG_WARN("set htp perf failed"); + QNN_LOG_WARN("set htp perf failed\n"); } else { - QNN_LOG_DEBUG("set htp perf ok"); + QNN_LOG_DEBUG("set htp perf ok\n"); } } else { - QNN_LOG_WARN("can't set htp perf"); + QNN_LOG_WARN("can't set htp perf\n"); } return 0; @@ -258,7 +257,7 @@ class qnn_instance { int set_high_performance_mode() { if (nullptr == _qnn_htp_perfinfra) { - QNN_LOG_WARN("perf intra is null"); + QNN_LOG_WARN("perf intra is null\n"); return 1; } @@ -266,83 +265,83 @@ class qnn_instance { memset(&power_config, 0, sizeof(power_config)); power_config.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_V3; - power_config.dcvsV3Config.setDcvsEnable = 1; - power_config.dcvsV3Config.dcvsEnable = 0; - power_config.dcvsV3Config.contextId = _qnn_power_configid; - power_config.dcvsV3Config.powerMode = QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_PERFORMANCE_MODE; - power_config.dcvsV3Config.setSleepLatency = 1; // true to consider Latency parameter otherwise false - power_config.dcvsV3Config.sleepLatency = 40; - power_config.dcvsV3Config.setBusParams = 1; // true to consider Bus parameter otherwise false - power_config.dcvsV3Config.setCoreParams = 1; // true to consider Core parameter otherwise false - power_config.dcvsV3Config.sleepDisable = 1; // true to consider sleep/LPM modes, false to enable + power_config.dcvsV3Config.setDcvsEnable = 1; + power_config.dcvsV3Config.dcvsEnable = 0; + power_config.dcvsV3Config.contextId = _qnn_power_configid; + power_config.dcvsV3Config.powerMode = QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_PERFORMANCE_MODE; + power_config.dcvsV3Config.setSleepLatency = 1; // true to consider Latency parameter otherwise false + power_config.dcvsV3Config.sleepLatency = 40; + power_config.dcvsV3Config.setBusParams = 1; // true to consider Bus parameter otherwise false + power_config.dcvsV3Config.setCoreParams = 1; // true to consider Core parameter otherwise false + power_config.dcvsV3Config.sleepDisable = 1; // true to consider sleep/LPM modes, false to enable power_config.dcvsV3Config.setSleepDisable = - 1; // true to consider sleep disable/enable parameter otherwise false set sleep latency parameter + 1; // true to consider sleep disable/enable parameter otherwise false set sleep latency parameter // set Bus Clock Parameters - power_config.dcvsV3Config.busVoltageCornerMin = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - power_config.dcvsV3Config.busVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - power_config.dcvsV3Config.busVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + power_config.dcvsV3Config.busVoltageCornerMin = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + power_config.dcvsV3Config.busVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + power_config.dcvsV3Config.busVoltageCornerMax = 
DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; // set Core Clock Parameters - power_config.dcvsV3Config.coreVoltageCornerMin = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + power_config.dcvsV3Config.coreVoltageCornerMin = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; power_config.dcvsV3Config.coreVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - power_config.dcvsV3Config.coreVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + power_config.dcvsV3Config.coreVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; // set power config with different performance parameters - const QnnHtpPerfInfrastructure_PowerConfig_t *power_configs[] = {&power_config, nullptr}; - Qnn_ErrorHandle_t qnn_status = QNN_SUCCESS; + const QnnHtpPerfInfrastructure_PowerConfig_t * power_configs[] = { &power_config, nullptr }; + Qnn_ErrorHandle_t qnn_status = QNN_SUCCESS; qnn_status = _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, power_configs); if (qnn_status != QNN_SUCCESS) { - QNN_LOG_WARN("set htp high performance mode failed"); + QNN_LOG_WARN("set htp high performance mode failed\n"); } else { - QNN_LOG_DEBUG("set htp high performance mode ok"); + QNN_LOG_DEBUG("set htp high performance mode ok\n"); } return 0; } - std::string &get_qnn_graph_name() { return _graph_name; } + std::string & get_qnn_graph_name() { return _graph_name; } bool is_rpcmem_initialized() { return _rpcmem_initialized; } size_t get_rpcmem_capacity() { return _rpcmem_capacity; } - void *alloc_rpcmem(size_t bytes, size_t alignment) { + void * alloc_rpcmem(size_t bytes, size_t alignment) { if (!_rpcmem_initialized) { - QNN_LOG_WARN("rpc memory not initialized"); + QNN_LOG_WARN("rpc memory not initialized\n"); return nullptr; } - auto allocate_bytes = static_cast(bytes + alignment); - void *buf = _pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, (int)allocate_bytes); + auto allocate_bytes = static_cast(bytes + alignment); + void * buf = _pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, (int) allocate_bytes); if (!buf) { - QNN_LOG_WARN("failed to allocate rpc memory, size: %d MB", (int)(allocate_bytes / (1 << 20))); + QNN_LOG_WARN("failed to allocate rpc memory, size: %d MB\n", (int) (allocate_bytes / (1 << 20))); return nullptr; } auto aligned_buf = reinterpret_cast(qnn::align_to(alignment, reinterpret_cast(buf))); - bool status = _rpcmem_store_map.insert(std::pair(aligned_buf, buf)).second; + bool status = _rpcmem_store_map.insert(std::pair(aligned_buf, buf)).second; if (!status) { - QNN_LOG_WARN("failed to allocate rpc memory"); + QNN_LOG_WARN("failed to allocate rpc memory\n"); _pfn_rpc_mem_free(buf); } return aligned_buf; } - void free_rpcmem(void *buf) { + void free_rpcmem(void * buf) { if (!_rpcmem_initialized) { - QNN_LOG_WARN("rpc memory not initialized"); + QNN_LOG_WARN("rpc memory not initialized\n"); } else if (_rpcmem_store_map.count(buf) == 0) { - QNN_LOG_WARN("no allocated tensor"); + QNN_LOG_WARN("no allocated tensor\n"); } else { _pfn_rpc_mem_free(_rpcmem_store_map[buf]); _rpcmem_store_map.erase(buf); } } - int32_t rpcmem_to_fd(void *buf) { + int32_t rpcmem_to_fd(void * buf) { int32_t mem_fd = -1; if (!is_rpcmem_initialized()) { - QNN_LOG_WARN("rpc memory not initialized"); + QNN_LOG_WARN("rpc memory not initialized\n"); } else { mem_fd = _pfn_rpc_mem_to_fd(buf); } @@ -350,74 +349,80 @@ class qnn_instance { return mem_fd; } - Qnn_MemHandle_t register_rpcmem(void *p_data, const uint32_t rank, uint32_t *dimensions, Qnn_DataType_t data_type) { + Qnn_MemHandle_t register_rpcmem(void * 
p_data, const uint32_t rank, uint32_t * dimensions, + Qnn_DataType_t data_type) { if (!p_data) { - QNN_LOG_WARN("invalid param"); + QNN_LOG_WARN("invalid param\n"); return nullptr; } if (!is_rpcmem_initialized()) { - QNN_LOG_WARN("rpc memory not initialized"); + QNN_LOG_WARN("rpc memory not initialized\n"); return nullptr; } if (is_rpcmem_registered(p_data)) { - QNN_LOG_WARN("rpc memory already registered"); + QNN_LOG_WARN("rpc memory already registered\n"); return _qnn_rpc_buffer_to_handles[p_data]; } auto mem_fd = rpcmem_to_fd(p_data); if (mem_fd == -1) { - QNN_LOG_WARN("failed to get file descriptor"); + QNN_LOG_WARN("failed to get file descriptor\n"); return nullptr; } - QNN_LOG_DEBUG("mem_fd %d", mem_fd); - Qnn_MemDescriptor_t descriptor = {{rank, dimensions, nullptr}, data_type, QNN_MEM_TYPE_ION, {{mem_fd}}}; + QNN_LOG_DEBUG("mem_fd %d\n", mem_fd); + Qnn_MemDescriptor_t descriptor = { + { rank, dimensions, nullptr }, + data_type, QNN_MEM_TYPE_ION, { { mem_fd } } + }; Qnn_MemHandle_t handle = nullptr; - auto error = _qnn_interface->qnn_mem_register(_qnn_context_handle, &descriptor, - /*numDescriptors=*/1, &handle); + auto error = _qnn_interface->qnn_mem_register(_qnn_context_handle, &descriptor, + /*numDescriptors=*/1, &handle); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to register shared memory, error %d, %s", QNN_GET_ERROR_CODE(error), strerror(error)); + QNN_LOG_WARN("failed to register shared memory, error %d, %s\n", (int) QNN_GET_ERROR_CODE(error), + strerror(error)); return nullptr; } - _qnn_rpc_buffer_to_handles.insert({p_data, handle}); - QNN_LOG_DEBUG("successfully register shared memory handler: %p", handle); + _qnn_rpc_buffer_to_handles.insert({ p_data, handle }); + QNN_LOG_DEBUG("successfully register shared memory handler: %p\n", handle); return handle; } void unregister_rpcmem(Qnn_MemHandle_t mem_handle) { auto error = _qnn_interface->qnn_mem_de_register(&mem_handle, 1); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to unregister shared memory, error %d", QNN_GET_ERROR_CODE(error)); + QNN_LOG_WARN("failed to unregister shared memory, error %d\n", (int) QNN_GET_ERROR_CODE(error)); } auto it = std::find_if(_qnn_rpc_buffer_to_handles.begin(), _qnn_rpc_buffer_to_handles.end(), - [mem_handle](const auto &kv) { return kv.second == mem_handle; }); + [mem_handle](const auto & kv) { return kv.second == mem_handle; }); if (it == _qnn_rpc_buffer_to_handles.end()) { - QNN_LOG_WARN("failed to find shared memory handler: %p", mem_handle); + QNN_LOG_WARN("failed to find shared memory handler: %p\n", mem_handle); return; } _qnn_rpc_buffer_to_handles.erase(it); } - bool is_rpcmem_allocated(void *buf) { return _rpcmem_store_map.count(buf) != 0; } - bool is_rpcmem_registered(void *buf) { return _qnn_rpc_buffer_to_handles.count(buf) != 0U; } + bool is_rpcmem_allocated(void * buf) { return _rpcmem_store_map.count(buf) != 0; } + + bool is_rpcmem_registered(void * buf) { return _qnn_rpc_buffer_to_handles.count(buf) != 0U; } - const qnn::qcom_socinfo &get_soc_info() { return _soc_info; } + const qnn::qcom_socinfo & get_soc_info() { return _soc_info; } -private: + private: int load_system(); - int load_backend(std::string &lib_path, const QnnSaver_Config_t ** /*saver_config*/); + int load_backend(std::string & lib_path, const QnnSaver_Config_t ** /*saver_config*/); int unload_backend(); -private: + private: static constexpr const int _required_num_providers = 1; - std::string _additional_lib_load_path; - std::string _backend_lib_name; + std::string _additional_lib_load_path; + 
std::string _backend_lib_name; BackendIdType _backend_id; QnnLog_Level_t _qnn_log_level = QNN_LOG_LEVEL_DEBUG; @@ -429,7 +434,7 @@ class qnn_instance { #endif std::shared_ptr _qnn_sys_interface; - std::shared_ptr _qnn_interface; + std::shared_ptr _qnn_interface; Qnn_GraphHandle_t _qnn_graph_handle = nullptr; @@ -443,29 +448,29 @@ class qnn_instance { Qnn_ContextHandle_t _qnn_context_handle = nullptr; - QnnHtpDevice_PerfInfrastructure_t *_qnn_htp_perfinfra = nullptr; - uint32_t _qnn_power_configid = 1; + QnnHtpDevice_PerfInfrastructure_t * _qnn_htp_perfinfra = nullptr; + uint32_t _qnn_power_configid = 1; std::unordered_map _qnn_rpc_buffer_to_handles; - std::mutex _init_mutex; - std::unordered_map _loaded_lib_handle; - std::unordered_map _lib_path_to_backend_id; + std::mutex _init_mutex; + std::unordered_map _loaded_lib_handle; + std::unordered_map _lib_path_to_backend_id; std::unordered_map _loaded_backend; - dl_handler_t _rpc_lib_handle = nullptr; - std::atomic_bool _rpcmem_initialized{false}; - qnn::pfn_rpc_mem_alloc _pfn_rpc_mem_alloc = nullptr; - qnn::pfn_rpc_mem_free _pfn_rpc_mem_free = nullptr; - qnn::pfn_rpc_mem_to_fd _pfn_rpc_mem_to_fd = nullptr; - qnn::pfn_rpc_mem_init _pfn_rpc_mem_init = nullptr; - qnn::pfn_rpc_mem_deinit _pfn_rpc_mem_deinit = nullptr; + dl_handler_t _rpc_lib_handle = nullptr; + std::atomic_bool _rpcmem_initialized{ false }; + qnn::pfn_rpc_mem_alloc _pfn_rpc_mem_alloc = nullptr; + qnn::pfn_rpc_mem_free _pfn_rpc_mem_free = nullptr; + qnn::pfn_rpc_mem_to_fd _pfn_rpc_mem_to_fd = nullptr; + qnn::pfn_rpc_mem_init _pfn_rpc_mem_init = nullptr; + qnn::pfn_rpc_mem_deinit _pfn_rpc_mem_deinit = nullptr; std::unordered_map _rpcmem_store_map; - size_t _rpcmem_capacity = 512; + size_t _rpcmem_capacity = 512; std::string _graph_name; qnn::qcom_socinfo _soc_info = {}; }; -} // namespace qnn +} // namespace qnn diff --git a/ggml/src/ggml-qnn/qnn-types.hpp b/ggml/src/ggml-qnn/qnn-types.hpp index ec30602843301..8284036bb7503 100644 --- a/ggml/src/ggml-qnn/qnn-types.hpp +++ b/ggml/src/ggml-qnn/qnn-types.hpp @@ -20,48 +20,48 @@ enum sdk_profile_level { profile_off = 0, profile_basic, profile_detail }; enum qcom_htp_arch { NONE = 0, - V68 = 68, - V69 = 69, - V73 = 73, - V75 = 75, - V79 = 79, // SD 8 Gen 4 (SM8750) + V68 = 68, + V69 = 69, + V73 = 73, + V75 = 75, + V79 = 79, // SD 8 Gen 4 (SM8750) }; enum qcom_chipset { UNKNOWN_SM = 0, - SM8450 = 36, // v69, SD 8 Gen 1 - SM8475 = 42, // v69, SD 8+ Gen 1 - SM8550 = 43, // v73, SD 8 Gen 2 - SSG2115P = 46, // v73 - SM8650 = 57, // v75, SD 8 Gen 3 - SA8295 = 39, // v68 - SM8750 = 69, // v79, SD 8 Gen 4 + SM8450 = 36, // v69, SD 8 Gen 1 + SM8475 = 42, // v69, SD 8+ Gen 1 + SM8550 = 43, // v73, SD 8 Gen 2 + SSG2115P = 46, // v73 + SM8650 = 57, // v75, SD 8 Gen 3 + SA8295 = 39, // v68 + SM8750 = 69, // v79, SD 8 Gen 4 }; struct qcom_socinfo { uint32_t soc_model; - size_t htp_arch; - size_t vtcm_size_in_mb; + size_t htp_arch; + size_t vtcm_size_in_mb; }; -using pfn_rpc_mem_init = void (*)(void); +using pfn_rpc_mem_init = void (*)(void); using pfn_rpc_mem_deinit = void (*)(void); -using pfn_rpc_mem_alloc = void *(*)(int, uint32_t, int); -using pfn_rpc_mem_free = void (*)(void *); -using pfn_rpc_mem_to_fd = int (*)(void *); +using pfn_rpc_mem_alloc = void * (*) (int, uint32_t, int); +using pfn_rpc_mem_free = void (*)(void *); +using pfn_rpc_mem_to_fd = int (*)(void *); -using pfn_qnnsaver_initialize = decltype(QnnSaver_initialize); -using pfn_qnninterface_getproviders = decltype(QnnInterface_getProviders); +using pfn_qnnsaver_initialize = 
decltype(QnnSaver_initialize); +using pfn_qnninterface_getproviders = decltype(QnnInterface_getProviders); using pfn_qnnsysteminterface_getproviders = decltype(QnnSystemInterface_getProviders); -} // namespace qnn +} // namespace qnn -#define RPCMEM_DEFAULT_FLAGS 1 +#define RPCMEM_DEFAULT_FLAGS 1 #define RPCMEM_HEAP_ID_SYSTEM 25 -#define DISABLE_COPY(class_name) \ - class_name(const class_name &) = delete; \ +#define DISABLE_COPY(class_name) \ + class_name(const class_name &) = delete; \ void operator=(const class_name &) = delete -#define DISABLE_MOVE(class_name) \ - class_name(class_name &&) = delete; \ +#define DISABLE_MOVE(class_name) \ + class_name(class_name &&) = delete; \ void operator=(class_name &&) = delete diff --git a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp index 423c3ba7fa8c1..660223caf728a 100644 --- a/ggml/src/ggml-qnn/tensor.hpp +++ b/ggml/src/ggml-qnn/tensor.hpp @@ -9,9 +9,8 @@ #include #include -#include "ggml-qnn.h" - #include "buffer.hpp" +#include "ggml-qnn.h" #include "logger.hpp" #include "qnn-lib.hpp" #include "utils.hpp" @@ -21,14 +20,17 @@ namespace qnn { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS should be 4"); class ggml_qnn_tensor : public std::enable_shared_from_this { -public: + public: typedef enum _tensor_type { INPUT, OUTPUT, INTERMEDIATE, PARAMETER, BIDIRECTION } tensor_type_t; - explicit ggml_qnn_tensor(tensor_type_t tensor_type, const std::string &name, - const qnn_dimension_array_t &dimensions, Qnn_DataType_t data_type, int rank, + explicit ggml_qnn_tensor(tensor_type_t tensor_type, const std::string & name, + const qnn_dimension_array_t & dimensions, Qnn_DataType_t data_type, int rank, QNNBackend device, Qnn_GraphHandle_t graph_handle, - std::shared_ptr qnn_instance) - : _tensor_name(name), _device(device), _qnn_instance(qnn_instance), _graph_handle(graph_handle) { + std::shared_ptr qnn_instance) : + _tensor_name(name), + _device(device), + _qnn_instance(qnn_instance), + _graph_handle(graph_handle) { if (!_tensor_name.empty()) { QNN_TENSOR_SET_NAME(_qnn_tensor, _tensor_name.c_str()); } @@ -37,23 +39,24 @@ class ggml_qnn_tensor : public std::enable_shared_from_this { QNN_TENSOR_SET_DIMENSIONS(_qnn_tensor, _dimensions.data()); QNN_TENSOR_SET_DATA_FORMAT(_qnn_tensor, QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER); update_params_from_ggml_tensor(tensor_type, data_type, rank); - QNN_LOG_DEBUG("[%s][%s]created, rank: %d, dims: [%d, %d, %d, %d], type: %s", get_backend_name(device), - _tensor_name.c_str(), rank, (int)_dimensions[0], (int)_dimensions[1], (int)_dimensions[2], - (int)_dimensions[3], qnn_datatype_to_string(data_type)); + QNN_LOG_DEBUG("[%s][%s]created, rank: %d, dims: [%d, %d, %d, %d], type: %s\n", get_backend_name(device), + _tensor_name.c_str(), rank, (int) _dimensions[0], (int) _dimensions[1], (int) _dimensions[2], + (int) _dimensions[3], qnn_datatype_to_string(data_type)); } - explicit ggml_qnn_tensor(tensor_type_t tensor_type, const std::string &name, - const ggml_dimension_array_t &dimensions, ggml_type data_type, int rank, QNNBackend device, - Qnn_GraphHandle_t graph_handle, std::shared_ptr qnn_instance) - : ggml_qnn_tensor(tensor_type, name, get_internal_dimension(dimensions, rank), - qnn_datatype_from_ggml_datatype(data_type), rank, device, graph_handle, qnn_instance) {} + explicit ggml_qnn_tensor(tensor_type_t tensor_type, const std::string & name, + const ggml_dimension_array_t & dimensions, ggml_type data_type, int rank, + QNNBackend device, Qnn_GraphHandle_t graph_handle, + std::shared_ptr qnn_instance) : + 
ggml_qnn_tensor(tensor_type, name, get_internal_dimension(dimensions, rank), + qnn_datatype_from_ggml_datatype(data_type), rank, device, graph_handle, qnn_instance) {} ~ggml_qnn_tensor() { _rpc_buffer.reset(); unbind(); } - bool set_data_buffer(const uint8_t *buffer, const size_t buffer_size) { + bool set_data_buffer(const uint8_t * buffer, const size_t buffer_size) { auto qnn_buffer = std::make_shared(buffer, buffer_size); if (bind_buffer_impl(qnn_buffer)) { return true; @@ -74,71 +77,72 @@ class ggml_qnn_tensor : public std::enable_shared_from_this { bool alloc_qnn_tensor_id() { if (QNN_TENSOR_GET_ID(_qnn_tensor)) { - QNN_LOG_DEBUG("[%s]tensor already has a id: %d", _tensor_name.c_str(), QNN_TENSOR_GET_ID(_qnn_tensor)); + QNN_LOG_DEBUG("[%s]tensor already has a id: %d\n", _tensor_name.c_str(), QNN_TENSOR_GET_ID(_qnn_tensor)); return true; } - Qnn_Tensor_t qnn_tensor = _qnn_tensor; - auto qnn_interface = _qnn_instance->get_qnn_interface(); - auto error = qnn_interface->qnn_tensor_create_graph_tensor(_graph_handle, &qnn_tensor); + Qnn_Tensor_t qnn_tensor = _qnn_tensor; + auto qnn_interface = _qnn_instance->get_qnn_interface(); + auto error = qnn_interface->qnn_tensor_create_graph_tensor(_graph_handle, &qnn_tensor); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("[%s]allocate id failed , error: %d", _tensor_name.c_str(), error); + QNN_LOG_WARN("[%s]allocate id failed , error: %d\n", _tensor_name.c_str(), (int) error); return false; } QNN_TENSOR_SET_ID(_qnn_tensor, QNN_TENSOR_GET_ID(qnn_tensor)); - QNN_LOG_DEBUG("[%s][%s]allocated id: %d, rank: %d", get_backend_name(_device), _tensor_name.c_str(), + QNN_LOG_DEBUG("[%s][%s]allocated id: %d, rank: %d\n", get_backend_name(_device), _tensor_name.c_str(), QNN_TENSOR_GET_ID(qnn_tensor), QNN_TENSOR_GET_RANK(qnn_tensor)); return true; } - bool bind_ggml_tensor(ggml_tensor *tensor) { + bool bind_ggml_tensor(ggml_tensor * tensor) { if (!_can_unbind) { - QNN_LOG_DEBUG("[%s]already has buffer storage, skip bind", _tensor_name.c_str()); + QNN_LOG_DEBUG("[%s]already has buffer storage, skip bind\n", _tensor_name.c_str()); return true; } #ifndef NDEBUG if (tensor->view_src) { - auto *src = tensor->view_src; - QNN_LOG_DEBUG("[%s]tensor(%s_%dx%dx%dx%d) is a view, src: %s_%dx%dx%dx%d", get_backend_name(_device), - tensor->name, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], src->name, - src->ne[0], src->ne[1], src->ne[2], src->ne[3]); + auto * src = tensor->view_src; + QNN_LOG_DEBUG("[%s]tensor(%s_%dx%dx%dx%d) is a view, src: %s_%dx%dx%dx%d\n", get_backend_name(_device), + tensor->name, (int) tensor->ne[0], (int) tensor->ne[1], (int) tensor->ne[2], + (int) tensor->ne[3], src->name, (int) src->ne[0], (int) src->ne[1], (int) src->ne[2], + (int) src->ne[3]); } #endif auto buffer = std::make_shared(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); if (!bind_buffer_impl(buffer)) { - QNN_LOG_WARN("[%s]failed to bind ggml tensor(%s)", _tensor_name.c_str(), ggml_get_name(tensor)); + QNN_LOG_WARN("[%s]failed to bind ggml tensor(%s)\n", _tensor_name.c_str(), ggml_get_name(tensor)); return false; } - QNN_LOG_DEBUG("[%s][%s]bind to ggml tensor(%s)", get_backend_name(_device), _tensor_name.c_str(), + QNN_LOG_DEBUG("[%s][%s]bind to ggml tensor(%s)\n", get_backend_name(_device), _tensor_name.c_str(), ggml_get_name(tensor)); tensor->extra = this; - _ggml_tensor = tensor; + _ggml_tensor = tensor; return true; } bool unbind() { if (!_graph_handle) { - QNN_LOG_WARN("[%s]not bound to any graph", _tensor_name.c_str()); + QNN_LOG_WARN("[%s]not bound to any graph\n", 
_tensor_name.c_str()); return false; } if (!_buffer) { - QNN_LOG_DEBUG("[%s]unbind to ggml tensor", _tensor_name.c_str()); + QNN_LOG_DEBUG("[%s]unbind to ggml tensor\n", _tensor_name.c_str()); return true; } if (!read_from_qnn_tensor()) { - QNN_LOG_WARN("[%s]read from qnn tensor failed", _tensor_name.c_str()); + QNN_LOG_WARN("[%s]read from qnn tensor failed\n", _tensor_name.c_str()); return false; } if (!_can_unbind) { - QNN_LOG_DEBUG("[%s]already has buffer storage, stop unbind", _tensor_name.c_str()); + QNN_LOG_DEBUG("[%s]already has buffer storage, stop unbind\n", _tensor_name.c_str()); return true; } @@ -146,42 +150,46 @@ class ggml_qnn_tensor : public std::enable_shared_from_this { QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW); Qnn_ClientBuffer_t client_buf = {}; QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf); - QNN_LOG_DEBUG("[%s]clear client buffer", _tensor_name.c_str()); + QNN_LOG_DEBUG("[%s]clear client buffer\n", _tensor_name.c_str()); } - QNN_LOG_DEBUG("[%s][%s]unbind from buffer: %p, size: %d", get_backend_name(_device), _tensor_name.c_str(), - _buffer.get(), (int)_buffer->get_size()); + QNN_LOG_DEBUG("[%s][%s]unbind from buffer: %p, size: %d\n", get_backend_name(_device), _tensor_name.c_str(), + (void *) _buffer.get(), (int) _buffer->get_size()); _buffer.reset(); if (_ggml_tensor) { _ggml_tensor->extra = nullptr; - _ggml_tensor = nullptr; + _ggml_tensor = nullptr; } return true; } - const Qnn_Tensor_t &get_qnn_tensor() const { return _qnn_tensor; } + const Qnn_Tensor_t & get_qnn_tensor() const { return _qnn_tensor; } + Qnn_DataType_t get_data_type() const { return QNN_TENSOR_GET_DATA_TYPE(_qnn_tensor); } - const qnn_dimension_array_t &get_dimensions() const { return _dimensions; } + + const qnn_dimension_array_t & get_dimensions() const { return _dimensions; } + uint32_t get_rank() const { return QNN_TENSOR_GET_RANK(_qnn_tensor); } + uint32_t get_qnn_tensor_id() const { return QNN_TENSOR_GET_ID(_qnn_tensor); } -private: + private: bool bind_buffer_impl(qnn_buffer_ptr buffer) { if (_buffer) { if (_buffer != buffer) { - QNN_LOG_WARN("[%s]has been bound to another buffer %p", _tensor_name.c_str(), _buffer.get()); + QNN_LOG_WARN("[%s]has been bound to another buffer %p\n", _tensor_name.c_str(), (void *) _buffer.get()); return false; } - QNN_LOG_DEBUG("[%s]already bound to same ggml tensor %p", _tensor_name.c_str(), _buffer.get()); + QNN_LOG_DEBUG("[%s]already bound to same ggml tensor %p\n", _tensor_name.c_str(), (void *) _buffer.get()); return true; } if (QNN_TENSOR_GET_TYPE(_qnn_tensor) == QNN_TENSOR_TYPE_NATIVE) { - QNN_LOG_DEBUG("[%s]tensor type(%d) not READ/WRITE, skipping", _tensor_name.c_str(), - (int)QNN_TENSOR_TYPE_NATIVE); + QNN_LOG_DEBUG("[%s]tensor type(%d) not READ/WRITE, skipping\n", _tensor_name.c_str(), + (int) QNN_TENSOR_TYPE_NATIVE); return true; } @@ -191,7 +199,7 @@ class ggml_qnn_tensor : public std::enable_shared_from_this { _qnn_instance, buffer->get_size(), QNN_TENSOR_GET_RANK(_qnn_tensor), QNN_TENSOR_GET_DIMENSIONS(_qnn_tensor), QNN_TENSOR_GET_DATA_TYPE(_qnn_tensor)); if (!rpc_buffer->is_valid()) { - QNN_LOG_WARN("[%s][%s]alloc rpc mem failed", get_backend_name(_device), _tensor_name.c_str()); + QNN_LOG_WARN("[%s][%s]alloc rpc mem failed\n", get_backend_name(_device), _tensor_name.c_str()); return false; } @@ -201,38 +209,38 @@ class ggml_qnn_tensor : public std::enable_shared_from_this { QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_MEMHANDLE); auto mem_handle = _rpc_buffer->get_mem_handle(); if (!mem_handle) { - 
QNN_LOG_WARN("[%s][%s]can't find rpcmem from qnn mem handle", get_backend_name(_device), + QNN_LOG_WARN("[%s][%s]can't find rpcmem from qnn mem handle\n", get_backend_name(_device), _tensor_name.c_str()); return false; } QNN_TENSOR_SET_MEM_HANDLE(_qnn_tensor, mem_handle); - QNN_LOG_DEBUG("[%s][%s]use mem handle %p", get_backend_name(_device), _tensor_name.c_str(), + QNN_LOG_DEBUG("[%s][%s]use mem handle %p\n", get_backend_name(_device), _tensor_name.c_str(), QNN_TENSOR_GET_MEM_HANDLE(_qnn_tensor)); } else { QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW); - Qnn_ClientBuffer_t client_buf = {buffer->get_buffer(), (uint32_t)buffer->get_size()}; + Qnn_ClientBuffer_t client_buf = { buffer->get_buffer(), (uint32_t) buffer->get_size() }; QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf); - QNN_LOG_DEBUG("[%s]use client buffer %p size %d", _tensor_name.c_str(), client_buf.data, - (int)client_buf.dataSize); + QNN_LOG_DEBUG("[%s]use client buffer %p size %d\n", _tensor_name.c_str(), client_buf.data, + (int) client_buf.dataSize); } _buffer = buffer; if (!write_to_qnn_tensor()) { - QNN_LOG_WARN("[%s]write to qnn tensor failed", _tensor_name.c_str()); + QNN_LOG_WARN("[%s]write to qnn tensor failed\n", _tensor_name.c_str()); return false; } - QNN_LOG_DEBUG("[%s][%s]bind to buffer: %p, size: %d", get_backend_name(_device), _tensor_name.c_str(), - buffer.get(), (int)buffer->get_size()); + QNN_LOG_DEBUG("[%s][%s]bind to buffer: %p, size: %d\n", get_backend_name(_device), _tensor_name.c_str(), + (void *) buffer.get(), (int) buffer->get_size()); return true; } bool write_to_qnn_tensor() { auto tensor_type = QNN_TENSOR_GET_TYPE(_qnn_tensor); if (tensor_type != QNN_TENSOR_TYPE_APP_WRITE && tensor_type != QNN_TENSOR_TYPE_APP_READWRITE) { - QNN_LOG_DEBUG("[%s]tensor type(%d) not WRITE", _tensor_name.c_str(), (int)tensor_type); + QNN_LOG_DEBUG("[%s]tensor type(%d) not WRITE\n", _tensor_name.c_str(), (int) tensor_type); return true; } @@ -241,14 +249,14 @@ class ggml_qnn_tensor : public std::enable_shared_from_this { } // For CPU and GPU, the data is already in the tensor. - QNN_LOG_DEBUG("[%s][%s]write tensor to qnn", get_backend_name(_device), _tensor_name.c_str()); + QNN_LOG_DEBUG("[%s][%s]write tensor to qnn\n", get_backend_name(_device), _tensor_name.c_str()); return true; } bool read_from_qnn_tensor() { auto tensor_type = QNN_TENSOR_GET_TYPE(_qnn_tensor); if (tensor_type != QNN_TENSOR_TYPE_APP_READ && tensor_type != QNN_TENSOR_TYPE_APP_READWRITE) { - QNN_LOG_DEBUG("[%s]tensor type(%d) not READ", _tensor_name.c_str(), (int)tensor_type); + QNN_LOG_DEBUG("[%s]tensor type(%d) not READ\n", _tensor_name.c_str(), (int) tensor_type); return true; } @@ -257,7 +265,7 @@ class ggml_qnn_tensor : public std::enable_shared_from_this { } // For CPU and GPU, the data is already in the tensor. 
- QNN_LOG_DEBUG("[%s][%s]read tensor from qnn", get_backend_name(_device), _tensor_name.c_str()); + QNN_LOG_DEBUG("[%s][%s]read tensor from qnn\n", get_backend_name(_device), _tensor_name.c_str()); return true; } @@ -265,7 +273,7 @@ class ggml_qnn_tensor : public std::enable_shared_from_this { QNN_TENSOR_SET_DATA_TYPE(_qnn_tensor, data_type); // TODO: set the quantizeParams base on the tensor type - QNN_TENSOR_SET_RANK(_qnn_tensor, (uint32_t)rank); + QNN_TENSOR_SET_RANK(_qnn_tensor, (uint32_t) rank); QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW); Qnn_ClientBuffer_t client_buf = {}; QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf); @@ -290,7 +298,7 @@ class ggml_qnn_tensor : public std::enable_shared_from_this { break; } QNN_TENSOR_SET_TYPE(_qnn_tensor, new_tensor_type); - QNN_LOG_DEBUG("[%s][%s]tensor changed to type %d", get_backend_name(_device), _tensor_name.c_str(), + QNN_LOG_DEBUG("[%s][%s]tensor changed to type %d\n", get_backend_name(_device), _tensor_name.c_str(), new_tensor_type); } @@ -299,31 +307,31 @@ class ggml_qnn_tensor : public std::enable_shared_from_this { return false; } - std::string _tensor_name; - qnn_buffer_ptr _buffer; - bool _can_unbind = true; - QNNBackend _device; + std::string _tensor_name; + qnn_buffer_ptr _buffer; + bool _can_unbind = true; + QNNBackend _device; std::shared_ptr _qnn_instance; - Qnn_Tensor_t _qnn_tensor = qnn_tensor_init(kDefaultQnnTensorVersion); - qnn_dimension_array_t _dimensions = {}; - Qnn_GraphHandle_t _graph_handle = nullptr; - qnn_buffer_ptr _rpc_buffer; - ggml_tensor *_ggml_tensor = nullptr; + Qnn_Tensor_t _qnn_tensor = qnn_tensor_init(kDefaultQnnTensorVersion); + qnn_dimension_array_t _dimensions = {}; + Qnn_GraphHandle_t _graph_handle = nullptr; + qnn_buffer_ptr _rpc_buffer; + ggml_tensor * _ggml_tensor = nullptr; DISABLE_COPY(ggml_qnn_tensor); DISABLE_MOVE(ggml_qnn_tensor); }; -using qnn_tensor_ptr_t = std::shared_ptr; -using qnn_tensor_array_t = std::vector; +using qnn_tensor_ptr_t = std::shared_ptr; +using qnn_tensor_array_t = std::vector; using ggml_tensor_array_t = std::vector; -inline qnn_tensor_ptr_t get_qnn_tensor_ptr(ggml_tensor *ggml_tensor) { - return ggml_tensor->extra ? reinterpret_cast(ggml_tensor->extra)->shared_from_this() - : qnn_tensor_ptr_t(); +inline qnn_tensor_ptr_t get_qnn_tensor_ptr(ggml_tensor * ggml_tensor) { + return ggml_tensor->extra ? 
reinterpret_cast(ggml_tensor->extra)->shared_from_this() : + qnn_tensor_ptr_t(); } -inline int get_ggml_tensors_max_rank(const qnn::ggml_tensor_array_t &tensors) { +inline int get_ggml_tensors_max_rank(const qnn::ggml_tensor_array_t & tensors) { int max_rank = 0; for (auto tensor : tensors) { max_rank = std::max(max_rank, ggml_n_dims(tensor)); @@ -332,14 +340,14 @@ inline int get_ggml_tensors_max_rank(const qnn::ggml_tensor_array_t &tensors) { return max_rank; } -inline bool bind_tensors(const ggml_tensor_array_t &ggml_tensors, qnn_tensor_array_t &tensor_wrappers, - std::vector &qnn_tensors) { +inline bool bind_tensors(const ggml_tensor_array_t & ggml_tensors, qnn_tensor_array_t & tensor_wrappers, + std::vector & qnn_tensors) { GGML_ASSERT(tensor_wrappers.size() == ggml_tensors.size()); qnn_tensors.resize(ggml_tensors.size()); for (size_t i = 0; i < ggml_tensors.size(); i++) { - auto *ggml_tensor = ggml_tensors[i]; + auto * ggml_tensor = ggml_tensors[i]; if (!tensor_wrappers[i]->bind_ggml_tensor(ggml_tensor)) { - QNN_LOG_ERROR("bind tensor %s failed", ggml_get_name(ggml_tensor)); + QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor)); return false; } @@ -349,12 +357,12 @@ inline bool bind_tensors(const ggml_tensor_array_t &ggml_tensors, qnn_tensor_arr return true; } -inline bool bind_tensors(const ggml_tensor_array_t &ggml_tensors, qnn_tensor_array_t &tensor_wrappers) { +inline bool bind_tensors(const ggml_tensor_array_t & ggml_tensors, qnn_tensor_array_t & tensor_wrappers) { GGML_ASSERT(tensor_wrappers.size() == ggml_tensors.size()); for (size_t i = 0; i < ggml_tensors.size(); i++) { - auto *ggml_tensor = ggml_tensors[i]; + auto * ggml_tensor = ggml_tensors[i]; if (!tensor_wrappers[i]->bind_ggml_tensor(ggml_tensor)) { - QNN_LOG_ERROR("bind tensor %s failed", ggml_get_name(ggml_tensor)); + QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor)); return false; } } @@ -362,31 +370,31 @@ inline bool bind_tensors(const ggml_tensor_array_t &ggml_tensors, qnn_tensor_arr return true; } -inline void unbind_tensors(qnn_tensor_array_t &tensor_wrappers) { - for (auto &tensor : tensor_wrappers) { +inline void unbind_tensors(qnn_tensor_array_t & tensor_wrappers) { + for (auto & tensor : tensor_wrappers) { tensor->unbind(); } } struct tensor_create_common_params { - const char *name_prefix; - int tensor_rank; - bool is_input; - QNNBackend device; - Qnn_GraphHandle_t graph_handle; + const char * name_prefix; + int tensor_rank; + bool is_input; + QNNBackend device; + Qnn_GraphHandle_t graph_handle; std::shared_ptr qnn_instance; }; -inline void create_tensors_from_ggml_tensor(const tensor_create_common_params ¶ms, - const ggml_tensor_array_t &ggml_tensors, - qnn_tensor_array_t *tensor_wrappers, - std::vector *qnn_tensors) { +inline void create_tensors_from_ggml_tensor(const tensor_create_common_params & params, + const ggml_tensor_array_t & ggml_tensors, + qnn_tensor_array_t * tensor_wrappers, + std::vector * qnn_tensors) { if (qnn_tensors) { qnn_tensors->resize(ggml_tensors.size()); } if (!tensor_wrappers->empty()) { - QNN_LOG_DEBUG("tensor_wrappers is not empty, skip create tensors"); + QNN_LOG_DEBUG("tensor_wrappers is not empty, skip create tensors\n"); GGML_ASSERT(tensor_wrappers->size() == ggml_tensors.size()); return; } @@ -394,14 +402,14 @@ inline void create_tensors_from_ggml_tensor(const tensor_create_common_params &p tensor_wrappers->resize(ggml_tensors.size()); char buffer[GGML_MAX_NAME] = {}; - auto tensor_type = params.is_input ? 
ggml_qnn_tensor::INPUT : ggml_qnn_tensor::OUTPUT; + auto tensor_type = params.is_input ? ggml_qnn_tensor::INPUT : ggml_qnn_tensor::OUTPUT; for (size_t i = 0; i < ggml_tensors.size(); i++) { - snprintf(buffer, GGML_MAX_NAME, "%s%d", params.name_prefix, (int)i); - auto *ggml_tensor = ggml_tensors[i]; + snprintf(buffer, GGML_MAX_NAME, "%s%d", params.name_prefix, (int) i); + auto * ggml_tensor = ggml_tensors[i]; (*tensor_wrappers)[i] = std::make_shared(tensor_type, std::string(buffer), ggml_tensor->ne, ggml_tensor->type, params.tensor_rank, params.device, params.graph_handle, params.qnn_instance); } } -} // namespace qnn +} // namespace qnn diff --git a/ggml/src/ggml-qnn/utils.cpp b/ggml/src/ggml-qnn/utils.cpp index e9aa4d37374a6..f9178f90d556f 100644 --- a/ggml/src/ggml-qnn/utils.cpp +++ b/ggml/src/ggml-qnn/utils.cpp @@ -4,30 +4,28 @@ #include #include "ggml-qnn.h" - -#include "QnnGraph.h" #include "qnn-types.hpp" +#include "QnnGraph.h" #ifdef _WIN32 -#include +# include #else -#include -#include +# include +# include #endif namespace { -template -_Ty align_to_generic(size_t alignment, _Ty offset) { - return offset % alignment == 0 ? offset - : offset + (static_cast<_Ty>(alignment) - (offset % static_cast<_Ty>(alignment))); +template _Ty align_to_generic(size_t alignment, _Ty offset) { + return offset % alignment == 0 ? offset : + offset + (static_cast<_Ty>(alignment) - (offset % static_cast<_Ty>(alignment))); } -} // namespace +} // namespace namespace qnn { -qnn_dimension_array_t get_internal_dimension(const ggml_dimension_array_t &dims, uint32_t rank) { +qnn_dimension_array_t get_internal_dimension(const ggml_dimension_array_t & dims, uint32_t rank) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS should be 4"); GGML_ASSERT(rank <= GGML_MAX_DIMS && rank > 0); @@ -43,30 +41,29 @@ qnn_dimension_array_t get_internal_dimension(const ggml_dimension_array_t &dims, * The ggml tensor will have dimensions [3, 2], while the qnn tensor will have dimensions [2, 3]. 
*/ for (uint32_t i = 0; i < rank; i++) { - internal_dims[i] = std::max((uint32_t)dims[rank - 1 - i], 1); + internal_dims[i] = std::max((uint32_t) dims[rank - 1 - i], 1); } return internal_dims; } -qnn_dimension_array_t get_view_internal_dimension(const ggml_tensor *tensor, size_t &element_offset_out) { - +qnn_dimension_array_t get_view_internal_dimension(const ggml_tensor * tensor, size_t & element_offset_out) { element_offset_out = 0; - auto *parent_tensor = tensor; + auto * parent_tensor = tensor; while (parent_tensor->view_src) { element_offset_out += parent_tensor->view_offs; parent_tensor = parent_tensor->view_src; } - const auto rank = get_ggml_tensor_rank(tensor); + const auto rank = get_ggml_tensor_rank(tensor); const auto parent_rank = get_ggml_tensor_rank(parent_tensor); GGML_ASSERT(parent_tensor->type == tensor->type); GGML_ASSERT(parent_rank == rank); const auto block_size = ggml_blck_size(tensor->type); element_offset_out = - element_offset_out * block_size / tensor->nb[0]; // calculate the element offset in the view tensor + element_offset_out * block_size / tensor->nb[0]; // calculate the element offset in the view tensor return get_internal_dimension(parent_tensor->ne, parent_rank); } @@ -141,7 +138,7 @@ size_t qnn_datatype_size(Qnn_DataType_t qnn_type) { return 0; } -const char *qnn_datatype_to_string(Qnn_DataType_t qnn_type) { +const char * qnn_datatype_to_string(Qnn_DataType_t qnn_type) { switch (qnn_type) { case QNN_DATATYPE_FLOAT_32: return "QNN_DATATYPE_FLOAT_32"; @@ -166,7 +163,7 @@ const char *qnn_datatype_to_string(Qnn_DataType_t qnn_type) { return "QNN_DATATYPE_UNDEFINED"; } -uint32_t get_ggml_tensor_rank(const ggml_tensor *tensor) { +uint32_t get_ggml_tensor_rank(const ggml_tensor * tensor) { uint32_t rank = 0; for (int i = 0; i < GGML_MAX_DIMS; i++) { if ((0 != tensor->ne[i]) && (1 != tensor->ne[i])) { @@ -176,12 +173,12 @@ uint32_t get_ggml_tensor_rank(const ggml_tensor *tensor) { return rank; } -const char *get_ggml_type_name(ggml_type type) { - const auto *traits = ggml_get_type_traits(type); +const char * get_ggml_type_name(ggml_type type) { + const auto * traits = ggml_get_type_traits(type); return traits->type_name; } -const char *get_backend_name(QNNBackend device_index) { +const char * get_backend_name(QNNBackend device_index) { switch (device_index) { case QNN_BACKEND_CPU: return "qnn-cpu"; @@ -195,7 +192,7 @@ const char *get_backend_name(QNNBackend device_index) { } } -const char *get_chipset_desc(uint32_t chipset_id) { +const char * get_chipset_desc(uint32_t chipset_id) { switch (chipset_id) { case SM8450: return "SD 8 Gen 1 (SM8450)"; @@ -212,7 +209,7 @@ const char *get_chipset_desc(uint32_t chipset_id) { } } -const char *get_htparch_desc(size_t htp_arch) { +const char * get_htparch_desc(size_t htp_arch) { switch (htp_arch) { case V68: return "QCOM_HTP_V68"; @@ -229,12 +226,18 @@ const char *get_htparch_desc(size_t htp_arch) { } } -intptr_t align_to(size_t alignment, intptr_t offset) { return align_to_generic(alignment, offset); } +intptr_t align_to(size_t alignment, intptr_t offset) { + return align_to_generic(alignment, offset); +} -uint32_t get_ggml_tensor_data_size(const ggml_tensor *tensor) { return (uint32_t)ggml_nbytes(tensor); } +uint32_t get_ggml_tensor_data_size(const ggml_tensor * tensor) { + return (uint32_t) ggml_nbytes(tensor); +} #ifdef _WIN32 -static void *_align_alloc(size_t alignment, size_t size) { return _aligned_malloc(size, alignment); } +static void * _align_alloc(size_t alignment, size_t size) { + return _aligned_malloc(size, 
alignment); +} static size_t _get_page_size() { SYSTEM_INFO si; @@ -242,22 +245,31 @@ static size_t _get_page_size() { return si.dwPageSize; } -void align_free(void *ptr) { _aligned_free(ptr); } +void align_free(void * ptr) { + _aligned_free(ptr); +} #else -static void *_align_alloc(size_t alignment, size_t size) { return std::aligned_alloc(alignment, size); } +static void * _align_alloc(size_t alignment, size_t size) { + return std::aligned_alloc(alignment, size); +} -static size_t _get_page_size() { return sysconf(_SC_PAGESIZE); } +static size_t _get_page_size() { + return sysconf(_SC_PAGESIZE); +} -void align_free(void *ptr) { std::free(ptr); } +void align_free(void * ptr) { + std::free(ptr); +} #endif -void *page_align_alloc(size_t size) { - const size_t alignment = _get_page_size(); - size_t size_aligned = align_to_generic(alignment, size); - QNN_LOG_DEBUG("_align_alloc success, alignment: %ld, size: %ld, size_aligned: %ld", alignment, size, size_aligned); - void *data = _align_alloc(alignment, size_aligned); +void * page_align_alloc(size_t size) { + const size_t alignment = _get_page_size(); + size_t size_aligned = align_to_generic(alignment, size); + QNN_LOG_DEBUG("_align_alloc success, alignment: %ld, size: %ld, size_aligned: %ld\n", alignment, size, size_aligned); + void * data = _align_alloc(alignment, size_aligned); if (!data) { - QNN_LOG_WARN("_align_alloc failed, alignment: %ld, size: %ld, size_aligned: %ld", alignment, size, size_aligned); + QNN_LOG_WARN("_align_alloc failed, alignment: %ld, size: %ld, size_aligned: %ld\n", alignment, size, + size_aligned); return nullptr; } @@ -270,7 +282,7 @@ void *page_align_alloc(size_t size) { // // ================================================================================================= // TODO: only support GGML_OP_ADD/GGML_OP_MUL/GGML_OP_MUL_MAT -const char *opname_from_ggmlop(enum ggml_op ggmlop) { +const char * opname_from_ggmlop(enum ggml_op ggmlop) { switch (ggmlop) { case GGML_OP_ADD: return QNN_OP_ELEMENT_WISE_ADD; @@ -284,7 +296,7 @@ const char *opname_from_ggmlop(enum ggml_op ggmlop) { return nullptr; } -const char *get_qnn_error_string(Qnn_ErrorHandle_t error) { +const char * get_qnn_error_string(Qnn_ErrorHandle_t error) { // A complete list of error codes can be found at here: // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/api_error_codes.html thread_local static char error_code[128] = {}; @@ -377,7 +389,7 @@ const char *get_qnn_error_string(Qnn_ErrorHandle_t error) { size_t get_system_total_memory_in_bytes() { MEMORYSTATUSEX mem = {}; - mem.dwLength = sizeof(mem); + mem.dwLength = sizeof(mem); if (GlobalMemoryStatusEx(&mem)) { return mem.ullTotalPhys; } @@ -387,7 +399,7 @@ size_t get_system_total_memory_in_bytes() { size_t get_system_free_memory_in_bytes() { MEMORYSTATUSEX mem = {}; - mem.dwLength = sizeof(mem); + mem.dwLength = sizeof(mem); if (GlobalMemoryStatusEx(&mem)) { return mem.ullAvailPhys; } @@ -403,8 +415,8 @@ size_t get_system_total_memory_in_bytes() { return (info.totalram + info.totalswap) * info.mem_unit; } - auto pages = (size_t)sysconf(_SC_PHYS_PAGES); - auto page_size = (size_t)sysconf(_SC_PAGE_SIZE); + auto pages = (size_t) sysconf(_SC_PHYS_PAGES); + auto page_size = (size_t) sysconf(_SC_PAGE_SIZE); return pages * page_size; } @@ -414,11 +426,11 @@ size_t get_system_free_memory_in_bytes() { return (info.freeram + info.freeswap) * info.mem_unit; } - auto avail_pages = (size_t)sysconf(_SC_AVPHYS_PAGES); - auto page_size = (size_t)sysconf(_SC_PAGE_SIZE); + auto avail_pages = 
(size_t) sysconf(_SC_AVPHYS_PAGES); + auto page_size = (size_t) sysconf(_SC_PAGE_SIZE); return avail_pages * page_size; } #endif -} // namespace qnn +} // namespace qnn diff --git a/ggml/src/ggml-qnn/utils.hpp b/ggml/src/ggml-qnn/utils.hpp index cdff53e77314d..d6130a3df4b4e 100644 --- a/ggml/src/ggml-qnn/utils.hpp +++ b/ggml/src/ggml-qnn/utils.hpp @@ -5,38 +5,36 @@ #include #include -#include "ggml.h" - #include "ggml-qnn.h" - -#include "QnnTypes.h" +#include "ggml.h" #include "logger.hpp" +#include "QnnTypes.h" #define QNN_TENSOR_VER(x) ((x).v1) namespace qnn { using ggml_dimension_array_t = int64_t[GGML_MAX_DIMS]; -using ggml_stride_array_t = size_t[GGML_MAX_DIMS]; -using qnn_dimension_array_t = std::array; +using ggml_stride_array_t = size_t[GGML_MAX_DIMS]; +using qnn_dimension_array_t = std::array; -qnn_dimension_array_t get_internal_dimension(const ggml_dimension_array_t &dims, uint32_t rank); -qnn_dimension_array_t get_view_internal_dimension(const ggml_tensor *tensor, size_t &element_offser_out); +qnn_dimension_array_t get_internal_dimension(const ggml_dimension_array_t & dims, uint32_t rank); +qnn_dimension_array_t get_view_internal_dimension(const ggml_tensor * tensor, size_t & element_offser_out); -uint32_t get_ggml_tensor_rank(const ggml_tensor *tensor); -const char *get_ggml_type_name(ggml_type type); -const char *get_backend_name(QNNBackend device_index); -const char *get_chipset_desc(uint32_t chipset_id); -const char *get_htparch_desc(size_t htp_arch); -intptr_t align_to(size_t alignment, intptr_t offset); -uint32_t get_ggml_tensor_data_size(const ggml_tensor *tensor); +uint32_t get_ggml_tensor_rank(const ggml_tensor * tensor); +const char * get_ggml_type_name(ggml_type type); +const char * get_backend_name(QNNBackend device_index); +const char * get_chipset_desc(uint32_t chipset_id); +const char * get_htparch_desc(size_t htp_arch); +intptr_t align_to(size_t alignment, intptr_t offset); +uint32_t get_ggml_tensor_data_size(const ggml_tensor * tensor); -void *page_align_alloc(size_t size); -void align_free(void *ptr); +void * page_align_alloc(size_t size); +void align_free(void * ptr); -const char *opname_from_ggmlop(enum ggml_op ggmlop); +const char * opname_from_ggmlop(enum ggml_op ggmlop); -const char *get_qnn_error_string(Qnn_ErrorHandle_t error); +const char * get_qnn_error_string(Qnn_ErrorHandle_t error); constexpr const Qnn_TensorVersion_t kDefaultQnnTensorVersion = QNN_TENSOR_VERSION_1; @@ -51,7 +49,7 @@ inline Qnn_Tensor_t qnn_tensor_init(Qnn_TensorVersion_t version) { return tensor; } -inline uint32_t get_qnn_tensorid(const Qnn_Tensor_t &tensor) { +inline uint32_t get_qnn_tensorid(const Qnn_Tensor_t & tensor) { if (tensor.version == kDefaultQnnTensorVersion) { return QNN_TENSOR_VER(tensor).id; } @@ -59,156 +57,158 @@ inline uint32_t get_qnn_tensorid(const Qnn_Tensor_t &tensor) { return 0u; } -inline const char *get_qnn_tensorname(const Qnn_Tensor_t &tensor) { +inline const char * get_qnn_tensorname(const Qnn_Tensor_t & tensor) { if (tensor.version == kDefaultQnnTensorVersion) { return QNN_TENSOR_VER(tensor).name; } return nullptr; } -inline Qnn_TensorType_t get_qnn_tensortype(const Qnn_Tensor_t &tensor) { +inline Qnn_TensorType_t get_qnn_tensortype(const Qnn_Tensor_t & tensor) { if (tensor.version == kDefaultQnnTensorVersion) { return QNN_TENSOR_VER(tensor).type; } return QNN_TENSOR_TYPE_UNDEFINED; } -inline Qnn_TensorDataFormat_t get_qnn_tensor_dataformat(const Qnn_Tensor_t &tensor) { +inline Qnn_TensorDataFormat_t get_qnn_tensor_dataformat(const Qnn_Tensor_t & 
tensor) { if (tensor.version == kDefaultQnnTensorVersion) { return QNN_TENSOR_VER(tensor).dataFormat; } return QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER; } -inline Qnn_DataType_t get_qnn_tensor_datatype(const Qnn_Tensor_t &tensor) { +inline Qnn_DataType_t get_qnn_tensor_datatype(const Qnn_Tensor_t & tensor) { if (tensor.version == kDefaultQnnTensorVersion) { return QNN_TENSOR_VER(tensor).dataType; } return QNN_DATATYPE_UNDEFINED; } -inline Qnn_QuantizeParams_t get_qnn_tensor_quantparams(const Qnn_Tensor_t &tensor) { +inline Qnn_QuantizeParams_t get_qnn_tensor_quantparams(const Qnn_Tensor_t & tensor) { if (tensor.version == kDefaultQnnTensorVersion) { return QNN_TENSOR_VER(tensor).quantizeParams; } return QNN_QUANTIZE_PARAMS_INIT; } -inline uint32_t get_qnn_tensor_rank(const Qnn_Tensor_t &tensor) { +inline uint32_t get_qnn_tensor_rank(const Qnn_Tensor_t & tensor) { if (tensor.version == kDefaultQnnTensorVersion) { return QNN_TENSOR_VER(tensor).rank; } return 0u; } -inline uint32_t *get_qnn_tensor_dimensions(const Qnn_Tensor_t &tensor) { +inline uint32_t * get_qnn_tensor_dimensions(const Qnn_Tensor_t & tensor) { if (tensor.version == kDefaultQnnTensorVersion) { return QNN_TENSOR_VER(tensor).dimensions; } return nullptr; } -inline Qnn_TensorMemType_t get_qnn_tensor_memtype(const Qnn_Tensor_t &tensor) { +inline Qnn_TensorMemType_t get_qnn_tensor_memtype(const Qnn_Tensor_t & tensor) { if (tensor.version == kDefaultQnnTensorVersion) { return QNN_TENSOR_VER(tensor).memType; } return QNN_TENSORMEMTYPE_UNDEFINED; } -inline Qnn_MemHandle_t get_qnn_tensor_memhandle(const Qnn_Tensor_t &tensor) { +inline Qnn_MemHandle_t get_qnn_tensor_memhandle(const Qnn_Tensor_t & tensor) { if (tensor.version == kDefaultQnnTensorVersion) { return QNN_TENSOR_VER(tensor).memHandle; } return nullptr; } -inline void set_qnn_tensor_id(Qnn_Tensor_t &tensor, uint32_t id) { +inline void set_qnn_tensor_id(Qnn_Tensor_t & tensor, uint32_t id) { if (tensor.version == kDefaultQnnTensorVersion) { QNN_TENSOR_VER(tensor).id = id; } } -inline void set_qnn_tensor_name(Qnn_Tensor_t &tensor, const char *name) { +inline void set_qnn_tensor_name(Qnn_Tensor_t & tensor, const char * name) { if (tensor.version == kDefaultQnnTensorVersion) { QNN_TENSOR_VER(tensor).name = name; } } -inline void set_qnn_tensor_type(Qnn_Tensor_t &tensor, Qnn_TensorType_t type) { +inline void set_qnn_tensor_type(Qnn_Tensor_t & tensor, Qnn_TensorType_t type) { if (tensor.version == kDefaultQnnTensorVersion) { QNN_TENSOR_VER(tensor).type = type; } } -inline void set_qnn_tensor_dataformat(Qnn_Tensor_t &tensor, Qnn_TensorDataFormat_t format) { +inline void set_qnn_tensor_dataformat(Qnn_Tensor_t & tensor, Qnn_TensorDataFormat_t format) { if (tensor.version == kDefaultQnnTensorVersion) { QNN_TENSOR_VER(tensor).dataFormat = format; } } -inline void set_qnn_tensor_datatype(Qnn_Tensor_t &tensor, Qnn_DataType_t dataType) { +inline void set_qnn_tensor_datatype(Qnn_Tensor_t & tensor, Qnn_DataType_t dataType) { if (tensor.version == kDefaultQnnTensorVersion) { QNN_TENSOR_VER(tensor).dataType = dataType; } } -inline void set_qnn_tensor_quantparams(Qnn_Tensor_t &tensor, Qnn_QuantizeParams_t params) { +inline void set_qnn_tensor_quantparams(Qnn_Tensor_t & tensor, Qnn_QuantizeParams_t params) { if (tensor.version == kDefaultQnnTensorVersion) { QNN_TENSOR_VER(tensor).quantizeParams = params; } } -inline void set_qnn_tensor_rank(Qnn_Tensor_t &tensor, uint32_t rank) { +inline void set_qnn_tensor_rank(Qnn_Tensor_t & tensor, uint32_t rank) { if (tensor.version == kDefaultQnnTensorVersion) { 
QNN_TENSOR_VER(tensor).rank = rank; } } -inline void set_qnn_tensor_dimensions(Qnn_Tensor_t &tensor, uint32_t *dims) { +inline void set_qnn_tensor_dimensions(Qnn_Tensor_t & tensor, uint32_t * dims) { if (tensor.version == kDefaultQnnTensorVersion) { QNN_TENSOR_VER(tensor).dimensions = dims; } } -inline void set_qnn_tensor_memtype(Qnn_Tensor_t &tensor, Qnn_TensorMemType_t mem_type) { +inline void set_qnn_tensor_memtype(Qnn_Tensor_t & tensor, Qnn_TensorMemType_t mem_type) { if (tensor.version == kDefaultQnnTensorVersion) { QNN_TENSOR_VER(tensor).memType = mem_type; } } -inline void set_qnn_tensor_clientbuf(Qnn_Tensor_t &tensor, Qnn_ClientBuffer_t client_buf) { +inline void set_qnn_tensor_clientbuf(Qnn_Tensor_t & tensor, Qnn_ClientBuffer_t client_buf) { if (tensor.version == kDefaultQnnTensorVersion) { QNN_TENSOR_VER(tensor).clientBuf = client_buf; } } -inline void set_qnn_tensor_memhandle(Qnn_Tensor_t &tensor, Qnn_MemHandle_t handle) { +inline void set_qnn_tensor_memhandle(Qnn_Tensor_t & tensor, Qnn_MemHandle_t handle) { if (tensor.version == kDefaultQnnTensorVersion) { QNN_TENSOR_VER(tensor).memHandle = handle; } } -inline void set_qnn_tensor_dyn_dimensions(Qnn_Tensor_t &tensor, uint8_t *isDynamicDimensions) { +inline void set_qnn_tensor_dyn_dimensions(Qnn_Tensor_t & tensor, uint8_t * isDynamicDimensions) { if (tensor.version == QNN_TENSOR_VERSION_2) { tensor.v2.isDynamicDimensions = isDynamicDimensions; } } Qnn_DataType_t qnn_datatype_from_ggml_datatype(ggml_type ggml_type); -ggml_type ggml_datatype_from_qnn_datatype(Qnn_DataType_t qnn_type); -size_t qnn_datatype_size(Qnn_DataType_t qnn_type); -const char *qnn_datatype_to_string(Qnn_DataType_t qnn_type); -size_t get_system_total_memory_in_bytes(); -size_t get_system_free_memory_in_bytes(); +ggml_type ggml_datatype_from_qnn_datatype(Qnn_DataType_t qnn_type); +size_t qnn_datatype_size(Qnn_DataType_t qnn_type); +const char * qnn_datatype_to_string(Qnn_DataType_t qnn_type); +size_t get_system_total_memory_in_bytes(); +size_t get_system_free_memory_in_bytes(); #if ENABLE_QNNBACKEND_PERF class qnn_perf { -public: - qnn_perf(const std::string &perf_name) : _perf_name(std::move(perf_name)) {}; + public: + qnn_perf(const std::string & perf_name) : _perf_name(std::move(perf_name)) {}; + ~qnn_perf() { info(); } - qnn_perf() = delete; - qnn_perf(const qnn_perf &) = delete; - qnn_perf &operator=(const qnn_perf &) = delete; + + qnn_perf() = delete; + qnn_perf(const qnn_perf &) = delete; + qnn_perf & operator=(const qnn_perf &) = delete; void start() { _begin_time = ggml_time_us(); } @@ -218,48 +218,51 @@ class qnn_perf { QNN_LOG_INFO("duration of %s : %lld microseconds\n", _perf_name.c_str(), _duration); } -private: - int64_t _begin_time = 0LL; - int64_t _end_time = 0LL; - int64_t _duration = 0LL; + private: + int64_t _begin_time = 0LL; + int64_t _end_time = 0LL; + int64_t _duration = 0LL; std::string _perf_name; }; #else class qnn_perf { -public: + public: qnn_perf(const std::string &) {} + ~qnn_perf() { info(); } - qnn_perf() = delete; - qnn_perf(const qnn_perf &) = delete; - qnn_perf &operator=(const qnn_perf &) = delete; + + qnn_perf() = delete; + qnn_perf(const qnn_perf &) = delete; + qnn_perf & operator=(const qnn_perf &) = delete; void start() {} + void info() {} }; #endif -} // namespace qnn +} // namespace qnn -#define QNN_TENSOR_GET_ID(tensor) qnn::get_qnn_tensorid(tensor) -#define QNN_TENSOR_GET_NAME(tensor) qnn::get_qnn_tensorname(tensor) -#define QNN_TENSOR_GET_TYPE(tensor) qnn::get_qnn_tensortype(tensor) -#define 
QNN_TENSOR_GET_DATA_FORMAT(tensor) qnn::get_qnn_tensor_dataformat(tensor) -#define QNN_TENSOR_GET_DATA_TYPE(tensor) qnn::get_qnn_tensor_datatype(tensor) +#define QNN_TENSOR_GET_ID(tensor) qnn::get_qnn_tensorid(tensor) +#define QNN_TENSOR_GET_NAME(tensor) qnn::get_qnn_tensorname(tensor) +#define QNN_TENSOR_GET_TYPE(tensor) qnn::get_qnn_tensortype(tensor) +#define QNN_TENSOR_GET_DATA_FORMAT(tensor) qnn::get_qnn_tensor_dataformat(tensor) +#define QNN_TENSOR_GET_DATA_TYPE(tensor) qnn::get_qnn_tensor_datatype(tensor) #define QNN_TENSOR_GET_QUANT_PARAMS(tensor) qnn::get_qnn_tensor_quantparams(tensor) -#define QNN_TENSOR_GET_RANK(tensor) qnn::get_qnn_tensor_rank(tensor) -#define QNN_TENSOR_GET_DIMENSIONS(tensor) qnn::get_qnn_tensor_dimensions(tensor) -#define QNN_TENSOR_GET_MEM_TYPE(tensor) qnn::get_qnn_tensor_memtype(tensor) -#define QNN_TENSOR_GET_MEM_HANDLE(tensor) qnn::get_qnn_tensor_memhandle(tensor) - -#define QNN_TENSOR_SET_ID(tensor, value) qnn::set_qnn_tensor_id(tensor, value) -#define QNN_TENSOR_SET_NAME(tensor, value) qnn::set_qnn_tensor_name(tensor, value) -#define QNN_TENSOR_SET_TYPE(tensor, value) qnn::set_qnn_tensor_type(tensor, value) -#define QNN_TENSOR_SET_DATA_FORMAT(tensor, value) qnn::set_qnn_tensor_dataformat(tensor, value) -#define QNN_TENSOR_SET_DATA_TYPE(tensor, value) qnn::set_qnn_tensor_datatype(tensor, value) -#define QNN_TENSOR_SET_QUANT_PARAMS(tensor, value) qnn::set_qnn_tensor_quantparams(tensor, value) -#define QNN_TENSOR_SET_RANK(tensor, value) qnn::set_qnn_tensor_rank(tensor, value) -#define QNN_TENSOR_SET_DIMENSIONS(tensor, value) qnn::set_qnn_tensor_dimensions(tensor, value) -#define QNN_TENSOR_SET_MEM_TYPE(tensor, value) qnn::set_qnn_tensor_memtype(tensor, value) -#define QNN_TENSOR_SET_CLIENT_BUF(tensor, value) qnn::set_qnn_tensor_clientbuf(tensor, value) -#define QNN_TENSOR_SET_MEM_HANDLE(tensor, value) qnn::set_qnn_tensor_memhandle(tensor, value) +#define QNN_TENSOR_GET_RANK(tensor) qnn::get_qnn_tensor_rank(tensor) +#define QNN_TENSOR_GET_DIMENSIONS(tensor) qnn::get_qnn_tensor_dimensions(tensor) +#define QNN_TENSOR_GET_MEM_TYPE(tensor) qnn::get_qnn_tensor_memtype(tensor) +#define QNN_TENSOR_GET_MEM_HANDLE(tensor) qnn::get_qnn_tensor_memhandle(tensor) + +#define QNN_TENSOR_SET_ID(tensor, value) qnn::set_qnn_tensor_id(tensor, value) +#define QNN_TENSOR_SET_NAME(tensor, value) qnn::set_qnn_tensor_name(tensor, value) +#define QNN_TENSOR_SET_TYPE(tensor, value) qnn::set_qnn_tensor_type(tensor, value) +#define QNN_TENSOR_SET_DATA_FORMAT(tensor, value) qnn::set_qnn_tensor_dataformat(tensor, value) +#define QNN_TENSOR_SET_DATA_TYPE(tensor, value) qnn::set_qnn_tensor_datatype(tensor, value) +#define QNN_TENSOR_SET_QUANT_PARAMS(tensor, value) qnn::set_qnn_tensor_quantparams(tensor, value) +#define QNN_TENSOR_SET_RANK(tensor, value) qnn::set_qnn_tensor_rank(tensor, value) +#define QNN_TENSOR_SET_DIMENSIONS(tensor, value) qnn::set_qnn_tensor_dimensions(tensor, value) +#define QNN_TENSOR_SET_MEM_TYPE(tensor, value) qnn::set_qnn_tensor_memtype(tensor, value) +#define QNN_TENSOR_SET_CLIENT_BUF(tensor, value) qnn::set_qnn_tensor_clientbuf(tensor, value) +#define QNN_TENSOR_SET_MEM_HANDLE(tensor, value) qnn::set_qnn_tensor_memhandle(tensor, value) #define QNN_TENSOR_SET_DYN_DIMENSIONS(tensor, value) qnn::set_qnn_tensor_dyn_dimensions(tensor, value) From f289752664beecc7a1a1fb214a9aa65d8e6410e6 Mon Sep 17 00:00:00 2001 From: nullname Date: Fri, 28 Feb 2025 19:18:16 +0800 Subject: [PATCH 143/143] [bugfix]make sure single node op will have the same type (#29) * debug * disable 
reshape

* make sure single node op have same type

* fix warning at the logger

* Revert "disable reshape"

This reverts commit 5aeca4ba9bec6db3f047f9da803df20f9f6612b3.

---
 ggml/src/ggml-qnn/backend-ops.cpp | 31 +++++++++++++++++++++++++++----
 ggml/src/ggml-qnn/graph.cpp       |  2 +-
 ggml/src/ggml-qnn/logger.cpp      |  5 ++++-
 3 files changed, 32 insertions(+), 6 deletions(-)

diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp
index 3a401dd037b97..95fe35b465417 100644
--- a/ggml/src/ggml-qnn/backend-ops.cpp
+++ b/ggml/src/ggml-qnn/backend-ops.cpp
@@ -369,6 +369,31 @@ bool ggnl_qnn_supports_op_tensor(ggml_backend_qnn_device_context * ctx, const gg
     return true;
 }
 
+bool ggml_qnn_have_same_tensor_types(ggml_backend_qnn_device_context * ctx, const ggml_tensor * op) {
+    auto * src0 = op->src[0];
+    auto * src1 = op->src[1];
+    if (src1) {
+        if (src0->type != op->type || src1->type != op->type) {
+            QNN_LOG_DEBUG("[%s][%s]type src0(%s), src1(%s) and op(%s) are not equal\n",
+                          qnn::get_backend_name(ctx->device), ggml_op_name(op->op), ggml_type_name(src0->type),
+                          ggml_type_name(src1->type), ggml_type_name(op->type));
+            return false;
+        }
+    } else {
+        if (src0->type != op->type) {
+            QNN_LOG_DEBUG("[%s][%s]type src0(%s) and op(%s) are not equal\n", qnn::get_backend_name(ctx->device),
+                          ggml_op_name(op->op), ggml_type_name(src0->type), ggml_type_name(op->type));
+            return false;
+        }
+    }
+
+#ifdef NDEBUG
+    GGML_UNUSED(ctx);
+#endif
+
+    return true;
+}
+
 bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context * ctx, const ggml_tensor * op) {
     constexpr const size_t kMaxNpuTensorSize = 8192L * 2048 + 8192 * 512 + 2048 * 512;
     constexpr const auto get_tensor_size = [](const ggml_tensor * tensor) -> size_t {
@@ -393,10 +418,8 @@ bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context * ctx, const gg
             // fall through, from test here, the convert op is super slow on NPU:
             // https://github.com/usefulsensors/qc_npu_benchmark
         case QNN_BACKEND_GPU:
-            if (src0->type != src1->type || src0->type != op->type) {
+            if (ggml_qnn_have_same_tensor_types(ctx, op)) {
                 // there's no convert op for GPU.
-                QNN_LOG_DEBUG("[qnn-gpu][MUL_MAT]type src0(%s), src1(%s) and op(%s) are not equal\n",
-                              ggml_type_name(src0->type), ggml_type_name(src1->type), ggml_type_name(op->type));
                 return false;
             }
             break;
@@ -472,7 +495,7 @@ bool device_supports_op(ggml_backend_qnn_device_context * ctx, const ggml_tensor
             break;
 
         default:
-            // default to supported
+            is_op_supported = ggml_qnn_have_same_tensor_types(ctx, op);
             break;
     }
 }
diff --git a/ggml/src/ggml-qnn/graph.cpp b/ggml/src/ggml-qnn/graph.cpp
index b3ab161e9f6ca..2a282771c2a2d 100644
--- a/ggml/src/ggml-qnn/graph.cpp
+++ b/ggml/src/ggml-qnn/graph.cpp
@@ -246,7 +246,7 @@ qnn_graph::qnn_graph(const std::string & graph_name, QNNBackend device, std::sha
         return;
     }
 
-    QNN_LOG_INFO("[%s][%s]create succeed\n", get_backend_name(device), graph_name.c_str());
+    QNN_LOG_DEBUG("[%s][%s]create succeed\n", get_backend_name(device), graph_name.c_str());
     _graph_handle  = graph_handle;
     _qnn_interface = qnn_interface;
 }
diff --git a/ggml/src/ggml-qnn/logger.cpp b/ggml/src/ggml-qnn/logger.cpp
index 5418d03be45a4..0ffa12e7b1bb3 100644
--- a/ggml/src/ggml-qnn/logger.cpp
+++ b/ggml/src/ggml-qnn/logger.cpp
@@ -13,7 +13,7 @@ void qnn::sdk_logcallback(const char * fmt, QnnLog_Level_t level, uint64_t /*tim
     static std::mutex log_mutex;
     static char       s_ggml_qnn_logbuf[4096];
-    char log_level_desc = 'U';
+    char log_level_desc;
     switch (level) {
         case QNN_LOG_LEVEL_ERROR:
             log_level_desc = 'E';
             break;
@@ -30,6 +30,9 @@ void qnn::sdk_logcallback(const char * fmt, QnnLog_Level_t level, uint64_t /*tim
         case QNN_LOG_LEVEL_VERBOSE:
             log_level_desc = 'V';
             break;
+        default:
+            log_level_desc = 'U';
+            break;
     }
 
     {