From d5380f3af20cf5dd8d8a95854767891f29c5a4d1 Mon Sep 17 00:00:00 2001 From: arthw <14088817+arthw@users.noreply.github.com> Date: Tue, 30 Jul 2024 23:49:34 +0800 Subject: [PATCH 1/6] refactor device in sycl_device, restore ctx in create_queue --- ggml/src/ggml-sycl.cpp | 36 ++-- ggml/src/ggml-sycl/common.cpp | 316 +---------------------------- ggml/src/ggml-sycl/common.hpp | 88 +------- ggml/src/ggml-sycl/dpct/helper.hpp | 132 ++++++------ ggml/src/ggml-sycl/mmq.cpp | 20 +- ggml/src/ggml-sycl/sycl_device.cpp | 286 ++++++++++++++++++++++++++ ggml/src/ggml-sycl/sycl_device.hpp | 83 ++++++++ ggml/src/ggml-sycl/sycl_hw.cpp | 6 + ggml/src/ggml-sycl/sycl_hw.hpp | 7 +- 9 files changed, 494 insertions(+), 480 deletions(-) create mode 100644 ggml/src/ggml-sycl/sycl_device.cpp create mode 100644 ggml/src/ggml-sycl/sycl_device.hpp diff --git a/ggml/src/ggml-sycl.cpp b/ggml/src/ggml-sycl.cpp index 9b1ef70ad18e6..8a0b8ff639a79 100644 --- a/ggml/src/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl.cpp @@ -48,7 +48,7 @@ void ggml_sycl_get_device_description(int device, char * description, size_t d bool ggml_backend_is_sycl(ggml_backend_t backend); int ggml_backend_sycl_get_device(ggml_backend_t backend); static bool ggml_backend_buffer_is_sycl_split(ggml_backend_buffer_t buffer); - +static bool ggml_backend_buffer_is_sycl(ggml_backend_buffer_t buffer); void dev2dev_memcpy(sycl::queue &q_dst, sycl::queue &q_src, void *ptr_dst, const void *ptr_src, size_t size) { @@ -2279,11 +2279,11 @@ static int64_t get_row_rounding(ggml_type type, const std::array ggml_sycl_info().devices[id].cc) { - min_compute_capability = ggml_sycl_info().devices[id].cc; + if (min_compute_capability > ggml_sycl_info().device_infos[id].cc) { + min_compute_capability = ggml_sycl_info().device_infos[id].cc; } - if (max_compute_capability < ggml_sycl_info().devices[id].cc) { - max_compute_capability = ggml_sycl_info().devices[id].cc; + if (max_compute_capability < ggml_sycl_info().device_infos[id].cc) { + max_compute_capability = ggml_sycl_info().device_infos[id].cc; } } } @@ -2680,17 +2680,14 @@ static void ggml_sycl_set_peer_access(const int n_tokens, int main_device) { } #ifdef NDEBUG - for (int i = 0; i < ggml_sycl_info().device_count; ++i) { - int id = ggml_backend_sycl_get_device_id(i); + for (auto &id: ggml_sycl_info().ids) { SYCL_CHECK(ggml_sycl_set_device(id)); } - for (int i = 0; i < ggml_sycl_info().device_count; ++i) { - int id = ggml_backend_sycl_get_device_id(i); + for (auto &id: ggml_sycl_info().ids) { SYCL_CHECK(ggml_sycl_set_device(id)); - for (int i_other = 0; i_other < ggml_sycl_info().device_count; ++i_other) { - int id_other = ggml_backend_sycl_get_device_id(i_other); + for (auto &id_other: ggml_sycl_info().ids) { if (id == id_other) { continue; } @@ -2843,7 +2840,6 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten } else { dev[id].src1_ddf = dev[id].src1_ddf_alloc.alloc(ctx.pool(id), ggml_nelements(src1)); } - if (convert_src1_to_q8_1) { dev[id].src1_ddq = dev[id].src1_ddq_alloc.alloc(ctx.pool(id), nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs); @@ -3165,8 +3161,13 @@ static void ggml_sycl_pad(ggml_backend_sycl_context & ctx, const ggml_tensor * s static void ggml_sycl_rms_norm(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); + // log_tensor_with_cnt(ctx, "log/src0", src0, -1); + // log_tensor_with_cnt(ctx, "log/src1", src1, -1); + // log_tensor_with_cnt(ctx, "log/dst0", dst, -1); ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_rms_norm); + // log_tensor_with_cnt(ctx, "log/dst1", dst, -1); GGML_SYCL_DEBUG("call %s done\n", __func__); + // exit(1); } static void ggml_sycl_mul_mat_vec_p021(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, @@ -3417,12 +3418,12 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor continue; } - if (min_compute_capability > ggml_sycl_info().devices[id].cc) { - min_compute_capability = ggml_sycl_info().devices[id].cc; + if (min_compute_capability > ggml_sycl_info().device_infos[id].cc) { + min_compute_capability = ggml_sycl_info().device_infos[id].cc; } } } else { - min_compute_capability = ggml_sycl_info().devices[ctx.device].cc; + min_compute_capability = ggml_sycl_info().device_infos[ctx.device].cc; } // check data types and tensor shapes for custom matrix multiplication kernels: @@ -4332,7 +4333,6 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = { ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device_id) { static std::mutex mutex; std::lock_guard lock(mutex); - GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_buffer_type\n"); check_allow_device_id(device_id); @@ -4345,7 +4345,9 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device_id) { for (int i = 0; i < ggml_sycl_info().device_count; i++) { int id = ggml_backend_sycl_get_device_id(i); auto & device = dpct::dev_mgr::instance().get_device(id); - queue_ptr stream = &(device.default_queue()); + // queue_ptr stream = &(device.default_queue()); + queue_ptr stream = ggml_sycl_info().device_infos[id].qptrs[0]; + ggml_backend_sycl_buffer_types[id] = { /* .iface = */ ggml_backend_sycl_buffer_type_interface, /* .context = */ new ggml_backend_sycl_buffer_type_context{id, GGML_SYCL_NAME + std::to_string(id), stream}, diff --git a/ggml/src/ggml-sycl/common.cpp b/ggml/src/ggml-sycl/common.cpp index 3633edefac67a..efd702bf19991 100644 --- a/ggml/src/ggml-sycl/common.cpp +++ b/ggml/src/ggml-sycl/common.cpp @@ -20,12 +20,16 @@ void* ggml_sycl_host_malloc(size_t size) try { if (getenv("GGML_SYCL_NO_PINNED") != nullptr) { return nullptr; } - +// ggml_sycl_info().device_mgr->first_queue void* ptr = nullptr; // allow to use dpct::get_in_order_queue() for host malloc + auto q = dpct::get_in_order_queue(); +// sycl::queue q = *ggml_sycl_info().device_mgr->qptrs[0][0]; + dpct::err0 err = CHECK_TRY_ERROR( - ptr = (void*)sycl::malloc_host(size, dpct::get_in_order_queue())); + ptr = (void*)sycl::malloc_host(size, q)); +// printf("zjy ggml_sycl_host_malloc ptr=%p queue=%p size=%lu \n", ptr,q, size); if (err != 0) { // clear the error fprintf( @@ -66,27 +70,6 @@ static inline int get_sycl_env(const char *env_name, int default_val) { return user_number; } -static inline bool env_existed(const char *env_name) { - char *user_device_string = getenv(env_name); - return user_device_string!=NULL; -} - -static std::vector get_sycl_visible_devices() { - static std::vector device_ids; - char *devices_env = getenv("GGML_SYCL_VISIBLE_DEVICES"); - if (devices_env != nullptr) { - std::string devices(devices_env); - std::replace(devices.begin(), devices.end(), ',', ' '); - - std::stringstream ss(devices); - int tmp; - while (ss >> tmp) { - device_ids.push_back(tmp); - } - } - return device_ids; -} - void print_device_detail_part1(int id, sycl::device &device, std::string device_type) { dpct::device_info prop; @@ -193,8 +176,7 @@ static ggml_sycl_device_info ggml_sycl_init() try { initialized = true; } - static ggml_sycl_device_info info = {}; - info.refresh_device(); + static ggml_sycl_device_info info; if (info.device_count == 0) { fprintf(stderr, "%s: failed to initialize " GGML_SYCL_NAME ": no available device found\n", @@ -215,288 +197,4 @@ ggml_sycl_device_info &ggml_sycl_info() { return info; } -//--sycl_device_mgr-- - -sycl_device_mgr::sycl_device_mgr( - ggml_sycl_backend_device_filter device_filter) { - switch (device_filter) { - case SYCL_DEVICES_TOP_LEVEL_ZERO: - detect_sycl_gpu_list_with_max_cu(); - create_context_for_group_gpus(); - break; - case SYCL_ALL_DEVICES: - detect_all_sycl_device_list(); - create_context_for_devices(); - break; - case SYCL_VISIBLE_DEVICES: - detect_sycl_visible_device_list(); - create_context_for_devices(); - break; - default: - std::cerr << "sycl_device_mgr: Invalid device_filter " << device_filter - << std::endl; - } - init_allow_devices(); -} - -/* -Bind all gpus in same host with same context, for better performance in -device-to-device copy in the future. -*/ -void sycl_device_mgr::create_context_for_group_gpus() { - sycl::context ctx = sycl::context(devices); - assert(device_ids.size() > 0); - first_queue = _create_queue_ptr(devices[0]); - sycl::context ctx0 = first_queue->get_context(); - for (int i = 0; i < device_ids.size(); i++) { - ctxs.push_back(ctx0); - } -} - -sycl::queue *sycl_device_mgr::_create_queue_ptr(sycl::device device) { - auto q = dpct::get_current_device().create_queue(device); - return q; - // _queues.push_back(q); - // return & _queues.back(); -} - -sycl::queue *sycl_device_mgr::create_queue_for_device(sycl::device &device) { - dpct::select_device(dpct::dev_mgr::instance().get_device_id(device)); - auto qptr = _create_queue_ptr(device); - return qptr; -} - -sycl::queue *sycl_device_mgr::create_queue_for_device_id(int device_id) { - int i = get_device_index(device_id); - sycl::device device = dpct::dev_mgr::instance().get_device(device_id); - return create_queue_for_device(device); -} - -int sycl_device_mgr::get_device_index(int device_id) { - for (int i = 0; i < device_ids.size(); i++) { - if (device_ids[i] == device_id) - return i; - } - return -1; -} - -void sycl_device_mgr::create_context_for_devices() { - for (int i = 0; i < device_ids.size(); i++) { - sycl::context ctx = sycl::context(devices[i]); - ctxs.push_back(ctx); - } -} - -void sycl_device_mgr::init_allow_devices() { - device_list = ""; - for (size_t i = 0; i < device_ids.size(); ++i) { - device_list += std::to_string(device_ids[i]); - device_list += ","; - } - if (device_list.length() > 1) { - device_list.pop_back(); - } -} - -bool sycl_device_mgr::is_allowed_device(int device_id) { - return std::find(device_ids.begin(), device_ids.end(), device_id) != - device_ids.end(); -} - -void sycl_device_mgr::detect_all_sycl_device_list() try { - int device_count = dpct::dev_mgr::instance().device_count(); - - for (int id = 0; id < device_count; id++) { - add_device_info(id); - } - return; -} catch (sycl::exception const &exc) { - std::cerr << exc.what() << "Exception caught at file:" << __FILE__ - << ", line:" << __LINE__ << std::endl; - std::exit(1); -} - -void sycl_device_mgr::detect_sycl_visible_device_list() try { - std::vector sycl_devices = get_sycl_visible_devices(); - int device_count = dpct::dev_mgr::instance().device_count(); - - for (int i = 0; i < sycl_devices.size(); i++) { - int id = sycl_devices[i]; - if (id >= device_count) { - std::cerr << __func__ << ": invalid device_id:" << id - << " from GGML_SYCL_VISIBLE_DEVICES=" - << getenv("GGML_SYCL_VISIBLE_DEVICES") - << ", available IDs: "; - if (device_count > 1) { - std::cerr << "[0, " << device_count - 1 << "]"; - } else if (device_count == 1) { - std::cerr << "[0]"; - } else { - std::cerr << "[]"; - } - std::cerr << std::endl; - } - add_device_info(id); - } - return; -} catch (sycl::exception const &exc) { - std::cerr << exc.what() << "Exception caught at file:" << __FILE__ - << ", line:" << __LINE__ << std::endl; - std::exit(1); -} - -/* -Use all GPUs with same top max compute units -*/ -void sycl_device_mgr::detect_sycl_gpu_list_with_max_cu() try { - int device_count = dpct::dev_mgr::instance().device_count(); - int local_max_compute_units = 0; - for (int id = 0; id < device_count; id++) { - sycl::device device = dpct::dev_mgr::instance().get_device(id); - if (!device.is_gpu()) - continue; - dpct::device_info prop; - dpct::get_device_info(prop, device); - if (local_max_compute_units < prop.get_max_compute_units()) - local_max_compute_units = prop.get_max_compute_units(); - } - - for (int id = 0; id < device_count; id++) { - sycl::device device = dpct::dev_mgr::instance().get_device(id); - if (!device.is_gpu()) - continue; - dpct::device_info prop; - dpct::get_device_info(prop, device); - if (local_max_compute_units == prop.get_max_compute_units() && - is_ext_oneapi_device(device)) { - add_device_info(id); - } - } - return; -} catch (sycl::exception const &exc) { - std::cerr << exc.what() << "Exception caught at file:" << __FILE__ - << ", line:" << __LINE__ << std::endl; - std::exit(1); -} - -int sycl_device_mgr::get_device_count() { return (int)device_ids.size(); } - -bool sycl_device_mgr::is_ext_oneapi_device(const sycl::device &dev) { - sycl::backend dev_backend = dev.get_backend(); - if (dev_backend == sycl::backend::ext_oneapi_level_zero || - dev_backend == sycl::backend::ext_oneapi_cuda || - dev_backend == sycl::backend::ext_oneapi_hip) - return true; - return false; -} - -void sycl_device_mgr::add_device_info(int id) { - sycl::device device = dpct::dev_mgr::instance().get_device(id); - device_ids.push_back(id); - devices.push_back(device); - dpct::device_info prop; - dpct::get_device_info(prop, device); - work_group_sizes.push_back(prop.get_max_work_group_size()); - max_compute_units.push_back(prop.get_max_compute_units()); - hw_familys.push_back(get_device_family(&device)); -} - -//--sycl_device_mgr-- - -//--ggml_sycl_device_info-- -void ggml_sycl_device_info::print_gpu_device_list() { - GGML_ASSERT(device_mgr); - - char *hint = NULL; - if (oneapi_device_selector_existed && sycl_visible_devices_existed) { - hint = "detect %d SYCL devices:[%s] by ONEAPI_DEVICE_SELECTOR=%s and " - "GGML_SYCL_VISIBLE_DEVICES=%s\n"; - fprintf(stderr, hint, device_mgr->get_device_count(), devices_list(), - getenv("ONEAPI_DEVICE_SELECTOR"), - getenv("GGML_SYCL_VISIBLE_DEVICES")); - } else if (oneapi_device_selector_existed) { - hint = "detect %d SYCL devices:[%s] by ONEAPI_DEVICE_SELECTOR=%s\n"; - fprintf(stderr, hint, device_mgr->get_device_count(), devices_list(), - getenv("ONEAPI_DEVICE_SELECTOR")); - } else if (sycl_visible_devices_existed) { - hint = "detect %d SYCL devices:[%s] by GGML_SYCL_VISIBLE_DEVICES=%s\n"; - fprintf(stderr, hint, device_mgr->get_device_count(), devices_list(), - getenv("GGML_SYCL_VISIBLE_DEVICES")); - } else { - hint = "detect %d SYCL level-zero GPUs:[%s] with top Max compute " - "units:%d, to use any SYCL devices, set/export " - "GGML_SYCL_VISIBLE_DEVICES or ONEAPI_DEVICE_SELECTOR\n"; - fprintf(stderr, hint, device_mgr->get_device_count(), devices_list(), - device_mgr->max_compute_units[0]); - } -} - -int ggml_sycl_device_info::work_group_size(int device_id) { - GGML_ASSERT(device_mgr); - return device_mgr->work_group_sizes[device_id]; -} - -void ggml_sycl_device_info::refresh_device() { - oneapi_device_selector_existed = env_existed("ONEAPI_DEVICE_SELECTOR"); - sycl_visible_devices_existed = env_existed("GGML_SYCL_VISIBLE_DEVICES"); - if (!device_mgr) - delete device_mgr; - - if (sycl_visible_devices_existed) { - device_mgr = new sycl_device_mgr(SYCL_VISIBLE_DEVICES); - } else if (oneapi_device_selector_existed) { - device_mgr = new sycl_device_mgr(SYCL_ALL_DEVICES); - } else { - device_mgr = new sycl_device_mgr(SYCL_DEVICES_TOP_LEVEL_ZERO); - } - - device_count = device_mgr->get_device_count(); - - int64_t total_vram = 0; - - for (int i = 0; i < device_count; ++i) { - int id = get_device_id(i); - devices[id].vmm = 0; - dpct::device_info prop; - SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info( - prop, dpct::dev_mgr::instance().get_device(id)))); - - default_tensor_split[i] = - total_vram; // continue data, so use device index - total_vram += prop.get_global_mem_size(); - - devices[id].cc = - 100 * prop.get_major_version() + 10 * prop.get_minor_version(); - } - - for (int i = 0; i < device_count; ++i) { - default_tensor_split[i] /= - total_vram; // continue data, so use device index - } - - print_gpu_device_list(); -} - -bool ggml_sycl_device_info::is_allowed_device(int device_id) { - return device_mgr->is_allowed_device(device_id); -} - -const char *ggml_sycl_device_info::devices_list() { - return device_mgr->device_list.c_str(); -} - -int ggml_sycl_device_info::get_device_id(int device_index) { - if (device_index < device_mgr->device_ids.size()) { - return device_mgr->device_ids.at(device_index); - } else { - std::cerr << __func__ << ":SYCL device:" << device_index - << " is out of range:[" << devices_list() << "]" << std::endl; - std::exit(1); - } -} - -int ggml_sycl_device_info::hw_family(int device_id) { - return device_mgr->hw_familys[device_id]; -} - //--ggml_sycl_device_info-- diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp index 00be4f70274f0..f0c93a52d84e4 100644 --- a/ggml/src/ggml-sycl/common.hpp +++ b/ggml/src/ggml-sycl/common.hpp @@ -21,12 +21,13 @@ #include "ggml-sycl.h" #include "presets.hpp" #include "sycl_hw.hpp" +#include "sycl_device.hpp" #define GGML_COMMON_DECL_SYCL #define GGML_COMMON_IMPL_SYCL #include "ggml-common.h" -void* ggml_sycl_host_malloc(size_t size); + void ggml_sycl_host_free(void* ptr); static int g_ggml_sycl_debug = 0; @@ -86,12 +87,6 @@ enum ggml_sycl_backend_gpu_mode { SYCL_MUL_GPU_MODE }; -enum ggml_sycl_backend_device_filter { - SYCL_ALL_DEVICES = 0, - SYCL_DEVICES_TOP_LEVEL_ZERO, - SYCL_VISIBLE_DEVICES -}; - static_assert(sizeof(sycl::half) == sizeof(ggml_fp16_t), "wrong fp16 size"); static void crash() { @@ -169,10 +164,10 @@ inline dpct::err0 ggml_sycl_set_device(const int device_id) try { int current_device_id; SYCL_CHECK(CHECK_TRY_ERROR(current_device_id = get_current_device_id())); - GGML_SYCL_DEBUG("ggml_sycl_set_device device_id=%d, current_device_id=%d\n", device_id, current_device_id); if (device_id == current_device_id) { return 0; } + GGML_SYCL_DEBUG("ggml_sycl_set_device device_id=%d, current_device_id=%d\n", device_id, current_device_id); return CHECK_TRY_ERROR(dpct::select_device(device_id)); @@ -183,67 +178,6 @@ inline dpct::err0 ggml_sycl_set_device(const int device_id) try { std::exit(1); } - -class sycl_device_mgr { - public: - std::vector device_ids; - std::vector devices; - std::vector max_compute_units; - std::vector work_group_sizes; - std::vector hw_familys; - - sycl::queue *first_queue; - std::vector _queues; - std::vector ctxs; - std::string device_list = ""; - - sycl_device_mgr(ggml_sycl_backend_device_filter device_filter); - - sycl::queue *_create_queue_ptr(sycl::device device); //internal API to hide dpct API. - void create_context_for_group_gpus(); - sycl::queue *create_queue_for_device(sycl::device &device); - sycl::queue *create_queue_for_device_id(int device_id); - int get_device_index(int device_id); - void create_context_for_devices(); - void init_allow_devices(); - bool is_allowed_device(int device_id); - void detect_all_sycl_device_list(); - void detect_sycl_visible_device_list(); - void detect_sycl_gpu_list_with_max_cu(); - int get_device_count(); - bool is_ext_oneapi_device(const sycl::device &dev); - void add_device_info(int id); -}; - - -struct ggml_sycl_device_info { - int device_count; - bool oneapi_device_selector_existed = false; - bool sycl_visible_devices_existed = false; - - struct sycl_device_info { - int cc; // compute capability - // int nsm; // number of streaming multiprocessors - // size_t smpb; // max. shared memory per block - bool vmm; // virtual memory support - size_t total_vram; - }; - - sycl_device_info devices[GGML_SYCL_MAX_DEVICES] = {}; - - std::array default_tensor_split = {}; - - sycl_device_mgr *device_mgr = NULL; - - void print_gpu_device_list(); - int work_group_size(int device_id); - void refresh_device(); - bool is_allowed_device(int device_id); - const char* devices_list(); - int get_device_id(int device_index); - int hw_family(int device_id); -}; - struct ggml_sycl_pool { virtual ~ggml_sycl_pool() = default; @@ -309,17 +243,17 @@ struct ggml_backend_sycl_context { queue_ptr qptrs[GGML_SYCL_MAX_DEVICES][GGML_SYCL_MAX_STREAMS] = { { nullptr } }; - explicit ggml_backend_sycl_context(struct ggml_sycl_device_info &sycl_device_info, int device_id) : - device(device_id), + explicit ggml_backend_sycl_context(struct ggml_sycl_device_info &sycl_device_info, int id) : + device(id), name(GGML_SYCL_NAME + std::to_string(device)) { for (int i=0;icreate_queue_for_device_id(device_id); + qptrs[id][i] = sycl_device_info.device_infos[id].qptrs[i]; } } - queue_ptr stream(int device, int stream) { - assert(qptrs[device][stream] != nullptr); - return qptrs[device][stream]; + queue_ptr stream(int id, int stream) { + assert(qptrs[id][stream] != nullptr); + return qptrs[id][stream]; } queue_ptr stream() { @@ -349,10 +283,10 @@ static inline void exit_with_stack_print() { static inline int get_sycl_env(const char *env_name, int default_val); -static inline bool env_existed(const char *env_name); + void* ggml_sycl_host_malloc(size_t size); void ggml_sycl_host_free(void* ptr); -static std::vector get_sycl_visible_devices(); + void ggml_backend_sycl_print_sycl_devices(); static ggml_sycl_device_info ggml_sycl_init(); ggml_sycl_device_info &ggml_sycl_info(); diff --git a/ggml/src/ggml-sycl/dpct/helper.hpp b/ggml/src/ggml-sycl/dpct/helper.hpp index d93f118184524..d4f3a7edc2a2d 100644 --- a/ggml/src/ggml-sycl/dpct/helper.hpp +++ b/ggml/src/ggml-sycl/dpct/helper.hpp @@ -592,16 +592,18 @@ namespace dpct class device_ext : public sycl::device { typedef std::mutex mutex_type; - public: - device_ext() : sycl::device() {} - ~device_ext() { - std::lock_guard lock(m_mutex); - clear_queues(); - } - device_ext(const sycl::device &base) : sycl::device(base) { - std::lock_guard lock(m_mutex); - init_queues(); - } + public: + device_ext() : sycl::device(), _ctx(*this) {} + ~device_ext() + { + std::lock_guard lock(m_mutex); + clear_queues(); + } + device_ext(const sycl::device &base) : sycl::device(base), _ctx(*this) + { + std::lock_guard lock(m_mutex); + init_queues(); + } int is_native_atomic_supported() { return 0; } int get_major_version() const { return dpct::get_major_version(*this); } @@ -711,10 +713,10 @@ namespace dpct return create_in_order_queue(enable_exception_handler); } - sycl::queue *create_queue(sycl::device device, - bool enable_exception_handler = false) { - return create_in_order_queue(device, enable_exception_handler); - } + sycl::queue *create_queue(sycl::context context, sycl::device device, + bool enable_exception_handler = false) { + return create_in_order_queue(context, device, enable_exception_handler); + } sycl::queue *create_in_order_queue(bool enable_exception_handler = false) { std::lock_guard lock(m_mutex); @@ -722,12 +724,12 @@ namespace dpct sycl::property::queue::in_order()); } - sycl::queue *create_in_order_queue(sycl::device device, + sycl::queue *create_in_order_queue(sycl::context context, sycl::device device, bool enable_exception_handler = false) { - std::lock_guard lock(m_mutex); - return create_queue_impl(device, enable_exception_handler, - sycl::property::queue::in_order()); - } + std::lock_guard lock(m_mutex); + return create_queue_impl(context, device, enable_exception_handler, + sycl::property::queue::in_order()); + } sycl::queue *create_out_of_order_queue( bool enable_exception_handler = false) { @@ -735,24 +737,28 @@ namespace dpct return create_queue_impl(enable_exception_handler); } - void destroy_queue(sycl::queue *&queue) { - std::lock_guard lock(m_mutex); - _queues.erase(std::remove_if(_queues.begin(), _queues.end(), - [=](const std::shared_ptr &q) -> bool - { - return q.get() == queue; - }), - _queues.end()); - queue = nullptr; - } - void set_saved_queue(sycl::queue *q) { - std::lock_guard lock(m_mutex); - _saved_queue = q; - } - sycl::queue *get_saved_queue() const { - std::lock_guard lock(m_mutex); - return _saved_queue; - } + void destroy_queue(sycl::queue *&queue) + { + std::lock_guard lock(m_mutex); + _queues.erase(std::remove_if(_queues.begin(), _queues.end(), + [=](const std::shared_ptr &q) -> bool + { + return q.get() == queue; + }), + _queues.end()); + queue = nullptr; + } + void set_saved_queue(sycl::queue *q) + { + std::lock_guard lock(m_mutex); + _saved_queue = q; + } + sycl::queue *get_saved_queue() const + { + std::lock_guard lock(m_mutex); + return _saved_queue; + } + sycl::context get_context() const { return _ctx; } private: void clear_queues() { @@ -767,18 +773,19 @@ namespace dpct _saved_queue = &default_queue(); } - /// Caller should acquire resource \p m_mutex before calling this - /// function. - template - sycl::queue *create_queue_impl(bool enable_exception_handler, - Properties... properties) { - sycl::async_handler eh = {}; - if (enable_exception_handler) { - eh = exception_handler; - } - _queues.push_back(std::make_shared( - *this, eh, - sycl::property_list( + /// Caller should acquire resource \p m_mutex before calling this function. + template + sycl::queue *create_queue_impl(bool enable_exception_handler, + Properties... properties) + { + sycl::async_handler eh = {}; + if (enable_exception_handler) + { + eh = exception_handler; + } + _queues.push_back(std::make_shared( + _ctx, *this, eh, + sycl::property_list( #ifdef DPCT_PROFILING_ENABLED sycl::property::queue::enable_profiling(), #endif @@ -787,21 +794,21 @@ namespace dpct return _queues.back().get(); } - template - sycl::queue *create_queue_impl(sycl::device device, + template + sycl::queue *create_queue_impl(sycl::context context, sycl::device device, bool enable_exception_handler, Properties... properties) { - sycl::async_handler eh = {}; - if (enable_exception_handler) { - eh = exception_handler; - } - _queues.push_back(std::make_shared( - device, eh, - sycl::property_list( -#ifdef DPCT_PROFILING_ENABLED - sycl::property::queue::enable_profiling(), -#endif - properties...))); + sycl::async_handler eh = {}; + if (enable_exception_handler) { + eh = exception_handler; + } + _queues.push_back(std::make_shared( + context, device, eh, + sycl::property_list( + #ifdef DPCT_PROFILING_ENABLED + sycl::property::queue::enable_profiling(), + #endif + properties...))); return _queues.back().get(); } @@ -811,6 +818,7 @@ namespace dpct } sycl::queue *_q_in_order, *_q_out_of_order; sycl::queue *_saved_queue; + sycl::context _ctx; std::vector> _queues; mutable mutex_type m_mutex; }; diff --git a/ggml/src/ggml-sycl/mmq.cpp b/ggml/src/ggml-sycl/mmq.cpp index e952533d310ec..0a6a5456c47ee 100644 --- a/ggml/src/ggml-sycl/mmq.cpp +++ b/ggml/src/ggml-sycl/mmq.cpp @@ -1779,7 +1779,7 @@ static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy, int id; SYCL_CHECK( CHECK_TRY_ERROR(id = get_current_device_id())); - const int compute_capability = ggml_sycl_info().devices[id].cc; + const int compute_capability = ggml_sycl_info().device_infos[id].cc; int mmq_x, mmq_y, nwarps; if (compute_capability >= VER_GEN13) { @@ -1894,7 +1894,7 @@ static void ggml_mul_mat_q4_1_q8_1_sycl(const void *vx, const void *vy, int id; SYCL_CHECK( CHECK_TRY_ERROR(id = get_current_device_id())); - const int compute_capability = ggml_sycl_info().devices[id].cc; + const int compute_capability = ggml_sycl_info().device_infos[id].cc; int mmq_x, mmq_y, nwarps; if (compute_capability >= VER_GEN13) { @@ -2009,7 +2009,7 @@ static void ggml_mul_mat_q5_0_q8_1_sycl(const void *vx, const void *vy, int id; SYCL_CHECK( CHECK_TRY_ERROR(id = get_current_device_id())); - const int compute_capability = ggml_sycl_info().devices[id].cc; + const int compute_capability = ggml_sycl_info().device_infos[id].cc; int mmq_x, mmq_y, nwarps; if (compute_capability >= VER_GEN13) { @@ -2124,7 +2124,7 @@ static void ggml_mul_mat_q5_1_q8_1_sycl(const void *vx, const void *vy, int id; SYCL_CHECK( CHECK_TRY_ERROR(id = get_current_device_id())); - const int compute_capability = ggml_sycl_info().devices[id].cc; + const int compute_capability = ggml_sycl_info().device_infos[id].cc; int mmq_x, mmq_y, nwarps; if (compute_capability >= VER_GEN13) { @@ -2239,7 +2239,7 @@ static void ggml_mul_mat_q8_0_q8_1_sycl(const void *vx, const void *vy, int id; SYCL_CHECK( CHECK_TRY_ERROR(id = get_current_device_id())); - const int compute_capability = ggml_sycl_info().devices[id].cc; + const int compute_capability = ggml_sycl_info().device_infos[id].cc; int mmq_x, mmq_y, nwarps; if (compute_capability >= VER_GEN13) { @@ -2354,7 +2354,7 @@ static void ggml_mul_mat_q2_K_q8_1_sycl(const void *vx, const void *vy, int id; SYCL_CHECK( CHECK_TRY_ERROR(id = get_current_device_id())); - const int compute_capability = ggml_sycl_info().devices[id].cc; + const int compute_capability = ggml_sycl_info().device_infos[id].cc; int mmq_x, mmq_y, nwarps; if (compute_capability >= VER_GEN13) { @@ -2477,7 +2477,7 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy, int id; SYCL_CHECK( CHECK_TRY_ERROR(id = get_current_device_id())); - const int compute_capability = ggml_sycl_info().devices[id].cc; + const int compute_capability = ggml_sycl_info().device_infos[id].cc; int mmq_x, mmq_y, nwarps; if (compute_capability >= VER_GEN13) { @@ -2605,7 +2605,7 @@ static void ggml_mul_mat_q4_K_q8_1_sycl(const void *vx, const void *vy, int id; SYCL_CHECK( CHECK_TRY_ERROR(id = get_current_device_id())); - const int compute_capability = ggml_sycl_info().devices[id].cc; + const int compute_capability = ggml_sycl_info().device_infos[id].cc; int mmq_x, mmq_y, nwarps; if (compute_capability >= VER_GEN13) { @@ -2726,7 +2726,7 @@ static void ggml_mul_mat_q5_K_q8_1_sycl(const void *vx, const void *vy, int id; SYCL_CHECK( CHECK_TRY_ERROR(id = get_current_device_id())); - const int compute_capability = ggml_sycl_info().devices[id].cc; + const int compute_capability = ggml_sycl_info().device_infos[id].cc; int mmq_x, mmq_y, nwarps; if (compute_capability >= VER_GEN13) { @@ -2847,7 +2847,7 @@ static void ggml_mul_mat_q6_K_q8_1_sycl(const void *vx, const void *vy, int id; SYCL_CHECK( CHECK_TRY_ERROR(id = get_current_device_id())); - const int compute_capability = ggml_sycl_info().devices[id].cc; + const int compute_capability = ggml_sycl_info().device_infos[id].cc; int mmq_x, mmq_y, nwarps; if (compute_capability >= VER_GEN13) { diff --git a/ggml/src/ggml-sycl/sycl_device.cpp b/ggml/src/ggml-sycl/sycl_device.cpp new file mode 100644 index 0000000000000..e2633d729671e --- /dev/null +++ b/ggml/src/ggml-sycl/sycl_device.cpp @@ -0,0 +1,286 @@ +#include "sycl_device.hpp" +#include "sycl_hw.hpp" + + +void ggml_sycl_device_info::init( + ggml_sycl_backend_device_filter device_filter) { + switch (device_filter) { + case SYCL_DEVICES_TOP_LEVEL_ZERO: + detect_sycl_gpu_list_with_max_cu(); + create_context_for_devices(); + break; + case SYCL_ALL_DEVICES: + detect_all_sycl_device_list(); + create_context_for_devices(); + break; + case SYCL_VISIBLE_DEVICES: + detect_sycl_visible_device_list(); + create_context_for_devices(); + break; + default: + std::cerr << "ggml_sycl_device_info: Invalid device_filter " << device_filter + << std::endl; + } + init_allow_devices(); + device_count = ids.size(); +} + +/* +Bind all devices in same host with same context, for better performance in +device-to-device copy in the future. +*/ +void ggml_sycl_device_info::create_context_for_devices() { + assert(devices.size() > 0); + sycl::context ctx = sycl::context(devices); + first_queue = dpct::get_current_device().create_queue(ctx, devices[0]); + co_ctx = first_queue->get_context(); +} + +sycl::queue *ggml_sycl_device_info::_create_queue_ptr(sycl::device device) { + auto q = dpct::get_current_device().create_queue(co_ctx, device); + return q; +} + +sycl::queue *ggml_sycl_device_info::create_queue_for_device(sycl::device &device) { + dpct::select_device(dpct::dev_mgr::instance().get_device_id(device)); + auto qptr = _create_queue_ptr(device); + return qptr; +} + +sycl::queue *ggml_sycl_device_info::create_queue_for_device_id(int id) { + sycl::device device = dpct::dev_mgr::instance().get_device(id); + return create_queue_for_device(device); +} + +int ggml_sycl_device_info::get_device_index(int id) { + for (int i = 0; i < ids.size(); i++) { + if (ids[i] == id) + return i; + } + return -1; +} + +void ggml_sycl_device_info::init_allow_devices() { + device_list = ""; + for (auto & id: ids) { + device_list += std::to_string(id); + device_list += ","; + } + if (device_list.length() > 1) { + device_list.pop_back(); + } +} + +bool ggml_sycl_device_info::is_allowed_device(int id) { + return std::find(ids.begin(), ids.end(), id) != ids.end(); +} + +void ggml_sycl_device_info::detect_all_sycl_device_list() try { + int all_device_count = dpct::dev_mgr::instance().device_count(); + + for (int id = 0; id < all_device_count; id++) { + add_device_info(id); + } + return; +} catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +std::vector ggml_sycl_device_info::get_sycl_visible_devices() { + static std::vector device_ids; + char *devices_env = getenv("GGML_SYCL_VISIBLE_DEVICES"); + if (devices_env != nullptr) { + std::string devices(devices_env); + std::replace(devices.begin(), devices.end(), ',', ' '); + + std::stringstream ss(devices); + int tmp; + while (ss >> tmp) { + device_ids.push_back(tmp); + } + } + return device_ids; +} + +void ggml_sycl_device_info::detect_sycl_visible_device_list() try { + std::vector sycl_devices = get_sycl_visible_devices(); + int all_device_count = dpct::dev_mgr::instance().device_count(); + + for (auto & id: sycl_devices) { + if (id >= all_device_count) { + std::cerr << __func__ << ": invalid device_id:" << id + << " from GGML_SYCL_VISIBLE_DEVICES=" + << getenv("GGML_SYCL_VISIBLE_DEVICES") + << ", available IDs: "; + if (all_device_count > 1) { + std::cerr << "[0, " << all_device_count - 1 << "]"; + } else if (all_device_count == 1) { + std::cerr << "[0]"; + } else { + std::cerr << "[]"; + } + std::cerr << std::endl; + } + add_device_info(id); + } + return; +} catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +/* +Use all GPUs with same top max compute units +*/ +void ggml_sycl_device_info::detect_sycl_gpu_list_with_max_cu() try { + int all_device_count = dpct::dev_mgr::instance().device_count(); + int local_max_compute_units = 0; + for (int id = 0; id < all_device_count; id++) { + sycl::device device = dpct::dev_mgr::instance().get_device(id); + if (!device.is_gpu()) + continue; + dpct::device_info prop; + dpct::get_device_info(prop, device); + if (local_max_compute_units < prop.get_max_compute_units()) + local_max_compute_units = prop.get_max_compute_units(); + } + + for (int id = 0; id < all_device_count; id++) { + sycl::device device = dpct::dev_mgr::instance().get_device(id); + if (!device.is_gpu()) + continue; + dpct::device_info prop; + dpct::get_device_info(prop, device); + if (local_max_compute_units == prop.get_max_compute_units() && + is_ext_oneapi_device(device)) { + add_device_info(id); + } + } + return; +} catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +int ggml_sycl_device_info::get_device_count() { return device_count; } + +bool ggml_sycl_device_info::is_ext_oneapi_device(const sycl::device &dev) { + sycl::backend dev_backend = dev.get_backend(); + if (dev_backend == sycl::backend::ext_oneapi_level_zero || + dev_backend == sycl::backend::ext_oneapi_cuda || + dev_backend == sycl::backend::ext_oneapi_hip) + return true; + return false; +} + +void ggml_sycl_device_info::add_device_info(int id) { + sycl::device device = dpct::dev_mgr::instance().get_device(id); + dpct::device_info prop; + dpct::get_device_info(prop, device); + + ids.push_back(id); + devices.push_back(device); + + device_infos[id].id = id; + device_infos[id].device = device; + device_infos[id].max_work_group_sizes = prop.get_max_work_group_size(); + device_infos[id].max_compute_units = prop.get_max_compute_units(); + device_infos[id].hw_family = get_device_family(&device); + for (int i=0; i +#include +#include + + +#include +#include "dpct/helper.hpp" + +#include "ggml-sycl.h" +#include "presets.hpp" +// #include "common.hpp" + +enum ggml_sycl_backend_device_filter { + SYCL_ALL_DEVICES = 0, + SYCL_DEVICES_TOP_LEVEL_ZERO, + SYCL_VISIBLE_DEVICES +}; + +struct sycl_device_info { + int cc; // compute capability + // int nsm; // number of streaming multiprocessors + // size_t smpb; // max. shared memory per block + bool vmm; // virtual memory support + size_t total_vram; + + int id; + sycl::device device; + int max_compute_units; + int max_work_group_sizes; + int hw_family; + sycl::context ctx; + sycl::queue * qptrs[GGML_SYCL_MAX_STREAMS] = { nullptr }; +}; + +struct ggml_sycl_device_info { + int device_count; + bool oneapi_device_selector_existed = false; + bool sycl_visible_devices_existed = false; + std::vector ids; + std::vector devices; + sycl::queue *first_queue; + std::string device_list; + sycl::context co_ctx; + + sycl_device_info device_infos[GGML_SYCL_MAX_DEVICES]; + std::array default_tensor_split = {}; + + ggml_sycl_device_info(); + void init(ggml_sycl_backend_device_filter device_filter); + + void print_gpu_device_list(); + int work_group_size(int device_id); + bool is_allowed_device(int device_id); + const char* devices_list(); + int get_device_id(int device_index); + int hw_family(int device_id); + + sycl::queue *_create_queue_ptr(sycl::device device); //internal API to hide dpct API. + void create_context_for_group_gpus(); + sycl::queue *create_queue_for_device(sycl::device &device); + sycl::queue *create_queue_for_device_id(int device_id); + int get_device_index(int device_id); + void create_context_for_devices(); + void init_allow_devices(); + void detect_all_sycl_device_list(); + void detect_sycl_visible_device_list(); + void detect_sycl_gpu_list_with_max_cu(); + int get_device_count(); + bool is_ext_oneapi_device(const sycl::device &dev); + void add_device_info(int id); + std::vector get_devices(); + std::vector get_sycl_visible_devices(); + + sycl::context &get_co_ctx() { return co_ctx; } + +}; + +static inline bool env_existed(const char *env_name); + +#endif // SYCL_DEVICE_HPP diff --git a/ggml/src/ggml-sycl/sycl_hw.cpp b/ggml/src/ggml-sycl/sycl_hw.cpp index 21e0c2af90191..af7675adb3270 100644 --- a/ggml/src/ggml-sycl/sycl_hw.cpp +++ b/ggml/src/ggml-sycl/sycl_hw.cpp @@ -8,8 +8,14 @@ SYCL_HW_FAMILY get_device_family(sycl::device *device_ptr) { auto id = device_ptr->get_info(); auto id_prefix = id & 0xff00; + const std::vector Xe_ARC = {0x5600, 0x4f00}; + const std::vector Xe_Iris_IDs = {0x4900, 0xa700}; + const std::vector UHD_IDs = {0x4600}; + if (is_in_vector(Xe_Iris_IDs, id_prefix) or is_in_vector(UHD_IDs, id_prefix)) { return SYCL_HW_FAMILY_INTEL_IGPU; + } else if (is_in_vector(Xe_ARC, id_prefix)) { + return SYCL_HW_FAMILY_INTEL_ARC; } else { std::cerr << "No support PCI_ID: " << std::hex << id << std::endl; return SYCL_HW_FAMILY_UNKNOWN; diff --git a/ggml/src/ggml-sycl/sycl_hw.hpp b/ggml/src/ggml-sycl/sycl_hw.hpp index 81944475663a4..4537955d01669 100644 --- a/ggml/src/ggml-sycl/sycl_hw.hpp +++ b/ggml/src/ggml-sycl/sycl_hw.hpp @@ -8,13 +8,10 @@ #include -// const int Xe_ARC[] = {0x5600, 0x4f}; -const std::vector Xe_Iris_IDs = {0x4900, 0xa700}; -const std::vector UHD_IDs = {0x4600}; - enum SYCL_HW_FAMILY { SYCL_HW_FAMILY_UNKNOWN = -1, - SYCL_HW_FAMILY_INTEL_IGPU = 0 + SYCL_HW_FAMILY_INTEL_IGPU = 0, + SYCL_HW_FAMILY_INTEL_ARC = 1 }; bool is_in_vector(std::vector &vec, int item); From 1947c1200e1f5fd40ca8606454fe097efd3505aa Mon Sep 17 00:00:00 2001 From: arthw <14088817+arthw@users.noreply.github.com> Date: Thu, 1 Aug 2024 11:21:16 +0800 Subject: [PATCH 2/6] support set main gpu --- ggml/include/ggml-sycl.h | 2 + ggml/src/ggml-sycl.cpp | 8 +++ ggml/src/ggml-sycl/common.cpp | 8 +-- ggml/src/ggml-sycl/common.hpp | 10 +--- ggml/src/ggml-sycl/sycl_device.cpp | 86 +++++++++++++++++++++++++----- ggml/src/ggml-sycl/sycl_device.hpp | 20 ++++++- src/llama.cpp | 7 ++- 7 files changed, 111 insertions(+), 30 deletions(-) diff --git a/ggml/include/ggml-sycl.h b/ggml/include/ggml-sycl.h index a50086afae9eb..4368e27a57178 100644 --- a/ggml/include/ggml-sycl.h +++ b/ggml/include/ggml-sycl.h @@ -38,6 +38,8 @@ GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id); GGML_API GGML_CALL int ggml_backend_sycl_get_device_id(int index); GGML_API GGML_CALL void ggml_sycl_set_single_device(int main_gpu_id); +GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id); + // SYCL doesn't support registering host memory, keep here for reference // GGML_API GGML_CALL bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size); // GGML_API GGML_CALL void ggml_backend_sycl_unregister_host_buffer(void * buffer); diff --git a/ggml/src/ggml-sycl.cpp b/ggml/src/ggml-sycl.cpp index 8a0b8ff639a79..77c7f610f3966 100644 --- a/ggml/src/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl.cpp @@ -38,6 +38,7 @@ #include "ggml-sycl/backend.hpp" #include "ggml-sycl/presets.hpp" +#include "ggml-sycl/sycl_device.hpp" void ggml_sycl_free_data(struct ggml_tensor * tensor); @@ -5150,6 +5151,13 @@ GGML_CALL int ggml_backend_sycl_get_device_count() { return ggml_sycl_info().device_count; } +GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id) { + + GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_set_single_device_mode\n"); + fprintf(stderr, "ggml_backend_sycl_set_single_device: use single device: [%d]\n", main_gpu_id); + ggml_sycl_info(main_gpu_id); +} + GGML_CALL static ggml_backend_t ggml_backend_reg_sycl_init(const char * params, void * user_data) { ggml_backend_t sycl_backend = ggml_backend_sycl_init((int) (intptr_t) user_data); return sycl_backend; diff --git a/ggml/src/ggml-sycl/common.cpp b/ggml/src/ggml-sycl/common.cpp index efd702bf19991..da7d8c60eb035 100644 --- a/ggml/src/ggml-sycl/common.cpp +++ b/ggml/src/ggml-sycl/common.cpp @@ -137,7 +137,7 @@ void ggml_backend_sycl_print_sycl_devices() { } } -static ggml_sycl_device_info ggml_sycl_init() try { +static ggml_sycl_device_info ggml_sycl_init(int main_gpu_id) try { static bool initialized = false; if (!initialized) { @@ -176,7 +176,7 @@ static ggml_sycl_device_info ggml_sycl_init() try { initialized = true; } - static ggml_sycl_device_info info; + static ggml_sycl_device_info info(main_gpu_id); if (info.device_count == 0) { fprintf(stderr, "%s: failed to initialize " GGML_SYCL_NAME ": no available device found\n", @@ -192,8 +192,8 @@ static ggml_sycl_device_info ggml_sycl_init() try { std::exit(1); } -ggml_sycl_device_info &ggml_sycl_info() { - static ggml_sycl_device_info info = ggml_sycl_init(); +ggml_sycl_device_info &ggml_sycl_info(int main_gpu_id) { + static ggml_sycl_device_info info = ggml_sycl_init(main_gpu_id); return info; } diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp index f0c93a52d84e4..5b60835a23cc0 100644 --- a/ggml/src/ggml-sycl/common.hpp +++ b/ggml/src/ggml-sycl/common.hpp @@ -81,12 +81,6 @@ static int g_ggml_sycl_debug = 0; typedef sycl::queue *queue_ptr; -enum ggml_sycl_backend_gpu_mode { - SYCL_UNSET_GPU_MODE = -1, - SYCL_SINGLE_GPU_MODE = 0, - SYCL_MUL_GPU_MODE -}; - static_assert(sizeof(sycl::half) == sizeof(ggml_fp16_t), "wrong fp16 size"); static void crash() { @@ -288,8 +282,8 @@ void* ggml_sycl_host_malloc(size_t size); void ggml_sycl_host_free(void* ptr); void ggml_backend_sycl_print_sycl_devices(); -static ggml_sycl_device_info ggml_sycl_init(); -ggml_sycl_device_info &ggml_sycl_info(); +static ggml_sycl_device_info ggml_sycl_init(int main_gpu_id); +ggml_sycl_device_info &ggml_sycl_info(int main_gpu_id = -1); // common device functions diff --git a/ggml/src/ggml-sycl/sycl_device.cpp b/ggml/src/ggml-sycl/sycl_device.cpp index e2633d729671e..fd8c045994b98 100644 --- a/ggml/src/ggml-sycl/sycl_device.cpp +++ b/ggml/src/ggml-sycl/sycl_device.cpp @@ -7,22 +7,54 @@ void ggml_sycl_device_info::init( switch (device_filter) { case SYCL_DEVICES_TOP_LEVEL_ZERO: detect_sycl_gpu_list_with_max_cu(); - create_context_for_devices(); break; case SYCL_ALL_DEVICES: detect_all_sycl_device_list(); - create_context_for_devices(); break; case SYCL_VISIBLE_DEVICES: detect_sycl_visible_device_list(); - create_context_for_devices(); break; default: std::cerr << "ggml_sycl_device_info: Invalid device_filter " << device_filter << std::endl; } - init_allow_devices(); + init_devices_dynamic_info(); + m_device_filter = device_filter; +} + +void ggml_sycl_device_info::clear_device_infos() { + ids.clear(); + devices.clear(); + + for (int id=0;id default_tensor_split = {}; - ggml_sycl_device_info(); + ggml_sycl_device_info(int main_gpu_id);//single device mode + void init(ggml_sycl_backend_device_filter device_filter); + void init_single_mode(int main_gpu_id); + void clear_device_infos(); void print_gpu_device_list(); int work_group_size(int device_id); bool is_allowed_device(int device_id); @@ -64,15 +76,19 @@ struct ggml_sycl_device_info { sycl::queue *create_queue_for_device_id(int device_id); int get_device_index(int device_id); void create_context_for_devices(); - void init_allow_devices(); + void set_allow_devices(); void detect_all_sycl_device_list(); void detect_sycl_visible_device_list(); void detect_sycl_gpu_list_with_max_cu(); int get_device_count(); bool is_ext_oneapi_device(const sycl::device &dev); void add_device_info(int id); + void create_queues(int id); + void create_queues_for_devices(); std::vector get_devices(); std::vector get_sycl_visible_devices(); + void update_mem(); + void init_devices_dynamic_info(); sycl::context &get_co_ctx() { return co_ctx; } diff --git a/src/llama.cpp b/src/llama.cpp index 7bb2dfd4625b3..c2a914addff0a 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -2831,8 +2831,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_ #elif defined(GGML_USE_VULKAN) buft = ggml_backend_vk_buffer_type(gpu); #elif defined(GGML_USE_SYCL) - int gpu_id = ggml_backend_sycl_get_device_id(gpu); - buft = ggml_backend_sycl_buffer_type(gpu_id); + buft = ggml_backend_sycl_buffer_type(gpu); #elif defined(GGML_USE_KOMPUTE) buft = ggml_backend_kompute_buffer_type(gpu); if (buft == nullptr) { @@ -5931,6 +5930,10 @@ static bool llm_load_tensors( model.buft_output = llama_default_buffer_type_cpu(true); } } else { + +#if defined(GGML_USE_SYCL) + ggml_backend_sycl_set_single_device_mode(main_gpu); +#endif ggml_backend_buffer_type_t split_buft; if (split_mode == LLAMA_SPLIT_MODE_ROW) { split_buft = llama_default_buffer_type_split(model, main_gpu, tensor_split); From 6211ac040886495c44ad2ddee80204c9bcd6721a Mon Sep 17 00:00:00 2001 From: arthw <14088817+arthw@users.noreply.github.com> Date: Thu, 1 Aug 2024 12:42:11 +0800 Subject: [PATCH 3/6] simple code for loop --- ggml/src/ggml-sycl.cpp | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) diff --git a/ggml/src/ggml-sycl.cpp b/ggml/src/ggml-sycl.cpp index 77c7f610f3966..ae2420a822cd7 100644 --- a/ggml/src/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl.cpp @@ -2816,8 +2816,7 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten } } - for (int i = 0; i < ggml_sycl_info().device_count; ++i) { - int id = ggml_backend_sycl_get_device_id(i); + for (auto & id: ggml_sycl_info().ids) { if ((!split && id != ctx.device) || dev[id].row_low == dev[id].row_high) { continue; } @@ -2882,8 +2881,7 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten const int64_t is = split ? (src1_col_0/src1_col_stride) % GGML_SYCL_MAX_STREAMS : 0; const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride; - for (int i = 0; i < ggml_sycl_info().device_count; ++i) { - int id = ggml_backend_sycl_get_device_id(i); + for (auto & id: ggml_sycl_info().ids) { if ((!split && id != ctx.device) || dev[id].row_low == dev[id].row_high) { continue; } @@ -3025,8 +3023,7 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten is_max = is_max <= GGML_SYCL_MAX_STREAMS ? is_max : GGML_SYCL_MAX_STREAMS; ggml_sycl_set_device(ctx.device); - for (int i = 0; i < ggml_sycl_info().device_count; ++i) { - int id = ggml_backend_sycl_get_device_id(i); + for (auto & id: ggml_sycl_info().ids) { if (dev[id].row_low == dev[id].row_high) { continue; } @@ -4343,12 +4340,9 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device_id) { static bool ggml_backend_sycl_buffer_type_initialized = false; if (!ggml_backend_sycl_buffer_type_initialized) { - for (int i = 0; i < ggml_sycl_info().device_count; i++) { - int id = ggml_backend_sycl_get_device_id(i); + for (auto & id: ggml_sycl_info().ids) { auto & device = dpct::dev_mgr::instance().get_device(id); - // queue_ptr stream = &(device.default_queue()); queue_ptr stream = ggml_sycl_info().device_infos[id].qptrs[0]; - ggml_backend_sycl_buffer_types[id] = { /* .iface = */ ggml_backend_sycl_buffer_type_interface, /* .context = */ new ggml_backend_sycl_buffer_type_context{id, GGML_SYCL_NAME + std::to_string(id), stream}, @@ -4369,8 +4363,7 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(ggml_backend_sycl_conte static bool ggml_backend_sycl_buffer_type_initialized = false; if (!ggml_backend_sycl_buffer_type_initialized) { - for (int i = 0; i < ggml_sycl_info().device_count; i++) { - int id = ggml_backend_sycl_get_device_id(i); + for (auto & id: ggml_sycl_info().ids) { ggml_backend_sycl_buffer_types[id] = { /* .iface = */ ggml_backend_sycl_buffer_type_interface, /* .context = */ new ggml_backend_sycl_buffer_type_context{id, GGML_SYCL_NAME + std::to_string(id), ctx->stream(id, 0)}, @@ -4399,8 +4392,7 @@ static void get_row_split(int64_t * row_low, int64_t * row_high, const ggml_tens struct ggml_backend_sycl_split_buffer_context { ~ggml_backend_sycl_split_buffer_context() try { for (ggml_tensor_extra_gpu * extra : tensor_extras) { - for (int i = 0; i < ggml_sycl_info().device_count; ++i) { - int id = ggml_backend_sycl_get_device_id(i); + for (auto & id: ggml_sycl_info().ids) { for (int64_t is = 0; is < GGML_SYCL_MAX_STREAMS; ++is) { if (extra->events[id][is] != nullptr) { /* @@ -5169,8 +5161,7 @@ extern "C" int ggml_backend_sycl_reg_devices(); int ggml_backend_sycl_reg_devices() { assert(ggml_sycl_info().device_count>0); - for (int i = 0; i < ggml_sycl_info().device_count; i++) { - int id = ggml_backend_sycl_get_device_id(i); + for (auto & id: ggml_sycl_info().ids) { char name[128]; snprintf(name, sizeof(name), "%s%d", GGML_SYCL_NAME, id); ggml_backend_register(name, ggml_backend_reg_sycl_init, ggml_backend_sycl_buffer_type(id), (void *) (intptr_t) id); From 254a750249f74e838e744eedd8607f05fe9707d7 Mon Sep 17 00:00:00 2001 From: arthw <14088817+arthw@users.noreply.github.com> Date: Thu, 1 Aug 2024 12:48:18 +0800 Subject: [PATCH 4/6] rename device_infos to infos --- ggml/src/ggml-sycl.cpp | 16 +++++------ ggml/src/ggml-sycl/common.hpp | 2 +- ggml/src/ggml-sycl/mmq.cpp | 20 +++++++------- ggml/src/ggml-sycl/sycl_device.cpp | 43 ++++++++++++++---------------- ggml/src/ggml-sycl/sycl_device.hpp | 4 +-- 5 files changed, 41 insertions(+), 44 deletions(-) diff --git a/ggml/src/ggml-sycl.cpp b/ggml/src/ggml-sycl.cpp index ae2420a822cd7..cf83d8c10f5cb 100644 --- a/ggml/src/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl.cpp @@ -2280,11 +2280,11 @@ static int64_t get_row_rounding(ggml_type type, const std::array ggml_sycl_info().device_infos[id].cc) { - min_compute_capability = ggml_sycl_info().device_infos[id].cc; + if (min_compute_capability > ggml_sycl_info().infos[id].cc) { + min_compute_capability = ggml_sycl_info().infos[id].cc; } - if (max_compute_capability < ggml_sycl_info().device_infos[id].cc) { - max_compute_capability = ggml_sycl_info().device_infos[id].cc; + if (max_compute_capability < ggml_sycl_info().infos[id].cc) { + max_compute_capability = ggml_sycl_info().infos[id].cc; } } } @@ -3416,12 +3416,12 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor continue; } - if (min_compute_capability > ggml_sycl_info().device_infos[id].cc) { - min_compute_capability = ggml_sycl_info().device_infos[id].cc; + if (min_compute_capability > ggml_sycl_info().infos[id].cc) { + min_compute_capability = ggml_sycl_info().infos[id].cc; } } } else { - min_compute_capability = ggml_sycl_info().device_infos[ctx.device].cc; + min_compute_capability = ggml_sycl_info().infos[ctx.device].cc; } // check data types and tensor shapes for custom matrix multiplication kernels: @@ -4342,7 +4342,7 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device_id) { if (!ggml_backend_sycl_buffer_type_initialized) { for (auto & id: ggml_sycl_info().ids) { auto & device = dpct::dev_mgr::instance().get_device(id); - queue_ptr stream = ggml_sycl_info().device_infos[id].qptrs[0]; + queue_ptr stream = ggml_sycl_info().infos[id].qptrs[0]; ggml_backend_sycl_buffer_types[id] = { /* .iface = */ ggml_backend_sycl_buffer_type_interface, /* .context = */ new ggml_backend_sycl_buffer_type_context{id, GGML_SYCL_NAME + std::to_string(id), stream}, diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp index 5b60835a23cc0..ccdd789cc77dd 100644 --- a/ggml/src/ggml-sycl/common.hpp +++ b/ggml/src/ggml-sycl/common.hpp @@ -241,7 +241,7 @@ struct ggml_backend_sycl_context { device(id), name(GGML_SYCL_NAME + std::to_string(device)) { for (int i=0;i= VER_GEN13) { @@ -1894,7 +1894,7 @@ static void ggml_mul_mat_q4_1_q8_1_sycl(const void *vx, const void *vy, int id; SYCL_CHECK( CHECK_TRY_ERROR(id = get_current_device_id())); - const int compute_capability = ggml_sycl_info().device_infos[id].cc; + const int compute_capability = ggml_sycl_info().infos[id].cc; int mmq_x, mmq_y, nwarps; if (compute_capability >= VER_GEN13) { @@ -2009,7 +2009,7 @@ static void ggml_mul_mat_q5_0_q8_1_sycl(const void *vx, const void *vy, int id; SYCL_CHECK( CHECK_TRY_ERROR(id = get_current_device_id())); - const int compute_capability = ggml_sycl_info().device_infos[id].cc; + const int compute_capability = ggml_sycl_info().infos[id].cc; int mmq_x, mmq_y, nwarps; if (compute_capability >= VER_GEN13) { @@ -2124,7 +2124,7 @@ static void ggml_mul_mat_q5_1_q8_1_sycl(const void *vx, const void *vy, int id; SYCL_CHECK( CHECK_TRY_ERROR(id = get_current_device_id())); - const int compute_capability = ggml_sycl_info().device_infos[id].cc; + const int compute_capability = ggml_sycl_info().infos[id].cc; int mmq_x, mmq_y, nwarps; if (compute_capability >= VER_GEN13) { @@ -2239,7 +2239,7 @@ static void ggml_mul_mat_q8_0_q8_1_sycl(const void *vx, const void *vy, int id; SYCL_CHECK( CHECK_TRY_ERROR(id = get_current_device_id())); - const int compute_capability = ggml_sycl_info().device_infos[id].cc; + const int compute_capability = ggml_sycl_info().infos[id].cc; int mmq_x, mmq_y, nwarps; if (compute_capability >= VER_GEN13) { @@ -2354,7 +2354,7 @@ static void ggml_mul_mat_q2_K_q8_1_sycl(const void *vx, const void *vy, int id; SYCL_CHECK( CHECK_TRY_ERROR(id = get_current_device_id())); - const int compute_capability = ggml_sycl_info().device_infos[id].cc; + const int compute_capability = ggml_sycl_info().infos[id].cc; int mmq_x, mmq_y, nwarps; if (compute_capability >= VER_GEN13) { @@ -2477,7 +2477,7 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy, int id; SYCL_CHECK( CHECK_TRY_ERROR(id = get_current_device_id())); - const int compute_capability = ggml_sycl_info().device_infos[id].cc; + const int compute_capability = ggml_sycl_info().infos[id].cc; int mmq_x, mmq_y, nwarps; if (compute_capability >= VER_GEN13) { @@ -2605,7 +2605,7 @@ static void ggml_mul_mat_q4_K_q8_1_sycl(const void *vx, const void *vy, int id; SYCL_CHECK( CHECK_TRY_ERROR(id = get_current_device_id())); - const int compute_capability = ggml_sycl_info().device_infos[id].cc; + const int compute_capability = ggml_sycl_info().infos[id].cc; int mmq_x, mmq_y, nwarps; if (compute_capability >= VER_GEN13) { @@ -2726,7 +2726,7 @@ static void ggml_mul_mat_q5_K_q8_1_sycl(const void *vx, const void *vy, int id; SYCL_CHECK( CHECK_TRY_ERROR(id = get_current_device_id())); - const int compute_capability = ggml_sycl_info().device_infos[id].cc; + const int compute_capability = ggml_sycl_info().infos[id].cc; int mmq_x, mmq_y, nwarps; if (compute_capability >= VER_GEN13) { @@ -2847,7 +2847,7 @@ static void ggml_mul_mat_q6_K_q8_1_sycl(const void *vx, const void *vy, int id; SYCL_CHECK( CHECK_TRY_ERROR(id = get_current_device_id())); - const int compute_capability = ggml_sycl_info().device_infos[id].cc; + const int compute_capability = ggml_sycl_info().infos[id].cc; int mmq_x, mmq_y, nwarps; if (compute_capability >= VER_GEN13) { diff --git a/ggml/src/ggml-sycl/sycl_device.cpp b/ggml/src/ggml-sycl/sycl_device.cpp index fd8c045994b98..e476adef0e38e 100644 --- a/ggml/src/ggml-sycl/sycl_device.cpp +++ b/ggml/src/ggml-sycl/sycl_device.cpp @@ -22,18 +22,15 @@ void ggml_sycl_device_info::init( m_device_filter = device_filter; } -void ggml_sycl_device_info::clear_device_infos() { +void ggml_sycl_device_info::clear_infos() { ids.clear(); devices.clear(); for (int id=0;id default_tensor_split = {}; ggml_sycl_device_info(int main_gpu_id);//single device mode @@ -62,7 +62,7 @@ struct ggml_sycl_device_info { void init(ggml_sycl_backend_device_filter device_filter); void init_single_mode(int main_gpu_id); - void clear_device_infos(); + void clear_infos(); void print_gpu_device_list(); int work_group_size(int device_id); bool is_allowed_device(int device_id); From 4d71c98544c9150d974878867e1277a57d94ecb2 Mon Sep 17 00:00:00 2001 From: arthw <14088817+arthw@users.noreply.github.com> Date: Thu, 1 Aug 2024 12:52:06 +0800 Subject: [PATCH 5/6] mv dpct/helper.hpp to dpct.hpp --- ggml/src/ggml-sycl/common.hpp | 2 +- ggml/src/ggml-sycl/{dpct/helper.hpp => dpct.hpp} | 0 ggml/src/ggml-sycl/sycl_device.hpp | 2 +- ggml/src/ggml-sycl/vecdotq.hpp | 2 +- 4 files changed, 3 insertions(+), 3 deletions(-) rename ggml/src/ggml-sycl/{dpct/helper.hpp => dpct.hpp} (100%) diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp index ccdd789cc77dd..cb9514356e052 100644 --- a/ggml/src/ggml-sycl/common.hpp +++ b/ggml/src/ggml-sycl/common.hpp @@ -17,7 +17,7 @@ #include #include -#include "dpct/helper.hpp" +#include "dpct.hpp" #include "ggml-sycl.h" #include "presets.hpp" #include "sycl_hw.hpp" diff --git a/ggml/src/ggml-sycl/dpct/helper.hpp b/ggml/src/ggml-sycl/dpct.hpp similarity index 100% rename from ggml/src/ggml-sycl/dpct/helper.hpp rename to ggml/src/ggml-sycl/dpct.hpp diff --git a/ggml/src/ggml-sycl/sycl_device.hpp b/ggml/src/ggml-sycl/sycl_device.hpp index 900a779426054..7bffe75567c32 100644 --- a/ggml/src/ggml-sycl/sycl_device.hpp +++ b/ggml/src/ggml-sycl/sycl_device.hpp @@ -7,7 +7,7 @@ #include -#include "dpct/helper.hpp" +#include "dpct.hpp" #include "ggml-sycl.h" #include "presets.hpp" diff --git a/ggml/src/ggml-sycl/vecdotq.hpp b/ggml/src/ggml-sycl/vecdotq.hpp index d2dccade20bfd..a0ab8287c27b4 100644 --- a/ggml/src/ggml-sycl/vecdotq.hpp +++ b/ggml/src/ggml-sycl/vecdotq.hpp @@ -13,7 +13,7 @@ #ifndef GGML_SYCL_VECDOTQ_HPP #define GGML_SYCL_VECDOTQ_HPP -#include "dpct/helper.hpp" +#include "dpct.hpp" typedef float (*vec_dot_q_sycl_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs); From f1bc5ad852d3007864bfca5c8898554a970fc1e6 Mon Sep 17 00:00:00 2001 From: arthw <14088817+arthw@users.noreply.github.com> Date: Thu, 1 Aug 2024 13:14:58 +0800 Subject: [PATCH 6/6] add final newline --- ggml/src/ggml-sycl/sycl_device.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-sycl/sycl_device.cpp b/ggml/src/ggml-sycl/sycl_device.cpp index e476adef0e38e..02937529c37a5 100644 --- a/ggml/src/ggml-sycl/sycl_device.cpp +++ b/ggml/src/ggml-sycl/sycl_device.cpp @@ -338,4 +338,4 @@ int ggml_sycl_device_info::hw_family(int id) { static inline bool env_existed(const char *env_name) { char *user_device_string = getenv(env_name); return user_device_string!=NULL; -} \ No newline at end of file +}