From d5380f3af20cf5dd8d8a95854767891f29c5a4d1 Mon Sep 17 00:00:00 2001
From: arthw <14088817+arthw@users.noreply.github.com>
Date: Tue, 30 Jul 2024 23:49:34 +0800
Subject: [PATCH 1/6] refactor device in sycl_device, restore ctx in
 create_queue

---
 ggml/src/ggml-sycl.cpp             |  36 ++--
 ggml/src/ggml-sycl/common.cpp      | 316 +----------------------------
 ggml/src/ggml-sycl/common.hpp      |  88 +-------
 ggml/src/ggml-sycl/dpct/helper.hpp | 132 ++++++------
 ggml/src/ggml-sycl/mmq.cpp         |  20 +-
 ggml/src/ggml-sycl/sycl_device.cpp | 286 ++++++++++++++++++++++++++
 ggml/src/ggml-sycl/sycl_device.hpp |  83 ++++++++
 ggml/src/ggml-sycl/sycl_hw.cpp     |   6 +
 ggml/src/ggml-sycl/sycl_hw.hpp     |   7 +-
 9 files changed, 494 insertions(+), 480 deletions(-)
 create mode 100644 ggml/src/ggml-sycl/sycl_device.cpp
 create mode 100644 ggml/src/ggml-sycl/sycl_device.hpp

diff --git a/ggml/src/ggml-sycl.cpp b/ggml/src/ggml-sycl.cpp
index 9b1ef70ad18e6..8a0b8ff639a79 100644
--- a/ggml/src/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl.cpp
@@ -48,7 +48,7 @@ void   ggml_sycl_get_device_description(int device, char * description, size_t d
 bool   ggml_backend_is_sycl(ggml_backend_t backend);
 int    ggml_backend_sycl_get_device(ggml_backend_t backend);
 static bool ggml_backend_buffer_is_sycl_split(ggml_backend_buffer_t buffer);
-
+static bool ggml_backend_buffer_is_sycl(ggml_backend_buffer_t buffer);
 
 void dev2dev_memcpy(sycl::queue &q_dst, sycl::queue &q_src, void *ptr_dst,
                     const void *ptr_src, size_t size) {
@@ -2279,11 +2279,11 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_SYC
     for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
         int id = ggml_backend_sycl_get_device_id(i);
         if (tensor_split[i] < (i + 1 < ggml_sycl_info().device_count ? tensor_split[i + 1] : 1.0f)) {
-            if (min_compute_capability > ggml_sycl_info().devices[id].cc) {
-                min_compute_capability = ggml_sycl_info().devices[id].cc;
+            if (min_compute_capability > ggml_sycl_info().device_infos[id].cc) {
+                min_compute_capability = ggml_sycl_info().device_infos[id].cc;
             }
-            if (max_compute_capability < ggml_sycl_info().devices[id].cc) {
-                max_compute_capability = ggml_sycl_info().devices[id].cc;
+            if (max_compute_capability < ggml_sycl_info().device_infos[id].cc) {
+                max_compute_capability = ggml_sycl_info().device_infos[id].cc;
             }
         }
     }
@@ -2680,17 +2680,14 @@ static void ggml_sycl_set_peer_access(const int n_tokens, int main_device) {
     }
 
 #ifdef NDEBUG
-    for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
-        int id = ggml_backend_sycl_get_device_id(i);
+    for (auto &id: ggml_sycl_info().ids) {
         SYCL_CHECK(ggml_sycl_set_device(id));
     }
 
-    for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
-        int id = ggml_backend_sycl_get_device_id(i);
+    for (auto &id: ggml_sycl_info().ids) {
         SYCL_CHECK(ggml_sycl_set_device(id));
 
-        for (int i_other = 0; i_other < ggml_sycl_info().device_count; ++i_other) {
-            int id_other = ggml_backend_sycl_get_device_id(i_other);
+        for (auto &id_other: ggml_sycl_info().ids) {
             if (id == id_other) {
                 continue;
             }
@@ -2843,7 +2840,6 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
         } else {
             dev[id].src1_ddf = dev[id].src1_ddf_alloc.alloc(ctx.pool(id), ggml_nelements(src1));
         }
-
         if (convert_src1_to_q8_1) {
             dev[id].src1_ddq = dev[id].src1_ddq_alloc.alloc(ctx.pool(id), nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs);
 
@@ -3165,8 +3161,13 @@ static void ggml_sycl_pad(ggml_backend_sycl_context & ctx, const ggml_tensor * s
 
 static void ggml_sycl_rms_norm(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_SYCL_DEBUG("call %s\n", __func__);
+    // log_tensor_with_cnt(ctx, "log/src0", src0, -1);
+    // log_tensor_with_cnt(ctx, "log/src1", src1, -1);
+    // log_tensor_with_cnt(ctx, "log/dst0", dst, -1);
     ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_rms_norm);
+    // log_tensor_with_cnt(ctx, "log/dst1", dst, -1);
     GGML_SYCL_DEBUG("call %s done\n", __func__);
+    // exit(1);
 }
 
 static void ggml_sycl_mul_mat_vec_p021(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
@@ -3417,12 +3418,12 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
                 continue;
             }
 
-            if (min_compute_capability > ggml_sycl_info().devices[id].cc) {
-                min_compute_capability = ggml_sycl_info().devices[id].cc;
+            if (min_compute_capability > ggml_sycl_info().device_infos[id].cc) {
+                min_compute_capability = ggml_sycl_info().device_infos[id].cc;
             }
         }
     } else {
-        min_compute_capability    = ggml_sycl_info().devices[ctx.device].cc;
+        min_compute_capability    = ggml_sycl_info().device_infos[ctx.device].cc;
     }
 
     // check data types and tensor shapes for custom matrix multiplication kernels:
@@ -4332,7 +4333,6 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = {
 ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device_id) {
     static std::mutex mutex;
     std::lock_guard<std::mutex> lock(mutex);
-
     GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_buffer_type\n");
 
     check_allow_device_id(device_id);
@@ -4345,7 +4345,9 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device_id) {
         for (int i = 0; i < ggml_sycl_info().device_count; i++) {
             int id = ggml_backend_sycl_get_device_id(i);
             auto & device = dpct::dev_mgr::instance().get_device(id);
-            queue_ptr stream = &(device.default_queue());
+            // queue_ptr stream = &(device.default_queue());
+            queue_ptr stream = ggml_sycl_info().device_infos[id].qptrs[0];
+
             ggml_backend_sycl_buffer_types[id] = {
                 /* .iface    = */ ggml_backend_sycl_buffer_type_interface,
                 /* .context  = */ new ggml_backend_sycl_buffer_type_context{id, GGML_SYCL_NAME + std::to_string(id), stream},
diff --git a/ggml/src/ggml-sycl/common.cpp b/ggml/src/ggml-sycl/common.cpp
index 3633edefac67a..efd702bf19991 100644
--- a/ggml/src/ggml-sycl/common.cpp
+++ b/ggml/src/ggml-sycl/common.cpp
@@ -20,12 +20,16 @@ void* ggml_sycl_host_malloc(size_t size) try {
   if (getenv("GGML_SYCL_NO_PINNED") != nullptr) {
     return nullptr;
   }
-
+//   ggml_sycl_info().device_mgr->first_queue
   void* ptr = nullptr;
   // allow to use dpct::get_in_order_queue() for host malloc
+  auto q = dpct::get_in_order_queue();
+//   sycl::queue q = *ggml_sycl_info().device_mgr->qptrs[0][0];
+
   dpct::err0 err = CHECK_TRY_ERROR(
-      ptr = (void*)sycl::malloc_host(size, dpct::get_in_order_queue()));
+      ptr = (void*)sycl::malloc_host(size, q));
 
+//  printf("zjy ggml_sycl_host_malloc ptr=%p queue=%p size=%lu \n", ptr,q, size);
   if (err != 0) {
     // clear the error
     fprintf(
@@ -66,27 +70,6 @@ static inline int get_sycl_env(const char *env_name, int default_val) {
     return user_number;
 }
 
-static inline bool env_existed(const char *env_name) {
-     char *user_device_string = getenv(env_name);
-     return user_device_string!=NULL;
-}
-
-static std::vector<int> get_sycl_visible_devices() {
-    static std::vector<int> device_ids;
-    char *devices_env = getenv("GGML_SYCL_VISIBLE_DEVICES");
-    if (devices_env != nullptr) {
-        std::string devices(devices_env);
-        std::replace(devices.begin(), devices.end(), ',', ' ');
-
-        std::stringstream ss(devices);
-        int tmp;
-        while (ss >> tmp) {
-            device_ids.push_back(tmp);
-        }
-    }
-    return device_ids;
-}
-
 void print_device_detail_part1(int id, sycl::device &device, std::string device_type) {
 
     dpct::device_info prop;
@@ -193,8 +176,7 @@ static ggml_sycl_device_info ggml_sycl_init() try {
         initialized = true;
     }
 
-    static ggml_sycl_device_info info = {};
-    info.refresh_device();
+    static ggml_sycl_device_info info;
 
     if (info.device_count == 0) {
         fprintf(stderr, "%s: failed to initialize " GGML_SYCL_NAME ": no available device found\n",
@@ -215,288 +197,4 @@ ggml_sycl_device_info &ggml_sycl_info() {
     return info;
 }
 
-//--sycl_device_mgr--
-
-sycl_device_mgr::sycl_device_mgr(
-    ggml_sycl_backend_device_filter device_filter) {
-    switch (device_filter) {
-    case SYCL_DEVICES_TOP_LEVEL_ZERO:
-        detect_sycl_gpu_list_with_max_cu();
-        create_context_for_group_gpus();
-        break;
-    case SYCL_ALL_DEVICES:
-        detect_all_sycl_device_list();
-        create_context_for_devices();
-        break;
-    case SYCL_VISIBLE_DEVICES:
-        detect_sycl_visible_device_list();
-        create_context_for_devices();
-        break;
-    default:
-        std::cerr << "sycl_device_mgr: Invalid device_filter " << device_filter
-                  << std::endl;
-    }
-    init_allow_devices();
-}
-
-/*
-Bind all gpus in same host with same context, for better performance in
-device-to-device copy in the future.
-*/
-void sycl_device_mgr::create_context_for_group_gpus() {
-    sycl::context ctx = sycl::context(devices);
-    assert(device_ids.size() > 0);
-    first_queue = _create_queue_ptr(devices[0]);
-    sycl::context ctx0 = first_queue->get_context();
-    for (int i = 0; i < device_ids.size(); i++) {
-        ctxs.push_back(ctx0);
-    }
-}
-
-sycl::queue *sycl_device_mgr::_create_queue_ptr(sycl::device device) {
-    auto q = dpct::get_current_device().create_queue(device);
-    return q;
-    // _queues.push_back(q);
-    // return & _queues.back();
-}
-
-sycl::queue *sycl_device_mgr::create_queue_for_device(sycl::device &device) {
-    dpct::select_device(dpct::dev_mgr::instance().get_device_id(device));
-    auto qptr = _create_queue_ptr(device);
-    return qptr;
-}
-
-sycl::queue *sycl_device_mgr::create_queue_for_device_id(int device_id) {
-    int i = get_device_index(device_id);
-    sycl::device device = dpct::dev_mgr::instance().get_device(device_id);
-    return create_queue_for_device(device);
-}
-
-int sycl_device_mgr::get_device_index(int device_id) {
-    for (int i = 0; i < device_ids.size(); i++) {
-        if (device_ids[i] == device_id)
-            return i;
-    }
-    return -1;
-}
-
-void sycl_device_mgr::create_context_for_devices() {
-    for (int i = 0; i < device_ids.size(); i++) {
-        sycl::context ctx = sycl::context(devices[i]);
-        ctxs.push_back(ctx);
-    }
-}
-
-void sycl_device_mgr::init_allow_devices() {
-    device_list = "";
-    for (size_t i = 0; i < device_ids.size(); ++i) {
-        device_list += std::to_string(device_ids[i]);
-        device_list += ",";
-    }
-    if (device_list.length() > 1) {
-        device_list.pop_back();
-    }
-}
-
-bool sycl_device_mgr::is_allowed_device(int device_id) {
-    return std::find(device_ids.begin(), device_ids.end(), device_id) !=
-           device_ids.end();
-}
-
-void sycl_device_mgr::detect_all_sycl_device_list() try {
-    int device_count = dpct::dev_mgr::instance().device_count();
-
-    for (int id = 0; id < device_count; id++) {
-        add_device_info(id);
-    }
-    return;
-} catch (sycl::exception const &exc) {
-    std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-              << ", line:" << __LINE__ << std::endl;
-    std::exit(1);
-}
-
-void sycl_device_mgr::detect_sycl_visible_device_list() try {
-    std::vector<int> sycl_devices = get_sycl_visible_devices();
-    int device_count = dpct::dev_mgr::instance().device_count();
-
-    for (int i = 0; i < sycl_devices.size(); i++) {
-        int id = sycl_devices[i];
-        if (id >= device_count) {
-            std::cerr << __func__ << ": invalid device_id:" << id
-                      << " from GGML_SYCL_VISIBLE_DEVICES="
-                      << getenv("GGML_SYCL_VISIBLE_DEVICES")
-                      << ", available IDs: ";
-            if (device_count > 1) {
-                std::cerr << "[0, " << device_count - 1 << "]";
-            } else if (device_count == 1) {
-                std::cerr << "[0]";
-            } else {
-                std::cerr << "[]";
-            }
-            std::cerr << std::endl;
-        }
-        add_device_info(id);
-    }
-    return;
-} catch (sycl::exception const &exc) {
-    std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-              << ", line:" << __LINE__ << std::endl;
-    std::exit(1);
-}
-
-/*
-Use all GPUs with same top max compute units
-*/
-void sycl_device_mgr::detect_sycl_gpu_list_with_max_cu() try {
-    int device_count = dpct::dev_mgr::instance().device_count();
-    int local_max_compute_units = 0;
-    for (int id = 0; id < device_count; id++) {
-        sycl::device device = dpct::dev_mgr::instance().get_device(id);
-        if (!device.is_gpu())
-            continue;
-        dpct::device_info prop;
-        dpct::get_device_info(prop, device);
-        if (local_max_compute_units < prop.get_max_compute_units())
-            local_max_compute_units = prop.get_max_compute_units();
-    }
-
-    for (int id = 0; id < device_count; id++) {
-        sycl::device device = dpct::dev_mgr::instance().get_device(id);
-        if (!device.is_gpu())
-            continue;
-        dpct::device_info prop;
-        dpct::get_device_info(prop, device);
-        if (local_max_compute_units == prop.get_max_compute_units() &&
-            is_ext_oneapi_device(device)) {
-            add_device_info(id);
-        }
-    }
-    return;
-} catch (sycl::exception const &exc) {
-    std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-              << ", line:" << __LINE__ << std::endl;
-    std::exit(1);
-}
-
-int sycl_device_mgr::get_device_count() { return (int)device_ids.size(); }
-
-bool sycl_device_mgr::is_ext_oneapi_device(const sycl::device &dev) {
-    sycl::backend dev_backend = dev.get_backend();
-    if (dev_backend == sycl::backend::ext_oneapi_level_zero ||
-        dev_backend == sycl::backend::ext_oneapi_cuda ||
-        dev_backend == sycl::backend::ext_oneapi_hip)
-        return true;
-    return false;
-}
-
-void sycl_device_mgr::add_device_info(int id) {
-    sycl::device device = dpct::dev_mgr::instance().get_device(id);
-    device_ids.push_back(id);
-    devices.push_back(device);
-    dpct::device_info prop;
-    dpct::get_device_info(prop, device);
-    work_group_sizes.push_back(prop.get_max_work_group_size());
-    max_compute_units.push_back(prop.get_max_compute_units());
-    hw_familys.push_back(get_device_family(&device));
-}
-
-//--sycl_device_mgr--
-
-//--ggml_sycl_device_info--
-void ggml_sycl_device_info::print_gpu_device_list() {
-    GGML_ASSERT(device_mgr);
-
-    char *hint = NULL;
-    if (oneapi_device_selector_existed && sycl_visible_devices_existed) {
-        hint = "detect %d SYCL devices:[%s] by ONEAPI_DEVICE_SELECTOR=%s and "
-               "GGML_SYCL_VISIBLE_DEVICES=%s\n";
-        fprintf(stderr, hint, device_mgr->get_device_count(), devices_list(),
-                getenv("ONEAPI_DEVICE_SELECTOR"),
-                getenv("GGML_SYCL_VISIBLE_DEVICES"));
-    } else if (oneapi_device_selector_existed) {
-        hint = "detect %d SYCL devices:[%s] by ONEAPI_DEVICE_SELECTOR=%s\n";
-        fprintf(stderr, hint, device_mgr->get_device_count(), devices_list(),
-                getenv("ONEAPI_DEVICE_SELECTOR"));
-    } else if (sycl_visible_devices_existed) {
-        hint = "detect %d SYCL devices:[%s] by GGML_SYCL_VISIBLE_DEVICES=%s\n";
-        fprintf(stderr, hint, device_mgr->get_device_count(), devices_list(),
-                getenv("GGML_SYCL_VISIBLE_DEVICES"));
-    } else {
-        hint = "detect %d SYCL level-zero GPUs:[%s] with top Max compute "
-               "units:%d, to use any SYCL devices, set/export "
-               "GGML_SYCL_VISIBLE_DEVICES or ONEAPI_DEVICE_SELECTOR\n";
-        fprintf(stderr, hint, device_mgr->get_device_count(), devices_list(),
-                device_mgr->max_compute_units[0]);
-    }
-}
-
-int ggml_sycl_device_info::work_group_size(int device_id) {
-    GGML_ASSERT(device_mgr);
-    return device_mgr->work_group_sizes[device_id];
-}
-
-void ggml_sycl_device_info::refresh_device() {
-    oneapi_device_selector_existed = env_existed("ONEAPI_DEVICE_SELECTOR");
-    sycl_visible_devices_existed = env_existed("GGML_SYCL_VISIBLE_DEVICES");
-    if (!device_mgr)
-        delete device_mgr;
-
-    if (sycl_visible_devices_existed) {
-        device_mgr = new sycl_device_mgr(SYCL_VISIBLE_DEVICES);
-    } else if (oneapi_device_selector_existed) {
-        device_mgr = new sycl_device_mgr(SYCL_ALL_DEVICES);
-    } else {
-        device_mgr = new sycl_device_mgr(SYCL_DEVICES_TOP_LEVEL_ZERO);
-    }
-
-    device_count = device_mgr->get_device_count();
-
-    int64_t total_vram = 0;
-
-    for (int i = 0; i < device_count; ++i) {
-        int id = get_device_id(i);
-        devices[id].vmm = 0;
-        dpct::device_info prop;
-        SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
-            prop, dpct::dev_mgr::instance().get_device(id))));
-
-        default_tensor_split[i] =
-            total_vram; // continue data, so use device index
-        total_vram += prop.get_global_mem_size();
-
-        devices[id].cc =
-            100 * prop.get_major_version() + 10 * prop.get_minor_version();
-    }
-
-    for (int i = 0; i < device_count; ++i) {
-        default_tensor_split[i] /=
-            total_vram; // continue data, so use device index
-    }
-
-    print_gpu_device_list();
-}
-
-bool ggml_sycl_device_info::is_allowed_device(int device_id) {
-    return device_mgr->is_allowed_device(device_id);
-}
-
-const char *ggml_sycl_device_info::devices_list() {
-    return device_mgr->device_list.c_str();
-}
-
-int ggml_sycl_device_info::get_device_id(int device_index) {
-    if (device_index < device_mgr->device_ids.size()) {
-        return device_mgr->device_ids.at(device_index);
-    } else {
-        std::cerr << __func__ << ":SYCL device:" << device_index
-                  << " is out of range:[" << devices_list() << "]" << std::endl;
-        std::exit(1);
-    }
-}
-
-int ggml_sycl_device_info::hw_family(int device_id) {
-    return device_mgr->hw_familys[device_id];
-}
-
 //--ggml_sycl_device_info--
diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp
index 00be4f70274f0..f0c93a52d84e4 100644
--- a/ggml/src/ggml-sycl/common.hpp
+++ b/ggml/src/ggml-sycl/common.hpp
@@ -21,12 +21,13 @@
 #include "ggml-sycl.h"
 #include "presets.hpp"
 #include "sycl_hw.hpp"
+#include "sycl_device.hpp"
 
 #define GGML_COMMON_DECL_SYCL
 #define GGML_COMMON_IMPL_SYCL
 #include "ggml-common.h"
 
-void* ggml_sycl_host_malloc(size_t size);
+
 void ggml_sycl_host_free(void* ptr);
 
 static int g_ggml_sycl_debug = 0;
@@ -86,12 +87,6 @@ enum ggml_sycl_backend_gpu_mode {
   SYCL_MUL_GPU_MODE
 };
 
-enum ggml_sycl_backend_device_filter {
-  SYCL_ALL_DEVICES = 0,
-  SYCL_DEVICES_TOP_LEVEL_ZERO,
-  SYCL_VISIBLE_DEVICES
-};
-
 static_assert(sizeof(sycl::half) == sizeof(ggml_fp16_t), "wrong fp16 size");
 
 static void crash() {
@@ -169,10 +164,10 @@ inline dpct::err0 ggml_sycl_set_device(const int device_id) try {
   int current_device_id;
   SYCL_CHECK(CHECK_TRY_ERROR(current_device_id = get_current_device_id()));
 
-  GGML_SYCL_DEBUG("ggml_sycl_set_device device_id=%d, current_device_id=%d\n", device_id, current_device_id);
   if (device_id == current_device_id) {
     return 0;
   }
+  GGML_SYCL_DEBUG("ggml_sycl_set_device device_id=%d, current_device_id=%d\n", device_id, current_device_id);
 
   return CHECK_TRY_ERROR(dpct::select_device(device_id));
 
@@ -183,67 +178,6 @@ inline dpct::err0 ggml_sycl_set_device(const int device_id) try {
   std::exit(1);
 }
 
-
-class sycl_device_mgr {
-  public:
-    std::vector<int> device_ids;
-    std::vector<sycl::device> devices;
-    std::vector<int> max_compute_units;
-    std::vector<int> work_group_sizes;
-    std::vector<int> hw_familys;
-
-    sycl::queue *first_queue;
-    std::vector<sycl::queue> _queues;
-    std::vector<sycl::context> ctxs;
-    std::string device_list = "";
-
-    sycl_device_mgr(ggml_sycl_backend_device_filter device_filter);
-
-    sycl::queue *_create_queue_ptr(sycl::device device); //internal API to hide dpct API.
-    void create_context_for_group_gpus();
-    sycl::queue *create_queue_for_device(sycl::device &device);
-    sycl::queue *create_queue_for_device_id(int device_id);
-    int get_device_index(int device_id);
-    void create_context_for_devices();
-    void init_allow_devices();
-    bool is_allowed_device(int device_id);
-    void detect_all_sycl_device_list();
-    void detect_sycl_visible_device_list();
-    void detect_sycl_gpu_list_with_max_cu();
-    int get_device_count();
-    bool is_ext_oneapi_device(const sycl::device &dev);
-    void add_device_info(int id);
-};
-
-
-struct ggml_sycl_device_info {
-    int device_count;
-    bool oneapi_device_selector_existed = false;
-    bool sycl_visible_devices_existed = false;
-
-    struct sycl_device_info {
-        int     cc;                 // compute capability
-        // int     nsm;                // number of streaming multiprocessors
-        // size_t  smpb;               // max. shared memory per block
-        bool    vmm;                // virtual memory support
-        size_t  total_vram;
-    };
-
-    sycl_device_info devices[GGML_SYCL_MAX_DEVICES] = {};
-
-    std::array<float, GGML_SYCL_MAX_DEVICES> default_tensor_split = {};
-
-    sycl_device_mgr *device_mgr = NULL;
-
-    void print_gpu_device_list();
-    int work_group_size(int device_id);
-    void refresh_device();
-    bool is_allowed_device(int device_id);
-    const char* devices_list();
-    int get_device_id(int device_index);
-    int hw_family(int device_id);
-};
-
 struct ggml_sycl_pool {
     virtual ~ggml_sycl_pool() = default;
 
@@ -309,17 +243,17 @@ struct ggml_backend_sycl_context {
 
     queue_ptr qptrs[GGML_SYCL_MAX_DEVICES][GGML_SYCL_MAX_STREAMS] = { { nullptr } };
 
-    explicit ggml_backend_sycl_context(struct ggml_sycl_device_info &sycl_device_info, int device_id) :
-        device(device_id),
+    explicit ggml_backend_sycl_context(struct ggml_sycl_device_info &sycl_device_info, int id) :
+        device(id),
         name(GGML_SYCL_NAME + std::to_string(device)) {
             for (int i=0;i<GGML_SYCL_MAX_STREAMS; i++){
-                qptrs[device_id][i] = sycl_device_info.device_mgr->create_queue_for_device_id(device_id);
+                qptrs[id][i] = sycl_device_info.device_infos[id].qptrs[i];
             }
     }
 
-    queue_ptr stream(int device, int stream) {
-        assert(qptrs[device][stream] != nullptr);
-        return qptrs[device][stream];
+    queue_ptr stream(int id, int stream) {
+        assert(qptrs[id][stream] != nullptr);
+        return qptrs[id][stream];
     }
 
     queue_ptr stream() {
@@ -349,10 +283,10 @@ static inline void exit_with_stack_print() {
 
 
 static inline int get_sycl_env(const char *env_name, int default_val);
-static inline bool env_existed(const char *env_name);
+
 void* ggml_sycl_host_malloc(size_t size);
 void ggml_sycl_host_free(void* ptr);
-static std::vector<int> get_sycl_visible_devices();
+
 void ggml_backend_sycl_print_sycl_devices();
 static ggml_sycl_device_info ggml_sycl_init();
 ggml_sycl_device_info &ggml_sycl_info();
diff --git a/ggml/src/ggml-sycl/dpct/helper.hpp b/ggml/src/ggml-sycl/dpct/helper.hpp
index d93f118184524..d4f3a7edc2a2d 100644
--- a/ggml/src/ggml-sycl/dpct/helper.hpp
+++ b/ggml/src/ggml-sycl/dpct/helper.hpp
@@ -592,16 +592,18 @@ namespace dpct
     class device_ext : public sycl::device {
       typedef std::mutex mutex_type;
 
-     public:
-      device_ext() : sycl::device() {}
-      ~device_ext() {
-        std::lock_guard<mutex_type> lock(m_mutex);
-        clear_queues();
-      }
-      device_ext(const sycl::device &base) : sycl::device(base) {
-        std::lock_guard<mutex_type> lock(m_mutex);
-        init_queues();
-      }
+    public:
+        device_ext() : sycl::device(), _ctx(*this) {}
+        ~device_ext()
+        {
+            std::lock_guard<mutex_type> lock(m_mutex);
+            clear_queues();
+        }
+        device_ext(const sycl::device &base) : sycl::device(base), _ctx(*this)
+        {
+            std::lock_guard<mutex_type> lock(m_mutex);
+            init_queues();
+        }
 
       int is_native_atomic_supported() { return 0; }
       int get_major_version() const { return dpct::get_major_version(*this); }
@@ -711,10 +713,10 @@ namespace dpct
         return create_in_order_queue(enable_exception_handler);
       }
 
-      sycl::queue *create_queue(sycl::device device,
-                               bool enable_exception_handler = false) {
-        return create_in_order_queue(device, enable_exception_handler);
-      }
+        sycl::queue *create_queue(sycl::context context, sycl::device device,
+                                bool enable_exception_handler = false) {
+            return create_in_order_queue(context, device, enable_exception_handler);
+        }
 
       sycl::queue *create_in_order_queue(bool enable_exception_handler = false) {
         std::lock_guard<mutex_type> lock(m_mutex);
@@ -722,12 +724,12 @@ namespace dpct
                                  sycl::property::queue::in_order());
       }
 
-      sycl::queue *create_in_order_queue(sycl::device device,
+        sycl::queue *create_in_order_queue(sycl::context context, sycl::device device,
                                         bool enable_exception_handler = false) {
-        std::lock_guard<mutex_type> lock(m_mutex);
-        return create_queue_impl(device, enable_exception_handler,
-                                 sycl::property::queue::in_order());
-      }
+            std::lock_guard<mutex_type> lock(m_mutex);
+            return create_queue_impl(context, device, enable_exception_handler,
+                                    sycl::property::queue::in_order());
+        }
 
       sycl::queue *create_out_of_order_queue(
           bool enable_exception_handler = false) {
@@ -735,24 +737,28 @@ namespace dpct
         return create_queue_impl(enable_exception_handler);
       }
 
-      void destroy_queue(sycl::queue *&queue) {
-        std::lock_guard<mutex_type> lock(m_mutex);
-        _queues.erase(std::remove_if(_queues.begin(), _queues.end(),
-                                    [=](const std::shared_ptr<sycl::queue> &q) -> bool
-                                    {
-                                        return q.get() == queue;
-                                    }),
-                    _queues.end());
-        queue = nullptr;
-      }
-      void set_saved_queue(sycl::queue *q) {
-        std::lock_guard<mutex_type> lock(m_mutex);
-        _saved_queue = q;
-      }
-      sycl::queue *get_saved_queue() const {
-        std::lock_guard<mutex_type> lock(m_mutex);
-        return _saved_queue;
-      }
+        void destroy_queue(sycl::queue *&queue)
+        {
+            std::lock_guard<mutex_type> lock(m_mutex);
+            _queues.erase(std::remove_if(_queues.begin(), _queues.end(),
+                                         [=](const std::shared_ptr<sycl::queue> &q) -> bool
+                                         {
+                                             return q.get() == queue;
+                                         }),
+                          _queues.end());
+            queue = nullptr;
+        }
+        void set_saved_queue(sycl::queue *q)
+        {
+            std::lock_guard<mutex_type> lock(m_mutex);
+            _saved_queue = q;
+        }
+        sycl::queue *get_saved_queue() const
+        {
+            std::lock_guard<mutex_type> lock(m_mutex);
+            return _saved_queue;
+        }
+        sycl::context get_context() const { return _ctx; }
 
      private:
       void clear_queues() {
@@ -767,18 +773,19 @@ namespace dpct
         _saved_queue = &default_queue();
       }
 
-      /// Caller should acquire resource \p m_mutex before calling this
-      /// function.
-      template <class... Properties>
-      sycl::queue *create_queue_impl(bool enable_exception_handler,
-                                    Properties... properties) {
-        sycl::async_handler eh = {};
-        if (enable_exception_handler) {
-          eh = exception_handler;
-        }
-        _queues.push_back(std::make_shared<sycl::queue>(
-            *this, eh,
-            sycl::property_list(
+        /// Caller should acquire resource \p m_mutex before calling this function.
+        template <class... Properties>
+        sycl::queue *create_queue_impl(bool enable_exception_handler,
+                                       Properties... properties)
+        {
+            sycl::async_handler eh = {};
+            if (enable_exception_handler)
+            {
+                eh = exception_handler;
+            }
+            _queues.push_back(std::make_shared<sycl::queue>(
+                _ctx, *this, eh,
+                sycl::property_list(
 #ifdef DPCT_PROFILING_ENABLED
                 sycl::property::queue::enable_profiling(),
 #endif
@@ -787,21 +794,21 @@ namespace dpct
         return _queues.back().get();
       }
 
-      template <class... Properties>
-      sycl::queue *create_queue_impl(sycl::device device,
+        template <class... Properties>
+        sycl::queue *create_queue_impl(sycl::context context, sycl::device device,
                                     bool enable_exception_handler,
                                     Properties... properties) {
-        sycl::async_handler eh = {};
-        if (enable_exception_handler) {
-          eh = exception_handler;
-        }
-        _queues.push_back(std::make_shared<sycl::queue>(
-            device, eh,
-                        sycl::property_list(
-#ifdef DPCT_PROFILING_ENABLED
-                            sycl::property::queue::enable_profiling(),
-#endif
-                            properties...)));
+            sycl::async_handler eh = {};
+            if (enable_exception_handler) {
+                eh = exception_handler;
+            }
+            _queues.push_back(std::make_shared<sycl::queue>(
+                context, device, eh,
+                sycl::property_list(
+        #ifdef DPCT_PROFILING_ENABLED
+                    sycl::property::queue::enable_profiling(),
+        #endif
+                    properties...)));
 
         return _queues.back().get();
       }
@@ -811,6 +818,7 @@ namespace dpct
       }
       sycl::queue *_q_in_order, *_q_out_of_order;
       sycl::queue *_saved_queue;
+      sycl::context _ctx;
       std::vector<std::shared_ptr<sycl::queue>> _queues;
       mutable mutex_type m_mutex;
     };
diff --git a/ggml/src/ggml-sycl/mmq.cpp b/ggml/src/ggml-sycl/mmq.cpp
index e952533d310ec..0a6a5456c47ee 100644
--- a/ggml/src/ggml-sycl/mmq.cpp
+++ b/ggml/src/ggml-sycl/mmq.cpp
@@ -1779,7 +1779,7 @@ static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy,
     int id;
     SYCL_CHECK(
         CHECK_TRY_ERROR(id = get_current_device_id()));
-    const int compute_capability = ggml_sycl_info().devices[id].cc;
+    const int compute_capability = ggml_sycl_info().device_infos[id].cc;
 
     int mmq_x, mmq_y, nwarps;
     if (compute_capability >= VER_GEN13) {
@@ -1894,7 +1894,7 @@ static void ggml_mul_mat_q4_1_q8_1_sycl(const void *vx, const void *vy,
     int id;
     SYCL_CHECK(
         CHECK_TRY_ERROR(id = get_current_device_id()));
-    const int compute_capability = ggml_sycl_info().devices[id].cc;
+    const int compute_capability = ggml_sycl_info().device_infos[id].cc;
 
     int mmq_x, mmq_y, nwarps;
     if (compute_capability >= VER_GEN13) {
@@ -2009,7 +2009,7 @@ static void ggml_mul_mat_q5_0_q8_1_sycl(const void *vx, const void *vy,
     int id;
     SYCL_CHECK(
         CHECK_TRY_ERROR(id = get_current_device_id()));
-    const int compute_capability = ggml_sycl_info().devices[id].cc;
+    const int compute_capability = ggml_sycl_info().device_infos[id].cc;
 
     int mmq_x, mmq_y, nwarps;
     if (compute_capability >= VER_GEN13) {
@@ -2124,7 +2124,7 @@ static void ggml_mul_mat_q5_1_q8_1_sycl(const void *vx, const void *vy,
     int id;
     SYCL_CHECK(
         CHECK_TRY_ERROR(id = get_current_device_id()));
-    const int compute_capability = ggml_sycl_info().devices[id].cc;
+    const int compute_capability = ggml_sycl_info().device_infos[id].cc;
 
     int mmq_x, mmq_y, nwarps;
     if (compute_capability >= VER_GEN13) {
@@ -2239,7 +2239,7 @@ static void ggml_mul_mat_q8_0_q8_1_sycl(const void *vx, const void *vy,
     int id;
     SYCL_CHECK(
         CHECK_TRY_ERROR(id = get_current_device_id()));
-    const int compute_capability = ggml_sycl_info().devices[id].cc;
+    const int compute_capability = ggml_sycl_info().device_infos[id].cc;
 
     int mmq_x, mmq_y, nwarps;
     if (compute_capability >= VER_GEN13) {
@@ -2354,7 +2354,7 @@ static void ggml_mul_mat_q2_K_q8_1_sycl(const void *vx, const void *vy,
     int id;
     SYCL_CHECK(
         CHECK_TRY_ERROR(id = get_current_device_id()));
-    const int compute_capability = ggml_sycl_info().devices[id].cc;
+    const int compute_capability = ggml_sycl_info().device_infos[id].cc;
 
     int mmq_x, mmq_y, nwarps;
     if (compute_capability >= VER_GEN13) {
@@ -2477,7 +2477,7 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
     int id;
     SYCL_CHECK(
         CHECK_TRY_ERROR(id = get_current_device_id()));
-    const int compute_capability = ggml_sycl_info().devices[id].cc;
+    const int compute_capability = ggml_sycl_info().device_infos[id].cc;
 
     int mmq_x, mmq_y, nwarps;
     if (compute_capability >= VER_GEN13) {
@@ -2605,7 +2605,7 @@ static void ggml_mul_mat_q4_K_q8_1_sycl(const void *vx, const void *vy,
     int id;
     SYCL_CHECK(
         CHECK_TRY_ERROR(id = get_current_device_id()));
-    const int compute_capability = ggml_sycl_info().devices[id].cc;
+    const int compute_capability = ggml_sycl_info().device_infos[id].cc;
 
     int mmq_x, mmq_y, nwarps;
     if (compute_capability >= VER_GEN13) {
@@ -2726,7 +2726,7 @@ static void ggml_mul_mat_q5_K_q8_1_sycl(const void *vx, const void *vy,
     int id;
     SYCL_CHECK(
         CHECK_TRY_ERROR(id = get_current_device_id()));
-    const int compute_capability = ggml_sycl_info().devices[id].cc;
+    const int compute_capability = ggml_sycl_info().device_infos[id].cc;
 
     int mmq_x, mmq_y, nwarps;
     if (compute_capability >= VER_GEN13) {
@@ -2847,7 +2847,7 @@ static void ggml_mul_mat_q6_K_q8_1_sycl(const void *vx, const void *vy,
     int id;
     SYCL_CHECK(
         CHECK_TRY_ERROR(id = get_current_device_id()));
-    const int compute_capability = ggml_sycl_info().devices[id].cc;
+    const int compute_capability = ggml_sycl_info().device_infos[id].cc;
 
     int mmq_x, mmq_y, nwarps;
     if (compute_capability >= VER_GEN13) {
diff --git a/ggml/src/ggml-sycl/sycl_device.cpp b/ggml/src/ggml-sycl/sycl_device.cpp
new file mode 100644
index 0000000000000..e2633d729671e
--- /dev/null
+++ b/ggml/src/ggml-sycl/sycl_device.cpp
@@ -0,0 +1,286 @@
+#include "sycl_device.hpp"
+#include "sycl_hw.hpp"
+
+
+void ggml_sycl_device_info::init(
+    ggml_sycl_backend_device_filter device_filter) {
+    switch (device_filter) {
+    case SYCL_DEVICES_TOP_LEVEL_ZERO:
+        detect_sycl_gpu_list_with_max_cu();
+        create_context_for_devices();
+        break;
+    case SYCL_ALL_DEVICES:
+        detect_all_sycl_device_list();
+        create_context_for_devices();
+        break;
+    case SYCL_VISIBLE_DEVICES:
+        detect_sycl_visible_device_list();
+        create_context_for_devices();
+        break;
+    default:
+        std::cerr << "ggml_sycl_device_info: Invalid device_filter " << device_filter
+                  << std::endl;
+    }
+    init_allow_devices();
+    device_count = ids.size();
+}
+
+/*
+Bind all devices in same host with same context, for better performance in
+device-to-device copy in the future.
+*/
+void ggml_sycl_device_info::create_context_for_devices() {
+    assert(devices.size() > 0);
+    sycl::context ctx = sycl::context(devices);
+    first_queue = dpct::get_current_device().create_queue(ctx, devices[0]);
+    co_ctx = first_queue->get_context();
+}
+
+sycl::queue *ggml_sycl_device_info::_create_queue_ptr(sycl::device device) {
+    auto q = dpct::get_current_device().create_queue(co_ctx, device);
+    return q;
+}
+
+sycl::queue *ggml_sycl_device_info::create_queue_for_device(sycl::device &device) {
+    dpct::select_device(dpct::dev_mgr::instance().get_device_id(device));
+    auto qptr = _create_queue_ptr(device);
+    return qptr;
+}
+
+sycl::queue *ggml_sycl_device_info::create_queue_for_device_id(int id) {
+    sycl::device device = dpct::dev_mgr::instance().get_device(id);
+    return create_queue_for_device(device);
+}
+
+int ggml_sycl_device_info::get_device_index(int id) {
+    for (int i = 0; i < ids.size(); i++) {
+        if (ids[i] == id)
+            return i;
+    }
+    return -1;
+}
+
+void ggml_sycl_device_info::init_allow_devices() {
+    device_list = "";
+    for (auto & id: ids) {
+        device_list += std::to_string(id);
+        device_list += ",";
+    }
+    if (device_list.length() > 1) {
+        device_list.pop_back();
+    }
+}
+
+bool ggml_sycl_device_info::is_allowed_device(int id) {
+    return std::find(ids.begin(), ids.end(), id) != ids.end();
+}
+
+void ggml_sycl_device_info::detect_all_sycl_device_list() try {
+    int all_device_count = dpct::dev_mgr::instance().device_count();
+
+    for (int id = 0; id < all_device_count; id++) {
+        add_device_info(id);
+    }
+    return;
+} catch (sycl::exception const &exc) {
+    std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+              << ", line:" << __LINE__ << std::endl;
+    std::exit(1);
+}
+
+std::vector<int> ggml_sycl_device_info::get_sycl_visible_devices() {
+    static std::vector<int> device_ids;
+    char *devices_env = getenv("GGML_SYCL_VISIBLE_DEVICES");
+    if (devices_env != nullptr) {
+        std::string devices(devices_env);
+        std::replace(devices.begin(), devices.end(), ',', ' ');
+
+        std::stringstream ss(devices);
+        int tmp;
+        while (ss >> tmp) {
+            device_ids.push_back(tmp);
+        }
+    }
+    return device_ids;
+}
+
+void ggml_sycl_device_info::detect_sycl_visible_device_list() try {
+    std::vector<int> sycl_devices = get_sycl_visible_devices();
+    int all_device_count = dpct::dev_mgr::instance().device_count();
+
+    for (auto & id: sycl_devices) {
+        if (id >= all_device_count) {
+            std::cerr << __func__ << ": invalid device_id:" << id
+                      << " from GGML_SYCL_VISIBLE_DEVICES="
+                      << getenv("GGML_SYCL_VISIBLE_DEVICES")
+                      << ", available IDs: ";
+            if (all_device_count > 1) {
+                std::cerr << "[0, " << all_device_count - 1 << "]";
+            } else if (all_device_count == 1) {
+                std::cerr << "[0]";
+            } else {
+                std::cerr << "[]";
+            }
+            std::cerr << std::endl;
+        }
+        add_device_info(id);
+    }
+    return;
+} catch (sycl::exception const &exc) {
+    std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+              << ", line:" << __LINE__ << std::endl;
+    std::exit(1);
+}
+
+/*
+Use all GPUs with same top max compute units
+*/
+void ggml_sycl_device_info::detect_sycl_gpu_list_with_max_cu() try {
+    int all_device_count = dpct::dev_mgr::instance().device_count();
+    int local_max_compute_units = 0;
+    for (int id = 0; id < all_device_count; id++) {
+        sycl::device device = dpct::dev_mgr::instance().get_device(id);
+        if (!device.is_gpu())
+            continue;
+        dpct::device_info prop;
+        dpct::get_device_info(prop, device);
+        if (local_max_compute_units < prop.get_max_compute_units())
+            local_max_compute_units = prop.get_max_compute_units();
+    }
+
+    for (int id = 0; id < all_device_count; id++) {
+        sycl::device device = dpct::dev_mgr::instance().get_device(id);
+        if (!device.is_gpu())
+            continue;
+        dpct::device_info prop;
+        dpct::get_device_info(prop, device);
+        if (local_max_compute_units == prop.get_max_compute_units() &&
+            is_ext_oneapi_device(device)) {
+            add_device_info(id);
+        }
+    }
+    return;
+} catch (sycl::exception const &exc) {
+    std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+              << ", line:" << __LINE__ << std::endl;
+    std::exit(1);
+}
+
+int ggml_sycl_device_info::get_device_count() { return device_count; }
+
+bool ggml_sycl_device_info::is_ext_oneapi_device(const sycl::device &dev) {
+    sycl::backend dev_backend = dev.get_backend();
+    if (dev_backend == sycl::backend::ext_oneapi_level_zero ||
+        dev_backend == sycl::backend::ext_oneapi_cuda ||
+        dev_backend == sycl::backend::ext_oneapi_hip)
+        return true;
+    return false;
+}
+
+void ggml_sycl_device_info::add_device_info(int id) {
+    sycl::device device = dpct::dev_mgr::instance().get_device(id);
+    dpct::device_info prop;
+    dpct::get_device_info(prop, device);
+
+    ids.push_back(id);
+    devices.push_back(device);
+
+    device_infos[id].id = id;
+    device_infos[id].device = device;
+    device_infos[id].max_work_group_sizes = prop.get_max_work_group_size();
+    device_infos[id].max_compute_units = prop.get_max_compute_units();
+    device_infos[id].hw_family = get_device_family(&device);
+    for (int i=0; i<GGML_SYCL_MAX_STREAMS;i++) {
+        device_infos[id].qptrs[i] = create_queue_for_device_id(id);
+    }
+}
+
+void ggml_sycl_device_info::print_gpu_device_list() {
+   char *hint = NULL;
+    if (oneapi_device_selector_existed && sycl_visible_devices_existed) {
+        hint = "detect %d SYCL devices:[%s] by ONEAPI_DEVICE_SELECTOR=%s and "
+               "GGML_SYCL_VISIBLE_DEVICES=%s\n";
+        fprintf(stderr, hint, get_device_count(), devices_list(),
+                getenv("ONEAPI_DEVICE_SELECTOR"),
+                getenv("GGML_SYCL_VISIBLE_DEVICES"));
+    } else if (oneapi_device_selector_existed) {
+        hint = "detect %d SYCL devices:[%s] by ONEAPI_DEVICE_SELECTOR=%s\n";
+        fprintf(stderr, hint, get_device_count(), devices_list(),
+                getenv("ONEAPI_DEVICE_SELECTOR"));
+    } else if (sycl_visible_devices_existed) {
+        hint = "detect %d SYCL devices:[%s] by GGML_SYCL_VISIBLE_DEVICES=%s\n";
+        fprintf(stderr, hint, get_device_count(), devices_list(),
+                getenv("GGML_SYCL_VISIBLE_DEVICES"));
+    } else {
+        hint = "detect %d SYCL level-zero GPUs:[%s] with top Max compute "
+               "units:%d, to use any SYCL devices, set/export "
+               "GGML_SYCL_VISIBLE_DEVICES or ONEAPI_DEVICE_SELECTOR\n";
+        fprintf(stderr, hint, get_device_count(), devices_list(),
+                device_infos[0].max_compute_units);
+    }
+}
+
+int ggml_sycl_device_info::work_group_size(int id) {
+    GGML_ASSERT(is_allowed_device(id));
+    return device_infos[id].max_work_group_sizes;
+}
+
+ggml_sycl_device_info::ggml_sycl_device_info() {
+    oneapi_device_selector_existed = env_existed("ONEAPI_DEVICE_SELECTOR");
+    sycl_visible_devices_existed = env_existed("GGML_SYCL_VISIBLE_DEVICES");
+
+    if (sycl_visible_devices_existed) {
+        init(SYCL_VISIBLE_DEVICES);
+    } else if (oneapi_device_selector_existed) {
+        init(SYCL_ALL_DEVICES);
+    } else {
+        init(SYCL_DEVICES_TOP_LEVEL_ZERO);
+    }
+
+    int64_t total_vram = 0;
+
+    for (int i = 0; i < device_count; ++i) {
+        int id = get_device_id(i);
+        device_infos[id].vmm = 0;
+        dpct::device_info prop;
+        dpct::get_device_info(
+            prop, dpct::dev_mgr::instance().get_device(id));
+
+        // continue data, so use device index
+        default_tensor_split[i] = total_vram;
+        total_vram += prop.get_global_mem_size();
+
+        device_infos[id].cc =
+            100 * prop.get_major_version() + 10 * prop.get_minor_version();
+    }
+
+    // continue data, so use device index
+    for (int i = 0; i < device_count; ++i) {
+        default_tensor_split[i] /= total_vram;
+    }
+
+    print_gpu_device_list();
+}
+
+const char *ggml_sycl_device_info::devices_list() {
+    return device_list.c_str();
+}
+
+int ggml_sycl_device_info::get_device_id(int device_index) {
+    if (device_index < device_count) {
+        return ids.at(device_index);
+    } else {
+        std::cerr << __func__ << ":SYCL device:" << device_index
+                  << " is out of range:[" << devices_list() << "]" << std::endl;
+        std::exit(1);
+    }
+}
+
+int ggml_sycl_device_info::hw_family(int id) {
+    return device_infos[id].hw_family;
+}
+
+static inline bool env_existed(const char *env_name) {
+     char *user_device_string = getenv(env_name);
+     return user_device_string!=NULL;
+}
\ No newline at end of file
diff --git a/ggml/src/ggml-sycl/sycl_device.hpp b/ggml/src/ggml-sycl/sycl_device.hpp
new file mode 100644
index 0000000000000..33c4fc5fa6945
--- /dev/null
+++ b/ggml/src/ggml-sycl/sycl_device.hpp
@@ -0,0 +1,83 @@
+#ifndef SYCL_DEVICE_HPP
+#define SYCL_DEVICE_HPP
+
+#include <algorithm>
+#include <stdio.h>
+#include <vector>
+
+
+#include <sycl/sycl.hpp>
+#include "dpct/helper.hpp"
+
+#include "ggml-sycl.h"
+#include "presets.hpp"
+// #include "common.hpp"
+
+enum ggml_sycl_backend_device_filter {
+  SYCL_ALL_DEVICES = 0,
+  SYCL_DEVICES_TOP_LEVEL_ZERO,
+  SYCL_VISIBLE_DEVICES
+};
+
+struct sycl_device_info {
+    int     cc;                 // compute capability
+    // int     nsm;                // number of streaming multiprocessors
+    // size_t  smpb;               // max. shared memory per block
+    bool    vmm;                // virtual memory support
+    size_t  total_vram;
+
+    int id;
+    sycl::device device;
+    int max_compute_units;
+    int max_work_group_sizes;
+    int hw_family;
+    sycl::context ctx;
+    sycl::queue * qptrs[GGML_SYCL_MAX_STREAMS] = { nullptr };
+};
+
+struct ggml_sycl_device_info {
+    int device_count;
+    bool oneapi_device_selector_existed = false;
+    bool sycl_visible_devices_existed = false;
+    std::vector<int> ids;
+    std::vector<sycl::device> devices;
+    sycl::queue *first_queue;
+    std::string device_list;
+    sycl::context co_ctx;
+
+    sycl_device_info device_infos[GGML_SYCL_MAX_DEVICES];
+    std::array<float, GGML_SYCL_MAX_DEVICES> default_tensor_split = {};
+
+    ggml_sycl_device_info();
+    void init(ggml_sycl_backend_device_filter device_filter);
+
+    void print_gpu_device_list();
+    int work_group_size(int device_id);
+    bool is_allowed_device(int device_id);
+    const char* devices_list();
+    int get_device_id(int device_index);
+    int hw_family(int device_id);
+
+    sycl::queue *_create_queue_ptr(sycl::device device); //internal API to hide dpct API.
+    void create_context_for_group_gpus();
+    sycl::queue *create_queue_for_device(sycl::device &device);
+    sycl::queue *create_queue_for_device_id(int device_id);
+    int get_device_index(int device_id);
+    void create_context_for_devices();
+    void init_allow_devices();
+    void detect_all_sycl_device_list();
+    void detect_sycl_visible_device_list();
+    void detect_sycl_gpu_list_with_max_cu();
+    int get_device_count();
+    bool is_ext_oneapi_device(const sycl::device &dev);
+    void add_device_info(int id);
+    std::vector<sycl::device> get_devices();
+    std::vector<int> get_sycl_visible_devices();
+
+    sycl::context &get_co_ctx() { return co_ctx; }
+
+};
+
+static inline bool env_existed(const char *env_name);
+
+#endif // SYCL_DEVICE_HPP
diff --git a/ggml/src/ggml-sycl/sycl_hw.cpp b/ggml/src/ggml-sycl/sycl_hw.cpp
index 21e0c2af90191..af7675adb3270 100644
--- a/ggml/src/ggml-sycl/sycl_hw.cpp
+++ b/ggml/src/ggml-sycl/sycl_hw.cpp
@@ -8,8 +8,14 @@ SYCL_HW_FAMILY get_device_family(sycl::device *device_ptr) {
   auto id = device_ptr->get_info<sycl::ext::intel::info::device::device_id>();
   auto id_prefix = id & 0xff00;
 
+  const std::vector<int> Xe_ARC = {0x5600, 0x4f00};
+  const std::vector<int> Xe_Iris_IDs = {0x4900, 0xa700};
+  const std::vector<int> UHD_IDs = {0x4600};
+
   if (is_in_vector(Xe_Iris_IDs, id_prefix) or is_in_vector(UHD_IDs, id_prefix)) {
     return SYCL_HW_FAMILY_INTEL_IGPU;
+  } else if (is_in_vector(Xe_ARC, id_prefix)) {
+    return SYCL_HW_FAMILY_INTEL_ARC;
   } else {
     std::cerr << "No support PCI_ID: " << std::hex << id << std::endl;
     return SYCL_HW_FAMILY_UNKNOWN;
diff --git a/ggml/src/ggml-sycl/sycl_hw.hpp b/ggml/src/ggml-sycl/sycl_hw.hpp
index 81944475663a4..4537955d01669 100644
--- a/ggml/src/ggml-sycl/sycl_hw.hpp
+++ b/ggml/src/ggml-sycl/sycl_hw.hpp
@@ -8,13 +8,10 @@
 
 #include <sycl/sycl.hpp>
 
-// const int Xe_ARC[] = {0x5600, 0x4f};
-const std::vector<int> Xe_Iris_IDs = {0x4900, 0xa700};
-const std::vector<int> UHD_IDs = {0x4600};
-
 enum SYCL_HW_FAMILY {
   SYCL_HW_FAMILY_UNKNOWN = -1,
-  SYCL_HW_FAMILY_INTEL_IGPU = 0
+  SYCL_HW_FAMILY_INTEL_IGPU = 0,
+  SYCL_HW_FAMILY_INTEL_ARC = 1
 };
 
 bool is_in_vector(std::vector<int> &vec, int item);

From 1947c1200e1f5fd40ca8606454fe097efd3505aa Mon Sep 17 00:00:00 2001
From: arthw <14088817+arthw@users.noreply.github.com>
Date: Thu, 1 Aug 2024 11:21:16 +0800
Subject: [PATCH 2/6] support set main gpu

---
 ggml/include/ggml-sycl.h           |  2 +
 ggml/src/ggml-sycl.cpp             |  8 +++
 ggml/src/ggml-sycl/common.cpp      |  8 +--
 ggml/src/ggml-sycl/common.hpp      | 10 +---
 ggml/src/ggml-sycl/sycl_device.cpp | 86 +++++++++++++++++++++++++-----
 ggml/src/ggml-sycl/sycl_device.hpp | 20 ++++++-
 src/llama.cpp                      |  7 ++-
 7 files changed, 111 insertions(+), 30 deletions(-)

diff --git a/ggml/include/ggml-sycl.h b/ggml/include/ggml-sycl.h
index a50086afae9eb..4368e27a57178 100644
--- a/ggml/include/ggml-sycl.h
+++ b/ggml/include/ggml-sycl.h
@@ -38,6 +38,8 @@ GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id);
 GGML_API GGML_CALL int ggml_backend_sycl_get_device_id(int index);
 GGML_API GGML_CALL void ggml_sycl_set_single_device(int main_gpu_id);
 
+GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id);
+
 // SYCL doesn't support registering host memory, keep here for reference
 // GGML_API GGML_CALL bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size);
 // GGML_API GGML_CALL void ggml_backend_sycl_unregister_host_buffer(void * buffer);
diff --git a/ggml/src/ggml-sycl.cpp b/ggml/src/ggml-sycl.cpp
index 8a0b8ff639a79..77c7f610f3966 100644
--- a/ggml/src/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl.cpp
@@ -38,6 +38,7 @@
 
 #include "ggml-sycl/backend.hpp"
 #include "ggml-sycl/presets.hpp"
+#include "ggml-sycl/sycl_device.hpp"
 
 
 void   ggml_sycl_free_data(struct ggml_tensor * tensor);
@@ -5150,6 +5151,13 @@ GGML_CALL int ggml_backend_sycl_get_device_count() {
     return ggml_sycl_info().device_count;
 }
 
+GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id) {
+
+    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_set_single_device_mode\n");
+    fprintf(stderr, "ggml_backend_sycl_set_single_device: use single device: [%d]\n", main_gpu_id);
+    ggml_sycl_info(main_gpu_id);
+}
+
 GGML_CALL static ggml_backend_t ggml_backend_reg_sycl_init(const char * params, void * user_data) {
     ggml_backend_t sycl_backend = ggml_backend_sycl_init((int) (intptr_t) user_data);
     return sycl_backend;
diff --git a/ggml/src/ggml-sycl/common.cpp b/ggml/src/ggml-sycl/common.cpp
index efd702bf19991..da7d8c60eb035 100644
--- a/ggml/src/ggml-sycl/common.cpp
+++ b/ggml/src/ggml-sycl/common.cpp
@@ -137,7 +137,7 @@ void ggml_backend_sycl_print_sycl_devices() {
     }
 }
 
-static ggml_sycl_device_info ggml_sycl_init() try {
+static ggml_sycl_device_info ggml_sycl_init(int main_gpu_id) try {
     static bool initialized = false;
 
     if (!initialized) {
@@ -176,7 +176,7 @@ static ggml_sycl_device_info ggml_sycl_init() try {
         initialized = true;
     }
 
-    static ggml_sycl_device_info info;
+    static ggml_sycl_device_info info(main_gpu_id);
 
     if (info.device_count == 0) {
         fprintf(stderr, "%s: failed to initialize " GGML_SYCL_NAME ": no available device found\n",
@@ -192,8 +192,8 @@ static ggml_sycl_device_info ggml_sycl_init() try {
     std::exit(1);
 }
 
-ggml_sycl_device_info &ggml_sycl_info() {
-    static ggml_sycl_device_info info = ggml_sycl_init();
+ggml_sycl_device_info &ggml_sycl_info(int main_gpu_id) {
+    static ggml_sycl_device_info info = ggml_sycl_init(main_gpu_id);
     return info;
 }
 
diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp
index f0c93a52d84e4..5b60835a23cc0 100644
--- a/ggml/src/ggml-sycl/common.hpp
+++ b/ggml/src/ggml-sycl/common.hpp
@@ -81,12 +81,6 @@ static int g_ggml_sycl_debug = 0;
 
 typedef sycl::queue *queue_ptr;
 
-enum ggml_sycl_backend_gpu_mode {
-  SYCL_UNSET_GPU_MODE = -1,
-  SYCL_SINGLE_GPU_MODE = 0,
-  SYCL_MUL_GPU_MODE
-};
-
 static_assert(sizeof(sycl::half) == sizeof(ggml_fp16_t), "wrong fp16 size");
 
 static void crash() {
@@ -288,8 +282,8 @@ void* ggml_sycl_host_malloc(size_t size);
 void ggml_sycl_host_free(void* ptr);
 
 void ggml_backend_sycl_print_sycl_devices();
-static ggml_sycl_device_info ggml_sycl_init();
-ggml_sycl_device_info &ggml_sycl_info();
+static ggml_sycl_device_info ggml_sycl_init(int main_gpu_id);
+ggml_sycl_device_info &ggml_sycl_info(int main_gpu_id = -1);
 
 // common device functions
 
diff --git a/ggml/src/ggml-sycl/sycl_device.cpp b/ggml/src/ggml-sycl/sycl_device.cpp
index e2633d729671e..fd8c045994b98 100644
--- a/ggml/src/ggml-sycl/sycl_device.cpp
+++ b/ggml/src/ggml-sycl/sycl_device.cpp
@@ -7,22 +7,54 @@ void ggml_sycl_device_info::init(
     switch (device_filter) {
     case SYCL_DEVICES_TOP_LEVEL_ZERO:
         detect_sycl_gpu_list_with_max_cu();
-        create_context_for_devices();
         break;
     case SYCL_ALL_DEVICES:
         detect_all_sycl_device_list();
-        create_context_for_devices();
         break;
     case SYCL_VISIBLE_DEVICES:
         detect_sycl_visible_device_list();
-        create_context_for_devices();
         break;
     default:
         std::cerr << "ggml_sycl_device_info: Invalid device_filter " << device_filter
                   << std::endl;
     }
-    init_allow_devices();
+    init_devices_dynamic_info();
+    m_device_filter = device_filter;
+}
+
+void ggml_sycl_device_info::clear_device_infos() {
+  ids.clear();
+  devices.clear();
+
+  for (int id=0;id<GGML_SYCL_MAX_DEVICES;id++) {
+    device_infos[id].id = -1;
+    device_infos[id].max_work_group_sizes = 0;
+    device_infos[id].max_compute_units = 0;
+    device_infos[id].hw_family = -1;
+    // for (int i=0; i<GGML_SYCL_MAX_STREAMS;i++) {
+    //     free(device_infos[id].qptrs[i]);
+    // }
+  }
+
+  device_count = 0;
+  device_list = "";
+}
+
+void ggml_sycl_device_info::init_single_mode(int main_gpu_id) {
+    GGML_ASSERT(main_gpu_id<dpct::dev_mgr::instance().device_count());
+
+    clear_device_infos();
+    add_device_info(main_gpu_id);
+    init_devices_dynamic_info();
+    device_mode = SYCL_SINGLE_GPU_MODE;
+}
+
+void ggml_sycl_device_info::init_devices_dynamic_info() {
+    create_context_for_devices();
+    set_allow_devices();
     device_count = ids.size();
+    create_queues_for_devices();
+    update_mem();
 }
 
 /*
@@ -60,7 +92,7 @@ int ggml_sycl_device_info::get_device_index(int id) {
     return -1;
 }
 
-void ggml_sycl_device_info::init_allow_devices() {
+void ggml_sycl_device_info::set_allow_devices() {
     device_list = "";
     for (auto & id: ids) {
         device_list += std::to_string(id);
@@ -190,11 +222,22 @@ void ggml_sycl_device_info::add_device_info(int id) {
     device_infos[id].max_work_group_sizes = prop.get_max_work_group_size();
     device_infos[id].max_compute_units = prop.get_max_compute_units();
     device_infos[id].hw_family = get_device_family(&device);
+}
+
+void ggml_sycl_device_info::create_queues(int id) {
     for (int i=0; i<GGML_SYCL_MAX_STREAMS;i++) {
         device_infos[id].qptrs[i] = create_queue_for_device_id(id);
     }
 }
 
+void ggml_sycl_device_info::create_queues_for_devices() {
+    for (auto &id: ids) {
+        for (int i=0; i<GGML_SYCL_MAX_STREAMS;i++) {
+            device_infos[id].qptrs[i] = create_queue_for_device_id(id);
+        }
+    }
+}
+
 void ggml_sycl_device_info::print_gpu_device_list() {
    char *hint = NULL;
     if (oneapi_device_selector_existed && sycl_visible_devices_existed) {
@@ -225,16 +268,12 @@ int ggml_sycl_device_info::work_group_size(int id) {
     return device_infos[id].max_work_group_sizes;
 }
 
-ggml_sycl_device_info::ggml_sycl_device_info() {
-    oneapi_device_selector_existed = env_existed("ONEAPI_DEVICE_SELECTOR");
-    sycl_visible_devices_existed = env_existed("GGML_SYCL_VISIBLE_DEVICES");
+void ggml_sycl_device_info::update_mem() {
 
-    if (sycl_visible_devices_existed) {
-        init(SYCL_VISIBLE_DEVICES);
-    } else if (oneapi_device_selector_existed) {
-        init(SYCL_ALL_DEVICES);
-    } else {
-        init(SYCL_DEVICES_TOP_LEVEL_ZERO);
+    for (int i = 0; i < GGML_SYCL_MAX_DEVICES; ++i) {
+        device_infos[i].vmm = 0;
+        default_tensor_split[i] = 0;
+        device_infos[i].cc =0;
     }
 
     int64_t total_vram = 0;
@@ -258,6 +297,23 @@ ggml_sycl_device_info::ggml_sycl_device_info() {
     for (int i = 0; i < device_count; ++i) {
         default_tensor_split[i] /= total_vram;
     }
+}
+
+ggml_sycl_device_info::ggml_sycl_device_info(int main_gpu_id) {
+    oneapi_device_selector_existed = env_existed("ONEAPI_DEVICE_SELECTOR");
+    sycl_visible_devices_existed = env_existed("GGML_SYCL_VISIBLE_DEVICES");
+
+    if (main_gpu_id == -1) {
+        if (sycl_visible_devices_existed) {
+            init(SYCL_VISIBLE_DEVICES);
+        } else if (oneapi_device_selector_existed) {
+            init(SYCL_ALL_DEVICES);
+        } else {
+            init(SYCL_DEVICES_TOP_LEVEL_ZERO);
+        }
+    } else {
+        init_single_mode(main_gpu_id);
+    }
 
     print_gpu_device_list();
 }
@@ -272,6 +328,8 @@ int ggml_sycl_device_info::get_device_id(int device_index) {
     } else {
         std::cerr << __func__ << ":SYCL device:" << device_index
                   << " is out of range:[" << devices_list() << "]" << std::endl;
+        int* ptr = NULL;
+        *ptr = 0;
         std::exit(1);
     }
 }
diff --git a/ggml/src/ggml-sycl/sycl_device.hpp b/ggml/src/ggml-sycl/sycl_device.hpp
index 33c4fc5fa6945..f7acd5a8cc81f 100644
--- a/ggml/src/ggml-sycl/sycl_device.hpp
+++ b/ggml/src/ggml-sycl/sycl_device.hpp
@@ -19,6 +19,13 @@ enum ggml_sycl_backend_device_filter {
   SYCL_VISIBLE_DEVICES
 };
 
+enum ggml_sycl_backend_gpu_mode {
+    SYCL_UNSET_GPU_MODE = -1,
+    SYCL_SINGLE_GPU_MODE = 0,
+    SYCL_MUL_GPU_MODE
+};
+
+
 struct sycl_device_info {
     int     cc;                 // compute capability
     // int     nsm;                // number of streaming multiprocessors
@@ -36,6 +43,7 @@ struct sycl_device_info {
 };
 
 struct ggml_sycl_device_info {
+    int device_mode = SYCL_MUL_GPU_MODE;
     int device_count;
     bool oneapi_device_selector_existed = false;
     bool sycl_visible_devices_existed = false;
@@ -44,13 +52,17 @@ struct ggml_sycl_device_info {
     sycl::queue *first_queue;
     std::string device_list;
     sycl::context co_ctx;
+    int m_device_filter;
 
     sycl_device_info device_infos[GGML_SYCL_MAX_DEVICES];
     std::array<float, GGML_SYCL_MAX_DEVICES> default_tensor_split = {};
 
-    ggml_sycl_device_info();
+    ggml_sycl_device_info(int main_gpu_id);//single device mode
+
     void init(ggml_sycl_backend_device_filter device_filter);
+    void init_single_mode(int main_gpu_id);
 
+    void clear_device_infos();
     void print_gpu_device_list();
     int work_group_size(int device_id);
     bool is_allowed_device(int device_id);
@@ -64,15 +76,19 @@ struct ggml_sycl_device_info {
     sycl::queue *create_queue_for_device_id(int device_id);
     int get_device_index(int device_id);
     void create_context_for_devices();
-    void init_allow_devices();
+    void set_allow_devices();
     void detect_all_sycl_device_list();
     void detect_sycl_visible_device_list();
     void detect_sycl_gpu_list_with_max_cu();
     int get_device_count();
     bool is_ext_oneapi_device(const sycl::device &dev);
     void add_device_info(int id);
+    void create_queues(int id);
+    void create_queues_for_devices();
     std::vector<sycl::device> get_devices();
     std::vector<int> get_sycl_visible_devices();
+    void update_mem();
+    void init_devices_dynamic_info();
 
     sycl::context &get_co_ctx() { return co_ctx; }
 
diff --git a/src/llama.cpp b/src/llama.cpp
index 7bb2dfd4625b3..c2a914addff0a 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -2831,8 +2831,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_
 #elif defined(GGML_USE_VULKAN)
     buft = ggml_backend_vk_buffer_type(gpu);
 #elif defined(GGML_USE_SYCL)
-    int gpu_id = ggml_backend_sycl_get_device_id(gpu);
-    buft = ggml_backend_sycl_buffer_type(gpu_id);
+    buft = ggml_backend_sycl_buffer_type(gpu);
 #elif defined(GGML_USE_KOMPUTE)
     buft = ggml_backend_kompute_buffer_type(gpu);
     if (buft == nullptr) {
@@ -5931,6 +5930,10 @@ static bool llm_load_tensors(
             model.buft_output = llama_default_buffer_type_cpu(true);
         }
     } else {
+
+#if defined(GGML_USE_SYCL)
+        ggml_backend_sycl_set_single_device_mode(main_gpu);
+#endif
         ggml_backend_buffer_type_t split_buft;
         if (split_mode == LLAMA_SPLIT_MODE_ROW) {
             split_buft = llama_default_buffer_type_split(model, main_gpu, tensor_split);

From 6211ac040886495c44ad2ddee80204c9bcd6721a Mon Sep 17 00:00:00 2001
From: arthw <14088817+arthw@users.noreply.github.com>
Date: Thu, 1 Aug 2024 12:42:11 +0800
Subject: [PATCH 3/6] simple code for loop

---
 ggml/src/ggml-sycl.cpp | 23 +++++++----------------
 1 file changed, 7 insertions(+), 16 deletions(-)

diff --git a/ggml/src/ggml-sycl.cpp b/ggml/src/ggml-sycl.cpp
index 77c7f610f3966..ae2420a822cd7 100644
--- a/ggml/src/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl.cpp
@@ -2816,8 +2816,7 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
         }
     }
 
-    for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
-        int id = ggml_backend_sycl_get_device_id(i);
+    for (auto & id: ggml_sycl_info().ids) {
         if ((!split && id != ctx.device) || dev[id].row_low == dev[id].row_high) {
             continue;
         }
@@ -2882,8 +2881,7 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
         const int64_t is = split ? (src1_col_0/src1_col_stride) % GGML_SYCL_MAX_STREAMS : 0;
         const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
 
-        for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
-            int id = ggml_backend_sycl_get_device_id(i);
+        for (auto & id: ggml_sycl_info().ids) {
             if ((!split && id != ctx.device) || dev[id].row_low == dev[id].row_high) {
                 continue;
             }
@@ -3025,8 +3023,7 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
         is_max = is_max <= GGML_SYCL_MAX_STREAMS ? is_max : GGML_SYCL_MAX_STREAMS;
 
         ggml_sycl_set_device(ctx.device);
-        for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
-            int id = ggml_backend_sycl_get_device_id(i);
+        for (auto & id: ggml_sycl_info().ids) {
             if (dev[id].row_low == dev[id].row_high) {
                 continue;
             }
@@ -4343,12 +4340,9 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device_id) {
     static bool ggml_backend_sycl_buffer_type_initialized = false;
 
     if (!ggml_backend_sycl_buffer_type_initialized) {
-        for (int i = 0; i < ggml_sycl_info().device_count; i++) {
-            int id = ggml_backend_sycl_get_device_id(i);
+        for (auto & id: ggml_sycl_info().ids) {
             auto & device = dpct::dev_mgr::instance().get_device(id);
-            // queue_ptr stream = &(device.default_queue());
             queue_ptr stream = ggml_sycl_info().device_infos[id].qptrs[0];
-
             ggml_backend_sycl_buffer_types[id] = {
                 /* .iface    = */ ggml_backend_sycl_buffer_type_interface,
                 /* .context  = */ new ggml_backend_sycl_buffer_type_context{id, GGML_SYCL_NAME + std::to_string(id), stream},
@@ -4369,8 +4363,7 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(ggml_backend_sycl_conte
     static bool ggml_backend_sycl_buffer_type_initialized = false;
 
     if (!ggml_backend_sycl_buffer_type_initialized) {
-        for (int i = 0; i < ggml_sycl_info().device_count; i++) {
-            int id = ggml_backend_sycl_get_device_id(i);
+        for (auto & id: ggml_sycl_info().ids) {
             ggml_backend_sycl_buffer_types[id] = {
                 /* .iface    = */ ggml_backend_sycl_buffer_type_interface,
                 /* .context  = */ new ggml_backend_sycl_buffer_type_context{id, GGML_SYCL_NAME + std::to_string(id), ctx->stream(id, 0)},
@@ -4399,8 +4392,7 @@ static void get_row_split(int64_t * row_low, int64_t * row_high, const ggml_tens
 struct ggml_backend_sycl_split_buffer_context {
     ~ggml_backend_sycl_split_buffer_context() try {
         for (ggml_tensor_extra_gpu * extra : tensor_extras) {
-            for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
-                int id = ggml_backend_sycl_get_device_id(i);
+            for (auto & id: ggml_sycl_info().ids) {
                 for (int64_t is = 0; is < GGML_SYCL_MAX_STREAMS; ++is) {
                     if (extra->events[id][is] != nullptr) {
                         /*
@@ -5169,8 +5161,7 @@ extern "C" int ggml_backend_sycl_reg_devices();
 
 int ggml_backend_sycl_reg_devices() {
     assert(ggml_sycl_info().device_count>0);
-    for (int i = 0; i < ggml_sycl_info().device_count; i++) {
-        int id = ggml_backend_sycl_get_device_id(i);
+    for (auto & id: ggml_sycl_info().ids) {
         char name[128];
         snprintf(name, sizeof(name), "%s%d", GGML_SYCL_NAME, id);
         ggml_backend_register(name, ggml_backend_reg_sycl_init, ggml_backend_sycl_buffer_type(id), (void *) (intptr_t) id);

From 254a750249f74e838e744eedd8607f05fe9707d7 Mon Sep 17 00:00:00 2001
From: arthw <14088817+arthw@users.noreply.github.com>
Date: Thu, 1 Aug 2024 12:48:18 +0800
Subject: [PATCH 4/6] rename device_infos to infos

---
 ggml/src/ggml-sycl.cpp             | 16 +++++------
 ggml/src/ggml-sycl/common.hpp      |  2 +-
 ggml/src/ggml-sycl/mmq.cpp         | 20 +++++++-------
 ggml/src/ggml-sycl/sycl_device.cpp | 43 ++++++++++++++----------------
 ggml/src/ggml-sycl/sycl_device.hpp |  4 +--
 5 files changed, 41 insertions(+), 44 deletions(-)

diff --git a/ggml/src/ggml-sycl.cpp b/ggml/src/ggml-sycl.cpp
index ae2420a822cd7..cf83d8c10f5cb 100644
--- a/ggml/src/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl.cpp
@@ -2280,11 +2280,11 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_SYC
     for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
         int id = ggml_backend_sycl_get_device_id(i);
         if (tensor_split[i] < (i + 1 < ggml_sycl_info().device_count ? tensor_split[i + 1] : 1.0f)) {
-            if (min_compute_capability > ggml_sycl_info().device_infos[id].cc) {
-                min_compute_capability = ggml_sycl_info().device_infos[id].cc;
+            if (min_compute_capability > ggml_sycl_info().infos[id].cc) {
+                min_compute_capability = ggml_sycl_info().infos[id].cc;
             }
-            if (max_compute_capability < ggml_sycl_info().device_infos[id].cc) {
-                max_compute_capability = ggml_sycl_info().device_infos[id].cc;
+            if (max_compute_capability < ggml_sycl_info().infos[id].cc) {
+                max_compute_capability = ggml_sycl_info().infos[id].cc;
             }
         }
     }
@@ -3416,12 +3416,12 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
                 continue;
             }
 
-            if (min_compute_capability > ggml_sycl_info().device_infos[id].cc) {
-                min_compute_capability = ggml_sycl_info().device_infos[id].cc;
+            if (min_compute_capability > ggml_sycl_info().infos[id].cc) {
+                min_compute_capability = ggml_sycl_info().infos[id].cc;
             }
         }
     } else {
-        min_compute_capability    = ggml_sycl_info().device_infos[ctx.device].cc;
+        min_compute_capability    = ggml_sycl_info().infos[ctx.device].cc;
     }
 
     // check data types and tensor shapes for custom matrix multiplication kernels:
@@ -4342,7 +4342,7 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device_id) {
     if (!ggml_backend_sycl_buffer_type_initialized) {
         for (auto & id: ggml_sycl_info().ids) {
             auto & device = dpct::dev_mgr::instance().get_device(id);
-            queue_ptr stream = ggml_sycl_info().device_infos[id].qptrs[0];
+            queue_ptr stream = ggml_sycl_info().infos[id].qptrs[0];
             ggml_backend_sycl_buffer_types[id] = {
                 /* .iface    = */ ggml_backend_sycl_buffer_type_interface,
                 /* .context  = */ new ggml_backend_sycl_buffer_type_context{id, GGML_SYCL_NAME + std::to_string(id), stream},
diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp
index 5b60835a23cc0..ccdd789cc77dd 100644
--- a/ggml/src/ggml-sycl/common.hpp
+++ b/ggml/src/ggml-sycl/common.hpp
@@ -241,7 +241,7 @@ struct ggml_backend_sycl_context {
         device(id),
         name(GGML_SYCL_NAME + std::to_string(device)) {
             for (int i=0;i<GGML_SYCL_MAX_STREAMS; i++){
-                qptrs[id][i] = sycl_device_info.device_infos[id].qptrs[i];
+                qptrs[id][i] = sycl_device_info.infos[id].qptrs[i];
             }
     }
 
diff --git a/ggml/src/ggml-sycl/mmq.cpp b/ggml/src/ggml-sycl/mmq.cpp
index 0a6a5456c47ee..9f2ee2c88b820 100644
--- a/ggml/src/ggml-sycl/mmq.cpp
+++ b/ggml/src/ggml-sycl/mmq.cpp
@@ -1779,7 +1779,7 @@ static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy,
     int id;
     SYCL_CHECK(
         CHECK_TRY_ERROR(id = get_current_device_id()));
-    const int compute_capability = ggml_sycl_info().device_infos[id].cc;
+    const int compute_capability = ggml_sycl_info().infos[id].cc;
 
     int mmq_x, mmq_y, nwarps;
     if (compute_capability >= VER_GEN13) {
@@ -1894,7 +1894,7 @@ static void ggml_mul_mat_q4_1_q8_1_sycl(const void *vx, const void *vy,
     int id;
     SYCL_CHECK(
         CHECK_TRY_ERROR(id = get_current_device_id()));
-    const int compute_capability = ggml_sycl_info().device_infos[id].cc;
+    const int compute_capability = ggml_sycl_info().infos[id].cc;
 
     int mmq_x, mmq_y, nwarps;
     if (compute_capability >= VER_GEN13) {
@@ -2009,7 +2009,7 @@ static void ggml_mul_mat_q5_0_q8_1_sycl(const void *vx, const void *vy,
     int id;
     SYCL_CHECK(
         CHECK_TRY_ERROR(id = get_current_device_id()));
-    const int compute_capability = ggml_sycl_info().device_infos[id].cc;
+    const int compute_capability = ggml_sycl_info().infos[id].cc;
 
     int mmq_x, mmq_y, nwarps;
     if (compute_capability >= VER_GEN13) {
@@ -2124,7 +2124,7 @@ static void ggml_mul_mat_q5_1_q8_1_sycl(const void *vx, const void *vy,
     int id;
     SYCL_CHECK(
         CHECK_TRY_ERROR(id = get_current_device_id()));
-    const int compute_capability = ggml_sycl_info().device_infos[id].cc;
+    const int compute_capability = ggml_sycl_info().infos[id].cc;
 
     int mmq_x, mmq_y, nwarps;
     if (compute_capability >= VER_GEN13) {
@@ -2239,7 +2239,7 @@ static void ggml_mul_mat_q8_0_q8_1_sycl(const void *vx, const void *vy,
     int id;
     SYCL_CHECK(
         CHECK_TRY_ERROR(id = get_current_device_id()));
-    const int compute_capability = ggml_sycl_info().device_infos[id].cc;
+    const int compute_capability = ggml_sycl_info().infos[id].cc;
 
     int mmq_x, mmq_y, nwarps;
     if (compute_capability >= VER_GEN13) {
@@ -2354,7 +2354,7 @@ static void ggml_mul_mat_q2_K_q8_1_sycl(const void *vx, const void *vy,
     int id;
     SYCL_CHECK(
         CHECK_TRY_ERROR(id = get_current_device_id()));
-    const int compute_capability = ggml_sycl_info().device_infos[id].cc;
+    const int compute_capability = ggml_sycl_info().infos[id].cc;
 
     int mmq_x, mmq_y, nwarps;
     if (compute_capability >= VER_GEN13) {
@@ -2477,7 +2477,7 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
     int id;
     SYCL_CHECK(
         CHECK_TRY_ERROR(id = get_current_device_id()));
-    const int compute_capability = ggml_sycl_info().device_infos[id].cc;
+    const int compute_capability = ggml_sycl_info().infos[id].cc;
 
     int mmq_x, mmq_y, nwarps;
     if (compute_capability >= VER_GEN13) {
@@ -2605,7 +2605,7 @@ static void ggml_mul_mat_q4_K_q8_1_sycl(const void *vx, const void *vy,
     int id;
     SYCL_CHECK(
         CHECK_TRY_ERROR(id = get_current_device_id()));
-    const int compute_capability = ggml_sycl_info().device_infos[id].cc;
+    const int compute_capability = ggml_sycl_info().infos[id].cc;
 
     int mmq_x, mmq_y, nwarps;
     if (compute_capability >= VER_GEN13) {
@@ -2726,7 +2726,7 @@ static void ggml_mul_mat_q5_K_q8_1_sycl(const void *vx, const void *vy,
     int id;
     SYCL_CHECK(
         CHECK_TRY_ERROR(id = get_current_device_id()));
-    const int compute_capability = ggml_sycl_info().device_infos[id].cc;
+    const int compute_capability = ggml_sycl_info().infos[id].cc;
 
     int mmq_x, mmq_y, nwarps;
     if (compute_capability >= VER_GEN13) {
@@ -2847,7 +2847,7 @@ static void ggml_mul_mat_q6_K_q8_1_sycl(const void *vx, const void *vy,
     int id;
     SYCL_CHECK(
         CHECK_TRY_ERROR(id = get_current_device_id()));
-    const int compute_capability = ggml_sycl_info().device_infos[id].cc;
+    const int compute_capability = ggml_sycl_info().infos[id].cc;
 
     int mmq_x, mmq_y, nwarps;
     if (compute_capability >= VER_GEN13) {
diff --git a/ggml/src/ggml-sycl/sycl_device.cpp b/ggml/src/ggml-sycl/sycl_device.cpp
index fd8c045994b98..e476adef0e38e 100644
--- a/ggml/src/ggml-sycl/sycl_device.cpp
+++ b/ggml/src/ggml-sycl/sycl_device.cpp
@@ -22,18 +22,15 @@ void ggml_sycl_device_info::init(
     m_device_filter = device_filter;
 }
 
-void ggml_sycl_device_info::clear_device_infos() {
+void ggml_sycl_device_info::clear_infos() {
   ids.clear();
   devices.clear();
 
   for (int id=0;id<GGML_SYCL_MAX_DEVICES;id++) {
-    device_infos[id].id = -1;
-    device_infos[id].max_work_group_sizes = 0;
-    device_infos[id].max_compute_units = 0;
-    device_infos[id].hw_family = -1;
-    // for (int i=0; i<GGML_SYCL_MAX_STREAMS;i++) {
-    //     free(device_infos[id].qptrs[i]);
-    // }
+    infos[id].id = -1;
+    infos[id].max_work_group_sizes = 0;
+    infos[id].max_compute_units = 0;
+    infos[id].hw_family = -1;
   }
 
   device_count = 0;
@@ -43,7 +40,7 @@ void ggml_sycl_device_info::clear_device_infos() {
 void ggml_sycl_device_info::init_single_mode(int main_gpu_id) {
     GGML_ASSERT(main_gpu_id<dpct::dev_mgr::instance().device_count());
 
-    clear_device_infos();
+    clear_infos();
     add_device_info(main_gpu_id);
     init_devices_dynamic_info();
     device_mode = SYCL_SINGLE_GPU_MODE;
@@ -217,23 +214,23 @@ void ggml_sycl_device_info::add_device_info(int id) {
     ids.push_back(id);
     devices.push_back(device);
 
-    device_infos[id].id = id;
-    device_infos[id].device = device;
-    device_infos[id].max_work_group_sizes = prop.get_max_work_group_size();
-    device_infos[id].max_compute_units = prop.get_max_compute_units();
-    device_infos[id].hw_family = get_device_family(&device);
+    infos[id].id = id;
+    infos[id].device = device;
+    infos[id].max_work_group_sizes = prop.get_max_work_group_size();
+    infos[id].max_compute_units = prop.get_max_compute_units();
+    infos[id].hw_family = get_device_family(&device);
 }
 
 void ggml_sycl_device_info::create_queues(int id) {
     for (int i=0; i<GGML_SYCL_MAX_STREAMS;i++) {
-        device_infos[id].qptrs[i] = create_queue_for_device_id(id);
+        infos[id].qptrs[i] = create_queue_for_device_id(id);
     }
 }
 
 void ggml_sycl_device_info::create_queues_for_devices() {
     for (auto &id: ids) {
         for (int i=0; i<GGML_SYCL_MAX_STREAMS;i++) {
-            device_infos[id].qptrs[i] = create_queue_for_device_id(id);
+            infos[id].qptrs[i] = create_queue_for_device_id(id);
         }
     }
 }
@@ -259,28 +256,28 @@ void ggml_sycl_device_info::print_gpu_device_list() {
                "units:%d, to use any SYCL devices, set/export "
                "GGML_SYCL_VISIBLE_DEVICES or ONEAPI_DEVICE_SELECTOR\n";
         fprintf(stderr, hint, get_device_count(), devices_list(),
-                device_infos[0].max_compute_units);
+                infos[0].max_compute_units);
     }
 }
 
 int ggml_sycl_device_info::work_group_size(int id) {
     GGML_ASSERT(is_allowed_device(id));
-    return device_infos[id].max_work_group_sizes;
+    return infos[id].max_work_group_sizes;
 }
 
 void ggml_sycl_device_info::update_mem() {
 
     for (int i = 0; i < GGML_SYCL_MAX_DEVICES; ++i) {
-        device_infos[i].vmm = 0;
+        infos[i].vmm = 0;
         default_tensor_split[i] = 0;
-        device_infos[i].cc =0;
+        infos[i].cc =0;
     }
 
     int64_t total_vram = 0;
 
     for (int i = 0; i < device_count; ++i) {
         int id = get_device_id(i);
-        device_infos[id].vmm = 0;
+        infos[id].vmm = 0;
         dpct::device_info prop;
         dpct::get_device_info(
             prop, dpct::dev_mgr::instance().get_device(id));
@@ -289,7 +286,7 @@ void ggml_sycl_device_info::update_mem() {
         default_tensor_split[i] = total_vram;
         total_vram += prop.get_global_mem_size();
 
-        device_infos[id].cc =
+        infos[id].cc =
             100 * prop.get_major_version() + 10 * prop.get_minor_version();
     }
 
@@ -335,7 +332,7 @@ int ggml_sycl_device_info::get_device_id(int device_index) {
 }
 
 int ggml_sycl_device_info::hw_family(int id) {
-    return device_infos[id].hw_family;
+    return infos[id].hw_family;
 }
 
 static inline bool env_existed(const char *env_name) {
diff --git a/ggml/src/ggml-sycl/sycl_device.hpp b/ggml/src/ggml-sycl/sycl_device.hpp
index f7acd5a8cc81f..900a779426054 100644
--- a/ggml/src/ggml-sycl/sycl_device.hpp
+++ b/ggml/src/ggml-sycl/sycl_device.hpp
@@ -54,7 +54,7 @@ struct ggml_sycl_device_info {
     sycl::context co_ctx;
     int m_device_filter;
 
-    sycl_device_info device_infos[GGML_SYCL_MAX_DEVICES];
+    sycl_device_info infos[GGML_SYCL_MAX_DEVICES];
     std::array<float, GGML_SYCL_MAX_DEVICES> default_tensor_split = {};
 
     ggml_sycl_device_info(int main_gpu_id);//single device mode
@@ -62,7 +62,7 @@ struct ggml_sycl_device_info {
     void init(ggml_sycl_backend_device_filter device_filter);
     void init_single_mode(int main_gpu_id);
 
-    void clear_device_infos();
+    void clear_infos();
     void print_gpu_device_list();
     int work_group_size(int device_id);
     bool is_allowed_device(int device_id);

From 4d71c98544c9150d974878867e1277a57d94ecb2 Mon Sep 17 00:00:00 2001
From: arthw <14088817+arthw@users.noreply.github.com>
Date: Thu, 1 Aug 2024 12:52:06 +0800
Subject: [PATCH 5/6] mv dpct/helper.hpp to dpct.hpp

---
 ggml/src/ggml-sycl/common.hpp                    | 2 +-
 ggml/src/ggml-sycl/{dpct/helper.hpp => dpct.hpp} | 0
 ggml/src/ggml-sycl/sycl_device.hpp               | 2 +-
 ggml/src/ggml-sycl/vecdotq.hpp                   | 2 +-
 4 files changed, 3 insertions(+), 3 deletions(-)
 rename ggml/src/ggml-sycl/{dpct/helper.hpp => dpct.hpp} (100%)

diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp
index ccdd789cc77dd..cb9514356e052 100644
--- a/ggml/src/ggml-sycl/common.hpp
+++ b/ggml/src/ggml-sycl/common.hpp
@@ -17,7 +17,7 @@
 #include <iostream>
 #include <regex>
 
-#include "dpct/helper.hpp"
+#include "dpct.hpp"
 #include "ggml-sycl.h"
 #include "presets.hpp"
 #include "sycl_hw.hpp"
diff --git a/ggml/src/ggml-sycl/dpct/helper.hpp b/ggml/src/ggml-sycl/dpct.hpp
similarity index 100%
rename from ggml/src/ggml-sycl/dpct/helper.hpp
rename to ggml/src/ggml-sycl/dpct.hpp
diff --git a/ggml/src/ggml-sycl/sycl_device.hpp b/ggml/src/ggml-sycl/sycl_device.hpp
index 900a779426054..7bffe75567c32 100644
--- a/ggml/src/ggml-sycl/sycl_device.hpp
+++ b/ggml/src/ggml-sycl/sycl_device.hpp
@@ -7,7 +7,7 @@
 
 
 #include <sycl/sycl.hpp>
-#include "dpct/helper.hpp"
+#include "dpct.hpp"
 
 #include "ggml-sycl.h"
 #include "presets.hpp"
diff --git a/ggml/src/ggml-sycl/vecdotq.hpp b/ggml/src/ggml-sycl/vecdotq.hpp
index d2dccade20bfd..a0ab8287c27b4 100644
--- a/ggml/src/ggml-sycl/vecdotq.hpp
+++ b/ggml/src/ggml-sycl/vecdotq.hpp
@@ -13,7 +13,7 @@
 #ifndef GGML_SYCL_VECDOTQ_HPP
 #define GGML_SYCL_VECDOTQ_HPP
 
-#include "dpct/helper.hpp"
+#include "dpct.hpp"
 
 typedef float (*vec_dot_q_sycl_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs);
 

From f1bc5ad852d3007864bfca5c8898554a970fc1e6 Mon Sep 17 00:00:00 2001
From: arthw <14088817+arthw@users.noreply.github.com>
Date: Thu, 1 Aug 2024 13:14:58 +0800
Subject: [PATCH 6/6] add final newline

---
 ggml/src/ggml-sycl/sycl_device.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml-sycl/sycl_device.cpp b/ggml/src/ggml-sycl/sycl_device.cpp
index e476adef0e38e..02937529c37a5 100644
--- a/ggml/src/ggml-sycl/sycl_device.cpp
+++ b/ggml/src/ggml-sycl/sycl_device.cpp
@@ -338,4 +338,4 @@ int ggml_sycl_device_info::hw_family(int id) {
 static inline bool env_existed(const char *env_name) {
      char *user_device_string = getenv(env_name);
      return user_device_string!=NULL;
-}
\ No newline at end of file
+}