sync : llama.cpp #1051

Merged on Dec 17, 2024 (19 commits; changes shown from all commits)

Commits
f8f71ea vulkan: disable spirv-opt for coopmat shaders (llama/10763) · jeffbolznv, Dec 10, 2024
259296c CUDA: rename macros to avoid conflicts with WinAPI (llama/10736) · aendk, Dec 10, 2024
8f8e2c8 vulkan: dynamic subgroup size for the remaining k quants (llama/10745) · netrunnereve, Dec 10, 2024
d9b9571 vulkan: request round-to-even for fp16 in im2col/rope_head (llama/10767) · jeffbolznv, Dec 10, 2024
4847921 ggml: load all backends from a user-provided search path (llama/10699) · giladgd, Dec 11, 2024
feec94d Vulkan: Add VK_EXT_subgroup_size_control support to ensure full subgr… · 0cc4m, Dec 12, 2024
de05eb6 Vulkan: Use improved q4_k and q5_k dequant code in dequant shaders (l… · 0cc4m, Dec 12, 2024
68c517e remove CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS (llama/10797) · slaren, Dec 12, 2024
bfa090f CUDA: faster non-contiguous concat (llama/10760) · A3shTnT, Dec 12, 2024
72e030c ggml : Fix compilation issues on ARM platform when building without f… · kkontny, Dec 13, 2024
e6cdbe9 SYCL: Reduce most of the compiler warnings (llama/10748) · qnixsynapse, Dec 13, 2024
793b3fd vulkan: small mul_mat_vec optimizations (llama/10665) · netrunnereve, Dec 13, 2024
42609f8 Fix crash caused by ggml_backend_load_all when launching on Android A… · sienaiwun, Dec 13, 2024
b36f93a Introducing experimental OpenCL backend with support for Qualcomm Adr… · lhez, Dec 13, 2024
118c41d llama : add Qwen2VL support + multimodal RoPE (llama/10361) · HimariO, Dec 14, 2024
3a02d27 rwkv6: add wkv6 support for Vulkan backend (llama/10829) · uniartisan, Dec 16, 2024
00d6f1b vulkan: bugfixes for small subgroup size systems + llvmpipe test (lla… · netrunnereve, Dec 17, 2024
6690443 ggml : update ggml_backend_cpu_device_supports_op (llama/10867) · ggerganov, Dec 17, 2024
d7ae184 sync : llama.cpp · ggerganov, Dec 17, 2024
Files changed
CMakeLists.txt (12 additions, 0 deletions)
@@ -32,6 +32,13 @@ else()
endif()
endif()

# remove the lib prefix on win32 mingw
if (WIN32)
set(CMAKE_STATIC_LIBRARY_PREFIX "")
set(CMAKE_SHARED_LIBRARY_PREFIX "")
set(CMAKE_SHARED_MODULE_PREFIX "")
endif()

option(BUILD_SHARED_LIBS "ggml: build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})
option(GGML_BACKEND_DL "ggml: build backends as dynamic libraries (requires BUILD_SHARED_LIBS)" OFF)

@@ -172,6 +179,11 @@ set (GGML_SYCL_TARGET "INTEL" CACHE STRING
set (GGML_SYCL_DEVICE_ARCH "" CACHE STRING
"ggml: sycl device architecture")

option(GGML_OPENCL "ggml: use OpenCL" OFF)
option(GGML_OPENCL_PROFILING "ggml: use OpenCL profiling (increases overhead)" OFF)
option(GGML_OPENCL_EMBED_KERNELS "ggml: embed kernels" ON)
option(GGML_OPENCL_USE_ADRENO_KERNELS "ggml: use optimized kernels for Adreno" ON)

# extra artifacts
option(GGML_BUILD_TESTS "ggml: build tests" ${GGML_STANDALONE})
option(GGML_BUILD_EXAMPLES "ggml: build examples" ${GGML_STANDALONE})
include/ggml-backend.h (1 addition, 0 deletions)
@@ -228,6 +228,7 @@ extern "C" {
GGML_API void ggml_backend_unload(ggml_backend_reg_t reg);
// Load all known backends from dynamic libraries
GGML_API void ggml_backend_load_all(void);
GGML_API void ggml_backend_load_all_from_path(const char * dir_path);

//
// Backend scheduler
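The new entry point mirrors ggml_backend_load_all but takes an explicit directory. A minimal usage sketch (not part of the diff; it assumes ggml was built with GGML_BACKEND_DL=ON so backends exist as separate shared libraries, and the path is a hypothetical example):

#include "ggml-backend.h"
#include <stdio.h>

int main(void) {
    // load every [lib]ggml-<name>-*.[so|dll] found in the given directory
    ggml_backend_load_all_from_path("/opt/myapp/backends");

    // list whatever was registered
    for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
        printf("backend: %s\n", ggml_backend_reg_name(ggml_backend_reg_get(i)));
    }
    return 0;
}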
include/ggml-opencl.h (new file, 26 additions)
@@ -0,0 +1,26 @@
#ifndef GGML_OPENCL_H
#define GGML_OPENCL_H

#include "ggml.h"
#include "ggml-backend.h"

#ifdef __cplusplus
extern "C" {
#endif

//
// backend API
//
GGML_BACKEND_API ggml_backend_t ggml_backend_opencl_init(void);
GGML_BACKEND_API bool ggml_backend_is_opencl(ggml_backend_t backend);

GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type(void);
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type(void);

GGML_BACKEND_API ggml_backend_reg_t ggml_backend_opencl_reg(void);

#ifdef __cplusplus
}
#endif

#endif // GGML_OPENCL_H
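A hedged usage sketch for the new header (not from the PR): initializing the OpenCL backend directly, assuming ggml was configured with -DGGML_OPENCL=ON. Initialization can fail at runtime (e.g., no OpenCL driver present), so the result is checked:

#include "ggml-opencl.h"
#include <stdio.h>

int main(void) {
    ggml_backend_t backend = ggml_backend_opencl_init();
    if (backend == NULL) {
        fprintf(stderr, "OpenCL backend unavailable\n");
        return 1;
    }
    printf("is_opencl: %d\n", ggml_backend_is_opencl(backend));
    // ... allocate buffers via ggml_backend_opencl_buffer_type() and run graphs ...
    ggml_backend_free(backend);
    return 0;
}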
include/ggml.h (19 additions, 1 deletion)
@@ -237,7 +237,9 @@
#define GGML_EXIT_SUCCESS 0
#define GGML_EXIT_ABORTED 1

#define GGML_ROPE_TYPE_NEOX 2
#define GGML_ROPE_TYPE_NEOX 2
#define GGML_ROPE_TYPE_MROPE 8
#define GGML_ROPE_TYPE_VISION 24

#define GGUF_MAGIC "GGUF"

@@ -1443,6 +1445,22 @@ extern "C" {
float beta_fast,
float beta_slow);

GGML_API struct ggml_tensor * ggml_rope_multi(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
struct ggml_tensor * c,
int n_dims,
int sections[4],
int mode,
int n_ctx_orig,
float freq_base,
float freq_scale,
float ext_factor,
float attn_factor,
float beta_fast,
float beta_slow);

// in-place, returns view(a)
GGML_API struct ggml_tensor * ggml_rope_ext_inplace(
struct ggml_context * ctx,
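ggml_rope_multi generalizes ggml_rope_ext for multimodal RoPE: sections[4] splits the rotary dimensions across the position axes, and the position tensor carries several position ids per token. A hedged sketch of building the op (shapes, section split, and hyperparameters below are illustrative assumptions, not values from the PR):

#include "ggml.h"
#include <stdio.h>

int main(void) {
    struct ggml_init_params params = { 16*1024*1024, NULL, false };
    struct ggml_context * ctx = ggml_init(params);

    const int n_dims = 128, n_head = 4, n_tokens = 8;

    // query tensor [n_dims, n_head, n_tokens]
    struct ggml_tensor * q = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_dims, n_head, n_tokens);
    // M-RoPE uses several position ids per token, hence 4*n_tokens entries (assumption)
    struct ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4*n_tokens);

    int sections[4] = {32, 48, 48, 0};  // per-axis rotary dims; sums to n_dims

    struct ggml_tensor * out = ggml_rope_multi(
        ctx, q, pos, /*c =*/ NULL, n_dims, sections, GGML_ROPE_TYPE_MROPE,
        /*n_ctx_orig =*/ 0, /*freq_base =*/ 10000.0f, /*freq_scale =*/ 1.0f,
        /*ext_factor =*/ 0.0f, /*attn_factor =*/ 1.0f,
        /*beta_fast =*/ 32.0f, /*beta_slow =*/ 1.0f);

    printf("out: [%lld, %lld, %lld]\n",
           (long long) out->ne[0], (long long) out->ne[1], (long long) out->ne[2]);
    ggml_free(ctx);
    return 0;
}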
scripts/sync-llama.last (1 addition, 1 deletion)
@@ -1 +1 @@
-26a8406ba9198eb6fdd8329fa717555b4f77f05f
+5437d4aaf5132c879acda0bb67f2f8f71da4c9fe
src/CMakeLists.txt (1 addition, 5 deletions)
@@ -194,11 +194,6 @@ endif()

if (WIN32)
add_compile_definitions(_CRT_SECURE_NO_WARNINGS)

if (BUILD_SHARED_LIBS)
# TODO: should not use this
set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
endif()
endif()

# ggml
@@ -313,6 +308,7 @@ ggml_add_backend(MUSA)
ggml_add_backend(RPC)
ggml_add_backend(SYCL)
ggml_add_backend(Vulkan)
ggml_add_backend(OpenCL)

foreach (target ggml-base ggml)
target_include_directories(${target} PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>)
src/ggml-backend-reg.cpp (37 additions, 14 deletions)
@@ -46,6 +46,10 @@
#include "ggml-vulkan.h"
#endif

#ifdef GGML_USE_OPENCL
#include "ggml-opencl.h"
#endif

#ifdef GGML_USE_BLAS
#include "ggml-blas.h"
#endif
@@ -146,6 +150,9 @@ struct ggml_backend_registry {
#ifdef GGML_USE_VULKAN
register_backend(ggml_backend_vk_reg());
#endif
#ifdef GGML_USE_OPENCL
register_backend(ggml_backend_opencl_reg());
#endif
#ifdef GGML_USE_CANN
register_backend(ggml_backend_cann_reg());
#endif
@@ -449,11 +456,21 @@ static std::string backend_filename_suffix() {
#endif
}

static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent) {
static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent, const char * user_search_path) {
// enumerate all the files that match [lib]ggml-name-*.[so|dll] in the search paths
// TODO: search system paths
std::vector<std::string> search_paths = { "./", get_executable_path() };
std::string file_prefix = backend_filename_prefix() + name + "-";
std::vector<std::string> search_paths;
if (user_search_path == nullptr) {
search_paths.push_back("./");
search_paths.push_back(get_executable_path());
} else {
#if defined(_WIN32)
search_paths.push_back(std::string(user_search_path) + "\\");
#else
search_paths.push_back(std::string(user_search_path) + "/");
#endif
}

int best_score = 0;
std::string best_path;
@@ -463,7 +480,8 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent)
if (!fs::exists(search_path)) {
continue;
}
for (const auto & entry : fs::directory_iterator(search_path)) {
fs::directory_iterator dir_it(search_path, fs::directory_options::skip_permission_denied);
for (const auto & entry : dir_it) {
if (entry.is_regular_file()) {
std::string filename = entry.path().filename().string();
std::string ext = entry.path().extension().string();
@@ -509,21 +527,26 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent)
}

void ggml_backend_load_all() {
ggml_backend_load_all_from_path(nullptr);
}

void ggml_backend_load_all_from_path(const char * dir_path) {
#ifdef NDEBUG
bool silent = true;
#else
bool silent = false;
#endif

ggml_backend_load_best("blas", silent);
ggml_backend_load_best("cann", silent);
ggml_backend_load_best("cuda", silent);
ggml_backend_load_best("hip", silent);
ggml_backend_load_best("kompute", silent);
ggml_backend_load_best("metal", silent);
ggml_backend_load_best("rpc", silent);
ggml_backend_load_best("sycl", silent);
ggml_backend_load_best("vulkan", silent);
ggml_backend_load_best("musa", silent);
ggml_backend_load_best("cpu", silent);
ggml_backend_load_best("blas", silent, dir_path);
ggml_backend_load_best("cann", silent, dir_path);
ggml_backend_load_best("cuda", silent, dir_path);
ggml_backend_load_best("hip", silent, dir_path);
ggml_backend_load_best("kompute", silent, dir_path);
ggml_backend_load_best("metal", silent, dir_path);
ggml_backend_load_best("rpc", silent, dir_path);
ggml_backend_load_best("sycl", silent, dir_path);
ggml_backend_load_best("vulkan", silent, dir_path);
ggml_backend_load_best("opencl", silent, dir_path);
ggml_backend_load_best("musa", silent, dir_path);
ggml_backend_load_best("cpu", silent, dir_path);
}
src/ggml-cann/ggml-cann.cpp (9 additions, 0 deletions)
@@ -1747,6 +1747,15 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
if (*ext_factor != 0) {
return false;
}

const int mode = ((const int32_t *) op->op_params)[2];
if (mode & GGML_ROPE_TYPE_MROPE) {
return false;
}
if (mode & GGML_ROPE_TYPE_VISION) {
return false;
}

return true;
}
case GGML_OP_UPSCALE: {
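Worth noting about these flag values (an observation, not text from the PR): GGML_ROPE_TYPE_VISION is 24 = 16 | GGML_ROPE_TYPE_MROPE, so its MROPE bit is already set and the first check alone would reject vision RoPE; the second check spells the intent out. A tiny C check of the bit layout:

#include <assert.h>

#define GGML_ROPE_TYPE_MROPE  8
#define GGML_ROPE_TYPE_VISION 24

int main(void) {
    assert((GGML_ROPE_TYPE_VISION & GGML_ROPE_TYPE_MROPE) != 0);   // vision implies the mrope bit
    assert((GGML_ROPE_TYPE_VISION & ~GGML_ROPE_TYPE_MROPE) == 16); // plus a bit of its own
    return 0;
}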
src/ggml-common.h (1 addition, 1 deletion)
@@ -473,7 +473,7 @@ GGML_TABLE_BEGIN(uint8_t, ksigns_iq2xs, 128)
240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255,
GGML_TABLE_END()

//#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
//#if __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A // lowest compute capability for integer intrinsics
GGML_TABLE_BEGIN(uint64_t, ksigns64, 128)
0x0000000000000000, 0xff000000000000ff, 0xff0000000000ff00, 0x000000000000ffff,
0xff00000000ff0000, 0x0000000000ff00ff, 0x0000000000ffff00, 0xff00000000ffffff,
src/ggml-cpu/amx/amx.cpp (1 addition, 1 deletion)
@@ -122,7 +122,7 @@ static const char * ggml_backend_amx_buffer_type_get_name(ggml_backend_buffer_ty
}

static ggml_backend_buffer_t ggml_backend_amx_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
void * data = aligned_alloc(TENSOR_ALIGNMENT, size);
void * data = ggml_aligned_malloc(size);
if (data == NULL) {
fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
return NULL;
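Background on this one-line change (a sketch, not the actual ggml internals): C11 aligned_alloc requires size to be a multiple of the alignment and is absent from MSVC, so a portable wrapper along the lines of ggml_aligned_malloc avoids both pitfalls. Roughly:

#include <stdlib.h>
#if defined(_MSC_VER)
#include <malloc.h>
#endif

#define TENSOR_ALIGNMENT 64  // illustrative value

static void * portable_aligned_malloc(size_t size) {
#if defined(_MSC_VER)
    return _aligned_malloc(size, TENSOR_ALIGNMENT);  // pair with _aligned_free
#else
    void * ptr = NULL;
    // posix_memalign does not require size to be a multiple of the alignment
    if (posix_memalign(&ptr, TENSOR_ALIGNMENT, size) != 0) {
        return NULL;
    }
    return ptr;  // pair with free
#endif
}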