sync : llama.cpp #1060

Merged: 22 commits, Jan 3, 2025

Commits
089373d
tests: add tests for GGUF (llama/10830)
JohannesGaessler Dec 17, 2024
1c45f7d
tts : add OuteTTS support (llama/10784)
ggerganov Dec 18, 2024
9937f58
ggml : fix arm build (llama/10890)
slaren Dec 18, 2024
66d5a45
ggml: fix arm build with gcc (llama/10895)
angt Dec 19, 2024
88ac055
ggml : add test for SVE and disable when it fails (llama/10906)
slaren Dec 20, 2024
5a7354f
SYCL: Migrate away from deprecated ggml_tensor->backend (llama/10840)
qnixsynapse Dec 20, 2024
ee74a78
ggml-cpu: replace NEON asm with intrinsics in ggml_gemv_q4_0_4x8_q8_0…
angt Dec 20, 2024
8b18e52
vulkan: optimize coopmat2 dequant functions (llama/10855)
jeffbolznv Dec 21, 2024
02db05d
vulkan: build fixes for 32b (llama/10927)
jeffbolznv Dec 22, 2024
f77c213
ggml : fix run-time on FreeBSD in get_executable_path() (llama/10948)
Dec 23, 2024
2f8aea9
ggml : fix const usage in SSE path (llama/10962)
slaren Dec 23, 2024
250b245
ggml : fix arm enabled features check (llama/10961)
slaren Dec 24, 2024
7f292ed
ggml : use wstring for backend search paths (llama/10960)
slaren Dec 24, 2024
636a15a
ggml : more perfo with llamafile tinyblas on x86_64 (llama/10714)
Djip007 Dec 24, 2024
d71439a
examples, ggml : fix GCC compiler warnings (llama/10983)
peter277 Dec 26, 2024
e659119
vulkan: multi-row k quants (llama/10846)
netrunnereve Dec 26, 2024
92d38ff
vulkan: Use push constant offset to handle misaligned descriptors (ll…
jeffbolznv Dec 29, 2024
3bcc231
vulkan: im2col and matmul optimizations for stable diffusion (llama/1…
jeffbolznv Dec 29, 2024
97d1ca9
vulkan: optimize mul_mat for small values of N (llama/10991)
jeffbolznv Dec 30, 2024
43c2a5c
ggml : fixes for AVXVNNI instruction set with MSVC and Clang (llama/1…
Srihari-mcw Dec 31, 2024
0937be4
metal : avoid uint (llama/11019)
ggerganov Jan 3, 2025
add9f12
sync : llama.cpp
ggerganov Jan 3, 2025

vulkan: optimize coopmat2 dequant functions (llama/10855)
Change the code to do 16b loads when possible and extract the appropriate
component late, so the code is effectively decoding a pair of elements and
then selecting one. This can allow more commoning to happen in the compiler
when neighboring elements are loaded.
jeffbolznv authored and ggerganov committed Jan 3, 2025

commit 8b18e52cf179f7ee1faff2c0d0a80ab3df7a1f64
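
The commit message above describes the pattern: do one 16-bit load, shift and mask both packed values at once, and only pick out the byte that is actually needed at the end, which leaves the compiler with neighboring-element work it can common up. As a rough standalone illustration of that idea (plain C rather than the GLSL shader, with a made-up block layout and function names, not anything taken from ggml), the sketch below decodes a Q4_0-style 4-bit value both the old byte-at-a-time way and the new wide-load/late-select way and checks that they agree:

```c
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical Q4_0-style block: 32 weights packed as 16 bytes of nibbles.
 * Weight i < 16 sits in the low nibble of qs[i]; weight i >= 16 sits in the
 * high nibble of qs[i - 16]. The fp16 scale is left out for brevity. */
typedef struct {
    uint8_t qs[16];
} blk_q4;

/* Byte-at-a-time decode: load 8 bits, extract the one nibble we need. */
static int decode_narrow(const blk_q4 *b, unsigned idx) {
    unsigned shift = (idx & 0x10) >> 2;     /* 0 for idx < 16, 4 otherwise */
    unsigned q = b->qs[idx & 0x0F];         /* single byte load */
    return (int)((q >> shift) & 0xF) - 8;
}

/* 16-bit load with late select, mirroring the shader change: load a byte
 * pair, shift and mask BOTH nibbles at once (0x0F0F), and only then pick
 * the byte for this element. Neighboring elements share the same load and
 * shift/mask, which the compiler can common up. */
static int decode_wide(const blk_q4 *b, unsigned idx) {
    unsigned shift = (idx & 0x10) >> 2;     /* 0 for idx < 16, 4 otherwise */
    uint16_t pair;
    memcpy(&pair, &b->qs[idx & 0x0E], sizeof pair);     /* little-endian assumed */
    unsigned both = ((unsigned)pair >> shift) & 0x0F0F; /* decode two elements */
    unsigned q = (both >> ((idx & 1) * 8)) & 0xFF;      /* select one of them late */
    return (int)q - 8;
}

int main(void) {
    blk_q4 b;
    for (int i = 0; i < 16; i++) {
        b.qs[i] = (uint8_t)(i | ((15 - i) << 4));
    }
    for (unsigned idx = 0; idx < 32; idx++) {
        if (decode_narrow(&b, idx) != decode_wide(&b, idx)) {
            printf("mismatch at idx %u\n", idx);
            return 1;
        }
    }
    printf("narrow and wide decodes agree for all 32 elements\n");
    return 0;
}
```

The same reshaping is visible in the first hunk of the diff below; the shader version relies on unpack8 for the final byte select instead of the shift used here.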
src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp (45 additions, 25 deletions)
@@ -10,9 +10,10 @@ float16_t dequantFuncQ4_0(const in decodeBufQ4_0 bl, const in uint blockCoords[2
     const float16_t d = bl.block.d;
     const uint idx = coordInBlock[1];
     const uint shift = (idx & 0x10) >> 2;
-    uint32_t qs = unpack8(uint32_t(bl.block.qs[(idx & 0xE) >> 1]))[idx & 1];
+    uint32_t qs = uint32_t(bl.block.qs[(idx & 0xE) >> 1]);
     qs >>= shift;
-    qs &= 0xF;
+    qs &= 0x0F0F;
+    qs = unpack8(qs)[idx & 1];
     float16_t ret = (float16_t(qs) - float16_t(8)) * d;
     return ret;
 }
@@ -152,15 +153,17 @@ layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ4
     block_q4_K block;
 };

+layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ4_K_packed16 {
+    block_q4_K_packed16 block;
+};
+
 float16_t dequantFuncQ4_K(const in decodeBufQ4_K bl, const in uint blockCoords[2], const in uint coordInBlock[2])
 {
+    decodeBufQ4_K_packed16 bl16 = decodeBufQ4_K_packed16(bl);
     const uint idx = coordInBlock[1];
-    const uint iqs = idx;

-    const uint n = iqs / 64; // 0,1,2,3
-    const uint b = (iqs % 64) / 32; // 0,1
-    const uint qsi = n * 32 + (iqs % 32); // 0..127
+    const uint b = (idx & 0x20) >> 5; // 0,1
+    const uint is = (idx & 0xE0) >> 5; // 0..7

     const f16vec2 loadd = bl.block.d;

@@ -184,9 +187,11 @@ float16_t dequantFuncQ4_K(const in decodeBufQ4_K bl, const in uint blockCoords[2
     const float16_t d = loadd.x * float16_t(sc);
     const float16_t m = loadd.y * float16_t(mbyte);

-    uint32_t dmask = 0xF << (b * 4);
+    uint qs = uint32_t(bl16.block.qs[((idx & 0xC0) >> 2) + ((idx & 0x1E) >> 1)]);
+    qs = (qs >> (b * 4)) & 0x0F0F;
+    qs = unpack8(qs)[idx & 1];

-    float16_t ret = d * float16_t((bl.block.qs[qsi ] & dmask) >> (b * 4)) - m;
+    float16_t ret = d * float16_t(qs) - m;

     return ret;
 }
@@ -195,18 +200,19 @@ layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ5
     block_q5_K block;
 };

+layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ5_K_packed16 {
+    block_q5_K_packed16 block;
+};
+
 float16_t dequantFuncQ5_K(const in decodeBufQ5_K bl, const in uint blockCoords[2], const in uint coordInBlock[2])
 {
+    decodeBufQ5_K_packed16 bl16 = decodeBufQ5_K_packed16(bl);
     const uint idx = coordInBlock[1];
-    const uint iqs = idx;

-    const uint n = iqs / 64; // 0,1,2,3
-    const uint b = (iqs % 64) / 32; // 0,1
-    const uint qsi = n * 32 + (iqs % 32); // 0..127
-    const uint qhi = (iqs % 32); // 0..31
+    const uint b = (idx & 0x20) >> 5; // 0,1
+    const uint is = (idx & 0xE0) >> 5; // 0..7

-    const uint8_t hm = uint8_t(1 << (iqs / 32));
+    const uint32_t hm = 0x0101 << is;

     const f16vec2 loadd = bl.block.d;

@@ -230,9 +236,15 @@ float16_t dequantFuncQ5_K(const in decodeBufQ5_K bl, const in uint blockCoords[2
     const float16_t d = loadd.x * float16_t(sc);
     const float16_t m = loadd.y * float16_t(mbyte);

-    uint32_t dmask = 0xF << (b * 4);
+    uint qh = uint32_t(bl16.block.qh[(idx & 0x1E) >> 1]);
+    qh = qh & hm;
+    qh = unpack8(qh)[idx & 1];

-    float16_t ret = d * (float16_t((bl.block.qs[qsi ] & dmask) >> (b * 4)) + float16_t((bl.block.qh[qhi ] & hm) != 0 ? 16 : 0)) - m;
+    uint qs = uint32_t(bl16.block.qs[((idx & 0xC0) >> 2) + ((idx & 0x1E) >> 1)]);
+    qs = (qs >> (b * 4)) & 0x0F0F;
+    qs = unpack8(qs)[idx & 1];
+
+    float16_t ret = d * (float16_t(qs) + (qh != 0 ? float16_t(16) : float16_t(0))) - m;

     return ret;
 }
@@ -241,22 +253,30 @@ layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ6_
     block_q6_K block;
 };

+layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ6_K_packed16 {
+    block_q6_K_packed16 block;
+};
+
 float16_t dequantFuncQ6_K(const in decodeBufQ6_K bl, const in uint blockCoords[2], const in uint coordInBlock[2])
 {
+    decodeBufQ6_K_packed16 bl16 = decodeBufQ6_K_packed16(bl);
     const uint idx = coordInBlock[1];
-    const uint iqs = idx;

-    const uint n = iqs / 128; // 0,1
-    const uint b = (iqs % 128) / 64; // 0,1
-    const uint is_b = (iqs % 32) / 16; // 0,1
-    const uint qhshift = ((iqs % 128) / 32) * 2;// 0,2,4,6
-    const uint is = 8 * n + qhshift + is_b; // 0..15
-    const uint qsi = n * 64 + (iqs % 64); // 0..127
-    const uint qhi = n * 32 + (iqs % 32); // 0..63
+    const uint b = (idx & 0x40) >> 6; // 0,1
+    const uint qhshift = (idx & 0x60) >> 4; // 0,2,4,6
+    const uint is = (idx & 0xF0) >> 4; // 0..15

     const float16_t dscale = bl.block.d * float16_t(bl.block.scales[is]);

-    float16_t ret = dscale * float16_t(int8_t(((bl.block.ql[qsi ] >> (b * 4)) & 0xF) | (((bl.block.qh[qhi ] >> qhshift) & 3) << 4)) - 32);
+    uint ql = uint32_t(bl16.block.ql[((idx & 0x80) >> 2) + ((idx & 0x3E) >> 1)]);
+    ql = (ql >> (b * 4)) & 0x0F0F;
+
+    uint qh = uint32_t(bl16.block.qh[((idx & 0x80) >> 3) + ((idx & 0x1E) >> 1)]);
+    qh = ((qh >> qhshift) & 0x0303) << 4;
+
+    int q = unpack8(ql | qh)[idx & 1];
+
+    float16_t ret = dscale * float16_t(q - 32);

     return ret;
 }
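
For the Q6_K function in the last hunk, the element being decoded is 6 bits wide: 4 low bits come from the ql plane and 2 high bits from the qh plane, and the result is re-centered by subtracting 32. A minimal C sketch of just that bit assembly, with invented names and the per-group scale left out, matching the expression the shader builds before its single late unpack8:

```c
#include <stdint.h>
#include <stdio.h>

/* Assemble one Q6_K-style 6-bit value from its two planes, as the shader's
 * final expression does: 4 bits from the low-nibble plane (ql), 2 bits from
 * the high-bit plane (qh), then re-center by subtracting 32. */
static int q6_value(uint8_t ql_byte, uint8_t qh_byte,
                    unsigned b,        /* 0 or 1: which nibble of ql_byte    */
                    unsigned qhshift)  /* 0,2,4,6: which bit pair of qh_byte */
{
    unsigned lo = (ql_byte >> (b * 4)) & 0xF;  /* bits 0..3 of the value */
    unsigned hi = (qh_byte >> qhshift) & 0x3;  /* bits 4..5 of the value */
    return (int)(lo | (hi << 4)) - 32;         /* signed range -32..31   */
}

int main(void) {
    /* Example: low nibble 0xA, high bit pair 0b10 -> value 0x2A - 32 = 10. */
    printf("%d\n", q6_value(0x3A, 0x20, 0, 4));
    return 0;
}
```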