Commit 87aeacf

Review: make check more explicit, add back comments, and fix formatting
1 parent b7225ec commit 87aeacf

2 files changed: 13 additions & 8 deletions

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 12 additions & 7 deletions
@@ -1769,7 +1769,7 @@ static __global__ void k_compute_batched_ptrs(
     ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3;
 }
 
-// Type traits for CUDA types
+// Type traits for mapping ggml types to CUDA/cuBLAS types
 template<ggml_type T>
 struct batched_mul_mat_traits;
 
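The renamed comment above refers to a traits template that the rest of this file queries through traits::data_type, traits::get_alpha() and traits::get_beta(). As a hedged illustration of that mapping pattern only (not the actual specializations in ggml-cuda.cu, whose members and values may differ), a minimal F32 specialization could look like this:

    // Sketch only: the member names follow how the template is used below; the values are assumptions.
    #include "ggml.h"            // ggml_type, GGML_TYPE_F32
    #include <library_types.h>   // cudaDataType_t, CUDA_R_32F

    template<ggml_type T>
    struct batched_mul_mat_traits;

    template<>
    struct batched_mul_mat_traits<GGML_TYPE_F32> {
        // CUDA/cuBLAS element type corresponding to GGML_TYPE_F32.
        static constexpr cudaDataType_t data_type = CUDA_R_32F;

        // alpha/beta are exposed as type-erased pointers so callers can forward them
        // to cuBLAS without caring about the scalar type behind them.
        static const void * get_alpha() { static const float alpha = 1.0f; return &alpha; }
        static const void * get_beta()  { static const float beta  = 0.0f; return &beta;  }
    };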
@@ -1823,6 +1823,9 @@ static void ggml_cuda_mul_mat_batched_cublas_impl(ggml_backend_cuda_context & ct
     GGML_ASSERT(src0->type == src0_type);
     GGML_ASSERT(ggml_is_contiguous(dst));
 
+    // Byte offsets and tensor dimensions are currently used in an inconsistent way for dst.
+    // As long as dst is contiguous this does not matter though.
+
     GGML_TENSOR_BINARY_OP_LOCALS
 
     const int64_t ne_dst = ggml_nelements(dst);
@@ -1874,6 +1877,8 @@ static void ggml_cuda_mul_mat_batched_cublas_impl(ggml_backend_cuda_context & ct
     cudaDataType_t cu_data_type_b = traits::data_type;
     const void * alpha = traits::get_alpha();
     const void * beta = traits::get_beta();
+    const float alpha_f32 = 1.0f;
+    const float beta_f32 = 0.0f;
 
     if (dst->op_params[0] == GGML_PREC_DEFAULT) {
         if constexpr (src0_type == GGML_TYPE_F32) {
@@ -1887,8 +1892,6 @@ static void ggml_cuda_mul_mat_batched_cublas_impl(ggml_backend_cuda_context & ct
         dst_t = (char *) dst_ddf;
         cu_compute_type = CUBLAS_COMPUTE_32F;
         cu_data_type = CUDA_R_32F;
-        const float alpha_f32 = 1.0f;
-        const float beta_f32 = 0.0f;
         alpha = &alpha_f32;
         beta = &beta_f32;
     }
@@ -1897,8 +1900,6 @@ static void ggml_cuda_mul_mat_batched_cublas_impl(ggml_backend_cuda_context & ct
     const int cc = ggml_cuda_info().devices[id].cc;
     if (GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA4(cc)) {
         cu_compute_type = CUBLAS_COMPUTE_32F;
-        const float alpha_f32 = 1.0f;
-        const float beta_f32 = 0.0f;
         alpha = &alpha_f32;
         beta = &beta_f32;
     }
@@ -2029,6 +2030,11 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
     //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
     //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
 
+    const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
+    bool can_use_batched_cublas_f16 = src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16);
+    bool can_use_batched_cublas_bf16 = src0->type == GGML_TYPE_BF16 && bf16_mma_hardware_available(cc);
+    bool can_use_batched_cublas_f32 = src0->type == GGML_TYPE_F32;
+
     if (!split && use_mul_mat_vec) {
         // the custom F16 vector kernel can be used over batched cuBLAS GEMM
         // but this is only faster for GPUs without tensor cores or with a thin src0 matrix (particularly KQV in attention)
@@ -2037,8 +2043,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
         ggml_cuda_mul_mat_vec_q(ctx, src0, src1, nullptr, dst);
     } else if (!split && use_mul_mat_q) {
         ggml_cuda_mul_mat_q(ctx, src0, src1, nullptr, dst);
-    } else if (!split && (src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16 || src0->type == GGML_TYPE_F32)
-        && (src1->type == GGML_TYPE_F16 || src1->type == GGML_TYPE_BF16 || src1->type == GGML_TYPE_F32)
+    } else if (!split && (can_use_batched_cublas_f16 || can_use_batched_cublas_bf16 || can_use_batched_cublas_f32)
         && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
         // general KQ + KQV multi-batch without FlashAttention
         ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
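Read together, the two hunks above gate the batched cuBLAS path on three named predicates plus shape checks; src1->ne[2]*src1->ne[3] > 1 means more than one matrix is being multiplied, since ne[2] and ne[3] act as batch dimensions for mul_mat. Folded into a single hypothetical helper (the helper itself does not exist in ggml-cuda.cu; split, any_gpus_with_slow_fp16, bf16_mma_hardware_available and the ggml calls are taken from the surrounding code):

    // Sketch only: the dispatch condition from the diff, expressed as one predicate.
    static bool should_use_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1,
                                          bool split, bool any_gpus_with_slow_fp16, int cc) {
        const bool f16  = src0->type == GGML_TYPE_F16  && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16);
        const bool bf16 = src0->type == GGML_TYPE_BF16 && bf16_mma_hardware_available(cc);
        const bool f32  = src0->type == GGML_TYPE_F32;

        return !split && (f16 || bf16 || f32)
            && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)
            && src1->ne[2]*src1->ne[3] > 1;   // more than one matrix in the batch
    }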

tests/test-backend-ops.cpp

Lines changed: 1 addition & 1 deletion
@@ -4332,7 +4332,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
     for (auto nr : {1,4}) {
         for (uint32_t m = 0; m < 2; ++m) {
             for (uint32_t k = 0; k < 2; ++k) {
-                for(ggml_type type: {GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_F32}){
+                for (ggml_type type: {GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_F32}) {
                     test_cases.emplace_back(new test_mul_mat(type, GGML_TYPE_F32, 1056 + m, 1, 128 + k, {bs, 1}, {nr, 1}, {0, 2, 1, 3}));
                     test_cases.emplace_back(new test_mul_mat(type, GGML_TYPE_F32, 128 + m, 1, 1056 + k, {bs, 1}, {nr, 1}, {0, 1, 2, 3}, true));
                 }
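For scale, the reformatted loop adds the same pair of test_mul_mat cases for each of the three source types; counting only what is visible in this hunk, that is 2 (nr) * 2 (m) * 2 (k) * 3 (type) = 24 iterations, i.e. 48 cases for every enclosing bs value. A standalone check of that arithmetic:

    #include <cstdio>
    #include <initializer_list>

    // Counts the cases generated by the loops above, assuming only the ranges visible
    // in the hunk (two emplace_back calls per innermost iteration).
    int main() {
        int cases = 0;
        for (int nr : {1, 4}) {
            for (int m = 0; m < 2; ++m) {
                for (int k = 0; k < 2; ++k) {
                    for (int type = 0; type < 3; ++type) {   // F16, BF16, F32
                        cases += 2;
                    }
                }
            }
        }
        std::printf("%d test_mul_mat cases per bs value\n", cases);   // prints 48
        return 0;
    }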
