From 0e45a531466fecf31d17134f3a2d9d343d2a23cf Mon Sep 17 00:00:00 2001 From: Strahinja Stamenkovic Date: Fri, 15 Nov 2024 09:58:28 -0800 Subject: [PATCH] Added maximum gridDim.y overflow heck before calling transposeNoOverlap kernel so that TransposeBigMLFloat16 test passes --- onnxruntime/core/providers/rocm/fpgeneric.cu | 19 +++++++++++++++++++ .../providers/rocm/shared_inc/fpgeneric.h | 2 +- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/onnxruntime/core/providers/rocm/fpgeneric.cu b/onnxruntime/core/providers/rocm/fpgeneric.cu index d130758bec084..3237ce57e67e6 100644 --- a/onnxruntime/core/providers/rocm/fpgeneric.cu +++ b/onnxruntime/core/providers/rocm/fpgeneric.cu @@ -53,6 +53,25 @@ __global__ void CopyVectorBFloat16(const onnxruntime::BFloat16* x, int incx, onn } // namespace +dim3 rocblasTransposeHelperDimGrid(int m, int n) { + return dim3((n + TRANS_TILE_DIM - 1) / TRANS_TILE_DIM, (m + TRANS_TILE_DIM - 1) / TRANS_TILE_DIM, 1); +} + +// rocblasTransposeHelper can only be used if it won't overflow the maxGridSize y dimension size +__host__ bool CanUse_rocblasTransposeHelper_MLFloat16(int m, int n) { + dim3 dimGrid = rocblasTransposeHelperDimGrid(m, n); + + int deviceId; + hipError_t hipError = hipGetDevice(&deviceId); + if(hipError != 0) return false; + + hipDeviceProp_t deviceProp; + hipError = hipGetDeviceProperties(&deviceProp, deviceId); + if(hipError != 0) return false; + + return dimGrid.y < deviceProp.maxGridSize[1]; +} + rocblas_status rocblasTransposeHelper(hipStream_t stream, rocblas_handle, rocblas_operation , rocblas_operation , int m, int n, const half*, const half* A, int, const half*, const half*, int, half* C, int) { if (C != A) { dim3 dimGrid((n + TRANS_TILE_DIM - 1) / TRANS_TILE_DIM, (m + TRANS_TILE_DIM - 1) / TRANS_TILE_DIM, 1); diff --git a/onnxruntime/core/providers/rocm/shared_inc/fpgeneric.h b/onnxruntime/core/providers/rocm/shared_inc/fpgeneric.h index d93f70785c093..c165158f7e461 100644 --- a/onnxruntime/core/providers/rocm/shared_inc/fpgeneric.h +++ b/onnxruntime/core/providers/rocm/shared_inc/fpgeneric.h @@ -470,7 +470,7 @@ inline rocblas_status rocblasTransposeHelper(hipStream_t /*stream*/, rocblas_han return rocblas_dgeam(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc); } -inline bool CanUse_rocblasTransposeHelper_MLFloat16(int /*m*/, int /*n*/) { return true; } // CUDA has a limited grid size of 65536, ROCm has higher limits. +bool CanUse_rocblasTransposeHelper_MLFloat16(int m, int n); rocblas_status rocblasTransposeHelper(hipStream_t stream, rocblas_handle, rocblas_operation, rocblas_operation, int m, int n, const half*, const half* A, int, const half*, const half*, int, half* C, int); // copy