From 0e45a531466fecf31d17134f3a2d9d343d2a23cf Mon Sep 17 00:00:00 2001 From: Strahinja Stamenkovic Date: Fri, 15 Nov 2024 09:58:28 -0800 Subject: [PATCH 1/2] Added maximum gridDim.y overflow heck before calling transposeNoOverlap kernel so that TransposeBigMLFloat16 test passes --- onnxruntime/core/providers/rocm/fpgeneric.cu | 19 +++++++++++++++++++ .../providers/rocm/shared_inc/fpgeneric.h | 2 +- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/onnxruntime/core/providers/rocm/fpgeneric.cu b/onnxruntime/core/providers/rocm/fpgeneric.cu index d130758bec084..3237ce57e67e6 100644 --- a/onnxruntime/core/providers/rocm/fpgeneric.cu +++ b/onnxruntime/core/providers/rocm/fpgeneric.cu @@ -53,6 +53,25 @@ __global__ void CopyVectorBFloat16(const onnxruntime::BFloat16* x, int incx, onn } // namespace +dim3 rocblasTransposeHelperDimGrid(int m, int n) { + return dim3((n + TRANS_TILE_DIM - 1) / TRANS_TILE_DIM, (m + TRANS_TILE_DIM - 1) / TRANS_TILE_DIM, 1); +} + +// rocblasTransposeHelper can only be used if it won't overflow the maxGridSize y dimension size +__host__ bool CanUse_rocblasTransposeHelper_MLFloat16(int m, int n) { + dim3 dimGrid = rocblasTransposeHelperDimGrid(m, n); + + int deviceId; + hipError_t hipError = hipGetDevice(&deviceId); + if(hipError != 0) return false; + + hipDeviceProp_t deviceProp; + hipError = hipGetDeviceProperties(&deviceProp, deviceId); + if(hipError != 0) return false; + + return dimGrid.y < deviceProp.maxGridSize[1]; +} + rocblas_status rocblasTransposeHelper(hipStream_t stream, rocblas_handle, rocblas_operation , rocblas_operation , int m, int n, const half*, const half* A, int, const half*, const half*, int, half* C, int) { if (C != A) { dim3 dimGrid((n + TRANS_TILE_DIM - 1) / TRANS_TILE_DIM, (m + TRANS_TILE_DIM - 1) / TRANS_TILE_DIM, 1); diff --git a/onnxruntime/core/providers/rocm/shared_inc/fpgeneric.h b/onnxruntime/core/providers/rocm/shared_inc/fpgeneric.h index d93f70785c093..c165158f7e461 100644 --- a/onnxruntime/core/providers/rocm/shared_inc/fpgeneric.h +++ b/onnxruntime/core/providers/rocm/shared_inc/fpgeneric.h @@ -470,7 +470,7 @@ inline rocblas_status rocblasTransposeHelper(hipStream_t /*stream*/, rocblas_han return rocblas_dgeam(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc); } -inline bool CanUse_rocblasTransposeHelper_MLFloat16(int /*m*/, int /*n*/) { return true; } // CUDA has a limited grid size of 65536, ROCm has higher limits. +bool CanUse_rocblasTransposeHelper_MLFloat16(int m, int n); rocblas_status rocblasTransposeHelper(hipStream_t stream, rocblas_handle, rocblas_operation, rocblas_operation, int m, int n, const half*, const half* A, int, const half*, const half*, int, half* C, int); // copy From a89a8da06eb533bdb3b14e93addc08589ed9ebff Mon Sep 17 00:00:00 2001 From: Strahinja Stamenkovic Date: Mon, 18 Nov 2024 02:21:34 -0800 Subject: [PATCH 2/2] Fix formatting --- onnxruntime/core/providers/rocm/fpgeneric.cu | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/onnxruntime/core/providers/rocm/fpgeneric.cu b/onnxruntime/core/providers/rocm/fpgeneric.cu index 3237ce57e67e6..2e0fd57144380 100644 --- a/onnxruntime/core/providers/rocm/fpgeneric.cu +++ b/onnxruntime/core/providers/rocm/fpgeneric.cu @@ -63,16 +63,16 @@ __host__ bool CanUse_rocblasTransposeHelper_MLFloat16(int m, int n) { int deviceId; hipError_t hipError = hipGetDevice(&deviceId); - if(hipError != 0) return false; + if (hipError != 0) return false; hipDeviceProp_t deviceProp; hipError = hipGetDeviceProperties(&deviceProp, deviceId); - if(hipError != 0) return false; + if (hipError != 0) return false; return dimGrid.y < deviceProp.maxGridSize[1]; } -rocblas_status rocblasTransposeHelper(hipStream_t stream, rocblas_handle, rocblas_operation , rocblas_operation , int m, int n, const half*, const half* A, int, const half*, const half*, int, half* C, int) { +rocblas_status rocblasTransposeHelper(hipStream_t stream, rocblas_handle, rocblas_operation, rocblas_operation, int m, int n, const half*, const half* A, int, const half*, const half*, int, half* C, int) { if (C != A) { dim3 dimGrid((n + TRANS_TILE_DIM - 1) / TRANS_TILE_DIM, (m + TRANS_TILE_DIM - 1) / TRANS_TILE_DIM, 1); dim3 dimBlock(TRANS_TILE_DIM, BLOCK_ROWS, 1); @@ -92,7 +92,7 @@ rocblas_status rocblasCopyHelper(hipStream_t stream, rocblas_handle, int n, cons } rocblas_status rocblasCopyHelper(hipStream_t stream, rocblas_handle, int n, const onnxruntime::BFloat16* x, int incx, - onnxruntime::BFloat16* y, int incy) { + onnxruntime::BFloat16* y, int incy) { dim3 dimGrid((unsigned int)(n + COPY_BLOCK_DIM - 1) / COPY_BLOCK_DIM, 1, 1); dim3 dimBlock(COPY_BLOCK_DIM, 1, 1); CopyVectorBFloat16<<>>(x, incx, y, incy, n);