diff --git a/onnxruntime/contrib_ops/cuda/quantization/matmul_nbits.cu b/onnxruntime/contrib_ops/cuda/quantization/matmul_nbits.cu index af9e87eaf225d..b73e2d7742c30 100644 --- a/onnxruntime/contrib_ops/cuda/quantization/matmul_nbits.cu +++ b/onnxruntime/contrib_ops/cuda/quantization/matmul_nbits.cu @@ -288,6 +288,7 @@ bool TryMatMul4Bits( if (n % kColsPerThreadBlock != 0 || k % 8 != 0 || m > 1) { return false; } + const int kWarpSize = GPU_WARP_SIZE_HOST; dim3 blocks((n + kColsPerThreadBlock - 1) / kColsPerThreadBlock, m); dim3 threads(kWarpSize, kColsPerThreadBlock); int blocks_per_K = (k + block_size - 1) / block_size;