diff --git a/Dockerfile.rocm b/Dockerfile.rocm
index 11f9efb2e7371..49edca537d18e 100644
--- a/Dockerfile.rocm
+++ b/Dockerfile.rocm
@@ -1,5 +1,5 @@
 # default base image
-ARG BASE_IMAGE="rocm/pytorch:rocm6.1.1_ubuntu20.04_py3.9_pytorch_release-2.1.2"
+ARG BASE_IMAGE="rocm/pytorch:rocm6.1.1_ubuntu20.04_py3.9_pytorch_staging"
 
 ARG COMMON_WORKDIR=/app
 
diff --git a/csrc/quantization/fp8/gemm_kernel.cu b/csrc/quantization/fp8/gemm_kernel.cu
index 0463cc75eac6c..558228db2c084 100644
--- a/csrc/quantization/fp8/gemm_kernel.cu
+++ b/csrc/quantization/fp8/gemm_kernel.cu
@@ -101,7 +101,7 @@ torch::Tensor fp8_gemm(torch::Tensor& a, torch::Tensor& b, torch::Tensor& scaleA
   auto d_scaleD = scaleD.data_ptr();
 
   auto handle = at::cuda::getCurrentCUDABlasLtHandle();
-  auto stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
+  auto stream = at::cuda::getCurrentCUDAStream();
 
   hipblaslt_ext::GemmPreference gemmPref;
   gemmPref.setMaxWorkspaceBytes(0);
@@ -218,7 +218,7 @@ torch::Tensor fp8_gemm_16(
   auto d_scaleB = transpose_result ? scaleA.data_ptr() : scaleB.data_ptr();
 
   auto handle = at::cuda::getCurrentCUDABlasLtHandle();
-  auto stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
+  auto stream = at::cuda::getCurrentCUDAStream();
 
   hipblaslt_ext::GemmPreference gemmPref;
   gemmPref.setMaxWorkspaceBytes(0);