diff --git a/csrc/layer_norm_cuda_kernel.cu b/csrc/layer_norm_cuda_kernel.cu
index a6301b2ee..c935fa67f 100644
--- a/csrc/layer_norm_cuda_kernel.cu
+++ b/csrc/layer_norm_cuda_kernel.cu
@@ -88,9 +88,9 @@ void cuWelfordMuSigma2(
     // intra-warp reductions
     for (int l = 0;  l <= 4;  ++l) {
       int srcLaneB = (threadIdx.x+(1<<l))&31;
-      U muB = WARP_SHFL(mu, srcLaneB);
-      U countB = WARP_SHFL(count, srcLaneB);
-      U sigma2B = WARP_SHFL(sigma2, srcLaneB);
+      U muB = WARP_SHFL(mu, srcLaneB, 32);
+      U countB = WARP_SHFL(count, srcLaneB, 32);
+      U sigma2B = WARP_SHFL(sigma2, srcLaneB, 32);
       cuChanOnlineSum<U>(muB,sigma2B,countB,mu,sigma2,count);
     }
     // threadIdx.x == 0 has correct values for each warp
@@ -126,8 +126,8 @@ void cuWelfordMuSigma2(
       sigma2 = ubuf[1]/U(n2);
       // don't care about final value of count, we know count == n2
     } else {
-      mu = WARP_SHFL(mu, 0);
-      sigma2 = WARP_SHFL(sigma2/U(n2), 0);
+      mu = WARP_SHFL(mu, 0, 32);
+      sigma2 = WARP_SHFL(sigma2/U(n2), 0, 32);
     }
   }
 }
@@ -183,9 +183,9 @@ void cuWelfordMuSigma2(
     // intra-warp reductions
     for (int l = 0;  l <= 4;  ++l) {
       int srcLaneB = (threadIdx.x+(1<<l))&31;
-      float muB = WARP_SHFL(mu, srcLaneB);
-      float countB = WARP_SHFL(count, srcLaneB);
-      float sigma2B = WARP_SHFL(sigma2, srcLaneB);
+      float muB = WARP_SHFL(mu, srcLaneB, 32);
+      float countB = WARP_SHFL(count, srcLaneB, 32);
+      float sigma2B = WARP_SHFL(sigma2, srcLaneB, 32);
       cuChanOnlineSum(muB,sigma2B,countB,mu,sigma2,count);
     }
     // threadIdx.x == 0 has correct values for each warp
@@ -221,8 +221,8 @@ void cuWelfordMuSigma2(
       sigma2 = ubuf[1]/float(n2);
       // don't care about final value of count, we know count == n2
     } else {
-      mu = WARP_SHFL(mu, 0);
-      sigma2 = WARP_SHFL(sigma2/float(n2), 0);
+      mu = WARP_SHFL(mu, 0, 32);
+      sigma2 = WARP_SHFL(sigma2/float(n2), 0, 32);
     }
   }
 }
@@ -581,8 +581,8 @@ void cuComputeGradInput(
     }
     // intra-warp reductions
     for (int mask = blockDim.x/2;  mask > 0;  mask /= 2) {
-      sum_loss1 += WARP_SHFL_XOR(sum_loss1, mask);
-      sum_loss2 += WARP_SHFL_XOR(sum_loss2, mask);
+      sum_loss1 += WARP_SHFL_XOR(sum_loss1, mask, 32);
+      sum_loss2 += WARP_SHFL_XOR(sum_loss2, mask, 32);
     }
     // inter-warp reductions
     if (blockDim.y > 1) {
diff --git a/tests/L0/run_test.py b/tests/L0/run_test.py
index 2678c1902..7299cf6ef 100644
--- a/tests/L0/run_test.py
+++ b/tests/L0/run_test.py
@@ -6,7 +6,6 @@
 test_dirs = ["run_amp", "run_fp16util", "run_optimizers", "run_fused_layer_norm", "run_pyprof_nvtx", "run_pyprof_data", "run_mlp"]
 
 ROCM_BLACKLIST = [
-    'run_fused_layer_norm',
     'run_pyprof_nvtx',
     'run_pyprof_data',
 ]