Resize scheduler vectorization failure #3855

naoyam · 2025-02-08T07:47:17Z

This fusion is incorrectly vectorized by 4:

// Repro for the resize scheduler vectorization failure reported in this issue.
//
// Fusion: sin -> reshape {128, 16} -> {128, 8, 2} -> slice of the middle
// dimension to [0, 4) -> cos. The scheduler vectorizes this fusion by 4, which
// is reported as incorrect: it only vectorizes the innermost dimension, and
// that dimension has extent 2 after the reshape, so 2 is the maximum factor.
//
// The outer extent is 128 so that the repro agrees with the kernel IR dumped
// for it (T0_g_float[iS0{128}, iS1{16}], slice range {0, 128, 1}).
TEST_F(ResizeTest, VectorizationWithNonCancellableReshape) {
  auto fusion_ptr = std::make_unique<Fusion>();
  auto& fusion = *fusion_ptr;
  FusionGuard fg(fusion_ptr.get());

  auto tv0 = makeContigConcreteTensor({128, 16});
  fusion.addInput(tv0);

  auto tv1 = sin(tv0);
  // Split the inner extent 16 into {8, 2}; the innermost extent becomes 2.
  auto tv2 = reshape(tv1, {128, 16}, {128, 8, 2});
  // Keep the outer and innermost dimensions whole and take only the first 4
  // of the 8 entries of the middle dimension.
  auto tv3 = slice(tv2,
                   {{IrBuilder::create<Val>(0L), IrBuilder::create<Val>(128L)},
                    {IrBuilder::create<Val>(0L), IrBuilder::create<Val>(4L)},
                    {IrBuilder::create<Val>(0L), IrBuilder::create<Val>(2L)}});
  auto tv4 = cos(tv3);
  fusion.addOutput(tv4);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  auto t0 = at::randn({128, 16}, options);
  std::vector<c10::IValue> inputs({t0});

  FusionExecutorCache executor_cache(std::move(fusion_ptr));
  auto outputs = executor_cache.runFusionWithInputs(inputs);
  testValidate(executor_cache.fusion(), outputs, inputs, __LINE__, __FILE__);
}

Inputs:
  T0_g_float[iS0{128}, iS1{16}]
Outputs:
  T4_g_float[iblockIdx.x34{4}, ithreadIdx.x35{128}, iV31{4}] ca_pos( 2 ) produce_pos( 2 )

%kernel_math {
T5_l_float[iblockIdx.x36{4}, ithreadIdx.x37{128}, iV38{4}] ca_pos( 2 )
   = Set( T0_g_float[iS0{128}, iS1{16}], cache_op=Streaming )
T1_l_float[iblockIdx.x42{4}, ithreadIdx.x43{128}, iS44{4}] ca_pos( 3 ) produce_pos( 2 )
   = sinf(T5_l_float[iblockIdx.x36{4}, ithreadIdx.x37{128}, iV38{4}] ca_pos( 2 ));
T2_l_float[iblockIdx.x48{4}, ithreadIdx.x49{128}, iS50{4}] ca_pos( 3 ) produce_pos( 3 )
   = view( T1_l_float[iblockIdx.x42{4}, ithreadIdx.x43{128}, iS44{4}] ca_pos( 3 ) produce_pos( 2 ) )
T3_l_float[iblockIdx.x54{4}, ithreadIdx.x55{128}, iS56{4}] ca_pos( 3 ) produce_pos( 3 )
   = slice( T2_l_float[iblockIdx.x48{4}, ithreadIdx.x49{128}, iS50{4}] ca_pos( 3 ) produce_pos( 3 ), { {0, 128, 1} {0, 4, 1} {0, 2, 1} } )
T6_l_float[iblockIdx.x60{4}, ithreadIdx.x61{128}, iS62{4}] ca_pos( 2 ) produce_pos( 3 )
   = cosf(T3_l_float[iblockIdx.x54{4}, ithreadIdx.x55{128}, iS56{4}] ca_pos( 3 ) produce_pos( 3 ));
T4_g_float[iblockIdx.x34{4}, ithreadIdx.x35{128}, iV31{4}] ca_pos( 2 ) produce_pos( 2 )
   = Set( T6_l_float[iblockIdx.x60{4}, ithreadIdx.x61{128}, iS62{4}] ca_pos( 2 ) produce_pos( 3 ), cache_op=Streaming )
} // %kernel_math

The resize scheduler incorrectly vectorizes this by 4. It's technically possible to vectorize by 4, but since the scheduler currently only vectorizes the innermost dimension, which has extent 2 after the reshape, 2 is the maximum factor.

The text was updated successfully, but these errors were encountered:

naoyam self-assigned this Feb 8, 2025

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Resize scheduler vectorization failure #3855

Resize scheduler vectorization failure #3855

naoyam commented Feb 8, 2025

Resize scheduler vectorization failure #3855

Resize scheduler vectorization failure #3855

Comments

naoyam commented Feb 8, 2025