Use reshape instead of view and update PyTorch autocast API
Signed-off-by: eljandoubi <[email protected]>
eljandoubi committed Oct 18, 2024
1 parent d0b2c57 commit 5cb5e2e
Showing 8 changed files with 245 additions and 246 deletions.
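
The commit makes two kinds of mechanical updates across the PyTorch tests: swapping `Tensor.view` for `Tensor.reshape` where the input's contiguity is not guaranteed, and migrating from the deprecated `torch.cuda.amp` entry points to the device-agnostic `torch.amp` API. A minimal sketch of the autocast/GradScaler rename, grounded in the hunks below with everything else omitted:

import torch

# Deprecated spellings (what the tests used before this commit):
#   scaler = torch.cuda.amp.GradScaler(enabled=True)
#   with torch.cuda.amp.autocast(enabled=True):
#       ...

# Current spellings (what the tests use after this commit):
scaler = torch.amp.GradScaler("cuda", enabled=True)
with torch.amp.autocast(device_type="cuda", enabled=True):
    pass  # forward pass goes here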
10 changes: 0 additions & 10 deletions qa/L0_pytorch_distributed_unittest/test.sh

This file was deleted.

3 changes: 2 additions & 1 deletion tests/pytorch/fused_attn/run_fused_attn_with_cp.py
@@ -222,7 +222,8 @@ def run_dpa_with_cp(
seq_idx = torch.tensor([rank, 2 * world_size - rank - 1], device=q_.device)
q_, k_, v_, dout_ = [x.index_select(seq_dim, seq_idx) for x in [q_, k_, v_, dout_]]
q_, k_, v_, dout_ = [
x.reshape(*x.shape[:seq_dim], -1, *x.shape[(seq_dim + 2) :]) for x in [q_, k_, v_, dout_]
x.reshape(*x.shape[:seq_dim], -1, *x.shape[(seq_dim + 2) :])
for x in [q_, k_, v_, dout_]
]
elif qkv_format == "thd":
seq_idx_q = tex.thd_get_partitioned_indices(
4 changes: 3 additions & 1 deletion tests/pytorch/fused_attn/test_fused_attn.py
@@ -1845,7 +1845,9 @@ def _run_ref_mha_f16(dtype, config, backend):
cu_seqlens = torch.zeros(config.batch_size + 1, device="cuda", dtype=torch.int32)
cu_seqlens[1:] = torch.cumsum(seqlens, dim=0)
out_grad = (
torch.load("out_grad.pt").to(device="cuda").reshape(config.batch_size, config.max_seqlen_q, -1)
torch.load("out_grad.pt")
.to(device="cuda")
.reshape(config.batch_size, config.max_seqlen_q, -1)
)

_DUMMY_CUDA_RNG_STATE_TRACKER = CudaRNGStatesTracker()
24 changes: 12 additions & 12 deletions tests/pytorch/test_fused_optimizer.py
@@ -323,8 +323,8 @@ def setup_method(self, *, seed: int = 0) -> None:
def test_grad_scaler(self):
params_ = [p for p in self.model_.parameters() if p.requires_grad]
optimizer_ = te.optimizers.FusedAdam(params_, lr=self.lr, capturable=False)
scaler = torch.cuda.amp.GradScaler(enabled=True)
scaler_ = torch.cuda.amp.GradScaler(enabled=True)
scaler = torch.amp.GradScaler("cuda", enabled=True)
scaler_ = torch.amp.GradScaler("cuda", enabled=True)

for i in range(100):
x = torch.rand([32, 1, 28, 28]).cuda().to(memory_format=torch.channels_last)
@@ -333,7 +333,7 @@ def test_grad_scaler(self):
gt_ = gt.clone()

# Reference
with torch.cuda.amp.autocast(enabled=True):
with torch.amp.autocast(device_type="cuda", enabled=True):
y = self.model(x)
loss = ((gt - y) ** 2).mean()

@@ -342,7 +342,7 @@ def test_grad_scaler(self):
scaler.update()

# DUT
with torch.cuda.amp.autocast(enabled=True):
with torch.amp.autocast(device_type="cuda", enabled=True):
y = self.model_(x)
loss_ = ((gt_ - y) ** 2).mean()

@@ -374,8 +374,8 @@ def test_grad_scaler(self):
def test_grad_scaler_capturable(self):
params_ = [p for p in self.model_.parameters() if p.requires_grad]
optimizer_ = te.optimizers.FusedAdam(params_, lr=self.lr, capturable=True)
scaler = torch.cuda.amp.GradScaler(enabled=True)
scaler_ = torch.cuda.amp.GradScaler(enabled=True)
scaler = torch.amp.GradScaler("cuda", enabled=True)
scaler_ = torch.amp.GradScaler("cuda", enabled=True)

for i in range(100):
x = torch.rand([32, 1, 28, 28]).cuda().to(memory_format=torch.channels_last)
@@ -384,7 +384,7 @@ def test_grad_scaler_capturable(self):
gt_ = gt.clone()

# Reference
with torch.cuda.amp.autocast(enabled=True):
with torch.amp.autocast(device_type="cuda", enabled=True):
y = self.model(x)
loss = ((gt - y) ** 2).mean()

@@ -393,7 +393,7 @@ def test_grad_scaler_capturable(self):
scaler.update()

# DUT
with torch.cuda.amp.autocast(enabled=True):
with torch.amp.autocast(device_type="cuda", enabled=True):
y = self.model_(x)
loss_ = ((gt_ - y) ** 2).mean()

@@ -432,8 +432,8 @@ def test_grad_scaler_capturable_master(self):
optimizer_ = te.optimizers.FusedAdam(
params_, lr=self.lr, capturable=True, master_weights=master_weights
)
scaler = torch.cuda.amp.GradScaler(enabled=True)
scaler_ = torch.cuda.amp.GradScaler(enabled=True)
scaler = torch.amp.GradScaler("cuda", enabled=True)
scaler_ = torch.amp.GradScaler("cuda", enabled=True)

for i in range(100):
x = torch.rand([32, 1, 28, 28]).cuda().to(memory_format=torch.channels_last)
@@ -442,7 +442,7 @@ def test_grad_scaler_capturable_master(self):
gt_ = gt.clone()

# Reference
with torch.cuda.amp.autocast(enabled=True):
with torch.amp.autocast(device_type="cuda", enabled=True):
y = self.model(x)
loss = ((gt - y) ** 2).mean()

@@ -451,7 +451,7 @@ def test_grad_scaler_capturable_master(self):
scaler.update()

# DUT
with torch.cuda.amp.autocast(enabled=True):
with torch.amp.autocast(device_type="cuda", enabled=True):
y = self.model_(x)
loss_ = ((gt_ - y) ** 2).mean()

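
For reference, the scaled training step these optimizer tests exercise follows the standard GradScaler pattern under the updated autocast API. The sketch below uses a placeholder model and target, and assumes the `transformer_engine.pytorch` import path for `FusedAdam` as in the tests:

import torch
import transformer_engine.pytorch as te

model = torch.nn.Conv2d(1, 8, 3).cuda()                      # placeholder model
optimizer = te.optimizers.FusedAdam(model.parameters(), lr=1e-3, capturable=False)
scaler = torch.amp.GradScaler("cuda", enabled=True)

x = torch.rand([32, 1, 28, 28]).cuda().to(memory_format=torch.channels_last)
gt = torch.rand([32, 8, 26, 26]).cuda()                      # placeholder target

with torch.amp.autocast(device_type="cuda", enabled=True):
    y = model(x)
    loss = ((gt - y) ** 2).mean()

scaler.scale(loss).backward()   # scale the loss, backprop scaled gradients
scaler.step(optimizer)          # unscale gradients, skip the step on inf/nan
scaler.update()                 # adjust the scale factor for the next step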
4 changes: 2 additions & 2 deletions tests/pytorch/test_numerics.py
@@ -214,7 +214,7 @@ def forward(
)

# change view to [b, np, sq, sk]
attention_scores = matmul_result.reshape(*output_size)
attention_scores = matmul_result.view(*output_size)

# attention scores and attention mask [b, np, sq, sk]
attention_probs = self.scale_mask_softmax(attention_scores, attention_mask)
@@ -233,7 +233,7 @@ def forward(
value_layer = value_layer.reshape(value_layer.size(0), output_size[0] * output_size[1], -1)

# change view [b * np, sq, sk]
attention_probs = attention_probs.reshape(output_size[0] * output_size[1], output_size[2], -1)
attention_probs = attention_probs.view(output_size[0] * output_size[1], output_size[2], -1)

# matmul: [b * np, sq, hn]
context_layer = torch.bmm(attention_probs, value_layer.transpose(0, 1))
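
The `view`/`reshape` swaps come down to contiguity: `Tensor.view` only reinterprets the existing storage and raises an error when the requested shape is incompatible with the tensor's strides, while `Tensor.reshape` returns a view when it can and silently falls back to a copy otherwise. A small illustration (the shapes are arbitrary, not taken from the tests):

import torch

x = torch.arange(24).reshape(2, 3, 4)
t = x.transpose(0, 1)        # shape (3, 2, 4), non-contiguous strides

# t.view(3, 8) raises RuntimeError: the new shape is not compatible with
# the tensor's size and stride, so no zero-copy view exists.
y = t.reshape(3, 8)          # reshape detects this and copies instead
assert y.is_contiguous()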
