
Commit 53e949c

modify save list for varlen attn (#2082)
Adds the varlen attention op to the AC save list.

**Testing:** used DebugMode() to print out the op list and verified that the forward op is not being recomputed in the backward step.

```
[rank0]:forward ops
[rank0]:varlen_attn in forward: True
...
[rank0]:varlen_attn recomputed in backward: False
[rank0]:saved correctly
```
1 parent: b39377f
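The exact import path for DebugMode varies across recent PyTorch versions, so here is a version-agnostic sketch of the same kind of check using a custom TorchDispatchMode to log which ops run in forward vs. backward; the model and op names are illustrative, not the test's actual code.

```python
import torch
from torch.utils._python_dispatch import TorchDispatchMode


class OpLogger(TorchDispatchMode):
    """Prints every dispatched op, tagged with the phase it ran in."""

    def __init__(self, tag: str):
        super().__init__()
        self.tag = tag

    def __torch_dispatch__(self, func, types, args=(), kwargs=None):
        print(f"[{self.tag}] {func}")
        return func(*args, **(kwargs or {}))


x = torch.randn(4, 4, requires_grad=True)
with OpLogger("forward"):
    y = torch.relu(x).sum()
with OpLogger("backward"):
    y.backward()
# Under activation checkpointing, an op that shows up in the forward log
# but not in the backward log was saved rather than recomputed.
```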

File tree: 5 files changed, +8 −4 lines changed


tests/integration_tests/features.py

Lines changed: 4 additions & 3 deletions
```diff
@@ -350,12 +350,13 @@ def build_features_test_list() -> list[OverrideDefinitions]:
         [
             [
                 "--parallelism.data_parallel_shard_degree=4",
-                "--activation_checkpoint.mode='full'",
+                "--activation_checkpoint.mode=selective",
+                "--activation_checkpoint.selective_ac_option=op",
                 "--model.flavor=debugmodel_varlen_attn",
             ]
         ],
-        "FSDP+VARLEN_ATTN",
-        "fsdp+varlen_attn",
+        "FSDP+VARLEN_ATTN + per op SAC",
+        "fsdp+varlen_attn+per_op_sac",
         ngpu=4,
         skip_rocm_test=True,
     ),
```

tests/unit_tests/test_activation_checkpoint.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -28,6 +28,7 @@
     # used to compute the scaling factor for quantization.
     torch.ops.aten.max.default,
     torch._higher_order_ops.flex_attention,
+    torch.ops.torch_attn._varlen_attn,
 }
 
 
```

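For context on how a save list like this is consumed: below is a minimal, self-contained sketch (not torchtitan's actual wiring; the save-list contents are illustrative) of per-op selective activation checkpointing with PyTorch's create_selective_checkpoint_contexts. Ops in the save list keep their forward outputs; everything else is recomputed during backward.

```python
from functools import partial

import torch
from torch.utils.checkpoint import (
    CheckpointPolicy,
    checkpoint,
    create_selective_checkpoint_contexts,
)

# Illustrative save list in the spirit of the sets edited in this commit.
_save_list = {
    torch.ops.aten.mm.default,
    torch.ops.aten.max.default,
}


def _policy_fn(ctx, op, *args, **kwargs):
    # Save outputs of listed ops; prefer recomputing everything else.
    if op in _save_list:
        return CheckpointPolicy.MUST_SAVE
    return CheckpointPolicy.PREFER_RECOMPUTE


context_fn = partial(create_selective_checkpoint_contexts, _policy_fn)


def block(x, w):
    return torch.relu(x @ w).max()


x = torch.randn(8, 8, requires_grad=True)
w = torch.randn(8, 8, requires_grad=True)
out = checkpoint(block, x, w, use_reentrant=False, context_fn=context_fn)
out.backward()
```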
torchtitan/experiments/simple_fsdp/llama3/parallelize.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -33,6 +33,7 @@
     # used to compute the scaling factor for quantization.
     torch.ops.aten.max.default,
     torch._higher_order_ops.flex_attention,
+    torch.ops.torch_attn._varlen_attn,
 }
 
 
```

torchtitan/models/llama3/infra/parallelize.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -44,6 +44,7 @@
     # used to compute the scaling factor for quantization.
     torch.ops.aten.max.default,
     torch._higher_order_ops.flex_attention,
+    torch.ops.torch_attn._varlen_attn.default,
 }
 
 
```

torchtitan/models/qwen3/infra/parallelize.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -46,7 +46,7 @@
     # used to compute the scaling factor for quantization.
     torch.ops.aten.max.default,
     torch._higher_order_ops.flex_attention,
-    torch.ops.torch_attn._varlen_attn,
+    torch.ops.torch_attn._varlen_attn.default,
 }
 
 
```

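One detail worth noting about the `.default` suffix used in the last two files. As a general point about the torch op registry (standard behavior, not anything specific to this repo), `torch.ops.<ns>.<op>` is an OpOverloadPacket, while `<op>.default` is the concrete OpOverload that per-op SAC policy functions are matched against:

```python
import torch

# torch.ops.aten.max names a bundle of overloads; .default picks one.
print(type(torch.ops.aten.max).__name__)          # OpOverloadPacket
print(type(torch.ops.aten.max.default).__name__)  # OpOverload
```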