File tree: 4 files changed, +7 −3 lines (experiments/simple_fsdp/llama3)

@@ -350,12 +350,13 @@ def build_features_test_list() -> list[OverrideDefinitions]:
         [
             [
                 "--parallelism.data_parallel_shard_degree=4",
-                "--activation_checkpoint.mode='full'",
+                "--activation_checkpoint.mode=selective",
+                "--activation_checkpoint.selective_ac_option=op",
                 "--model.flavor=debugmodel_varlen_attn",
             ]
         ],
-        "FSDP+VARLEN_ATTN",
-        "fsdp+varlen_attn",
+        "FSDP+VARLEN_ATTN + per op SAC",
+        "fsdp+varlen_attn+per_op_sac",
         ngpu=4,
         skip_rocm_test=True,
     ),
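The switch from full AC to per-op selective activation checkpointing (SAC) means only the outputs of ops on a save list are kept during the forward pass, and everything else is recomputed during backward, so this test now exercises varlen attention under that recompute path. A minimal, self-contained sketch of a per-op SAC policy using PyTorch's public `torch.utils.checkpoint` API (the `_save_list` contents, `_policy`, and `block` names here are illustrative, not taken from this PR):

from functools import partial

import torch
from torch.utils.checkpoint import (
    CheckpointPolicy,
    checkpoint,
    create_selective_checkpoint_contexts,
)

# Ops whose outputs are saved in forward; everything else is recomputed
# in backward. Illustrative contents, mirroring the save-list idea below.
_save_list = {
    torch.ops.aten.mm.default,
    torch.ops.aten.max.default,
}

def _policy(ctx, op, *args, **kwargs):
    # MUST_SAVE: keep this op's output; PREFER_RECOMPUTE: drop it and
    # redo the op during backward.
    if op in _save_list:
        return CheckpointPolicy.MUST_SAVE
    return CheckpointPolicy.PREFER_RECOMPUTE

context_fn = partial(create_selective_checkpoint_contexts, _policy)

def block(x):
    return torch.relu(x @ x.t()).sum()

x = torch.randn(8, 8, requires_grad=True)
# Per-op SAC: the mm output is stored; relu is recomputed in backward.
out = checkpoint(block, x, use_reentrant=False, context_fn=context_fn)
out.backward()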
The remaining three files receive the identical addition to their per-op SAC save lists:

@@ -25,4 +25,5 @@
     # used to compute the scaling factor for quantization.
     torch.ops.aten.max.default,
     torch._higher_order_ops.flex_attention,
+    torch.ops.torch_attn._varlen_attn,
 }
@@ -30,4 +30,5 @@
     # used to compute the scaling factor for quantization.
     torch.ops.aten.max.default,
     torch._higher_order_ops.flex_attention,
+    torch.ops.torch_attn._varlen_attn,
 }
@@ -41,4 +41,5 @@
     # used to compute the scaling factor for quantization.
     torch.ops.aten.max.default,
     torch._higher_order_ops.flex_attention,
+    torch.ops.torch_attn._varlen_attn,
 }
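All three hunks register the varlen attention op next to `flex_attention` in each file's save list, so that under per-op SAC its output is saved in forward rather than recomputed in backward, which is what the updated integration test exercises. A hedged sketch of how such a save list is typically attached to a module, torchtitan-style (assuming a PyTorch build where `torch.ops.torch_attn._varlen_attn` is registered; `apply_sac` is an illustrative name, and forwarding `context_fn` through `checkpoint_wrapper` relies on its extra kwargs being passed to the non-reentrant checkpoint function):

from functools import partial

import torch
import torch.nn as nn
from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
    checkpoint_wrapper,
)
from torch.utils.checkpoint import (
    CheckpointPolicy,
    create_selective_checkpoint_contexts,
)

_save_list = {
    torch.ops.aten.max.default,
    torch._higher_order_ops.flex_attention,
    # Newly added in this PR, so varlen attention outputs are saved
    # rather than recomputed under per-op SAC (assumes the op exists
    # in the installed PyTorch):
    torch.ops.torch_attn._varlen_attn,
}

def _policy(ctx, op, *args, **kwargs):
    if op in _save_list:
        return CheckpointPolicy.MUST_SAVE
    return CheckpointPolicy.PREFER_RECOMPUTE

def apply_sac(block: nn.Module) -> nn.Module:
    # checkpoint_wrapper forwards extra kwargs to torch.utils.checkpoint,
    # so the SAC context travels with the wrapped transformer block.
    return checkpoint_wrapper(
        block,
        context_fn=partial(create_selective_checkpoint_contexts, _policy),
        preserve_rng_state=False,
    )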