
Commit 3f46701

TroyGarden authored and facebook-github-bot committed
use pin_memory for ModelInput and minor refactoring (#2910)
Summary:

# context
* This is some BE work and minor refactoring done while working on pipeline optimization.
* The major change is adding a "pin_memory" option to the test_input file for `ModelInput` generation:
```
The `pin_memory()` call on all KJT tensors is important for the training benchmark, and is
also a valid option for the prod training scenario: TrainModelInput should be created in
pinned memory for a fast transfer to GPU. For more on pin_memory:
https://pytorch.org/tutorials/intermediate/pinmem_nonblock.html#pin-memory
```
* Minor refactoring includes: (1) default parameters for the TrainPipeline benchmark so that the embedding size, batch size, etc. are reasonable; (2) a fix for the batch index error in the trace, which previously used (curr_index + 1); (3) splitting the `EmbeddingPipelinedForward` __call__ function into two parts.
* Trace comparison: the `pin_memory()` call on the ModelInput is critical for a non-blocking CPU-to-GPU data copy.
  * before: copy_batch_to_gpu is the same size as the GPU data transfer {F1977324224}
  * after: copy_batch_to_gpu is hardly seen in the trace {F1977324220}

Reviewed By: aporialiao

Differential Revision: D73514639
1 parent 7bd2afc commit 3f46701
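
As background for the pin_memory motivation in the summary, here is a minimal, hedged sketch (not part of this commit) of why pinned host memory matters: a `.to(device, non_blocking=True)` copy can only overlap with compute when the source CPU tensor is page-locked, which is the effect visible in the traces referenced above. Shapes and stream names are illustrative.

```python
# Minimal sketch, assuming a CUDA device is available; tensor shapes are arbitrary.
import torch

batch_cpu = torch.randn(8192, 512).pin_memory()   # page-locked host memory
copy_stream = torch.cuda.Stream()

with torch.cuda.stream(copy_stream):
    # With a pinned source, this host-to-device copy is truly asynchronous and can
    # overlap with compute running on the default stream.
    batch_gpu = batch_cpu.to(torch.device("cuda"), non_blocking=True)

torch.cuda.current_stream().wait_stream(copy_stream)  # sync before using batch_gpu
```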

File tree

5 files changed: +61 −17 lines changed


torchrec/distributed/benchmark/benchmark_train_sparsenn.py

Lines changed: 4 additions & 4 deletions
```diff
@@ -54,17 +54,17 @@
 
 @dataclass
 class RunOptions:
-    world_size: int = 4
-    num_batches: int = 20
+    world_size: int = 2
+    num_batches: int = 10
     sharding_type: ShardingType = ShardingType.TABLE_WISE
     input_type: str = "kjt"
     profile: str = ""
 
 
 @dataclass
 class EmbeddingTablesConfig:
-    num_unweighted_features: int = 4
-    num_weighted_features: int = 4
+    num_unweighted_features: int = 100
+    num_weighted_features: int = 100
     embedding_feature_dim: int = 512
 
 def generate_tables(
```
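
A small usage sketch for the defaults changed above; the import path mirrors the file being edited, and the instantiation itself is illustrative only, not taken from the benchmark script.

```python
# Sketch only: constructing the benchmark configs with the new defaults.
from torchrec.distributed.benchmark.benchmark_train_sparsenn import (
    EmbeddingTablesConfig,
    RunOptions,
)

run_options = RunOptions()               # world_size=2, num_batches=10
tables_config = EmbeddingTablesConfig()  # 100 unweighted + 100 weighted features, dim 512
```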

torchrec/distributed/benchmark/benchmark_utils.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -867,6 +867,7 @@ def trace_handler(prof) -> None:
             profile_memory=True,
             with_flops=True,
             with_modules=True,
+            with_stack=False,  # usually we don't want to show the entire stack in the trace
             on_trace_ready=trace_handler,
         ) as p:
             for i in range(num_profiles):
```
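
For reference, a hedged sketch of a `torch.profiler` setup using the same flags as the diff above; `trace_handler` and `num_profiles` mirror the surrounding benchmark code and are placeholders here, not the benchmark's actual implementation.

```python
# Sketch of the profiler flags above; with_stack=False keeps traces small and readable.
from torch.profiler import ProfilerActivity, profile


def trace_handler(prof) -> None:
    prof.export_chrome_trace("trace.json")  # placeholder: write the trace somewhere useful


num_profiles = 2
with profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
    profile_memory=True,
    with_flops=True,
    with_modules=True,
    with_stack=False,  # full Python stacks bloat the trace; skip them by default
    on_trace_ready=trace_handler,
) as p:
    for _ in range(num_profiles):
        pass  # one benchmark iteration per pass in the real code
```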

torchrec/distributed/test_utils/test_input.py

Lines changed: 29 additions & 2 deletions
```diff
@@ -205,6 +205,7 @@ def generate_local_batches(
         offsets_dtype: torch.dtype = torch.int64,
         lengths_dtype: torch.dtype = torch.int64,
         all_zeros: bool = False,
+        pin_memory: bool = False,  # pin_memory is needed for the training-job QPS benchmark
     ) -> List["ModelInput"]:
         """
         Returns multi-rank batches (ModelInput) of world_size
@@ -224,6 +225,7 @@
                 offsets_dtype=offsets_dtype,
                 lengths_dtype=lengths_dtype,
                 all_zeros=all_zeros,
+                pin_memory=pin_memory,
             )
             for _ in range(world_size)
         ]
@@ -256,9 +258,15 @@ def generate(
         offsets_dtype: torch.dtype = torch.int64,
         lengths_dtype: torch.dtype = torch.int64,
         all_zeros: bool = False,
+        pin_memory: bool = False,  # pin_memory is needed for the training-job QPS benchmark
     ) -> "ModelInput":
         """
         Returns a single batch of `ModelInput`
+
+        The `pin_memory()` call on all KJT tensors is important for the training benchmark,
+        and is also a valid option for the prod training scenario: TrainModelInput should be
+        created in pinned memory for a fast transfer to GPU. For more on pin_memory:
+        https://pytorch.org/tutorials/intermediate/pinmem_nonblock.html#pin-memory
         """
         float_features = (
             torch.zeros((batch_size, num_float_features), device=device)
@@ -279,6 +287,7 @@ def generate(
                 offsets_dtype=offsets_dtype,
                 lengths_dtype=lengths_dtype,
                 all_zeros=all_zeros,
+                pin_memory=pin_memory,
             )
             if tables is not None and len(tables) > 0
             else None
@@ -297,6 +306,7 @@ def generate(
                 offsets_dtype=offsets_dtype,
                 lengths_dtype=lengths_dtype,
                 all_zeros=all_zeros,
+                pin_memory=pin_memory,
             )
             if weighted_tables is not None and len(weighted_tables) > 0
             else None
@@ -306,6 +316,9 @@ def generate(
             if all_zeros
             else torch.rand((batch_size,), device=device)
         )
+        if pin_memory:
+            float_features = float_features.pin_memory()
+            label = label.pin_memory()
         return ModelInput(
             float_features=float_features,
             idlist_features=idlist_features,
@@ -404,13 +417,18 @@ def _assemble_kjt(
         device: Optional[torch.device] = None,
         use_offsets: bool = False,
         offsets_dtype: torch.dtype = torch.int64,
+        pin_memory: bool = False,
     ) -> KeyedJaggedTensor:
         """
-
         Assembles a KeyedJaggedTensor (KJT) from the provided per-feature lengths and indices.
 
         This method is used to generate corresponding local_batches and global_batch KJTs.
         It concatenates the lengths and indices for each feature to form a complete KJT.
+
+        The `pin_memory()` call on all KJT tensors is important for the training benchmark,
+        and is also a valid option for the prod training scenario: TrainModelInput should be
+        created in pinned memory for a fast transfer to GPU. For more on pin_memory:
+        https://pytorch.org/tutorials/intermediate/pinmem_nonblock.html#pin-memory
         """
 
         lengths = torch.cat(lengths_per_feature)
@@ -422,6 +440,11 @@ def _assemble_kjt(
                 [torch.tensor([0], device=device), lengths.cumsum(0)]
             ).to(offsets_dtype)
             lengths = None
+        if pin_memory:
+            indices = indices.pin_memory()
+            lengths = lengths.pin_memory() if lengths is not None else None
+            weights = weights.pin_memory() if weights is not None else None
+            offsets = offsets.pin_memory() if offsets is not None else None
         return KeyedJaggedTensor(features, indices, weights, lengths, offsets)
 
     @staticmethod
@@ -440,6 +463,7 @@ def create_standard_kjt(
         offsets_dtype: torch.dtype = torch.int64,
         lengths_dtype: torch.dtype = torch.int64,
         all_zeros: bool = False,
+        pin_memory: bool = False,
     ) -> KeyedJaggedTensor:
         features, lengths_per_feature, indices_per_feature = (
             ModelInput._create_features_lengths_indices(
@@ -462,6 +486,7 @@ def create_standard_kjt(
             device=device,
             use_offsets=use_offsets,
             offsets_dtype=offsets_dtype,
+            pin_memory=pin_memory,
         )
 
     @staticmethod
@@ -555,14 +580,15 @@ class TdModelInput(ModelInput):
 
 @dataclass
 class TestSparseNNInputConfig:
-    batch_size: int = 1
+    batch_size: int = 8192
     num_float_features: int = 10
     feature_pooling_avg: int = 10
     use_offsets: bool = False
     dev_str: str = ""
     long_kjt_indices: bool = True
    long_kjt_offsets: bool = True
     long_kjt_lengths: bool = True
+    pin_memory: bool = True
 
     def generate_model_input(
         self,
@@ -584,4 +610,5 @@ def generate_model_input(
             indices_dtype=torch.int64 if self.long_kjt_indices else torch.int32,
             offsets_dtype=torch.int64 if self.long_kjt_offsets else torch.int32,
             lengths_dtype=torch.int64 if self.long_kjt_lengths else torch.int32,
+            pin_memory=self.pin_memory,
         )
```
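
A hedged sketch of what the new `pin_memory` path produces: a `KeyedJaggedTensor` whose host tensors are pinned, so the later device copy can be non-blocking. The toy keys and values below are assumptions, not taken from the test code.

```python
# Sketch: build a KJT from pinned host tensors, then copy it to GPU without blocking.
# Requires a CUDA-enabled build, since pin_memory() allocates page-locked memory.
import torch
from torchrec.sparse.jagged_tensor import KeyedJaggedTensor

values = torch.tensor([1, 2, 3, 4], dtype=torch.int64).pin_memory()
lengths = torch.tensor([2, 0, 1, 1], dtype=torch.int64).pin_memory()  # 2 keys x batch of 2
kjt = KeyedJaggedTensor(keys=["f1", "f2"], values=values, lengths=lengths)

kjt_gpu = kjt.to(torch.device("cuda"), non_blocking=True)
```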

torchrec/distributed/train_pipeline/train_pipelines.py

Lines changed: 8 additions & 6 deletions
```diff
@@ -680,7 +680,7 @@ def copy_batch_to_gpu(
         `self._execute_all_batches=True`, then returns None.
         """
         context = self._create_context()
-        with record_function(f"## copy_batch_to_gpu {self._next_index} ##"):
+        with record_function(f"## copy_batch_to_gpu {context.index} ##"):
             with self._stream_context(self._memcpy_stream):
                 batch = self._next_batch(dataloader_iter)
                 if batch is not None:
@@ -1008,11 +1008,13 @@ def embedding_backward(self, context: EmbeddingTrainPipelineContext) -> None:
            context.detached_embedding_tensors,
        ):
            grads = [tensor.grad for tensor in detached_emb_tensors]
-            # Some embeddings may never get used in the final loss computation,
-            # so the grads will be `None`. If we don't exclude these, it will fail
-            # with error: "grad can be implicitly created only for scalar outputs"
-            # Alternatively, if the tensor has only 1 element, pytorch can still
-            # figure out how to do autograd
+            """
+            Some embeddings may never get used in the final loss computation,
+            so the grads will be `None`. If we don't exclude these, it will fail
+            with error: "grad can be implicitly created only for scalar outputs"
+            Alternatively, if the tensor has only 1 element, pytorch can still
+            figure out how to do autograd
+            """
            embs_to_backprop, grads_to_use, invalid_features = [], [], []
            assert len(embedding_features) == len(emb_tensors)
            for features, tensor, grad in zip(embedding_features, emb_tensors, grads):
```
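
A hedged sketch of the pattern the comment above describes (not the pipeline code itself): some detached embedding tensors never receive a gradient from the loss, so the `None` entries must be filtered out before calling `torch.autograd.backward` on the originals. Variable names are illustrative.

```python
import torch

emb_tensors = [torch.randn(4, 8, requires_grad=True) for _ in range(3)]
detached = [t.detach().requires_grad_() for t in emb_tensors]

loss = detached[0].sum() + detached[2].sum()  # detached[1] never reaches the loss
loss.backward()

grads = [t.grad for t in detached]            # grads[1] is None
embs_to_backprop = [t for t, g in zip(emb_tensors, grads) if g is not None]
grads_to_use = [g for g in grads if g is not None]

# Without the filtering, backward() on a non-scalar tensor with a None grad fails with
# "grad can be implicitly created only for scalar outputs".
torch.autograd.backward(embs_to_backprop, grads_to_use)
```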

torchrec/distributed/train_pipeline/utils.py

Lines changed: 19 additions & 5 deletions
```diff
@@ -600,6 +600,7 @@ def __call__(
                 self._stream
             )
         ctx.record_stream(cur_stream)
+
         awaitable = self._context.embedding_a2a_requests.pop(self._name)
         # in case of MC modules
         is_mc_module: bool = isinstance(awaitable, Iterable)
@@ -613,6 +614,24 @@ def __call__(
             embeddings = (
                 awaitable.wait()
             )  # trigger awaitable manually for type checking
+
+        self.detach_embeddings(embeddings=embeddings, cur_stream=cur_stream)
+
+        if is_mc_module:
+            return (LazyNoWait(embeddings), LazyNoWait(remapped_kjts))
+        else:
+            return LazyNoWait(embeddings)
+
+    def detach_embeddings(
+        self,
+        embeddings: Union[Dict[str, JaggedTensor], KeyedTensor],
+        cur_stream: torch.Stream,
+    ) -> None:
+        """
+        Detach the grad from the embeddings so that the backward/opt step for the embeddings
+        won't be invoked by loss.backward(); instead, there is a dedicated embedding_backward
+        call in the semi-sync pipeline.
+        """
         tensors = []
         detached_tensors = []
         # in case of EC, embeddings are Dict[str, JaggedTensor]
@@ -650,11 +669,6 @@ def __call__(
         self._context.embedding_features.append([list(embeddings.keys())])
         self._context.detached_embedding_tensors.append(detached_tensors)
 
-        if is_mc_module:
-            return (LazyNoWait(embeddings), LazyNoWait(remapped_kjts))
-        else:
-            return LazyNoWait(embeddings)
-
 
 class PrefetchPipelinedForward(BaseForward[PrefetchTrainPipelineContext]):
     """
```
