#40 Calling DistributedSampler.set_epoch
corey-lambda committed Oct 21, 2024
1 parent c25b7c7 commit 574444a
Showing 6 changed files with 16 additions and 0 deletions.
10 changes: 10 additions & 0 deletions 02-multi-gpu/README.md
@@ -219,6 +219,16 @@ As discussed before, this will let each rank grab a different subset of the data
```python
    ...
)
```

You also need to call [DistributedSampler.set_epoch](https://pytorch.org/docs/stable/data.html#torch.utils.data.distributed.DistributedSampler) at the start of each epoch so that shuffling changes between epochs:

```diff
+dataloader.sampler.set_epoch(state["epoch"])
batches = iter(dataloader)
```

The PyTorch documentation explains why:

> In distributed mode, calling the set_epoch() method at the beginning of each epoch before creating the DataLoader iterator is necessary to make shuffling work properly across multiple epochs. Otherwise, the same ordering will be always used.
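
To make the effect concrete, here's a minimal sketch of the pattern (not this repo's exact code: the toy dataset, batch size, and epoch count are placeholders, and it assumes a distributed launch, e.g. via `torchrun`):

```python
import torch
import torch.distributed as dist
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data.distributed import DistributedSampler

dist.init_process_group("nccl")  # assumes launch via torchrun

dataset = TensorDataset(torch.arange(64))  # placeholder dataset
sampler = DistributedSampler(dataset, shuffle=True)
dataloader = DataLoader(dataset, batch_size=8, sampler=sampler)

for epoch in range(3):
    # Without this call, every epoch replays the epoch-0 shuffle order.
    sampler.set_epoch(epoch)
    for batch in dataloader:
        pass  # training step goes here
```

Internally, `set_epoch` folds the epoch number into the sampler's shuffle seed (`seed + epoch`), so each epoch gets a distinct ordering while the run as a whole stays reproducible.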

### Only creating experiment directory on rank 0

Note the `dist.barrier()` calls before and after we create the directory. **These are very important!**
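
For reference, a minimal sketch of that pattern (the `exp_dir` path is a placeholder, and this assumes the process group is already initialized):

```python
import os
import torch.distributed as dist

exp_dir = "outputs/my-experiment"  # placeholder path

dist.barrier()  # sync all ranks before touching the filesystem
if dist.get_rank() == 0:
    os.makedirs(exp_dir, exist_ok=True)  # only rank 0 creates the directory
dist.barrier()  # no rank proceeds until rank 0 has finished
```

Without the second barrier, a non-zero rank could try to write into the directory before it exists.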
2 changes: 2 additions & 0 deletions 02-multi-gpu/train_llm.py
```diff
@@ -138,6 +138,8 @@ def _load_to_device(p):
 if state["epoch_step"] > 0:
     progress_bar.update(state["epoch_step"])

+# We need to do this so we shuffle differently on each epoch in a reproducible way.
+dataloader.sampler.set_epoch(state["epoch"])
 batches = iter(dataloader)

 for i_step in range(len(dataloader)):
```
1 change: 1 addition & 0 deletions 03-multi-node/train_llm.py
```diff
@@ -141,6 +141,7 @@ def _load_to_device(p):
 if state["epoch_step"] > 0:
     progress_bar.update(state["epoch_step"])

+dataloader.sampler.set_epoch(state["epoch"])
 batches = iter(dataloader)

 for i_step in range(len(dataloader)):
```
1 change: 1 addition & 0 deletions 05-sharding-deepspeed/train_llm.py
```diff
@@ -132,6 +132,7 @@ def main():
 if state["epoch_step"] > 0:
     progress_bar.update(state["epoch_step"])

+dataloader.sampler.set_epoch(state["epoch"])
 batches = iter(dataloader)

 for i_step in range(len(dataloader)):
```
1 change: 1 addition & 0 deletions 05-sharding-fsdp/train_llm.py
```diff
@@ -205,6 +205,7 @@ def safe_param_init_fn(module: torch.nn.Module):
 if state["epoch_step"] > 0:
     progress_bar.update(state["epoch_step"])

+dataloader.sampler.set_epoch(state["epoch"])
 batches = iter(dataloader)

 for i_step in range(len(dataloader)):
```
1 change: 1 addition & 0 deletions 06-training-llama-405b/train_llm.py
```diff
@@ -220,6 +220,7 @@ def main():
 if state["epoch_step"] > 0:
     progress_bar.update(state["epoch_step"])

+dataloader.sampler.set_epoch(state["epoch"])
 batches = iter(dataloader)

 for i_step in range(len(dataloader)):
```
