Skip to content

Commit

Permalink
Shuffle examples before they are packed (#2037)
Browse files: browse the repository at this point in the history
Co-authored-by: lewtun <[email protected]>
  • Loading branch information
muupan and lewtun authored Sep 13, 2024
1 parent d47220f commit 7a2bbe3
Showing 1 changed file with 3 additions and 0 deletions.
3 changes: 3 additions & 0 deletions trl/trainer/utils.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -635,6 +635,8 @@ def __iter__(self):
else:
more_examples = False
break
if self.shuffle:
random.shuffle(buffer)
tokenized_inputs = self.tokenizer(buffer, add_special_tokens=self.add_special_tokens, truncation=False)[
"input_ids"
]
Expand All @@ -649,6 +651,7 @@ def __iter__(self):
if len(input_ids) == self.seq_length:
examples.append(input_ids)
if self.shuffle:
# Shuffle again, otherwise split examples occur in consecutive tensors.
random.shuffle(examples)
for example in examples:
self.current_size += 1
Expand Down

0 comments on commit 7a2bbe3

Please sign in to comment.