2 changes: 2 additions & 0 deletions src/datatrove/pipeline/tokens/tokenizer.py
@@ -282,6 +282,7 @@ def __init__(
local_working_dir: DataFolderLike | None = None,
save_filename: str = None, # if defined, the final output filename will be this
tokenizer_name_or_path: str = "gpt2", # tokenizer to use, from HF or a local
bos_token: str = "<|startoftext|>", # whether to add the BOS token before each document
Collaborator
Suggested change
bos_token: str = "<|startoftext|>", # whether to add the BOS token before each document
bos_token: str | None = None, # whether to add the BOS token before each document

should disable it by default

eos_token: str = "<|endoftext|>", # whether to add the EOS token after each document
save_loss_metadata: bool = False, # save the loss information
shuffle: bool = True, # whether to shuffle documents in the dataset,
@@ -304,6 +305,7 @@ def __init__(
)
self.save_filename = save_filename
self.tokenizer_name_or_path = tokenizer_name_or_path
self.bos_token = bos_token
self.eos_token = eos_token
self.save_loss_metadata = save_loss_metadata
self.shuffle = shuffle
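As a hedged usage sketch tied to the comment above: with the suggested default of bos_token=None, no BOS token is prepended unless a caller opts in explicitly. The class name DocumentTokenizer and the output_folder argument are assumptions inferred from the file path and the neighbouring parameters; they are not shown in this hunk.

from datatrove.pipeline.tokens.tokenizer import DocumentTokenizer  # assumed class name

step = DocumentTokenizer(
    output_folder="tokenized/",        # assumed parameter, not visible in this hunk
    tokenizer_name_or_path="gpt2",
    bos_token="<|startoftext|>",       # explicit opt-in; the suggested default of None adds no BOS
    eos_token="<|endoftext|>",
)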
18 changes: 15 additions & 3 deletions src/datatrove/utils/tokenization.py
@@ -26,6 +26,7 @@ class PipelineStepWithTokenizer(PipelineStep, ABC):
def __init__(self):
super().__init__()
self.tokenizer_name_or_path = None
self.bos_token = None
self.eos_token = None
self._tokenizer: "Tokenizer" | None = None
self._post_processor = None
@@ -50,10 +51,21 @@ def tokenizer(self) -> "Tokenizer":
self._tokenizer = load_tokenizer(self.tokenizer_name_or_path)
if self._post_processor:
self._tokenizer.post_processor = self._post_processor
elif self.eos_token:
elif self.bos_token is not None or self.eos_token is not None:
special_tokens = []
if self.bos_token:
special_tokens.append(("<BOS>", self.tokenizer.token_to_id(self.bos_token)))
if self.eos_token:
special_tokens.append(("<EOS>", self.tokenizer.token_to_id(self.eos_token)))
if self.bos_token and not self.eos_token:
single = "<BOS> $A"
elif self.eos_token and not self.bos_token:
single = "$A <EOS>"
else:
single = "<BOS> $A <EOS>"
Comment on lines +54 to +65
Collaborator
Suggested change
elif self.bos_token is not None or self.eos_token is not None:
special_tokens = []
if self.bos_token:
special_tokens.append(("<BOS>", self.tokenizer.token_to_id(self.bos_token)))
if self.eos_token:
special_tokens.append(("<EOS>", self.tokenizer.token_to_id(self.eos_token)))
if self.bos_token and not self.eos_token:
single = "<BOS> $A"
elif self.eos_token and not self.bos_token:
single = "$A <EOS>"
else:
single = "<BOS> $A <EOS>"
elif self.bos_token is not None or self.eos_token is not None:
special_tokens = []
single_elems = []
if self.bos_token:
special_tokens.append(("<BOS>", self.tokenizer.token_to_id(self.bos_token)))
single_elems.append("<BOS>")
single_elems.append("$A")
if self.eos_token:
special_tokens.append(("<EOS>", self.tokenizer.token_to_id(self.eos_token)))
single_elems.append("<EOS>")

self._tokenizer.post_processor = TemplateProcessing(
single="$A <EOS>",
special_tokens=[("<EOS>", self.tokenizer.token_to_id(self.eos_token))],
single=single,
Collaborator
Suggested change
single=single,
single=" ".join(single_elems),

special_tokens=special_tokens,
pair=None,
)
return self._tokenizer
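For reference, a minimal self-contained sketch of the post-processor logic the two suggestions converge on, written against the Hugging Face tokenizers API used in this file. The helper name build_post_processor is hypothetical; "<BOS>" and "<EOS>" are only template labels that special_tokens maps to real token ids.

from tokenizers import Tokenizer
from tokenizers.processors import TemplateProcessing

def build_post_processor(tokenizer: Tokenizer, bos_token: str | None, eos_token: str | None) -> TemplateProcessing:
    # Builds "<BOS> $A <EOS>", "<BOS> $A", "$A <EOS>" or plain "$A" depending on which tokens are set.
    special_tokens = []
    single_elems = []
    if bos_token:
        special_tokens.append(("<BOS>", tokenizer.token_to_id(bos_token)))
        single_elems.append("<BOS>")
    single_elems.append("$A")
    if eos_token:
        special_tokens.append(("<EOS>", tokenizer.token_to_id(eos_token)))
        single_elems.append("<EOS>")
    return TemplateProcessing(single=" ".join(single_elems), special_tokens=special_tokens, pair=None)

tok = Tokenizer.from_pretrained("gpt2")
tok.post_processor = build_post_processor(tok, bos_token=None, eos_token="<|endoftext|>")
print(tok.encode("hello world").ids)  # ends with the id of <|endoftext|> (50256 for gpt2)

Building the template from a single_elems list, as the second suggestion does, avoids enumerating the three BOS/EOS combinations by hand and degenerates cleanly to "$A" when neither token is set.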