Commit 3a6adcb

supporting overriding validate_ends_with_eos in tokenizer.__call__ (#128)
The following changes were made to allow this:
1. In __init__: validate_ends_with_eos was changed from str to bool, and a new argument eos was added with default "<EOS>". Default behavior is maintained; a search of all callers of __init__ found none that passed validate_ends_with_eos.
2. A new boolean argument validate_ends_with_eos was added to tokenizer.__call__, with default None. If not None, it overrides self._validate_ends_with_eos for that call.
The change makes it possible to omit <EOS> from the decoder input so that it has the same length as the label. This allows cropping redundant padding in a batch in the decoder-input and label fields as well.
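To make the intent concrete, here is a minimal usage sketch of the new per-call override. The op class name (TokenizerOp) and the sample keys are placeholders, not taken from this commit; only the argument names come from the diffs below:

# Placeholder op class and keys; only the new arguments are from the commit.
tokenizer_op = TokenizerOp(
    tokenizer_path=tokenizer_path,
    pad_token="<PAD>",
    validate_ends_with_eos=True,  # now a bool (was the EOS string itself)
    eos="<EOS>",                  # the token moved into its own argument
)

# Labels still end with <EOS>, so the instance-level default applies:
sample = tokenizer_op(sample, key_in="data.label")

# Decoder input deliberately omits <EOS>; skip the check for this call only:
sample = tokenizer_op(
    sample,
    key_in="data.decoder_input",
    validate_ends_with_eos=False,  # overrides self._validate_ends_with_eos once
)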
1 parent 7fc56d9 commit 3a6adcb

5 files changed: +39 −16
Lines changed: 8 additions & 0 deletions

@@ -0,0 +1,8 @@
+from pathlib import Path
+
+
+# The directory containing this file
+CONFIG_DIRPATH = Path(__file__).parent
+
+
+def get_modular_tokenizer_config_dirpath() -> str:
+    return str(CONFIG_DIRPATH.resolve())
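The new helper resolves the directory of its own module at import time and returns it as an absolute path. A minimal usage sketch; the import path below is an assumption inferred from the repository layout (the new file's path is not shown in this capture), only the function name comes from the diff:

# Assumed import path -- adjust to wherever the new module actually lives.
from fusedrug.data.tokenizer.modulartokenizer import (
    get_modular_tokenizer_config_dirpath,
)

config_dir = get_modular_tokenizer_config_dirpath()  # absolute path, as str
print(config_dir)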

fusedrug/data/tokenizer/modulartokenizer/modular_tokenizer.py

Lines changed: 3 additions & 0 deletions

@@ -776,6 +776,9 @@ def set_field(tokenizers_info_cfg: List, name: str, key: str, val: Any) -> List:
         key="json_path",
         val=config_out_path,
     )
+    tokenizer_dir = os.path.dirname(write_out_path)
+    if not os.path.exists(tokenizer_dir):
+        os.mkdir(tokenizer_dir)
     tokenizer_inst.save(write_out_path)
     tokenizer_config_overall = {
         "tokenizers_info": tokenizers_info_cfg,

fusedrug/data/tokenizer/ops/fast_tokenizer_ops.py

Lines changed: 12 additions & 7 deletions

@@ -22,7 +22,8 @@ def __init__(
         max_size: int = None,
         pad_token: str = None,
         pad_type_id: str = None,
-        validate_ends_with_eos: Optional[str] = "<EOS>",
+        validate_ends_with_eos: Optional[bool] = True,
+        eos: Optional[str] = "<EOS>",
         verbose: bool = False,
         **kwargs: dict,
     ):
@@ -54,11 +55,12 @@ def __init__(
         )

         self._validate_ends_with_eos = validate_ends_with_eos
+        self._eos = eos

-        if self._validate_ends_with_eos is not None:
-            if self._validate_ends_with_eos not in vocab.keys():
+        if self._validate_ends_with_eos:
+            if self._eos not in vocab.keys():
                 raise Exception(
-                    f"Could not find eos token = {validate_ends_with_eos} in {tokenizer_json}. You can disable the validation by setting validate_ends_with_eos=None"
+                    f"Could not find eos token = {self._eos} in {tokenizer_json}. You can disable the validation by setting validate_ends_with_eos=False"
                 )

         self._pad_id = pad_id
@@ -171,6 +173,7 @@ def __call__(
         key_out_tokens_ids: str = None,
         key_out_attention_mask: str = None,
         convert_attention_mask_to_bool: bool = True,
+        validate_ends_with_eos: Optional[bool] = None,
     ) -> NDict:
         # if self._verbose:
         #     print(
@@ -182,11 +185,13 @@ def __call__(
            raise Exception(
                f"Expected key_in={key_in} to point to a string, and instead got a {type(data_str)}. value={data_str}"
            )
+        if validate_ends_with_eos is None:
+            validate_ends_with_eos = self._validate_ends_with_eos

-        if self._validate_ends_with_eos is not None:
-            if not data_str.rstrip().endswith(self._validate_ends_with_eos):
+        if validate_ends_with_eos:
+            if not data_str.rstrip().endswith(self._eos):
                 raise Exception(
-                    f"self._validate_ends_with_eos was set to {self._validate_ends_with_eos}, but about to encode a string that does not end with it. The str was: {data_str}"
+                    f"validate_ends_with_eos was set to {validate_ends_with_eos}, but about to encode a string that does not end with {self._eos}. The str was: {data_str}"
                 )

         encoded = self._tokenizer.encode(data_str)
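Why the per-call override matters for batching: with teacher forcing, the decoder input is the label shifted right, so if the label keeps <EOS> and the decoder input drops it, the two fields end up the same length and redundant padding can be cropped identically in both. A toy illustration (all token ids invented):

BOS, EOS = 1, 2  # invented special-token ids, for illustration only

label      = [12, 47, 33, EOS]   # target sequence keeps <EOS>
decoder_in = [BOS, 12, 47, 33]   # shifted right, <EOS> omitted

# Equal lengths mean batch-level padding can be cropped the same way
# for the decoder-input field and the label field:
assert len(label) == len(decoder_in)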

fusedrug/data/tokenizer/ops/modular_tokenizer_ops.py

Lines changed: 13 additions & 7 deletions

@@ -21,7 +21,8 @@ def __init__(
         max_size: Union[int, None] = None,
         pad_token: Union[str, None] = None,
         pad_type_id: Union[int, None] = None,
-        validate_ends_with_eos: Optional[str] = "<EOS>",
+        validate_ends_with_eos: Optional[bool] = True,
+        eos: Optional[str] = "<EOS>",
         verbose: Optional[bool] = False,
         **kwargs: Any,
     ) -> None:
@@ -53,12 +54,13 @@ def __init__(
         )

         self._validate_ends_with_eos = validate_ends_with_eos
+        self._eos = eos

-        if self._validate_ends_with_eos is not None:
-            eos_id = self._tokenizer.token_to_id(self._validate_ends_with_eos)
+        if self._validate_ends_with_eos:
+            eos_id = self._tokenizer.token_to_id(self._eos)
             if eos_id is None:
                 raise Exception(
-                    f"Could not find eos token = {validate_ends_with_eos} in {tokenizer_path}. You can disable the validation by setting validate_ends_with_eos=None"
+                    f"Could not find eos token = {self._eos} in {tokenizer_path}. You can disable the validation by setting validate_ends_with_eos=False"
                 )

         self._pad_id = pad_id
@@ -190,6 +192,7 @@ def __call__(
         max_seq_len: Optional[int] = None,
         on_unknown: Optional[str] = "warn",
         verbose: Optional[int] = 1,
+        validate_ends_with_eos: Optional[bool] = None,
     ) -> NDict:
         """_summary_

@@ -207,6 +210,7 @@ def __call__(
             on_unknown (Optional[str], optional): What happens if unknown tokens (i.e. ones mapped to <UNK>) are encountered: 'raise' or 'warn'. Defaults to "warn".
             verbose (Optional[int], optional): verbosity level. 0: no notification, 1: warning notification, 2: warning with partial data, 3: warning
                 with full data. Defaults to 1.
+            validate_ends_with_eos (Optional[bool], optional): if not None, overrides self._validate_ends_with_eos

         Raises:
             Exception: _description_
@@ -222,15 +226,17 @@ def __call__(
            raise Exception(
                f"Expected key_in={key_in} to point to a list of inputs or string with builtin tokenizer hints, and instead got a {type(data)}. value={data}"
            )
+        if validate_ends_with_eos is None:
+            validate_ends_with_eos = self._validate_ends_with_eos

-        if self._validate_ends_with_eos is not None:
+        if validate_ends_with_eos:
             if isinstance(data, str):
                 last_seq = data
             else:
                 last_seq = data[-1].input_string
-            if not last_seq.rstrip().endswith(self._validate_ends_with_eos):
+            if not last_seq.rstrip().endswith(self._eos):
                 raise Exception(
-                    f"self._validate_ends_with_eos was set to {self._validate_ends_with_eos}, but about to encode a string that does not end with it. The str end was: {last_seq}"
+                    f"validate_ends_with_eos was set to {validate_ends_with_eos}, but about to encode a string that does not end with {self._eos}. The str end was: {last_seq}"
                 )

         if isinstance(data, str):
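For existing callers of either op, the old string-valued argument maps onto the new pair as follows; a sketch with a placeholder op name (both changed ops follow the same pattern):

# "TokenizerOp" stands in for either changed op class.
# Before this commit, the token doubled as the flag:
#     TokenizerOp(..., validate_ends_with_eos="<EOS>")
# The same behavior is now spelled:
op = TokenizerOp(tokenizer_path=path, validate_ends_with_eos=True, eos="<EOS>")
# Disabling the check changes from validate_ends_with_eos=None to:
op_unchecked = TokenizerOp(tokenizer_path=path, validate_ends_with_eos=False)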

fusedrug/data/tokenizer/ops/test_modular_tokenizer_ops.py

Lines changed: 3 additions & 2 deletions

@@ -1,6 +1,7 @@
 import hydra
 from omegaconf import DictConfig, OmegaConf

+
 from typing import Dict, Optional, Any
 import pytorch_lightning as pl
 from fuse.utils import NDict
@@ -142,7 +143,7 @@ def main(cfg: DictConfig) -> None:
         tokenizer_path=cfg_raw["data"]["tokenizer"]["out_path"],
         max_size=global_max_len,
         pad_token="<PAD>",
-        validate_ends_with_eos="<EOS>",
+        validate_ends_with_eos=True,
     )
     test_tokenizer_op(
         tokenizer_op_inst=mod_tokenizer_op,
@@ -154,7 +155,7 @@ def main(cfg: DictConfig) -> None:
         tokenizer_path=cfg_raw["data"]["tokenizer"]["out_path"],
         max_size=global_max_len,
         pad_token="<PAD>",
-        validate_ends_with_eos="<EOS>",
+        validate_ends_with_eos=True,
     )
     test_tokenizer_op(
         tokenizer_op_inst=mod_tokenizer_op,
