ServiceNow · jlamypoirier · Oct 4, 2025 · Oct 6, 2025 · Oct 6, 2025 · Oct 14, 2025
diff --git a/fast_llm/data/dataset/gpt/config.py b/fast_llm/data/dataset/gpt/config.py
@@ -8,19 +8,12 @@
 from fast_llm.config import Config, Field, FieldHint, check_field, config_class, skip_valid_if_none
 from fast_llm.data.config import TokenizerConfig
 from fast_llm.data.dataset.abstract import SamplableDataset, SampledDataset
-from fast_llm.data.dataset.config import (
-    IndexedDatasetConfig,
-    SamplableDatasetConfig,
-    SampledDatasetConfig,
-    SamplingData,
-    SamplingParameters,
-)
+from fast_llm.data.dataset.config import SamplableDatasetConfig, SampledDatasetConfig, SamplingData, SamplingParameters
 from fast_llm.data.sample.language_model import LanguageModelSample
 from fast_llm.utils import Assert
 
 if typing.TYPE_CHECKING:
     from fast_llm.data.dataset.gpt.fim import GPTFimDataset
-    from fast_llm.data.dataset.gpt.memmap import GPTMemmapDataset
     from fast_llm.data.dataset.gpt.random import GPTRandomDataset
 
 
@@ -30,7 +23,9 @@ class GPTSamplingParameters(SamplingParameters):
     Sampling parameters set externally to the dataset and data, ex. determined by the trainer or model.
     """
 
+    # TODO: ====== The dataset should already know its vocab size ======
     vocab_size: int
+    # TODO: ====== Where should this option live? ======
     use_loss_masking_spans: bool = False
     use_preference_loss_spans: bool = False
 
@@ -60,33 +55,6 @@ def build(self) -> "GPTRandomDataset[SampleType]":
         return GPTRandomDataset[SampleType](self.name)
 
 
-@config_class(dynamic_type={SampledDatasetConfig: "memmap"})
-class GPTMemmapDatasetConfig[SampleType: LanguageModelSample](IndexedDatasetConfig[SampleType]):
-    _abstract: typing.ClassVar[bool] = False
-    path: pathlib.Path = Field(
-        default=None,
-        desc="The path to the dataset, excluding the `.bin` or `.idx` suffix.",
-        hint=FieldHint.core,
-    )
-    num_documents: int | None = Field(
-        default=None,
-        desc="Expected number of documents in the dataset.",
-        hint=FieldHint.optional,
-    )
-    num_tokens: int | None = Field(
-        default=None,
-        desc="Expected number of tokens in the dataset.",
-        hint=FieldHint.optional,
-    )
-
-    def build(self) -> "GPTMemmapDataset[SampleType]":
-        from fast_llm.data.dataset.gpt.memmap import GPTMemmapDataset
-
-        return GPTMemmapDataset[SampleType](
-            str(self.path).replace("/", "__"), self.path, self.num_documents, self.num_tokens
-        )
-
-
 @config_class(dynamic_type={SampledDatasetConfig: "file"})
 class GPTDatasetFromFileConfig[SampleType: LanguageModelSample](SamplableDatasetConfig[SampleType]):
     _abstract: typing.ClassVar[bool] = False

diff --git a/fast_llm/data/dataset/gpt/memmap.py b/fast_llm/data/dataset/gpt/memmap.py