Skip to content
38 changes: 3 additions & 35 deletions fast_llm/data/dataset/gpt/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,19 +8,12 @@
from fast_llm.config import Config, Field, FieldHint, check_field, config_class, skip_valid_if_none
from fast_llm.data.config import TokenizerConfig
from fast_llm.data.dataset.abstract import SamplableDataset, SampledDataset
from fast_llm.data.dataset.config import (
IndexedDatasetConfig,
SamplableDatasetConfig,
SampledDatasetConfig,
SamplingData,
SamplingParameters,
)
from fast_llm.data.dataset.config import SamplableDatasetConfig, SampledDatasetConfig, SamplingData, SamplingParameters
from fast_llm.data.sample.language_model import LanguageModelSample
from fast_llm.utils import Assert

if typing.TYPE_CHECKING:
from fast_llm.data.dataset.gpt.fim import GPTFimDataset
from fast_llm.data.dataset.gpt.memmap import GPTMemmapDataset
from fast_llm.data.dataset.gpt.random import GPTRandomDataset


Expand All @@ -30,7 +23,9 @@ class GPTSamplingParameters(SamplingParameters):
Sampling parameters set externally to the dataset and data, ex. determined by the trainer or model.
"""

# TODO: ====== The dataset should know it already ======
vocab_size: int
# TODO: ====== Where to put? ======
use_loss_masking_spans: bool = False
use_preference_loss_spans: bool = False

Expand Down Expand Up @@ -60,33 +55,6 @@ def build(self) -> "GPTRandomDataset[SampleType]":
return GPTRandomDataset[SampleType](self.name)


@config_class(dynamic_type={SampledDatasetConfig: "memmap"})
class GPTMemmapDatasetConfig[SampleType: LanguageModelSample](IndexedDatasetConfig[SampleType]):
_abstract: typing.ClassVar[bool] = False
path: pathlib.Path = Field(
default=None,
desc="The path to the dataset, excluding the `.bin` or `.idx` suffix.",
hint=FieldHint.core,
)
num_documents: int | None = Field(
default=None,
desc="Expected number of documents in the dataset.",
hint=FieldHint.optional,
)
num_tokens: int | None = Field(
default=None,
desc="Expected number of tokens in the dataset.",
hint=FieldHint.optional,
)

def build(self) -> "GPTMemmapDataset[SampleType]":
from fast_llm.data.dataset.gpt.memmap import GPTMemmapDataset

return GPTMemmapDataset[SampleType](
str(self.path).replace("/", "__"), self.path, self.num_documents, self.num_tokens
)


@config_class(dynamic_type={SampledDatasetConfig: "file"})
class GPTDatasetFromFileConfig[SampleType: LanguageModelSample](SamplableDatasetConfig[SampleType]):
_abstract: typing.ClassVar[bool] = False
Expand Down
316 changes: 0 additions & 316 deletions fast_llm/data/dataset/gpt/memmap.py

This file was deleted.

Loading