From 9fc01e02d4ef59958d8213368bd1450baac4600e Mon Sep 17 00:00:00 2001 From: epwalsh Date: Fri, 22 Nov 2024 15:28:59 -0800 Subject: [PATCH 01/25] Add model ladder building blocks --- docs/source/index.rst | 1 + docs/source/ladder.rst | 5 + src/olmo_core/ladder/__init__.py | 4 + src/olmo_core/ladder/baseline.py | 68 +++++++++++ src/olmo_core/ladder/ladder.py | 153 +++++++++++++++++++++++++ src/olmo_core/nn/transformer/config.py | 70 +++++++++++ src/test/ladder/__init__.py | 0 src/test/ladder/baseline_test.py | 6 + 8 files changed, 307 insertions(+) create mode 100644 docs/source/ladder.rst create mode 100644 src/olmo_core/ladder/__init__.py create mode 100644 src/olmo_core/ladder/baseline.py create mode 100644 src/olmo_core/ladder/ladder.py create mode 100644 src/test/ladder/__init__.py create mode 100644 src/test/ladder/baseline_test.py diff --git a/docs/source/index.rst b/docs/source/index.rst index e7b57038..617671f3 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -51,6 +51,7 @@ specific to your environment. Then you can install OLMo-core from PyPI with: exceptions float8 io + ladder launch nn/index optim diff --git a/docs/source/ladder.rst b/docs/source/ladder.rst new file mode 100644 index 00000000..8eb5407c --- /dev/null +++ b/docs/source/ladder.rst @@ -0,0 +1,5 @@ +``ladder`` +========== + +.. automodule:: olmo_core.ladder + :members: diff --git a/src/olmo_core/ladder/__init__.py b/src/olmo_core/ladder/__init__.py new file mode 100644 index 00000000..879e4d83 --- /dev/null +++ b/src/olmo_core/ladder/__init__.py @@ -0,0 +1,4 @@ +from .baseline import BaselineModelLadder +from .ladder import ModelLadder, ModelSize + +__all__ = ["ModelSize", "ModelLadder", "BaselineModelLadder"] diff --git a/src/olmo_core/ladder/baseline.py b/src/olmo_core/ladder/baseline.py new file mode 100644 index 00000000..e1b16ed9 --- /dev/null +++ b/src/olmo_core/ladder/baseline.py @@ -0,0 +1,68 @@ +from olmo_core.config import DType +from olmo_core.data import TokenizerConfig +from olmo_core.distributed.parallel import DataParallelType +from olmo_core.nn.transformer import TransformerConfig, TransformerDataParallelConfig +from olmo_core.optim import AdamWConfig, OptimConfig, OptimGroupOverride + +from .ladder import ModelLadder, ModelSize + + +class BaselineModelLadder(ModelLadder): + """ + Baseline OLMo model ladder using the current recommended architecture. 
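+
+    A minimal usage sketch (mirroring ``src/test/ladder/baseline_test.py`` added in this patch)::
+
+        ladder = BaselineModelLadder()
+        ladder.validate()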
+ """ + + MBZ_SIZES = { + # TODO: may need to tune these + # =============================== + ModelSize.size_190M: 32 * 4096, + ModelSize.size_370M: 32 * 4096, + ModelSize.size_600M: 16 * 4096, + ModelSize.size_760M: 16 * 4096, + # =============================== + ModelSize.size_1B: 8 * 4096, + ModelSize.size_3B: 4 * 4096, + ModelSize.size_7B: 2 * 4096, + ModelSize.size_13B: 1 * 4096, + } + + MODEL_OVERRIDES = { + ModelSize.size_1B: dict(n_layers=16), # need to scale down our actual 1B model + } + + def get_model_config( + self, size: ModelSize, sequence_length: int, tokenizer: TokenizerConfig + ) -> TransformerConfig: + del sequence_length + return getattr(TransformerConfig, f"olmo_{size}")( + vocab_size=tokenizer.padded_vocab_size(), + compile=True, + dp_config=TransformerDataParallelConfig( + name=DataParallelType.fsdp, param_dtype=DType.bfloat16, reduce_dtype=DType.float32 + ), + **self.MODEL_OVERRIDES.get(size, {}), + ) + + def get_optim_config(self, size: ModelSize, sequence_length: int) -> OptimConfig: + # Calculate LR according to https://api.semanticscholar.org/CorpusID:270764838 + assert sequence_length in {2048, 4096} + model_size = self.get_model_config( + size, sequence_length, self.get_tokenizer_config() + ).num_non_embedding_params + lr = 0.0047 * (model_size / 108000000) ** (-1 / 3) + if sequence_length == 4096: + lr /= 4 + + return AdamWConfig( + lr=lr, + weight_decay=0.1, + betas=(0.9, 0.95), + group_overrides=[ + OptimGroupOverride(params=["embeddings.weight"], opts=dict(weight_decay=0.0)) + ], + fused=True, + ) + + def get_rank_microbatch_size(self, size: ModelSize, sequence_length: int, gpu_type: str) -> int: + del sequence_length, gpu_type # assuming we're running on 80GB GPUs + return self.MBZ_SIZES[size] diff --git a/src/olmo_core/ladder/ladder.py b/src/olmo_core/ladder/ladder.py new file mode 100644 index 00000000..18acddb8 --- /dev/null +++ b/src/olmo_core/ladder/ladder.py @@ -0,0 +1,153 @@ +from abc import ABCMeta, abstractmethod + +from olmo_core.config import StrEnum +from olmo_core.data import TokenizerConfig +from olmo_core.exceptions import OLMoConfigurationError +from olmo_core.nn.transformer import TransformerConfig +from olmo_core.optim import OptimConfig + + +class ModelSize(StrEnum): + """ + An enumeration of the standard model sizes in the ladder. + :class:`ModelLadder` implementations should produce models that match these sizes + as close as possible, ignoring embeddings. + """ + + size_190M = "190M" + """ + 190M parameters. + """ + size_370M = "370M" + """ + 370M parameters. + """ + size_600M = "600M" + """ + 600M parameters. + """ + size_760M = "760M" + """ + 760M parameters. + """ + size_1B = "1B" + """ + 1B parameters. + """ + size_3B = "3B" + """ + 3B parameters. + """ + size_7B = "7B" + """ + 7B parameters. + """ + size_13B = "13B" + """ + 13B parameters. + """ + + +class ModelLadder(metaclass=ABCMeta): + """ + Base class for defining model ladder experiments. + """ + + def get_tokenizer_config(self) -> TokenizerConfig: + """ + Get the tokenizer config to use throughput the ladder. + """ + return TokenizerConfig.dolma2() + + @abstractmethod + def get_model_config( + self, size: ModelSize, sequence_length: int, tokenizer: TokenizerConfig + ) -> TransformerConfig: + """ + Get the model config for a given model size. + + :param size: The target model size. + :param sequence_length: The sequence length to be used. 
+ """ + raise NotImplementedError + + @abstractmethod + def get_optim_config(self, size: ModelSize, sequence_length: int) -> OptimConfig: + """ + Get the optimizer config for a given model size. + + :param size: The target model size. + :param sequence_length: The sequence length to be used. + """ + raise NotImplementedError + + @abstractmethod + def get_rank_microbatch_size(self, size: ModelSize, sequence_length: int, gpu_type: str) -> int: + """ + Returns the micro-batch size in tokens per device that should be used for the given + model size. + + :param size: The target model size. + :param sequence_length: The sequence length to be used. + :param gpu_type: The type of GPU. + """ + raise NotImplementedError + + def get_global_batch_size( + self, size: ModelSize, sequence_length: int, batch_size_divisor: int = 64 + ) -> int: + """ + Get the global batch size in tokens for a given model size. + """ + # Calculate batch size according to https://api.semanticscholar.org/CorpusID:270764838, + # which assumes a sequence length of 2048. So adjust from there accordingly. + assert sequence_length in {2048, 4096, 8192} + seq_len_divisor = sequence_length // 2048 + + num_params = self.get_model_config( + size, sequence_length, self.get_tokenizer_config() + ).num_non_embedding_params + + global_batch_size = 160 * (num_params / 108000000) ** (2 / 3) + global_batch_size /= seq_len_divisor + global_batch_size /= batch_size_divisor + global_batch_size = round(global_batch_size) + global_batch_size *= batch_size_divisor + + return sequence_length * global_batch_size + + def validate(self): + """ + Validate the ladder configuration. + + :raises OLMoConfigurationError: If the ladder has any issues. + """ + tokenizer = self.get_tokenizer_config() + for size in ModelSize: + target_size = int(size[:-1]) + if size.endswith("M"): + target_size = target_size * 10**6 + elif size.endswith("B"): + target_size = target_size * 10**9 + else: + raise NotImplementedError(size) + + for sequence_length in (2048, 4096): + model_config = self.get_model_config(size, sequence_length, tokenizer) + + # Make sure actual model size is close to target size. 
+ num_params = model_config.num_non_embedding_params + if abs(num_params - target_size) / target_size > 0.05: + raise OLMoConfigurationError( + f"Model size of {num_params:,d} for sequence length {sequence_length} is " + f"too far from target size of {size}: {model_config}" + ) + + self.get_optim_config(size, sequence_length) + self.get_rank_microbatch_size(size, sequence_length, "H100") + bz_tokens = self.get_global_batch_size(size, sequence_length) + if bz_tokens % sequence_length != 0: + raise OLMoConfigurationError( + f"Batch size of {bz_tokens:,d} tokens for model size {size} " + f"must be divisible by the sequence length ({sequence_length})" + ) diff --git a/src/olmo_core/nn/transformer/config.py b/src/olmo_core/nn/transformer/config.py index 6a48e508..b724b9be 100644 --- a/src/olmo_core/nn/transformer/config.py +++ b/src/olmo_core/nn/transformer/config.py @@ -292,6 +292,62 @@ def num_flops_per_token(self, seq_len: int) -> int: return flop_per_token + @classmethod + def olmo_190M(cls, vocab_size: int, **kwargs) -> "TransformerConfig": + return cls.llama_like( + d_model=768, + hidden_size_multiplier=1.5, + n_layers=kwargs.pop("n_layers", 12), + n_heads=kwargs.pop("n_heads", 12), + vocab_size=vocab_size, + block_name=kwargs.pop("block_name", TransformerBlockType.reordered_norm), + qk_norm=kwargs.pop("qk_norm", True), + rope_theta=kwargs.pop("rope_theta", 500_000), + layer_norm_eps=1e-6, + ) + + @classmethod + def olmo_370M(cls, vocab_size: int, **kwargs) -> "TransformerConfig": + return cls.llama_like( + d_model=1024, + hidden_size_multiplier=1.4, + n_layers=kwargs.pop("n_layers", 16), + n_heads=kwargs.pop("n_heads", 16), + vocab_size=vocab_size, + block_name=kwargs.pop("block_name", TransformerBlockType.reordered_norm), + qk_norm=kwargs.pop("qk_norm", True), + rope_theta=kwargs.pop("rope_theta", 500_000), + layer_norm_eps=1e-6, + ) + + @classmethod + def olmo_600M(cls, vocab_size: int, **kwargs) -> "TransformerConfig": + return cls.llama_like( + d_model=1344, + hidden_size_multiplier=1.5, + n_layers=kwargs.pop("n_layers", 16), + n_heads=kwargs.pop("n_heads", 16), + vocab_size=vocab_size, + block_name=kwargs.pop("block_name", TransformerBlockType.reordered_norm), + qk_norm=kwargs.pop("qk_norm", True), + rope_theta=kwargs.pop("rope_theta", 500_000), + layer_norm_eps=1e-6, + ) + + @classmethod + def olmo_760M(cls, vocab_size: int, **kwargs) -> "TransformerConfig": + return cls.llama_like( + d_model=1536, + hidden_size_multiplier=1.5, + n_layers=kwargs.pop("n_layers", 16), + n_heads=kwargs.pop("n_heads", 16), + vocab_size=vocab_size, + block_name=kwargs.pop("block_name", TransformerBlockType.reordered_norm), + qk_norm=kwargs.pop("qk_norm", True), + rope_theta=kwargs.pop("rope_theta", 500_000), + layer_norm_eps=1e-6, + ) + @classmethod def olmo_1B(cls, vocab_size: int, **kwargs) -> "TransformerConfig": """ @@ -306,6 +362,20 @@ def olmo_1B(cls, vocab_size: int, **kwargs) -> "TransformerConfig": **kwargs, ) + @classmethod + def olmo_3B(cls, vocab_size: int, **kwargs) -> "TransformerConfig": + return cls.llama_like( + d_model=3328, + hidden_size_multiplier=1.4, + n_layers=kwargs.pop("n_layers", 16), + n_heads=kwargs.pop("n_heads", 16), + vocab_size=vocab_size, + block_name=kwargs.pop("block_name", TransformerBlockType.reordered_norm), + qk_norm=kwargs.pop("qk_norm", True), + rope_theta=kwargs.pop("rope_theta", 500_000), + layer_norm_eps=1e-6, + ) + @classmethod def olmo_7B(cls, vocab_size: int, **kwargs) -> "TransformerConfig": """ diff --git a/src/test/ladder/__init__.py 
b/src/test/ladder/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/test/ladder/baseline_test.py b/src/test/ladder/baseline_test.py new file mode 100644 index 00000000..392f3892 --- /dev/null +++ b/src/test/ladder/baseline_test.py @@ -0,0 +1,6 @@ +from olmo_core.ladder.baseline import BaselineModelLadder + + +def test_validate_baseline_model_ladder(): + ladder = BaselineModelLadder() + ladder.validate() From 0799eb030ca1421817618cfc2998d738452539dd Mon Sep 17 00:00:00 2001 From: epwalsh Date: Mon, 25 Nov 2024 13:22:31 -0800 Subject: [PATCH 02/25] reorganize --- docs/source/index.rst | 2 +- docs/source/ladder.rst | 5 - docs/source/model_ladder.rst | 5 + src/olmo_core/internal/experiment.py | 2 +- .../baseline.py => internal/model_ladder.py} | 35 +- src/olmo_core/ladder/__init__.py | 4 - src/olmo_core/ladder/ladder.py | 153 ------ src/olmo_core/model_ladder.py | 455 ++++++++++++++++++ src/test/{ladder => internal}/__init__.py | 0 src/test/internal/model_ladder_test.py | 11 + src/test/ladder/baseline_test.py | 6 - src/test/model_ladder_test.py | 6 + 12 files changed, 495 insertions(+), 189 deletions(-) delete mode 100644 docs/source/ladder.rst create mode 100644 docs/source/model_ladder.rst rename src/olmo_core/{ladder/baseline.py => internal/model_ladder.py} (65%) delete mode 100644 src/olmo_core/ladder/__init__.py delete mode 100644 src/olmo_core/ladder/ladder.py create mode 100644 src/olmo_core/model_ladder.py rename src/test/{ladder => internal}/__init__.py (100%) create mode 100644 src/test/internal/model_ladder_test.py delete mode 100644 src/test/ladder/baseline_test.py create mode 100644 src/test/model_ladder_test.py diff --git a/docs/source/index.rst b/docs/source/index.rst index 617671f3..e1bf0ee7 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -51,8 +51,8 @@ specific to your environment. Then you can install OLMo-core from PyPI with: exceptions float8 io - ladder launch + model_ladder nn/index optim train/index diff --git a/docs/source/ladder.rst b/docs/source/ladder.rst deleted file mode 100644 index 8eb5407c..00000000 --- a/docs/source/ladder.rst +++ /dev/null @@ -1,5 +0,0 @@ -``ladder`` -========== - -.. automodule:: olmo_core.ladder - :members: diff --git a/docs/source/model_ladder.rst b/docs/source/model_ladder.rst new file mode 100644 index 00000000..71ac20c4 --- /dev/null +++ b/docs/source/model_ladder.rst @@ -0,0 +1,5 @@ +``model_ladder`` +================ + +.. automodule:: olmo_core.model_ladder + :members: diff --git a/src/olmo_core/internal/experiment.py b/src/olmo_core/internal/experiment.py index 97649ed0..be3da2a3 100644 --- a/src/olmo_core/internal/experiment.py +++ b/src/olmo_core/internal/experiment.py @@ -345,7 +345,7 @@ def train(config: ExperimentConfig): data_loader = config.data_loader.build(dataset) trainer = config.trainer.build(model, optim, data_loader) - # Record the config to W&B and each checkpoint dir. + # Record the config to W&B/Comet and each checkpoint dir. 
config_dict = config.as_config_dict() cast(CometCallback, trainer.callbacks["comet"]).config = config_dict cast(WandBCallback, trainer.callbacks["wandb"]).config = config_dict diff --git a/src/olmo_core/ladder/baseline.py b/src/olmo_core/internal/model_ladder.py similarity index 65% rename from src/olmo_core/ladder/baseline.py rename to src/olmo_core/internal/model_ladder.py index e1b16ed9..9ff965d0 100644 --- a/src/olmo_core/ladder/baseline.py +++ b/src/olmo_core/internal/model_ladder.py @@ -1,18 +1,20 @@ +from dataclasses import dataclass +from typing import Any, ClassVar, Dict + from olmo_core.config import DType -from olmo_core.data import TokenizerConfig from olmo_core.distributed.parallel import DataParallelType +from olmo_core.model_ladder import ModelLadder, ModelSize from olmo_core.nn.transformer import TransformerConfig, TransformerDataParallelConfig from olmo_core.optim import AdamWConfig, OptimConfig, OptimGroupOverride -from .ladder import ModelLadder, ModelSize - +@dataclass class BaselineModelLadder(ModelLadder): """ Baseline OLMo model ladder using the current recommended architecture. """ - MBZ_SIZES = { + MBZ_SIZES: ClassVar[Dict[ModelSize, int]] = { # TODO: may need to tune these # =============================== ModelSize.size_190M: 32 * 4096, @@ -26,16 +28,14 @@ class BaselineModelLadder(ModelLadder): ModelSize.size_13B: 1 * 4096, } - MODEL_OVERRIDES = { + MODEL_OVERRIDES: ClassVar[Dict[ModelSize, Dict[str, Any]]] = { ModelSize.size_1B: dict(n_layers=16), # need to scale down our actual 1B model } - def get_model_config( - self, size: ModelSize, sequence_length: int, tokenizer: TokenizerConfig - ) -> TransformerConfig: - del sequence_length + def get_model_config(self, *, size: ModelSize) -> TransformerConfig: return getattr(TransformerConfig, f"olmo_{size}")( - vocab_size=tokenizer.padded_vocab_size(), + vocab_size=self.tokenizer.padded_vocab_size(), + init_seed=self.init_seed, compile=True, dp_config=TransformerDataParallelConfig( name=DataParallelType.fsdp, param_dtype=DType.bfloat16, reduce_dtype=DType.float32 @@ -43,14 +43,11 @@ def get_model_config( **self.MODEL_OVERRIDES.get(size, {}), ) - def get_optim_config(self, size: ModelSize, sequence_length: int) -> OptimConfig: + def get_optim_config(self, *, size: ModelSize) -> OptimConfig: # Calculate LR according to https://api.semanticscholar.org/CorpusID:270764838 - assert sequence_length in {2048, 4096} - model_size = self.get_model_config( - size, sequence_length, self.get_tokenizer_config() - ).num_non_embedding_params - lr = 0.0047 * (model_size / 108000000) ** (-1 / 3) - if sequence_length == 4096: + assert self.sequence_length in {2048, 4096} + lr = 0.0047 * (size.num_params / 108000000) ** (-1 / 3) + if self.sequence_length == 4096: lr /= 4 return AdamWConfig( @@ -63,6 +60,6 @@ def get_optim_config(self, size: ModelSize, sequence_length: int) -> OptimConfig fused=True, ) - def get_rank_microbatch_size(self, size: ModelSize, sequence_length: int, gpu_type: str) -> int: - del sequence_length, gpu_type # assuming we're running on 80GB GPUs + def get_rank_microbatch_size(self, *, size: ModelSize, gpu_type: str) -> int: + assert "h100" in gpu_type.lower() return self.MBZ_SIZES[size] diff --git a/src/olmo_core/ladder/__init__.py b/src/olmo_core/ladder/__init__.py deleted file mode 100644 index 879e4d83..00000000 --- a/src/olmo_core/ladder/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from .baseline import BaselineModelLadder -from .ladder import ModelLadder, ModelSize - -__all__ = ["ModelSize", "ModelLadder", 
"BaselineModelLadder"] diff --git a/src/olmo_core/ladder/ladder.py b/src/olmo_core/ladder/ladder.py deleted file mode 100644 index 18acddb8..00000000 --- a/src/olmo_core/ladder/ladder.py +++ /dev/null @@ -1,153 +0,0 @@ -from abc import ABCMeta, abstractmethod - -from olmo_core.config import StrEnum -from olmo_core.data import TokenizerConfig -from olmo_core.exceptions import OLMoConfigurationError -from olmo_core.nn.transformer import TransformerConfig -from olmo_core.optim import OptimConfig - - -class ModelSize(StrEnum): - """ - An enumeration of the standard model sizes in the ladder. - :class:`ModelLadder` implementations should produce models that match these sizes - as close as possible, ignoring embeddings. - """ - - size_190M = "190M" - """ - 190M parameters. - """ - size_370M = "370M" - """ - 370M parameters. - """ - size_600M = "600M" - """ - 600M parameters. - """ - size_760M = "760M" - """ - 760M parameters. - """ - size_1B = "1B" - """ - 1B parameters. - """ - size_3B = "3B" - """ - 3B parameters. - """ - size_7B = "7B" - """ - 7B parameters. - """ - size_13B = "13B" - """ - 13B parameters. - """ - - -class ModelLadder(metaclass=ABCMeta): - """ - Base class for defining model ladder experiments. - """ - - def get_tokenizer_config(self) -> TokenizerConfig: - """ - Get the tokenizer config to use throughput the ladder. - """ - return TokenizerConfig.dolma2() - - @abstractmethod - def get_model_config( - self, size: ModelSize, sequence_length: int, tokenizer: TokenizerConfig - ) -> TransformerConfig: - """ - Get the model config for a given model size. - - :param size: The target model size. - :param sequence_length: The sequence length to be used. - """ - raise NotImplementedError - - @abstractmethod - def get_optim_config(self, size: ModelSize, sequence_length: int) -> OptimConfig: - """ - Get the optimizer config for a given model size. - - :param size: The target model size. - :param sequence_length: The sequence length to be used. - """ - raise NotImplementedError - - @abstractmethod - def get_rank_microbatch_size(self, size: ModelSize, sequence_length: int, gpu_type: str) -> int: - """ - Returns the micro-batch size in tokens per device that should be used for the given - model size. - - :param size: The target model size. - :param sequence_length: The sequence length to be used. - :param gpu_type: The type of GPU. - """ - raise NotImplementedError - - def get_global_batch_size( - self, size: ModelSize, sequence_length: int, batch_size_divisor: int = 64 - ) -> int: - """ - Get the global batch size in tokens for a given model size. - """ - # Calculate batch size according to https://api.semanticscholar.org/CorpusID:270764838, - # which assumes a sequence length of 2048. So adjust from there accordingly. - assert sequence_length in {2048, 4096, 8192} - seq_len_divisor = sequence_length // 2048 - - num_params = self.get_model_config( - size, sequence_length, self.get_tokenizer_config() - ).num_non_embedding_params - - global_batch_size = 160 * (num_params / 108000000) ** (2 / 3) - global_batch_size /= seq_len_divisor - global_batch_size /= batch_size_divisor - global_batch_size = round(global_batch_size) - global_batch_size *= batch_size_divisor - - return sequence_length * global_batch_size - - def validate(self): - """ - Validate the ladder configuration. - - :raises OLMoConfigurationError: If the ladder has any issues. 
- """ - tokenizer = self.get_tokenizer_config() - for size in ModelSize: - target_size = int(size[:-1]) - if size.endswith("M"): - target_size = target_size * 10**6 - elif size.endswith("B"): - target_size = target_size * 10**9 - else: - raise NotImplementedError(size) - - for sequence_length in (2048, 4096): - model_config = self.get_model_config(size, sequence_length, tokenizer) - - # Make sure actual model size is close to target size. - num_params = model_config.num_non_embedding_params - if abs(num_params - target_size) / target_size > 0.05: - raise OLMoConfigurationError( - f"Model size of {num_params:,d} for sequence length {sequence_length} is " - f"too far from target size of {size}: {model_config}" - ) - - self.get_optim_config(size, sequence_length) - self.get_rank_microbatch_size(size, sequence_length, "H100") - bz_tokens = self.get_global_batch_size(size, sequence_length) - if bz_tokens % sequence_length != 0: - raise OLMoConfigurationError( - f"Batch size of {bz_tokens:,d} tokens for model size {size} " - f"must be divisible by the sequence length ({sequence_length})" - ) diff --git a/src/olmo_core/model_ladder.py b/src/olmo_core/model_ladder.py new file mode 100644 index 00000000..5cc993e2 --- /dev/null +++ b/src/olmo_core/model_ladder.py @@ -0,0 +1,455 @@ +""" +Configuration classes for defining model ladder scaling ablations. +""" + +from abc import ABCMeta, abstractmethod +from dataclasses import dataclass, field +from typing import Optional, cast + +import torch +from torch.distributed.device_mesh import DeviceMesh + +from olmo_core.config import Config, StrEnum +from olmo_core.data import ( + DataMix, + NumpyDataLoaderConfig, + NumpyDatasetConfig, + NumpyDatasetType, + TokenizerConfig, +) +from olmo_core.distributed.utils import ( + get_num_nodes, + get_world_size, + init_hybrid_shard_mesh, +) +from olmo_core.exceptions import OLMoConfigurationError +from olmo_core.io import is_url +from olmo_core.nn.transformer import TransformerConfig +from olmo_core.optim import CosWithWarmup, OptimConfig +from olmo_core.train import ( + Duration, + TrainerConfig, + prepare_training_environment, + teardown_training_environment, +) +from olmo_core.train.callbacks import ( + CheckpointerCallback, + CometCallback, + ConfigSaverCallback, + DownstreamEvaluatorCallbackConfig, + GarbageCollectorCallback, + GPUMemoryMonitorCallback, + GradClipperCallback, + LMEvaluatorCallbackConfig, + SchedulerCallback, + WandBCallback, +) +from olmo_core.utils import get_default_device, seed_all + +__all__ = ["ModelSize", "ModelLadder"] + + +class ModelSize(StrEnum): + """ + An enumeration of the standard model sizes in the ladder. + :class:`ModelLadder` implementations should produce models that match these sizes + as close as possible, ignoring embeddings. + """ + + size_190M = "190M" + """ + 190M parameters. + """ + size_370M = "370M" + """ + 370M parameters. + """ + size_600M = "600M" + """ + 600M parameters. + """ + size_760M = "760M" + """ + 760M parameters. + """ + size_1B = "1B" + """ + 1B parameters. + """ + size_3B = "3B" + """ + 3B parameters. + """ + size_7B = "7B" + """ + 7B parameters. + """ + size_13B = "13B" + """ + 13B parameters. + """ + + @property + def num_params(self) -> int: + value, unit = int(self[:-1]), self[-1] + if unit == "M": + return value * int(1e6) + elif unit == "B": + return value * int(1e9) + else: + raise NotImplementedError(self) + + +@dataclass +class ModelLadder(Config, metaclass=ABCMeta): + """ + Base class for defining model ladder experiments. 
+ + At a minimum subclasses must implement: + + - :meth:`get_model_config()` + - :meth:`get_optim_config()` + - :meth:`get_rank_microbatch_size()` + + for every model size defined by :class:`ModelSize`. + """ + + name: str + """ + The name of the ladder runs. + """ + + project: str + """ + The name of the W&B/Comet project to save run data to. + """ + + root_dir: str + """ + The root directory. Defines where to find the data mix paths and where to save checkpoints to. + """ + + sequence_length: int = 2048 + """ + The target sequence length to train the ladder on. + """ + + tokenizer: TokenizerConfig = field(default_factory=TokenizerConfig.dolma2) + """ + Get the tokenizer config to use throughput the ladder. + """ + + init_seed: int = 2352 + """ + The seed to use when first initializing RNG states. + """ + + data_mix: DataMix = DataMix.OLMoE_mix_0824 + """ + The data mix to train on. + """ + + data_seed: int = 34521 + """ + The seed to use for shuffling the data. + """ + + @property + def work_dir(self) -> str: + """ + The working directory used for dataset caching. + """ + return ( + "./dataset-cache" + if is_url(self.root_dir) + else f"{self.root_dir}/checkpoints/{self._get_beaker_username() or 'OLMo-core'}/dataset-cache" + ) + + def get_save_folder(self, size: ModelSize) -> str: + """ + The local or remote folder to save checkpoints to. + Should be unique for the ladder config and model size. + """ + return f"{self.root_dir}/checkpoints/{self._get_beaker_username() or 'OLMo-core'}/{self.name}-{size}" + + @abstractmethod + def get_model_config(self, *, size: ModelSize) -> TransformerConfig: + """ + Get the model config for a given model size. + + :param size: The target model size. + """ + raise NotImplementedError + + @abstractmethod + def get_optim_config(self, *, size: ModelSize) -> OptimConfig: + """ + Get the optimizer config for a given model size. + + :param size: The target model size. + """ + raise NotImplementedError + + def get_dataset_config(self) -> NumpyDatasetConfig: + """ + Get the train dataset config. + + :param sequence_length: The sequence length to be used. + """ + return NumpyDatasetConfig.from_data_mix( + self.data_mix, + tokenizer=self.tokenizer, + mix_base_root=self.root_dir, + sequence_length=self.sequence_length, + work_dir=self.work_dir, + ) + + def get_data_loader_config( + self, *, size: ModelSize, dp_world_size: int + ) -> NumpyDataLoaderConfig: + """ + Get the data loader config. + + :param size: The target model size. + :param dp_world_size: The data parallel world size for training. + """ + return NumpyDataLoaderConfig( + global_batch_size=self.get_global_batch_size(size=size, dp_world_size=dp_world_size), + seed=self.data_seed, + num_workers=4, + ) + + @abstractmethod + def get_rank_microbatch_size(self, *, size: ModelSize, gpu_type: str) -> int: + """ + Returns the micro-batch size in tokens per device that should be used for the given + model size. + + :param size: The target model size. + :param gpu_type: The type of GPU as given by ``torch.cuda.get_device_name()``. + """ + raise NotImplementedError + + def get_global_batch_size(self, *, size: ModelSize, dp_world_size: int = 64) -> int: + """ + Get the global batch size in tokens for a given model size. + + :param size: The target model size. + :param dp_world_size: The data parallel world size for training. + """ + # Calculate batch size according to https://api.semanticscholar.org/CorpusID:270764838, + # which assumes a sequence length of 2048. So adjust from there accordingly. 
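+        # For example (a rough back-of-the-envelope): for the 190M rung at sequence length 2048
+        # with dp_world_size=64, 160 * (190e6 / 108e6) ** (2 / 3) ~= 233 sequences, which rounds
+        # to a multiple of the DP world size (here 256), i.e. 256 * 2048 = 524,288 tokens.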
+ assert self.sequence_length in {2048, 4096, 8192} + seq_len_divisor = self.sequence_length // 2048 + + global_batch_size = 160 * (size.num_params / 108000000) ** (2 / 3) + global_batch_size /= seq_len_divisor + global_batch_size /= dp_world_size + global_batch_size = round(global_batch_size) + global_batch_size *= dp_world_size + + return self.sequence_length * global_batch_size + + def get_duration(self, size: ModelSize) -> Duration: + """ + Get the duration to train for given the model size. Defaults to 2 x Chinchilla optimal. + + :param size: The target model size. + """ + return Duration.tokens(2 * 20 * size.num_params) + + def get_dp_mesh(self, *, size: ModelSize) -> Optional[DeviceMesh]: + """ + Get the data parallel device mesh. Could be a 2D mesh for HSDP, or just none or FSDP/DDP. + """ + if get_num_nodes() == 1 or size.num_params < 1e9: + return None + else: + return init_hybrid_shard_mesh() + + def get_trainer_config( + self, + *, + size: ModelSize, + gpu_type: str, + ) -> TrainerConfig: + """ + Build the trainer config. + + :param size: The target model size. + :param gpu_type: The type of GPU as given by ``torch.cuda.get_device_name()``. + """ + rank_mbz = self.get_rank_microbatch_size(size=size, gpu_type=gpu_type) + if rank_mbz % self.sequence_length != 0: + raise OLMoConfigurationError( + f"rank micro-batch size ({rank_mbz:,d} tokens) must be divisible " + f"by the sequence length ({self.sequence_length:,d})" + ) + + return ( + TrainerConfig( + save_folder=self.get_save_folder(size), + rank_microbatch_size=rank_mbz, + metrics_collect_interval=10, + cancel_check_interval=1, + compile_loss=True, + max_duration=self.get_duration(size), + ) + .with_callback( + "lr_scheduler", SchedulerCallback(scheduler=CosWithWarmup(warmup_steps=2000)) + ) + .with_callback("gpu_monitor", GPUMemoryMonitorCallback()) + .with_callback("grad_clipper", GradClipperCallback(max_grad_norm=1.0)) + .with_callback("config_saver", ConfigSaverCallback()) + .with_callback("garbage_collector", GarbageCollectorCallback()) + .with_callback( + "lm_evaluator", + LMEvaluatorCallbackConfig( + eval_dataset=NumpyDatasetConfig.from_data_mix( + DataMix.v3_small_ppl_validation, + name=NumpyDatasetType.padded_fsl, + mix_base_dir=self.root_dir, + sequence_length=self.sequence_length, + tokenizer=self.tokenizer, + work_dir=self.work_dir, + ), + eval_interval=1000, + ), + ) + .with_callback( + "downstream_evaluator", + DownstreamEvaluatorCallbackConfig( + tasks=["hellaswag"], # TODO: which other tasks? + tokenizer=self.tokenizer, + eval_interval=250, + ), + ) + .with_callback( + "checkpointer", + CheckpointerCallback( + save_interval=100_000, # large enough value that we won't save until the end + ephemeral_save_interval=250, + save_async=True, + ), + ) + .with_callback( + "comet", + CometCallback( + name=f"{self.name}-{size}", + workspace="ai2", + project=self.project, + enabled=True, + cancel_check_interval=5, + ), + ) + .with_callback( + "wandb", + WandBCallback( + name=f"{self.name}-{size}", + entity="ai2", + project=self.project, + enabled=False, + cancel_check_interval=5, + ), + ) + ) + + def train(self, size: ModelSize): + """ + Run the ladder at the given size. + + .. note:: + This will call :func:`~olmo_core.train.prepare_training_environment()`, so there's no + need to call that before. + """ + prepare_training_environment() + + gpu_type = torch.cuda.get_device_name() + dp_world_size = get_world_size() + + try: + seed_all(self.init_seed) + + # Get configs. 
+ model_config = self.get_model_config(size=size) + optim_config = self.get_optim_config(size=size) + dataset_config = self.get_dataset_config() + data_loader_config = self.get_data_loader_config(size=size, dp_world_size=dp_world_size) + trainer_config = self.get_trainer_config(size=size, gpu_type=gpu_type) + + # Build components. + model = model_config.build( + init_device="meta", + device=get_default_device(), + max_seq_len=self.sequence_length, + dp_mesh=self.get_dp_mesh(size=size), + ) + optim = optim_config.build(model) + dataset = dataset_config.build() + data_loader = data_loader_config.build(dataset) + trainer = trainer_config.build(model, optim, data_loader) + + # Record the config to W&B/Comet and each checkpoint dir. + config_dict = self.as_config_dict() + config_dict.update( + dict( + model=model_config, + optim=optim_config, + dataset=dataset_config, + data_loader=data_loader_config, + trainer=trainer_config, + ) + ) + cast(CometCallback, trainer.callbacks["comet"]).config = config_dict + cast(WandBCallback, trainer.callbacks["wandb"]).config = config_dict + cast(ConfigSaverCallback, trainer.callbacks["config_saver"]).config = config_dict + + # Train. + trainer.fit() + finally: + teardown_training_environment() + + def validate(self): + """ + Validate the ladder configuration. + + :raises OLMoConfigurationError: If the ladder has any issues. + """ + for size in ModelSize: + target_size = int(size[:-1]) + if size.endswith("M"): + target_size = target_size * 10**6 + elif size.endswith("B"): + target_size = target_size * 10**9 + else: + raise NotImplementedError(size) + + model_config = self.get_model_config(size=size) + + # Make sure actual model size is close to target size. + num_params = model_config.num_non_embedding_params + if abs(num_params - target_size) / target_size > 0.05: + raise OLMoConfigurationError( + f"Model size of {num_params:,d} for sequence length {self.sequence_length} is " + f"too far from target size of {size}: {model_config}" + ) + + self.get_optim_config(size=size) + self.get_rank_microbatch_size(size=size, gpu_type="H100") + bz_tokens = self.get_global_batch_size(size=size) + if bz_tokens % self.sequence_length != 0: + raise OLMoConfigurationError( + f"Batch size of {bz_tokens:,d} tokens for model size {size} " + f"must be divisible by the sequence length ({self.sequence_length})" + ) + + def _get_beaker_username(self) -> Optional[str]: + try: + from beaker import Beaker, BeakerError + except ImportError: + return None + + try: + return Beaker.from_env().account.whoami().name + except BeakerError: + return None diff --git a/src/test/ladder/__init__.py b/src/test/internal/__init__.py similarity index 100% rename from src/test/ladder/__init__.py rename to src/test/internal/__init__.py diff --git a/src/test/internal/model_ladder_test.py b/src/test/internal/model_ladder_test.py new file mode 100644 index 00000000..140083ad --- /dev/null +++ b/src/test/internal/model_ladder_test.py @@ -0,0 +1,11 @@ +import pytest + +from olmo_core.ladder.baseline import BaselineModelLadder + + +@pytest.mark.parametrize("sequence_length", [2048, 4096]) +def test_validate_baseline_model_ladder(tmp_path, sequence_length): + ladder = BaselineModelLadder( + name="baseline", project="ladder", root_dir=tmp_path, sequence_length=sequence_length + ) + ladder.validate() diff --git a/src/test/ladder/baseline_test.py b/src/test/ladder/baseline_test.py deleted file mode 100644 index 392f3892..00000000 --- a/src/test/ladder/baseline_test.py +++ /dev/null @@ -1,6 +0,0 @@ -from 
olmo_core.ladder.baseline import BaselineModelLadder - - -def test_validate_baseline_model_ladder(): - ladder = BaselineModelLadder() - ladder.validate() diff --git a/src/test/model_ladder_test.py b/src/test/model_ladder_test.py new file mode 100644 index 00000000..f7bfe797 --- /dev/null +++ b/src/test/model_ladder_test.py @@ -0,0 +1,6 @@ +from olmo_core.ladder import ModelSize + + +def test_model_size_num_params(): + assert ModelSize.size_190M.num_params == 190_000_000 + assert ModelSize.size_7B.num_params == 7_000_000_000 From 8286ff7e606a53da7b0a9151c711def3b855e137 Mon Sep 17 00:00:00 2001 From: epwalsh Date: Mon, 25 Nov 2024 15:47:19 -0800 Subject: [PATCH 03/25] big refactor --- src/olmo_core/config.py | 22 +- src/olmo_core/internal/common.py | 14 ++ src/olmo_core/internal/experiment.py | 9 +- src/olmo_core/internal/model_ladder.py | 265 +++++++++++++++++++------ src/olmo_core/model_ladder.py | 152 +++++--------- src/scripts/train/ladder_baseline.py | 88 ++++++++ src/test/config_test.py | 32 +++ 7 files changed, 417 insertions(+), 165 deletions(-) create mode 100644 src/olmo_core/internal/common.py create mode 100644 src/scripts/train/ladder_baseline.py diff --git a/src/olmo_core/config.py b/src/olmo_core/config.py index 27f509a5..d6ef4f12 100644 --- a/src/olmo_core/config.py +++ b/src/olmo_core/config.py @@ -158,14 +158,32 @@ def validate(self): """ pass - def merge(self, dotlist: List[str]) -> Self: + def merge(self, dotlist: List[str], prefix: Optional[str] = None, strict: bool = True) -> Self: """ Merge self with fields from a "dotlist", creating a new object. :param dotlist: A list of field attributes with dot notation, e.g. ``foo.bar=1``. + :param prefix: Only use override items in the dotlist that start with a given prefix name, + and strip that prefix (including the subsequent ".") before applying the overrides. + :param strict: Parse the dotlist strictly. 
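+
+        For example (an illustrative sketch of how the ladder script in this patch series applies
+        CLI overrides), only the items under the ``ladder.`` prefix are kept and applied::
+
+            ladder = ladder.merge(
+                ["--ladder.sequence_length=4096", "--launch.num_nodes=2"], prefix="ladder"
+            )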
""" try: - merge_fields = om.from_dotlist(_clean_opts(dotlist)) + dotlist = _clean_opts(dotlist) + if prefix is not None: + dotlist = [o.lstrip(f"{prefix}.") for o in dotlist if o.startswith(f"{prefix}.")] + if not strict: + field_names = set(f.name for f in fields(self)) + dotlist = [ + o + for o in dotlist + if any( + [ + o.startswith(f"{name}=") or o.startswith(f"{name}.") + for name in field_names + ] + ) + ] + merge_fields = om.from_dotlist(dotlist) merged = om.merge(self, merge_fields) out = cast(Self, om.to_object(merged)) out.apply(lambda c: c.validate()) diff --git a/src/olmo_core/internal/common.py b/src/olmo_core/internal/common.py new file mode 100644 index 00000000..c4cc46ea --- /dev/null +++ b/src/olmo_core/internal/common.py @@ -0,0 +1,14 @@ +from typing import Optional + +from beaker import Beaker + +_BEAKER_USERNAME: Optional[str] = None + + +def get_beaker_username() -> str: + global _BEAKER_USERNAME + + if _BEAKER_USERNAME is None: + _BEAKER_USERNAME = Beaker.from_env().account.whoami().name + + return _BEAKER_USERNAME diff --git a/src/olmo_core/internal/experiment.py b/src/olmo_core/internal/experiment.py index be3da2a3..3af64f7d 100644 --- a/src/olmo_core/internal/experiment.py +++ b/src/olmo_core/internal/experiment.py @@ -3,7 +3,6 @@ from dataclasses import dataclass from typing import Callable, Dict, List, Optional, cast -from beaker import Beaker from rich import print from torch.distributed.device_mesh import DeviceMesh @@ -53,6 +52,8 @@ seed_all, ) +from .common import get_beaker_username + log = logging.getLogger(__name__) @@ -109,7 +110,7 @@ def prepare_environment(self): elif self == SubCmd.train: prepare_training_environment() else: - raise NotADirectoryError(self) + raise NotImplementedError(self) def run(self, config: ExperimentConfig): print(config) @@ -133,7 +134,7 @@ def run(self, config: ExperimentConfig): elif self == SubCmd.launch_prep: launch_prep(config) else: - raise NotADirectoryError(self) + raise NotImplementedError(self) def build_common_components( @@ -153,7 +154,7 @@ def build_common_components( elif "augusta" in cluster: root_dir = "gs://ai2-llm" - beaker_user = (Beaker.from_env().account.whoami().name).upper() + beaker_user = get_beaker_username() cmd_to_launch = SubCmd.train if cmd == SubCmd.launch_prep: cmd_to_launch = SubCmd.prep diff --git a/src/olmo_core/internal/model_ladder.py b/src/olmo_core/internal/model_ladder.py index 9ff965d0..377be0fb 100644 --- a/src/olmo_core/internal/model_ladder.py +++ b/src/olmo_core/internal/model_ladder.py @@ -1,65 +1,218 @@ +import sys from dataclasses import dataclass -from typing import Any, ClassVar, Dict +from typing import Callable, List, cast -from olmo_core.config import DType -from olmo_core.distributed.parallel import DataParallelType +from beaker import Beaker +from rich import print + +from olmo_core.config import Config, StrEnum +from olmo_core.data import NumpyDataLoaderConfig, NumpyDatasetConfig +from olmo_core.io import is_url +from olmo_core.launch.beaker import ( + BeakerEnvSecret, + BeakerLaunchConfig, + BeakerWekaBucket, + OLMoCoreBeakerImage, +) from olmo_core.model_ladder import ModelLadder, ModelSize -from olmo_core.nn.transformer import TransformerConfig, TransformerDataParallelConfig -from olmo_core.optim import AdamWConfig, OptimConfig, OptimGroupOverride +from olmo_core.nn.transformer import TransformerConfig +from olmo_core.optim import OptimConfig +from olmo_core.train import ( + TrainerConfig, + prepare_training_environment, + teardown_training_environment, +) +from 
olmo_core.train.callbacks import CometCallback, ConfigSaverCallback, WandBCallback +from olmo_core.utils import ( + generate_uuid, + get_default_device, + prepare_cli_environment, + seed_all, +) @dataclass -class BaselineModelLadder(ModelLadder): - """ - Baseline OLMo model ladder using the current recommended architecture. - """ - - MBZ_SIZES: ClassVar[Dict[ModelSize, int]] = { - # TODO: may need to tune these - # =============================== - ModelSize.size_190M: 32 * 4096, - ModelSize.size_370M: 32 * 4096, - ModelSize.size_600M: 16 * 4096, - ModelSize.size_760M: 16 * 4096, - # =============================== - ModelSize.size_1B: 8 * 4096, - ModelSize.size_3B: 4 * 4096, - ModelSize.size_7B: 2 * 4096, - ModelSize.size_13B: 1 * 4096, - } - - MODEL_OVERRIDES: ClassVar[Dict[ModelSize, Dict[str, Any]]] = { - ModelSize.size_1B: dict(n_layers=16), # need to scale down our actual 1B model - } - - def get_model_config(self, *, size: ModelSize) -> TransformerConfig: - return getattr(TransformerConfig, f"olmo_{size}")( - vocab_size=self.tokenizer.padded_vocab_size(), - init_seed=self.init_seed, - compile=True, - dp_config=TransformerDataParallelConfig( - name=DataParallelType.fsdp, param_dtype=DType.bfloat16, reduce_dtype=DType.float32 - ), - **self.MODEL_OVERRIDES.get(size, {}), - ) +class LadderRunConfig(Config): + launch: BeakerLaunchConfig + ladder: ModelLadder + model: TransformerConfig + optim: OptimConfig + dataset: NumpyDatasetConfig + data_loader: NumpyDataLoaderConfig + trainer: TrainerConfig + + +class SubCmd(StrEnum): + launch = "launch" + train = "train" + dry_run = "dry_run" + + def prepare_environment(self): + if self in (SubCmd.launch, SubCmd.dry_run): + prepare_cli_environment() + elif self == SubCmd.train: + prepare_training_environment() + else: + raise NotImplementedError(self) + + def run(self, size: ModelSize, config: LadderRunConfig): + print(config) + + if self == SubCmd.launch: + config.launch.launch(follow=True) + elif self == SubCmd.dry_run: + pass + elif self == SubCmd.train: + try: + # Set RNG states on all devices. + seed_all(config.ladder.init_seed) + + # Build components. + model = config.model.build( + init_device="meta", + device=get_default_device(), + max_seq_len=config.dataset.sequence_length, + dp_mesh=config.ladder.get_dp_mesh(size=size), + ) + optim = config.optim.build(model) + dataset = config.dataset.build() + data_loader = config.data_loader.build(dataset) + trainer = config.trainer.build(model, optim, data_loader) + + # Record the config to W&B/Comet and each checkpoint dir. + config_dict = config.as_config_dict() + cast(CometCallback, trainer.callbacks["comet"]).config = config_dict + cast(WandBCallback, trainer.callbacks["wandb"]).config = config_dict + cast(ConfigSaverCallback, trainer.callbacks["config_saver"]).config = config_dict + + # Train. 
+ trainer.fit() + finally: + teardown_training_environment() + else: + raise NotImplementedError(self) + + +def get_root_dir(cluster: str) -> str: + root_dir: str = "weka://oe-training-default/ai2-llm" + if "jupiter" in cluster: + root_dir = "/weka/oe-training-default/ai2-llm" + elif "augusta" in cluster: + root_dir = "gs://ai2-llm" + return root_dir + + +def build_config( + ladder: ModelLadder, + script: str, + size: ModelSize, + cmd: SubCmd, + cluster: str, + overrides: List[str], +) -> LadderRunConfig: + del cmd - def get_optim_config(self, *, size: ModelSize) -> OptimConfig: - # Calculate LR according to https://api.semanticscholar.org/CorpusID:270764838 - assert self.sequence_length in {2048, 4096} - lr = 0.0047 * (size.num_params / 108000000) ** (-1 / 3) - if self.sequence_length == 4096: - lr /= 4 - - return AdamWConfig( - lr=lr, - weight_decay=0.1, - betas=(0.9, 0.95), - group_overrides=[ - OptimGroupOverride(params=["embeddings.weight"], opts=dict(weight_decay=0.0)) - ], - fused=True, + root_dir = get_root_dir(cluster) + weka_buckets: List[BeakerWekaBucket] = [] + if root_dir.startswith("/weka/"): + weka_buckets.append(BeakerWekaBucket("oe-training-default", "/weka/oe-training-default")) + + beaker_user = (Beaker.from_env().account.whoami().name).upper() + + launch = BeakerLaunchConfig( + name=f"{ladder.name}-{size}-{generate_uuid()[:8]}", + budget="ai2/oe-training", + cmd=[script, SubCmd.train, size, cluster, *overrides], + task_name="train", + workspace="ai2/OLMo-core", + clusters=[cluster], + weka_buckets=weka_buckets, + beaker_image=OLMoCoreBeakerImage.nightly, # some features require nightly at the moment + num_nodes=1, + num_gpus=8, + shared_filesystem=not is_url(root_dir), + allow_dirty=False, + env_secrets=[ + BeakerEnvSecret(name="BEAKER_TOKEN", secret=f"{beaker_user}_BEAKER_TOKEN"), + BeakerEnvSecret(name="WANDB_API_KEY", secret=f"{beaker_user}_WANDB_API_KEY"), + BeakerEnvSecret(name="COMET_API_KEY", secret=f"{beaker_user}_COMET_API_KEY"), + BeakerEnvSecret(name="AWS_CONFIG", secret=f"{beaker_user}_AWS_CONFIG"), + BeakerEnvSecret(name="AWS_CREDENTIALS", secret=f"{beaker_user}_AWS_CREDENTIALS"), + BeakerEnvSecret(name="R2_ENDPOINT_URL", secret="R2_ENDPOINT_URL"), + BeakerEnvSecret(name="WEKA_ENDPOINT_URL", secret="WEKA_ENDPOINT_URL"), + ], + setup_steps=[ + # Clone repo. + 'git clone "$REPO_URL" .', + 'git checkout "$GIT_REF"', + "git submodule update --init --recursive", + # Setup python environment. 
+ "conda shell.bash activate base", + "pip install -e '.[all]'", + "pip freeze", + # Move AWS credentials from env to relevant files + "mkdir -p ~/.aws", + "printenv AWS_CONFIG > ~/.aws/config", + "printenv AWS_CREDENTIALS > ~/.aws/credentials", + ], + ).merge(overrides, strict=False) + + dp_world_size = launch.num_nodes * launch.num_gpus + gpu_type = "h100" + + model = ladder.get_model_config(size=size) + optim = ladder.get_optim_config(size=size) + dataset = ladder.get_dataset_config() + data_loader = ladder.get_data_loader_config(size=size, dp_world_size=dp_world_size) + trainer = ladder.get_trainer_config(size=size, gpu_type=gpu_type) + + return LadderRunConfig( + launch=launch, + ladder=ladder, + model=model, + optim=optim, + dataset=dataset, + data_loader=data_loader, + trainer=trainer, + ).merge(overrides) + + +def main(ladder_builder: Callable[[str], ModelLadder]): + usage = f""" +[yellow]Usage:[/] [i blue]python[/] [i cyan]{sys.argv[0]}[/] [i b magenta]{'|'.join(SubCmd)}[/] [i b]SIZE CLUSTER[/] [i][OVERRIDES...][/] + +[b]Subcommands[/] +[b magenta]launch:[/] Launch the script on Beaker with the [b magenta]train[/] subcommand. +[b magenta]train:[/] Run the trainer. You usually shouldn't invoke the script with this subcommand directly. + Instead use [b magenta]launch[/] or run it with torchrun. +[b magenta]dry_run:[/] Pretty print the config to run and exit. + +[b]Examples[/] +$ [i]python {sys.argv[0]} {SubCmd.launch} 1B ai2/pluto-cirrascale --launch.num_nodes=2[/] + """.strip() + + try: + script, cmd, size, cluster, overrides = ( + sys.argv[0], + SubCmd(sys.argv[1]), + ModelSize(sys.argv[2]), + sys.argv[3], + sys.argv[4:], ) + except (IndexError, ValueError): + import rich + + rich.get_console().print(usage, highlight=False) + sys.exit(1) + + cmd.prepare_environment() + + # Build ladder config. + ladder = ladder_builder(get_root_dir(cluster)) + ladder.merge(overrides, prefix="ladder") + + # Build run config. + config = build_config(ladder, script, size, cmd, cluster, overrides) - def get_rank_microbatch_size(self, *, size: ModelSize, gpu_type: str) -> int: - assert "h100" in gpu_type.lower() - return self.MBZ_SIZES[size] + # Run the cmd. 
+ cmd.run(size, config) diff --git a/src/olmo_core/model_ladder.py b/src/olmo_core/model_ladder.py index 5cc993e2..8c187ff0 100644 --- a/src/olmo_core/model_ladder.py +++ b/src/olmo_core/model_ladder.py @@ -4,9 +4,8 @@ from abc import ABCMeta, abstractmethod from dataclasses import dataclass, field -from typing import Optional, cast +from typing import Optional -import torch from torch.distributed.device_mesh import DeviceMesh from olmo_core.config import Config, StrEnum @@ -17,21 +16,12 @@ NumpyDatasetType, TokenizerConfig, ) -from olmo_core.distributed.utils import ( - get_num_nodes, - get_world_size, - init_hybrid_shard_mesh, -) +from olmo_core.distributed.utils import get_num_nodes, init_hybrid_shard_mesh from olmo_core.exceptions import OLMoConfigurationError -from olmo_core.io import is_url +from olmo_core.io import join_path from olmo_core.nn.transformer import TransformerConfig from olmo_core.optim import CosWithWarmup, OptimConfig -from olmo_core.train import ( - Duration, - TrainerConfig, - prepare_training_environment, - teardown_training_environment, -) +from olmo_core.train import Duration, TrainerConfig from olmo_core.train.callbacks import ( CheckpointerCallback, CometCallback, @@ -44,9 +34,8 @@ SchedulerCallback, WandBCallback, ) -from olmo_core.utils import get_default_device, seed_all -__all__ = ["ModelSize", "ModelLadder"] +__all__ = ["ModelSize", "ModelLadder", "LadderRunConfig"] class ModelSize(StrEnum): @@ -100,6 +89,34 @@ def num_params(self) -> int: raise NotImplementedError(self) +@dataclass +class LadderRunConfig(Config): + """ + Defines components required for a single run of a particular model size. + """ + + model: TransformerConfig + """ + The model config. + """ + optim: OptimConfig + """ + The optimizer config. + """ + dataset: NumpyDatasetConfig + """ + The dataset config. + """ + data_loader: NumpyDataLoaderConfig + """ + The data loader config. + """ + trainer: TrainerConfig + """ + The trainer config. + """ + + @dataclass class ModelLadder(Config, metaclass=ABCMeta): """ @@ -124,9 +141,19 @@ class ModelLadder(Config, metaclass=ABCMeta): The name of the W&B/Comet project to save run data to. """ - root_dir: str + mix_base_dir: str + """ + The base directory of the training data. + """ + + work_dir: str + """ + The local working directory used for dataset caching. + """ + + save_folder: str """ - The root directory. Defines where to find the data mix paths and where to save checkpoints to. + The local or remote folder to save checkpoints to. """ sequence_length: int = 2048 @@ -154,23 +181,8 @@ class ModelLadder(Config, metaclass=ABCMeta): The seed to use for shuffling the data. """ - @property - def work_dir(self) -> str: - """ - The working directory used for dataset caching. - """ - return ( - "./dataset-cache" - if is_url(self.root_dir) - else f"{self.root_dir}/checkpoints/{self._get_beaker_username() or 'OLMo-core'}/dataset-cache" - ) - def get_save_folder(self, size: ModelSize) -> str: - """ - The local or remote folder to save checkpoints to. - Should be unique for the ladder config and model size. - """ - return f"{self.root_dir}/checkpoints/{self._get_beaker_username() or 'OLMo-core'}/{self.name}-{size}" + return str(join_path(self.save_folder, f"checkpoints/{self.name}-{size}")) @abstractmethod def get_model_config(self, *, size: ModelSize) -> TransformerConfig: @@ -194,12 +206,12 @@ def get_dataset_config(self) -> NumpyDatasetConfig: """ Get the train dataset config. - :param sequence_length: The sequence length to be used. 
+ :param kwargs: Extra kwargs to pass to the dataset config constructor. """ return NumpyDatasetConfig.from_data_mix( self.data_mix, tokenizer=self.tokenizer, - mix_base_root=self.root_dir, + mix_base_dir=self.mix_base_dir, sequence_length=self.sequence_length, work_dir=self.work_dir, ) @@ -308,7 +320,7 @@ def get_trainer_config( eval_dataset=NumpyDatasetConfig.from_data_mix( DataMix.v3_small_ppl_validation, name=NumpyDatasetType.padded_fsl, - mix_base_dir=self.root_dir, + mix_base_dir=self.mix_base_dir, sequence_length=self.sequence_length, tokenizer=self.tokenizer, work_dir=self.work_dir, @@ -354,61 +366,6 @@ def get_trainer_config( ) ) - def train(self, size: ModelSize): - """ - Run the ladder at the given size. - - .. note:: - This will call :func:`~olmo_core.train.prepare_training_environment()`, so there's no - need to call that before. - """ - prepare_training_environment() - - gpu_type = torch.cuda.get_device_name() - dp_world_size = get_world_size() - - try: - seed_all(self.init_seed) - - # Get configs. - model_config = self.get_model_config(size=size) - optim_config = self.get_optim_config(size=size) - dataset_config = self.get_dataset_config() - data_loader_config = self.get_data_loader_config(size=size, dp_world_size=dp_world_size) - trainer_config = self.get_trainer_config(size=size, gpu_type=gpu_type) - - # Build components. - model = model_config.build( - init_device="meta", - device=get_default_device(), - max_seq_len=self.sequence_length, - dp_mesh=self.get_dp_mesh(size=size), - ) - optim = optim_config.build(model) - dataset = dataset_config.build() - data_loader = data_loader_config.build(dataset) - trainer = trainer_config.build(model, optim, data_loader) - - # Record the config to W&B/Comet and each checkpoint dir. - config_dict = self.as_config_dict() - config_dict.update( - dict( - model=model_config, - optim=optim_config, - dataset=dataset_config, - data_loader=data_loader_config, - trainer=trainer_config, - ) - ) - cast(CometCallback, trainer.callbacks["comet"]).config = config_dict - cast(WandBCallback, trainer.callbacks["wandb"]).config = config_dict - cast(ConfigSaverCallback, trainer.callbacks["config_saver"]).config = config_dict - - # Train. - trainer.fit() - finally: - teardown_training_environment() - def validate(self): """ Validate the ladder configuration. 
@@ -442,14 +399,3 @@ def validate(self): f"Batch size of {bz_tokens:,d} tokens for model size {size} " f"must be divisible by the sequence length ({self.sequence_length})" ) - - def _get_beaker_username(self) -> Optional[str]: - try: - from beaker import Beaker, BeakerError - except ImportError: - return None - - try: - return Beaker.from_env().account.whoami().name - except BeakerError: - return None diff --git a/src/scripts/train/ladder_baseline.py b/src/scripts/train/ladder_baseline.py new file mode 100644 index 00000000..90f94fcd --- /dev/null +++ b/src/scripts/train/ladder_baseline.py @@ -0,0 +1,88 @@ +from dataclasses import dataclass +from typing import Any, ClassVar, Dict + +from olmo_core.config import DType +from olmo_core.distributed.parallel import DataParallelType +from olmo_core.internal.common import get_beaker_username +from olmo_core.internal.model_ladder import main +from olmo_core.io import is_url, join_path +from olmo_core.model_ladder import ModelLadder, ModelSize +from olmo_core.nn.transformer import TransformerConfig, TransformerDataParallelConfig +from olmo_core.optim import AdamWConfig, OptimConfig, OptimGroupOverride + + +@dataclass +class BaselineModelLadder(ModelLadder): + """ + Baseline OLMo model ladder using the current recommended architecture. + """ + + MBZ_SIZES: ClassVar[Dict[ModelSize, int]] = { + # TODO: may need to tune these + # =============================== + ModelSize.size_190M: 32 * 4096, + ModelSize.size_370M: 32 * 4096, + ModelSize.size_600M: 16 * 4096, + ModelSize.size_760M: 16 * 4096, + # =============================== + ModelSize.size_1B: 8 * 4096, + ModelSize.size_3B: 4 * 4096, + ModelSize.size_7B: 2 * 4096, + ModelSize.size_13B: 1 * 4096, + } + + MODEL_OVERRIDES: ClassVar[Dict[ModelSize, Dict[str, Any]]] = { + ModelSize.size_1B: dict(n_layers=16), # need to scale down our actual 1B model + } + + def get_model_config(self, *, size: ModelSize) -> TransformerConfig: + return getattr(TransformerConfig, f"olmo_{size}")( + vocab_size=self.tokenizer.padded_vocab_size(), + init_seed=self.init_seed, + compile=True, + dp_config=TransformerDataParallelConfig( + name=DataParallelType.fsdp, param_dtype=DType.bfloat16, reduce_dtype=DType.float32 + ), + **self.MODEL_OVERRIDES.get(size, {}), + ) + + def get_optim_config(self, *, size: ModelSize) -> OptimConfig: + # Calculate LR according to https://api.semanticscholar.org/CorpusID:270764838 + assert self.sequence_length in {2048, 4096} + lr = 0.0047 * (size.num_params / 108000000) ** (-1 / 3) + if self.sequence_length == 4096: + lr /= 4 + + return AdamWConfig( + lr=lr, + weight_decay=0.1, + betas=(0.9, 0.95), + group_overrides=[ + OptimGroupOverride(params=["embeddings.weight"], opts=dict(weight_decay=0.0)) + ], + fused=True, + ) + + def get_rank_microbatch_size(self, *, size: ModelSize, gpu_type: str) -> int: + assert "h100" in gpu_type.lower() + return self.MBZ_SIZES[size] + + +def build_ladder(root_dir: str) -> BaselineModelLadder: + work_dir = ( + "./dataset-cache" + if is_url(root_dir) + else str(join_path(root_dir, f"checkpoints/{get_beaker_username().lower()}/dataset-cache")) + ) + save_folder = str(join_path(root_dir, f"checkpoints/{get_beaker_username().lower()}")) + return BaselineModelLadder( + name="OLMo2", + project="OLMo2-model-ladder", + mix_base_dir=root_dir, + work_dir=work_dir, + save_folder=save_folder, + ) + + +if __name__ == "__main__": + main(build_ladder) diff --git a/src/test/config_test.py b/src/test/config_test.py index 10472aad..5037420e 100644 --- a/src/test/config_test.py +++ 
b/src/test/config_test.py @@ -67,3 +67,35 @@ class Foo(Config): "x_tuple": [0, 1], "x_set": ["a"], } + + +def test_non_strict_merge(): + @dataclass + class Bar(Config): + x: int + y: int + + @dataclass + class Foo(Config): + bar: Bar + z: str + + foo = Foo(bar=Bar(x=1, y=2), z="a").merge(["--z=b", "--bar.x=0", "--baz.booz=0"], strict=False) + assert foo.z == "b" + assert foo.bar.x == 0 + + +def test_merge_with_prefix(): + @dataclass + class Bar(Config): + x: int + y: int + + @dataclass + class Foo(Config): + bar: Bar + z: str + + foo = Foo(bar=Bar(x=1, y=2), z="a").merge(["--foo.z=b", "--foo.bar.x=0"], prefix="foo") + assert foo.z == "b" + assert foo.bar.x == 0 From d5843c153640c992b257c0a63724e20b8bdeee0e Mon Sep 17 00:00:00 2001 From: epwalsh Date: Mon, 25 Nov 2024 16:00:02 -0800 Subject: [PATCH 04/25] clean up, consolidate --- src/olmo_core/internal/common.py | 83 +++++++++++++++++++++++++- src/olmo_core/internal/experiment.py | 79 ++++-------------------- src/olmo_core/internal/model_ladder.py | 74 +++-------------------- src/olmo_core/model_ladder.py | 30 +--------- src/scripts/train/ladder_baseline.py | 11 +--- 5 files changed, 107 insertions(+), 170 deletions(-) diff --git a/src/olmo_core/internal/common.py b/src/olmo_core/internal/common.py index c4cc46ea..9373dca2 100644 --- a/src/olmo_core/internal/common.py +++ b/src/olmo_core/internal/common.py @@ -1,7 +1,16 @@ -from typing import Optional +from typing import List, Optional from beaker import Beaker +from olmo_core.io import is_url +from olmo_core.launch.beaker import ( + BeakerEnvSecret, + BeakerLaunchConfig, + BeakerWekaBucket, + OLMoCoreBeakerImage, +) +from olmo_core.utils import generate_uuid + _BEAKER_USERNAME: Optional[str] = None @@ -12,3 +21,75 @@ def get_beaker_username() -> str: _BEAKER_USERNAME = Beaker.from_env().account.whoami().name return _BEAKER_USERNAME + + +def get_root_dir(cluster: str) -> str: + root_dir: str = "weka://oe-training-default/ai2-llm" + if "jupiter" in cluster: + root_dir = "/weka/oe-training-default/ai2-llm" + elif "augusta" in cluster: + root_dir = "gs://ai2-llm" + return root_dir + + +def get_work_dir(root_dir: str) -> str: + return ( + "./dataset-cache" + if is_url(root_dir) + else f"{root_dir}/checkpoints/{get_beaker_username().lower()}/dataset-cache" + ) + + +def build_launch_config( + *, + name: str, + root_dir: str, + cmd: List[str], + cluster: str, + task_name: str = "train", + workspace: str = "ai2/OLMo-core", + budget: str = "ai2/oe-training", +) -> BeakerLaunchConfig: + weka_buckets: List[BeakerWekaBucket] = [] + if root_dir.startswith("/weka/"): + weka_buckets.append(BeakerWekaBucket("oe-training-default", "/weka/oe-training-default")) + + beaker_user = get_beaker_username() + + return BeakerLaunchConfig( + name=f"{name}-{generate_uuid()[:8]}", + budget=budget, + cmd=cmd, + task_name=task_name, + workspace=workspace, + clusters=[cluster], + weka_buckets=weka_buckets, + beaker_image=OLMoCoreBeakerImage.nightly, # some features require nightly at the moment + num_nodes=1, + num_gpus=8, + shared_filesystem=not is_url(root_dir), + allow_dirty=False, + env_secrets=[ + BeakerEnvSecret(name="BEAKER_TOKEN", secret=f"{beaker_user}_BEAKER_TOKEN"), + BeakerEnvSecret(name="WANDB_API_KEY", secret=f"{beaker_user}_WANDB_API_KEY"), + BeakerEnvSecret(name="COMET_API_KEY", secret=f"{beaker_user}_COMET_API_KEY"), + BeakerEnvSecret(name="AWS_CONFIG", secret=f"{beaker_user}_AWS_CONFIG"), + BeakerEnvSecret(name="AWS_CREDENTIALS", secret=f"{beaker_user}_AWS_CREDENTIALS"), + 
BeakerEnvSecret(name="R2_ENDPOINT_URL", secret="R2_ENDPOINT_URL"), + BeakerEnvSecret(name="WEKA_ENDPOINT_URL", secret="WEKA_ENDPOINT_URL"), + ], + setup_steps=[ + # Clone repo. + 'git clone "$REPO_URL" .', + 'git checkout "$GIT_REF"', + "git submodule update --init --recursive", + # Setup python environment. + "conda shell.bash activate base", + "pip install -e '.[all]'", + "pip freeze", + # Move AWS credentials from env to relevant files + "mkdir -p ~/.aws", + "printenv AWS_CONFIG > ~/.aws/config", + "printenv AWS_CREDENTIALS > ~/.aws/credentials", + ], + ) diff --git a/src/olmo_core/internal/experiment.py b/src/olmo_core/internal/experiment.py index 3af64f7d..f480bc3e 100644 --- a/src/olmo_core/internal/experiment.py +++ b/src/olmo_core/internal/experiment.py @@ -18,13 +18,7 @@ ) from olmo_core.distributed.utils import get_num_nodes, init_hybrid_shard_mesh from olmo_core.float8 import Float8Config -from olmo_core.io import is_url -from olmo_core.launch.beaker import ( - BeakerEnvSecret, - BeakerLaunchConfig, - BeakerWekaBucket, - OLMoCoreBeakerImage, -) +from olmo_core.launch.beaker import BeakerLaunchConfig from olmo_core.nn.transformer import TransformerConfig from olmo_core.optim import CosWithWarmup, OptimConfig from olmo_core.train import ( @@ -45,14 +39,9 @@ SchedulerCallback, WandBCallback, ) -from olmo_core.utils import ( - generate_uuid, - get_default_device, - prepare_cli_environment, - seed_all, -) +from olmo_core.utils import get_default_device, prepare_cli_environment, seed_all -from .common import get_beaker_username +from .common import build_launch_config, get_beaker_username, get_root_dir, get_work_dir log = logging.getLogger(__name__) @@ -146,57 +135,21 @@ def build_common_components( *, global_batch_size: int, ) -> CommonComponents: - root_dir: str = "weka://oe-training-default/ai2-llm" - weka_buckets: List[BeakerWekaBucket] = [] - if "jupiter" in cluster: - root_dir = "/weka/oe-training-default/ai2-llm" - weka_buckets.append(BeakerWekaBucket("oe-training-default", "/weka/oe-training-default")) - elif "augusta" in cluster: - root_dir = "gs://ai2-llm" + root_dir = get_root_dir(cluster) - beaker_user = get_beaker_username() cmd_to_launch = SubCmd.train if cmd == SubCmd.launch_prep: cmd_to_launch = SubCmd.prep - launch_config = BeakerLaunchConfig( - name=f"{run_name}-{cmd_to_launch}-{generate_uuid()[:8]}", - budget="ai2/oe-training", + launch_config = build_launch_config( + name=f"{run_name}-{cmd_to_launch}", + root_dir=root_dir, cmd=[script, cmd_to_launch, run_name, cluster, *overrides], - task_name="train", - workspace="ai2/OLMo-core", - clusters=[cluster], - weka_buckets=weka_buckets, - beaker_image=OLMoCoreBeakerImage.nightly, # some features require nightly at the moment - num_nodes=1, - num_gpus=8, - shared_filesystem=not is_url(root_dir), - allow_dirty=False, - env_secrets=[ - BeakerEnvSecret(name="BEAKER_TOKEN", secret=f"{beaker_user}_BEAKER_TOKEN"), - BeakerEnvSecret(name="WANDB_API_KEY", secret=f"{beaker_user}_WANDB_API_KEY"), - BeakerEnvSecret(name="COMET_API_KEY", secret=f"{beaker_user}_COMET_API_KEY"), - BeakerEnvSecret(name="AWS_CONFIG", secret=f"{beaker_user}_AWS_CONFIG"), - BeakerEnvSecret(name="AWS_CREDENTIALS", secret=f"{beaker_user}_AWS_CREDENTIALS"), - BeakerEnvSecret(name="R2_ENDPOINT_URL", secret="R2_ENDPOINT_URL"), - BeakerEnvSecret(name="WEKA_ENDPOINT_URL", secret="WEKA_ENDPOINT_URL"), - ], - setup_steps=[ - # Clone repo. 
- 'git clone "$REPO_URL" .', - 'git checkout "$GIT_REF"', - "git submodule update --init --recursive", - # Setup python environment. - "conda shell.bash activate base", - "pip install -e '.[all]'", - "pip freeze", - # Move AWS credentials from env to relevant files - "mkdir -p ~/.aws", - "printenv AWS_CONFIG > ~/.aws/config", - "printenv AWS_CREDENTIALS > ~/.aws/credentials", - ], + cluster=cluster, ) + beaker_user = get_beaker_username() + tokenizer_config = TokenizerConfig.dolma2() dataset_config = NumpyDatasetConfig.from_data_mix( @@ -210,11 +163,7 @@ def build_common_components( vsl_curriculum=VSLCurriculumConfig( name=VSLCurriculumType.grow_p2, num_cycles=8, balanced=False ), - work_dir=( - "./dataset-cache" - if is_url(root_dir) - else f"{root_dir}/checkpoints/{beaker_user.lower()}/dataset-cache" - ), + work_dir=get_work_dir(root_dir), ) data_loader_config = NumpyDataLoaderConfig( @@ -235,11 +184,7 @@ def build_common_components( mix_base_dir=root_dir, sequence_length=dataset_config.effective_sequence_length, tokenizer=tokenizer_config, - work_dir=( - "./dataset-cache" - if is_url(root_dir) - else f"{root_dir}/checkpoints/{beaker_user.lower()}/dataset-cache" - ), + work_dir=get_work_dir(root_dir), ), eval_interval=1000, ), diff --git a/src/olmo_core/internal/model_ladder.py b/src/olmo_core/internal/model_ladder.py index 377be0fb..743cef01 100644 --- a/src/olmo_core/internal/model_ladder.py +++ b/src/olmo_core/internal/model_ladder.py @@ -2,18 +2,11 @@ from dataclasses import dataclass from typing import Callable, List, cast -from beaker import Beaker from rich import print from olmo_core.config import Config, StrEnum from olmo_core.data import NumpyDataLoaderConfig, NumpyDatasetConfig -from olmo_core.io import is_url -from olmo_core.launch.beaker import ( - BeakerEnvSecret, - BeakerLaunchConfig, - BeakerWekaBucket, - OLMoCoreBeakerImage, -) +from olmo_core.launch.beaker import BeakerLaunchConfig from olmo_core.model_ladder import ModelLadder, ModelSize from olmo_core.nn.transformer import TransformerConfig from olmo_core.optim import OptimConfig @@ -23,12 +16,9 @@ teardown_training_environment, ) from olmo_core.train.callbacks import CometCallback, ConfigSaverCallback, WandBCallback -from olmo_core.utils import ( - generate_uuid, - get_default_device, - prepare_cli_environment, - seed_all, -) +from olmo_core.utils import get_default_device, prepare_cli_environment, seed_all + +from .common import build_launch_config, get_root_dir @dataclass @@ -93,15 +83,6 @@ def run(self, size: ModelSize, config: LadderRunConfig): raise NotImplementedError(self) -def get_root_dir(cluster: str) -> str: - root_dir: str = "weka://oe-training-default/ai2-llm" - if "jupiter" in cluster: - root_dir = "/weka/oe-training-default/ai2-llm" - elif "augusta" in cluster: - root_dir = "gs://ai2-llm" - return root_dir - - def build_config( ladder: ModelLadder, script: str, @@ -113,52 +94,15 @@ def build_config( del cmd root_dir = get_root_dir(cluster) - weka_buckets: List[BeakerWekaBucket] = [] - if root_dir.startswith("/weka/"): - weka_buckets.append(BeakerWekaBucket("oe-training-default", "/weka/oe-training-default")) - - beaker_user = (Beaker.from_env().account.whoami().name).upper() - - launch = BeakerLaunchConfig( - name=f"{ladder.name}-{size}-{generate_uuid()[:8]}", - budget="ai2/oe-training", + launch = build_launch_config( + name=f"{ladder.name}-{size}", + root_dir=root_dir, cmd=[script, SubCmd.train, size, cluster, *overrides], - task_name="train", - workspace="ai2/OLMo-core", - clusters=[cluster], - 
weka_buckets=weka_buckets, - beaker_image=OLMoCoreBeakerImage.nightly, # some features require nightly at the moment - num_nodes=1, - num_gpus=8, - shared_filesystem=not is_url(root_dir), - allow_dirty=False, - env_secrets=[ - BeakerEnvSecret(name="BEAKER_TOKEN", secret=f"{beaker_user}_BEAKER_TOKEN"), - BeakerEnvSecret(name="WANDB_API_KEY", secret=f"{beaker_user}_WANDB_API_KEY"), - BeakerEnvSecret(name="COMET_API_KEY", secret=f"{beaker_user}_COMET_API_KEY"), - BeakerEnvSecret(name="AWS_CONFIG", secret=f"{beaker_user}_AWS_CONFIG"), - BeakerEnvSecret(name="AWS_CREDENTIALS", secret=f"{beaker_user}_AWS_CREDENTIALS"), - BeakerEnvSecret(name="R2_ENDPOINT_URL", secret="R2_ENDPOINT_URL"), - BeakerEnvSecret(name="WEKA_ENDPOINT_URL", secret="WEKA_ENDPOINT_URL"), - ], - setup_steps=[ - # Clone repo. - 'git clone "$REPO_URL" .', - 'git checkout "$GIT_REF"', - "git submodule update --init --recursive", - # Setup python environment. - "conda shell.bash activate base", - "pip install -e '.[all]'", - "pip freeze", - # Move AWS credentials from env to relevant files - "mkdir -p ~/.aws", - "printenv AWS_CONFIG > ~/.aws/config", - "printenv AWS_CREDENTIALS > ~/.aws/credentials", - ], + cluster=cluster, ).merge(overrides, strict=False) dp_world_size = launch.num_nodes * launch.num_gpus - gpu_type = "h100" + gpu_type = "h100" # TODO: get actual device name model = ladder.get_model_config(size=size) optim = ladder.get_optim_config(size=size) diff --git a/src/olmo_core/model_ladder.py b/src/olmo_core/model_ladder.py index 8c187ff0..2ae443dd 100644 --- a/src/olmo_core/model_ladder.py +++ b/src/olmo_core/model_ladder.py @@ -35,7 +35,7 @@ WandBCallback, ) -__all__ = ["ModelSize", "ModelLadder", "LadderRunConfig"] +__all__ = ["ModelSize", "ModelLadder"] class ModelSize(StrEnum): @@ -89,34 +89,6 @@ def num_params(self) -> int: raise NotImplementedError(self) -@dataclass -class LadderRunConfig(Config): - """ - Defines components required for a single run of a particular model size. - """ - - model: TransformerConfig - """ - The model config. - """ - optim: OptimConfig - """ - The optimizer config. - """ - dataset: NumpyDatasetConfig - """ - The dataset config. - """ - data_loader: NumpyDataLoaderConfig - """ - The data loader config. - """ - trainer: TrainerConfig - """ - The trainer config. 
- """ - - @dataclass class ModelLadder(Config, metaclass=ABCMeta): """ diff --git a/src/scripts/train/ladder_baseline.py b/src/scripts/train/ladder_baseline.py index 90f94fcd..f099bd07 100644 --- a/src/scripts/train/ladder_baseline.py +++ b/src/scripts/train/ladder_baseline.py @@ -3,9 +3,9 @@ from olmo_core.config import DType from olmo_core.distributed.parallel import DataParallelType -from olmo_core.internal.common import get_beaker_username +from olmo_core.internal.common import get_beaker_username, get_work_dir from olmo_core.internal.model_ladder import main -from olmo_core.io import is_url, join_path +from olmo_core.io import join_path from olmo_core.model_ladder import ModelLadder, ModelSize from olmo_core.nn.transformer import TransformerConfig, TransformerDataParallelConfig from olmo_core.optim import AdamWConfig, OptimConfig, OptimGroupOverride @@ -69,17 +69,12 @@ def get_rank_microbatch_size(self, *, size: ModelSize, gpu_type: str) -> int: def build_ladder(root_dir: str) -> BaselineModelLadder: - work_dir = ( - "./dataset-cache" - if is_url(root_dir) - else str(join_path(root_dir, f"checkpoints/{get_beaker_username().lower()}/dataset-cache")) - ) save_folder = str(join_path(root_dir, f"checkpoints/{get_beaker_username().lower()}")) return BaselineModelLadder( name="OLMo2", project="OLMo2-model-ladder", mix_base_dir=root_dir, - work_dir=work_dir, + work_dir=get_work_dir(root_dir), save_folder=save_folder, ) From 4ef79a76f1706febf5c3832b3dfb19e4d2a6b3b7 Mon Sep 17 00:00:00 2001 From: epwalsh Date: Mon, 25 Nov 2024 16:01:37 -0800 Subject: [PATCH 05/25] more clean up --- src/olmo_core/internal/model_ladder.py | 1 + src/test/internal/__init__.py | 0 src/test/internal/model_ladder_test.py | 11 ----------- src/test/model_ladder_test.py | 2 +- 4 files changed, 2 insertions(+), 12 deletions(-) delete mode 100644 src/test/internal/__init__.py delete mode 100644 src/test/internal/model_ladder_test.py diff --git a/src/olmo_core/internal/model_ladder.py b/src/olmo_core/internal/model_ladder.py index 743cef01..9db92e0f 100644 --- a/src/olmo_core/internal/model_ladder.py +++ b/src/olmo_core/internal/model_ladder.py @@ -157,6 +157,7 @@ def main(ladder_builder: Callable[[str], ModelLadder]): # Build run config. config = build_config(ladder, script, size, cmd, cluster, overrides) + config.ladder.validate() # Run the cmd. 
cmd.run(size, config) diff --git a/src/test/internal/__init__.py b/src/test/internal/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/test/internal/model_ladder_test.py b/src/test/internal/model_ladder_test.py deleted file mode 100644 index 140083ad..00000000 --- a/src/test/internal/model_ladder_test.py +++ /dev/null @@ -1,11 +0,0 @@ -import pytest - -from olmo_core.ladder.baseline import BaselineModelLadder - - -@pytest.mark.parametrize("sequence_length", [2048, 4096]) -def test_validate_baseline_model_ladder(tmp_path, sequence_length): - ladder = BaselineModelLadder( - name="baseline", project="ladder", root_dir=tmp_path, sequence_length=sequence_length - ) - ladder.validate() diff --git a/src/test/model_ladder_test.py b/src/test/model_ladder_test.py index f7bfe797..14f84d40 100644 --- a/src/test/model_ladder_test.py +++ b/src/test/model_ladder_test.py @@ -1,4 +1,4 @@ -from olmo_core.ladder import ModelSize +from olmo_core.model_ladder import ModelSize def test_model_size_num_params(): From a0cf257a24a68f80772765db2f35896e36dc0cdd Mon Sep 17 00:00:00 2001 From: epwalsh Date: Mon, 25 Nov 2024 16:12:35 -0800 Subject: [PATCH 06/25] programmatically get GPU type --- src/olmo_core/internal/common.py | 32 +++++++++++++++++++++++++- src/olmo_core/internal/model_ladder.py | 4 ++-- 2 files changed, 33 insertions(+), 3 deletions(-) diff --git a/src/olmo_core/internal/common.py b/src/olmo_core/internal/common.py index 9373dca2..9399e3cf 100644 --- a/src/olmo_core/internal/common.py +++ b/src/olmo_core/internal/common.py @@ -1,3 +1,4 @@ +import logging from typing import List, Optional from beaker import Beaker @@ -11,14 +12,25 @@ ) from olmo_core.utils import generate_uuid +log = logging.getLogger(__name__) +_BEAKER_CLIENT: Optional[Beaker] = None _BEAKER_USERNAME: Optional[str] = None +def get_beaker_client() -> Beaker: + global _BEAKER_CLIENT + + if _BEAKER_CLIENT is None: + _BEAKER_CLIENT = Beaker.from_env() + + return _BEAKER_CLIENT + + def get_beaker_username() -> str: global _BEAKER_USERNAME if _BEAKER_USERNAME is None: - _BEAKER_USERNAME = Beaker.from_env().account.whoami().name + _BEAKER_USERNAME = get_beaker_client().account.whoami().name return _BEAKER_USERNAME @@ -93,3 +105,21 @@ def build_launch_config( "printenv AWS_CREDENTIALS > ~/.aws/credentials", ], ) + + +CLUSTER_TO_GPU_TYPE = { + "ai2/jupiter-cirrascale-2": "NVIDIA H100 80GB HBM3", + "ai2/pluto-cirrascale": "NVIDIA H100", + "ai2/augusta-google-1": "NVIDIA H100", +} + + +def get_gpu_type(cluster: str) -> str: + if cluster in CLUSTER_TO_GPU_TYPE: + return CLUSTER_TO_GPU_TYPE[cluster] + else: + log.warning(f"Missing cluster '{cluster}' in CLUSTER_TO_GPU_TYPE mapping") + beaker = get_beaker_client() + nodes = beaker.cluster.nodes(cluster) + assert nodes and nodes[0].limits.gpu_type + return nodes[0].limits.gpu_type diff --git a/src/olmo_core/internal/model_ladder.py b/src/olmo_core/internal/model_ladder.py index 9db92e0f..1cddc6ca 100644 --- a/src/olmo_core/internal/model_ladder.py +++ b/src/olmo_core/internal/model_ladder.py @@ -18,7 +18,7 @@ from olmo_core.train.callbacks import CometCallback, ConfigSaverCallback, WandBCallback from olmo_core.utils import get_default_device, prepare_cli_environment, seed_all -from .common import build_launch_config, get_root_dir +from .common import build_launch_config, get_gpu_type, get_root_dir @dataclass @@ -102,7 +102,7 @@ def build_config( ).merge(overrides, strict=False) dp_world_size = launch.num_nodes * launch.num_gpus - gpu_type = "h100" # TODO: get actual 
device name + gpu_type = get_gpu_type(cluster) model = ladder.get_model_config(size=size) optim = ladder.get_optim_config(size=size) From b712d9868ff178239d0aa90f55d8ca45da924f2b Mon Sep 17 00:00:00 2001 From: epwalsh Date: Mon, 25 Nov 2024 16:15:59 -0800 Subject: [PATCH 07/25] add more tasks --- src/olmo_core/model_ladder.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/olmo_core/model_ladder.py b/src/olmo_core/model_ladder.py index 2ae443dd..b332bbfa 100644 --- a/src/olmo_core/model_ladder.py +++ b/src/olmo_core/model_ladder.py @@ -263,6 +263,8 @@ def get_trainer_config( :param size: The target model size. :param gpu_type: The type of GPU as given by ``torch.cuda.get_device_name()``. """ + from olmo_eval import list_tasks + rank_mbz = self.get_rank_microbatch_size(size=size, gpu_type=gpu_type) if rank_mbz % self.sequence_length != 0: raise OLMoConfigurationError( @@ -303,7 +305,9 @@ def get_trainer_config( .with_callback( "downstream_evaluator", DownstreamEvaluatorCallbackConfig( - tasks=["hellaswag"], # TODO: which other tasks? + tasks=[ + task for task in list_tasks() if "_mc" not in task and "_var" not in task + ], tokenizer=self.tokenizer, eval_interval=250, ), From 9daaf2d9a7a9ed8b5beaa364b75fd2cf8ce67cf3 Mon Sep 17 00:00:00 2001 From: epwalsh Date: Mon, 25 Nov 2024 16:20:11 -0800 Subject: [PATCH 08/25] rename scripts --- README.md | 16 ++++++++-------- src/scripts/train/{Llama-8B.py => Llama3-8B.py} | 0 src/scripts/train/{OLMo-13B.py => OLMo2-13B.py} | 0 src/scripts/train/{OLMo-1B.py => OLMo2-1B.py} | 0 src/scripts/train/{OLMo-7B.py => OLMo2-7B.py} | 0 .../{ladder_baseline.py => OLMo2-ladder.py} | 0 6 files changed, 8 insertions(+), 8 deletions(-) rename src/scripts/train/{Llama-8B.py => Llama3-8B.py} (100%) rename src/scripts/train/{OLMo-13B.py => OLMo2-13B.py} (100%) rename src/scripts/train/{OLMo-1B.py => OLMo2-1B.py} (100%) rename src/scripts/train/{OLMo-7B.py => OLMo2-7B.py} (100%) rename src/scripts/train/{ladder_baseline.py => OLMo2-ladder.py} (100%) diff --git a/README.md b/README.md index 9979d4fa..d74c0c25 100644 --- a/README.md +++ b/README.md @@ -39,14 +39,14 @@ Throughput numbers from these scripts with various different configuration setti | Model size | Model arch.   
| Context length | Precision | Throughput[^1] | Training script | Commandline overrides                                    | | :--------: | :--------: | :------------: | :-------: | -----------: | :----------- | :-------- | -| **1B** | OLMo-1124 | 4096 | BF16 | 55,000 TPS | `OLMo-1B.py` | | -| | | 4096 | BF16/FP8[^2] | 65,000 TPS | `OLMo-1B.py` | `--model.float8_config.enabled=true` | -| **7B** | OLMo-1124 | 4096 | BF16 | 10,000 TPS | `OLMo-7B.py` | | -| | | 4096 | BF16/FP8 | 13,000 TPS | `OLMo-7B.py` | `--model.float8_config.enabled=true` | -| **8B** | Llama | 4096 | BF16 | 9,500 TPS | `Llama-8B.py` | | -| | | 4096 | BF16/FP8 | 12,500 TPS | `Llama-8B.py` | `--model.float8_config.enabled=true` | -| **13B** | OLMo-1124 | 4096 | BF16 | 4,600 TPS | `OLMo-13B.py` | | -| | | 4096 | BF16/FP8 | 5,500 TPS | `OLMo-13B.py` | `--model.float8_config.enabled=true` | +| **1B** | OLMo-1124 | 4096 | BF16 | 55,000 TPS | `OLMo2-1B.py` | | +| | | 4096 | BF16/FP8[^2] | 65,000 TPS | `OLMo2-1B.py` | `--model.float8_config.enabled=true` | +| **7B** | OLMo-1124 | 4096 | BF16 | 10,000 TPS | `OLMo2-7B.py` | | +| | | 4096 | BF16/FP8 | 13,000 TPS | `OLMo2-7B.py` | `--model.float8_config.enabled=true` | +| **8B** | Llama | 4096 | BF16 | 9,500 TPS | `Llama3-8B.py` | | +| | | 4096 | BF16/FP8 | 12,500 TPS | `Llama3-8B.py` | `--model.float8_config.enabled=true` | +| **13B** | OLMo-1124 | 4096 | BF16 | 4,600 TPS | `OLMo2-13B.py` | | +| | | 4096 | BF16/FP8 | 5,500 TPS | `OLMo2-13B.py` | `--model.float8_config.enabled=true` | [^1]: Throughput reported in tokens per second per device. [^2]: In this setup most matrix multiplications are computed in `float8`, everything else is in `bfloat16`. diff --git a/src/scripts/train/Llama-8B.py b/src/scripts/train/Llama3-8B.py similarity index 100% rename from src/scripts/train/Llama-8B.py rename to src/scripts/train/Llama3-8B.py diff --git a/src/scripts/train/OLMo-13B.py b/src/scripts/train/OLMo2-13B.py similarity index 100% rename from src/scripts/train/OLMo-13B.py rename to src/scripts/train/OLMo2-13B.py diff --git a/src/scripts/train/OLMo-1B.py b/src/scripts/train/OLMo2-1B.py similarity index 100% rename from src/scripts/train/OLMo-1B.py rename to src/scripts/train/OLMo2-1B.py diff --git a/src/scripts/train/OLMo-7B.py b/src/scripts/train/OLMo2-7B.py similarity index 100% rename from src/scripts/train/OLMo-7B.py rename to src/scripts/train/OLMo2-7B.py diff --git a/src/scripts/train/ladder_baseline.py b/src/scripts/train/OLMo2-ladder.py similarity index 100% rename from src/scripts/train/ladder_baseline.py rename to src/scripts/train/OLMo2-ladder.py From fb8da2afe6bc0445da7d7f2fa18f96689bb8d598 Mon Sep 17 00:00:00 2001 From: epwalsh Date: Mon, 25 Nov 2024 16:23:58 -0800 Subject: [PATCH 09/25] changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6bf6050a..c341d30b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added an implementation of nGPT called `NormalizedTransformer`. - Added an example showing how to convert a HuggingFace Llama 3.2 checkpoint into the right format for OLMo-core. - Added an API for scaling RoPE embeddings. +- Added a `ModelLadder` API. 
### Changed From bd28f430161dbe34495e0a84bf217c017b2bebbd Mon Sep 17 00:00:00 2001 From: epwalsh Date: Tue, 26 Nov 2024 10:13:01 -0800 Subject: [PATCH 10/25] change how we set batch size --- src/olmo_core/internal/model_ladder.py | 8 +++++++- src/olmo_core/model_ladder.py | 19 ++++++++++--------- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/src/olmo_core/internal/model_ladder.py b/src/olmo_core/internal/model_ladder.py index 1cddc6ca..5557e737 100644 --- a/src/olmo_core/internal/model_ladder.py +++ b/src/olmo_core/internal/model_ladder.py @@ -6,6 +6,7 @@ from olmo_core.config import Config, StrEnum from olmo_core.data import NumpyDataLoaderConfig, NumpyDatasetConfig +from olmo_core.exceptions import OLMoConfigurationError from olmo_core.launch.beaker import BeakerLaunchConfig from olmo_core.model_ladder import ModelLadder, ModelSize from olmo_core.nn.transformer import TransformerConfig @@ -102,12 +103,17 @@ def build_config( ).merge(overrides, strict=False) dp_world_size = launch.num_nodes * launch.num_gpus + if dp_world_size > ladder.max_dp_world_size: + raise OLMoConfigurationError( + f"max_dp_world_size ({ladder.max_dp_world_size}) must be at least as big as current dp " + f"world size ({dp_world_size})" + ) gpu_type = get_gpu_type(cluster) model = ladder.get_model_config(size=size) optim = ladder.get_optim_config(size=size) dataset = ladder.get_dataset_config() - data_loader = ladder.get_data_loader_config(size=size, dp_world_size=dp_world_size) + data_loader = ladder.get_data_loader_config(size=size) trainer = ladder.get_trainer_config(size=size, gpu_type=gpu_type) return LadderRunConfig( diff --git a/src/olmo_core/model_ladder.py b/src/olmo_core/model_ladder.py index b332bbfa..ec35542b 100644 --- a/src/olmo_core/model_ladder.py +++ b/src/olmo_core/model_ladder.py @@ -153,6 +153,11 @@ class ModelLadder(Config, metaclass=ABCMeta): The seed to use for shuffling the data. """ + max_dp_world_size: int = 64 + """ + The maximum data parallel world size that you intent to run with. This is used to set the batch size. + """ + def get_save_folder(self, size: ModelSize) -> str: return str(join_path(self.save_folder, f"checkpoints/{self.name}-{size}")) @@ -188,17 +193,14 @@ def get_dataset_config(self) -> NumpyDatasetConfig: work_dir=self.work_dir, ) - def get_data_loader_config( - self, *, size: ModelSize, dp_world_size: int - ) -> NumpyDataLoaderConfig: + def get_data_loader_config(self, *, size: ModelSize) -> NumpyDataLoaderConfig: """ Get the data loader config. :param size: The target model size. - :param dp_world_size: The data parallel world size for training. """ return NumpyDataLoaderConfig( - global_batch_size=self.get_global_batch_size(size=size, dp_world_size=dp_world_size), + global_batch_size=self.get_global_batch_size(size=size), seed=self.data_seed, num_workers=4, ) @@ -214,12 +216,11 @@ def get_rank_microbatch_size(self, *, size: ModelSize, gpu_type: str) -> int: """ raise NotImplementedError - def get_global_batch_size(self, *, size: ModelSize, dp_world_size: int = 64) -> int: + def get_global_batch_size(self, *, size: ModelSize) -> int: """ Get the global batch size in tokens for a given model size. :param size: The target model size. - :param dp_world_size: The data parallel world size for training. """ # Calculate batch size according to https://api.semanticscholar.org/CorpusID:270764838, # which assumes a sequence length of 2048. So adjust from there accordingly. 
@@ -228,9 +229,9 @@ def get_global_batch_size(self, *, size: ModelSize, dp_world_size: int = 64) -> global_batch_size = 160 * (size.num_params / 108000000) ** (2 / 3) global_batch_size /= seq_len_divisor - global_batch_size /= dp_world_size + global_batch_size /= self.max_dp_world_size global_batch_size = round(global_batch_size) - global_batch_size *= dp_world_size + global_batch_size *= self.max_dp_world_size return self.sequence_length * global_batch_size From 3ce2f257e765fce6d21ea542046bcfe0a17128ba Mon Sep 17 00:00:00 2001 From: epwalsh Date: Wed, 27 Nov 2024 09:46:12 -0800 Subject: [PATCH 11/25] fix model builders --- src/olmo_core/nn/transformer/config.py | 21 +++++++++++++-------- src/scripts/train/OLMo2-13B.py | 2 +- src/scripts/train/OLMo2-1B.py | 2 +- src/scripts/train/OLMo2-7B.py | 2 +- src/scripts/train/OLMoE-1B-7B.py | 2 +- 5 files changed, 17 insertions(+), 12 deletions(-) diff --git a/src/olmo_core/nn/transformer/config.py b/src/olmo_core/nn/transformer/config.py index b724b9be..16fea510 100644 --- a/src/olmo_core/nn/transformer/config.py +++ b/src/olmo_core/nn/transformer/config.py @@ -293,7 +293,7 @@ def num_flops_per_token(self, seq_len: int) -> int: return flop_per_token @classmethod - def olmo_190M(cls, vocab_size: int, **kwargs) -> "TransformerConfig": + def olmo2_190M(cls, vocab_size: int, **kwargs) -> "TransformerConfig": return cls.llama_like( d_model=768, hidden_size_multiplier=1.5, @@ -304,10 +304,11 @@ def olmo_190M(cls, vocab_size: int, **kwargs) -> "TransformerConfig": qk_norm=kwargs.pop("qk_norm", True), rope_theta=kwargs.pop("rope_theta", 500_000), layer_norm_eps=1e-6, + **kwargs, ) @classmethod - def olmo_370M(cls, vocab_size: int, **kwargs) -> "TransformerConfig": + def olmo2_370M(cls, vocab_size: int, **kwargs) -> "TransformerConfig": return cls.llama_like( d_model=1024, hidden_size_multiplier=1.4, @@ -318,10 +319,11 @@ def olmo_370M(cls, vocab_size: int, **kwargs) -> "TransformerConfig": qk_norm=kwargs.pop("qk_norm", True), rope_theta=kwargs.pop("rope_theta", 500_000), layer_norm_eps=1e-6, + **kwargs, ) @classmethod - def olmo_600M(cls, vocab_size: int, **kwargs) -> "TransformerConfig": + def olmo2_600M(cls, vocab_size: int, **kwargs) -> "TransformerConfig": return cls.llama_like( d_model=1344, hidden_size_multiplier=1.5, @@ -332,10 +334,11 @@ def olmo_600M(cls, vocab_size: int, **kwargs) -> "TransformerConfig": qk_norm=kwargs.pop("qk_norm", True), rope_theta=kwargs.pop("rope_theta", 500_000), layer_norm_eps=1e-6, + **kwargs, ) @classmethod - def olmo_760M(cls, vocab_size: int, **kwargs) -> "TransformerConfig": + def olmo2_760M(cls, vocab_size: int, **kwargs) -> "TransformerConfig": return cls.llama_like( d_model=1536, hidden_size_multiplier=1.5, @@ -346,10 +349,11 @@ def olmo_760M(cls, vocab_size: int, **kwargs) -> "TransformerConfig": qk_norm=kwargs.pop("qk_norm", True), rope_theta=kwargs.pop("rope_theta", 500_000), layer_norm_eps=1e-6, + **kwargs, ) @classmethod - def olmo_1B(cls, vocab_size: int, **kwargs) -> "TransformerConfig": + def olmo2_1B(cls, vocab_size: int, **kwargs) -> "TransformerConfig": """ A 1B OLMo model config. 
""" @@ -363,7 +367,7 @@ def olmo_1B(cls, vocab_size: int, **kwargs) -> "TransformerConfig": ) @classmethod - def olmo_3B(cls, vocab_size: int, **kwargs) -> "TransformerConfig": + def olmo2_3B(cls, vocab_size: int, **kwargs) -> "TransformerConfig": return cls.llama_like( d_model=3328, hidden_size_multiplier=1.4, @@ -374,10 +378,11 @@ def olmo_3B(cls, vocab_size: int, **kwargs) -> "TransformerConfig": qk_norm=kwargs.pop("qk_norm", True), rope_theta=kwargs.pop("rope_theta", 500_000), layer_norm_eps=1e-6, + **kwargs, ) @classmethod - def olmo_7B(cls, vocab_size: int, **kwargs) -> "TransformerConfig": + def olmo2_7B(cls, vocab_size: int, **kwargs) -> "TransformerConfig": """ A 7B OLMo model config. """ @@ -391,7 +396,7 @@ def olmo_7B(cls, vocab_size: int, **kwargs) -> "TransformerConfig": ) @classmethod - def olmo_13B(cls, vocab_size: int, **kwargs) -> "TransformerConfig": + def olmo2_13B(cls, vocab_size: int, **kwargs) -> "TransformerConfig": """ A 13B OLMo model config. """ diff --git a/src/scripts/train/OLMo2-13B.py b/src/scripts/train/OLMo2-13B.py index 4d69ef8a..500e7b21 100644 --- a/src/scripts/train/OLMo2-13B.py +++ b/src/scripts/train/OLMo2-13B.py @@ -16,7 +16,7 @@ def build_model_config(common: CommonComponents) -> TransformerConfig: - return TransformerConfig.olmo_13B( + return TransformerConfig.olmo2_13B( vocab_size=common.tokenizer.padded_vocab_size(), compile=True, dp_config=TransformerDataParallelConfig( diff --git a/src/scripts/train/OLMo2-1B.py b/src/scripts/train/OLMo2-1B.py index b4bc97c5..d3c51ac4 100644 --- a/src/scripts/train/OLMo2-1B.py +++ b/src/scripts/train/OLMo2-1B.py @@ -12,7 +12,7 @@ def build_model_config(common: CommonComponents) -> TransformerConfig: - return TransformerConfig.olmo_1B( + return TransformerConfig.olmo2_1B( vocab_size=common.tokenizer.padded_vocab_size(), compile=True, dp_config=TransformerDataParallelConfig( diff --git a/src/scripts/train/OLMo2-7B.py b/src/scripts/train/OLMo2-7B.py index 1098e3ff..af94f466 100644 --- a/src/scripts/train/OLMo2-7B.py +++ b/src/scripts/train/OLMo2-7B.py @@ -20,7 +20,7 @@ def build_model_config(common: CommonComponents) -> TransformerConfig: - return TransformerConfig.olmo_7B( + return TransformerConfig.olmo2_7B( vocab_size=common.tokenizer.padded_vocab_size(), compile=True, dp_config=TransformerDataParallelConfig( diff --git a/src/scripts/train/OLMoE-1B-7B.py b/src/scripts/train/OLMoE-1B-7B.py index 04081289..439633ac 100644 --- a/src/scripts/train/OLMoE-1B-7B.py +++ b/src/scripts/train/OLMoE-1B-7B.py @@ -24,7 +24,7 @@ def build_model_config(common: CommonComponents) -> TransformerConfig: - model_config = TransformerConfig.olmo_1B( + model_config = TransformerConfig.olmo2_1B( vocab_size=common.tokenizer.padded_vocab_size(), n_layers=16, n_heads=16, From 2d5cd3ea41a24a786a6c76d78d4ce9e6a547fd26 Mon Sep 17 00:00:00 2001 From: epwalsh Date: Wed, 27 Nov 2024 09:46:47 -0800 Subject: [PATCH 12/25] fix --- src/scripts/train/OLMo2-ladder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scripts/train/OLMo2-ladder.py b/src/scripts/train/OLMo2-ladder.py index f099bd07..92d891ee 100644 --- a/src/scripts/train/OLMo2-ladder.py +++ b/src/scripts/train/OLMo2-ladder.py @@ -36,7 +36,7 @@ class BaselineModelLadder(ModelLadder): } def get_model_config(self, *, size: ModelSize) -> TransformerConfig: - return getattr(TransformerConfig, f"olmo_{size}")( + return getattr(TransformerConfig, f"olmo2_{size}")( vocab_size=self.tokenizer.padded_vocab_size(), init_seed=self.init_seed, compile=True, From 
cfdcb448f627985e47cf7a2f660b130bdff93c17 Mon Sep 17 00:00:00 2001 From: epwalsh Date: Wed, 27 Nov 2024 09:59:32 -0800 Subject: [PATCH 13/25] adjust rank mbz on the fly --- src/olmo_core/internal/model_ladder.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/olmo_core/internal/model_ladder.py b/src/olmo_core/internal/model_ladder.py index 5557e737..d88a76fb 100644 --- a/src/olmo_core/internal/model_ladder.py +++ b/src/olmo_core/internal/model_ladder.py @@ -1,3 +1,4 @@ +import logging import sys from dataclasses import dataclass from typing import Callable, List, cast @@ -21,6 +22,8 @@ from .common import build_launch_config, get_gpu_type, get_root_dir +log = logging.getLogger(__name__) + @dataclass class LadderRunConfig(Config): @@ -116,6 +119,17 @@ def build_config( data_loader = ladder.get_data_loader_config(size=size) trainer = ladder.get_trainer_config(size=size, gpu_type=gpu_type) + # Make sure rank micro-batch size makes sense. + rank_mbz_instances = trainer.rank_microbatch_size // ladder.sequence_length + global_bz_instances = data_loader.global_batch_size // ladder.sequence_length + if rank_mbz_instances * dp_world_size > global_bz_instances: + new_rank_mbz_instances = global_bz_instances // dp_world_size + new_rank_mbz = new_rank_mbz_instances * ladder.sequence_length + log.warning( + f"Adjusting rank micro-batch size from {trainer.rank_microbatch_size:,d} tokens ({rank_mbz_instances:,d} instances) " + f"down to {new_rank_mbz:,d} tokens ({new_rank_mbz_instances:,d} instances)" + ) + return LadderRunConfig( launch=launch, ladder=ladder, From 811d044bd43ca9304284d263228c988f490d6a3e Mon Sep 17 00:00:00 2001 From: epwalsh Date: Wed, 27 Nov 2024 10:02:14 -0800 Subject: [PATCH 14/25] oops --- src/olmo_core/internal/model_ladder.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/olmo_core/internal/model_ladder.py b/src/olmo_core/internal/model_ladder.py index d88a76fb..d763a46c 100644 --- a/src/olmo_core/internal/model_ladder.py +++ b/src/olmo_core/internal/model_ladder.py @@ -125,6 +125,7 @@ def build_config( if rank_mbz_instances * dp_world_size > global_bz_instances: new_rank_mbz_instances = global_bz_instances // dp_world_size new_rank_mbz = new_rank_mbz_instances * ladder.sequence_length + trainer.rank_microbatch_size = new_rank_mbz log.warning( f"Adjusting rank micro-batch size from {trainer.rank_microbatch_size:,d} tokens ({rank_mbz_instances:,d} instances) " f"down to {new_rank_mbz:,d} tokens ({new_rank_mbz_instances:,d} instances)" From edc9a16aacdce3f0e6a3809d4b6ebab16bba0237 Mon Sep 17 00:00:00 2001 From: epwalsh Date: Wed, 27 Nov 2024 10:08:45 -0800 Subject: [PATCH 15/25] updates --- src/olmo_core/model_ladder.py | 4 ++-- src/scripts/train/OLMo2-ladder.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/olmo_core/model_ladder.py b/src/olmo_core/model_ladder.py index ec35542b..62c741dc 100644 --- a/src/olmo_core/model_ladder.py +++ b/src/olmo_core/model_ladder.py @@ -300,7 +300,7 @@ def get_trainer_config( tokenizer=self.tokenizer, work_dir=self.work_dir, ), - eval_interval=1000, + eval_interval=500, ), ) .with_callback( @@ -310,7 +310,7 @@ def get_trainer_config( task for task in list_tasks() if "_mc" not in task and "_var" not in task ], tokenizer=self.tokenizer, - eval_interval=250, + eval_interval=500, ), ) .with_callback( diff --git a/src/scripts/train/OLMo2-ladder.py b/src/scripts/train/OLMo2-ladder.py index 92d891ee..97606f60 100644 --- a/src/scripts/train/OLMo2-ladder.py +++ b/src/scripts/train/OLMo2-ladder.py @@ 
-20,8 +20,8 @@ class BaselineModelLadder(ModelLadder): MBZ_SIZES: ClassVar[Dict[ModelSize, int]] = { # TODO: may need to tune these # =============================== - ModelSize.size_190M: 32 * 4096, - ModelSize.size_370M: 32 * 4096, + ModelSize.size_190M: 16 * 4096, + ModelSize.size_370M: 16 * 4096, ModelSize.size_600M: 16 * 4096, ModelSize.size_760M: 16 * 4096, # =============================== @@ -69,7 +69,7 @@ def get_rank_microbatch_size(self, *, size: ModelSize, gpu_type: str) -> int: def build_ladder(root_dir: str) -> BaselineModelLadder: - save_folder = str(join_path(root_dir, f"checkpoints/{get_beaker_username().lower()}")) + save_folder = str(join_path(root_dir, f"checkpoints/{get_beaker_username().lower()}/ladder")) return BaselineModelLadder( name="OLMo2", project="OLMo2-model-ladder", From e49d262f3f1c38212c4d0921e8e042140643f043 Mon Sep 17 00:00:00 2001 From: epwalsh Date: Wed, 27 Nov 2024 10:53:52 -0800 Subject: [PATCH 16/25] fix --- src/olmo_core/internal/model_ladder.py | 14 +------- src/olmo_core/model_ladder.py | 49 +++++++++++++++++++++++++- 2 files changed, 49 insertions(+), 14 deletions(-) diff --git a/src/olmo_core/internal/model_ladder.py b/src/olmo_core/internal/model_ladder.py index d763a46c..206e5940 100644 --- a/src/olmo_core/internal/model_ladder.py +++ b/src/olmo_core/internal/model_ladder.py @@ -117,19 +117,7 @@ def build_config( optim = ladder.get_optim_config(size=size) dataset = ladder.get_dataset_config() data_loader = ladder.get_data_loader_config(size=size) - trainer = ladder.get_trainer_config(size=size, gpu_type=gpu_type) - - # Make sure rank micro-batch size makes sense. - rank_mbz_instances = trainer.rank_microbatch_size // ladder.sequence_length - global_bz_instances = data_loader.global_batch_size // ladder.sequence_length - if rank_mbz_instances * dp_world_size > global_bz_instances: - new_rank_mbz_instances = global_bz_instances // dp_world_size - new_rank_mbz = new_rank_mbz_instances * ladder.sequence_length - trainer.rank_microbatch_size = new_rank_mbz - log.warning( - f"Adjusting rank micro-batch size from {trainer.rank_microbatch_size:,d} tokens ({rank_mbz_instances:,d} instances) " - f"down to {new_rank_mbz:,d} tokens ({new_rank_mbz_instances:,d} instances)" - ) + trainer = ladder.get_trainer_config(size=size, gpu_type=gpu_type, dp_world_size=dp_world_size) return LadderRunConfig( launch=launch, diff --git a/src/olmo_core/model_ladder.py b/src/olmo_core/model_ladder.py index 62c741dc..f1a067a5 100644 --- a/src/olmo_core/model_ladder.py +++ b/src/olmo_core/model_ladder.py @@ -2,6 +2,8 @@ Configuration classes for defining model ladder scaling ablations. """ +import logging +import math from abc import ABCMeta, abstractmethod from dataclasses import dataclass, field from typing import Optional @@ -37,6 +39,8 @@ __all__ = ["ModelSize", "ModelLadder"] +log = logging.getLogger(__name__) + class ModelSize(StrEnum): """ @@ -208,7 +212,7 @@ def get_data_loader_config(self, *, size: ModelSize) -> NumpyDataLoaderConfig: @abstractmethod def get_rank_microbatch_size(self, *, size: ModelSize, gpu_type: str) -> int: """ - Returns the micro-batch size in tokens per device that should be used for the given + Returns the maximum micro-batch size in tokens per device that should be used for the given model size. :param size: The target model size. @@ -257,15 +261,23 @@ def get_trainer_config( *, size: ModelSize, gpu_type: str, + dp_world_size: int, ) -> TrainerConfig: """ Build the trainer config. :param size: The target model size. 
:param gpu_type: The type of GPU as given by ``torch.cuda.get_device_name()``. + :param dp_world_size: The data parallel world size. """ from olmo_eval import list_tasks + if dp_world_size > self.max_dp_world_size: + raise OLMoConfigurationError( + f"max_dp_world_size ({self.max_dp_world_size}) must be at least as big as current dp " + f"world size ({dp_world_size})" + ) + rank_mbz = self.get_rank_microbatch_size(size=size, gpu_type=gpu_type) if rank_mbz % self.sequence_length != 0: raise OLMoConfigurationError( @@ -273,6 +285,41 @@ def get_trainer_config( f"by the sequence length ({self.sequence_length:,d})" ) + rank_mbz_instances = rank_mbz // self.sequence_length + + global_bz = self.get_global_batch_size(size=size) + if global_bz % self.sequence_length != 0: + raise OLMoConfigurationError( + f"global batch size ({rank_mbz:,d} tokens) must be divisible " + f"by the sequence length ({self.sequence_length:,d})" + ) + + global_bz_instances = self.get_global_batch_size(size=size) // self.sequence_length + + if global_bz_instances % (rank_mbz_instances * dp_world_size) != 0: + new_rank_mbz_instances = global_bz_instances // dp_world_size + if new_rank_mbz_instances > rank_mbz_instances: + for divisor in range(2, new_rank_mbz_instances + 1): + if ( + new_rank_mbz_instances % divisor == 0 + and new_rank_mbz_instances // divisor < rank_mbz_instances + ): + new_rank_mbz_instances //= divisor + break + else: + raise RuntimeError("shouldn't get here") + + assert new_rank_mbz_instances <= rank_mbz_instances + assert global_bz_instances % (new_rank_mbz_instances * dp_world_size) == 0 + + new_rank_mbz = new_rank_mbz_instances * self.sequence_length + log.warning( + f"Adjusting rank micro-batch size from {rank_mbz:,d} tokens ({rank_mbz_instances:,d} instances) " + f"down to {new_rank_mbz:,d} tokens ({new_rank_mbz_instances:,d} instances) to be compatible " + "with data parallel world size" + ) + rank_mbz = new_rank_mbz + return ( TrainerConfig( save_folder=self.get_save_folder(size), From a5f47f6333051538d5704ad4193ba6557a1e6f21 Mon Sep 17 00:00:00 2001 From: epwalsh Date: Wed, 27 Nov 2024 10:54:22 -0800 Subject: [PATCH 17/25] clean up --- src/olmo_core/internal/model_ladder.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/olmo_core/internal/model_ladder.py b/src/olmo_core/internal/model_ladder.py index 206e5940..f74313f7 100644 --- a/src/olmo_core/internal/model_ladder.py +++ b/src/olmo_core/internal/model_ladder.py @@ -7,7 +7,6 @@ from olmo_core.config import Config, StrEnum from olmo_core.data import NumpyDataLoaderConfig, NumpyDatasetConfig -from olmo_core.exceptions import OLMoConfigurationError from olmo_core.launch.beaker import BeakerLaunchConfig from olmo_core.model_ladder import ModelLadder, ModelSize from olmo_core.nn.transformer import TransformerConfig @@ -106,11 +105,6 @@ def build_config( ).merge(overrides, strict=False) dp_world_size = launch.num_nodes * launch.num_gpus - if dp_world_size > ladder.max_dp_world_size: - raise OLMoConfigurationError( - f"max_dp_world_size ({ladder.max_dp_world_size}) must be at least as big as current dp " - f"world size ({dp_world_size})" - ) gpu_type = get_gpu_type(cluster) model = ladder.get_model_config(size=size) From b550c185770e36976f2d0dc3a1a2cc0a7b728a1d Mon Sep 17 00:00:00 2001 From: epwalsh Date: Wed, 27 Nov 2024 10:54:48 -0800 Subject: [PATCH 18/25] clean up again --- src/olmo_core/model_ladder.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/olmo_core/model_ladder.py b/src/olmo_core/model_ladder.py index f1a067a5..5c97c76b 
100644 --- a/src/olmo_core/model_ladder.py +++ b/src/olmo_core/model_ladder.py @@ -3,7 +3,6 @@ """ import logging -import math from abc import ABCMeta, abstractmethod from dataclasses import dataclass, field from typing import Optional From 35e6f872fe2a0c2ecc24188a677b85a786272310 Mon Sep 17 00:00:00 2001 From: epwalsh Date: Wed, 27 Nov 2024 11:22:58 -0800 Subject: [PATCH 19/25] generalize conversion script --- .../huggingface/convert_checkpoint.py | 35 +++++++++++++++---- 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/src/examples/huggingface/convert_checkpoint.py b/src/examples/huggingface/convert_checkpoint.py index 34ed07e1..5875061b 100644 --- a/src/examples/huggingface/convert_checkpoint.py +++ b/src/examples/huggingface/convert_checkpoint.py @@ -1,8 +1,8 @@ """ -Example script showing how you could convert model weights on HuggingFace for a Llama-3.2 model -into a format that can be loaded by OLMo-core for fine-tuning. +Example script showing how you could convert model weights on HuggingFace for an OLMo2 or Llama-3.* +model into a format that can be loaded by OLMo-core for fine-tuning. -Note that this script is architecture-dependent, meaning it may only work for Llama-3.2 models on +Note that this script is architecture-dependent, meaning it may only work for OLMo2/Llama models on HuggingFace. """ @@ -21,13 +21,36 @@ log = logging.getLogger(__name__) HF_MODEL = "meta-llama/Llama-3.2-1B" +# HF_MODEL = "meta-llama/Llama-3.2-8B" +# HF_MODEL = "allenai/OLMo-2-1124-7B-Instruct" +# HF_MODEL = "allenai/OLMo-2-1124-13B-Instruct" + SAVE_PATH = f"/tmp/checkpoints/{HF_MODEL}" SAVE_OVERWRITE = False TOKENIZER_CONFIG = TokenizerConfig.from_hf(HF_MODEL) -MODEL_CONFIG = TransformerConfig.llama3_1B( - TOKENIZER_CONFIG.vocab_size, fused_ops=False, use_flash=False, rope_scaling=RoPEScalingConfig() -) +MODEL_CONFIG: TransformerConfig +if HF_MODEL == "meta-llama/Llama-3.2-1B": + MODEL_CONFIG = TransformerConfig.llama3_1B( + TOKENIZER_CONFIG.vocab_size, + fused_ops=False, + use_flash=False, + rope_scaling=RoPEScalingConfig(), + ) +elif HF_MODEL == "allenai/OLMo-2-1124-7B-Instruct": + MODEL_CONFIG = TransformerConfig.olmo2_7B( + TOKENIZER_CONFIG.vocab_size, + fused_ops=False, + use_flash=False, + ) +elif HF_MODEL == "allenai/OLMo-2-1124-13B-Instruct": + MODEL_CONFIG = TransformerConfig.olmo2_7B( + TOKENIZER_CONFIG.vocab_size, + fused_ops=False, + use_flash=False, + ) +else: + raise NotImplementedError(HF_MODEL) def convert_checkpoint() -> AutoModelForCausalLM: From 717392751773666f2880fec3bf9dc199532c0e8b Mon Sep 17 00:00:00 2001 From: epwalsh Date: Wed, 27 Nov 2024 11:37:08 -0800 Subject: [PATCH 20/25] fixes --- .../huggingface/convert_checkpoint.py | 33 +++++++++++-------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/src/examples/huggingface/convert_checkpoint.py b/src/examples/huggingface/convert_checkpoint.py index 5875061b..07d7c050 100644 --- a/src/examples/huggingface/convert_checkpoint.py +++ b/src/examples/huggingface/convert_checkpoint.py @@ -21,7 +21,8 @@ log = logging.getLogger(__name__) HF_MODEL = "meta-llama/Llama-3.2-1B" -# HF_MODEL = "meta-llama/Llama-3.2-8B" +# HF_MODEL = "meta-llama/Llama-3.2-8B" +# HF_MODEL = "allenai/OLMo-2-1124-7B" # HF_MODEL = "allenai/OLMo-2-1124-7B-Instruct" # HF_MODEL = "allenai/OLMo-2-1124-13B-Instruct" @@ -37,14 +38,14 @@ use_flash=False, rope_scaling=RoPEScalingConfig(), ) -elif HF_MODEL == "allenai/OLMo-2-1124-7B-Instruct": +elif HF_MODEL.startswith("allenai/OLMo-2-1124-7B"): MODEL_CONFIG = TransformerConfig.olmo2_7B( 
TOKENIZER_CONFIG.vocab_size, fused_ops=False, use_flash=False, ) -elif HF_MODEL == "allenai/OLMo-2-1124-13B-Instruct": - MODEL_CONFIG = TransformerConfig.olmo2_7B( +elif HF_MODEL.startswith("allenai/OLMo-2-1124-13B"): + MODEL_CONFIG = TransformerConfig.olmo2_13B( TOKENIZER_CONFIG.vocab_size, fused_ops=False, use_flash=False, @@ -101,15 +102,21 @@ def convert_checkpoint() -> AutoModelForCausalLM: f"model.layers.{block}.mlp.up_proj.weight" ) - # Attention layer norm. - new_state_dict[f"blocks.{block}.attention_norm.weight"] = state_dict.pop( - f"model.layers.{block}.input_layernorm.weight" - ) - - # MLP layer norm. - new_state_dict[f"blocks.{block}.feed_forward_norm.weight"] = state_dict.pop( - f"model.layers.{block}.post_attention_layernorm.weight" - ) + # Layer norms. + if "Llama" in HF_MODEL: + new_state_dict[f"blocks.{block}.feed_forward_norm.weight"] = state_dict.pop( + f"model.layers.{block}.post_attention_layernorm.weight" + ) + new_state_dict[f"blocks.{block}.attention_norm.weight"] = state_dict.pop( + f"model.layers.{block}.input_layernorm.weight" + ) + else: + new_state_dict[f"blocks.{block}.feed_forward_norm.weight"] = state_dict.pop( + f"model.layers.{block}.post_attention_layernorm.weight" + ) + new_state_dict[f"blocks.{block}.feed_forward_norm.weight"] = state_dict.pop( + f"model.layers.{block}.post_feed_forward_norm.weight" + ) assert len(state_dict) == 0 From e08627416a2188c74e3d2b48678c8161231c417a Mon Sep 17 00:00:00 2001 From: epwalsh Date: Wed, 27 Nov 2024 11:39:25 -0800 Subject: [PATCH 21/25] refactor to save mem --- src/examples/huggingface/convert_checkpoint.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/examples/huggingface/convert_checkpoint.py b/src/examples/huggingface/convert_checkpoint.py index 07d7c050..2b0ad698 100644 --- a/src/examples/huggingface/convert_checkpoint.py +++ b/src/examples/huggingface/convert_checkpoint.py @@ -131,18 +131,22 @@ def validate_conversion(hf_model): device = get_default_device() - model = MODEL_CONFIG.build(device=device, max_seq_len=131072).eval() - load_model_and_optim_state(SAVE_PATH, model) + B, T = 1, 120 + input_ids = torch.randint(0, TOKENIZER_CONFIG.vocab_size, (B, T)).to(device) hf_model = hf_model.to(device).eval() + with torch.no_grad(): + hf_logits, *_ = hf_model(input_ids=input_ids, return_dict=False) - B, T = 1, 120 - input_ids = torch.randint(0, TOKENIZER_CONFIG.vocab_size, (B, T)).to(device) + del hf_model + + model = MODEL_CONFIG.build(device=device, max_seq_len=131072).eval() + load_model_and_optim_state(SAVE_PATH, model) with torch.no_grad(): logits = model(input_ids=input_ids) - hf_logits, *_ = hf_model(input_ids=input_ids, return_dict=False) - torch.testing.assert_close(hf_logits, logits) + + torch.testing.assert_close(hf_logits, logits) log.info("Conversion successful") From 78de663b4e3b1ac787a1fabb7016616a94475ef3 Mon Sep 17 00:00:00 2001 From: epwalsh Date: Wed, 27 Nov 2024 11:46:26 -0800 Subject: [PATCH 22/25] fix --- src/examples/huggingface/convert_checkpoint.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/examples/huggingface/convert_checkpoint.py b/src/examples/huggingface/convert_checkpoint.py index 2b0ad698..8d2863a2 100644 --- a/src/examples/huggingface/convert_checkpoint.py +++ b/src/examples/huggingface/convert_checkpoint.py @@ -111,11 +111,11 @@ def convert_checkpoint() -> AutoModelForCausalLM: f"model.layers.{block}.input_layernorm.weight" ) else: - new_state_dict[f"blocks.{block}.feed_forward_norm.weight"] = state_dict.pop( 
+ new_state_dict[f"blocks.{block}.attention_norm.weight"] = state_dict.pop( f"model.layers.{block}.post_attention_layernorm.weight" ) new_state_dict[f"blocks.{block}.feed_forward_norm.weight"] = state_dict.pop( - f"model.layers.{block}.post_feed_forward_norm.weight" + f"model.layers.{block}.post_feedforward_layernorm.weight" ) assert len(state_dict) == 0 From 23c47d45943e8246795d5f9d08ea7eba3c42894b Mon Sep 17 00:00:00 2001 From: epwalsh Date: Wed, 27 Nov 2024 11:48:15 -0800 Subject: [PATCH 23/25] add QK norms --- src/examples/huggingface/convert_checkpoint.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/examples/huggingface/convert_checkpoint.py b/src/examples/huggingface/convert_checkpoint.py index 8d2863a2..4f2eafef 100644 --- a/src/examples/huggingface/convert_checkpoint.py +++ b/src/examples/huggingface/convert_checkpoint.py @@ -20,11 +20,11 @@ log = logging.getLogger(__name__) -HF_MODEL = "meta-llama/Llama-3.2-1B" -# HF_MODEL = "meta-llama/Llama-3.2-8B" -# HF_MODEL = "allenai/OLMo-2-1124-7B" +HF_MODEL = "allenai/OLMo-2-1124-7B" # HF_MODEL = "allenai/OLMo-2-1124-7B-Instruct" # HF_MODEL = "allenai/OLMo-2-1124-13B-Instruct" +# HF_MODEL = "meta-llama/Llama-3.2-1B" +# HF_MODEL = "meta-llama/Llama-3.2-8B" SAVE_PATH = f"/tmp/checkpoints/{HF_MODEL}" SAVE_OVERWRITE = False @@ -117,6 +117,12 @@ def convert_checkpoint() -> AutoModelForCausalLM: new_state_dict[f"blocks.{block}.feed_forward_norm.weight"] = state_dict.pop( f"model.layers.{block}.post_feedforward_layernorm.weight" ) + new_state_dict[f"blocks.{block}.attention.q_norm.weight"] = state_dict.pop( + f"model.layers.{block}.self_attn.q_norm.weight" + ) + new_state_dict[f"blocks.{block}.attention.k_norm.weight"] = state_dict.pop( + f"model.layers.{block}.self_attn.k_norm.weight" + ) assert len(state_dict) == 0 From 78d9c58a4347cb7dff070e91da150d857c9c52a2 Mon Sep 17 00:00:00 2001 From: epwalsh Date: Wed, 27 Nov 2024 11:52:20 -0800 Subject: [PATCH 24/25] clean up --- src/examples/huggingface/convert_checkpoint.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/examples/huggingface/convert_checkpoint.py b/src/examples/huggingface/convert_checkpoint.py index 4f2eafef..6559ee43 100644 --- a/src/examples/huggingface/convert_checkpoint.py +++ b/src/examples/huggingface/convert_checkpoint.py @@ -133,8 +133,6 @@ def convert_checkpoint() -> AutoModelForCausalLM: def validate_conversion(hf_model): - log.info("Loading converted checkpoint for validation...") - device = get_default_device() B, T = 1, 120 @@ -147,6 +145,8 @@ def validate_conversion(hf_model): del hf_model model = MODEL_CONFIG.build(device=device, max_seq_len=131072).eval() + + log.info("Loading converted checkpoint for validation...") load_model_and_optim_state(SAVE_PATH, model) with torch.no_grad(): From 4b0e1d2f7a86c56ce9ac51f9fb4e0b9b40737ec0 Mon Sep 17 00:00:00 2001 From: epwalsh Date: Wed, 27 Nov 2024 11:54:06 -0800 Subject: [PATCH 25/25] docs --- docs/source/examples/huggingface.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/examples/huggingface.rst b/docs/source/examples/huggingface.rst index 68d88446..01baa543 100644 --- a/docs/source/examples/huggingface.rst +++ b/docs/source/examples/huggingface.rst @@ -7,7 +7,7 @@ One way to do this would be to manually apply a data parallel wrapper (like DDP Instead we recommend converting your HuggingFace checkpoint into a format that can be loaded into an equivalent OLMo-core :class:`~olmo_core.nn.transformer.Transformer` model, when 
possible, using the functions provided by :mod:`olmo_core.distributed.checkpoint`. -Below is an example that shows how to convert a Llama-3.2 checkpoint on HuggingFace into the right format for OLMo-core. +Below is an example that shows how to convert an OLMo2 or Llama-3 checkpoint on HuggingFace into the right format for OLMo-core. It would be straight forward to adapt this script to convert in the other direction as well. .. seealso::