diff --git a/bionemo-recipes/recipes/llama3_native_te/eval_downstream.py b/bionemo-recipes/recipes/llama3_native_te/eval_downstream.py new file mode 100644 index 0000000000..1f277437c1 --- /dev/null +++ b/bionemo-recipes/recipes/llama3_native_te/eval_downstream.py @@ -0,0 +1,562 @@ +#!/usr/bin/env python + +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-Apache2 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Evaluate a trained Llama checkpoint on downstream NLP benchmarks using lm-eval. + +Supports loading from: + 1. A consolidated final_model directory (model.safetensors + config.json) + 2. A distributed FSDP2 training checkpoint (step_N directory) + 3. 
A DDP checkpoint (checkpoint.pt file) + +Examples: + # From a consolidated final_model (single GPU, no torchrun needed): + python eval_downstream.py \ + --checkpoint-path /path/to/ckpt_dir/train_fsdp2/final_model + + # From a distributed FSDP2 checkpoint (needs torchrun for weight gathering): + torchrun --nproc_per_node=1 eval_downstream.py \ + --checkpoint-path /path/to/ckpt_dir/train_fsdp2/ \ + --from-distributed \ + --model-config ./model_configs/lingua-1B + + # Specific step from a distributed checkpoint: + torchrun --nproc_per_node=2 eval_downstream.py \ + --checkpoint-path /path/to/ckpt_dir/train_fsdp2/ \ + --from-distributed \ + --checkpoint-step 20000 \ + --model-config ./model_configs/lingua-1B + + # Custom tasks and batch size: + python eval_downstream.py \ + --checkpoint-path /path/to/final_model \ + --tasks arc_easy,hellaswag \ + --batch-size 16 + + # Save results to a file: + python eval_downstream.py \ + --checkpoint-path /path/to/final_model \ + --output-path ./eval_results +""" + +from __future__ import annotations + +import argparse +import json +import logging +import os +import shutil +import subprocess +import sys +import tempfile +import time +from pathlib import Path + + +logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s") +logger = logging.getLogger(__name__) + +DOWNSTREAM_TASKS = "arc_challenge,arc_easy,boolq,copa,hellaswag,piqa,winogrande" + + +# --------------------------------------------------------------------------- +# Checkpoint discovery (adapted from opengenome2 evaluate_fasta_lm_loss.py) +# --------------------------------------------------------------------------- + + +def find_checkpoint_path(checkpoint_dir: str, step: int | None = None) -> tuple[Path, str]: + """Locate the checkpoint inside *checkpoint_dir* and return ``(path, type)``. 
+ + Supports: + - ``safetensors``: directory with ``model.safetensors`` + - ``dcp``: FSDP2 distributed checkpoint (``step_N/`` with ``.metadata``) + - ``ddp``: DDP checkpoint (``checkpoint.pt``) + + Args: + checkpoint_dir: Root checkpoint directory. + step: Specific step to load. If None, uses the latest. + + Returns: + Tuple of (checkpoint_path, checkpoint_type). + """ + root = Path(checkpoint_dir) + + for candidate in [root, root / "final_model", root / "train_fsdp2" / "final_model"]: + if (candidate / "model.safetensors").exists(): + return candidate, "safetensors" + + fsdp2_dir = root / "train_fsdp2" if (root / "train_fsdp2").exists() else root + step_dirs = sorted( + [d for d in fsdp2_dir.iterdir() if d.is_dir() and d.name.startswith("step_")], + key=lambda d: int(d.name.split("_")[1]), + ) + if step_dirs: + if step is not None: + target = fsdp2_dir / f"step_{step}" + if not target.exists(): + raise FileNotFoundError(f"step_{step} not found. Available: {[d.name for d in step_dirs]}") + chosen = target + else: + chosen = step_dirs[-1] + if (chosen / ".metadata").exists() or any(chosen.glob("*.distcp")): + return chosen, "dcp" + if (chosen / "checkpoint.pt").exists(): + return chosen, "ddp" + return chosen, "dcp" + + if (root / "checkpoint.pt").exists(): + return root, "ddp" + if (root / ".metadata").exists() or any(root.glob("*.distcp")): + return root, "dcp" + + raise FileNotFoundError(f"No recognisable checkpoint in {checkpoint_dir}") + + +# --------------------------------------------------------------------------- +# DCP loading helpers (self-contained, avoids importing checkpoint.py which +# may have TE version-specific imports) +# --------------------------------------------------------------------------- + + +def _build_app_state(model, optimizer, scheduler): + """Build a Stateful wrapper compatible with the training checkpoint format.""" + from dataclasses import dataclass + + from torch.distributed.checkpoint.state_dict import StateDictOptions, 
get_state_dict, set_state_dict + from torch.distributed.checkpoint.stateful import Stateful + + @dataclass + class _AppState(Stateful): + model: object + optimizer: object + scheduler: object + step: int = 0 + epoch: int = 0 + + def state_dict(self): + model_state_dict, optimizer_state_dict = get_state_dict(self.model, self.optimizer) + model_state_dict = {k: v for k, v in model_state_dict.items() if not k.endswith("_extra_state")} + return { + "model": model_state_dict, + "optim": optimizer_state_dict, + "scheduler": self.scheduler.state_dict(), + "step": self.step, + "epoch": self.epoch, + } + + def load_state_dict(self, state_dict: dict): + set_state_dict( + self.model, + self.optimizer, + model_state_dict=state_dict["model"], + optim_state_dict=state_dict["optim"], + options=StateDictOptions(strict=False), + ) + self.scheduler.load_state_dict(state_dict["scheduler"]) + self.step = state_dict["step"] + self.epoch = state_dict["epoch"] + + return _AppState(model=model, optimizer=optimizer, scheduler=scheduler) + + +def _get_lenient_load_planner(): + """Return a load planner that skips keys missing from the checkpoint. + + Handles checkpoints saved without TransformerEngine _extra_state keys + (FP8 metadata). These keys are registered by newer TE versions even when + FP8 is disabled, but older checkpoints don't contain them. + """ + from torch.distributed.checkpoint.default_planner import DefaultLoadPlanner + + class _LenientLoadPlanner(DefaultLoadPlanner): + def create_local_plan(self): + missing_keys = [fqn for fqn in self.state_dict if fqn not in self.metadata.state_dict_metadata] + if missing_keys: + logger.warning( + "Skipping %d keys not found in checkpoint: %s%s", + len(missing_keys), + missing_keys[:5], + "..." 
if len(missing_keys) > 5 else "", + ) + for key in missing_keys: + del self.state_dict[key] + return super().create_local_plan() + + return _LenientLoadPlanner() + + +# --------------------------------------------------------------------------- +# Distributed checkpoint export +# --------------------------------------------------------------------------- + + +def export_distributed_checkpoint( + checkpoint_dir: str, + model_config: str, + output_path: str, + checkpoint_step: int | None = None, +) -> bool: + """Load a distributed checkpoint and export consolidated weights. + + Auto-detects checkpoint format (safetensors, DCP, DDP). For DCP checkpoints, + must be called inside a torchrun context. All ranks participate in loading + and gathering, but only rank 0 saves the exported model. + + Args: + checkpoint_dir: Root checkpoint directory. + model_config: Path to model config (e.g. ./model_configs/lingua-1B). + output_path: Directory to save the consolidated model. + checkpoint_step: Specific step to load (latest if None). + + Returns: + True if this is rank 0 (should continue to evaluation), False otherwise. 
+ """ + import torch + from safetensors.torch import load_file, save_file + from torch.distributed.checkpoint.state_dict import StateDictOptions, get_model_state_dict + from torch.distributed.checkpoint.state_dict_loader import load as dcp_load + from torch.distributed.device_mesh import init_device_mesh + from torch.distributed.fsdp import fully_shard + + from distributed_config import DistributedConfig + from modeling_llama_te import NVLlamaConfig, NVLlamaForCausalLM + from scheduler import get_cosine_annealing_schedule_with_warmup + + dist_config = DistributedConfig() + device = torch.device(f"cuda:{dist_config.local_rank}") + torch.distributed.init_process_group(backend="cpu:gloo,cuda:nccl", device_id=device) + torch.cuda.set_device(dist_config.local_rank) + device_mesh = init_device_mesh("cuda", mesh_shape=(dist_config.world_size,), mesh_dim_names=("dp",)) + + ckpt_path, ckpt_type = find_checkpoint_path(checkpoint_dir, checkpoint_step) + if dist_config.rank == 0: + logger.info("Resolved checkpoint: %s (type=%s)", ckpt_path, ckpt_type) + + config = NVLlamaConfig.from_pretrained(model_config, dtype=torch.bfloat16, attn_input_format="bshd") + model = NVLlamaForCausalLM(config) + if dist_config.rank == 0: + logger.info("Model created (%s parameters)", f"{sum(p.numel() for p in model.parameters()):,}") + + # For safetensors/DDP: load weights BEFORE FSDP2 wrapping (plain Tensor → plain Parameter) + if ckpt_type == "safetensors": + weights = load_file(str(ckpt_path / "model.safetensors")) + model.load_state_dict(weights, strict=False) + if dist_config.rank == 0: + logger.info("Loaded safetensors checkpoint") + elif ckpt_type == "ddp": + ckpt = torch.load(ckpt_path / "checkpoint.pt", map_location="cpu", weights_only=True) + model.load_state_dict(ckpt["model"], strict=False) + if dist_config.rank == 0: + logger.info("Loaded DDP checkpoint (step=%d)", ckpt.get("step", -1)) + + # FSDP2 wrapping + for layer in model.model.layers: + fully_shard(layer, 
mesh=device_mesh["dp"]) + fully_shard(model, mesh=device_mesh["dp"]) + + # For DCP: load AFTER FSDP2 wrapping (DCP handles DTensor resharding) + if ckpt_type == "dcp": + if dist_config.rank == 0: + logger.info("Loading FSDP2 DCP checkpoint from %s", ckpt_path) + optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5) + scheduler = get_cosine_annealing_schedule_with_warmup(optimizer, num_warmup_steps=100, num_decay_steps=1000) + app_state = _build_app_state(model, optimizer, scheduler) + dcp_load( + {"app": app_state}, + checkpoint_id=ckpt_path, + process_group=device_mesh.get_group("dp"), + planner=_get_lenient_load_planner(), + ) + if dist_config.rank == 0: + logger.info("DCP checkpoint loaded (step=%d, epoch=%d)", app_state.step, app_state.epoch) + + # Gather full model state dict from all ranks + model_state_dict = get_model_state_dict( + model=model, + options=StateDictOptions(full_state_dict=True, cpu_offload=True), + ) + + if dist_config.is_main_process(): + os.makedirs(output_path, exist_ok=True) + save_file(model_state_dict, os.path.join(output_path, "model.safetensors")) + config.save_pretrained(output_path) + logger.info("Exported consolidated model to %s", output_path) + + torch.distributed.barrier() + torch.distributed.destroy_process_group() + + return dist_config.is_main_process() + + +# --------------------------------------------------------------------------- +# Eval directory preparation +# --------------------------------------------------------------------------- + + +def prepare_eval_directory(checkpoint_path: str, output_path: str, tokenizer_name: str) -> str: + """Prepare a checkpoint directory with all files lm-eval needs. + + Copies model files, patches config.json with auto_map and inference-compatible + attention settings, copies modeling_llama_te.py, and saves the tokenizer. + + Args: + checkpoint_path: Source directory with model.safetensors + config.json. + output_path: Destination directory for the eval-ready checkpoint. 
+ tokenizer_name: HuggingFace tokenizer name or local path. + + Returns: + The output_path string. + """ + from transformers import AutoTokenizer + + from modeling_llama_te import AUTO_MAP + + checkpoint_path_obj = Path(checkpoint_path) + output_path_obj = Path(output_path) + + if output_path_obj.resolve() != checkpoint_path_obj.resolve(): + os.makedirs(output_path, exist_ok=True) + for f in checkpoint_path_obj.iterdir(): + if f.is_file(): + shutil.copy2(f, output_path_obj / f.name) + + config_file = output_path_obj / "config.json" + with open(config_file) as f: + config = json.load(f) + + config["auto_map"] = AUTO_MAP + config["attn_input_format"] = "bshd" + config["self_attn_mask_type"] = "causal" + + with open(config_file, "w") as f: + json.dump(config, f, indent=2, sort_keys=True) + + script_dir = Path(__file__).parent + shutil.copy2(script_dir / "modeling_llama_te.py", output_path_obj / "modeling_llama_te.py") + + tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) + tokenizer.save_pretrained(str(output_path_obj)) + + logger.info("Prepared eval directory: %s", output_path) + return output_path + + +# --------------------------------------------------------------------------- +# lm-eval runner +# --------------------------------------------------------------------------- + + +def run_lm_eval( + eval_dir: str, + tasks: str, + batch_size: str, + device: str, + output_path: str | None = None, + num_fewshot: int | None = None, +) -> float: + """Run lm-eval on the prepared checkpoint directory. + + Args: + eval_dir: Path to the prepared eval checkpoint directory. + tasks: Comma-separated list of lm-eval task names. + batch_size: Batch size string (integer or "auto"). + device: Device string (e.g. "cuda:0"). + output_path: Optional path to save results JSON. + num_fewshot: Optional number of few-shot examples. + + Returns: + Wall-clock time in seconds. 
+ """ + cmd = [ + sys.executable, + "-m", + "lm_eval", + "--model", + "hf", + "--model_args", + f"pretrained={eval_dir},tokenizer={eval_dir}", + "--trust_remote_code", + "--tasks", + tasks, + "--device", + device, + "--batch_size", + batch_size, + ] + + if output_path: + cmd.extend(["--output_path", output_path]) + + if num_fewshot is not None: + cmd.extend(["--num_fewshot", str(num_fewshot)]) + + logger.info("Running lm-eval:\n %s", " ".join(cmd)) + print("=" * 80) + + start_time = time.time() + result = subprocess.run(cmd, check=False) + elapsed = time.time() - start_time + + print("=" * 80) + logger.info("lm-eval completed in %.1fs (%.1f min)", elapsed, elapsed / 60) + + if result.returncode != 0: + logger.error("lm-eval failed with exit code %d", result.returncode) + sys.exit(result.returncode) + + return elapsed + + +# --------------------------------------------------------------------------- +# CLI +# --------------------------------------------------------------------------- + + +def parse_args() -> argparse.Namespace: + """Parse command line arguments.""" + parser = argparse.ArgumentParser( + description="Evaluate a trained checkpoint on downstream NLP tasks with lm-eval.", + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument( + "--checkpoint-path", + type=str, + required=True, + help="Path to checkpoint directory. 
Auto-detects format (final_model, step_N, etc).", + ) + parser.add_argument( + "--checkpoint-step", + type=int, + default=None, + help="Specific training step to evaluate (latest if omitted).", + ) + parser.add_argument( + "--tokenizer", + type=str, + default="meta-llama/Meta-Llama-3-8B", + help="Tokenizer name or path (default: meta-llama/Meta-Llama-3-8B).", + ) + parser.add_argument( + "--tasks", + type=str, + default=DOWNSTREAM_TASKS, + help=f"Comma-separated lm-eval task names (default: {DOWNSTREAM_TASKS}).", + ) + parser.add_argument( + "--batch-size", + type=str, + default="auto", + help="Batch size for lm-eval. Use 'auto' for automatic selection (default: auto).", + ) + parser.add_argument( + "--device", + type=str, + default="cuda:0", + help="Device for lm-eval inference (default: cuda:0).", + ) + parser.add_argument( + "--eval-dir", + type=str, + default=None, + help="Directory to store the prepared eval checkpoint. Uses a temp directory if not set.", + ) + parser.add_argument( + "--from-distributed", + action="store_true", + help="Export from a distributed checkpoint before evaluating. 
Requires torchrun.", + ) + parser.add_argument( + "--model-config", + type=str, + default="./model_configs/lingua-1B", + help="Model config path for --from-distributed (default: ./model_configs/lingua-1B).", + ) + parser.add_argument( + "--output-path", + type=str, + default=None, + help="Path to save lm-eval results JSON.", + ) + parser.add_argument( + "--num-fewshot", + type=int, + default=None, + help="Number of few-shot examples (default: lm-eval task default).", + ) + return parser.parse_args() + + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- + + +def main() -> None: + """Main entry point.""" + args = parse_args() + + use_temp = args.eval_dir is None + eval_dir = args.eval_dir if args.eval_dir else tempfile.mkdtemp(prefix="lm_eval_checkpoint_") + + if use_temp: + logger.info("Using temporary eval directory: %s", eval_dir) + + try: + if args.from_distributed: + is_main = export_distributed_checkpoint( + checkpoint_dir=args.checkpoint_path, + model_config=args.model_config, + output_path=eval_dir, + checkpoint_step=args.checkpoint_step, + ) + if not is_main: + return + source_dir = eval_dir + else: + ckpt_path, ckpt_type = find_checkpoint_path(args.checkpoint_path, args.checkpoint_step) + if ckpt_type != "safetensors": + logger.error( + "Found %s checkpoint at %s. 
Non-safetensors checkpoints require " + "--from-distributed flag with torchrun.", + ckpt_type, + ckpt_path, + ) + sys.exit(1) + source_dir = str(ckpt_path) + + prepare_eval_directory( + checkpoint_path=source_dir, + output_path=eval_dir, + tokenizer_name=args.tokenizer, + ) + + run_lm_eval( + eval_dir=eval_dir, + tasks=args.tasks, + batch_size=args.batch_size, + device=args.device, + output_path=args.output_path, + num_fewshot=args.num_fewshot, + ) + + finally: + if use_temp and os.path.exists(eval_dir): + logger.info("Cleaning up temporary directory: %s", eval_dir) + shutil.rmtree(eval_dir) + + +if __name__ == "__main__": + main() diff --git a/bionemo-recipes/recipes/llama3_native_te/lepton_backup/lepton_configs/og2_fp8_refactor_all_fp8_fp32mw.yaml b/bionemo-recipes/recipes/llama3_native_te/lepton_backup/lepton_configs/og2_fp8_refactor_all_fp8_fp32mw.yaml new file mode 100644 index 0000000000..52e3303445 --- /dev/null +++ b/bionemo-recipes/recipes/llama3_native_te/lepton_backup/lepton_configs/og2_fp8_refactor_all_fp8_fp32mw.yaml @@ -0,0 +1,70 @@ +# OpenGenome2 7B - FP8 Refactor Branch Test +# Same settings as og2-7b-fp32mw-pq2-cfi-false-lf100 but: +# - On savitha/og2-fp8-refactor branch (FP8 logic moved into model) +# - FP8 enabled on ALL layers (including first/last, no BF16 override) +# - FP32 master weights +# - No CP (standard FSDP2 only) +# +# Data: /data/opengenome2/parquet2 +# 6 nodes H100, THD format, GQA, FP8 + FP32 master weights +# GBS = mbs * grad_acc * dp_size = 1 * 8 * 48 = 384 +defaults: + - _self_ + +job_name: "og2-7b-fp8-refactor-all-fp8-fp32mw" +node_group: "yo-bom-lepton-001" +resource_shape: "gpu.8xh100-sxm" + +num_nodes: 6 +gpus_per_node: 8 +num_train_steps: 182314 +micro_batch_size: 1 +grad_acc_steps: 8 + +dataset_path: "/data/opengenome2/parquet2" +data_dir: "" +num_workers: 8 +buffer_size: 10000 + +repo_root: "/data/savithas/bionemo-framework" +code_path: "/data/savithas/bionemo-framework/bionemo-recipes/recipes/opengenome2_llama_native_te" 
+train_script: "train_fsdp2.py" +hydra_config: "og2_7b_thd_gqa" + +git_branch: "savitha/og2-fp8-refactor" + +validation_enabled: false + +spike_no_more_embedding_init: true +skip_embedding_weight_decay: true +use_megatron_scaled_init: true +use_weight_decay_grouping: true +use_meta_device: false + +# FP8 enabled on ALL layers (fp8_first_last_bf16 stays false in base config) +fp8_enabled: true +fp8_recipe: transformer_engine.common.recipe.Float8BlockScaling +fp8_format: E4M3 +use_fp32_master_weights: true + +logger_frequency: 100 + +checkpoint_dir: "/data/savithas/checkpoints/og2-7b-fp8-refactor-all-fp8-fp32mw" # pragma: allowlist secret +save_every_n_steps: 5000 +async_save: false + +wandb_project: "llama3-metagenome-7b" +wandb_name: "og2-7b-fp8-refactor-all-fp8-fp32mw" +wandb_secret: "wandb.savithas" # pragma: allowlist secret + +hf_secret: "HUGGING_FACE_HUB_TOKEN.savithas" # pragma: allowlist secret + +exclude_nodes: + - node-ip-10-50-80-195 + - node-ip-10-50-81-231 + - nvidia-lepton093 + - nvidia-lepton007 + +container: + image: "nvcr.io/nvidia/pytorch:25.11-py3" + registry_auth: "lepton-nvidia-cvai-bnmo-trng" diff --git a/bionemo-recipes/recipes/llama3_native_te/lepton_backup/submit_og2_lepton_eden.py b/bionemo-recipes/recipes/llama3_native_te/lepton_backup/submit_og2_lepton_eden.py new file mode 100644 index 0000000000..a39c5cc320 --- /dev/null +++ b/bionemo-recipes/recipes/llama3_native_te/lepton_backup/submit_og2_lepton_eden.py @@ -0,0 +1,463 @@ +#!/usr/bin/env python3 + +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-Apache2 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Lepton Job submission script for OpenGenome2 training (v2 with git checkout support).""" + +import hydra +from leptonai.api.v1.types.affinity import LeptonResourceAffinity +from leptonai.api.v1.types.common import Metadata +from leptonai.api.v1.types.deployment import EnvValue, EnvVar, LeptonContainer +from leptonai.api.v1.types.job import LeptonJob, LeptonJobUserSpec +from leptonai.api.v2.client import APIClient +from omegaconf import DictConfig, OmegaConf + + +def _resolve_scheduling_target(client, cfg: DictConfig): + """Resolve node group and resource shape.""" + desired_node_group = str(cfg.node_group).strip() + resource_shape = str(cfg.resource_shape).strip() + + node_groups = client.nodegroup.list_all() + node_group_map = {ng.metadata.name: ng for ng in node_groups} + + if desired_node_group not in node_group_map: + available = ", ".join(sorted(node_group_map.keys())) + raise SystemExit(f"Node group '{desired_node_group}' not found.\nAvailable: {available}") + + chosen_group = node_group_map[desired_node_group] + valid_node_ids = {n.metadata.id_ for n in client.nodegroup.list_nodes(chosen_group.metadata.id_)} + + # Filter out excluded nodes if specified + if getattr(cfg, "exclude_nodes", None): + exclude_set = set(cfg.exclude_nodes) + original_count = len(valid_node_ids) + valid_node_ids = valid_node_ids - exclude_set + if original_count != len(valid_node_ids): + print(f" ⚠️ Excluding {original_count - len(valid_node_ids)} nodes: {sorted(exclude_set)}") + print(f" ✓ Remaining valid nodes: {len(valid_node_ids)}") + if len(valid_node_ids) < 
cfg.num_nodes: + raise SystemExit( + f"ERROR: Need {cfg.num_nodes} nodes but only {len(valid_node_ids)} available after exclusions!" + ) + + return chosen_group, valid_node_ids, resource_shape + + +def launch_single_job(client, cfg: DictConfig): + """Launch a single multinode job.""" + chosen_group, valid_node_ids, resource_shape = _resolve_scheduling_target(client, cfg) + + # Build Hydra overrides + hydra_overrides = [ + f"dataset.micro_batch_size={cfg.micro_batch_size}", + f"grad_acc_steps={cfg.grad_acc_steps}", + f"num_train_steps={cfg.num_train_steps}", + f'dataset.load_dataset_kwargs.path="{cfg.dataset_path}"', + ] + + # Add data_dir if present + if cfg.get("data_dir") and cfg.data_dir: + hydra_overrides.append(f'dataset.load_dataset_kwargs.data_dir="{cfg.data_dir}"') + + # Add checkpoint config + resume_from_checkpoint = cfg.get("resume_from_checkpoint", False) + async_save = cfg.get("async_save", False) + hydra_overrides.extend( + [ + f"checkpoint.ckpt_dir={cfg.checkpoint_dir}", + f"checkpoint.save_every_n_steps={cfg.save_every_n_steps}", + f"checkpoint.resume_from_checkpoint={str(resume_from_checkpoint).lower()}", + f"checkpoint.async_save={str(async_save).lower()}", + ] + ) + + # Add FP8 config if present + if "fp8_enabled" in cfg: + hydra_overrides.append(f"fp8_config.enabled={str(cfg.fp8_enabled).lower()}") + if "fp8_recipe" in cfg: + hydra_overrides.append(f"fp8_config.fp8_recipe={cfg.fp8_recipe}") + if "fp8_format" in cfg: + hydra_overrides.append(f"fp8_config.fp8_format={cfg.fp8_format}") + + # Add init settings if specified in config + if "spike_no_more_embedding_init" in cfg: + hydra_overrides.append(f"spike_no_more_embedding_init={str(cfg.spike_no_more_embedding_init).lower()}") + if "skip_embedding_weight_decay" in cfg: + hydra_overrides.append(f"skip_embedding_weight_decay={str(cfg.skip_embedding_weight_decay).lower()}") + if "use_megatron_scaled_init" in cfg: + 
hydra_overrides.append(f"use_megatron_scaled_init={str(cfg.use_megatron_scaled_init).lower()}") + if "use_weight_decay_grouping" in cfg: + hydra_overrides.append(f"use_weight_decay_grouping={str(cfg.use_weight_decay_grouping).lower()}") + if "use_megatron_loss_reduction" in cfg: + hydra_overrides.append(f"++use_megatron_loss_reduction={str(cfg.use_megatron_loss_reduction).lower()}") + + # Add meta device init setting + if "use_meta_device" in cfg: + hydra_overrides.append(f"++use_meta_device={str(cfg.use_meta_device).lower()}") + + # Add FP32 master weights setting (use ++ to add or override) + if "use_fp32_master_weights" in cfg: + hydra_overrides.append(f"++use_fp32_master_weights={str(cfg.use_fp32_master_weights).lower()}") + + # Add num_workers if specified + if "num_workers" in cfg: + hydra_overrides.append(f"dataset.num_workers={cfg.num_workers}") + + # Add buffer_size if specified (for shuffle buffer override) + if "buffer_size" in cfg: + hydra_overrides.append(f"dataset.buffer_size={cfg.buffer_size}") + + # Add shuffle_sequences if specified (for dual shuffling) + if "shuffle_sequences" in cfg: + hydra_overrides.append(f"dataset.shuffle_sequences={str(cfg.shuffle_sequences).lower()}") + + # Add logger frequency (support both naming conventions) + if "logger_frequency" in cfg: + hydra_overrides.append(f"logger.frequency={cfg.logger_frequency}") + elif "log_frequency" in cfg: + hydra_overrides.append(f"logger.frequency={cfg.log_frequency}") + + # Add metrics JSON file config (for baseline comparison / Claude agent optimization) + if "metrics_json_path" in cfg: + hydra_overrides.append(f'++logger.metrics_json_path="{cfg.metrics_json_path}"') + if "metrics_json_frequency" in cfg: + hydra_overrides.append(f"++logger.metrics_json_frequency={cfg.metrics_json_frequency}") + + # Add CP size if specified (use ++ to add key if not in base config) + if "cp_size" in cfg: + hydra_overrides.append(f"++cp_size={cfg.cp_size}") + + # Add validation settings if specified + if 
"validation_enabled" in cfg: + hydra_overrides.append(f"++validation.enabled={str(cfg.validation_enabled).lower()}") + if "validation_interval" in cfg: + hydra_overrides.append(f"++validation.eval_interval={cfg.validation_interval}") + + # Add use_sequence_packing if specified + if "use_sequence_packing" in cfg: + hydra_overrides.append(f"++use_sequence_packing={str(cfg.use_sequence_packing).lower()}") + + # Agent control plane overrides + agent_enabled = cfg.get("agent_enabled", False) + if agent_enabled: + agent_dir = cfg.get("agent_dir", "/data/agent") + hydra_overrides.extend( + [ + "++agent.enabled=true", + f'++agent.control_file="{agent_dir}/control.yaml"', + f'++agent.metrics_file="{agent_dir}/metrics.jsonl"', + f'++agent.journal_file="{agent_dir}/journal.jsonl"', + ] + ) + + # Add extra config_kwargs overrides (for attn_input_format, self_attn_mask_type, num_key_value_heads, etc.) + hydra_overrides.extend( + f"++config_kwargs.{key}={cfg[key]}" + for key in ["attn_input_format", "self_attn_mask_type", "num_key_value_heads"] + if key in cfg + ) + + # Add arbitrary extra hydra overrides (list of raw override strings) + if "extra_hydra_overrides" in cfg: + hydra_overrides.extend(str(override) for override in cfg.extra_hydra_overrides) + + # Add wandb config + hydra_overrides.extend( + [ + f'wandb.project="{cfg.wandb_project}"', + f'wandb.name="{cfg.wandb_name}"', + ] + ) + + # Format as bash command arguments (each arg on new line with backslash continuation) + hydra_args_formatted = " \\\n ".join(hydra_overrides) + + # Git branch checkout logic + git_branch = cfg.get("git_branch", "") + repo_root = cfg.get("repo_root", "/data/savithas/bionemo-framework") + + git_sync_script = "" + if git_branch: + git_sync_script = f""" +# Git sync to specified branch (only on master node to avoid race conditions) +if [ "$NODE_RANK" = "0" ]; then + echo "==========================================" + echo "[Rank 0] Syncing to branch: {git_branch}" + echo 
"==========================================" + cd {repo_root} + # Remove any stale lock files from previous failed git operations + find .git -name "*.lock" -delete 2>/dev/null || true + git fetch origin + git checkout {git_branch} + git pull origin {git_branch} + echo "Git sync complete!" + echo "Current commit: $(git rev-parse HEAD)" + # Create a marker file to signal other nodes + echo "$(git rev-parse HEAD)" > /tmp/git_sync_complete_marker + echo "==========================================" +else + echo "[Rank $NODE_RANK] Waiting for rank 0 to complete git sync..." + # Wait for rank 0 to finish (check for marker or just wait a bit) + sleep 30 + cd {repo_root} + echo "[Rank $NODE_RANK] Current commit: $(git rev-parse HEAD)" +fi +""" + + training_script = f"""#!/bin/bash +set -e + +echo "==========================================" +echo "OpenGenome2 7B Training" +echo "Nodes: {cfg.num_nodes}" +echo "GPUs per node: {cfg.gpus_per_node}" +echo "Total GPUs: {cfg.num_nodes * cfg.gpus_per_node}" +echo "Micro batch size: {cfg.micro_batch_size}" +echo "Grad acc steps: {cfg.grad_acc_steps}" +echo "Effective batch size: {cfg.micro_batch_size * cfg.num_nodes * cfg.gpus_per_node * cfg.grad_acc_steps}" +echo "FP8: {cfg.get("fp8_enabled", False)}" +echo "FP32 Master Weights: {cfg.get("use_fp32_master_weights", False)}" +echo "Steps: {cfg.num_train_steps:,}" +echo "==========================================" + +# Initialize Lepton environment +wget -O init.sh https://raw.githubusercontent.com/leptonai/scripts/main/lepton_env_to_pytorch.sh +chmod +x init.sh +source init.sh + +export MASTER_PORT=29400 +export NCCL_TIMEOUT_MS=1800000 +export NCCL_DEBUG=WARN +export HF_HOME=/data/savithas/cache +{git_sync_script} +cd {cfg.code_path} +pip install -r requirements.txt + +# Ensure checkpoint directory exists +mkdir -p {cfg.checkpoint_dir} +""" + # Add pre-training commands if specified (e.g., symlink checkpoint steps) + if "pre_training_commands" in cfg: + for cmd in 
cfg.pre_training_commands: + training_script += f"\n{cmd}\n" + # Ensure metrics JSON output directory exists if configured + metrics_json_path = cfg.get("metrics_json_path", "") + if metrics_json_path: + training_script += f""" +# Ensure metrics JSON output directory exists +mkdir -p $(dirname {metrics_json_path}) +""" + training_script += f""" + +python -c "from huggingface_hub import login; login(token='${{HF_TOKEN}}')" +wandb login ${{WANDB_API_KEY}} + +echo "==========================================" +echo "Starting multinode training..." +echo "==========================================" + +torchrun \\ + --nnodes=$NNODES \\ + --nproc_per_node={cfg.gpus_per_node} \\ + --node_rank=$NODE_RANK \\ + --master_addr=$MASTER_ADDR \\ + --master_port=$MASTER_PORT \\ + {cfg.train_script} \\ + --config-name={cfg.hydra_config} \\ + {hydra_args_formatted} + +echo "==========================================" +echo "Training completed!" +echo "==========================================" +""" + + command = ["bash", "-c", training_script] + + env_vars = [ + EnvVar(name="WANDB_API_KEY", value_from=EnvValue(secret_name_ref=cfg.wandb_secret)), + EnvVar(name="HF_TOKEN", value_from=EnvValue(secret_name_ref=cfg.hf_secret)), + ] + + # Use NFS settings from config, with defaults for H100 + nfs_source_path = cfg.get("nfs", {}).get("source_path", "/BioNeMo") + nfs_mount_path = cfg.get("nfs", {}).get("mount_path", "/data") + nfs_source = cfg.get("nfs", {}).get("nfs_source", "node-nfs:fs1") + + mounts = [ + { + "path": nfs_source_path, + "mount_path": nfs_mount_path, + "from": nfs_source, + }, + ] + + job_spec = LeptonJobUserSpec( + resource_shape=resource_shape, + affinity=LeptonResourceAffinity( + allowed_dedicated_node_groups=[chosen_group.metadata.id_], + allowed_nodes_in_node_group=valid_node_ids, + ), + container=LeptonContainer( + image=cfg.container.image, + command=command, + ), + completions=cfg.num_nodes, + parallelism=cfg.num_nodes, + envs=env_vars, + 
        image_pull_secrets=[cfg.container.registry_auth],
        mounts=mounts,
    )

    # The Lepton job id is set to cfg.job_name so the dashboard entry matches
    # the checkpoint / WandB naming used elsewhere in the config.
    job = LeptonJob(spec=job_spec, metadata=Metadata(id=cfg.job_name))

    try:
        launched_job = client.job.create(job)
        if launched_job.status:
            print(f" ✓ Job launched: {cfg.job_name}")
            # workspace_id only feeds the dashboard URL below; the default is a
            # hardcoded workspace — override with cfg.workspace_id if needed.
            workspace_id = cfg.get("workspace_id", "vfco61g2")
            print(
                f" View at: https://dashboard.dgxc-lepton.nvidia.com/workspace/{workspace_id}/compute/jobs/detail/{launched_job.metadata.id_}/replicas/list"
            )
            return True
    except Exception as e:
        print(f" ERROR submitting job {cfg.job_name}: {e}")
        return False
    # NOTE(review): if launched_job.status is falsy and no exception is raised,
    # control falls off the end and None (falsy) is returned — callers that
    # test `if not success` still treat this as a failure.


def launch_agent_sidecar(client, cfg: DictConfig):
    """Launch the agent daemon as a CPU-only sidecar job with NFS access.

    The agent monitors training metrics on shared NFS and can write control
    files for hot-reload interventions. It does not require GPUs.

    Args:
        client: Lepton ``APIClient`` used to submit the job.
        cfg: Resolved config; reads ``agent_dir``, ``code_path``,
            ``repo_root``, ``git_branch``, ``agent_intervene``,
            ``agent_resource_shape``, ``nfs.*``, ``container.*`` and
            ``job_name``.

    Returns:
        True when the sidecar job was submitted and reported a status,
        False when submission raised an exception.
    """
    # Reuse the training job's node-group/node selection so the sidecar lands
    # where the same NFS mount is available.
    chosen_group, valid_node_ids, _ = _resolve_scheduling_target(client, cfg)

    agent_dir = cfg.get("agent_dir", "/data/agent")
    code_path = cfg.get(
        "code_path", "/data/savithas/bionemo-framework/bionemo-recipes/recipes/opengenome2_llama_native_te"
    )
    repo_root = cfg.get("repo_root", "/data/savithas/bionemo-framework")
    git_branch = cfg.get("git_branch", "")
    # Unless explicitly allowed to intervene, run the daemon read-only.
    observe_only_flag = "" if cfg.get("agent_intervene", False) else "--observe-only"

    # Optional git sync snippet: only emitted when a branch is pinned in cfg.
    git_sync_agent = ""
    if git_branch:
        git_sync_agent = f"""
cd {repo_root}
find .git -name "*.lock" -delete 2>/dev/null || true
git fetch origin
git checkout {git_branch}
git pull origin {git_branch}
echo "Agent: git sync to {git_branch} complete ($(git rev-parse HEAD))"
"""

    # Bash payload executed inside the container; values are interpolated here
    # (launch time), not on the node.
    agent_script = f"""#!/bin/bash
set -e

echo "=========================================="
echo "Agent Daemon Sidecar"
echo "Monitoring: {agent_dir}/metrics.jsonl"
echo "Control: {agent_dir}/control.yaml"
echo "=========================================="

{git_sync_agent}

mkdir -p {agent_dir}
cd {code_path}
pip install pyyaml

echo "Starting agent daemon..."

python agent_daemon.py \\
    --metrics-file {agent_dir}/metrics.jsonl \\
    --control-file {agent_dir}/control.yaml \\
    --journal-file {agent_dir}/journal.jsonl \\
    --fp8-stats-dir {agent_dir}/fp8_stats \\
    --monitor-interval 30 \\
    {observe_only_flag}
"""

    command = ["bash", "-c", agent_script]

    # Same NFS wiring as the training job so both see the shared agent_dir.
    nfs_source_path = cfg.get("nfs", {}).get("source_path", "/BioNeMo")
    nfs_mount_path = cfg.get("nfs", {}).get("mount_path", "/data")
    nfs_source = cfg.get("nfs", {}).get("nfs_source", "node-nfs:fs1")

    mounts = [{"path": nfs_source_path, "mount_path": nfs_mount_path, "from": nfs_source}]

    # CPU-only shape: the daemon just polls files and writes YAML.
    agent_resource_shape = cfg.get("agent_resource_shape", "cpu.small")

    job_spec = LeptonJobUserSpec(
        resource_shape=agent_resource_shape,
        affinity=LeptonResourceAffinity(
            allowed_dedicated_node_groups=[chosen_group.metadata.id_],
        ),
        container=LeptonContainer(
            image=cfg.container.image,
            command=command,
        ),
        completions=1,
        parallelism=1,
        envs=[],
        image_pull_secrets=[cfg.container.registry_auth],
        mounts=mounts,
    )

    # Suffix keeps the sidecar visually paired with its training job.
    agent_job_name = f"{cfg.job_name}-agent"
    job = LeptonJob(spec=job_spec, metadata=Metadata(id=agent_job_name))

    try:
        launched_job = client.job.create(job)
        if launched_job.status:
            print(f" Agent sidecar launched: {agent_job_name}")
            return True
    except Exception as e:
        print(f" ERROR submitting agent sidecar {agent_job_name}: {e}")
        return False


@hydra.main(version_base=None, config_path="lepton_configs", config_name="og2_7b_thd_fp8_mixed")
def main(cfg: DictConfig):
    """Launch the OG2 training job (and, optionally, its agent sidecar).

    Exits with status 1 when the training job submission fails. A sidecar
    submission failure only prints a warning — training proceeds without it.
    """
    print("=" * 60)
    print(f"Launching OpenGenome2 job: {cfg.job_name}")
    print("=" * 60)

    # Show git branch if specified
    if cfg.get("git_branch"):
        print(f" Will checkout branch: {cfg.git_branch}")

    client = APIClient()
    # Resolve all interpolations up front so the job scripts embed concrete
    # values rather than ${...} references.
    OmegaConf.resolve(cfg)

    success = launch_single_job(client, cfg)
    if not success:
        print("\n Job submission failed!")
        exit(1)

    print("\n Training job submitted successfully!")

    # Launch agent sidecar if enabled
    if cfg.get("agent_enabled", False):
        print("\nLaunching agent sidecar...")
        if launch_agent_sidecar(client, cfg):
            print(" Agent sidecar submitted successfully!")
        else:
            print(" WARNING: Agent sidecar submission failed (training continues without agent)")


if __name__ == "__main__":
    main()
diff --git a/bionemo-recipes/recipes/opengenome2_llama_native_te/OG2_FP8_1NODE_DEMO_GUIDE.md b/bionemo-recipes/recipes/opengenome2_llama_native_te/OG2_FP8_1NODE_DEMO_GUIDE.md
new file mode 100644
index 0000000000..0f69b535d4
--- /dev/null
+++ b/bionemo-recipes/recipes/opengenome2_llama_native_te/OG2_FP8_1NODE_DEMO_GUIDE.md
@@ -0,0 +1,421 @@
# FP8 Precision Agent — OpenGenome2 7B (1-Node Demo)

## RUN VARIABLES

```
TOLERANCE_PCT = 3.0
BASELINE_LOGFILE = ./baseline_bf16_1node.json
NUM_TRAIN_STEPS = 6000
CHECKIN_INTERVAL = 100
LAYERS_PER_PROMOTION = 2
NUM_LAYERS = 32
INITIAL_PRECISION = bf16
PROMOTION_STRATEGY = gradual
WORKSPACE_ROOT = /data/savithas/agent_runs/demo_1node
CHECKPOINT_ROOT = /data/savithas/checkpoints

# Training script & model
TRAINING_SCRIPT = train_fsdp2.py
CONFIG_NAME = og2_7b_bf16_1k_from_5k
NPROC_PER_NODE = 8
MICRO_BATCH_SIZE = 1
GRAD_ACC_STEPS = 8
WANDB_PROJECT = llama3-metagenome-7b
RESULTS_FOLDER = /data/savithas/agent_runs/demo_1node/results
WARMUP_STEPS = 500
```

______________________________________________________________________

## Project Overview

You are an autonomous agent managing a pretraining loop for an OpenGenome2 7B
Transformer model (Llama 3.1-8B architecture via BioNeMo/TransformerEngine).
The model resumes from a BF16 checkpoint at step 5000. Your objective is to
**maximize the number of layers running in FP8** while keeping pretraining
accuracy within tolerance of a BF16 baseline. You control a per-layer precision
schedule and monitor training metrics at regular intervals. Training runs from
step 5001 to step 6000 (1000 steps total).
+ +### Architecture Summary + +``` +┌─────────────────────────────────────────────────────────┐ +│ Milestone 2: Model Persistence & Recovery │ +│ ┌───────────────────────────────────────────────────┐ │ +│ │ Milestone 1: Pretraining Agent Loop │ │ +│ │ │ │ +│ │ Agent → Change Layer Precision → Pretrain │ │ +│ │ ↑ ↓ │ │ +│ │ ←── Checkin (Training) ←── Baseline BF16 │ │ +│ └───────────────────────────────────────────────────┘ │ +│ │ +│ Checkin (Training) ── Save ──→ Models (disk) │ +│ Models ── Reload LKG (on failure) ──→ Agent │ +└─────────────────────────────────────────────────────────┘ +``` + +### Model Details + +- **Architecture**: Llama 3.1-8B (7B params) +- **Layers**: 32 transformer block layers (1-32, 1-indexed) +- **Precision**: BF16 compute, FP32 master weights +- **Dataset**: OpenGenome2 metagenomes (JSON files on NFS) +- **GBS**: 64 (mbs=1 x grad_acc=8 x 8 GPUs) +- **Checkpoint**: Resumes from BF16 step 5000 at + `/data/savithas/checkpoints/og2-7b-fp32mw-orig-ws-w1-b50k` + +______________________________________________________________________ + +## Metrics + +### Accuracy Metric: Loss (lower is better) + +At every check-in, the agent compares current loss against the BF16 baseline +loss at the same step using a relative tolerance. A check-in **passes** if: +`current_loss - baseline_loss <= baseline_loss * (TOLERANCE_PCT / 100)`. + +### Performance Metric: Unpadded Tokens Per Second (higher is better) + +Measures actual training throughput. Throughput changes do NOT trigger +rollbacks — they are logged for informational purposes only. + +______________________________________________________________________ + +## BF16 Warmup Phase + +Since we resume from a BF16 checkpoint at step 5000, the model needs a warmup +period. The warmup phase covers steps 5001-5500 (the first 500 steps after +resume). During this phase all 32 layers run in BF16. The agent does NOT +perform check-ins or promotions during the warmup phase. 
+ +**Timeline:** + +- Steps 5001-5500: BF16 warmup (all layers BF16). `fp8_config.enabled=False`. + No agent intervention. +- Step 5500: Warmup ends. Training continues uninterrupted — agent begins + monitoring at step 5600 (first check-in). +- Steps 5500-6000: Active phase. Agent monitors check-ins and adjusts layers. + +______________________________________________________________________ + +## Base Training Command + +Every launch uses this exact template. Only the three AGENT CONTROLS fields +change between launches — everything else is hardcoded. + +```bash +cd /data/savithas/bionemo-framework/bionemo-recipes/recipes/opengenome2_llama_native_te + +torchrun --nproc_per_node=8 train_fsdp2.py \ + --config-name og2_7b_bf16_1k_from_5k \ + dataset.micro_batch_size=1 \ + dataset.buffer_size=10000 \ + dataset.num_workers=8 \ + dataset.use_stateful_dataloader=true \ + num_train_steps=6000 \ + grad_acc_steps=8 \ + checkpoint.ckpt_dir=/data/savithas/checkpoints/ \ + checkpoint.save_every_n_steps=100 \ + checkpoint.resume_from_checkpoint=true \ + checkpoint.max_checkpoints=4 \ + checkpoint.save_final_model=true \ + checkpoint.async_save=false \ + logger.frequency=1 \ + fp8_config.enabled= \ + fp8_config.fp8_recipe=transformer_engine.common.recipe.Float8BlockScaling \ + fp8_config.fp8_format=E4M3 \ + fp8_layers='' \ + wandb.project=llama3-metagenome-7b \ + wandb.name= \ + +wandb.id= \ + +wandb.resume=allow \ + hydra.run.dir=/data/savithas/agent_runs/demo_1node//hydra_outputs +``` + +**AGENT CONTROLS (change between launches):** + +- `fp8_config.enabled` — `false` during BF16 warmup; `true` once any layers + are in FP8 +- `fp8_layers` — `'[]'` during warmup; e.g. 
`'[16,17]'` after first expansion

**FIXED (never change between launches):**

- `dataset.micro_batch_size=1` — always 1
- `dataset.buffer_size=10000` — always 10k
- `dataset.num_workers=8` — always 8
- `dataset.use_stateful_dataloader=true` — always true (see Data Integrity section below)
- `num_train_steps=6000` — always 6000
- `grad_acc_steps=8` — always 8 (GBS = 1 × 8 × 8 GPUs = 64)
- `checkpoint.ckpt_dir` — same directory for entire session
- `checkpoint.save_every_n_steps=100` — matches CHECKIN_INTERVAL
- `checkpoint.resume_from_checkpoint=true` — always true
- `checkpoint.async_save=false` — sync saves for reliability
- `wandb.name` — computed once at session start, never changes
- `+wandb.id` — same as `wandb.name` (WandB resumes by ID, not name)
- `+wandb.resume=allow` — resumes the same WandB run on relaunch
- `wandb.project=llama3-metagenome-7b` — fixed

______________________________________________________________________

## IMPORTANT: Initial Checkpoint Setup

The agent resumes from a pre-existing BF16 checkpoint. Before the first launch:

1. Create a unique checkpoint directory for this agent session:
   ```bash
   mkdir -p $CHECKPOINT_ROOT/<run_name>/train_fsdp2
   ```
2. Symlink the external 5k checkpoint:
   ```bash
   ln -s /data/savithas/checkpoints/og2-7b-fp32mw-orig-ws-w1-b50k/train_fsdp2/step_5000 \
     $CHECKPOINT_ROOT/<run_name>/train_fsdp2/step_5000
   ```
3. Verify:
   ```bash
   ls $CHECKPOINT_ROOT/<run_name>/train_fsdp2/step_5000/.metadata
   ```

The `<run_name>` must be unique. Use format: `gradual_fp8_<timestamp>`.

______________________________________________________________________

## Layer Precision Schedule

The agent maintains a per-layer precision map for 32 transformer block layers
(1-32, 1-indexed).

During warmup (steps 5001-5500): all BF16.
After warmup (step 5500+): the agent uses the `gradual` strategy.

### Strategy: gradual

Post-warmup starting state: `fp8_layers=[]` (all BF16).
+ +**On pass** — add `LAYERS_PER_PROMOTION` layers to FP8, expanding from the +center outward: + +| Round | Layers Added to FP8 | FP8 Layers | FP8 Count | +| ----- | ------------------- | ---------- | --------- | +| Start | (none) | (none) | 0 | +| 1 | 16, 17 | 16-17 | 2 | +| 2 | 15, 18 | 15-18 | 4 | +| 3 | 14, 19 | 14-19 | 6 | +| 4 | 13, 20 | 13-20 | 8 | +| 5 | 12, 21 | 12-21 | 10 | +| 6 | 11, 22 | 11-22 | 12 | +| 7 | 10, 23 | 10-23 | 14 | +| 8 | 9, 24 | 9-24 | 16 | +| 9 | 8, 25 | 8-25 | 18 | +| 10 | 7, 26 | 7-26 | 20 | +| 11 | 6, 27 | 6-27 | 22 | +| 12 | 5, 28 | 5-28 | 24 | +| 13 | 4, 29 | 4-29 | 26 | +| 14 | 3, 30 | 3-30 | 28 | +| 15 | 2, 31 | 2-31 | 30 | +| 16 | 1, 32 | 1-32 | 32 | + +**On fail** — remove the most recently added FP8 layers (demote back to BF16), +roll back to LKG checkpoint, and relaunch. + +**Rationale**: Middle layers are the most tolerant to quantization. Edge layers +(closest to embedding input and output projection) are most sensitive. Expanding +from the center outward tests the safest layers first. + +______________________________________________________________________ + +## Check-ins + +Check-ins begin at step 5600 (first `CHECKIN_INTERVAL` after warmup ends at +5500). + +Every `CHECKIN_INTERVAL` steps: + +1. Record current loss from `wandb-history.jsonl` +2. Look up baseline loss for this step from `BASELINE_LOGFILE` +3. Compute tolerance: `allowed_delta = baseline_loss * (TOLERANCE_PCT / 100)` +4. **Pass** if: `current_loss - baseline_loss <= allowed_delta` +5. Log throughput (unpadded_tokens_per_second_per_gpu) +6. **Pass** → kill training, expand FP8 (add next center-outward pair), relaunch +7. **Fail** → kill training, demote last-added layers, rollback to LKG, relaunch + +**IMPORTANT**: `CHECKIN_INTERVAL` must align with baseline logfile step +intervals. If the baseline has no entry for a check-in step, fail hard and +report the error. 
+ +### Metric Retrieval (WandB Local Files) + +The agent monitors WandB's local log files: + +``` +/wandb/latest-run/files/wandb-history.jsonl +``` + +Each line is a JSON object: + +```json +{"train/global_step": 5600, "train/loss": 1.19, "train/unpadded_tokens_per_second_per_gpu": 22000, ...} +``` + +**Agent monitoring loop:** + +1. Launch training as a background process. +2. Poll `wandb-history.jsonl` periodically (every 30 seconds). +3. When `train/global_step` matches a check-in step, parse the metrics. +4. Compare loss against baseline (see Check-ins above). +5. Pass/fail → take appropriate action. + +If multiple check-in steps appear between polls, process in order (lowest step +first). Kill on the FIRST failure. + +### WandB Sync Before Killing + +Before killing training, sync wandb data: + +```bash +wandb sync /wandb/latest-run/ +``` + +______________________________________________________________________ + +## Checkpointing & Resume + +Checkpoints saved at `/train_fsdp2/step_/`. The training script +automatically finds the latest checkpoint and resumes. + +**What gets restored:** model weights, optimizer state, LR scheduler, step +counter, epoch counter, dataloader position (`use_stateful_dataloader=true`). + +All state — including where the dataloader left off in the dataset — is +restored from checkpoint. + +**Key behavior:** + +- `num_train_steps` is an absolute target (6000), not relative. +- Resuming at step 5400 with `num_train_steps=6000` trains steps 5401-6000. +- `checkpoint.resume_from_checkpoint=true` always — auto-finds latest checkpoint. + +**CRITICAL — Data Integrity on Relaunch:** + +The agent kills and relaunches training at every expansion (pass) and every +rollback (fail). With `dataset.use_stateful_dataloader=true`, the dataloader +position is saved in each checkpoint and restored on resume, so the model sees +each training batch exactly once — the same as a continuous baseline run. 
If
`use_stateful_dataloader` is NOT enabled, every relaunch resets the dataloader
to the start of the dataset. This causes the model to re-train on early batches
multiple times, artificially lowering training loss (overfitting to repeated
data) and invalidating comparisons against the BF16 baseline. The agent MUST
verify that `dataset.use_stateful_dataloader=true` is set in the training
command.

**CRITICAL — Checkpoint Safety Rules:**

- NEVER delete the checkpoint directory itself (`$CHECKPOINT_ROOT/<run_name>/`).
  Only delete individual `step_<N>/` subdirectories inside it.
- NEVER use `rm -rf` on any parent directory. Only `rm -rf` specific
  `step_<N>/` subdirectories that are newer than the LKG.
- Before deleting, always list the contents of the checkpoint directory first to
  confirm which checkpoints exist and which is the LKG.
- When in doubt, do NOT delete — relaunch and let the training script skip the
  corrupt checkpoint automatically.

### Recovery on Failed Check-in

1. Kill the current training process.
2. Delete any `step_<N>/` checkpoint newer than the LKG.
3. Remove the most recently added FP8 layers (demote to BF16).
4. Relaunch with updated precision schedule.

The agent discards all progress since the last successful check-in.

______________________________________________________________________

## Agent Workspace & Artifacts

All output under: `$WORKSPACE_ROOT/<run_name>/`

```
$CHECKPOINT_ROOT/<run_name>/   # model checkpoints
  train_fsdp2/step_<N>/

$WORKSPACE_ROOT/<run_name>/
  logs/          # training stdout/stderr
  configs/       # CLI invocations per segment
  graphs/        # plots (perplexity/throughput vs baseline)
  history.json   # structured log of all decisions
  state.json     # agent state for crash recovery
  report.md      # human-readable summary
```

### history.json

```json
[
  {
    "step": 5600,
    "current_loss": 1.19,
    "baseline_loss": 1.18,
    "diff": 0.01,
    "allowed_delta": 0.035,
    "passed": true,
    "action": "expand_fp8",
    "added_layers": [16, 17],
    "fp8_layers": [16, 17],
    "throughput": 22000.0,
    "timestamp": "2026-03-19T10:00:00"
  }
]
```

### state.json

```json
{
  "current_step": 5600,
  "lkg_step": 5600,
  "expansion_round": 1,
  "fp8_layers": [16, 17],
  "run_name": "gradual_fp8_20260319_100000",
  "warmup_complete": true
}
```

### report.md

Update after every check-in and at end of training. Include:

- Run metadata (model, layers, steps, tolerance, start time)
- Final precision schedule (which layers in FP8 vs BF16)
- Summary of all check-in results (table: step, baseline loss, current loss,
  pass/fail, action)
- Throughput comparison vs BF16 baseline
- Conclusion: how many layers ended in FP8, accuracy vs baseline

______________________________________________________________________

## WandB Run Naming & Resume

`wandb.name` and `+wandb.id` are both set to `<run_name>` (e.g.
`gradual_fp8_20260319_100000`). These are computed ONCE at session start and
NEVER change. WandB resumes by **run ID** (not name), so `+wandb.id` is
required. Combined with `+wandb.resume=allow`, every relaunch appends to the
same WandB run, producing a single continuous curve in the dashboard.
+ +This means all training segments (warmup, expansions, rollbacks) appear as one +continuous line in WandB, making it easy to see the full training trajectory +including any loss spikes from precision changes. + +______________________________________________________________________ + +## Key Constraints + +1. The agent ONLY controls behavior via CLI arguments — do NOT modify training + scripts or config files. +2. All artifacts go under `$WORKSPACE_ROOT//`. Checkpoints go under + `$CHECKPOINT_ROOT//`. +3. The agent stops after reaching step 6000. +4. NEVER delete the checkpoint directory itself — only individual `step_/` + subdirectories. +5. Before deleting any checkpoint, always list its contents first. +6. If all 32 layers are successfully expanded to FP8, continue training in full + FP8 for the remaining steps. diff --git a/bionemo-recipes/recipes/opengenome2_llama_native_te/OG2_FP8_AGENT_GUIDE.md b/bionemo-recipes/recipes/opengenome2_llama_native_te/OG2_FP8_AGENT_GUIDE.md new file mode 100644 index 0000000000..eedd8bb216 --- /dev/null +++ b/bionemo-recipes/recipes/opengenome2_llama_native_te/OG2_FP8_AGENT_GUIDE.md @@ -0,0 +1,744 @@ +# OpenGenome2 7B FP8 Precision Agent — Milestones 1 & 2 + +## Run Variables + +These can be edited by the user before each run. + +``` +TOLERANCE_PCT = 5.0 # max allowed perplexity difference as % of baseline (e.g. 5.0 = 5%) +BASELINE_LOGFILE = ./baseline_bf16.json # BF16 baseline (co-located with this guide) +NUM_TRAIN_STEPS = ??? 
# total training steps; agent stops here +CHECKIN_INTERVAL = 100 # steps between check-ins (must align with baseline logfile steps) +LAYERS_PER_PROMOTION = 2 # layers demoted to BF16 per failed check-in +NUM_LAYERS = 32 # transformer block layers (OG2-7B has layers 1-32) +INITIAL_PRECISION = fp8 # starting precision for all transformer block layers +PROMOTION_STRATEGY = ends_in # "ends_in", "tail_in", or "research_guided" (see Promotion Strategies below) +WORKSPACE_ROOT = /data/savithas/agent_runs # root for all agent output (NFS) +CHECKPOINT_ROOT = /data/savithas/checkpoints # root for model checkpoints (NFS) +RESULTS_FOLDER = /data/savithas/agent_runs/results/$PROMOTION_STRATEGY # final reports, scoped by strategy + +# Training script & model +TRAINING_SCRIPT = train_fsdp2.py +CONFIG_NAME = og2_7b_thd_gqa_fp8 # Hydra config name (inherits og2_7b_thd_gqa) +NPROC_PER_NODE = 8 # GPUs per node +NNODES = 6 # Number of Lepton nodes +TOKENIZER = ./tokenizers/nucleotide_fast_tokenizer +MICRO_BATCH_SIZE = 1 # per-GPU micro batch size +GRAD_ACC_STEPS = 8 # gradient accumulation steps +DATASET_PATH = /data/opengenome2/json/pretraining_or_both_phases/metagenomes/data_metagenomics_train_*.jsonl.gz +WANDB_PROJECT = opengenome2-7b # WandB project name for all runs +LAUNCH_DIR = ??? # NFS directory for multi-node launch coordination (set by prompt) +``` + +______________________________________________________________________ + +## Project Overview + +We are building an agent that manages a pretraining loop for an OpenGenome2 7B Llama3 model using TransformerEngine with FP8 Block Scaling. The agent's objective is to **maximize the number of layers running in FP8** while keeping pretraining accuracy within tolerance of a BF16 baseline. The agent does this by controlling a per-layer precision schedule and monitoring training metrics at regular intervals. Training runs for a fixed number of steps (`--num_train_steps=$NUM_TRAIN_STEPS`). 
+ +### Key Differences from ESM2/NVFP4 Guide + +| Aspect | ESM2 NVFP4 Guide | OG2 FP8 Guide | +| ----------------- | ------------------------------- | ------------------------------------------------------------------------------------ | +| Model | ESM2-3B (36 layers) | OG2 Llama3-7B (32 layers) | +| Precision levels | NVFP4 → MXFP8 → BF16 (3 levels) | **FP8 → BF16 (2 levels)** | +| FP8 recipe | MXFP8BlockScaling | **Float8BlockScaling** | +| FP4 support | Yes (NVFP4BlockScaling) | **No** — FP4 is not used | +| Infrastructure | Single node | **Lepton multi-node** (6+ nodes) | +| Model features | Standard | Spike-No-More init, Megatron scaled init, weight decay grouping, FP32 master weights | +| Sequence handling | Packing | THD sequence packing with genomic masking | + +### Reference Materials + +All reference materials are co-located with this guide: + +- **Strategy documents**: [OG2_STRATEGY_ENDS_IN.md](OG2_STRATEGY_ENDS_IN.md), [OG2_STRATEGY_TAIL_IN.md](OG2_STRATEGY_TAIL_IN.md) — full pseudocode, demotion tables, and worked examples for each strategy +- **BF16 baseline**: [baseline_bf16.json](baseline_bf16.json) — extracted from WandB run `8mfsb27t` via [extract_baseline_metrics.py](extract_baseline_metrics.py) +- **Quant stats config**: [fp8_debugging_stats.yaml](fp8_debugging_stats.yaml) — TE debug feature config for `research_guided` strategy +- **Research paper**: [references/NVIDIA-Nemotron-3-Super-Technical-Report.pdf](references/NVIDIA-Nemotron-3-Super-Technical-Report.pdf) — Nemotron-3 Super Technical Report describing the low-precision training agent approach this guide is adapted from + +______________________________________________________________________ + +## Architecture Summary + +``` +┌─────────────────────────────────────────────────────────┐ +│ Milestone 2: Model Persistence & Recovery │ +│ ┌───────────────────────────────────────────────────┐ │ +│ │ Milestone 1: Pretraining Agent Loop │ │ +│ │ │ │ +│ │ Agent → Change Layer Precision → 
Pretrain │ │ +│ │ ↑ ↓ │ │ +│ │ ←── Checkin (Training) ←── Baseline BF16 │ │ +│ └───────────────────────────────────────────────────┘ │ +│ │ +│ Checkin (Training) ── Save ──→ Models (disk) │ +│ Models ── Reload LKG (on failure) ──→ Agent │ +└─────────────────────────────────────────────────────────┘ +``` + +______________________________________________________________________ + +## Metrics + +### Accuracy Metric: Perplexity (lower is better) + +Perplexity is the primary accuracy metric: `perplexity = exp(loss)`. At every check-in, the agent compares current perplexity against the BF16 baseline perplexity at the same step using a relative tolerance. A check-in passes if: `current_perplexity - baseline_perplexity <= baseline_perplexity * (TOLERANCE_PCT / 100)`. This means the tolerance adapts to the scale of the perplexity — it is more lenient early in training when perplexity is high and volatile, and tighter later when perplexity has settled. Only perplexity failures trigger rollbacks. + +**Important**: The OG2 training script logs `loss` to stdout and WandB but does NOT log perplexity directly. The agent must compute perplexity from the loss value: `perplexity = exp(loss)`. + +### Performance Metric: Unpadded Tokens Per Second (higher is better) + +Measures actual training throughput excluding padding tokens. The agent should log this at every check-in to confirm FP8 is yielding a throughput benefit. Throughput drops do NOT trigger rollbacks — they are logged for informational purposes only. + +### Metric Retrieval (WandB Local Files) + +The agent launches training as a background process with `num_train_steps=$NUM_TRAIN_STEPS` (the full target). Training runs continuously — the agent does NOT stop/restart at each check-in. Instead, it monitors WandB's local log files to read per-step metrics in near-real-time. + +WandB writes run data to a local directory during training. 
The path is printed to stdout at launch: + +``` +wandb: Run data is saved locally in /wandb/run-- +``` + +A symlink `/wandb/latest-run` always points to the active run. + +The per-step metrics file is: + +``` +/wandb/latest-run/files/wandb-history.jsonl +``` + +Each line is a JSON object with all metrics logged at that step: + +```json +{"train/global_step": 100, "train/loss": 2.72, "train/unpadded_tokens_per_second_per_gpu": 10035.6, ...} +``` + +**Agent monitoring loop:** + +1. Launch training as a background process. +2. Poll `wandb-history.jsonl` periodically (e.g. every 30 seconds). +3. When a new line appears where `train/global_step` matches a check-in step (`$CHECKIN_INTERVAL` multiple), parse the metrics. Key WandB fields: + - `train/global_step` — step number + - `train/loss` — compute `perplexity = exp(loss)` (compare against baseline `"perplexity"`) + - `train/unpadded_tokens_per_second_per_gpu` — throughput per GPU +4. Compare perplexity against baseline (see Check-ins below). +5. Pass → do nothing, let training continue. +6. Fail → kill the training process **IMMEDIATELY**, then perform LKG recovery (see Milestone 2), and relaunch with updated precision schedule. + +If multiple new check-in steps appear between polls, process them in order (lowest step first). Kill on the **FIRST** failure — do not continue evaluating later steps. This ensures the LKG checkpoint hasn't been auto-deleted by `max_checkpoints` before the agent acts. + +Since the agent `cd`'s into `$(dirname $TRAINING_SCRIPT)` before launching, the wandb directory is relative to that working directory. 
+ +### Stdout Metric Format + +The OG2 `perf_logger.py` also outputs metrics at every `logger.frequency` steps to stdout in this format: + +``` +loss: 2.94, learning_rate: 3e-05, grad_norm: 1.23, step_time: 0.456, tokens_per_second_per_gpu: 4.85e+04, unpadded_tokens_per_second_per_gpu: 4.5e+04, total_unpadded_tokens_per_batch: 12345, gpu_memory_allocated_max_gb: 65.3, gpu_memory_allocated_mean_gb: 65.3, global_step: 100 +``` + +Key fields to parse: + +- `loss` — compute `perplexity = exp(loss)` +- `unpadded_tokens_per_second_per_gpu` — throughput metric +- `global_step` — current training step + +______________________________________________________________________ + +## CLI Reference + +### Base Training Command (OG2-7B on Lepton) + +The agent constructs each training launch from this template. First `cd` into the training script directory, then run torchrun. Fields marked `← AGENT CONTROLS` are modified between launches; everything else stays fixed. + +```bash +cd $(dirname $TRAINING_SCRIPT) +torchrun \ + --nproc_per_node=$NPROC_PER_NODE \ + --nnodes=$NNODES \ + --node_rank=$NODE_RANK \ + --master_addr=$MASTER_ADDR \ + --master_port=$MASTER_PORT \ + $(basename $TRAINING_SCRIPT) \ + --config-name $CONFIG_NAME \ + num_train_steps=$NUM_TRAIN_STEPS \ + grad_acc_steps=$GRAD_ACC_STEPS \ # ← FIXED (do NOT change) + fp8_config.enabled=True \ + fp8_config.fp8_recipe=transformer_engine.common.recipe.Float8BlockScaling \ + fp8_config.fp8_format=E4M3 \ + fp8_layers='[2,3,4,...,31]' \ # ← AGENT CONTROLS + quant_stats_config.enabled= \ # ← AGENT CONTROLS (see below) + quant_stats_config.quant_stats_file=./fp8_debugging_stats.yaml \ + quant_stats_config.quant_log_dir=$WORKSPACE_ROOT//quant_stats \ + checkpoint.ckpt_dir=$CHECKPOINT_ROOT/ \ # ← FIXED (same dir for entire session) + checkpoint.save_every_n_steps=$CHECKIN_INTERVAL \ + checkpoint.resume_from_checkpoint=true \ # ← FIXED (always true; auto-finds latest checkpoint) + checkpoint.max_checkpoints=4 \ + 
checkpoint.save_final_model=true \ + checkpoint.async_save=false \ # ← FIXED (sync saves for reliability) + dataset.use_stateful_dataloader=true \ # ← FIXED (CRITICAL: preserves dataloader position across relaunches) + validation.enabled=true \ # ← FIXED + validation.eval_interval=1000 \ # ← FIXED (every 1000 steps) + validation.num_batches=40 \ # ← FIXED + hydra.run.dir=$WORKSPACE_ROOT//hydra_outputs \ + wandb.project=$WANDB_PROJECT \ # ← FIXED + wandb.name= \ # ← FIXED (same name for entire session — produces one continuous WandB curve) + +wandb.id= \ # ← FIXED (WandB resumes by ID, not name — must match across relaunches) + +wandb.resume=allow # ← FIXED (resumes the same WandB run on relaunch) +``` + +**CRITICAL**: The agent must use this command template EXACTLY. Do NOT add, remove, or modify any parameters not marked `← AGENT CONTROLS`. In particular: + +- Do NOT change `grad_acc_steps` — it is set in the Hydra config and must stay at $GRAD_ACC_STEPS +- Do NOT add `dataset.*`, `adamw_kwargs.*`, or `lr_scheduler_kwargs.*` overrides — these are already set in the Hydra config +- Do NOT scale any parameter by the number of nodes or GPUs +- `logger.frequency` is set in the Hydra config (every step) — do NOT override it + +**Notes on OG2 vs ESM2 differences:** + +- **No `fp4_config` or `fp4_layers`** — FP4 is not used for OG2 FP8 Block Scaling +- **Only `fp8_layers`** controls which layers run in FP8; layers absent from the list default to BF16 +- The `og2_7b_thd_gqa_fp8` config already sets: `use_sequence_packing=true`, `use_fp32_master_weights=true`, `spike_no_more_embedding_init=true`, `use_megatron_scaled_init=true`, `dataset.*`, `adamw_kwargs.*`, `lr_scheduler_kwargs.*`, `logger.frequency=1`, `grad_acc_steps=8` +- **Do NOT override** any Hydra config values that are not in the template above — the config is carefully tuned +- Multi-node Lepton: `MASTER_ADDR`/`MASTER_PORT` are provided by the Lepton environment + +The agent modifies these fields between 
launches: + +- `fp8_layers` — updated based on the current precision schedule +- `quant_stats_config.enabled` — `true` for `research_guided` only; `false` for `ends_in` and `tail_in` + +These fields are FIXED for the entire session (never change between launches): + +- `grad_acc_steps` — always `$GRAD_ACC_STEPS` (do NOT scale by nodes/GPUs — FSDP handles distributed scaling) +- `checkpoint.ckpt_dir` — always `$CHECKPOINT_ROOT/` (**NOT** `$WORKSPACE_ROOT/...`). CHECKPOINT_ROOT and WORKSPACE_ROOT are different directories. Same directory for the entire session; matches Lepton job name and wandb run name. +- `num_train_steps` — always `$NUM_TRAIN_STEPS` (absolute target) +- `checkpoint.resume_from_checkpoint` — always `true` (the script auto-finds the latest checkpoint; on first launch with no checkpoints it starts fresh automatically) +- `dataset.use_stateful_dataloader` — always `true` (see Data Integrity section below) +- `wandb.name` — always `` (computed once at session start, never changes) +- `+wandb.id` — always `` (WandB resumes by ID, not name — must match across relaunches) +- `+wandb.resume` — always `allow` (resumes the same WandB run on relaunch) + +### Multi-Node Launch Protocol + +Training runs on `$NNODES` nodes. This agent runs on rank 0 only. Worker nodes (ranks 1 through NNODES-1) poll a shared NFS directory for barrier-based round files and execute the same torchrun command. + +**IMPORTANT: `$TORCHRUN_PREFIX` is pre-configured in your environment.** It contains the correct `torchrun --nproc_per_node=... --nnodes=... --node_rank=... --master_addr=... --master_port=...` flags for THIS node. You MUST use `$TORCHRUN_PREFIX` for ALL training launches. Do NOT construct your own torchrun command — the prefix has the correct multi-node flags that you cannot easily determine yourself. + +**Before EVERY `torchrun` launch, you MUST:** + +1. Increment your round counter (start from 1 for the first launch in this session). +2. 
Write a `round_N_args.env` file to `$LAUNCH_DIR/` containing a `TRAIN_CMD` variable with all Python script arguments (NOT the torchrun prefix — workers prepend that themselves): + ```bash + TRAIN_CMD="train_fsdp2.py \ + --config-name og2_7b_thd_gqa_fp8 \ + fp8_layers='[3,4,...,30]' \ + ... all other args ..." + ``` +3. Touch `$LAUNCH_DIR/round_N_ready` to signal workers to start. +4. Then run `$TORCHRUN_PREFIX $TRAIN_CMD` on rank 0 (using the SAME TRAIN_CMD you wrote to the file). + +Example (first launch): + +```bash +# Step 1: Write training args for workers (use single-quoted heredoc to preserve $variables) +cat > $LAUNCH_DIR/round_1_args.env << 'ARGS_EOF' +TRAIN_CMD="train_fsdp2.py \ + --config-name og2_7b_thd_gqa_fp8 \ + fp8_layers='[3,4,...,30]' \ + num_train_steps=182300 \ + ... all other args ..." +ARGS_EOF + +# Step 2: Signal workers to start (MUST come AFTER writing args) +touch $LAUNCH_DIR/round_1_ready + +# Step 3: Run torchrun on rank 0 using $TORCHRUN_PREFIX (NEVER construct your own torchrun flags) +cd /data/savithas/bionemo-framework/bionemo-recipes/recipes/opengenome2_llama_native_te +$TORCHRUN_PREFIX train_fsdp2.py --config-name og2_7b_thd_gqa_fp8 \ + fp8_layers='[3,4,...,30]' \ + num_train_steps=182300 \ + ... all other args ... +``` + +**When killing training:** Kill the torchrun process on rank 0. Workers detect the disconnection (NCCL timeout, up to 30 minutes) and their processes exit automatically. Workers then poll for the next round's ready file. After killing: + +1. Write the next round's `round_N_args.env` immediately. +2. Wait at least 2 minutes for workers to finish dying and reach the polling loop. +3. Touch `round_N_ready` to signal workers. +4. Run `$TORCHRUN_PREFIX $TRAIN_CMD` on rank 0. C10D rendezvous waits for all nodes (retry logic handles any stragglers). + +**To signal completion:** `touch $LAUNCH_DIR/done` — all workers exit cleanly. + +**CRITICAL rules:** + +- **ALWAYS use `$TORCHRUN_PREFIX`** to launch training. 
NEVER write your own `torchrun --nproc_per_node=... --nnodes=...` command. The prefix already contains the correct multi-node flags (nnodes, node_rank, master_addr, master_port). If you construct your own torchrun command, you WILL get single-node training (8 GPUs instead of 48). +- **Use single-quoted heredoc** (`<< 'ARGS_EOF'`) when writing `round_N_args.env`. This preserves `$NODE_RANK`, `$MASTER_ADDR`, and `$MASTER_PORT` as literal variables. Each worker node has these set to its own values by the Lepton environment. If you expand them, all workers will think they are rank 0. +- Write `round_N_args.env` BEFORE touching `round_N_ready`. Workers source the args file immediately after detecting the ready file. +- The `TRAIN_CMD` must NOT include `torchrun` or its flags — only the Python script and its Hydra arguments. Workers prepend the torchrun prefix themselves (with their own `$NODE_RANK`, `$MASTER_ADDR`, etc.). +- When killing training: just kill the torchrun process on rank 0. Workers detect the disconnection (NCCL timeout) and exit automatically. Workers then block on the next round's ready file. +- Track the round counter in `state.json` so you can resume correctly after a crash. + +### Post-Launch Verification (MANDATORY) + +After the **first** torchrun launch in this session, verify the training setup is correct before proceeding. Check the stdout/WandB output for: + +1. **GPU count**: Must show `GPU count: {NPROC_PER_NODE * NNODES}` (e.g., 48 for 6 nodes × 8 GPUs). If it shows only `NPROC_PER_NODE` (e.g., 8), multi-node is broken — kill immediately and debug. +2. **grad_acc_steps**: Must be `$GRAD_ACC_STEPS` (e.g., 8). If it shows any other value, kill immediately and fix. +3. **Effective batch size**: Should be `$MICRO_BATCH_SIZE × $NPROC_PER_NODE × $NNODES × $GRAD_ACC_STEPS` (e.g., 1 × 8 × 6 × 8 = 384). +4. **Resume step**: For warm-start, must show `Starting training loop from step `. 
+ +If ANY of these are wrong, kill training immediately, diagnose the issue, fix it, and relaunch. Do NOT let incorrect training continue — it wastes GPU time and produces unusable results. + +### Validation as Downstream Signal + +Validation is enabled in the training command (`validation.enabled=true`). The training script automatically runs validation every 1000 steps and logs `val/loss` and `val/ppl` to WandB. This provides a downstream-like signal: the FP8 paper (Nemotron-3 Super) notes that training loss can diverge slightly under low-precision without hurting downstream task quality. + +**How the agent uses validation metrics:** + +- At each check-in, also read `val/loss` and `val/ppl` from `wandb-history.jsonl` (if a validation step has occurred since the last check-in). +- Log validation metrics to `history.json` alongside training metrics. +- Validation metrics are **informational only** — they do NOT trigger rollbacks. Only training perplexity triggers rollbacks. +- In `report.md`, include a comparison of validation perplexity between this FP8 run and the BF16 baseline. This helps determine if FP8 precision loss affects downstream quality. +- If validation fails with an error (e.g., data loading issue), the training script already handles this with try/except — training continues uninterrupted. The agent should log the failure but NOT take any action. + +### Layer Precision Control + +Precision is controlled via a single list: + +- `fp8_layers=[2,3,...,31]` — layers running in FP8 Block Scaling +- Layers NOT in `fp8_layers` default to BF16 + +Embedding and lm_head are always in BF16 (not addressable via `fp8_layers`). The agent only manages transformer block layer indices 1-32 (1-indexed). A layer cannot appear in `fp8_layers` and also be in BF16 — its presence or absence in the list is the control. 
+
+When using FP8, the recipe config must also be passed:
+
+```
+fp8_config.enabled=True
+fp8_config.fp8_recipe=transformer_engine.common.recipe.Float8BlockScaling
+fp8_config.fp8_format=E4M3
+fp8_layers=[...]
+```
+
+### WandB Run Naming & Resume
+
+`wandb.name` and `+wandb.id` are both set to `<run_name>` (e.g. `ends_in_20260317_143000`). These are computed ONCE at session start and NEVER change. WandB resumes by **run ID** (not name), so `+wandb.id` is required. Combined with `+wandb.resume=allow`, every relaunch appends to the same WandB run, producing a single continuous curve in the dashboard.
+
+This means all training segments (warmup, expansions, rollbacks) appear as one continuous line in WandB, making it easy to see the full training trajectory including any loss spikes from precision changes.
+
+### Checkpointing & Resume
+
+The training script has built-in checkpoint support. To enable:
+
+```
+checkpoint.ckpt_dir=$CHECKPOINT_ROOT/<run_name>
+checkpoint.save_every_n_steps=$CHECKIN_INTERVAL
+checkpoint.resume_from_checkpoint=true
+```
+
+To resume after a stop or crash, re-run the exact same command. The script automatically finds the latest `step_*` checkpoint in `checkpoint.ckpt_dir`, restores state, and resumes from step + 1.
+
+**What gets restored on resume:**
+
+- Model weights, optimizer state (AdamW moments), LR scheduler, step counter, epoch counter, dataloader position (`use_stateful_dataloader=true`)
+
+All state — including where the dataloader left off in the dataset — is restored from checkpoint.
+
+**Key behavior:**
+
+- `num_train_steps` is an absolute target, not relative. Resuming at step 400 with `num_train_steps=$NUM_TRAIN_STEPS` trains steps 401–`$NUM_TRAIN_STEPS`.
+- `checkpoint.resume_from_checkpoint=false` forces a fresh start even if checkpoints exist.
+- Checkpoints are saved as `<ckpt_dir>/train_fsdp2/step_<step>/`. 
+
+Additional checkpoint flags:
+
+```
+checkpoint.max_checkpoints=4 # keep 4 most recent (buffer so LKG isn't auto-deleted before agent can act)
+checkpoint.save_final_model=true # save .safetensors at end of training
+```
+
+LKG recovery: the agent deletes any checkpoint newer than the LKG from `$CHECKPOINT_ROOT/<run_name>/train_fsdp2/`. Since `checkpoint.resume_from_checkpoint=true` always, the training script auto-finds the latest remaining checkpoint (the LKG) and resumes from there. The `checkpoint.ckpt_dir` never changes.
+
+**CRITICAL — Data Integrity on Relaunch:**
+
+The agent kills and relaunches training at every expansion (pass) and every rollback (fail). With `dataset.use_stateful_dataloader=true`, the dataloader position is saved in each checkpoint and restored on resume, so the model sees each training batch exactly once — the same as a continuous baseline run. If `use_stateful_dataloader` is NOT enabled, every relaunch resets the dataloader to the start of the dataset. This causes the model to re-train on early batches multiple times, artificially lowering training loss (overfitting to repeated data) and invalidating comparisons against the BF16 baseline. The agent MUST verify that `dataset.use_stateful_dataloader=true` is set in the training command.
+
+**CRITICAL — Checkpoint Safety Rules:**
+
+- NEVER delete the checkpoint directory itself (`$CHECKPOINT_ROOT/<run_name>/`). Only delete individual `step_<N>/` subdirectories inside it.
+- NEVER delete the run directory (`$WORKSPACE_ROOT/<run_name>/`). It contains checkpoints, logs, history, and configs that are irreplaceable.
+- NEVER use `rm -rf` on any parent directory. Only `rm -rf` specific `step_<N>/` subdirectories that are newer than the LKG.
+- If a checkpoint appears corrupt or incomplete, delete ONLY that specific `step_<N>/` subdirectory — not the entire checkpoints folder.
+- Before deleting, always list the contents of the checkpoint directory first to confirm which checkpoints exist and which is the LKG. 
+- When in doubt, do NOT delete — relaunch and let the training script skip the corrupt checkpoint automatically. + +### Quantization Stats Logging + +TransformerEngine provides per-layer quantization statistics (underflow %, scale_inv_min, scale_inv_max, MSE). These are **only used by the `research_guided` strategy**. + +When enabled (`research_guided` only), these flags apply: + +``` +quant_stats_config.enabled=true +quant_stats_config.quant_stats_file=./fp8_debugging_stats.yaml +quant_stats_config.quant_log_dir=$WORKSPACE_ROOT//quant_stats +``` + +Stats are logged to `/rank_*/nvdlfw_inspect_statistics_logs/`. Logging frequency is controlled via the `freq` parameter in the stats YAML config. Available stats for FP8 Block Scaling: + +- `underflows%` — percentage of non-zero elements rounded to zero after quantization +- `scale_inv_min` / `scale_inv_max` — range of inverse scaling factors across blocks +- `mse` — mean squared error between quantized and original tensor + +These are collected per layer, per tensor type (activation, gradient, weight). + +For `ends_in` and `tail_in`: set `quant_stats_config.enabled=false`. These strategies use deterministic demotion orders and do not need runtime quant signals. + +Reference implementation: `quantization.py` in this recipe directory. + +______________________________________________________________________ + +## Precision Levels + +For OG2 FP8 Block Scaling, there are only **two precision levels**: + +``` +FP8 (Float8BlockScaling E4M3) → BF16 +``` + +Demotion is always FP8 → BF16. There is no intermediate level. + +______________________________________________________________________ + +## Promotion Strategies + +The agent uses `PROMOTION_STRATEGY` to decide which layers to **demote from FP8 to BF16** when a check-in fails. 
Three strategies are available: + +### Strategy 1: `ends_in` (default) + +Demote `LAYERS_PER_PROMOTION` layers per failed check-in, working inward from both ends simultaneously: + +``` +Failure 1: demote layers 1, 32 (outermost pair) +Failure 2: demote layers 2, 31 +Failure 3: demote layers 3, 30 +Failure 4: demote layers 4, 29 +...continuing inward... +Last: layers 16, 17 (very center, last to be demoted) +``` + +**Rationale**: Edge layers (closest to embedding input and projection output) are more sensitive to quantization error. Middle layers are most tolerant. Never demote from the middle outward. + +See [OG2_STRATEGY_ENDS_IN.md](OG2_STRATEGY_ENDS_IN.md) for full pseudocode, demotion table, and worked example. + +### Strategy 2: `tail_in` + +Demote `LAYERS_PER_PROMOTION` layers per failed check-in, starting from the last layer and working toward the first. The agent decides how many total layers to demote — there is no fixed cap. + +``` +Failure 1: demote layers 32, 31 (last 2 layers) +Failure 2: demote layers 30, 29 (next 2 toward head) +Failure 3: demote layers 28, 27 +...continuing toward layer 1... +``` + +**Rationale**: The final layers of a transformer (closest to the output projection) are typically most sensitive to quantization error. Demoting from the tail inward addresses the most sensitive layers first. + +See [OG2_STRATEGY_TAIL_IN.md](OG2_STRATEGY_TAIL_IN.md) for full pseudocode, demotion table, and worked example. + +### Strategy 3: `research_guided` + +This is the **only strategy that enables quant stats logging**. Set `quant_stats_config.enabled=true` in the training command. + +Use the quant stats (underflow %, scale_inv_min, scale_inv_max, MSE) to collect per-layer sensitivity signals at runtime. The agent should: + +1. Run an initial segment (e.g. first `CHECKIN_INTERVAL` steps) with all layers in FP8 and quant stats enabled. +2. Collect quant stats and identify which layers show the highest underflow/MSE. +3. 
When a check-in fails, demote the layers with the worst quant stats first (rather than following a fixed geometric pattern).
+4. Document in `report.md` how the demotion order was derived from the runtime quant stats, and how it differs from `ends_in` / `tail_in`.
+
+This strategy is exploratory. The agent has freedom to define the demotion order based on runtime quant stats, but must still respect the check-in / rollback loop and log all decisions to `history.json` and `report.md`.
+
+______________________________________________________________________
+
+## Check-ins
+
+Every `CHECKIN_INTERVAL` training steps, the agent performs a check-in:
+
+1. **Record** the current perplexity: `current_perplexity = exp(loss)`.
+2. **Look up** the BF16 baseline perplexity for this step from `BASELINE_LOGFILE`.
+3. **Compute tolerance** for this step: `allowed_delta = baseline_perplexity * (TOLERANCE_PCT / 100)`.
+4. **Pass** if: `current_perplexity - baseline_perplexity <= allowed_delta`.
+5. **Log** current `unpadded_tokens_per_second_per_gpu` for performance tracking.
+6. **Pass** → do nothing, let training continue.
+7. **Fail** → kill the training process, update precision schedule (demote `LAYERS_PER_PROMOTION` layers using `PROMOTION_STRATEGY`), and trigger recovery (Milestone 2).
+
+**IMPORTANT**: `CHECKIN_INTERVAL` must align with the baseline logfile step intervals (both are 100 steps). If the agent attempts a check-in at a step that does not exist in the baseline logfile, it must immediately stop and report the error to the user (e.g. "Baseline logfile has no entry for step_<N>. Check that CHECKIN_INTERVAL aligns with the baseline step intervals."). Do not interpolate or skip — fail hard.
+
+### Configuration
+
+All tunable values are defined in the "Run Variables" block at the top of this document. The agent must read those values at startup. 
The two metrics are fixed: + +- **Accuracy metric**: perplexity (lower is better) +- **Performance metric**: unpadded_tokens_per_sec (higher is better) + +______________________________________________________________________ + +## Baseline Reference Logfile + +Before any agent-managed training begins, a full BF16 baseline run must be completed. The baseline logfile contains per-step metrics at 100-step intervals: + +```json +{ + "step_100": {"perplexity": 6.04, "loss": 1.80, "unpadded_tokens_per_sec": 9013}, + "step_200": {"perplexity": 4.16, "loss": 1.42, "unpadded_tokens_per_sec": 10047}, + "step_300": {"perplexity": 4.09, "loss": 1.41, "unpadded_tokens_per_sec": 10075}, + ... +} +``` + +The agent loads this file at startup. At every check-in it looks up the baseline perplexity for the current step and logs both baseline and current values. The logfile path is provided via config — do not generate baseline values within the agent loop. + +______________________________________________________________________ + +## Milestone 1: Pretraining Agent Loop + +### Agent Objective + +Run as many layers as possible in FP8 while keeping perplexity within tolerance of the BF16 baseline. Start with all 32 transformer block layers in `$INITIAL_PRECISION`. Demote layers to BF16 only when check-ins fail. Stop after `$NUM_TRAIN_STEPS` steps. + +### Layer Precision Schedule + +The agent maintains a per-layer precision map for transformer block layers only: + +```python +# OG2-7B: 32 transformer block layers (1-32, 1-indexed) +layer_precision = {i: "fp8" for i in range(1, 33)} +``` + +This map translates directly to the `fp8_layers` CLI argument: + +```python +fp8_layers = sorted(k for k, v in layer_precision.items() if v == "fp8") +# Layers not in fp8_layers default to BF16 +``` + +### Precision Levels & Demotion Mechanics + +Demotion is always FP8 → BF16. When a check-in fails, the agent removes the selected layers from `fp8_layers` (layers absent from the list default to BF16). 
Example: `fp8_layers=[1,2,3,...,32]` becomes `fp8_layers=[2,3,...,31]` after demoting layers 1 and 32. + +______________________________________________________________________ + +## Milestone 2: Model Persistence & Recovery + +### Saving Checkpoints + +The agent uses the built-in checkpoint system (see "Checkpointing & Resume" above). Set `checkpoint.save_every_n_steps` to match `$CHECKIN_INTERVAL` so that a checkpoint exists at every check-in step. The most recent checkpoint that passed a check-in is the "last known good" (LKG). + +The agent must also persist alongside each checkpoint: + +- The current `fp8_layers` list (the layer precision schedule) + +(Model weights, optimizer state, LR scheduler, step counter, and RNG states are handled by the built-in checkpoint system. Each relaunch resumes the same WandB run via `+wandb.id=` and `+wandb.resume=allow`.) + +### Recovery on Failed Check-in + +1. Kill the current training process. +2. Delete any checkpoint newer than the LKG from the checkpoint directory (e.g. if LKG is `step_400` and `step_500` exists, delete `step_500`). This ensures the script resumes from the LKG on relaunch. +3. Demote `LAYERS_PER_PROMOTION` layers using `PROMOTION_STRATEGY`. Update `fp8_layers` accordingly. +4. Relaunch training with the updated precision schedule. Do NOT change `num_train_steps` — keep it at `$NUM_TRAIN_STEPS`. The checkpoint stores the step counter; the script automatically loads the LKG, reads the step from it, and resumes toward the same target. `checkpoint.ckpt_dir` stays the same — the script auto-finds the latest remaining checkpoint (the LKG). + +The agent discards all training progress since the last successful check-in. The assumption is that divergence started after the LKG point and the updated schedule will prevent it from recurring. 
+ +### Recovery Flow Example + +``` +Check-in fails at step 200 (current_ppl - baseline_ppl > allowed_delta) +→ Kill training +→ Delete step_200 checkpoint +→ Demote layers 1, 32: FP8 → BF16 (ends_in, first demotion) +→ Update fp8_layers to exclude [1, 32] +→ Relaunch (script auto-resumes from step_100) +→ Next check-in at step 200 (re-do this interval) + +Check-in fails again at step 200 +→ Kill training +→ Delete step_200 checkpoint +→ Demote layers 2, 31: FP8 → BF16 (second demotion) +→ Update fp8_layers to exclude [1, 2, 31, 32] +→ Relaunch from step_100 +→ ... +``` + +______________________________________________________________________ + +## Milestone 3: Code Intervention Mode (Optional) + +Beyond adjusting the layer precision schedule, the agent can optionally be granted permission to **edit the training and model code itself** to fix precision/casting issues it discovers during training. This mode is opt-in — by default the agent must ONLY control behavior via CLI arguments. + +### When to Intervene + +The agent should consider code intervention when: + +- **NaN/Inf loss detected** — likely a casting or autocast bug +- **Loss spikes that don't correlate with precision schedule changes** — may indicate a code-level issue +- **Quant stats show anomalous patterns** across all layers (not just specific ones) — suggests a systematic bug +- **Training crashes** with CUDA errors, dtype mismatches, or TE assertion failures + +### What the Agent Can Edit + +Files the agent is allowed to modify: + +- `opengenome_modeling_llama_te.py` — autocast contexts, `get_layer_autocast()`, `set_recipes()` +- `train_fsdp2.py` — recipe creation, FSDP wrapping, precision schedule application +- `quantization.py` — layer precision resolution logic + +### Example Interventions + +1. **Fix autocast nesting**: If the agent detects that FP8 layers have a double-nested autocast (outer FP8 + inner FP8), it can fix `get_layer_autocast()` to return `nullcontext()` for FP8 layers. + +2. 
**Fix recipe serialization**: If recipes are lost after FSDP wrapping, the agent can add/fix the `set_recipes()` call. + +3. **Fix dtype casting**: If embeddings or the lm_head are accidentally running in FP8, the agent can verify and fix the `te.autocast(enabled=False)` wrappers. + +4. **Fix loss computation precision**: If loss is computed in FP8 (causing NaN), the agent can ensure the loss function runs in BF16/FP32. + +### Guardrails + +- The agent must **log every code change** to `history.json` with a diff and rationale. +- Code changes must be **committed to a branch** (not just edited in place) so they can be reviewed. +- The agent should **never modify the dataset, tokenizer, or optimizer code** — only precision-related code paths. +- After any code edit, the agent must **restart training from the LKG checkpoint** (not continue from current state). + +______________________________________________________________________ + +## Agent Workspace & Artifacts + +All agent output must be saved under: `$WORKSPACE_ROOT//` + +The agent creates `` ONCE at startup using the format: `_` +This value is computed once and stored — it does NOT change across relaunches within the same session. It is used for the checkpoint directory name, workspace directory, WandB group name, Lepton job name, and results folder — all the same value for easy cross-referencing. +Examples: + +- `ends_in_20260317_143000` +- `tail_in_20260317_160000` +- `research_guided_20260318_091500` + +The directory layout: + +``` +$CHECKPOINT_ROOT// # model checkpoints (set checkpoint.ckpt_dir here) + train_fsdp2/step_/ # auto-created by the training script + +$WORKSPACE_ROOT// + logs/ # training stdout/stderr logs from each launch + quant_stats/ # quantization stats output (research_guided only) + configs/ # copy of every config/CLI invocation used per segment + graphs/ # any plots the agent generates (perplexity vs baseline, throughput over time, etc.) 
+ history.json # structured log of all agent decisions: + # - every check-in result (step, baseline ppl, current ppl, pass/fail) + # - every demotion event (which layers, reason) + # - every recovery event (rolled back to which step, new schedule) + # - throughput at each check-in + state.json # agent state for crash recovery + report.md # human-readable summary maintained by the agent (see below) +``` + +### history.json + +```json +[ + { + "step": 100, + "current_loss": 1.82, + "baseline_loss": 1.80, + "current_ppl": 6.15, + "baseline_ppl": 6.04, + "diff": 0.11, + "allowed_delta": 0.302, + "passed": true, + "action": "continue", + "fp8_layers": [1,2,3,...,32], + "throughput": 62000.0, + "timestamp": "2026-03-17T14:35:00" + }, + { + "step": 200, + "current_loss": 1.49, + "baseline_loss": 1.42, + "current_ppl": 4.45, + "baseline_ppl": 4.16, + "diff": 0.29, + "allowed_delta": 0.208, + "passed": false, + "action": "demote_layers", + "demoted": [1, 32], + "fp8_layers": [2,3,...,31], + "rollback_to_step": 100, + "throughput": 61500.0, + "timestamp": "2026-03-17T15:10:00" + } +] +``` + +### state.json (crash recovery) + +```json +{ + "current_step": 200, + "lkg_step": 100, + "promotion_round": 1, + "layer_precision": {"1": "bf16", "2": "fp8", ..., "31": "fp8", "32": "bf16"}, + "fp8_layers": [2,3,4,...,31], + "run_name": "ends_in_20260317_143000" +} +``` + +### report.md + +The agent must maintain a `report.md` in the run directory. Update it after every check-in and at the end of training. It should contain: + +- Run metadata: model name, num_layers, num_train_steps, tolerance, start time +- Final precision schedule: which layers ended in FP8 vs BF16 +- Summary of all check-in results (table or list: step, baseline ppl, current ppl, pass/fail) +- Summary of all demotions and rollbacks +- Throughput comparison: average unpadded tokens/sec across the run vs. 
the BF16 baseline average +- Any observations from quant stats (research_guided only: layers with high underflow/overflow) +- Conclusion: how many layers stayed in FP8, overall accuracy vs baseline, throughput gain/loss + +### Final Report + +At the end of your session (or when training completes), you MUST produce a full markdown-based final report. Create a folder inside `$RESULTS_FOLDER` with the strategy name and date, and save all deliverables there: + +``` +$RESULTS_FOLDER/_/ + report.md # the final polished report + graphs/ # copies of all graphs referenced in the report + history.json # copy of the run's history.json for reference +``` + +The folder name must be unique (strategy + timestamp) because these experiments will be run multiple times and results must not overwrite each other. + +The final report should be a polished, self-contained document that includes everything from the run's `report.md` plus: + +- A high-level executive summary (2-3 paragraphs) suitable for sharing with the team +- Graphs and visualizations (saved to graphs/ and referenced in the report), including at minimum: + - **Perplexity over training steps**: baseline BF16 perplexity vs. this run's perplexity plotted on the same axes, with the tolerance band shaded. Mark any check-in failure points and demotion events on the plot. + - **Throughput over training steps**: unpadded tokens/sec for this run vs. the BF16 baseline. +- Comparison against the BF16 baseline with concrete numbers +- Recommendations for next steps (e.g. try a different strategy, adjust tolerance, change layers_per_promotion) +- Any lessons learned or surprising findings during the run + +The run-level `report.md` (inside the run directory) is a living document updated during training. The final report in `$RESULTS_FOLDER` is the cleaned-up deliverable produced at the end. + +______________________________________________________________________ + +## Key Constraints + +1. 
The agent is an **outer loop** around the training script. It configures precision, launches training, evaluates, and decides what to do next. Unless Milestone 3 (Code Intervention Mode) is explicitly enabled, the agent must ONLY control behavior via CLI arguments — do NOT modify the training script, config files, or any other source code. +2. All config (tolerance_pct, interval, baseline logfile path, layers_per_promotion) must be **passed via config** — not hardcoded. +3. All artifacts (checkpoints, logs, configs, graphs, history, report) go under `$WORKSPACE_ROOT//`. Final deliverables go under `$RESULTS_FOLDER/_/`. +4. WandB logging is desirable: training metrics should be published to WandB when available, but WandB failures should not block the agent loop. +5. The agent stops after reaching `$NUM_TRAIN_STEPS`. If all layers have been demoted to BF16 before that point, continue training in full BF16 for the remaining steps. +6. **Lepton-specific**: The agent must handle multi-node torchrun setup, NFS paths for checkpoints/data, and Lepton environment variables for distributed training. diff --git a/bionemo-recipes/recipes/opengenome2_llama_native_te/OG2_STRATEGY_ENDS_IN.md b/bionemo-recipes/recipes/opengenome2_llama_native_te/OG2_STRATEGY_ENDS_IN.md new file mode 100644 index 0000000000..16a75f9686 --- /dev/null +++ b/bionemo-recipes/recipes/opengenome2_llama_native_te/OG2_STRATEGY_ENDS_IN.md @@ -0,0 +1,289 @@ +# Strategy: `ends_in` — Demote from Both Ends Inward + +## Overview + +Demote transformer block layers from FP8 to BF16 starting at both the outermost positions +(layer 1 and layer 32) and working inward toward the middle. Each failed check-in demotes +`LAYERS_PER_PROMOTION` layers (default 2 — one from each end). + +**Rationale**: Edge layers sit closest to the embedding input (layer 1) and the output +projection/lm_head (layer 32). 
These positions are most sensitive to quantization error +because: + +- Layer 1 receives freshly embedded token representations that have not been refined by + deeper layers — small rounding errors propagate through the entire stack. +- Layer 32 feeds directly into the language modeling head — quantization noise here + has the most direct impact on loss. + +Middle layers are the most tolerant because they receive and produce already-compressed +representations and their errors are attenuated by subsequent layers. + +______________________________________________________________________ + +## Run Variables + +``` +TOLERANCE_PCT = 5.0 +CHECKIN_INTERVAL = 100 +LAYERS_PER_PROMOTION = 2 # layers demoted per failed check-in (1 from each end) +NUM_LAYERS = 32 # OG2-7B transformer block layers (1-indexed: 1..32) +``` + +______________________________________________________________________ + +## Demotion Order + +Each failure round demotes 2 layers: 1 from the bottom and 1 from the top. + +| Round | Layers Demoted to BF16 | Remaining FP8 Layers | FP8 Count | +| ----- | ---------------------- | -------------------- | --------- | +| Start | (none) | 1-32 | 32 | +| 1 | 1, 32 | 2-31 | 30 | +| 2 | 2, 31 | 3-30 | 28 | +| 3 | 3, 30 | 4-29 | 26 | +| 4 | 4, 29 | 5-28 | 24 | +| 5 | 5, 28 | 6-27 | 22 | +| 6 | 6, 27 | 7-26 | 20 | +| 7 | 7, 26 | 8-25 | 18 | +| 8 | 8, 25 | 9-24 | 16 | +| 9 | 9, 24 | 10-23 | 14 | +| 10 | 10, 23 | 11-22 | 12 | +| 11 | 11, 22 | 12-21 | 10 | +| 12 | 12, 21 | 13-20 | 8 | +| 13 | 13, 20 | 14-19 | 6 | +| 14 | 14, 19 | 15-18 | 4 | +| 15 | 15, 18 | 16-17 | 2 | +| 16 | 16, 17 | (none) | 0 | + +After round 16, all layers are BF16. Training continues in full BF16 for remaining steps. 
+ +______________________________________________________________________ + +## Pseudocode + +```python +# --- Initialization --- + +layer_precision = {i: "fp8" for i in range(1, NUM_LAYERS + 1)} +bottom_ptr = 1 # next layer to demote from bottom +top_ptr = NUM_LAYERS # next layer to demote from top +lkg_step = 0 # last known good checkpoint step +promotion_round = 0 +history = [] + +# --- Helpers --- + + +def get_fp8_layers(): + """Return sorted list of 1-indexed layers currently in FP8.""" + return sorted(k for k, v in layer_precision.items() if v == "fp8") + + +def demote_next_batch(): + """Demote LAYERS_PER_PROMOTION layers using ends_in pattern.""" + global bottom_ptr, top_ptr, promotion_round + demoted = [] + pairs_to_demote = LAYERS_PER_PROMOTION // 2 # 2 layers = 1 pair + for _ in range(pairs_to_demote): + if bottom_ptr > top_ptr: + break + # Demote from bottom + if layer_precision[bottom_ptr] == "fp8": + layer_precision[bottom_ptr] = "bf16" + demoted.append(bottom_ptr) + bottom_ptr += 1 + # Demote from top + if bottom_ptr <= top_ptr and layer_precision[top_ptr] == "fp8": + layer_precision[top_ptr] = "bf16" + demoted.append(top_ptr) + top_ptr -= 1 + promotion_round += 1 + return demoted + + +# --- Main Agent Loop --- + +current_step = 0 + +while current_step < NUM_TRAIN_STEPS: + fp8_layers = get_fp8_layers() + resume = current_step > 0 + + # Build the torchrun command with current fp8_layers + cmd = build_torchrun_command( + fp8_layers=fp8_layers, + num_train_steps=NUM_TRAIN_STEPS, + resume_from_checkpoint=resume, + ckpt_dir=f"{WORKSPACE_ROOT}/{run_name}/checkpoints", + ) + + # Launch training and monitor + launch_training(cmd) + + # Wait for next check-in step + next_checkin = current_step + CHECKIN_INTERVAL + metrics = wait_for_step(next_checkin) + + # --- Check-in evaluation --- + # Baseline lookup — fail hard if step is missing + baseline_key = f"step_{next_checkin}" + if baseline_key not in baseline: + raise RuntimeError( + f"Baseline logfile missing 
entry for {baseline_key}. " + f"CHECKIN_INTERVAL={CHECKIN_INTERVAL} must align with baseline logfile steps." + ) + + import math + + current_ppl = math.exp(metrics["loss"]) + baseline_ppl = baseline[baseline_key]["perplexity"] + allowed_delta = baseline_ppl * (TOLERANCE_PCT / 100) + diff = current_ppl - baseline_ppl + passed = diff <= allowed_delta + + entry = { + "step": next_checkin, + "baseline_ppl": baseline_ppl, + "current_ppl": current_ppl, + "diff": diff, + "allowed_delta": allowed_delta, + "passed": passed, + "fp8_layers": fp8_layers, + "throughput": metrics["unpadded_tokens_per_second_per_gpu"], + } + + if passed: + # --- PASS --- + entry["action"] = "continue" + lkg_step = next_checkin + current_step = next_checkin + history.append(entry) + save_state() + update_report() + continue + + # --- FAIL --- + stop_training() + + if not get_fp8_layers(): + # All layers already BF16 — nothing more to demote + entry["action"] = "continue_bf16_exhausted" + lkg_step = next_checkin + current_step = next_checkin + history.append(entry) + save_state() + update_report() + continue + + demoted = demote_next_batch() + entry["action"] = "demote_layers" + entry["demoted"] = demoted + entry["rollback_to_step"] = lkg_step + history.append(entry) + + # Rollback: delete checkpoints after LKG, resume from LKG + delete_checkpoints_after(lkg_step) + current_step = lkg_step + save_state() + update_report() + +# --- Training Complete --- +generate_final_report() +``` + +______________________________________________________________________ + +## Check-in Decision Tree + +``` +[Check-in at step N] + | + v + baseline_key = f"step_{N}" + baseline_key in baseline? + / \ + YES NO → FATAL: misaligned CHECKIN_INTERVAL + | + v + current_ppl = exp(loss) + baseline_ppl = baseline[step_N] + allowed_delta = baseline_ppl * (TOLERANCE_PCT / 100) + | + diff = current_ppl - baseline_ppl + | + v + diff <= allowed_delta? 
               /        \
            YES          NO
             |            |
             v            v
           PASS          FAIL
          LKG = N          |
          continue    Any FP8 layers left?
                        /        \
                     YES          NO
                      |            |
                      v            v
               demote ends_in   continue (all BF16)
               rollback to LKG  LKG = N
               resume from LKG
```

______________________________________________________________________

## Worked Example (32-layer OG2-7B)

### Setup

- `TOLERANCE_PCT = 5.0`
- `CHECKIN_INTERVAL = 100`
- `LAYERS_PER_PROMOTION = 2`

### Timeline

**Step 100 — Check-in 1**

- Baseline ppl: 6.04
- Current ppl: 6.15
- Allowed delta: 6.04 * 0.05 = 0.302
- Diff: 0.11 < 0.302 → **PASS**
- FP8 layers: [1..32] (all 32)

**Step 200 — Check-in 2**

- Baseline ppl: 4.16
- Current ppl: 4.45
- Allowed delta: 4.16 * 0.05 = 0.208
- Diff: 0.29 > 0.208 → **FAIL**
- Demote: layers 1, 32
- Rollback to step 100 checkpoint
- FP8 layers: [2..31] (30 layers)

**Step 200 (retry) — Check-in 2b**

- Baseline ppl: 4.16
- Current ppl: 4.30
- Allowed delta: 0.208
- Diff: 0.14 < 0.208 → **PASS**
- FP8 layers: [2..31] (30 layers)

**Step 300 — Check-in 3**

- Baseline ppl: 4.09
- Current ppl: 4.20
- Allowed delta: 4.09 * 0.05 = 0.205
- Diff: 0.11 < 0.205 → **PASS**
- FP8 layers: [2..31] (30 layers, stable)

Training proceeds with 30 of 32 layers in FP8 — 93.75% quantized coverage.

______________________________________________________________________

## Notes

- Each round demotes exactly `LAYERS_PER_PROMOTION` layers (default 2).
- `LAYERS_PER_PROMOTION` is assumed even: the pseudocode demotes `LAYERS_PER_PROMOTION // 2`
  pairs (one layer from each end), so an odd value rounds down to whole pairs.
- The pointers (`bottom_ptr`, `top_ptr`) never move backward — demotions are permanent.
- After round 16, `bottom_ptr > top_ptr` and `get_fp8_layers()` returns `[]`.
- The agent always resumes from the LKG checkpoint, never from the failed step.
- If the baseline logfile is missing an entry for the check-in step, the agent must **fail hard** and tell the user to fix `CHECKIN_INTERVAL` alignment.
diff --git a/bionemo-recipes/recipes/opengenome2_llama_native_te/OG2_STRATEGY_TAIL_IN.md b/bionemo-recipes/recipes/opengenome2_llama_native_te/OG2_STRATEGY_TAIL_IN.md new file mode 100644 index 0000000000..8815bbc633 --- /dev/null +++ b/bionemo-recipes/recipes/opengenome2_llama_native_te/OG2_STRATEGY_TAIL_IN.md @@ -0,0 +1,315 @@ +# Strategy: `tail_in` — Demote from Tail Toward Head + +## Overview + +Demote transformer block layers from FP8 to BF16 starting at the output end (layer 32) +and working backward toward layer 1. Each failed check-in demotes `LAYERS_PER_PROMOTION` +layers (default 2). There is no fixed cap — the agent decides how many total layers to +demote based on check-in results. + +**Rationale**: The final layers of a transformer (closest to the output projection / lm_head) +are typically most sensitive to quantization error. Quantization noise in these layers passes +through fewer corrective transformations before reaching the output, making them the primary +source of perplexity degradation. Demoting from the tail inward addresses the most sensitive +layers first. 
+ +______________________________________________________________________ + +## Run Variables + +``` +TOLERANCE_PCT = 5.0 +CHECKIN_INTERVAL = 100 +LAYERS_PER_PROMOTION = 2 # layers demoted per failed check-in +NUM_LAYERS = 32 # OG2-7B transformer block layers (1-indexed: 1..32) +``` + +______________________________________________________________________ + +## Demotion Order + +Layers are demoted from the output end inward, in batches of `LAYERS_PER_PROMOTION`: + +| Round | Layers Demoted to BF16 | Remaining FP8 Layers | FP8 Count | +| ----- | ---------------------- | -------------------- | --------- | +| Start | (none) | 1-32 | 32 | +| 1 | 32, 31 | 1-30 | 30 | +| 2 | 30, 29 | 1-28 | 28 | +| 3 | 28, 27 | 1-26 | 26 | +| 4 | 26, 25 | 1-24 | 24 | +| 5 | 24, 23 | 1-22 | 22 | +| 6 | 22, 21 | 1-20 | 20 | +| 7 | 20, 19 | 1-18 | 18 | +| 8 | 18, 17 | 1-16 | 16 | +| 9 | 16, 15 | 1-14 | 14 | +| 10 | 14, 13 | 1-12 | 12 | +| 11 | 12, 11 | 1-10 | 10 | +| 12 | 10, 9 | 1-8 | 8 | +| 13 | 8, 7 | 1-6 | 6 | +| 14 | 6, 5 | 1-4 | 4 | +| 15 | 4, 3 | 1-2 | 2 | +| 16 | 2, 1 | (none) | 0 | + +After round 16, all layers are BF16. Training continues in full BF16 for remaining steps. + +The agent is not required to demote all the way to BF16. It demotes only when check-ins fail, +and stops demoting when check-ins pass. Most runs will stabilize well before reaching round 16. 
+ +______________________________________________________________________ + +## Pseudocode + +```python +# --- Initialization --- + +layer_precision = {i: "fp8" for i in range(1, NUM_LAYERS + 1)} +tail_ptr = NUM_LAYERS # next layer to demote (starts at 32) +lkg_step = 0 +promotion_round = 0 +history = [] + +# --- Helpers --- + + +def get_fp8_layers(): + """Return sorted list of 1-indexed layers currently in FP8.""" + return sorted(k for k, v in layer_precision.items() if v == "fp8") + + +def demote_next_batch(): + """Demote LAYERS_PER_PROMOTION layers from the tail.""" + global tail_ptr, promotion_round + demoted = [] + for _ in range(LAYERS_PER_PROMOTION): + if tail_ptr < 1: + break + if layer_precision[tail_ptr] == "fp8": + layer_precision[tail_ptr] = "bf16" + demoted.append(tail_ptr) + tail_ptr -= 1 + promotion_round += 1 + return demoted + + +# --- Main Agent Loop --- + +current_step = 0 + +while current_step < NUM_TRAIN_STEPS: + fp8_layers = get_fp8_layers() + resume = current_step > 0 + + cmd = build_torchrun_command( + fp8_layers=fp8_layers, + num_train_steps=NUM_TRAIN_STEPS, + resume_from_checkpoint=resume, + ckpt_dir=f"{WORKSPACE_ROOT}/{run_name}/checkpoints", + ) + + launch_training(cmd) + + next_checkin = current_step + CHECKIN_INTERVAL + metrics = wait_for_step(next_checkin) + + # --- Check-in evaluation --- + # Baseline lookup — fail hard if step is missing + baseline_key = f"step_{next_checkin}" + if baseline_key not in baseline: + raise RuntimeError( + f"Baseline logfile missing entry for {baseline_key}. " + f"CHECKIN_INTERVAL={CHECKIN_INTERVAL} must align with baseline logfile steps." 
+ ) + + import math + + current_ppl = math.exp(metrics["loss"]) + baseline_ppl = baseline[baseline_key]["perplexity"] + allowed_delta = baseline_ppl * (TOLERANCE_PCT / 100) + diff = current_ppl - baseline_ppl + passed = diff <= allowed_delta + + entry = { + "step": next_checkin, + "baseline_ppl": baseline_ppl, + "current_ppl": current_ppl, + "diff": diff, + "allowed_delta": allowed_delta, + "passed": passed, + "fp8_layers": fp8_layers, + "throughput": metrics["unpadded_tokens_per_second_per_gpu"], + } + + if passed: + entry["action"] = "continue" + lkg_step = next_checkin + current_step = next_checkin + history.append(entry) + save_state() + update_report() + continue + + # --- FAIL --- + stop_training() + + if not get_fp8_layers(): + # All layers already BF16 — nothing more to demote + entry["action"] = "continue_bf16_exhausted" + lkg_step = next_checkin + current_step = next_checkin + history.append(entry) + save_state() + update_report() + continue + + demoted = demote_next_batch() + entry["action"] = "demote_layers" + entry["demoted"] = demoted + entry["rollback_to_step"] = lkg_step + history.append(entry) + + delete_checkpoints_after(lkg_step) + current_step = lkg_step + save_state() + update_report() + +# --- Training Complete --- +generate_final_report() +``` + +______________________________________________________________________ + +## Check-in Decision Tree + +``` +[Check-in at step N] + | + v + baseline_key = f"step_{N}" + baseline_key in baseline? + / \ + YES NO → FATAL: misaligned CHECKIN_INTERVAL + | + v + current_ppl = exp(loss) + baseline_ppl = baseline[step_N] + allowed_delta = baseline_ppl * (TOLERANCE_PCT / 100) + | + diff = current_ppl - baseline_ppl + | + v + diff <= allowed_delta? + / \ + YES NO + | | + v v + PASS FAIL + LKG = N | + continue Any FP8 layers left? 
+ / \ + YES NO + | | + v v + demote tail_in continue (all BF16) + rollback to LKG LKG = N + resume from LKG +``` + +______________________________________________________________________ + +## Worked Example (32-layer OG2-7B) + +### Setup + +- `TOLERANCE_PCT = 5.0` +- `CHECKIN_INTERVAL = 100` +- `LAYERS_PER_PROMOTION = 2` + +### Timeline + +**Step 100 — Check-in 1** + +- Baseline ppl: 6.04 +- Current ppl: 6.15 +- Allowed delta: 6.04 * 0.05 = 0.302 +- Diff: 0.11 < 0.302 → **PASS** +- FP8 layers: [1..32] (all 32) + +**Step 200 — Check-in 2** + +- Baseline ppl: 4.16 +- Current ppl: 4.45 +- Allowed delta: 4.16 * 0.05 = 0.208 +- Diff: 0.29 > 0.208 → **FAIL** +- Demote: layers 32, 31 (tail batch 1) +- Rollback to step 100 checkpoint +- FP8 layers: [1..30] (30 layers) + +**Step 200 (retry) — Check-in 2b** + +- Baseline ppl: 4.16 +- Current ppl: 4.30 +- Allowed delta: 0.208 +- Diff: 0.14 < 0.208 → **PASS** +- FP8 layers: [1..30] (30 layers) + +**Step 300 — Check-in 3** + +- Baseline ppl: 4.09 +- Current ppl: 4.35 +- Allowed delta: 4.09 * 0.05 = 0.205 +- Diff: 0.26 > 0.205 → **FAIL** +- Demote: layers 30, 29 (tail batch 2) +- Rollback to step 200 checkpoint +- FP8 layers: [1..28] (28 layers) + +**Step 300 (retry) — Check-in 3b** + +- Baseline ppl: 4.09 +- Current ppl: 4.22 +- Allowed delta: 0.205 +- Diff: 0.13 < 0.205 → **PASS** +- FP8 layers: [1..28] (28 layers) + +**Step 400 — Check-in 4** + +- Baseline ppl: 3.95 +- Current ppl: 4.05 +- Allowed delta: 3.95 * 0.05 = 0.198 +- Diff: 0.10 < 0.198 → **PASS** +- FP8 layers: [1..28] (28 layers, stable) + +Training proceeds with 28 of 32 layers in FP8 — 87.5% quantized coverage. 
+ +______________________________________________________________________ + +## Comparison with `ends_in` + +| Aspect | `ends_in` | `tail_in` | +| ------------------ | ------------------------------------------------------ | ------------------------------------------------- | +| Demotion direction | Both ends inward | Output end toward head only | +| Maximum rounds | 16 (all layers) | 16 (all layers) | +| Minimum FP8 | 0 | 0 | +| Cap | None (demotes until all BF16 or passes) | None (demotes until all BF16 or passes) | +| Best for | Unknown sensitivity distribution | Output-sensitive models | +| Risk | Over-demotion at input end if only output is sensitive | Under-demotion if input layers are also sensitive | + +### When to Choose `tail_in` + +- Prior evidence suggests the output-end layers are the primary source of quantization divergence. +- You want to preserve input-end FP8 layers as long as possible. +- The model shows stronger sensitivity near the output projection than near the embedding. + +### When to Choose `ends_in` + +- You have no prior knowledge of which layers are most sensitive. +- The model shows sensitivity at both the input and output ends. +- You want symmetric coverage: both ends get demoted at the same rate. + +______________________________________________________________________ + +## Notes + +- There is **no fixed cap** on demotions. The agent keeps demoting from the tail as long as check-ins fail. +- The `tail_ptr` never moves forward — demotions are permanent. +- The agent always resumes from the LKG checkpoint, never from the failed step. +- If the baseline logfile is missing an entry for the check-in step, the agent must **fail hard** and tell the user to fix `CHECKIN_INTERVAL` alignment. 
diff --git a/bionemo-recipes/recipes/opengenome2_llama_native_te/agent_daemon.py b/bionemo-recipes/recipes/opengenome2_llama_native_te/agent_daemon.py new file mode 100644 index 0000000000..c9cf0a2aa1 --- /dev/null +++ b/bionemo-recipes/recipes/opengenome2_llama_native_te/agent_daemon.py @@ -0,0 +1,445 @@ +#!/usr/bin/env python3 + +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-Apache2 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +r"""AI-in-the-loop FP8 precision debugging daemon. + +Runs as a sidecar Lepton job on a CPU-only node with shared NFS access. +Monitors training metrics and FP8 stats, detects anomalies, and either +applies Tier 1 hot-reload fixes (via the NFS control file) or proposes +Tier 2 checkpoint-and-restart changes. + +Usage (standalone):: + + python agent_daemon.py --config /data/agent/agent_config.yaml + +Usage (as Lepton sidecar): + Launched automatically by ``submit_og2_lepton_eden.py`` when + ``agent.enabled=true`` is set in the Lepton config. 
+""" + +import argparse +import json +import logging +import os +import time +import uuid +from collections import deque +from dataclasses import dataclass +from pathlib import Path +from typing import Any + +import yaml + +from agent_fp8_analyzer import FP8StatsAnalyzer +from agent_interventions import InterventionLimits, Tier1Executor, Tier2Executor +from agent_journal import EntryType, ExperimentJournal + + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s [%(name)s] %(levelname)s: %(message)s", + datefmt="%Y-%m-%dT%H:%M:%S", +) +logger = logging.getLogger("agent_daemon") + + +@dataclass +class AgentConfig: + """Daemon configuration loaded from YAML or CLI args.""" + + metrics_file: str = "/data/agent/metrics.jsonl" + control_file: str = "/data/agent/control.yaml" + journal_file: str = "/data/agent/journal.jsonl" + fp8_stats_dir: str = "./log_fp8_stats" + + monitor_interval_seconds: float = 30.0 + loss_spike_threshold_pct: float = 50.0 + grad_norm_spike_multiplier: float = 3.0 + nan_halt: bool = True + observation_only: bool = True + max_tier2_restarts: int = 3 + cooldown_steps: int = 500 + + hydra_config_dir: str | None = None + hydra_config_name: str | None = None + lepton_config_path: str | None = None + + rolling_window_size: int = 20 + + @classmethod + def from_yaml(cls, path: str | os.PathLike) -> "AgentConfig": + """Load config from a YAML file, falling back to defaults for missing keys.""" + with open(path) as f: + data = yaml.safe_load(f) or {} + return cls(**{k: v for k, v in data.items() if k in cls.__dataclass_fields__}) + + +@dataclass +class MetricSnapshot: + """A single training metric record parsed from the JSONL stream.""" + + step: int = 0 + loss: float = 0.0 + grad_norm: float = 0.0 + lr: float = 0.0 + loss_delta_pct: float = 0.0 + nan_detected: bool = False + timestamp: str = "" + + +class AgentDaemon: + """Main monitoring and intervention loop. + + Args: + config: Agent configuration. 
+ """ + + def __init__(self, config: AgentConfig): + """Initialize the agent daemon.""" + self._config = config + self._session_id = f"agent-{uuid.uuid4().hex[:8]}" + + self._journal = ExperimentJournal( + journal_path=config.journal_file, + session_id=self._session_id, + ) + + limits = InterventionLimits( + max_tier2_restarts=config.max_tier2_restarts, + cooldown_steps=config.cooldown_steps, + ) + + self._tier1 = Tier1Executor( + control_file=config.control_file, + journal=self._journal, + limits=limits, + agent_id=self._session_id, + ) + self._tier2 = Tier2Executor( + control_file=config.control_file, + journal=self._journal, + hydra_config_dir=config.hydra_config_dir, + lepton_config_path=config.lepton_config_path, + limits=limits, + agent_id=self._session_id, + ) + + self._fp8_analyzer = FP8StatsAnalyzer(log_dir=config.fp8_stats_dir) + + self._metrics_read_pos: int = 0 + self._loss_history: deque[float] = deque(maxlen=config.rolling_window_size) + self._grad_norm_history: deque[float] = deque(maxlen=config.rolling_window_size) + self._last_step: int = 0 + self._last_intervention_entry_id: int | None = None + + def run(self) -> None: + """Start the blocking monitoring loop.""" + self._journal.log( + EntryType.SESSION_START, + extra={ + "session_id": self._session_id, + "config": dict(self._config.__dict__), + "observation_only": self._config.observation_only, + }, + ) + + logger.info( + "Agent daemon started (session=%s, observation_only=%s)", self._session_id, self._config.observation_only + ) + + prior = self._journal.load_history() + if prior: + logger.info("Loaded %d prior journal entries for context", len(prior)) + + try: + while True: + self._monitor_tick() + time.sleep(self._config.monitor_interval_seconds) + except KeyboardInterrupt: + logger.info("Agent daemon interrupted") + finally: + self._journal.log(EntryType.SESSION_END, extra={"session_id": self._session_id}) + + def _monitor_tick(self) -> None: + """One iteration of the monitoring loop.""" + 
snapshots = self._read_new_metrics() + fp8_report = self._fp8_analyzer.analyze() + + for snap in snapshots: + self._analyze_snapshot(snap, fp8_report) + + if not fp8_report.healthy: + self._handle_fp8_anomalies(fp8_report) + + # ------------------------------------------------------------------ + # Metric reading + # ------------------------------------------------------------------ + + def _read_new_metrics(self) -> list[MetricSnapshot]: + """Tail-read new JSONL records since the last read position.""" + path = Path(self._config.metrics_file) + if not path.exists(): + return [] + + snapshots: list[MetricSnapshot] = [] + try: + with open(path) as f: + f.seek(self._metrics_read_pos) + for raw_line in f: + stripped = raw_line.strip() + if not stripped: + continue + try: + data = json.loads(stripped) + except json.JSONDecodeError: + continue + + if data.get("type") == "intervention": + continue + + snap = MetricSnapshot( + step=data.get("step", 0), + loss=data.get("loss", 0.0), + grad_norm=data.get("grad_norm", 0.0), + lr=data.get("lr", 0.0), + loss_delta_pct=data.get("loss_delta_pct", 0.0), + nan_detected=data.get("nan_detected", False), + timestamp=data.get("timestamp", ""), + ) + snapshots.append(snap) + + self._metrics_read_pos = f.tell() + except OSError: + logger.debug("Could not read metrics file", exc_info=True) + + return snapshots + + # ------------------------------------------------------------------ + # Anomaly detection + # ------------------------------------------------------------------ + + def _analyze_snapshot(self, snap: MetricSnapshot, fp8_report: Any) -> None: + """Check a single metric snapshot for anomalies and decide on interventions.""" + self._last_step = snap.step + + if snap.nan_detected: + self._handle_nan(snap) + return + + self._loss_history.append(snap.loss) + self._grad_norm_history.append(snap.grad_norm) + + if self._detect_loss_spike(snap): + self._handle_loss_spike(snap) + + if self._detect_grad_norm_spike(snap): + 
self._handle_grad_norm_spike(snap) + + self._journal.log( + EntryType.OBSERVATION, + step=snap.step, + extra={ + "loss": snap.loss, + "grad_norm": snap.grad_norm, + "lr": snap.lr, + "loss_delta_pct": snap.loss_delta_pct, + "fp8_healthy": fp8_report.healthy if hasattr(fp8_report, "healthy") else True, + }, + ) + + def _detect_loss_spike(self, snap: MetricSnapshot) -> bool: + """True if loss jumped significantly vs rolling average.""" + if len(self._loss_history) < 3: + return False + recent = list(self._loss_history)[-5:] + avg = sum(recent[:-1]) / len(recent[:-1]) + if avg == 0: + return False + pct_change = ((snap.loss - avg) / abs(avg)) * 100.0 + return pct_change > self._config.loss_spike_threshold_pct + + def _detect_grad_norm_spike(self, snap: MetricSnapshot) -> bool: + """True if grad norm exceeds N * rolling average.""" + if len(self._grad_norm_history) < 3: + return False + recent = list(self._grad_norm_history)[-5:] + avg = sum(recent[:-1]) / len(recent[:-1]) + if avg == 0: + return False + return snap.grad_norm > avg * self._config.grad_norm_spike_multiplier + + # ------------------------------------------------------------------ + # Intervention handlers + # ------------------------------------------------------------------ + + def _handle_nan(self, snap: MetricSnapshot) -> None: + """React to NaN loss or gradient norm.""" + logger.error("NaN detected at step %d", snap.step) + self._journal.log( + EntryType.OBSERVATION, + step=snap.step, + trigger=f"NaN detected at step {snap.step}", + evidence={"loss": snap.loss, "grad_norm": snap.grad_norm}, + ) + + if self._config.nan_halt and not self._config.observation_only: + logger.info("Requesting checkpoint-and-stop due to NaN") + self._tier2.request_checkpoint_and_stop( + snap.step, + reason=f"NaN detected at step {snap.step}", + ) + + def _handle_loss_spike(self, snap: MetricSnapshot) -> None: + """React to a sudden loss spike.""" + recent = list(self._loss_history)[-5:] + avg = sum(recent[:-1]) / 
len(recent[:-1]) if len(recent) > 1 else snap.loss + pct = ((snap.loss - avg) / abs(avg)) * 100.0 if avg != 0 else 0 + + trigger = f"Loss spike at step {snap.step}: {snap.loss:.4f} ({pct:+.1f}% vs rolling avg {avg:.4f})" + logger.warning(trigger) + + self._journal.log( + EntryType.OBSERVATION, + step=snap.step, + trigger=trigger, + evidence={"loss": snap.loss, "rolling_avg": avg, "pct_change": pct}, + ) + + if self._config.observation_only: + return + + if self._tier1.can_intervene(snap.step): + new_clip = max(0.1, snap.grad_norm * 0.5) + self._journal.log( + EntryType.HYPOTHESIS, + step=snap.step, + hypothesis="Loss spike likely from gradient explosion; reducing grad clip", + ) + self._tier1.apply( + {"grad_clip_norm": new_clip}, + current_step=snap.step, + trigger=trigger, + hypothesis="Reducing grad clip to stabilize", + ) + + def _handle_grad_norm_spike(self, snap: MetricSnapshot) -> None: + """React to a gradient norm spike.""" + recent = list(self._grad_norm_history)[-5:] + avg = sum(recent[:-1]) / len(recent[:-1]) if len(recent) > 1 else snap.grad_norm + + trigger = ( + f"Grad norm spike at step {snap.step}: {snap.grad_norm:.4f} " + f"({snap.grad_norm / avg:.1f}x rolling avg {avg:.4f})" + ) + logger.warning(trigger) + + self._journal.log( + EntryType.OBSERVATION, + step=snap.step, + trigger=trigger, + evidence={"grad_norm": snap.grad_norm, "rolling_avg": avg}, + ) + + def _handle_fp8_anomalies(self, fp8_report: Any) -> None: + """React to FP8 stat anomalies (overflow, underflow, instability).""" + summary = fp8_report.summary() + logger.warning("FP8 anomalies detected: %s", summary) + + self._journal.log( + EntryType.OBSERVATION, + step=self._last_step, + trigger=f"FP8 anomalies: {len(summary.get('anomalies', []))} issues", + evidence=summary, + ) + + if self._config.observation_only: + return + + overflow_layers = summary.get("overflow_layers", []) + if overflow_layers and self._tier2.can_restart(): + layer_indices = 
self._extract_layer_indices(overflow_layers) + if layer_indices: + min_idx = min(layer_indices) + + hypothesis = ( + f"FP8 overflow in layers {overflow_layers} — proposing to extend BF16 buffer to cover these layers" + ) + self._journal.log( + EntryType.HYPOTHESIS, + step=self._last_step, + hypothesis=hypothesis, + evidence=summary, + ) + + n_start_bf16 = min_idx + 1 if min_idx < 4 else 1 + logger.info( + "FP8 overflow: would propose num_layers_at_start_in_bf16=%d (requires human approval for Tier 2)", + n_start_bf16, + ) + + @staticmethod + def _extract_layer_indices(layer_names: list[str]) -> list[int]: + """Extract numeric layer indices from layer name strings like 'model.layers.3.self_attn'.""" + import re + + indices = [] + for name in layer_names: + match = re.search(r"layers?[._](\d+)", name) + if match: + indices.append(int(match.group(1))) + return indices + + +def main() -> None: + """CLI entry point for the agent daemon.""" + parser = argparse.ArgumentParser(description="AI-in-the-loop FP8 debugging daemon") + parser.add_argument("--config", type=str, default=None, help="Path to agent config YAML") + parser.add_argument("--metrics-file", type=str, default=None) + parser.add_argument("--control-file", type=str, default=None) + parser.add_argument("--journal-file", type=str, default=None) + parser.add_argument("--fp8-stats-dir", type=str, default=None) + parser.add_argument("--monitor-interval", type=float, default=None) + parser.add_argument("--observe-only", action="store_true", default=None) + parser.add_argument("--intervene", action="store_true", help="Enable active interventions (disable observe-only)") + args = parser.parse_args() + + if args.config: + config = AgentConfig.from_yaml(args.config) + else: + config = AgentConfig() + + if args.metrics_file: + config.metrics_file = args.metrics_file + if args.control_file: + config.control_file = args.control_file + if args.journal_file: + config.journal_file = args.journal_file + if args.fp8_stats_dir: 
+ config.fp8_stats_dir = args.fp8_stats_dir + if args.monitor_interval is not None: + config.monitor_interval_seconds = args.monitor_interval + if args.observe_only: + config.observation_only = True + if args.intervene: + config.observation_only = False + + daemon = AgentDaemon(config) + daemon.run() + + +if __name__ == "__main__": + main() diff --git a/bionemo-recipes/recipes/opengenome2_llama_native_te/agent_fp8_analyzer.py b/bionemo-recipes/recipes/opengenome2_llama_native_te/agent_fp8_analyzer.py new file mode 100644 index 0000000000..cf18f6b976 --- /dev/null +++ b/bionemo-recipes/recipes/opengenome2_llama_native_te/agent_fp8_analyzer.py @@ -0,0 +1,262 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-Apache2 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""FP8 stats file parser and anomaly detection for the agent daemon. + +Reads the per-rank log files produced by ``nvdlfw_inspect`` / TransformerEngine's +``LogFp8TensorStats`` feature and surfaces actionable signals: + +* Per-layer overflow / underflow rates for activations, gradients, and weights +* Scaling factor oscillation (sign of numerical instability) +* Layers consistently near E4M3 saturation +* Sudden distribution shifts in amax histograms + +The analyzer is designed to run in the agent sidecar (CPU-only) and never +imports torch or TransformerEngine itself. 
+""" + +import csv +import logging +import math +import os +import re +from collections import defaultdict +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any + + +logger = logging.getLogger(__name__) + +# E4M3 representable range: ~[-448, 448] +E4M3_MAX = 448.0 + + +@dataclass +class LayerFP8Stats: + """Aggregated FP8 statistics for a single layer.""" + + layer_name: str + tensor_type: str # "activation", "gradient", or "weight" + underflow_pct: float = 0.0 + overflow_pct: float = 0.0 + scale_inv_min: float = float("inf") + scale_inv_max: float = 0.0 + mse: float = 0.0 + sample_count: int = 0 + + +@dataclass +class FP8HealthReport: + """Summary of FP8 numerical health across all layers. + + Attributes: + layer_stats: Per-layer, per-tensor-type statistics. + anomalies: List of human-readable anomaly descriptions. + overflow_layers: Layers with overflow rate above threshold. + underflow_layers: Layers with underflow rate above threshold. + unstable_layers: Layers with scaling factor oscillation. + healthy: True if no anomalies were detected. 
+ """ + + layer_stats: dict[str, list[LayerFP8Stats]] = field(default_factory=dict) + anomalies: list[str] = field(default_factory=list) + overflow_layers: list[str] = field(default_factory=list) + underflow_layers: list[str] = field(default_factory=list) + unstable_layers: list[str] = field(default_factory=list) + + @property + def healthy(self) -> bool: + """True when no anomalies were detected.""" + return len(self.anomalies) == 0 + + def summary(self) -> dict[str, Any]: + """Return a JSON-serializable summary for the experiment journal.""" + return { + "healthy": self.healthy, + "num_anomalies": len(self.anomalies), + "overflow_layers": self.overflow_layers, + "underflow_layers": self.underflow_layers, + "unstable_layers": self.unstable_layers, + "anomalies": self.anomalies[:10], + } + + +class FP8StatsAnalyzer: + """Parses nvdlfw_inspect log directories and produces FP8 health reports. + + Args: + log_dir: Root directory containing per-rank subdirectories of FP8 stats. + overflow_threshold: Overflow % above which a layer is flagged. + underflow_threshold: Underflow % above which a layer is flagged. + scale_oscillation_ratio: If ``scale_inv_max / scale_inv_min`` exceeds + this ratio within a window, the layer is flagged as unstable. + rank: Which rank's logs to analyze (default: rank 0, the most informative). 
+ """ + + def __init__( + self, + log_dir: str | os.PathLike = "./log_fp8_stats", + overflow_threshold: float = 5.0, + underflow_threshold: float = 10.0, + scale_oscillation_ratio: float = 100.0, + rank: int = 0, + ): + """Initialize the analyzer.""" + self._log_dir = Path(log_dir) + self._overflow_threshold = overflow_threshold + self._underflow_threshold = underflow_threshold + self._scale_oscillation_ratio = scale_oscillation_ratio + self._rank = rank + self._last_read_positions: dict[str, int] = {} + + def analyze(self) -> FP8HealthReport: + """Parse all available stat files for the configured rank and return a health report.""" + report = FP8HealthReport() + rank_dir = self._log_dir / f"rank_{self._rank}" + + if not rank_dir.exists(): + return report + + raw_stats = self._parse_rank_directory(rank_dir) + report.layer_stats = raw_stats + + for stats_list in raw_stats.values(): + for stats in stats_list: + self._check_overflow(stats, report) + self._check_underflow(stats, report) + self._check_scale_stability(stats, report) + + return report + + def _parse_rank_directory(self, rank_dir: Path) -> dict[str, list[LayerFP8Stats]]: + """Parse all CSV/log files in a rank directory into LayerFP8Stats.""" + result: dict[str, list[LayerFP8Stats]] = defaultdict(list) + + for log_file in sorted(rank_dir.glob("*.csv")): + try: + stats = self._parse_csv_file(log_file) + for s in stats: + result[s.layer_name].append(s) + except Exception: + logger.debug("Failed to parse %s", log_file, exc_info=True) + + for log_file in sorted(rank_dir.glob("*.log")): + try: + stats = self._parse_log_file(log_file) + for s in stats: + result[s.layer_name].append(s) + except Exception: + logger.debug("Failed to parse %s", log_file, exc_info=True) + + return dict(result) + + def _parse_csv_file(self, path: Path) -> list[LayerFP8Stats]: + """Parse a CSV file with columns like layer, tensor, underflows%, scale_inv_min, scale_inv_max, mse.""" + stats: list[LayerFP8Stats] = [] + file_key = str(path) 
+ start_pos = self._last_read_positions.get(file_key, 0) + + with open(path) as f: + f.seek(start_pos) + reader = csv.DictReader(f) + for row in reader: + try: + s = LayerFP8Stats( + layer_name=row.get("layer", row.get("layer_name", "unknown")), + tensor_type=row.get("tensor", row.get("tensor_type", "unknown")), + underflow_pct=float(row.get("underflows%", row.get("underflow_pct", 0))), + scale_inv_min=float(row.get("scale_inv_min", float("inf"))), + scale_inv_max=float(row.get("scale_inv_max", 0)), + mse=float(row.get("mse", 0)), + sample_count=1, + ) + stats.append(s) + except (ValueError, KeyError): + continue + + self._last_read_positions[file_key] = f.tell() + + return stats + + def _parse_log_file(self, path: Path) -> list[LayerFP8Stats]: + """Parse a structured log file with key=value pairs per line.""" + stats: list[LayerFP8Stats] = [] + kv_pattern = re.compile(r"(\w+)=([\d.eE+\-]+|[\w/]+)") + + file_key = str(path) + start_pos = self._last_read_positions.get(file_key, 0) + + with open(path) as f: + f.seek(start_pos) + for line in f: + pairs = dict(kv_pattern.findall(line)) + if "layer" in pairs or "layer_name" in pairs: + try: + s = LayerFP8Stats( + layer_name=pairs.get("layer", pairs.get("layer_name", "unknown")), + tensor_type=pairs.get("tensor", pairs.get("tensor_type", "unknown")), + underflow_pct=float(pairs.get("underflows", pairs.get("underflow_pct", "0"))), + scale_inv_min=float(pairs.get("scale_inv_min", "inf")), + scale_inv_max=float(pairs.get("scale_inv_max", "0")), + mse=float(pairs.get("mse", "0")), + sample_count=1, + ) + stats.append(s) + except (ValueError, KeyError): + continue + + self._last_read_positions[file_key] = f.tell() + + return stats + + def _check_overflow(self, stats: LayerFP8Stats, report: FP8HealthReport) -> None: + """Flag layers with high overflow rates.""" + if stats.overflow_pct > self._overflow_threshold: + msg = ( + f"FP8 overflow: {stats.layer_name}/{stats.tensor_type} " + f"overflow={stats.overflow_pct:.1f}% 
(threshold={self._overflow_threshold}%)" + ) + report.anomalies.append(msg) + if stats.layer_name not in report.overflow_layers: + report.overflow_layers.append(stats.layer_name) + + def _check_underflow(self, stats: LayerFP8Stats, report: FP8HealthReport) -> None: + """Flag layers with high underflow rates.""" + if stats.underflow_pct > self._underflow_threshold: + msg = ( + f"FP8 underflow: {stats.layer_name}/{stats.tensor_type} " + f"underflow={stats.underflow_pct:.1f}% (threshold={self._underflow_threshold}%)" + ) + report.anomalies.append(msg) + if stats.layer_name not in report.underflow_layers: + report.underflow_layers.append(stats.layer_name) + + def _check_scale_stability(self, stats: LayerFP8Stats, report: FP8HealthReport) -> None: + """Flag layers where the scaling factor swings wildly.""" + if stats.scale_inv_min <= 0 or not math.isfinite(stats.scale_inv_min): + return + if stats.scale_inv_max <= 0 or not math.isfinite(stats.scale_inv_max): + return + + ratio = stats.scale_inv_max / stats.scale_inv_min + if ratio > self._scale_oscillation_ratio: + msg = ( + f"FP8 scale oscillation: {stats.layer_name}/{stats.tensor_type} " + f"scale_inv ratio={ratio:.1f}x (threshold={self._scale_oscillation_ratio}x)" + ) + report.anomalies.append(msg) + if stats.layer_name not in report.unstable_layers: + report.unstable_layers.append(stats.layer_name) diff --git a/bionemo-recipes/recipes/opengenome2_llama_native_te/agent_interventions.py b/bionemo-recipes/recipes/opengenome2_llama_native_te/agent_interventions.py new file mode 100644 index 0000000000..e7088b0c2e --- /dev/null +++ b/bionemo-recipes/recipes/opengenome2_llama_native_te/agent_interventions.py @@ -0,0 +1,394 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-Apache2 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tier 1 and Tier 2 intervention executors for the agent daemon. + +Tier 1 (hot-reload): writes a new version of the NFS control file so the +training loop picks up parameter changes on its next poll. + +Tier 2 (checkpoint-restart): signals the training loop to save a checkpoint +and exit, then modifies the Hydra config on disk and resubmits the job +via the Lepton API. +""" + +import logging +import os +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any + +import yaml + +from agent_journal import EntryType, ExperimentJournal +from control_plane import write_control_file + + +logger = logging.getLogger(__name__) + + +@dataclass +class InterventionLimits: + """Hard safety limits for automated interventions. + + Attributes: + lr_max_multiplier: LR cannot exceed ``initial_lr * lr_max_multiplier``. + grad_clip_min: Gradient clip norm floor. + max_tier2_restarts: Total Tier 2 restarts allowed per session. + cooldown_steps: Minimum steps between any two interventions. + """ + + lr_max_multiplier: float = 2.0 + grad_clip_min: float = 0.1 + max_tier2_restarts: int = 3 + cooldown_steps: int = 500 + + +@dataclass +class InterventionState: + """Mutable state tracking for the intervention engine.""" + + version: int = 0 + tier2_restart_count: int = 0 + last_intervention_step: int = -999_999 + initial_lr: float | None = None + previous_values: dict[str, Any] = field(default_factory=dict) + + +class Tier1Executor: + """Write safe hot-reload interventions to the NFS control file. 
+ + Args: + control_file: Path to the YAML control file. + journal: Experiment journal for audit logging. + limits: Safety limits. + agent_id: Identifier for this agent session. + """ + + def __init__( + self, + control_file: str | os.PathLike, + journal: ExperimentJournal, + limits: InterventionLimits | None = None, + agent_id: str = "agent", + ): + """Initialize the Tier 1 executor.""" + self._control_file = Path(control_file) + self._journal = journal + self._limits = limits or InterventionLimits() + self._agent_id = agent_id + self._state = InterventionState() + + def can_intervene(self, current_step: int) -> bool: + """Check whether an intervention is allowed right now.""" + return (current_step - self._state.last_intervention_step) >= self._limits.cooldown_steps + + def apply( + self, + interventions: dict[str, Any], + current_step: int, + trigger: str = "", + hypothesis: str = "", + evidence: dict[str, Any] | None = None, + ) -> bool: + """Validate, write control file, and journal the intervention. + + Args: + interventions: Parameter overrides to apply. + current_step: Current training step (for cooldown checks). + trigger: Human-readable description of what triggered this. + hypothesis: Agent's hypothesis about the root cause. + evidence: Supporting metric data. + + Returns: + True if the intervention was written; False if rejected. 
+ """ + if not self.can_intervene(current_step): + logger.warning("Intervention rejected: cooldown not elapsed (step %d)", current_step) + return False + + validated = self._validate(interventions) + if not validated: + logger.warning("Intervention rejected: all values out of bounds") + return False + + self._state.version += 1 + write_control_file( + path=self._control_file, + version=self._state.version, + interventions=validated, + agent_id=self._agent_id, + ) + + config_diff = {} + for k, v in validated.items(): + old = self._state.previous_values.get(k) + config_diff[k] = [old, v] + self._state.previous_values[k] = v + + self._state.last_intervention_step = current_step + + self._journal.log( + EntryType.INTERVENTION, + step=current_step, + trigger=trigger, + hypothesis=hypothesis, + evidence=evidence, + action=f"tier1: {validated}", + config_diff=config_diff, + ) + + logger.info("Tier 1 intervention v%d applied at step %d: %s", self._state.version, current_step, validated) + return True + + def rollback(self, current_step: int) -> bool: + """Revert to the previously saved parameter values. + + Returns: + True if a rollback was written; False if there is nothing to revert. 
+ """ + if not self._state.previous_values: + return False + + rollback_values = dict(self._state.previous_values) + self._state.version += 1 + write_control_file( + path=self._control_file, + version=self._state.version, + interventions=rollback_values, + agent_id=self._agent_id, + ) + + self._journal.log( + EntryType.INTERVENTION, + step=current_step, + trigger="auto-rollback due to degradation", + action=f"tier1_rollback: {rollback_values}", + ) + + logger.info("Tier 1 rollback v%d at step %d: %s", self._state.version, current_step, rollback_values) + return True + + def _validate(self, raw: dict[str, Any]) -> dict[str, Any]: + """Enforce safety bounds on intervention values.""" + result: dict[str, Any] = {} + for key, value in raw.items(): + if key == "learning_rate" and isinstance(value, (int, float)): + clamped = float(value) + if self._state.initial_lr is not None: + max_lr = self._state.initial_lr * self._limits.lr_max_multiplier + clamped = min(clamped, max_lr) + result[key] = max(0.0, clamped) + elif key == "grad_clip_norm" and isinstance(value, (int, float)): + result[key] = max(self._limits.grad_clip_min, float(value)) + else: + result[key] = value + return result + + def set_initial_lr(self, lr: float) -> None: + """Record the initial LR so we can enforce the max-multiplier bound.""" + self._state.initial_lr = lr + self._state.previous_values["learning_rate"] = lr + + +class Tier2Executor: + """Checkpoint-and-restart interventions via Lepton API. + + Args: + control_file: Path to the NFS control file (for checkpoint-stop signal). + journal: Experiment journal. + hydra_config_dir: Path to the Hydra config directory on shared NFS. + lepton_config_path: Path to the Lepton YAML config to resubmit. + limits: Safety limits. + agent_id: Identifier for this agent session. 
+ """ + + def __init__( + self, + control_file: str | os.PathLike, + journal: ExperimentJournal, + hydra_config_dir: str | os.PathLike | None = None, + lepton_config_path: str | os.PathLike | None = None, + limits: InterventionLimits | None = None, + agent_id: str = "agent", + ): + """Initialize the Tier 2 executor.""" + self._control_file = Path(control_file) + self._journal = journal + self._hydra_config_dir = Path(hydra_config_dir) if hydra_config_dir else None + self._lepton_config_path = Path(lepton_config_path) if lepton_config_path else None + self._limits = limits or InterventionLimits() + self._agent_id = agent_id + self._state = InterventionState() + + def can_restart(self) -> bool: + """Check whether a Tier 2 restart is allowed.""" + return self._state.tier2_restart_count < self._limits.max_tier2_restarts + + def request_checkpoint_and_stop(self, current_step: int, reason: str = "") -> bool: + """Signal the training loop to save a checkpoint and exit. + + Args: + current_step: Current training step. + reason: Why the restart is needed. + + Returns: + True if the signal was written. + """ + if not self.can_restart(): + logger.warning("Tier 2 restart rejected: max restarts (%d) reached", self._limits.max_tier2_restarts) + return False + + self._state.version += 1 + write_control_file( + path=self._control_file, + version=self._state.version, + interventions={"request_checkpoint_and_stop": True}, + agent_id=self._agent_id, + ) + + self._journal.log( + EntryType.INTERVENTION, + step=current_step, + trigger=reason, + action="tier2: request_checkpoint_and_stop", + ) + + logger.info("Tier 2 checkpoint-and-stop requested at step %d: %s", current_step, reason) + return True + + def modify_hydra_config( + self, + config_name: str, + overrides: dict[str, Any], + current_step: int, + ) -> bool: + """Modify a Hydra YAML config file on NFS for the next restart. + + Args: + config_name: Name of the YAML file (without extension) in the hydra config dir. 
+ overrides: Flat dict of dotted keys → new values. + current_step: Current training step. + + Returns: + True if the config was modified. + """ + if self._hydra_config_dir is None: + logger.error("Cannot modify Hydra config: hydra_config_dir not set") + return False + + config_path = self._hydra_config_dir / f"{config_name}.yaml" + if not config_path.exists(): + logger.error("Hydra config not found: %s", config_path) + return False + + try: + with open(config_path) as f: + data = yaml.safe_load(f) or {} + + config_diff = {} + + for dotted_key, new_value in overrides.items(): + keys = dotted_key.split(".") + node = data + for k in keys[:-1]: + if k not in node or not isinstance(node[k], dict): + node[k] = {} + node = node[k] + + old_value = node.get(keys[-1]) + node[keys[-1]] = new_value + config_diff[dotted_key] = [old_value, new_value] + + # Atomic write + tmp = config_path.with_suffix(".tmp") + with open(tmp, "w") as f: + yaml.safe_dump(data, f, default_flow_style=False) + tmp.rename(config_path) + + self._journal.log( + EntryType.INTERVENTION, + step=current_step, + action=f"tier2: modify {config_name}.yaml", + config_diff=config_diff, + ) + + logger.info("Modified Hydra config %s: %s", config_name, config_diff) + return True + + except Exception: + logger.exception("Failed to modify Hydra config %s", config_path) + return False + + def resubmit_lepton_job( + self, + current_step: int, + extra_overrides: dict[str, str] | None = None, + ) -> bool: + """Resubmit the Lepton training job with resume_from_checkpoint=true. + + Requires ``leptonai`` to be installed in the agent environment. + + Args: + current_step: Current training step (for journaling). + extra_overrides: Additional Lepton config overrides. + + Returns: + True if the job was submitted. 
+ """ + if self._lepton_config_path is None: + logger.error("Cannot resubmit: lepton_config_path not set") + return False + + try: + import importlib.util + + if importlib.util.find_spec("leptonai") is None: + raise ImportError("leptonai not available") + except ImportError: + logger.error("leptonai not installed — cannot resubmit job") + return False + + try: + with open(self._lepton_config_path) as f: + cfg = yaml.safe_load(f) or {} + + cfg["resume_from_checkpoint"] = True + if extra_overrides: + cfg.update(extra_overrides) + + # Atomic rewrite of config with resume enabled + tmp = self._lepton_config_path.with_suffix(".tmp") + with open(tmp, "w") as f: + yaml.safe_dump(cfg, f, default_flow_style=False) + tmp.rename(self._lepton_config_path) + + self._state.tier2_restart_count += 1 + + self._journal.log( + EntryType.INTERVENTION, + step=current_step, + action=f"tier2: resubmit Lepton job (restart #{self._state.tier2_restart_count})", + extra={"lepton_config": str(self._lepton_config_path)}, + ) + + logger.info( + "Lepton job config updated for restart #%d. Run submit_og2_lepton_eden.py to launch.", + self._state.tier2_restart_count, + ) + return True + + except Exception: + logger.exception("Failed to resubmit Lepton job") + return False diff --git a/bionemo-recipes/recipes/opengenome2_llama_native_te/agent_journal.py b/bionemo-recipes/recipes/opengenome2_llama_native_te/agent_journal.py new file mode 100644 index 0000000000..d04572889f --- /dev/null +++ b/bionemo-recipes/recipes/opengenome2_llama_native_te/agent_journal.py @@ -0,0 +1,190 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-Apache2 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Experiment journal for cross-session learning and audit trail. + +Records every observation, hypothesis, intervention, and outcome as a +JSONL log on shared NFS. The journal serves three purposes: + +1. **Audit trail** — reproducible record of what the agent did and why. +2. **Cross-session context** — the agent loads prior entries on startup + so it can reason about patterns across multiple training runs. +3. **Knowledge accumulation** — over time the journal becomes a model- + and dataset-specific knowledge base for FP8 debugging heuristics. +""" + +import json +import logging +import os +import time +from enum import Enum +from pathlib import Path +from typing import Any + + +logger = logging.getLogger(__name__) + + +class EntryType(str, Enum): + """Journal entry categories.""" + + OBSERVATION = "observation" + HYPOTHESIS = "hypothesis" + INTERVENTION = "intervention" + OUTCOME = "outcome" + SESSION_START = "session_start" + SESSION_END = "session_end" + ERROR = "error" + + +class ExperimentJournal: + """Append-only JSONL journal on shared storage. + + Args: + journal_path: Path to the JSONL file. + session_id: Unique identifier for the current agent session. 
+ """ + + def __init__( + self, + journal_path: str | os.PathLike = "/data/agent/journal.jsonl", + session_id: str = "default", + ): + """Initialize the experiment journal.""" + self._path = Path(journal_path) + self._path.parent.mkdir(parents=True, exist_ok=True) + self._session_id = session_id + self._entry_counter = 0 + + def log( + self, + entry_type: EntryType | str, + *, + step: int | None = None, + trigger: str | None = None, + hypothesis: str | None = None, + evidence: dict[str, Any] | None = None, + action: str | None = None, + config_diff: dict[str, Any] | None = None, + outcome: str | None = None, + outcome_step_range: tuple[int, int] | None = None, + extra: dict[str, Any] | None = None, + ) -> dict[str, Any]: + """Append one journal entry and return it. + + All keyword arguments are optional; only relevant fields need to be + supplied for each entry type. + + Args: + entry_type: Category tag (observation / hypothesis / intervention / …). + step: Training step associated with the event. + trigger: What triggered this entry (e.g. metric anomaly description). + hypothesis: Agent's hypothesis for the root cause. + evidence: Supporting data (metric snapshots, per-layer stats, …). + action: Description of the action taken. + config_diff: Mapping of config keys to ``[old, new]`` value pairs. + outcome: Textual summary filled in after observing the result. + outcome_step_range: ``(start, end)`` step range over which the + outcome was measured. + extra: Arbitrary extra data to attach. + + Returns: + The full entry dict that was written. 
+ """ + self._entry_counter += 1 + entry: dict[str, Any] = { + "id": self._entry_counter, + "session_id": self._session_id, + "timestamp": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()), + "type": str(entry_type), + } + if step is not None: + entry["step"] = step + if trigger is not None: + entry["trigger"] = trigger + if hypothesis is not None: + entry["hypothesis"] = hypothesis + if evidence is not None: + entry["evidence"] = evidence + if action is not None: + entry["action"] = action + if config_diff is not None: + entry["config_diff"] = config_diff + if outcome is not None: + entry["outcome"] = outcome + if outcome_step_range is not None: + entry["outcome_step_range"] = list(outcome_step_range) + if extra is not None: + entry["extra"] = extra + + try: + with open(self._path, "a") as f: + f.write(json.dumps(entry) + "\n") + except OSError: + logger.exception("Failed to write journal entry") + + return entry + + def load_history(self, max_entries: int = 500) -> list[dict[str, Any]]: + """Load the most recent journal entries (for agent context on startup). + + Args: + max_entries: Maximum number of entries to return (from the tail). + + Returns: + List of entry dicts, oldest first. + """ + if not self._path.exists(): + return [] + + entries: list[dict[str, Any]] = [] + try: + with open(self._path) as f: + for raw_line in f: + stripped = raw_line.strip() + if stripped: + try: + entries.append(json.loads(stripped)) + except json.JSONDecodeError: + continue + except OSError: + logger.exception("Failed to read journal") + return [] + + return entries[-max_entries:] + + def update_outcome( + self, + entry_id: int, + outcome: str, + outcome_step_range: tuple[int, int] | None = None, + ) -> None: + """Append a follow-up outcome entry referencing an earlier intervention. + + Rather than mutating the original JSONL line (which is fragile), we + append a new ``outcome`` entry that references the original ``entry_id``. 
+ + Args: + entry_id: The ``id`` of the original intervention entry. + outcome: Textual summary of the observed result. + outcome_step_range: Step range over which the outcome was measured. + """ + self.log( + EntryType.OUTCOME, + action=f"outcome for entry {entry_id}", + outcome=outcome, + outcome_step_range=outcome_step_range, + extra={"references_entry_id": entry_id}, + ) diff --git a/bionemo-recipes/recipes/opengenome2_llama_native_te/agent_metrics.py b/bionemo-recipes/recipes/opengenome2_llama_native_te/agent_metrics.py new file mode 100644 index 0000000000..577d92a654 --- /dev/null +++ b/bionemo-recipes/recipes/opengenome2_llama_native_te/agent_metrics.py @@ -0,0 +1,131 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-Apache2 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Structured JSONL metric emitter for low-latency agent monitoring. + +Writes machine-readable metric records to a JSONL file on shared NFS so +the agent daemon can tail them with sub-second latency (bypassing the +wandb API round-trip). Designed to be called from PerfLogger alongside +the existing wandb logging path. +""" + +import json +import logging +import math +import os +import time +from pathlib import Path + + +logger = logging.getLogger(__name__) + + +class AgentMetricWriter: + """Appends one JSON line per logging window to an NFS-backed JSONL file. 

    Only the global-rank-0 process writes; other ranks are no-ops.

    Args:
        output_path: JSONL file path on shared storage.
        enabled: Master switch. When False every method is a no-op.
        is_main_process: True only on global rank 0.
    """

    def __init__(
        self,
        output_path: str | os.PathLike = "/data/agent/metrics.jsonl",
        enabled: bool = False,
        is_main_process: bool = False,
    ):
        """Initialize the metric writer."""
        # Only rank 0 with the feature flag on ever writes.
        self._enabled = enabled and is_main_process
        self._path = Path(output_path)
        # Previous window's loss, used to compute loss_delta_pct.
        self._prev_loss: float | None = None

        if self._enabled:
            self._path.parent.mkdir(parents=True, exist_ok=True)
            logger.info("AgentMetricWriter enabled → %s", self._path)

    def write_step(
        self,
        step: int,
        loss: float,
        grad_norm: float,
        lr: float,
        step_time: float,
        tokens_per_sec: float,
        gpu_mem_gb: float,
    ) -> None:
        """Append a single metric record for the current logging window.

        Args:
            step: Global optimizer step.
            loss: Average loss over the logging window.
            grad_norm: Gradient norm after clipping.
            lr: Current learning rate.
            step_time: Wall-clock seconds per step (averaged over window).
            tokens_per_sec: Tokens/sec/GPU throughput.
            gpu_mem_gb: GPU memory allocated in GiB.
        """
        if not self._enabled:
            return

        # Percentage change vs the previous window; 0.0 on the first call or
        # when the previous loss was exactly zero.
        loss_delta_pct = 0.0
        if self._prev_loss is not None and self._prev_loss != 0.0:
            loss_delta_pct = ((loss - self._prev_loss) / abs(self._prev_loss)) * 100.0
        self._prev_loss = loss

        record = {
            "step": step,
            "loss": loss,
            "grad_norm": grad_norm,
            "lr": lr,
            "step_time": step_time,
            "tokens_per_sec": tokens_per_sec,
            "gpu_mem_gb": round(gpu_mem_gb, 3),
            "loss_delta_pct": round(loss_delta_pct, 2),
            "nan_detected": math.isnan(loss) or math.isnan(grad_norm),
            # NOTE(review): grad_norm_spike is a hard-coded placeholder — no
            # spike detection is implemented in this writer yet.
            "grad_norm_spike": False,
            "timestamp": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        }

        try:
            with open(self._path, "a") as f:
                f.write(json.dumps(record) + "\n")
        except OSError:
            # Metric emission is best-effort; never crash training over it.
            logger.exception("Failed to write agent metric record")

    def write_intervention(self, step: int, interventions: dict) -> None:
        """Log an agent intervention event alongside normal metrics.

        Args:
            step: Global optimizer step at which the intervention was applied.
            interventions: The dict of parameter overrides that were applied.
+ """ + if not self._enabled: + return + + record = { + "step": step, + "type": "intervention", + "interventions": interventions, + "timestamp": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()), + } + + try: + with open(self._path, "a") as f: + f.write(json.dumps(record) + "\n") + except OSError: + logger.exception("Failed to write agent intervention record") diff --git a/bionemo-recipes/recipes/opengenome2_llama_native_te/baseline_bf16.json b/bionemo-recipes/recipes/opengenome2_llama_native_te/baseline_bf16.json new file mode 100644 index 0000000000..9c84eb823c --- /dev/null +++ b/bionemo-recipes/recipes/opengenome2_llama_native_te/baseline_bf16.json @@ -0,0 +1,9117 @@ +{ + "step_100": { + "perplexity": 6.042289187401148, + "loss": 1.7987829446792603, + "unpadded_tokens_per_sec": 9013.4619140625 + }, + "step_200": { + "perplexity": 4.156514140201405, + "loss": 1.424676775932312, + "unpadded_tokens_per_sec": 10047.052734375 + }, + "step_300": { + "perplexity": 4.091946531935261, + "loss": 1.4090207815170288, + "unpadded_tokens_per_sec": 10075.2119140625 + }, + "step_400": { + "perplexity": 4.159171338411974, + "loss": 1.4253158569335938, + "unpadded_tokens_per_sec": 10064.3994140625 + }, + "step_500": { + "perplexity": 4.101186086544829, + "loss": 1.4112762212753296, + "unpadded_tokens_per_sec": 10087.787109375 + }, + "step_600": { + "perplexity": 4.073528851942108, + "loss": 1.4045096635818481, + "unpadded_tokens_per_sec": 10064.1298828125 + }, + "step_700": { + "perplexity": 3.9552375015116237, + "loss": 1.3750406503677368, + "unpadded_tokens_per_sec": 10045.5263671875 + }, + "step_800": { + "perplexity": 3.83192610630409, + "loss": 1.343367576599121, + "unpadded_tokens_per_sec": 10049.9287109375 + }, + "step_900": { + "perplexity": 3.7742822593180905, + "loss": 1.3282102346420288, + "unpadded_tokens_per_sec": 10044.5595703125 + }, + "step_1000": { + "perplexity": 3.77452612898902, + "loss": 1.3282748460769653, + "unpadded_tokens_per_sec": 10017.5244140625 + }, 
+ "step_1100": { + "perplexity": 3.7927641244316765, + "loss": 1.3330950736999512, + "unpadded_tokens_per_sec": 10031.5859375 + }, + "step_1200": { + "perplexity": 3.7324140599570694, + "loss": 1.3170552253723145, + "unpadded_tokens_per_sec": 10026.564453125 + }, + "step_1300": { + "perplexity": 3.7611003512681425, + "loss": 1.324711561203003, + "unpadded_tokens_per_sec": 10041.3544921875 + }, + "step_1400": { + "perplexity": 3.722907619184012, + "loss": 1.3145049810409546, + "unpadded_tokens_per_sec": 10018.9599609375 + }, + "step_1500": { + "perplexity": 3.7327019462222046, + "loss": 1.3171323537826538, + "unpadded_tokens_per_sec": 10016.90625 + }, + "step_1600": { + "perplexity": 3.704313062226817, + "loss": 1.3094978332519531, + "unpadded_tokens_per_sec": 10018.48828125 + }, + "step_1700": { + "perplexity": 3.7007939495349294, + "loss": 1.3085473775863647, + "unpadded_tokens_per_sec": 10012.1962890625 + }, + "step_1800": { + "perplexity": 3.6713025850787466, + "loss": 1.3005465269088745, + "unpadded_tokens_per_sec": 10006.4228515625 + }, + "step_1900": { + "perplexity": 3.6674838585468548, + "loss": 1.2995058298110962, + "unpadded_tokens_per_sec": 9989.015625 + }, + "step_2000": { + "perplexity": 3.66703226066902, + "loss": 1.2993826866149902, + "unpadded_tokens_per_sec": 10045.7470703125 + }, + "step_2100": { + "perplexity": 3.641972427785213, + "loss": 1.2925254106521606, + "unpadded_tokens_per_sec": 9984.7236328125 + }, + "step_2200": { + "perplexity": 3.6142089550810517, + "loss": 1.2848730087280273, + "unpadded_tokens_per_sec": 9987.4453125 + }, + "step_2300": { + "perplexity": 3.5907264132892336, + "loss": 1.278354525566101, + "unpadded_tokens_per_sec": 9980.5810546875 + }, + "step_2400": { + "perplexity": 3.5827376654874974, + "loss": 1.2761272192001343, + "unpadded_tokens_per_sec": 10027.7099609375 + }, + "step_2500": { + "perplexity": 3.575757190376387, + "loss": 1.2741769552230835, + "unpadded_tokens_per_sec": 9977.9853515625 + }, + "step_2600": { + 
"perplexity": 3.5695391563278394, + "loss": 1.272436499595642, + "unpadded_tokens_per_sec": 9957.9365234375 + }, + "step_2700": { + "perplexity": 3.5577860501849243, + "loss": 1.2691384553909302, + "unpadded_tokens_per_sec": 10037.7626953125 + }, + "step_2800": { + "perplexity": 3.5563960526646925, + "loss": 1.2687476873397827, + "unpadded_tokens_per_sec": 9943.2001953125 + }, + "step_2900": { + "perplexity": 3.533474892189335, + "loss": 1.2622817754745483, + "unpadded_tokens_per_sec": 9949.7080078125 + }, + "step_3000": { + "perplexity": 3.53017741298686, + "loss": 1.2613481283187866, + "unpadded_tokens_per_sec": 10035.1767578125 + }, + "step_3100": { + "perplexity": 3.5188840279365134, + "loss": 1.2581439018249512, + "unpadded_tokens_per_sec": 9955.0048828125 + }, + "step_3200": { + "perplexity": 3.5107332665736926, + "loss": 1.2558249235153198, + "unpadded_tokens_per_sec": 10039.791015625 + }, + "step_3300": { + "perplexity": 3.501614035636209, + "loss": 1.2532240152359009, + "unpadded_tokens_per_sec": 9944.2138671875 + }, + "step_3400": { + "perplexity": 3.489634026925303, + "loss": 1.2497968673706055, + "unpadded_tokens_per_sec": 10038.6962890625 + }, + "step_3500": { + "perplexity": 3.4835807059297874, + "loss": 1.248060703277588, + "unpadded_tokens_per_sec": 9928.0693359375 + }, + "step_3600": { + "perplexity": 3.4764320289816624, + "loss": 1.2460064888000488, + "unpadded_tokens_per_sec": 10020.5791015625 + }, + "step_3700": { + "perplexity": 3.4658559119302588, + "loss": 1.2429596185684204, + "unpadded_tokens_per_sec": 9937.7998046875 + }, + "step_3800": { + "perplexity": 3.455005937820005, + "loss": 1.2398241758346558, + "unpadded_tokens_per_sec": 10022.22265625 + }, + "step_3900": { + "perplexity": 3.4464373535035158, + "loss": 1.237341046333313, + "unpadded_tokens_per_sec": 9936.1396484375 + }, + "step_4000": { + "perplexity": 3.436568986721773, + "loss": 1.2344735860824585, + "unpadded_tokens_per_sec": 10036.4638671875 + }, + "step_4100": { + 
"perplexity": 3.4259952928356054, + "loss": 1.2313920259475708, + "unpadded_tokens_per_sec": 9918.2197265625 + }, + "step_4200": { + "perplexity": 3.4179386740611517, + "loss": 1.2290376424789429, + "unpadded_tokens_per_sec": 10043.3017578125 + }, + "step_4300": { + "perplexity": 3.4064849299232365, + "loss": 1.225680947303772, + "unpadded_tokens_per_sec": 9905.279296875 + }, + "step_4400": { + "perplexity": 3.3941614892445164, + "loss": 1.2220567464828491, + "unpadded_tokens_per_sec": 10038.1455078125 + }, + "step_4500": { + "perplexity": 3.383411727659011, + "loss": 1.2188845872879028, + "unpadded_tokens_per_sec": 9907.6845703125 + }, + "step_4600": { + "perplexity": 3.3698956441707355, + "loss": 1.2148817777633667, + "unpadded_tokens_per_sec": 10036.671875 + }, + "step_4700": { + "perplexity": 3.3618571310469276, + "loss": 1.2124935388565063, + "unpadded_tokens_per_sec": 10017.34375 + }, + "step_4800": { + "perplexity": 3.3511713303354402, + "loss": 1.2093099355697632, + "unpadded_tokens_per_sec": 9894.0244140625 + }, + "step_4900": { + "perplexity": 3.34457863593668, + "loss": 1.2073407173156738, + "unpadded_tokens_per_sec": 10034.5654296875 + }, + "step_5000": { + "perplexity": 3.3341305819128886, + "loss": 1.204211950302124, + "unpadded_tokens_per_sec": 9856.0517578125 + }, + "step_5100": { + "perplexity": 3.3219120725124536, + "loss": 1.200540542602539, + "unpadded_tokens_per_sec": 9917.23046875 + }, + "step_5200": { + "perplexity": 3.3111187691327855, + "loss": 1.1972861289978027, + "unpadded_tokens_per_sec": 10030.595703125 + }, + "step_5300": { + "perplexity": 3.3018256130416517, + "loss": 1.194475531578064, + "unpadded_tokens_per_sec": 9846.0986328125 + }, + "step_5400": { + "perplexity": 3.2967665094430227, + "loss": 1.1929421424865723, + "unpadded_tokens_per_sec": 10030.9287109375 + }, + "step_5500": { + "perplexity": 3.286697404276669, + "loss": 1.1898832321166992, + "unpadded_tokens_per_sec": 9849.453125 + }, + "step_5600": { + "perplexity": 
3.278256638798674, + "loss": 1.1873117685317993, + "unpadded_tokens_per_sec": 10011.9853515625 + }, + "step_5700": { + "perplexity": 3.2707478485967068, + "loss": 1.1850186586380005, + "unpadded_tokens_per_sec": 10027.150390625 + }, + "step_5800": { + "perplexity": 3.263951103797122, + "loss": 1.1829384565353394, + "unpadded_tokens_per_sec": 9841.0361328125 + }, + "step_5900": { + "perplexity": 3.2546502654645297, + "loss": 1.1800848245620728, + "unpadded_tokens_per_sec": 10003.0576171875 + }, + "step_6000": { + "perplexity": 3.2480321671935743, + "loss": 1.1780493259429932, + "unpadded_tokens_per_sec": 10032.1162109375 + }, + "step_6100": { + "perplexity": 3.2418267106750274, + "loss": 1.1761369705200195, + "unpadded_tokens_per_sec": 9832.53125 + }, + "step_6200": { + "perplexity": 3.2366118232048393, + "loss": 1.1745270490646362, + "unpadded_tokens_per_sec": 10002.111328125 + }, + "step_6300": { + "perplexity": 3.2278467567205995, + "loss": 1.171815276145935, + "unpadded_tokens_per_sec": 10034.5361328125 + }, + "step_6400": { + "perplexity": 3.2240149920877306, + "loss": 1.170627474784851, + "unpadded_tokens_per_sec": 9839.5888671875 + }, + "step_6500": { + "perplexity": 3.217835090412097, + "loss": 1.1687088012695312, + "unpadded_tokens_per_sec": 9982.0419921875 + }, + "step_6600": { + "perplexity": 3.212604413966056, + "loss": 1.1670819520950317, + "unpadded_tokens_per_sec": 10042.78515625 + }, + "step_6700": { + "perplexity": 3.203845544456811, + "loss": 1.1643518209457397, + "unpadded_tokens_per_sec": 10028.921875 + }, + "step_6800": { + "perplexity": 3.1999552965073166, + "loss": 1.1631368398666382, + "unpadded_tokens_per_sec": 9814.4462890625 + }, + "step_6900": { + "perplexity": 3.197379160558673, + "loss": 1.162331461906433, + "unpadded_tokens_per_sec": 9997.9033203125 + }, + "step_7000": { + "perplexity": 3.192603779995315, + "loss": 1.1608368158340454, + "unpadded_tokens_per_sec": 10036.5693359375 + }, + "step_7100": { + "perplexity": 
3.1855661026506374, + "loss": 1.1586300134658813, + "unpadded_tokens_per_sec": 9814.6494140625 + }, + "step_7200": { + "perplexity": 3.1780108545181998, + "loss": 1.1562554836273193, + "unpadded_tokens_per_sec": 10000.3095703125 + }, + "step_7300": { + "perplexity": 3.175279773915199, + "loss": 1.155395746231079, + "unpadded_tokens_per_sec": 10023.8740234375 + }, + "step_7400": { + "perplexity": 3.1692624272210628, + "loss": 1.153498888015747, + "unpadded_tokens_per_sec": 10031.7998046875 + }, + "step_7500": { + "perplexity": 3.1672916510139664, + "loss": 1.152876853942871, + "unpadded_tokens_per_sec": 9798.947265625 + }, + "step_7600": { + "perplexity": 3.159777484583934, + "loss": 1.1505016088485718, + "unpadded_tokens_per_sec": 9994.970703125 + }, + "step_7700": { + "perplexity": 3.157738072141091, + "loss": 1.1498559713363647, + "unpadded_tokens_per_sec": 10029.4609375 + }, + "step_7800": { + "perplexity": 3.1544418754720858, + "loss": 1.1488115787506104, + "unpadded_tokens_per_sec": 9813.19140625 + }, + "step_7900": { + "perplexity": 3.148182911109356, + "loss": 1.1468254327774048, + "unpadded_tokens_per_sec": 9932.1767578125 + }, + "step_8000": { + "perplexity": 3.147229436906366, + "loss": 1.1465225219726562, + "unpadded_tokens_per_sec": 10022.8134765625 + }, + "step_8100": { + "perplexity": 3.141646478846879, + "loss": 1.144747018814087, + "unpadded_tokens_per_sec": 10028.2421875 + }, + "step_8200": { + "perplexity": 3.136890391196897, + "loss": 1.143231987953186, + "unpadded_tokens_per_sec": 9790.34765625 + }, + "step_8300": { + "perplexity": 3.1331210335387802, + "loss": 1.1420296430587769, + "unpadded_tokens_per_sec": 9956.2607421875 + }, + "step_8400": { + "perplexity": 3.129853145053356, + "loss": 1.1409860849380493, + "unpadded_tokens_per_sec": 10033.6376953125 + }, + "step_8500": { + "perplexity": 3.125639122719989, + "loss": 1.1396387815475464, + "unpadded_tokens_per_sec": 10035.2958984375 + }, + "step_8600": { + "perplexity": 3.1220783014525395, + 
"loss": 1.1384989023208618, + "unpadded_tokens_per_sec": 9976.7802734375 + }, + "step_8700": { + "perplexity": 3.1192413408796735, + "loss": 1.1375898122787476, + "unpadded_tokens_per_sec": 9806.5810546875 + }, + "step_8800": { + "perplexity": 3.1165968028506716, + "loss": 1.1367416381835938, + "unpadded_tokens_per_sec": 9982.67578125 + }, + "step_8900": { + "perplexity": 3.1109176160180603, + "loss": 1.1349177360534668, + "unpadded_tokens_per_sec": 10021.2255859375 + }, + "step_9000": { + "perplexity": 3.108286804616421, + "loss": 1.134071707725525, + "unpadded_tokens_per_sec": 10027.94140625 + }, + "step_9100": { + "perplexity": 3.1050477797774647, + "loss": 1.1330291032791138, + "unpadded_tokens_per_sec": 9718.09375 + }, + "step_9200": { + "perplexity": 3.1017884653421413, + "loss": 1.1319788694381714, + "unpadded_tokens_per_sec": 9985.7109375 + }, + "step_9300": { + "perplexity": 3.101891260881266, + "loss": 1.1320120096206665, + "unpadded_tokens_per_sec": 10029.1435546875 + }, + "step_9400": { + "perplexity": 3.0975631174434257, + "loss": 1.1306157112121582, + "unpadded_tokens_per_sec": 10037.0029296875 + }, + "step_9500": { + "perplexity": 3.093350531796798, + "loss": 1.1292548179626465, + "unpadded_tokens_per_sec": 9786.5693359375 + }, + "step_9600": { + "perplexity": 3.0927871237610693, + "loss": 1.129072666168213, + "unpadded_tokens_per_sec": 9951.13671875 + }, + "step_9700": { + "perplexity": 3.0877667098188017, + "loss": 1.1274480819702148, + "unpadded_tokens_per_sec": 10033.7099609375 + }, + "step_9800": { + "perplexity": 3.0815107325963846, + "loss": 1.1254199743270874, + "unpadded_tokens_per_sec": 10033.2314453125 + }, + "step_9900": { + "perplexity": 3.079022976963178, + "loss": 1.1246123313903809, + "unpadded_tokens_per_sec": 10029.6162109375 + }, + "step_10000": { + "perplexity": 3.0804584697183826, + "loss": 1.1250784397125244, + "unpadded_tokens_per_sec": 9726.45703125 + }, + "step_10100": { + "perplexity": 3.074486670490578, + "loss": 
1.1231379508972168, + "unpadded_tokens_per_sec": 9926.2236328125 + }, + "step_10200": { + "perplexity": 3.072984357430251, + "loss": 1.1226491928100586, + "unpadded_tokens_per_sec": 10030.7060546875 + }, + "step_10300": { + "perplexity": 3.072530876480505, + "loss": 1.1225016117095947, + "unpadded_tokens_per_sec": 10036.0634765625 + }, + "step_10400": { + "perplexity": 3.067248254507577, + "loss": 1.1207808256149292, + "unpadded_tokens_per_sec": 9984.9384765625 + }, + "step_10500": { + "perplexity": 3.0638123908843418, + "loss": 1.1196600198745728, + "unpadded_tokens_per_sec": 9712.3857421875 + }, + "step_10600": { + "perplexity": 3.061471034565495, + "loss": 1.1188955307006836, + "unpadded_tokens_per_sec": 10034.5341796875 + }, + "step_10700": { + "perplexity": 3.0630279667572218, + "loss": 1.1194039583206177, + "unpadded_tokens_per_sec": 10020.7177734375 + }, + "step_10800": { + "perplexity": 3.0569456461609894, + "loss": 1.117416262626648, + "unpadded_tokens_per_sec": 10025.2158203125 + }, + "step_10900": { + "perplexity": 3.055146687914631, + "loss": 1.1168276071548462, + "unpadded_tokens_per_sec": 9978.6826171875 + }, + "step_11000": { + "perplexity": 3.052591060300954, + "loss": 1.1159907579421997, + "unpadded_tokens_per_sec": 9693.6982421875 + }, + "step_11100": { + "perplexity": 3.0505971911383907, + "loss": 1.1153373718261719, + "unpadded_tokens_per_sec": 10022.5625 + }, + "step_11200": { + "perplexity": 3.049494411176074, + "loss": 1.1149758100509644, + "unpadded_tokens_per_sec": 10033.23828125 + }, + "step_11300": { + "perplexity": 3.0453421149479296, + "loss": 1.113613247871399, + "unpadded_tokens_per_sec": 10021.07421875 + }, + "step_11400": { + "perplexity": 3.043814129181417, + "loss": 1.1131113767623901, + "unpadded_tokens_per_sec": 10030.9814453125 + }, + "step_11500": { + "perplexity": 3.0432281813518633, + "loss": 1.1129188537597656, + "unpadded_tokens_per_sec": 9686.65625 + }, + "step_11600": { + "perplexity": 3.039986283805579, + "loss": 
1.111853003501892, + "unpadded_tokens_per_sec": 9976.748046875 + }, + "step_11700": { + "perplexity": 3.037092853543908, + "loss": 1.1109007596969604, + "unpadded_tokens_per_sec": 10030.333984375 + }, + "step_11800": { + "perplexity": 3.0384707650622693, + "loss": 1.1113543510437012, + "unpadded_tokens_per_sec": 10025.3076171875 + }, + "step_11900": { + "perplexity": 3.038298718311257, + "loss": 1.1112977266311646, + "unpadded_tokens_per_sec": 10034.9462890625 + }, + "step_12000": { + "perplexity": 3.027039479837798, + "loss": 1.107585072517395, + "unpadded_tokens_per_sec": 9969.2802734375 + }, + "step_12100": { + "perplexity": 3.030556934559515, + "loss": 1.1087464094161987, + "unpadded_tokens_per_sec": 9631.3525390625 + }, + "step_12200": { + "perplexity": 3.0267796781048384, + "loss": 1.1074992418289185, + "unpadded_tokens_per_sec": 10031.146484375 + }, + "step_12300": { + "perplexity": 3.024949790317874, + "loss": 1.1068944931030273, + "unpadded_tokens_per_sec": 10029.6103515625 + }, + "step_12400": { + "perplexity": 3.0240531057849687, + "loss": 1.1065980195999146, + "unpadded_tokens_per_sec": 10026.6728515625 + }, + "step_12500": { + "perplexity": 3.0206951560509268, + "loss": 1.1054869890213013, + "unpadded_tokens_per_sec": 10027.90234375 + }, + "step_12600": { + "perplexity": 3.0208845719192015, + "loss": 1.105549693107605, + "unpadded_tokens_per_sec": 9971.5908203125 + }, + "step_12700": { + "perplexity": 3.0176632390245652, + "loss": 1.1044827699661255, + "unpadded_tokens_per_sec": 9644.9326171875 + }, + "step_12800": { + "perplexity": 3.0173096417212353, + "loss": 1.104365587234497, + "unpadded_tokens_per_sec": 10034.38671875 + }, + "step_12900": { + "perplexity": 3.016337912231085, + "loss": 1.1040434837341309, + "unpadded_tokens_per_sec": 10030.7080078125 + }, + "step_13000": { + "perplexity": 3.013797502988378, + "loss": 1.103200912475586, + "unpadded_tokens_per_sec": 10024.3955078125 + }, + "step_13100": { + "perplexity": 3.0092349638293663, + 
"loss": 1.101685881614685, + "unpadded_tokens_per_sec": 10022.603515625 + }, + "step_13200": { + "perplexity": 3.007550127941081, + "loss": 1.1011258363723755, + "unpadded_tokens_per_sec": 9967.0732421875 + }, + "step_13300": { + "perplexity": 3.007492405500847, + "loss": 1.1011066436767578, + "unpadded_tokens_per_sec": 9631.4453125 + }, + "step_13400": { + "perplexity": 3.006650715916777, + "loss": 1.1008267402648926, + "unpadded_tokens_per_sec": 10017.251953125 + }, + "step_13500": { + "perplexity": 3.005771638486394, + "loss": 1.1005343198776245, + "unpadded_tokens_per_sec": 10037.16796875 + }, + "step_13600": { + "perplexity": 3.0022321297636587, + "loss": 1.0993560552597046, + "unpadded_tokens_per_sec": 10023.51953125 + }, + "step_13700": { + "perplexity": 3.002378869872945, + "loss": 1.0994049310684204, + "unpadded_tokens_per_sec": 10034.0859375 + }, + "step_13800": { + "perplexity": 3.001955848340339, + "loss": 1.0992640256881714, + "unpadded_tokens_per_sec": 10031.0107421875 + }, + "step_13900": { + "perplexity": 2.999459732446742, + "loss": 1.098432183265686, + "unpadded_tokens_per_sec": 9618.2529296875 + }, + "step_14000": { + "perplexity": 2.996616306136499, + "loss": 1.09748375415802, + "unpadded_tokens_per_sec": 9967.4833984375 + }, + "step_14100": { + "perplexity": 2.9949163998519723, + "loss": 1.0969163179397583, + "unpadded_tokens_per_sec": 10031.7958984375 + }, + "step_14200": { + "perplexity": 2.9923979560936034, + "loss": 1.0960750579833984, + "unpadded_tokens_per_sec": 10026.75 + }, + "step_14300": { + "perplexity": 2.9898784223169326, + "loss": 1.0952327251434326, + "unpadded_tokens_per_sec": 10024.380859375 + }, + "step_14400": { + "perplexity": 2.9907314601124524, + "loss": 1.0955179929733276, + "unpadded_tokens_per_sec": 10030.4384765625 + }, + "step_14500": { + "perplexity": 2.9887687324240404, + "loss": 1.0948615074157715, + "unpadded_tokens_per_sec": 9973.015625 + }, + "step_14600": { + "perplexity": 2.9827062189592795, + "loss": 
1.092831015586853, + "unpadded_tokens_per_sec": 9613.0791015625 + }, + "step_14700": { + "perplexity": 2.9838990267366525, + "loss": 1.0932308435440063, + "unpadded_tokens_per_sec": 10036.05859375 + }, + "step_14800": { + "perplexity": 2.985392664655928, + "loss": 1.0937312841415405, + "unpadded_tokens_per_sec": 10027.7314453125 + }, + "step_14900": { + "perplexity": 2.980366088984198, + "loss": 1.0920461416244507, + "unpadded_tokens_per_sec": 10024.0224609375 + }, + "step_15000": { + "perplexity": 2.980671651745775, + "loss": 1.0921486616134644, + "unpadded_tokens_per_sec": 10029.3017578125 + }, + "step_15100": { + "perplexity": 2.9795724827809615, + "loss": 1.0917798280715942, + "unpadded_tokens_per_sec": 9904.9609375 + }, + "step_15200": { + "perplexity": 2.9804641498988205, + "loss": 1.0920790433883667, + "unpadded_tokens_per_sec": 10036.599609375 + }, + "step_15300": { + "perplexity": 2.9724756727452437, + "loss": 1.0893951654434204, + "unpadded_tokens_per_sec": 9567.8310546875 + }, + "step_15400": { + "perplexity": 2.9750450875867207, + "loss": 1.0902591943740845, + "unpadded_tokens_per_sec": 10028.3388671875 + }, + "step_15500": { + "perplexity": 2.9724200408318793, + "loss": 1.089376449584961, + "unpadded_tokens_per_sec": 10037.76953125 + }, + "step_15600": { + "perplexity": 2.9663669970383544, + "loss": 1.0873379707336426, + "unpadded_tokens_per_sec": 10034.1015625 + }, + "step_15700": { + "perplexity": 2.9690111194665434, + "loss": 1.0882289409637451, + "unpadded_tokens_per_sec": 10029.6904296875 + }, + "step_15800": { + "perplexity": 2.9712130492988553, + "loss": 1.0889703035354614, + "unpadded_tokens_per_sec": 9964.2705078125 + }, + "step_15900": { + "perplexity": 2.9667146243936435, + "loss": 1.087455153465271, + "unpadded_tokens_per_sec": 10028.759765625 + }, + "step_16000": { + "perplexity": 2.968338367722342, + "loss": 1.088002324104309, + "unpadded_tokens_per_sec": 9622.1201171875 + }, + "step_16100": { + "perplexity": 2.9640308527482717, + "loss": 
1.0865501165390015, + "unpadded_tokens_per_sec": 9954.1328125 + }, + "step_16200": { + "perplexity": 2.964887472697665, + "loss": 1.0868390798568726, + "unpadded_tokens_per_sec": 10032.3623046875 + }, + "step_16300": { + "perplexity": 2.961918632435874, + "loss": 1.0858372449874878, + "unpadded_tokens_per_sec": 10028.7646484375 + }, + "step_16400": { + "perplexity": 2.9553781964238275, + "loss": 1.083626627922058, + "unpadded_tokens_per_sec": 10025.3583984375 + }, + "step_16500": { + "perplexity": 2.958707624540979, + "loss": 1.0847525596618652, + "unpadded_tokens_per_sec": 9968.609375 + }, + "step_16600": { + "perplexity": 2.9587820463235017, + "loss": 1.0847777128219604, + "unpadded_tokens_per_sec": 10025.759765625 + }, + "step_16700": { + "perplexity": 2.9569044386516032, + "loss": 1.0841429233551025, + "unpadded_tokens_per_sec": 10031.0830078125 + }, + "step_16800": { + "perplexity": 2.9566785008880077, + "loss": 1.0840665102005005, + "unpadded_tokens_per_sec": 9527.5224609375 + }, + "step_16900": { + "perplexity": 2.955292234391449, + "loss": 1.0835975408554077, + "unpadded_tokens_per_sec": 10028.4091796875 + }, + "step_17000": { + "perplexity": 2.951335405194577, + "loss": 1.0822577476501465, + "unpadded_tokens_per_sec": 10028.0537109375 + }, + "step_17100": { + "perplexity": 2.9506659551177283, + "loss": 1.0820308923721313, + "unpadded_tokens_per_sec": 10031.6669921875 + }, + "step_17200": { + "perplexity": 2.949345793075761, + "loss": 1.0815833806991577, + "unpadded_tokens_per_sec": 10030.9384765625 + }, + "step_17300": { + "perplexity": 2.9521162116800785, + "loss": 1.0825222730636597, + "unpadded_tokens_per_sec": 9964.78125 + }, + "step_17400": { + "perplexity": 2.9494424817501774, + "loss": 1.0816161632537842, + "unpadded_tokens_per_sec": 10032.1787109375 + }, + "step_17500": { + "perplexity": 2.9445189086334045, + "loss": 1.07994544506073, + "unpadded_tokens_per_sec": 10033.6904296875 + }, + "step_17600": { + "perplexity": 2.9418257574869697, + "loss": 
1.0790303945541382, + "unpadded_tokens_per_sec": 9433.25 + }, + "step_17700": { + "perplexity": 2.9427913736426294, + "loss": 1.0793585777282715, + "unpadded_tokens_per_sec": 10029.57421875 + }, + "step_17800": { + "perplexity": 2.94132991943449, + "loss": 1.0788618326187134, + "unpadded_tokens_per_sec": 10038.1298828125 + }, + "step_17900": { + "perplexity": 2.9417938446008396, + "loss": 1.079019546508789, + "unpadded_tokens_per_sec": 10020.9287109375 + }, + "step_18000": { + "perplexity": 2.943048878030841, + "loss": 1.0794460773468018, + "unpadded_tokens_per_sec": 10033.3095703125 + }, + "step_18100": { + "perplexity": 2.940188827653734, + "loss": 1.0784738063812256, + "unpadded_tokens_per_sec": 9955.6845703125 + }, + "step_18200": { + "perplexity": 2.9424981127102723, + "loss": 1.079258918762207, + "unpadded_tokens_per_sec": 10036.5693359375 + }, + "step_18300": { + "perplexity": 2.938013041638866, + "loss": 1.0777335166931152, + "unpadded_tokens_per_sec": 10035.279296875 + }, + "step_18400": { + "perplexity": 2.9356334350987727, + "loss": 1.0769232511520386, + "unpadded_tokens_per_sec": 9475.845703125 + }, + "step_18500": { + "perplexity": 2.934179733053389, + "loss": 1.076427936553955, + "unpadded_tokens_per_sec": 9950.7412109375 + }, + "step_18600": { + "perplexity": 2.9348370460713453, + "loss": 1.076651930809021, + "unpadded_tokens_per_sec": 10035.8603515625 + }, + "step_18700": { + "perplexity": 2.934678913684353, + "loss": 1.076598048210144, + "unpadded_tokens_per_sec": 10038.3193359375 + }, + "step_18800": { + "perplexity": 2.931361848279868, + "loss": 1.0754671096801758, + "unpadded_tokens_per_sec": 10031.9287109375 + }, + "step_18900": { + "perplexity": 2.9329630955845585, + "loss": 1.076013207435608, + "unpadded_tokens_per_sec": 10042.7041015625 + }, + "step_19000": { + "perplexity": 2.9319157718271494, + "loss": 1.0756560564041138, + "unpadded_tokens_per_sec": 9953.21875 + }, + "step_19100": { + "perplexity": 2.9266338229531437, + "loss": 
1.0738528966903687, + "unpadded_tokens_per_sec": 10037.2958984375 + }, + "step_19200": { + "perplexity": 2.925021037411776, + "loss": 1.0733016729354858, + "unpadded_tokens_per_sec": 10029.6103515625 + }, + "step_19300": { + "perplexity": 2.924228921733827, + "loss": 1.0730308294296265, + "unpadded_tokens_per_sec": 9453.72265625 + }, + "step_19400": { + "perplexity": 2.9270755408186386, + "loss": 1.07400381565094, + "unpadded_tokens_per_sec": 9944.798828125 + }, + "step_19500": { + "perplexity": 2.9220750480993325, + "loss": 1.072293996810913, + "unpadded_tokens_per_sec": 10039.796875 + }, + "step_19600": { + "perplexity": 2.9247183904279836, + "loss": 1.0731981992721558, + "unpadded_tokens_per_sec": 10032.5654296875 + }, + "step_19700": { + "perplexity": 2.9237939072190784, + "loss": 1.072882056236267, + "unpadded_tokens_per_sec": 10034.87109375 + }, + "step_19800": { + "perplexity": 2.919481078379815, + "loss": 1.0714058876037598, + "unpadded_tokens_per_sec": 10036.46484375 + }, + "step_19900": { + "perplexity": 2.9188780059596247, + "loss": 1.0711992979049683, + "unpadded_tokens_per_sec": 9951.6826171875 + }, + "step_20000": { + "perplexity": 2.9170093773414028, + "loss": 1.0705589056015015, + "unpadded_tokens_per_sec": 10035.923828125 + }, + "step_20100": { + "perplexity": 2.919138289679298, + "loss": 1.0712884664535522, + "unpadded_tokens_per_sec": 9968.4697265625 + }, + "step_20200": { + "perplexity": 2.920314727470344, + "loss": 1.0716913938522339, + "unpadded_tokens_per_sec": 9306.5166015625 + }, + "step_20300": { + "perplexity": 2.920930283778474, + "loss": 1.0719021558761597, + "unpadded_tokens_per_sec": 10035.2265625 + }, + "step_20400": { + "perplexity": 2.917036500767512, + "loss": 1.0705682039260864, + "unpadded_tokens_per_sec": 10036.990234375 + }, + "step_20500": { + "perplexity": 2.9129937227077773, + "loss": 1.0691813230514526, + "unpadded_tokens_per_sec": 10035.40234375 + }, + "step_20600": { + "perplexity": 2.913447968854285, + "loss": 
1.069337248802185, + "unpadded_tokens_per_sec": 10042.8642578125 + }, + "step_20700": { + "perplexity": 2.910051868098019, + "loss": 1.0681709051132202, + "unpadded_tokens_per_sec": 9943.8359375 + }, + "step_20800": { + "perplexity": 2.9085293526236997, + "loss": 1.0676475763320923, + "unpadded_tokens_per_sec": 10035.12890625 + }, + "step_20900": { + "perplexity": 2.910762416713241, + "loss": 1.0684150457382202, + "unpadded_tokens_per_sec": 10036.1337890625 + }, + "step_21000": { + "perplexity": 2.912435388715863, + "loss": 1.068989634513855, + "unpadded_tokens_per_sec": 10041.30078125 + }, + "step_21100": { + "perplexity": 2.908823389198115, + "loss": 1.0677486658096313, + "unpadded_tokens_per_sec": 9316.7275390625 + }, + "step_21200": { + "perplexity": 2.9079632078997277, + "loss": 1.0674529075622559, + "unpadded_tokens_per_sec": 9948.2919921875 + }, + "step_21300": { + "perplexity": 2.9038281153242593, + "loss": 1.0660299062728882, + "unpadded_tokens_per_sec": 10028.5283203125 + }, + "step_21400": { + "perplexity": 2.9047615215298044, + "loss": 1.066351294517517, + "unpadded_tokens_per_sec": 10036.8857421875 + }, + "step_21500": { + "perplexity": 2.9048903385213474, + "loss": 1.06639564037323, + "unpadded_tokens_per_sec": 10035.2041015625 + }, + "step_21600": { + "perplexity": 2.903241773921791, + "loss": 1.0658279657363892, + "unpadded_tokens_per_sec": 9951.58203125 + }, + "step_21700": { + "perplexity": 2.904198879872049, + "loss": 1.066157579421997, + "unpadded_tokens_per_sec": 10035.2060546875 + }, + "step_21800": { + "perplexity": 2.900163176184779, + "loss": 1.0647670030593872, + "unpadded_tokens_per_sec": 10040.7421875 + }, + "step_21900": { + "perplexity": 2.902853829154235, + "loss": 1.0656943321228027, + "unpadded_tokens_per_sec": 10045.2734375 + }, + "step_22000": { + "perplexity": 2.8999405369346545, + "loss": 1.0646902322769165, + "unpadded_tokens_per_sec": 10036.5947265625 + }, + "step_22100": { + "perplexity": 2.899666409909919, + "loss": 
1.0645956993103027, + "unpadded_tokens_per_sec": 9241.5498046875 + }, + "step_22200": { + "perplexity": 2.897536845502091, + "loss": 1.0638610124588013, + "unpadded_tokens_per_sec": 9938.2197265625 + }, + "step_22300": { + "perplexity": 2.8955396635381234, + "loss": 1.0631715059280396, + "unpadded_tokens_per_sec": 10039.4326171875 + }, + "step_22400": { + "perplexity": 2.897876752133414, + "loss": 1.0639783143997192, + "unpadded_tokens_per_sec": 10034.7099609375 + }, + "step_22500": { + "perplexity": 2.899126873654945, + "loss": 1.064409613609314, + "unpadded_tokens_per_sec": 10026.037109375 + }, + "step_22600": { + "perplexity": 2.8969241470804623, + "loss": 1.0636495351791382, + "unpadded_tokens_per_sec": 9947.3154296875 + }, + "step_22700": { + "perplexity": 2.895750228081867, + "loss": 1.0632442235946655, + "unpadded_tokens_per_sec": 10032.064453125 + }, + "step_22800": { + "perplexity": 2.897381413682175, + "loss": 1.0638073682785034, + "unpadded_tokens_per_sec": 10031.55859375 + }, + "step_22900": { + "perplexity": 2.8963747628120586, + "loss": 1.063459873199463, + "unpadded_tokens_per_sec": 10028.8623046875 + }, + "step_23000": { + "perplexity": 2.891483246890646, + "loss": 1.0617696046829224, + "unpadded_tokens_per_sec": 10030.7529296875 + }, + "step_23100": { + "perplexity": 2.89273198966545, + "loss": 1.0622013807296753, + "unpadded_tokens_per_sec": 10040.2177734375 + }, + "step_23200": { + "perplexity": 2.8911723517251477, + "loss": 1.0616620779037476, + "unpadded_tokens_per_sec": 9231.9287109375 + }, + "step_23300": { + "perplexity": 2.8884467300972565, + "loss": 1.0607188940048218, + "unpadded_tokens_per_sec": 9927.9501953125 + }, + "step_23400": { + "perplexity": 2.8893066492925397, + "loss": 1.06101655960083, + "unpadded_tokens_per_sec": 10039.9560546875 + }, + "step_23500": { + "perplexity": 2.888191937370552, + "loss": 1.0606306791305542, + "unpadded_tokens_per_sec": 10041.0419921875 + }, + "step_23600": { + "perplexity": 2.8819606541818836, + 
"loss": 1.0584708452224731, + "unpadded_tokens_per_sec": 10040.8955078125 + }, + "step_23700": { + "perplexity": 2.886918310830134, + "loss": 1.0601896047592163, + "unpadded_tokens_per_sec": 9946.65234375 + }, + "step_23800": { + "perplexity": 2.888180575515705, + "loss": 1.060626745223999, + "unpadded_tokens_per_sec": 10033.5537109375 + }, + "step_23900": { + "perplexity": 2.8800617562506683, + "loss": 1.0578117370605469, + "unpadded_tokens_per_sec": 10042.1044921875 + }, + "step_24000": { + "perplexity": 2.884289533368178, + "loss": 1.0592786073684692, + "unpadded_tokens_per_sec": 10035.0400390625 + }, + "step_24100": { + "perplexity": 2.8873361361049796, + "loss": 1.060334324836731, + "unpadded_tokens_per_sec": 10035.6533203125 + }, + "step_24200": { + "perplexity": 2.882067502228451, + "loss": 1.0585079193115234, + "unpadded_tokens_per_sec": 10035.6298828125 + }, + "step_24300": { + "perplexity": 2.88436896014029, + "loss": 1.0593061447143555, + "unpadded_tokens_per_sec": 9154.7822265625 + }, + "step_24400": { + "perplexity": 2.881511317335802, + "loss": 1.0583149194717407, + "unpadded_tokens_per_sec": 9922.5048828125 + }, + "step_24500": { + "perplexity": 2.8815456678322438, + "loss": 1.0583268404006958, + "unpadded_tokens_per_sec": 10034.53125 + }, + "step_24600": { + "perplexity": 2.875821320771911, + "loss": 1.0563383102416992, + "unpadded_tokens_per_sec": 10028.341796875 + }, + "step_24700": { + "perplexity": 2.8790120441001874, + "loss": 1.0574471950531006, + "unpadded_tokens_per_sec": 10030.4501953125 + }, + "step_24800": { + "perplexity": 2.878504831823693, + "loss": 1.0572710037231445, + "unpadded_tokens_per_sec": 9877.1474609375 + }, + "step_24900": { + "perplexity": 2.8777033578339593, + "loss": 1.056992530822754, + "unpadded_tokens_per_sec": 10034.703125 + }, + "step_25000": { + "perplexity": 2.8809342902182915, + "loss": 1.0581146478652954, + "unpadded_tokens_per_sec": 10042.8076171875 + }, + "step_25100": { + "perplexity": 2.8741758906922894, + 
"loss": 1.055765986442566, + "unpadded_tokens_per_sec": 9970.541015625 + }, + "step_25200": { + "perplexity": 2.870465907746808, + "loss": 1.0544743537902832, + "unpadded_tokens_per_sec": 10028.5771484375 + }, + "step_25300": { + "perplexity": 2.8735400426035076, + "loss": 1.0555447340011597, + "unpadded_tokens_per_sec": 10032.3720703125 + }, + "step_25400": { + "perplexity": 2.8734263173685664, + "loss": 1.0555051565170288, + "unpadded_tokens_per_sec": 9057.1826171875 + }, + "step_25500": { + "perplexity": 2.87300160026384, + "loss": 1.0553573369979858, + "unpadded_tokens_per_sec": 9913.8623046875 + }, + "step_25600": { + "perplexity": 2.8736866588852377, + "loss": 1.0555957555770874, + "unpadded_tokens_per_sec": 10045.3759765625 + }, + "step_25700": { + "perplexity": 2.869610227620367, + "loss": 1.0541762113571167, + "unpadded_tokens_per_sec": 10039.7431640625 + }, + "step_25800": { + "perplexity": 2.873092703643847, + "loss": 1.0553890466690063, + "unpadded_tokens_per_sec": 9919.8681640625 + }, + "step_25900": { + "perplexity": 2.867529058243861, + "loss": 1.0534507036209106, + "unpadded_tokens_per_sec": 10047.4638671875 + }, + "step_26000": { + "perplexity": 2.868133488165042, + "loss": 1.0536614656448364, + "unpadded_tokens_per_sec": 10042.24609375 + }, + "step_26100": { + "perplexity": 2.8707410386374974, + "loss": 1.054570198059082, + "unpadded_tokens_per_sec": 10040.2685546875 + }, + "step_26200": { + "perplexity": 2.871256124436431, + "loss": 1.054749608039856, + "unpadded_tokens_per_sec": 10036.419921875 + }, + "step_26300": { + "perplexity": 2.8659741257182594, + "loss": 1.0529083013534546, + "unpadded_tokens_per_sec": 10043.3984375 + }, + "step_26400": { + "perplexity": 2.863935196422161, + "loss": 1.0521966218948364, + "unpadded_tokens_per_sec": 10032.2607421875 + }, + "step_26500": { + "perplexity": 2.8667942048004735, + "loss": 1.0531944036483765, + "unpadded_tokens_per_sec": 10034.1630859375 + }, + "step_26600": { + "perplexity": 2.8656974019797183, 
+ "loss": 1.0528117418289185, + "unpadded_tokens_per_sec": 9067.3583984375 + }, + "step_26700": { + "perplexity": 2.8619093173004293, + "loss": 1.051488995552063, + "unpadded_tokens_per_sec": 9916.5771484375 + }, + "step_26800": { + "perplexity": 2.8641332197223344, + "loss": 1.0522657632827759, + "unpadded_tokens_per_sec": 10036.1650390625 + }, + "step_26900": { + "perplexity": 2.861992221882091, + "loss": 1.0515179634094238, + "unpadded_tokens_per_sec": 10040.7470703125 + }, + "step_27000": { + "perplexity": 2.862021904353186, + "loss": 1.0515283346176147, + "unpadded_tokens_per_sec": 9930.4296875 + }, + "step_27100": { + "perplexity": 2.8611458892396433, + "loss": 1.0512222051620483, + "unpadded_tokens_per_sec": 10041.2509765625 + }, + "step_27200": { + "perplexity": 2.8614825502365124, + "loss": 1.051339864730835, + "unpadded_tokens_per_sec": 10043.9736328125 + }, + "step_27300": { + "perplexity": 2.859449202474587, + "loss": 1.0506290197372437, + "unpadded_tokens_per_sec": 10035.140625 + }, + "step_27400": { + "perplexity": 2.858593057924098, + "loss": 1.050329566001892, + "unpadded_tokens_per_sec": 10034.0693359375 + }, + "step_27500": { + "perplexity": 2.8586727994146126, + "loss": 1.050357460975647, + "unpadded_tokens_per_sec": 10034.28515625 + }, + "step_27600": { + "perplexity": 2.857079413932222, + "loss": 1.049799919128418, + "unpadded_tokens_per_sec": 10047.0263671875 + }, + "step_27700": { + "perplexity": 2.8577743028532914, + "loss": 1.0500431060791016, + "unpadded_tokens_per_sec": 10037.5068359375 + }, + "step_27800": { + "perplexity": 2.858874548502695, + "loss": 1.050428032875061, + "unpadded_tokens_per_sec": 9528.73046875 + }, + "step_27900": { + "perplexity": 2.8569176380691124, + "loss": 1.0497432947158813, + "unpadded_tokens_per_sec": 9442.8955078125 + }, + "step_28000": { + "perplexity": 2.8527194403284426, + "loss": 1.048272728919983, + "unpadded_tokens_per_sec": 10040.4208984375 + }, + "step_28100": { + "perplexity": 2.8523623884842966, + 
"loss": 1.0481475591659546, + "unpadded_tokens_per_sec": 10039.0576171875 + }, + "step_28200": { + "perplexity": 2.850961816705699, + "loss": 1.0476564168930054, + "unpadded_tokens_per_sec": 10028.5537109375 + }, + "step_28300": { + "perplexity": 2.8545414403693647, + "loss": 1.048911213874817, + "unpadded_tokens_per_sec": 9864.41015625 + }, + "step_28400": { + "perplexity": 2.850837770091001, + "loss": 1.0476129055023193, + "unpadded_tokens_per_sec": 10045.9443359375 + }, + "step_28500": { + "perplexity": 2.8526799924048905, + "loss": 1.048258900642395, + "unpadded_tokens_per_sec": 10040.51953125 + }, + "step_28600": { + "perplexity": 2.8515607149176416, + "loss": 1.0478664636611938, + "unpadded_tokens_per_sec": 10031.1845703125 + }, + "step_28700": { + "perplexity": 2.8497147991524594, + "loss": 1.047218918800354, + "unpadded_tokens_per_sec": 10037.7412109375 + }, + "step_28800": { + "perplexity": 2.852331106071202, + "loss": 1.048136591911316, + "unpadded_tokens_per_sec": 10035.3525390625 + }, + "step_28900": { + "perplexity": 2.8481335367325906, + "loss": 1.0466638803482056, + "unpadded_tokens_per_sec": 10039.3994140625 + }, + "step_29000": { + "perplexity": 2.8448820753659474, + "loss": 1.04552161693573, + "unpadded_tokens_per_sec": 10037.63671875 + }, + "step_29100": { + "perplexity": 2.8507266425020923, + "loss": 1.0475739240646362, + "unpadded_tokens_per_sec": 9619.794921875 + }, + "step_29200": { + "perplexity": 2.848489040520027, + "loss": 1.0467886924743652, + "unpadded_tokens_per_sec": 9234.61328125 + }, + "step_29300": { + "perplexity": 2.8433611163254366, + "loss": 1.0449868440628052, + "unpadded_tokens_per_sec": 10042.0244140625 + }, + "step_29400": { + "perplexity": 2.8426995530257515, + "loss": 1.044754147529602, + "unpadded_tokens_per_sec": 10045.9560546875 + }, + "step_29500": { + "perplexity": 2.844905815010971, + "loss": 1.0455299615859985, + "unpadded_tokens_per_sec": 10044.3642578125 + }, + "step_29600": { + "perplexity": 2.8438417952240918, 
+ "loss": 1.0451558828353882, + "unpadded_tokens_per_sec": 9922.0009765625 + }, + "step_29700": { + "perplexity": 2.840307416401427, + "loss": 1.0439122915267944, + "unpadded_tokens_per_sec": 10029.7080078125 + }, + "step_29800": { + "perplexity": 2.8427402184599075, + "loss": 1.0447684526443481, + "unpadded_tokens_per_sec": 10037.9423828125 + }, + "step_29900": { + "perplexity": 2.842267857536635, + "loss": 1.0446022748947144, + "unpadded_tokens_per_sec": 10037.0029296875 + }, + "step_30000": { + "perplexity": 2.8446596106051616, + "loss": 1.0454434156417847, + "unpadded_tokens_per_sec": 10038.900390625 + }, + "step_30100": { + "perplexity": 2.8431855430259927, + "loss": 1.0449250936508179, + "unpadded_tokens_per_sec": 9964.7490234375 + }, + "step_30200": { + "perplexity": 2.8388951658418144, + "loss": 1.0434149503707886, + "unpadded_tokens_per_sec": 10030.94140625 + }, + "step_30300": { + "perplexity": 2.8404049322918024, + "loss": 1.043946623802185, + "unpadded_tokens_per_sec": 10032.0654296875 + }, + "step_30400": { + "perplexity": 2.843932651987915, + "loss": 1.0451878309249878, + "unpadded_tokens_per_sec": 10036.5224609375 + }, + "step_30500": { + "perplexity": 2.8382745664975824, + "loss": 1.0431963205337524, + "unpadded_tokens_per_sec": 9002.3193359375 + }, + "step_30600": { + "perplexity": 2.843836710043238, + "loss": 1.045154094696045, + "unpadded_tokens_per_sec": 9895.0380859375 + }, + "step_30700": { + "perplexity": 2.837332760023135, + "loss": 1.042864441871643, + "unpadded_tokens_per_sec": 10043.6845703125 + }, + "step_30800": { + "perplexity": 2.838373704396457, + "loss": 1.0432312488555908, + "unpadded_tokens_per_sec": 9849.5478515625 + }, + "step_30900": { + "perplexity": 2.8363584682057166, + "loss": 1.0425209999084473, + "unpadded_tokens_per_sec": 10037.8076171875 + }, + "step_31000": { + "perplexity": 2.8379176310703613, + "loss": 1.0430705547332764, + "unpadded_tokens_per_sec": 10041.7568359375 + }, + "step_31100": { + "perplexity": 
2.8346464059175047, + "loss": 1.0419172048568726, + "unpadded_tokens_per_sec": 10039.251953125 + }, + "step_31200": { + "perplexity": 2.8356758855111344, + "loss": 1.0422803163528442, + "unpadded_tokens_per_sec": 10043.2001953125 + }, + "step_31300": { + "perplexity": 2.8354581968107433, + "loss": 1.0422035455703735, + "unpadded_tokens_per_sec": 10044.8212890625 + }, + "step_31400": { + "perplexity": 2.8327644998256085, + "loss": 1.0412530899047852, + "unpadded_tokens_per_sec": 10037.646484375 + }, + "step_31500": { + "perplexity": 2.831733714213945, + "loss": 1.0408891439437866, + "unpadded_tokens_per_sec": 10037.3115234375 + }, + "step_31600": { + "perplexity": 2.8292371426341116, + "loss": 1.0400071144104004, + "unpadded_tokens_per_sec": 10037.470703125 + }, + "step_31700": { + "perplexity": 2.8330782329201423, + "loss": 1.0413638353347778, + "unpadded_tokens_per_sec": 10039.7822265625 + }, + "step_31800": { + "perplexity": 2.8323481563814683, + "loss": 1.041106104850769, + "unpadded_tokens_per_sec": 10035.9189453125 + }, + "step_31900": { + "perplexity": 2.8294489569702264, + "loss": 1.0400819778442383, + "unpadded_tokens_per_sec": 8924.3759765625 + }, + "step_32000": { + "perplexity": 2.831070806373477, + "loss": 1.0406550168991089, + "unpadded_tokens_per_sec": 9891.61328125 + }, + "step_32100": { + "perplexity": 2.8273332122330452, + "loss": 1.0393339395523071, + "unpadded_tokens_per_sec": 10038.11328125 + }, + "step_32200": { + "perplexity": 2.8227376434632037, + "loss": 1.0377072095870972, + "unpadded_tokens_per_sec": 9838.5107421875 + }, + "step_32300": { + "perplexity": 2.828593364882652, + "loss": 1.039779543876648, + "unpadded_tokens_per_sec": 10031.0205078125 + }, + "step_32400": { + "perplexity": 2.827226708213865, + "loss": 1.039296269416809, + "unpadded_tokens_per_sec": 10029.919921875 + }, + "step_32500": { + "perplexity": 2.8239496276536102, + "loss": 1.0381364822387695, + "unpadded_tokens_per_sec": 10035.94140625 + }, + "step_32600": { + 
"perplexity": 2.8252187127698107, + "loss": 1.0385857820510864, + "unpadded_tokens_per_sec": 10038.3916015625 + }, + "step_32700": { + "perplexity": 2.8231344007755728, + "loss": 1.0378477573394775, + "unpadded_tokens_per_sec": 10032.6884765625 + }, + "step_32800": { + "perplexity": 2.825102858588743, + "loss": 1.038544774055481, + "unpadded_tokens_per_sec": 10036.7216796875 + }, + "step_32900": { + "perplexity": 2.8277107271439035, + "loss": 1.039467453956604, + "unpadded_tokens_per_sec": 10035.421875 + }, + "step_33000": { + "perplexity": 2.8245973992762785, + "loss": 1.0383658409118652, + "unpadded_tokens_per_sec": 10029.3876953125 + }, + "step_33100": { + "perplexity": 2.8210731494855463, + "loss": 1.0371173620224, + "unpadded_tokens_per_sec": 10037.3974609375 + }, + "step_33200": { + "perplexity": 2.8231175736334104, + "loss": 1.037841796875, + "unpadded_tokens_per_sec": 10033.20703125 + }, + "step_33300": { + "perplexity": 2.8243889783698433, + "loss": 1.0382920503616333, + "unpadded_tokens_per_sec": 10033.822265625 + }, + "step_33400": { + "perplexity": 2.8212752718996468, + "loss": 1.03718900680542, + "unpadded_tokens_per_sec": 8988.0283203125 + }, + "step_33500": { + "perplexity": 2.8222490927572625, + "loss": 1.0375341176986694, + "unpadded_tokens_per_sec": 9888.265625 + }, + "step_33600": { + "perplexity": 2.821062387966043, + "loss": 1.0371135473251343, + "unpadded_tokens_per_sec": 10037.267578125 + }, + "step_33700": { + "perplexity": 2.8188013619680583, + "loss": 1.0363117456436157, + "unpadded_tokens_per_sec": 9893.2900390625 + }, + "step_33800": { + "perplexity": 2.816076490305676, + "loss": 1.0353446006774902, + "unpadded_tokens_per_sec": 10044.4404296875 + }, + "step_33900": { + "perplexity": 2.819225796408646, + "loss": 1.0364623069763184, + "unpadded_tokens_per_sec": 10041.1572265625 + }, + "step_34000": { + "perplexity": 2.8195074437601093, + "loss": 1.036562204360962, + "unpadded_tokens_per_sec": 10039.8642578125 + }, + "step_34100": { + 
"perplexity": 2.8180353238146205, + "loss": 1.03603994846344, + "unpadded_tokens_per_sec": 10038.6533203125 + }, + "step_34200": { + "perplexity": 2.81430453893705, + "loss": 1.034715175628662, + "unpadded_tokens_per_sec": 10035.5380859375 + }, + "step_34300": { + "perplexity": 2.815290720452392, + "loss": 1.0350655317306519, + "unpadded_tokens_per_sec": 10038.9765625 + }, + "step_34400": { + "perplexity": 2.8176224887268906, + "loss": 1.035893440246582, + "unpadded_tokens_per_sec": 10042.3876953125 + }, + "step_34500": { + "perplexity": 2.8134702958831563, + "loss": 1.0344187021255493, + "unpadded_tokens_per_sec": 10035.9423828125 + }, + "step_34600": { + "perplexity": 2.8150249308248103, + "loss": 1.0349711179733276, + "unpadded_tokens_per_sec": 10035.63671875 + }, + "step_34700": { + "perplexity": 2.814567911887048, + "loss": 1.0348087549209595, + "unpadded_tokens_per_sec": 10038.8466796875 + }, + "step_34800": { + "perplexity": 2.8111349264316874, + "loss": 1.0335882902145386, + "unpadded_tokens_per_sec": 10039.6689453125 + }, + "step_34900": { + "perplexity": 2.811533739661572, + "loss": 1.033730149269104, + "unpadded_tokens_per_sec": 10037.4150390625 + }, + "step_35000": { + "perplexity": 2.8172009823552124, + "loss": 1.0357438325881958, + "unpadded_tokens_per_sec": 6999.0078125 + }, + "step_35100": { + "perplexity": 2.8130259368492814, + "loss": 1.0342607498168945, + "unpadded_tokens_per_sec": 9603.6591796875 + }, + "step_35200": { + "perplexity": 2.807609734972876, + "loss": 1.032333493232727, + "unpadded_tokens_per_sec": 10039.35546875 + }, + "step_35300": { + "perplexity": 2.812592712452544, + "loss": 1.034106731414795, + "unpadded_tokens_per_sec": 10035.13671875 + }, + "step_35400": { + "perplexity": 2.8116463559926657, + "loss": 1.033770203590393, + "unpadded_tokens_per_sec": 10039.3701171875 + }, + "step_35500": { + "perplexity": 2.8077908098123294, + "loss": 1.032397985458374, + "unpadded_tokens_per_sec": 10037.5791015625 + }, + "step_35600": { + 
"perplexity": 2.8117378600831446, + "loss": 1.0338027477264404, + "unpadded_tokens_per_sec": 10040.134765625 + }, + "step_35700": { + "perplexity": 2.8128512307476625, + "loss": 1.0341986417770386, + "unpadded_tokens_per_sec": 10033.2666015625 + }, + "step_35800": { + "perplexity": 2.8103696316216737, + "loss": 1.0333160161972046, + "unpadded_tokens_per_sec": 10034.7646484375 + }, + "step_35900": { + "perplexity": 2.809200647447478, + "loss": 1.0328999757766724, + "unpadded_tokens_per_sec": 10022.9111328125 + }, + "step_36000": { + "perplexity": 2.8083106699353166, + "loss": 1.0325831174850464, + "unpadded_tokens_per_sec": 10037.61328125 + }, + "step_36100": { + "perplexity": 2.81232784805314, + "loss": 1.0340125560760498, + "unpadded_tokens_per_sec": 10035.43359375 + }, + "step_36200": { + "perplexity": 2.8075033045647344, + "loss": 1.03229558467865, + "unpadded_tokens_per_sec": 10032.8330078125 + }, + "step_36300": { + "perplexity": 2.8039288517268393, + "loss": 1.0310215950012207, + "unpadded_tokens_per_sec": 10041.4736328125 + }, + "step_36400": { + "perplexity": 2.8044613694935268, + "loss": 1.031211495399475, + "unpadded_tokens_per_sec": 10029.3876953125 + }, + "step_36500": { + "perplexity": 2.8058648525662133, + "loss": 1.0317118167877197, + "unpadded_tokens_per_sec": 10040.3701171875 + }, + "step_36600": { + "perplexity": 2.8040207731842917, + "loss": 1.0310543775558472, + "unpadded_tokens_per_sec": 8920.234375 + }, + "step_36700": { + "perplexity": 2.8046689885615987, + "loss": 1.0312855243682861, + "unpadded_tokens_per_sec": 9637.5224609375 + }, + "step_36800": { + "perplexity": 2.8034295201699546, + "loss": 1.0308434963226318, + "unpadded_tokens_per_sec": 10034.12109375 + }, + "step_36900": { + "perplexity": 2.8011689355954066, + "loss": 1.0300368070602417, + "unpadded_tokens_per_sec": 10028.0869140625 + }, + "step_37000": { + "perplexity": 2.8051581744383656, + "loss": 1.031459927558899, + "unpadded_tokens_per_sec": 10043.326171875 + }, + "step_37100": 
{ + "perplexity": 2.8023466038987386, + "loss": 1.0304571390151978, + "unpadded_tokens_per_sec": 10034.4599609375 + }, + "step_37200": { + "perplexity": 2.8031514838502507, + "loss": 1.0307443141937256, + "unpadded_tokens_per_sec": 10037.9228515625 + }, + "step_37300": { + "perplexity": 2.804569690567967, + "loss": 1.0312501192092896, + "unpadded_tokens_per_sec": 10041.9658203125 + }, + "step_37400": { + "perplexity": 2.801786765633791, + "loss": 1.0302573442459106, + "unpadded_tokens_per_sec": 10040.9052734375 + }, + "step_37500": { + "perplexity": 2.799359644718464, + "loss": 1.0293906927108765, + "unpadded_tokens_per_sec": 10034.3994140625 + }, + "step_37600": { + "perplexity": 2.80165149930016, + "loss": 1.0302090644836426, + "unpadded_tokens_per_sec": 10038.8544921875 + }, + "step_37700": { + "perplexity": 2.801750025983597, + "loss": 1.03024423122406, + "unpadded_tokens_per_sec": 10030.2353515625 + }, + "step_37800": { + "perplexity": 2.7987790501016896, + "loss": 1.029183268547058, + "unpadded_tokens_per_sec": 10042.19140625 + }, + "step_37900": { + "perplexity": 2.8008994707926367, + "loss": 1.0299406051635742, + "unpadded_tokens_per_sec": 10034.8818359375 + }, + "step_38000": { + "perplexity": 2.800971258758339, + "loss": 1.0299662351608276, + "unpadded_tokens_per_sec": 10037.6513671875 + }, + "step_38100": { + "perplexity": 2.7983967242481373, + "loss": 1.029046654701233, + "unpadded_tokens_per_sec": 10041.9189453125 + }, + "step_38200": { + "perplexity": 2.8003679631975658, + "loss": 1.0297508239746094, + "unpadded_tokens_per_sec": 10034.4287109375 + }, + "step_38300": { + "perplexity": 2.795734569759587, + "loss": 1.0280948877334595, + "unpadded_tokens_per_sec": 8645.0322265625 + }, + "step_38400": { + "perplexity": 2.796502546653681, + "loss": 1.0283695459365845, + "unpadded_tokens_per_sec": 9718.5556640625 + }, + "step_38500": { + "perplexity": 2.798056478151323, + "loss": 1.0289250612258911, + "unpadded_tokens_per_sec": 10035.9150390625 + }, + 
"step_38600": { + "perplexity": 2.8002384402013707, + "loss": 1.0297045707702637, + "unpadded_tokens_per_sec": 10038.7021484375 + }, + "step_38700": { + "perplexity": 2.799139738680909, + "loss": 1.0293121337890625, + "unpadded_tokens_per_sec": 10033.5634765625 + }, + "step_38800": { + "perplexity": 2.795514281990204, + "loss": 1.0280160903930664, + "unpadded_tokens_per_sec": 10042.52734375 + }, + "step_38900": { + "perplexity": 2.7985428426225885, + "loss": 1.0290988683700562, + "unpadded_tokens_per_sec": 10037.81640625 + }, + "step_39000": { + "perplexity": 2.7937063122348587, + "loss": 1.0273691415786743, + "unpadded_tokens_per_sec": 10038.556640625 + }, + "step_39100": { + "perplexity": 2.7983213328196177, + "loss": 1.0290197134017944, + "unpadded_tokens_per_sec": 10038.0087890625 + }, + "step_39200": { + "perplexity": 2.7944524119135963, + "loss": 1.027636170387268, + "unpadded_tokens_per_sec": 10041.3505859375 + }, + "step_39300": { + "perplexity": 2.793784576731115, + "loss": 1.0273971557617188, + "unpadded_tokens_per_sec": 10036.6162109375 + }, + "step_39400": { + "perplexity": 2.7910735738325556, + "loss": 1.0264263153076172, + "unpadded_tokens_per_sec": 10035.4462890625 + }, + "step_39500": { + "perplexity": 2.7929983679557533, + "loss": 1.0271157026290894, + "unpadded_tokens_per_sec": 10041.3662109375 + }, + "step_39600": { + "perplexity": 2.7929454291926286, + "loss": 1.0270967483520508, + "unpadded_tokens_per_sec": 10041.0341796875 + }, + "step_39700": { + "perplexity": 2.7953476613201937, + "loss": 1.027956485748291, + "unpadded_tokens_per_sec": 10033.033203125 + }, + "step_39800": { + "perplexity": 2.7900093994060584, + "loss": 1.0260449647903442, + "unpadded_tokens_per_sec": 10040.0810546875 + }, + "step_39900": { + "perplexity": 2.7935491237856485, + "loss": 1.0273128747940063, + "unpadded_tokens_per_sec": 10037.50390625 + }, + "step_40000": { + "perplexity": 2.7928442157343367, + "loss": 1.0270605087280273, + "unpadded_tokens_per_sec": 
10038.9267578125 + }, + "step_40100": { + "perplexity": 2.7910186752593313, + "loss": 1.0264066457748413, + "unpadded_tokens_per_sec": 8801.23828125 + }, + "step_40200": { + "perplexity": 2.7923694936630215, + "loss": 1.026890516281128, + "unpadded_tokens_per_sec": 9876.1943359375 + }, + "step_40300": { + "perplexity": 2.788826609640054, + "loss": 1.025620937347412, + "unpadded_tokens_per_sec": 10029.58203125 + }, + "step_40400": { + "perplexity": 2.789678819524204, + "loss": 1.0259264707565308, + "unpadded_tokens_per_sec": 10042.5693359375 + }, + "step_40500": { + "perplexity": 2.790463761221545, + "loss": 1.0262078046798706, + "unpadded_tokens_per_sec": 10031.7880859375 + }, + "step_40600": { + "perplexity": 2.7884393275780943, + "loss": 1.0254820585250854, + "unpadded_tokens_per_sec": 10037.4501953125 + }, + "step_40700": { + "perplexity": 2.7902678377196706, + "loss": 1.0261375904083252, + "unpadded_tokens_per_sec": 10039.8671875 + }, + "step_40800": { + "perplexity": 2.7885982230675777, + "loss": 1.0255390405654907, + "unpadded_tokens_per_sec": 10033.3125 + }, + "step_40900": { + "perplexity": 2.7852832402549526, + "loss": 1.024349570274353, + "unpadded_tokens_per_sec": 10030.5791015625 + }, + "step_41000": { + "perplexity": 2.7857208923321295, + "loss": 1.024506688117981, + "unpadded_tokens_per_sec": 10039.5732421875 + }, + "step_41100": { + "perplexity": 2.787928795829941, + "loss": 1.0252989530563354, + "unpadded_tokens_per_sec": 10034.4619140625 + }, + "step_41200": { + "perplexity": 2.7832054975662337, + "loss": 1.0236033201217651, + "unpadded_tokens_per_sec": 10038.296875 + }, + "step_41300": { + "perplexity": 2.7842869915276713, + "loss": 1.0239918231964111, + "unpadded_tokens_per_sec": 10033.7763671875 + }, + "step_41400": { + "perplexity": 2.7855518668017, + "loss": 1.0244460105895996, + "unpadded_tokens_per_sec": 10033.208984375 + }, + "step_41500": { + "perplexity": 2.78476830679252, + "loss": 1.0241646766662598, + "unpadded_tokens_per_sec": 
10039.8974609375 + }, + "step_41600": { + "perplexity": 2.7832054975662337, + "loss": 1.0236033201217651, + "unpadded_tokens_per_sec": 10039.611328125 + }, + "step_41700": { + "perplexity": 2.781522203256838, + "loss": 1.022998332977295, + "unpadded_tokens_per_sec": 10046.9365234375 + }, + "step_41800": { + "perplexity": 2.7828233086992165, + "loss": 1.0234659910202026, + "unpadded_tokens_per_sec": 10036.693359375 + }, + "step_41900": { + "perplexity": 2.7840871871468895, + "loss": 1.0239200592041016, + "unpadded_tokens_per_sec": 8508.1845703125 + }, + "step_42000": { + "perplexity": 2.781916152095257, + "loss": 1.0231399536132812, + "unpadded_tokens_per_sec": 9866.4423828125 + }, + "step_42100": { + "perplexity": 2.7786866229917235, + "loss": 1.0219783782958984, + "unpadded_tokens_per_sec": 10042.751953125 + }, + "step_42200": { + "perplexity": 2.782076997415289, + "loss": 1.0231977701187134, + "unpadded_tokens_per_sec": 10033.478515625 + }, + "step_42300": { + "perplexity": 2.7800185480476998, + "loss": 1.0224575996398926, + "unpadded_tokens_per_sec": 10036.482421875 + }, + "step_42400": { + "perplexity": 2.7810122749109514, + "loss": 1.0228149890899658, + "unpadded_tokens_per_sec": 10035.4150390625 + }, + "step_42500": { + "perplexity": 2.7813202763657134, + "loss": 1.0229257345199585, + "unpadded_tokens_per_sec": 10038.9384765625 + }, + "step_42600": { + "perplexity": 2.782360240439461, + "loss": 1.0232995748519897, + "unpadded_tokens_per_sec": 10033.7255859375 + }, + "step_42700": { + "perplexity": 2.781529166514554, + "loss": 1.0230008363723755, + "unpadded_tokens_per_sec": 10034.6494140625 + }, + "step_42800": { + "perplexity": 2.7801650324907703, + "loss": 1.022510290145874, + "unpadded_tokens_per_sec": 10040.3701171875 + }, + "step_42900": { + "perplexity": 2.774069261000174, + "loss": 1.0203152894973755, + "unpadded_tokens_per_sec": 10038.23046875 + }, + "step_43000": { + "perplexity": 2.776753816944549, + "loss": 1.0212825536727905, + 
"unpadded_tokens_per_sec": 10039.623046875 + }, + "step_43100": { + "perplexity": 2.781336854375828, + "loss": 1.022931694984436, + "unpadded_tokens_per_sec": 10051.3828125 + }, + "step_43200": { + "perplexity": 2.77973753162927, + "loss": 1.0223565101623535, + "unpadded_tokens_per_sec": 10032.1904296875 + }, + "step_43300": { + "perplexity": 2.775144558230687, + "loss": 1.020702838897705, + "unpadded_tokens_per_sec": 10038.0849609375 + }, + "step_43400": { + "perplexity": 2.7773503697758404, + "loss": 1.021497368812561, + "unpadded_tokens_per_sec": 10044.69140625 + }, + "step_43500": { + "perplexity": 2.7772682616703896, + "loss": 1.0214678049087524, + "unpadded_tokens_per_sec": 10030.6162109375 + }, + "step_43600": { + "perplexity": 2.775222964391922, + "loss": 1.0207310914993286, + "unpadded_tokens_per_sec": 10041.7568359375 + }, + "step_43700": { + "perplexity": 2.7732578545877766, + "loss": 1.0200227499008179, + "unpadded_tokens_per_sec": 10046.4990234375 + }, + "step_43800": { + "perplexity": 2.776813400256766, + "loss": 1.0213040113449097, + "unpadded_tokens_per_sec": 9667.8369140625 + }, + "step_43900": { + "perplexity": 2.775025795310986, + "loss": 1.0206600427627563, + "unpadded_tokens_per_sec": 8695.6376953125 + }, + "step_44000": { + "perplexity": 2.775614697548414, + "loss": 1.0208722352981567, + "unpadded_tokens_per_sec": 9867.28125 + }, + "step_44100": { + "perplexity": 2.7732710785432486, + "loss": 1.0200275182724, + "unpadded_tokens_per_sec": 10041.3173828125 + }, + "step_44200": { + "perplexity": 2.7765436304703797, + "loss": 1.0212068557739258, + "unpadded_tokens_per_sec": 10040.828125 + }, + "step_44300": { + "perplexity": 2.7724070256248745, + "loss": 1.0197159051895142, + "unpadded_tokens_per_sec": 10043.142578125 + }, + "step_44400": { + "perplexity": 2.768533668932043, + "loss": 1.0183178186416626, + "unpadded_tokens_per_sec": 10035.6572265625 + }, + "step_44500": { + "perplexity": 2.770772210285782, + "loss": 1.019126057624817, + 
"unpadded_tokens_per_sec": 10042.3251953125 + }, + "step_44600": { + "perplexity": 2.772071261339985, + "loss": 1.0195947885513306, + "unpadded_tokens_per_sec": 10036.5810546875 + }, + "step_44700": { + "perplexity": 2.775153490466364, + "loss": 1.020706057548523, + "unpadded_tokens_per_sec": 10031.2236328125 + }, + "step_44800": { + "perplexity": 2.7697768595245615, + "loss": 1.0187667608261108, + "unpadded_tokens_per_sec": 10043.4599609375 + }, + "step_44900": { + "perplexity": 2.7713648350586557, + "loss": 1.019339919090271, + "unpadded_tokens_per_sec": 10041.4248046875 + }, + "step_45000": { + "perplexity": 2.7707054901281927, + "loss": 1.0191019773483276, + "unpadded_tokens_per_sec": 10036.775390625 + }, + "step_45100": { + "perplexity": 2.773272070342451, + "loss": 1.0200278759002686, + "unpadded_tokens_per_sec": 9975.0654296875 + }, + "step_45200": { + "perplexity": 2.7710331609846386, + "loss": 1.019220232963562, + "unpadded_tokens_per_sec": 10032.072265625 + }, + "step_45300": { + "perplexity": 2.7672125243907053, + "loss": 1.0178405046463013, + "unpadded_tokens_per_sec": 10038.9033203125 + }, + "step_45400": { + "perplexity": 2.76497290362321, + "loss": 1.0170308351516724, + "unpadded_tokens_per_sec": 10040.064453125 + }, + "step_45500": { + "perplexity": 2.7709885664027256, + "loss": 1.0192041397094727, + "unpadded_tokens_per_sec": 10031.587890625 + }, + "step_45600": { + "perplexity": 2.7689974068456085, + "loss": 1.0184853076934814, + "unpadded_tokens_per_sec": 10038.1953125 + }, + "step_45700": { + "perplexity": 2.765070799661486, + "loss": 1.017066240310669, + "unpadded_tokens_per_sec": 9831.125 + }, + "step_45800": { + "perplexity": 2.7660225825101254, + "loss": 1.017410397529602, + "unpadded_tokens_per_sec": 10036.8193359375 + }, + "step_45900": { + "perplexity": 2.7668028469401293, + "loss": 1.0176924467086792, + "unpadded_tokens_per_sec": 8458.40234375 + }, + "step_46000": { + "perplexity": 2.7665597739386873, + "loss": 1.0176045894622803, + 
"unpadded_tokens_per_sec": 9858.0693359375 + }, + "step_46100": { + "perplexity": 2.7685957161944947, + "loss": 1.0183402299880981, + "unpadded_tokens_per_sec": 10048.4443359375 + }, + "step_46200": { + "perplexity": 2.7647985451899957, + "loss": 1.0169677734375, + "unpadded_tokens_per_sec": 10043.2646484375 + }, + "step_46300": { + "perplexity": 2.7665033787775903, + "loss": 1.017584204673767, + "unpadded_tokens_per_sec": 10036.02734375 + }, + "step_46400": { + "perplexity": 2.7684369703858027, + "loss": 1.0182828903198242, + "unpadded_tokens_per_sec": 10036.1201171875 + }, + "step_46500": { + "perplexity": 2.7632910835572293, + "loss": 1.0164223909378052, + "unpadded_tokens_per_sec": 10031.6943359375 + }, + "step_46600": { + "perplexity": 2.766583189811167, + "loss": 1.0176130533218384, + "unpadded_tokens_per_sec": 10041.453125 + }, + "step_46700": { + "perplexity": 2.765403078727495, + "loss": 1.0171864032745361, + "unpadded_tokens_per_sec": 10033.9404296875 + }, + "step_46800": { + "perplexity": 2.7657901286894737, + "loss": 1.0173263549804688, + "unpadded_tokens_per_sec": 10035.3837890625 + }, + "step_46900": { + "perplexity": 2.767126097851349, + "loss": 1.017809271812439, + "unpadded_tokens_per_sec": 10034.7587890625 + }, + "step_47000": { + "perplexity": 2.764444258995769, + "loss": 1.016839623451233, + "unpadded_tokens_per_sec": 10035.7841796875 + }, + "step_47100": { + "perplexity": 2.766175584053923, + "loss": 1.0174657106399536, + "unpadded_tokens_per_sec": 10039.2314453125 + }, + "step_47200": { + "perplexity": 2.763100361768491, + "loss": 1.0163533687591553, + "unpadded_tokens_per_sec": 10036.8505859375 + }, + "step_47300": { + "perplexity": 2.760456318781066, + "loss": 1.015395998954773, + "unpadded_tokens_per_sec": 10039.158203125 + }, + "step_47400": { + "perplexity": 2.7598752387321137, + "loss": 1.0151854753494263, + "unpadded_tokens_per_sec": 10035.6904296875 + }, + "step_47500": { + "perplexity": 2.759673238426255, + "loss": 1.015112280845642, 
+ "unpadded_tokens_per_sec": 10037.9921875 + }, + "step_47600": { + "perplexity": 2.760433941973273, + "loss": 1.0153878927230835, + "unpadded_tokens_per_sec": 10036.5849609375 + }, + "step_47700": { + "perplexity": 2.764786679987324, + "loss": 1.0169634819030762, + "unpadded_tokens_per_sec": 10037.2978515625 + }, + "step_47800": { + "perplexity": 2.7648512800401646, + "loss": 1.0169868469238281, + "unpadded_tokens_per_sec": 9838.9814453125 + }, + "step_47900": { + "perplexity": 2.756953268035241, + "loss": 1.014126181602478, + "unpadded_tokens_per_sec": 10045.275390625 + }, + "step_48000": { + "perplexity": 2.761010531717528, + "loss": 1.0155967473983765, + "unpadded_tokens_per_sec": 8585.9970703125 + }, + "step_48100": { + "perplexity": 2.754604061583031, + "loss": 1.0132737159729004, + "unpadded_tokens_per_sec": 10039.44921875 + }, + "step_48200": { + "perplexity": 2.7573404501511147, + "loss": 1.0142666101455688, + "unpadded_tokens_per_sec": 9847.6171875 + }, + "step_48300": { + "perplexity": 2.7598074649943904, + "loss": 1.0151609182357788, + "unpadded_tokens_per_sec": 10036.947265625 + }, + "step_48400": { + "perplexity": 2.7570364188625702, + "loss": 1.0141563415527344, + "unpadded_tokens_per_sec": 10041.48046875 + }, + "step_48500": { + "perplexity": 2.7575948761500126, + "loss": 1.0143588781356812, + "unpadded_tokens_per_sec": 10036.248046875 + }, + "step_48600": { + "perplexity": 2.7569907348960316, + "loss": 1.0141397714614868, + "unpadded_tokens_per_sec": 10040.0302734375 + }, + "step_48700": { + "perplexity": 2.7584069610894653, + "loss": 1.0146533250808716, + "unpadded_tokens_per_sec": 10044.80859375 + }, + "step_48800": { + "perplexity": 2.7552493928458848, + "loss": 1.0135079622268677, + "unpadded_tokens_per_sec": 10037.0458984375 + }, + "step_48900": { + "perplexity": 2.752805157554642, + "loss": 1.012620449066162, + "unpadded_tokens_per_sec": 10035.2900390625 + }, + "step_49000": { + "perplexity": 2.758062371149593, + "loss": 1.0145283937454224, + 
"unpadded_tokens_per_sec": 10034.328125 + }, + "step_49100": { + "perplexity": 2.757637611501538, + "loss": 1.0143743753433228, + "unpadded_tokens_per_sec": 10035.984375 + }, + "step_49200": { + "perplexity": 2.7553929298125936, + "loss": 1.0135600566864014, + "unpadded_tokens_per_sec": 10032.9736328125 + }, + "step_49300": { + "perplexity": 2.756191223724971, + "loss": 1.0138497352600098, + "unpadded_tokens_per_sec": 10025.5087890625 + }, + "step_49400": { + "perplexity": 2.755372236378983, + "loss": 1.0135525465011597, + "unpadded_tokens_per_sec": 10045.1748046875 + }, + "step_49500": { + "perplexity": 2.7552855227282627, + "loss": 1.0135210752487183, + "unpadded_tokens_per_sec": 10036.4658203125 + }, + "step_49600": { + "perplexity": 2.756454087137193, + "loss": 1.0139451026916504, + "unpadded_tokens_per_sec": 10032.341796875 + }, + "step_49700": { + "perplexity": 2.7552622024772453, + "loss": 1.0135126113891602, + "unpadded_tokens_per_sec": 10034.3818359375 + }, + "step_49800": { + "perplexity": 2.756802748410596, + "loss": 1.0140715837478638, + "unpadded_tokens_per_sec": 10038.6845703125 + }, + "step_49900": { + "perplexity": 2.751006116524707, + "loss": 1.0119667053222656, + "unpadded_tokens_per_sec": 10039.5234375 + }, + "step_50000": { + "perplexity": 2.7510674430138926, + "loss": 1.0119889974594116, + "unpadded_tokens_per_sec": 9845.1630859375 + }, + "step_50100": { + "perplexity": 2.750142771563098, + "loss": 1.0116528272628784, + "unpadded_tokens_per_sec": 9989.3935546875 + }, + "step_50200": { + "perplexity": 2.7534960209207613, + "loss": 1.0128713846206665, + "unpadded_tokens_per_sec": 8373.828125 + }, + "step_50300": { + "perplexity": 2.7492308647654484, + "loss": 1.0113211870193481, + "unpadded_tokens_per_sec": 10043.6865234375 + }, + "step_50400": { + "perplexity": 2.7500893337440266, + "loss": 1.0116333961486816, + "unpadded_tokens_per_sec": 9837.0283203125 + }, + "step_50500": { + "perplexity": 2.7524405959954863, + "loss": 1.0124880075454712, + 
"unpadded_tokens_per_sec": 10042.0263671875 + }, + "step_50600": { + "perplexity": 2.7494766761475598, + "loss": 1.0114105939865112, + "unpadded_tokens_per_sec": 10045.9072265625 + }, + "step_50700": { + "perplexity": 2.7497385713856883, + "loss": 1.0115058422088623, + "unpadded_tokens_per_sec": 10039.0625 + }, + "step_50800": { + "perplexity": 2.748187558173431, + "loss": 1.0109416246414185, + "unpadded_tokens_per_sec": 10038.6181640625 + }, + "step_50900": { + "perplexity": 2.7522014094689817, + "loss": 1.0124011039733887, + "unpadded_tokens_per_sec": 10037.328125 + }, + "step_51000": { + "perplexity": 2.75092380343952, + "loss": 1.0119367837905884, + "unpadded_tokens_per_sec": 10032.83984375 + }, + "step_51100": { + "perplexity": 2.747110589260506, + "loss": 1.0105496644973755, + "unpadded_tokens_per_sec": 10039.4658203125 + }, + "step_51200": { + "perplexity": 2.745581023535677, + "loss": 1.0099927186965942, + "unpadded_tokens_per_sec": 10036.2890625 + }, + "step_51300": { + "perplexity": 2.751798874964034, + "loss": 1.0122548341751099, + "unpadded_tokens_per_sec": 10044.7294921875 + }, + "step_51400": { + "perplexity": 2.7499824612209016, + "loss": 1.011594533920288, + "unpadded_tokens_per_sec": 10032.765625 + }, + "step_51500": { + "perplexity": 2.748576785803911, + "loss": 1.0110832452774048, + "unpadded_tokens_per_sec": 10036.1181640625 + }, + "step_51600": { + "perplexity": 2.744479229732427, + "loss": 1.0095913410186768, + "unpadded_tokens_per_sec": 10039.5439453125 + }, + "step_51700": { + "perplexity": 2.748240959038526, + "loss": 1.0109610557556152, + "unpadded_tokens_per_sec": 10038.341796875 + }, + "step_51800": { + "perplexity": 2.7494406224362145, + "loss": 1.0113974809646606, + "unpadded_tokens_per_sec": 10039.0048828125 + }, + "step_51900": { + "perplexity": 2.749301656190227, + "loss": 1.0113469362258911, + "unpadded_tokens_per_sec": 10029.4189453125 + }, + "step_52000": { + "perplexity": 2.745953187450616, + "loss": 1.0101282596588135, + 
"unpadded_tokens_per_sec": 10043.1298828125 + }, + "step_52100": { + "perplexity": 2.7452079282977393, + "loss": 1.0098568201065063, + "unpadded_tokens_per_sec": 9824.5009765625 + }, + "step_52200": { + "perplexity": 2.7499021456471744, + "loss": 1.0115653276443481, + "unpadded_tokens_per_sec": 10042.3857421875 + }, + "step_52300": { + "perplexity": 2.7451375695277282, + "loss": 1.009831190109253, + "unpadded_tokens_per_sec": 10036.6923828125 + }, + "step_52400": { + "perplexity": 2.748295671430454, + "loss": 1.0109809637069702, + "unpadded_tokens_per_sec": 10030.935546875 + }, + "step_52500": { + "perplexity": 2.7436133541932963, + "loss": 1.0092757940292358, + "unpadded_tokens_per_sec": 9596.5341796875 + }, + "step_52600": { + "perplexity": 2.7433419043376785, + "loss": 1.0091768503189087, + "unpadded_tokens_per_sec": 8646.7216796875 + }, + "step_52700": { + "perplexity": 2.7469976106036964, + "loss": 1.0105085372924805, + "unpadded_tokens_per_sec": 9821.6708984375 + }, + "step_52800": { + "perplexity": 2.744244333578259, + "loss": 1.0095057487487793, + "unpadded_tokens_per_sec": 10039.6484375 + }, + "step_52900": { + "perplexity": 2.738824086254668, + "loss": 1.0075286626815796, + "unpadded_tokens_per_sec": 10039.1435546875 + }, + "step_53000": { + "perplexity": 2.743946979958974, + "loss": 1.0093973875045776, + "unpadded_tokens_per_sec": 10039.13671875 + }, + "step_53100": { + "perplexity": 2.740468146644226, + "loss": 1.0081287622451782, + "unpadded_tokens_per_sec": 10035.00390625 + }, + "step_53200": { + "perplexity": 2.7417516825476556, + "loss": 1.0085970163345337, + "unpadded_tokens_per_sec": 10038.4560546875 + }, + "step_53300": { + "perplexity": 2.742126923149524, + "loss": 1.008733868598938, + "unpadded_tokens_per_sec": 10043.5595703125 + }, + "step_53400": { + "perplexity": 2.7431646588068235, + "loss": 1.0091122388839722, + "unpadded_tokens_per_sec": 10041.0078125 + }, + "step_53500": { + "perplexity": 2.745713255420173, + "loss": 1.0100408792495728, 
+ "unpadded_tokens_per_sec": 10032.158203125 + }, + "step_53600": { + "perplexity": 2.7420791980624966, + "loss": 1.0087164640426636, + "unpadded_tokens_per_sec": 10044.9140625 + }, + "step_53700": { + "perplexity": 2.740234900438583, + "loss": 1.008043646812439, + "unpadded_tokens_per_sec": 10041.9951171875 + }, + "step_53800": { + "perplexity": 2.742174975959904, + "loss": 1.008751392364502, + "unpadded_tokens_per_sec": 10038.5537109375 + }, + "step_53900": { + "perplexity": 2.74340338701244, + "loss": 1.0091992616653442, + "unpadded_tokens_per_sec": 10032.9755859375 + }, + "step_54000": { + "perplexity": 2.7396068024544764, + "loss": 1.0078144073486328, + "unpadded_tokens_per_sec": 10037.2841796875 + }, + "step_54100": { + "perplexity": 2.739415755966467, + "loss": 1.0077446699142456, + "unpadded_tokens_per_sec": 10044.5205078125 + }, + "step_54200": { + "perplexity": 2.7406850768988362, + "loss": 1.00820791721344, + "unpadded_tokens_per_sec": 10043.2822265625 + }, + "step_54300": { + "perplexity": 2.7380595457383623, + "loss": 1.0072494745254517, + "unpadded_tokens_per_sec": 10036.150390625 + }, + "step_54400": { + "perplexity": 2.736113575255632, + "loss": 1.0065385103225708, + "unpadded_tokens_per_sec": 10041.3837890625 + }, + "step_54500": { + "perplexity": 2.7370491911922255, + "loss": 1.0068804025650024, + "unpadded_tokens_per_sec": 9814.7607421875 + }, + "step_54600": { + "perplexity": 2.7388084146223757, + "loss": 1.0075229406356812, + "unpadded_tokens_per_sec": 10040.7705078125 + }, + "step_54700": { + "perplexity": 2.7406870371902627, + "loss": 1.0082086324691772, + "unpadded_tokens_per_sec": 10043.7373046875 + }, + "step_54800": { + "perplexity": 2.7354257688629646, + "loss": 1.0062870979309082, + "unpadded_tokens_per_sec": 10033.2802734375 + }, + "step_54900": { + "perplexity": 2.7364162779029932, + "loss": 1.006649136543274, + "unpadded_tokens_per_sec": 10040.8076171875 + }, + "step_55000": { + "perplexity": 2.7372289783076984, + "loss": 
1.006946086883545, + "unpadded_tokens_per_sec": 8249.69921875 + }, + "step_55100": { + "perplexity": 2.735404899300174, + "loss": 1.006279468536377, + "unpadded_tokens_per_sec": 9753.6630859375 + }, + "step_55200": { + "perplexity": 2.740800083034301, + "loss": 1.0082498788833618, + "unpadded_tokens_per_sec": 10036.8662109375 + }, + "step_55300": { + "perplexity": 2.7371519718541752, + "loss": 1.006917953491211, + "unpadded_tokens_per_sec": 10039.1337890625 + }, + "step_55400": { + "perplexity": 2.7353452262726026, + "loss": 1.0062576532363892, + "unpadded_tokens_per_sec": 10031.0576171875 + }, + "step_55500": { + "perplexity": 2.7374312937196903, + "loss": 1.0070199966430664, + "unpadded_tokens_per_sec": 10029.2880859375 + }, + "step_55600": { + "perplexity": 2.737223757462728, + "loss": 1.006944179534912, + "unpadded_tokens_per_sec": 10035.3447265625 + }, + "step_55700": { + "perplexity": 2.733799398697814, + "loss": 1.0056923627853394, + "unpadded_tokens_per_sec": 10033.2138671875 + }, + "step_55800": { + "perplexity": 2.735770139637695, + "loss": 1.0064129829406738, + "unpadded_tokens_per_sec": 10037.6376953125 + }, + "step_55900": { + "perplexity": 2.735151542464796, + "loss": 1.006186842918396, + "unpadded_tokens_per_sec": 10044.8115234375 + }, + "step_56000": { + "perplexity": 2.7366084201240755, + "loss": 1.0067193508148193, + "unpadded_tokens_per_sec": 10037.9169921875 + }, + "step_56100": { + "perplexity": 2.730549558708359, + "loss": 1.0045028924942017, + "unpadded_tokens_per_sec": 10036.6005859375 + }, + "step_56200": { + "perplexity": 2.7332995225728443, + "loss": 1.0055094957351685, + "unpadded_tokens_per_sec": 10035.7568359375 + }, + "step_56300": { + "perplexity": 2.7341024971831622, + "loss": 1.0058032274246216, + "unpadded_tokens_per_sec": 10038.4169921875 + }, + "step_56400": { + "perplexity": 2.734359994336749, + "loss": 1.0058974027633667, + "unpadded_tokens_per_sec": 10035.0732421875 + }, + "step_56500": { + "perplexity": 2.728830121986619, + 
"loss": 1.0038729906082153, + "unpadded_tokens_per_sec": 10032.6259765625 + }, + "step_56600": { + "perplexity": 2.731126092237038, + "loss": 1.004714012145996, + "unpadded_tokens_per_sec": 10033.1806640625 + }, + "step_56700": { + "perplexity": 2.732323170415451, + "loss": 1.0051522254943848, + "unpadded_tokens_per_sec": 10026.6611328125 + }, + "step_56800": { + "perplexity": 2.7307253380774297, + "loss": 1.004567265510559, + "unpadded_tokens_per_sec": 9803.8330078125 + }, + "step_56900": { + "perplexity": 2.7301423799724693, + "loss": 1.0043537616729736, + "unpadded_tokens_per_sec": 10039.3330078125 + }, + "step_57000": { + "perplexity": 2.7334640740464295, + "loss": 1.0055696964263916, + "unpadded_tokens_per_sec": 10036.4462890625 + }, + "step_57100": { + "perplexity": 2.734321205239515, + "loss": 1.0058832168579102, + "unpadded_tokens_per_sec": 10040.765625 + }, + "step_57200": { + "perplexity": 2.732332616262594, + "loss": 1.0051556825637817, + "unpadded_tokens_per_sec": 10038.0166015625 + }, + "step_57300": { + "perplexity": 2.728516548972776, + "loss": 1.0037580728530884, + "unpadded_tokens_per_sec": 10037.8076171875 + }, + "step_57400": { + "perplexity": 2.726247796909817, + "loss": 1.002926230430603, + "unpadded_tokens_per_sec": 10031.8740234375 + }, + "step_57500": { + "perplexity": 2.7301732986892255, + "loss": 1.004365086555481, + "unpadded_tokens_per_sec": 8152.65087890625 + }, + "step_57600": { + "perplexity": 2.7280108095212388, + "loss": 1.003572702407837, + "unpadded_tokens_per_sec": 10041.4013671875 + }, + "step_57700": { + "perplexity": 2.7302084488135803, + "loss": 1.0043779611587524, + "unpadded_tokens_per_sec": 9800.94921875 + }, + "step_57800": { + "perplexity": 2.732211125616268, + "loss": 1.0051112174987793, + "unpadded_tokens_per_sec": 10032.3359375 + }, + "step_57900": { + "perplexity": 2.7276261200425447, + "loss": 1.0034316778182983, + "unpadded_tokens_per_sec": 10034.8291015625 + }, + "step_58000": { + "perplexity": 2.724399857259457, 
+ "loss": 1.0022481679916382, + "unpadded_tokens_per_sec": 10040.51171875 + }, + "step_58100": { + "perplexity": 2.727019116913101, + "loss": 1.003209114074707, + "unpadded_tokens_per_sec": 10031.205078125 + }, + "step_58200": { + "perplexity": 2.72795422457199, + "loss": 1.003551959991455, + "unpadded_tokens_per_sec": 10032.0234375 + }, + "step_58300": { + "perplexity": 2.7268165958483728, + "loss": 1.003134846687317, + "unpadded_tokens_per_sec": 10039.365234375 + }, + "step_58400": { + "perplexity": 2.7242618319030814, + "loss": 1.002197504043579, + "unpadded_tokens_per_sec": 10025.7685546875 + }, + "step_58500": { + "perplexity": 2.726954100485856, + "loss": 1.0031852722167969, + "unpadded_tokens_per_sec": 10033.6552734375 + }, + "step_58600": { + "perplexity": 2.727444037431994, + "loss": 1.00336492061615, + "unpadded_tokens_per_sec": 10028.6396484375 + }, + "step_58700": { + "perplexity": 2.7251706544368925, + "loss": 1.0025310516357422, + "unpadded_tokens_per_sec": 10035.5087890625 + }, + "step_58800": { + "perplexity": 2.7217545608727844, + "loss": 1.0012767314910889, + "unpadded_tokens_per_sec": 10040.4912109375 + }, + "step_58900": { + "perplexity": 2.7233874008140546, + "loss": 1.0018764734268188, + "unpadded_tokens_per_sec": 10034.7275390625 + }, + "step_59000": { + "perplexity": 2.725859781613491, + "loss": 1.0027838945388794, + "unpadded_tokens_per_sec": 10039.3779296875 + }, + "step_59100": { + "perplexity": 2.7230939302454864, + "loss": 1.001768708229065, + "unpadded_tokens_per_sec": 10042.8427734375 + }, + "step_59200": { + "perplexity": 2.724532692970257, + "loss": 1.0022969245910645, + "unpadded_tokens_per_sec": 10039.1357421875 + }, + "step_59300": { + "perplexity": 2.726717778830715, + "loss": 1.0030986070632935, + "unpadded_tokens_per_sec": 10042.4931640625 + }, + "step_59400": { + "perplexity": 2.723890984290075, + "loss": 1.002061367034912, + "unpadded_tokens_per_sec": 9790.0205078125 + }, + "step_59500": { + "perplexity": 2.723377011935399, 
+ "loss": 1.0018726587295532, + "unpadded_tokens_per_sec": 10041.09765625 + }, + "step_59600": { + "perplexity": 2.7210320878596495, + "loss": 1.0010112524032593, + "unpadded_tokens_per_sec": 10039.7841796875 + }, + "step_59700": { + "perplexity": 2.720107135298094, + "loss": 1.0006712675094604, + "unpadded_tokens_per_sec": 10043.8974609375 + }, + "step_59800": { + "perplexity": 2.72226433279543, + "loss": 1.0014640092849731, + "unpadded_tokens_per_sec": 10037.2236328125 + }, + "step_59900": { + "perplexity": 2.719345550403722, + "loss": 1.0003912448883057, + "unpadded_tokens_per_sec": 10033.1640625 + }, + "step_60000": { + "perplexity": 2.7234328526241445, + "loss": 1.001893162727356, + "unpadded_tokens_per_sec": 10039.3076171875 + }, + "step_60100": { + "perplexity": 2.719789701285385, + "loss": 1.0005545616149902, + "unpadded_tokens_per_sec": 9011.05859375 + }, + "step_60200": { + "perplexity": 2.7178779370663815, + "loss": 0.999851405620575, + "unpadded_tokens_per_sec": 9052.3916015625 + }, + "step_60300": { + "perplexity": 2.7211508107121514, + "loss": 1.0010548830032349, + "unpadded_tokens_per_sec": 9796.5439453125 + }, + "step_60400": { + "perplexity": 2.7209915416239885, + "loss": 1.0009963512420654, + "unpadded_tokens_per_sec": 10034.1357421875 + }, + "step_60500": { + "perplexity": 2.7242718993985267, + "loss": 1.0022011995315552, + "unpadded_tokens_per_sec": 10030.2958984375 + }, + "step_60600": { + "perplexity": 2.7206324905278563, + "loss": 1.0008643865585327, + "unpadded_tokens_per_sec": 10036.7978515625 + }, + "step_60700": { + "perplexity": 2.7188570682071402, + "loss": 1.0002115964889526, + "unpadded_tokens_per_sec": 10034.0361328125 + }, + "step_60800": { + "perplexity": 2.71873974182564, + "loss": 1.0001684427261353, + "unpadded_tokens_per_sec": 10034.9072265625 + }, + "step_60900": { + "perplexity": 2.7161489949829947, + "loss": 0.9992150664329529, + "unpadded_tokens_per_sec": 10036.3544921875 + }, + "step_61000": { + "perplexity": 
2.71803994004338, + "loss": 0.9999110102653503, + "unpadded_tokens_per_sec": 10038.826171875 + }, + "step_61100": { + "perplexity": 2.7192583497353264, + "loss": 1.0003591775894165, + "unpadded_tokens_per_sec": 10040.75390625 + }, + "step_61200": { + "perplexity": 2.7221510779515943, + "loss": 1.00142240524292, + "unpadded_tokens_per_sec": 10035.2587890625 + }, + "step_61300": { + "perplexity": 2.718253150676888, + "loss": 0.9999894499778748, + "unpadded_tokens_per_sec": 10037.3486328125 + }, + "step_61400": { + "perplexity": 2.7159489000150576, + "loss": 0.9991413950920105, + "unpadded_tokens_per_sec": 10028.818359375 + }, + "step_61500": { + "perplexity": 2.718137146464623, + "loss": 0.9999467730522156, + "unpadded_tokens_per_sec": 10029.1982421875 + }, + "step_61600": { + "perplexity": 2.7175469949982074, + "loss": 0.9997296333312988, + "unpadded_tokens_per_sec": 10034.99609375 + }, + "step_61700": { + "perplexity": 2.7203266695571684, + "loss": 1.0007519721984863, + "unpadded_tokens_per_sec": 10039.92578125 + }, + "step_61800": { + "perplexity": 2.7189176780173434, + "loss": 1.0002338886260986, + "unpadded_tokens_per_sec": 10033.4775390625 + }, + "step_61900": { + "perplexity": 2.7165919760849815, + "loss": 0.9993781447410583, + "unpadded_tokens_per_sec": 10039.87890625 + }, + "step_62000": { + "perplexity": 2.7171824060263736, + "loss": 0.9995954632759094, + "unpadded_tokens_per_sec": 9781.318359375 + }, + "step_62100": { + "perplexity": 2.714529393821944, + "loss": 0.9986186027526855, + "unpadded_tokens_per_sec": 10040.4814453125 + }, + "step_62200": { + "perplexity": 2.713857204043889, + "loss": 0.9983709454536438, + "unpadded_tokens_per_sec": 10040.0224609375 + }, + "step_62300": { + "perplexity": 2.712528198255787, + "loss": 0.9978811144828796, + "unpadded_tokens_per_sec": 10024.4921875 + }, + "step_62400": { + "perplexity": 2.715344495522226, + "loss": 0.9989188313484192, + "unpadded_tokens_per_sec": 10037.2841796875 + }, + "step_62500": { + "perplexity": 
2.715829918491813, + "loss": 0.9990975856781006, + "unpadded_tokens_per_sec": 10026.3486328125 + }, + "step_62600": { + "perplexity": 2.713327011495794, + "loss": 0.9981755614280701, + "unpadded_tokens_per_sec": 10042.03515625 + }, + "step_62700": { + "perplexity": 2.7108409532222324, + "loss": 0.9972589015960693, + "unpadded_tokens_per_sec": 10035.7431640625 + }, + "step_62800": { + "perplexity": 2.7137549746008007, + "loss": 0.9983332753181458, + "unpadded_tokens_per_sec": 10038.5048828125 + }, + "step_62900": { + "perplexity": 2.7165385425155772, + "loss": 0.9993584752082825, + "unpadded_tokens_per_sec": 7941.22802734375 + }, + "step_63000": { + "perplexity": 2.7135911243649637, + "loss": 0.9982728958129883, + "unpadded_tokens_per_sec": 10038.62890625 + }, + "step_63100": { + "perplexity": 2.7166109209665086, + "loss": 0.9993851184844971, + "unpadded_tokens_per_sec": 9784.9482421875 + }, + "step_63200": { + "perplexity": 2.718046582371506, + "loss": 0.9999134540557861, + "unpadded_tokens_per_sec": 10035.70703125 + }, + "step_63300": { + "perplexity": 2.711804295053944, + "loss": 0.9976142048835754, + "unpadded_tokens_per_sec": 10033.0791015625 + }, + "step_63400": { + "perplexity": 2.711045196408047, + "loss": 0.9973342418670654, + "unpadded_tokens_per_sec": 10037.4658203125 + }, + "step_63500": { + "perplexity": 2.7104000407776407, + "loss": 0.9970962405204773, + "unpadded_tokens_per_sec": 10028.0986328125 + }, + "step_63600": { + "perplexity": 2.7087573889655103, + "loss": 0.9964900016784668, + "unpadded_tokens_per_sec": 10027.880859375 + }, + "step_63700": { + "perplexity": 2.7092616583698015, + "loss": 0.9966761469841003, + "unpadded_tokens_per_sec": 10045.0224609375 + }, + "step_63800": { + "perplexity": 2.7146399044880236, + "loss": 0.9986593127250671, + "unpadded_tokens_per_sec": 10025.65234375 + }, + "step_63900": { + "perplexity": 2.7145389399537843, + "loss": 0.9986221194267273, + "unpadded_tokens_per_sec": 10032.10546875 + }, + "step_64000": { + 
"perplexity": 2.711986303445909, + "loss": 0.9976813197135925, + "unpadded_tokens_per_sec": 10035.7451171875 + }, + "step_64100": { + "perplexity": 2.7099897286618497, + "loss": 0.9969448447227478, + "unpadded_tokens_per_sec": 10033.3154296875 + }, + "step_64200": { + "perplexity": 2.7091169720508166, + "loss": 0.9966227412223816, + "unpadded_tokens_per_sec": 10031.7587890625 + }, + "step_64300": { + "perplexity": 2.7105948800129362, + "loss": 0.9971681237220764, + "unpadded_tokens_per_sec": 10032.09375 + }, + "step_64400": { + "perplexity": 2.711622622127618, + "loss": 0.9975472092628479, + "unpadded_tokens_per_sec": 10027.349609375 + }, + "step_64500": { + "perplexity": 2.709240019522664, + "loss": 0.9966681599617004, + "unpadded_tokens_per_sec": 10033.7568359375 + }, + "step_64600": { + "perplexity": 2.7082046250900333, + "loss": 0.9962859153747559, + "unpadded_tokens_per_sec": 10030.908203125 + }, + "step_64700": { + "perplexity": 2.7089395157887646, + "loss": 0.9965572357177734, + "unpadded_tokens_per_sec": 10032.9306640625 + }, + "step_64800": { + "perplexity": 2.7056308519138716, + "loss": 0.9953351020812988, + "unpadded_tokens_per_sec": 9766.4873046875 + }, + "step_64900": { + "perplexity": 2.708329891128923, + "loss": 0.9963321685791016, + "unpadded_tokens_per_sec": 10034.32421875 + }, + "step_65000": { + "perplexity": 2.7091629930888135, + "loss": 0.9966397285461426, + "unpadded_tokens_per_sec": 10029.1357421875 + }, + "step_65100": { + "perplexity": 2.708417871385311, + "loss": 0.9963646531105042, + "unpadded_tokens_per_sec": 9974.623046875 + }, + "step_65200": { + "perplexity": 2.7070090780320184, + "loss": 0.9958443641662598, + "unpadded_tokens_per_sec": 10036.6865234375 + }, + "step_65300": { + "perplexity": 2.708697167098267, + "loss": 0.9964677691459656, + "unpadded_tokens_per_sec": 10047.0888671875 + }, + "step_65400": { + "perplexity": 2.7043668048616882, + "loss": 0.9948678016662598, + "unpadded_tokens_per_sec": 10044.0888671875 + }, + 
"step_65500": { + "perplexity": 2.7102662786650353, + "loss": 0.9970468878746033, + "unpadded_tokens_per_sec": 10020.8564453125 + }, + "step_65600": { + "perplexity": 2.7033635656719928, + "loss": 0.994496762752533, + "unpadded_tokens_per_sec": 10034.5537109375 + }, + "step_65700": { + "perplexity": 2.7093516067732915, + "loss": 0.9967093467712402, + "unpadded_tokens_per_sec": 10033.0107421875 + }, + "step_65800": { + "perplexity": 2.708531846388678, + "loss": 0.9964067339897156, + "unpadded_tokens_per_sec": 7911.75244140625 + }, + "step_65900": { + "perplexity": 2.706194704166701, + "loss": 0.9955434799194336, + "unpadded_tokens_per_sec": 10049.5244140625 + }, + "step_66000": { + "perplexity": 2.7064623170409363, + "loss": 0.995642364025116, + "unpadded_tokens_per_sec": 9773.2529296875 + }, + "step_66100": { + "perplexity": 2.70168588600716, + "loss": 0.9938759803771973, + "unpadded_tokens_per_sec": 10037.66796875 + }, + "step_66200": { + "perplexity": 2.703461214048701, + "loss": 0.9945328831672668, + "unpadded_tokens_per_sec": 10035.2568359375 + }, + "step_66300": { + "perplexity": 2.7014089233971528, + "loss": 0.9937734603881836, + "unpadded_tokens_per_sec": 10046.7197265625 + }, + "step_66400": { + "perplexity": 2.7075075350397406, + "loss": 0.996028482913971, + "unpadded_tokens_per_sec": 10031.7275390625 + }, + "step_66500": { + "perplexity": 2.7061301842262275, + "loss": 0.9955196380615234, + "unpadded_tokens_per_sec": 10034.6103515625 + }, + "step_66600": { + "perplexity": 2.7042784726374443, + "loss": 0.9948351383209229, + "unpadded_tokens_per_sec": 10041.5498046875 + }, + "step_66700": { + "perplexity": 2.704789969125138, + "loss": 0.9950242638587952, + "unpadded_tokens_per_sec": 10037.3505859375 + }, + "step_66800": { + "perplexity": 2.705677942628087, + "loss": 0.9953525066375732, + "unpadded_tokens_per_sec": 10035.4873046875 + }, + "step_66900": { + "perplexity": 2.70428250232939, + "loss": 0.9948366284370422, + "unpadded_tokens_per_sec": 
10043.1533203125 + }, + "step_67000": { + "perplexity": 2.703412228283529, + "loss": 0.9945147633552551, + "unpadded_tokens_per_sec": 10037.3818359375 + }, + "step_67100": { + "perplexity": 2.704131473580219, + "loss": 0.9947807788848877, + "unpadded_tokens_per_sec": 10038.2236328125 + }, + "step_67200": { + "perplexity": 2.7016013449904612, + "loss": 0.9938446879386902, + "unpadded_tokens_per_sec": 10036.2451171875 + }, + "step_67300": { + "perplexity": 2.7030422855128005, + "loss": 0.9943779110908508, + "unpadded_tokens_per_sec": 10038.5087890625 + }, + "step_67400": { + "perplexity": 2.7041216416915423, + "loss": 0.9947771430015564, + "unpadded_tokens_per_sec": 10041.2802734375 + }, + "step_67500": { + "perplexity": 2.6999929932997224, + "loss": 0.9932491779327393, + "unpadded_tokens_per_sec": 10033.86328125 + }, + "step_67600": { + "perplexity": 2.7011649943841425, + "loss": 0.9936831593513489, + "unpadded_tokens_per_sec": 9748.984375 + }, + "step_67700": { + "perplexity": 2.7025570540784813, + "loss": 0.9941983819007874, + "unpadded_tokens_per_sec": 10037.0498046875 + }, + "step_67800": { + "perplexity": 2.6959429583766603, + "loss": 0.991748034954071, + "unpadded_tokens_per_sec": 10035.548828125 + }, + "step_67900": { + "perplexity": 2.704534934201861, + "loss": 0.9949299693107605, + "unpadded_tokens_per_sec": 10033.5380859375 + }, + "step_68000": { + "perplexity": 2.6970873191275597, + "loss": 0.9921724200248718, + "unpadded_tokens_per_sec": 10041.6796875 + }, + "step_68100": { + "perplexity": 2.700576918176248, + "loss": 0.9934654235839844, + "unpadded_tokens_per_sec": 10037.9853515625 + }, + "step_68200": { + "perplexity": 2.6980991648397907, + "loss": 0.9925475120544434, + "unpadded_tokens_per_sec": 10031.8203125 + }, + "step_68300": { + "perplexity": 2.699386669149083, + "loss": 0.9930245876312256, + "unpadded_tokens_per_sec": 10042.84765625 + }, + "step_68400": { + "perplexity": 2.6990943369773284, + "loss": 0.9929162859916687, + 
"unpadded_tokens_per_sec": 10030.3544921875 + }, + "step_68500": { + "perplexity": 2.698425969330453, + "loss": 0.992668628692627, + "unpadded_tokens_per_sec": 10040.6767578125 + }, + "step_68600": { + "perplexity": 2.698883916024551, + "loss": 0.9928383231163025, + "unpadded_tokens_per_sec": 10032.7255859375 + }, + "step_68700": { + "perplexity": 2.6999995915248385, + "loss": 0.993251621723175, + "unpadded_tokens_per_sec": 10031.0380859375 + }, + "step_68800": { + "perplexity": 2.7003691178675107, + "loss": 0.9933884739875793, + "unpadded_tokens_per_sec": 9365.9814453125 + }, + "step_68900": { + "perplexity": 2.6976965035064544, + "loss": 0.9923982620239258, + "unpadded_tokens_per_sec": 8200.8671875 + }, + "step_69000": { + "perplexity": 2.7002767315408027, + "loss": 0.9933542609214783, + "unpadded_tokens_per_sec": 10044.9853515625 + }, + "step_69100": { + "perplexity": 2.696830920819693, + "loss": 0.9920773506164551, + "unpadded_tokens_per_sec": 9756.6884765625 + }, + "step_69200": { + "perplexity": 2.698404095353007, + "loss": 0.9926605224609375, + "unpadded_tokens_per_sec": 10034.173828125 + }, + "step_69300": { + "perplexity": 2.6988737814845014, + "loss": 0.9928345680236816, + "unpadded_tokens_per_sec": 10036.32421875 + }, + "step_69400": { + "perplexity": 2.697831574890911, + "loss": 0.9924483299255371, + "unpadded_tokens_per_sec": 10048.1337890625 + }, + "step_69500": { + "perplexity": 2.6973402047817974, + "loss": 0.9922661781311035, + "unpadded_tokens_per_sec": 10033.9013671875 + }, + "step_69600": { + "perplexity": 2.6935415415556005, + "loss": 0.9908568859100342, + "unpadded_tokens_per_sec": 10039.84765625 + }, + "step_69700": { + "perplexity": 2.6980468990922817, + "loss": 0.9925281405448914, + "unpadded_tokens_per_sec": 10036.578125 + }, + "step_69800": { + "perplexity": 2.6959039108139065, + "loss": 0.9917335510253906, + "unpadded_tokens_per_sec": 10031.94921875 + }, + "step_69900": { + "perplexity": 2.6981482151545406, + "loss": 0.9925656914710999, 
+ "unpadded_tokens_per_sec": 10037.59765625 + }, + "step_70000": { + "perplexity": 2.696533882915811, + "loss": 0.9919672012329102, + "unpadded_tokens_per_sec": 10034.2880859375 + }, + "step_70100": { + "perplexity": 2.6922292242763217, + "loss": 0.9903695583343506, + "unpadded_tokens_per_sec": 9961.5087890625 + }, + "step_70200": { + "perplexity": 2.694940755978398, + "loss": 0.9913762211799622, + "unpadded_tokens_per_sec": 10039.16015625 + }, + "step_70300": { + "perplexity": 2.694071400700649, + "loss": 0.991053581237793, + "unpadded_tokens_per_sec": 10035.2373046875 + }, + "step_70400": { + "perplexity": 2.69057304815905, + "loss": 0.9897541999816895, + "unpadded_tokens_per_sec": 10028.765625 + }, + "step_70500": { + "perplexity": 2.694294775584388, + "loss": 0.9911364912986755, + "unpadded_tokens_per_sec": 10029.5439453125 + }, + "step_70600": { + "perplexity": 2.689513046645315, + "loss": 0.9893601536750793, + "unpadded_tokens_per_sec": 9750.5146484375 + }, + "step_70700": { + "perplexity": 2.694624331512916, + "loss": 0.9912588000297546, + "unpadded_tokens_per_sec": 10033.8369140625 + }, + "step_70800": { + "perplexity": 2.695682330623762, + "loss": 0.9916513562202454, + "unpadded_tokens_per_sec": 10034.4111328125 + }, + "step_70900": { + "perplexity": 2.696856157690677, + "loss": 0.9920867085456848, + "unpadded_tokens_per_sec": 10038.7333984375 + }, + "step_71000": { + "perplexity": 2.6956386273269652, + "loss": 0.9916351437568665, + "unpadded_tokens_per_sec": 10039.3154296875 + }, + "step_71100": { + "perplexity": 2.692186218829568, + "loss": 0.9903535842895508, + "unpadded_tokens_per_sec": 10041.8896484375 + }, + "step_71200": { + "perplexity": 2.6921841327619345, + "loss": 0.9903528094291687, + "unpadded_tokens_per_sec": 10037.6171875 + }, + "step_71300": { + "perplexity": 2.6922216822266574, + "loss": 0.9903667569160461, + "unpadded_tokens_per_sec": 10041.0361328125 + }, + "step_71400": { + "perplexity": 2.6939249564790178, + "loss": 0.9909992218017578, 
+ "unpadded_tokens_per_sec": 10035.74609375 + }, + "step_71500": { + "perplexity": 2.6897474263781196, + "loss": 0.989447295665741, + "unpadded_tokens_per_sec": 10042.0380859375 + }, + "step_71600": { + "perplexity": 2.692050627795441, + "loss": 0.9903032183647156, + "unpadded_tokens_per_sec": 10036.6220703125 + }, + "step_71700": { + "perplexity": 2.691023406633596, + "loss": 0.9899215698242188, + "unpadded_tokens_per_sec": 10034.1005859375 + }, + "step_71800": { + "perplexity": 2.6933705638024894, + "loss": 0.9907934069633484, + "unpadded_tokens_per_sec": 10037.87109375 + }, + "step_71900": { + "perplexity": 2.689877289879291, + "loss": 0.989495575428009, + "unpadded_tokens_per_sec": 10036.6181640625 + }, + "step_72000": { + "perplexity": 2.6931915705542173, + "loss": 0.9907269477844238, + "unpadded_tokens_per_sec": 7668.80859375 + }, + "step_72100": { + "perplexity": 2.693389988897418, + "loss": 0.9908006191253662, + "unpadded_tokens_per_sec": 10053.0869140625 + }, + "step_72200": { + "perplexity": 2.69144608722277, + "loss": 0.9900786280632019, + "unpadded_tokens_per_sec": 9748.3408203125 + }, + "step_72300": { + "perplexity": 2.6894053221830627, + "loss": 0.9893200993537903, + "unpadded_tokens_per_sec": 10031.57421875 + }, + "step_72400": { + "perplexity": 2.690305082146444, + "loss": 0.9896546006202698, + "unpadded_tokens_per_sec": 10031.1513671875 + }, + "step_72500": { + "perplexity": 2.6888475323844814, + "loss": 0.9891126751899719, + "unpadded_tokens_per_sec": 10039.4326171875 + }, + "step_72600": { + "perplexity": 2.6893665296090115, + "loss": 0.9893056750297546, + "unpadded_tokens_per_sec": 10035.4443359375 + }, + "step_72700": { + "perplexity": 2.6897315546023997, + "loss": 0.9894413948059082, + "unpadded_tokens_per_sec": 10036.1767578125 + }, + "step_72800": { + "perplexity": 2.687460451918109, + "loss": 0.9885966777801514, + "unpadded_tokens_per_sec": 10037.78515625 + }, + "step_72900": { + "perplexity": 2.6903010732824617, + "loss": 
0.9896531105041504, + "unpadded_tokens_per_sec": 10041.89453125 + }, + "step_73000": { + "perplexity": 2.689164721071724, + "loss": 0.9892306327819824, + "unpadded_tokens_per_sec": 10037.49609375 + }, + "step_73100": { + "perplexity": 2.6884747753171685, + "loss": 0.9889740347862244, + "unpadded_tokens_per_sec": 10043.6767578125 + }, + "step_73200": { + "perplexity": 2.6879501824629095, + "loss": 0.9887788891792297, + "unpadded_tokens_per_sec": 10033.8544921875 + }, + "step_73300": { + "perplexity": 2.688822370457296, + "loss": 0.9891033172607422, + "unpadded_tokens_per_sec": 10039.5791015625 + }, + "step_73400": { + "perplexity": 2.6889787949181816, + "loss": 0.989161491394043, + "unpadded_tokens_per_sec": 10043.5859375 + }, + "step_73500": { + "perplexity": 2.6878960305696533, + "loss": 0.9887587428092957, + "unpadded_tokens_per_sec": 10042.3984375 + }, + "step_73600": { + "perplexity": 2.6841129073679677, + "loss": 0.9873502850532532, + "unpadded_tokens_per_sec": 10048.1845703125 + }, + "step_73700": { + "perplexity": 2.6835400600849777, + "loss": 0.9871368408203125, + "unpadded_tokens_per_sec": 10040.2548828125 + }, + "step_73800": { + "perplexity": 2.686129322924631, + "loss": 0.9881012439727783, + "unpadded_tokens_per_sec": 9732.18359375 + }, + "step_73900": { + "perplexity": 2.691230006536487, + "loss": 0.9899983406066895, + "unpadded_tokens_per_sec": 10034.50390625 + }, + "step_74000": { + "perplexity": 2.6828728255913745, + "loss": 0.9868881702423096, + "unpadded_tokens_per_sec": 10041.3896484375 + }, + "step_74100": { + "perplexity": 2.6834778396914714, + "loss": 0.9871136546134949, + "unpadded_tokens_per_sec": 10038.36328125 + }, + "step_74200": { + "perplexity": 2.686485581917355, + "loss": 0.9882338643074036, + "unpadded_tokens_per_sec": 10035.0537109375 + }, + "step_74300": { + "perplexity": 2.6869626426591062, + "loss": 0.9884114265441895, + "unpadded_tokens_per_sec": 10038.53125 + }, + "step_74400": { + "perplexity": 2.686687829954693, + "loss": 
0.9883091449737549, + "unpadded_tokens_per_sec": 10040.140625 + }, + "step_74500": { + "perplexity": 2.6874180031950736, + "loss": 0.9885808825492859, + "unpadded_tokens_per_sec": 10033.0673828125 + }, + "step_74600": { + "perplexity": 2.6853250718892423, + "loss": 0.9878017902374268, + "unpadded_tokens_per_sec": 10036.1513671875 + }, + "step_74700": { + "perplexity": 2.6892532007900694, + "loss": 0.9892635345458984, + "unpadded_tokens_per_sec": 10037.9912109375 + }, + "step_74800": { + "perplexity": 2.6842392989648283, + "loss": 0.9873973727226257, + "unpadded_tokens_per_sec": 10032.5 + }, + "step_74900": { + "perplexity": 2.6849596846855386, + "loss": 0.9876657128334045, + "unpadded_tokens_per_sec": 10035.3935546875 + }, + "step_75000": { + "perplexity": 2.6859884335296145, + "loss": 0.988048791885376, + "unpadded_tokens_per_sec": 10031.8037109375 + }, + "step_75100": { + "perplexity": 2.6838505438133433, + "loss": 0.9872525334358215, + "unpadded_tokens_per_sec": 9964.4560546875 + }, + "step_75200": { + "perplexity": 2.68336044061593, + "loss": 0.9870699048042297, + "unpadded_tokens_per_sec": 10037.283203125 + }, + "step_75300": { + "perplexity": 2.682531755655922, + "loss": 0.9867610335350037, + "unpadded_tokens_per_sec": 7646.90283203125 + }, + "step_75400": { + "perplexity": 2.686226028558985, + "loss": 0.9881372451782227, + "unpadded_tokens_per_sec": 10039.7216796875 + }, + "step_75500": { + "perplexity": 2.6841289059752853, + "loss": 0.9873562455177307, + "unpadded_tokens_per_sec": 9740.9599609375 + }, + "step_75600": { + "perplexity": 2.684465378777628, + "loss": 0.9874815940856934, + "unpadded_tokens_per_sec": 10045.1142578125 + }, + "step_75700": { + "perplexity": 2.6871644461087567, + "loss": 0.9884865283966064, + "unpadded_tokens_per_sec": 10045.2041015625 + }, + "step_75800": { + "perplexity": 2.686319535330579, + "loss": 0.9881720542907715, + "unpadded_tokens_per_sec": 10036.6025390625 + }, + "step_75900": { + "perplexity": 2.6865781369285435, + 
"loss": 0.9882683157920837, + "unpadded_tokens_per_sec": 10037.294921875 + }, + "step_76000": { + "perplexity": 2.6807952582941565, + "loss": 0.9861134886741638, + "unpadded_tokens_per_sec": 10033.8017578125 + }, + "step_76100": { + "perplexity": 2.6842776975906535, + "loss": 0.9874116778373718, + "unpadded_tokens_per_sec": 10041.5625 + }, + "step_76200": { + "perplexity": 2.680918137966237, + "loss": 0.9861593246459961, + "unpadded_tokens_per_sec": 10039.0927734375 + }, + "step_76300": { + "perplexity": 2.683061048243062, + "loss": 0.9869583249092102, + "unpadded_tokens_per_sec": 10036.9228515625 + }, + "step_76400": { + "perplexity": 2.6846117888139274, + "loss": 0.9875361323356628, + "unpadded_tokens_per_sec": 10038.169921875 + }, + "step_76500": { + "perplexity": 2.681807225909252, + "loss": 0.9864909052848816, + "unpadded_tokens_per_sec": 10037.1728515625 + }, + "step_76600": { + "perplexity": 2.680330955017113, + "loss": 0.9859402775764465, + "unpadded_tokens_per_sec": 10034.6796875 + }, + "step_76700": { + "perplexity": 2.6841609034759992, + "loss": 0.9873681664466858, + "unpadded_tokens_per_sec": 10037.1279296875 + }, + "step_76800": { + "perplexity": 2.6846964382591003, + "loss": 0.987567663192749, + "unpadded_tokens_per_sec": 10037.4013671875 + }, + "step_76900": { + "perplexity": 2.6842533783971847, + "loss": 0.987402617931366, + "unpadded_tokens_per_sec": 10036.263671875 + }, + "step_77000": { + "perplexity": 2.68216275179665, + "loss": 0.9866234660148621, + "unpadded_tokens_per_sec": 9710.2919921875 + }, + "step_77100": { + "perplexity": 2.6805079751357206, + "loss": 0.9860063195228577, + "unpadded_tokens_per_sec": 10038.8857421875 + }, + "step_77200": { + "perplexity": 2.6797167475310175, + "loss": 0.9857110977172852, + "unpadded_tokens_per_sec": 10035.14453125 + }, + "step_77300": { + "perplexity": 2.679245923828369, + "loss": 0.9855353832244873, + "unpadded_tokens_per_sec": 10036.2900390625 + }, + "step_77400": { + "perplexity": 2.6842452187171624, 
+ "loss": 0.9873995780944824, + "unpadded_tokens_per_sec": 10032.6923828125 + }, + "step_77500": { + "perplexity": 2.6812062641454464, + "loss": 0.9862667918205261, + "unpadded_tokens_per_sec": 10033.197265625 + }, + "step_77600": { + "perplexity": 2.678180487644783, + "loss": 0.9851376414299011, + "unpadded_tokens_per_sec": 10040.39453125 + }, + "step_77700": { + "perplexity": 2.680579872926495, + "loss": 0.9860331416130066, + "unpadded_tokens_per_sec": 10035.6982421875 + }, + "step_77800": { + "perplexity": 2.6790247546878647, + "loss": 0.9854528307914734, + "unpadded_tokens_per_sec": 10031.4130859375 + }, + "step_77900": { + "perplexity": 2.6817049250335825, + "loss": 0.9864527583122253, + "unpadded_tokens_per_sec": 10043.9501953125 + }, + "step_78000": { + "perplexity": 2.6764164967381827, + "loss": 0.984478771686554, + "unpadded_tokens_per_sec": 10034.7373046875 + }, + "step_78100": { + "perplexity": 2.678539045109043, + "loss": 0.9852715134620667, + "unpadded_tokens_per_sec": 10027.0302734375 + }, + "step_78200": { + "perplexity": 2.6785318607171043, + "loss": 0.9852688312530518, + "unpadded_tokens_per_sec": 10039.072265625 + }, + "step_78300": { + "perplexity": 2.678722812439559, + "loss": 0.9853401184082031, + "unpadded_tokens_per_sec": 10036.8466796875 + }, + "step_78400": { + "perplexity": 2.6731794226524013, + "loss": 0.9832685589790344, + "unpadded_tokens_per_sec": 10027.9931640625 + }, + "step_78500": { + "perplexity": 2.678824201202547, + "loss": 0.9853779673576355, + "unpadded_tokens_per_sec": 10038.474609375 + }, + "step_78600": { + "perplexity": 2.6764311732490427, + "loss": 0.9844842553138733, + "unpadded_tokens_per_sec": 10042.3203125 + }, + "step_78700": { + "perplexity": 2.6747016540757906, + "loss": 0.9838378429412842, + "unpadded_tokens_per_sec": 9238.7880859375 + }, + "step_78800": { + "perplexity": 2.6756215329806694, + "loss": 0.9841817021369934, + "unpadded_tokens_per_sec": 7792.29296875 + }, + "step_78900": { + "perplexity": 
2.6782476935585704, + "loss": 0.9851627349853516, + "unpadded_tokens_per_sec": 10048.3525390625 + }, + "step_79000": { + "perplexity": 2.675872884432413, + "loss": 0.9842756390571594, + "unpadded_tokens_per_sec": 9717.3310546875 + }, + "step_79100": { + "perplexity": 2.6786370740709637, + "loss": 0.9853081107139587, + "unpadded_tokens_per_sec": 10034.3251953125 + }, + "step_79200": { + "perplexity": 2.678029480033387, + "loss": 0.9850812554359436, + "unpadded_tokens_per_sec": 10039.271484375 + }, + "step_79300": { + "perplexity": 2.6798694476095397, + "loss": 0.9857680797576904, + "unpadded_tokens_per_sec": 10037.3876953125 + }, + "step_79400": { + "perplexity": 2.6784145183765156, + "loss": 0.9852250218391418, + "unpadded_tokens_per_sec": 10030.8896484375 + }, + "step_79500": { + "perplexity": 2.6756531101022647, + "loss": 0.9841935038566589, + "unpadded_tokens_per_sec": 10036.6103515625 + }, + "step_79600": { + "perplexity": 2.670682870804416, + "loss": 0.9823341965675354, + "unpadded_tokens_per_sec": 10046.4208984375 + }, + "step_79700": { + "perplexity": 2.6755682673675527, + "loss": 0.9841617941856384, + "unpadded_tokens_per_sec": 10037.26953125 + }, + "step_79800": { + "perplexity": 2.6754426029973417, + "loss": 0.9841148257255554, + "unpadded_tokens_per_sec": 10039.5009765625 + }, + "step_79900": { + "perplexity": 2.6745926098435406, + "loss": 0.9837970733642578, + "unpadded_tokens_per_sec": 10037.3115234375 + }, + "step_80000": { + "perplexity": 2.6784288865501877, + "loss": 0.9852303862571716, + "unpadded_tokens_per_sec": 10031.6640625 + }, + "step_80100": { + "perplexity": 2.6721425193877497, + "loss": 0.9828805923461914, + "unpadded_tokens_per_sec": 9976.3427734375 + }, + "step_80200": { + "perplexity": 2.6749382507081974, + "loss": 0.9839262962341309, + "unpadded_tokens_per_sec": 10043.916015625 + }, + "step_80300": { + "perplexity": 2.674456310804828, + "loss": 0.9837461113929749, + "unpadded_tokens_per_sec": 9691.2919921875 + }, + "step_80400": { + 
"perplexity": 2.671568405130391, + "loss": 0.9826657176017761, + "unpadded_tokens_per_sec": 10037.4287109375 + }, + "step_80500": { + "perplexity": 2.672113691292131, + "loss": 0.9828698039054871, + "unpadded_tokens_per_sec": 10032.9423828125 + }, + "step_80600": { + "perplexity": 2.6729759609928023, + "loss": 0.9831924438476562, + "unpadded_tokens_per_sec": 10040.625 + }, + "step_80700": { + "perplexity": 2.6724494543620056, + "loss": 0.9829954504966736, + "unpadded_tokens_per_sec": 10041.8515625 + }, + "step_80800": { + "perplexity": 2.6771935052873603, + "loss": 0.9847690463066101, + "unpadded_tokens_per_sec": 10031.2939453125 + }, + "step_80900": { + "perplexity": 2.673398037715578, + "loss": 0.9833503365516663, + "unpadded_tokens_per_sec": 10037.9697265625 + }, + "step_81000": { + "perplexity": 2.6738442463821626, + "loss": 0.9835172295570374, + "unpadded_tokens_per_sec": 10039.6328125 + }, + "step_81100": { + "perplexity": 2.6707127977716096, + "loss": 0.9823454022407532, + "unpadded_tokens_per_sec": 10040.6123046875 + }, + "step_81200": { + "perplexity": 2.6730108526909295, + "loss": 0.9832054972648621, + "unpadded_tokens_per_sec": 10046.8095703125 + }, + "step_81300": { + "perplexity": 2.6750343939987293, + "loss": 0.9839622378349304, + "unpadded_tokens_per_sec": 10039.7451171875 + }, + "step_81400": { + "perplexity": 2.671567131227609, + "loss": 0.9826652407646179, + "unpadded_tokens_per_sec": 10037.7041015625 + }, + "step_81500": { + "perplexity": 2.6724571003121627, + "loss": 0.9829983115196228, + "unpadded_tokens_per_sec": 10028.4404296875 + }, + "step_81600": { + "perplexity": 2.6722731257061074, + "loss": 0.9829294681549072, + "unpadded_tokens_per_sec": 10040.5224609375 + }, + "step_81700": { + "perplexity": 2.669819431222543, + "loss": 0.9820108413696289, + "unpadded_tokens_per_sec": 10034.98828125 + }, + "step_81800": { + "perplexity": 2.671757586436614, + "loss": 0.9827365279197693, + "unpadded_tokens_per_sec": 10030.0498046875 + }, + "step_81900": 
{ + "perplexity": 2.6762994065882912, + "loss": 0.9844350218772888, + "unpadded_tokens_per_sec": 10039.4072265625 + }, + "step_82000": { + "perplexity": 2.6698894509417865, + "loss": 0.9820370674133301, + "unpadded_tokens_per_sec": 10033.828125 + }, + "step_82100": { + "perplexity": 2.6668619804544473, + "loss": 0.9809024930000305, + "unpadded_tokens_per_sec": 10031.8271484375 + }, + "step_82200": { + "perplexity": 2.675411028360078, + "loss": 0.9841030240058899, + "unpadded_tokens_per_sec": 10039.3525390625 + }, + "step_82300": { + "perplexity": 2.666405493986454, + "loss": 0.9807313084602356, + "unpadded_tokens_per_sec": 10038.2998046875 + }, + "step_82400": { + "perplexity": 2.670776791667106, + "loss": 0.9823693633079529, + "unpadded_tokens_per_sec": 7504.12744140625 + }, + "step_82500": { + "perplexity": 2.6714736602684135, + "loss": 0.9826302528381348, + "unpadded_tokens_per_sec": 10046.8466796875 + }, + "step_82600": { + "perplexity": 2.6698647846948207, + "loss": 0.9820278286933899, + "unpadded_tokens_per_sec": 9705.1806640625 + }, + "step_82700": { + "perplexity": 2.6689863374965603, + "loss": 0.981698751449585, + "unpadded_tokens_per_sec": 10038.341796875 + }, + "step_82800": { + "perplexity": 2.667925776271558, + "loss": 0.9813013076782227, + "unpadded_tokens_per_sec": 10036.59765625 + }, + "step_82900": { + "perplexity": 2.668517399116824, + "loss": 0.9815230369567871, + "unpadded_tokens_per_sec": 10035.3740234375 + }, + "step_83000": { + "perplexity": 2.670451744035353, + "loss": 0.9822476506233215, + "unpadded_tokens_per_sec": 10041.1904296875 + }, + "step_83100": { + "perplexity": 2.668379978243986, + "loss": 0.9814715385437012, + "unpadded_tokens_per_sec": 10030.34375 + }, + "step_83200": { + "perplexity": 2.6665733295092178, + "loss": 0.9807942509651184, + "unpadded_tokens_per_sec": 10034.8662109375 + }, + "step_83300": { + "perplexity": 2.66751155932903, + "loss": 0.9811460375785828, + "unpadded_tokens_per_sec": 10039.1982421875 + }, + 
"step_83400": { + "perplexity": 2.6679249811678356, + "loss": 0.9813010096549988, + "unpadded_tokens_per_sec": 10033.98828125 + }, + "step_83500": { + "perplexity": 2.6699118894676124, + "loss": 0.9820454716682434, + "unpadded_tokens_per_sec": 10039.6533203125 + }, + "step_83600": { + "perplexity": 2.6632719626321157, + "loss": 0.9795554280281067, + "unpadded_tokens_per_sec": 10038.7626953125 + }, + "step_83700": { + "perplexity": 2.6683335366786416, + "loss": 0.9814541339874268, + "unpadded_tokens_per_sec": 10044.6591796875 + }, + "step_83800": { + "perplexity": 2.6647853756180706, + "loss": 0.9801235198974609, + "unpadded_tokens_per_sec": 10033.671875 + }, + "step_83900": { + "perplexity": 2.6656799176500927, + "loss": 0.9804591536521912, + "unpadded_tokens_per_sec": 10031.8408203125 + }, + "step_84000": { + "perplexity": 2.662993700025908, + "loss": 0.9794509410858154, + "unpadded_tokens_per_sec": 9688.1220703125 + }, + "step_84100": { + "perplexity": 2.6664352140905647, + "loss": 0.9807424545288086, + "unpadded_tokens_per_sec": 10043.28125 + }, + "step_84200": { + "perplexity": 2.6685690928278016, + "loss": 0.9815424084663391, + "unpadded_tokens_per_sec": 10040.0849609375 + }, + "step_84300": { + "perplexity": 2.667016491485159, + "loss": 0.9809604287147522, + "unpadded_tokens_per_sec": 10044.1279296875 + }, + "step_84400": { + "perplexity": 2.6640961270087535, + "loss": 0.9798648357391357, + "unpadded_tokens_per_sec": 10034.4326171875 + }, + "step_84500": { + "perplexity": 2.667463224958942, + "loss": 0.981127917766571, + "unpadded_tokens_per_sec": 10043.1435546875 + }, + "step_84600": { + "perplexity": 2.6645787411349593, + "loss": 0.9800459742546082, + "unpadded_tokens_per_sec": 10039.9970703125 + }, + "step_84700": { + "perplexity": 2.665693581958908, + "loss": 0.9804642796516418, + "unpadded_tokens_per_sec": 10037.392578125 + }, + "step_84800": { + "perplexity": 2.6652873370198353, + "loss": 0.9803118705749512, + "unpadded_tokens_per_sec": 10034.19140625 + 
}, + "step_84900": { + "perplexity": 2.6628489451246424, + "loss": 0.9793965816497803, + "unpadded_tokens_per_sec": 10037.724609375 + }, + "step_85000": { + "perplexity": 2.6646800710314613, + "loss": 0.9800840020179749, + "unpadded_tokens_per_sec": 10036.01953125 + }, + "step_85100": { + "perplexity": 2.6630971919062385, + "loss": 0.979489803314209, + "unpadded_tokens_per_sec": 9966.0810546875 + }, + "step_85200": { + "perplexity": 2.6630762392376637, + "loss": 0.9794819355010986, + "unpadded_tokens_per_sec": 10034.92578125 + }, + "step_85300": { + "perplexity": 2.660531874999062, + "loss": 0.9785260558128357, + "unpadded_tokens_per_sec": 10038.8671875 + }, + "step_85400": { + "perplexity": 2.6628602141382354, + "loss": 0.9794008135795593, + "unpadded_tokens_per_sec": 10037.5712890625 + }, + "step_85500": { + "perplexity": 2.662634842927168, + "loss": 0.9793161749839783, + "unpadded_tokens_per_sec": 10043.38671875 + }, + "step_85600": { + "perplexity": 2.663899072800238, + "loss": 0.9797908663749695, + "unpadded_tokens_per_sec": 10049.34765625 + }, + "step_85700": { + "perplexity": 2.6645523769346773, + "loss": 0.9800360798835754, + "unpadded_tokens_per_sec": 10037.0068359375 + }, + "step_85800": { + "perplexity": 2.6623501406544117, + "loss": 0.9792092442512512, + "unpadded_tokens_per_sec": 10041.0166015625 + }, + "step_85900": { + "perplexity": 2.66327735991248, + "loss": 0.979557454586029, + "unpadded_tokens_per_sec": 10040.3388671875 + }, + "step_86000": { + "perplexity": 2.6630229059160713, + "loss": 0.9794619083404541, + "unpadded_tokens_per_sec": 10034.4541015625 + }, + "step_86100": { + "perplexity": 2.6598006042912172, + "loss": 0.9782511591911316, + "unpadded_tokens_per_sec": 9543.4794921875 + }, + "step_86200": { + "perplexity": 2.664718507517467, + "loss": 0.9800984263420105, + "unpadded_tokens_per_sec": 7732.15087890625 + }, + "step_86300": { + "perplexity": 2.6641039078527773, + "loss": 0.9798677563667297, + "unpadded_tokens_per_sec": 10042.990234375 
+ }, + "step_86400": { + "perplexity": 2.661565700621906, + "loss": 0.9789145588874817, + "unpadded_tokens_per_sec": 9693.7138671875 + }, + "step_86500": { + "perplexity": 2.660937553711557, + "loss": 0.9786785244941711, + "unpadded_tokens_per_sec": 10041.9638671875 + }, + "step_86600": { + "perplexity": 2.6635310444279474, + "loss": 0.9796527028083801, + "unpadded_tokens_per_sec": 10037.767578125 + }, + "step_86700": { + "perplexity": 2.6604229327302016, + "loss": 0.978485107421875, + "unpadded_tokens_per_sec": 10037.0263671875 + }, + "step_86800": { + "perplexity": 2.657521192973903, + "loss": 0.9773938059806824, + "unpadded_tokens_per_sec": 10029.0673828125 + }, + "step_86900": { + "perplexity": 2.659688045779096, + "loss": 0.9782088398933411, + "unpadded_tokens_per_sec": 10037.7294921875 + }, + "step_87000": { + "perplexity": 2.6603163734293838, + "loss": 0.9784450531005859, + "unpadded_tokens_per_sec": 10049.2080078125 + }, + "step_87100": { + "perplexity": 2.6588232480270952, + "loss": 0.9778836369514465, + "unpadded_tokens_per_sec": 10046.23046875 + }, + "step_87200": { + "perplexity": 2.66382254157426, + "loss": 0.9797621369361877, + "unpadded_tokens_per_sec": 10038.5888671875 + }, + "step_87300": { + "perplexity": 2.6605142726709285, + "loss": 0.9785194396972656, + "unpadded_tokens_per_sec": 10042.8017578125 + }, + "step_87400": { + "perplexity": 2.6642944662280814, + "loss": 0.9799392819404602, + "unpadded_tokens_per_sec": 10034.2412109375 + }, + "step_87500": { + "perplexity": 2.6599239485248893, + "loss": 0.9782975316047668, + "unpadded_tokens_per_sec": 10038.67578125 + }, + "step_87600": { + "perplexity": 2.6576645493894033, + "loss": 0.9774477481842041, + "unpadded_tokens_per_sec": 10036.0712890625 + }, + "step_87700": { + "perplexity": 2.6569665319400024, + "loss": 0.977185070514679, + "unpadded_tokens_per_sec": 9670.2626953125 + }, + "step_87800": { + "perplexity": 2.657884905653748, + "loss": 0.9775306582450867, + "unpadded_tokens_per_sec": 
10037.7080078125 + }, + "step_87900": { + "perplexity": 2.659152744651405, + "loss": 0.9780075550079346, + "unpadded_tokens_per_sec": 10038.7451171875 + }, + "step_88000": { + "perplexity": 2.659505901320718, + "loss": 0.9781403541564941, + "unpadded_tokens_per_sec": 10042.1845703125 + }, + "step_88100": { + "perplexity": 2.656748310423135, + "loss": 0.9771029353141785, + "unpadded_tokens_per_sec": 10031.8193359375 + }, + "step_88200": { + "perplexity": 2.6598947766215946, + "loss": 0.9782865643501282, + "unpadded_tokens_per_sec": 10036.8291015625 + }, + "step_88300": { + "perplexity": 2.65586563053011, + "loss": 0.9767706394195557, + "unpadded_tokens_per_sec": 10048.5830078125 + }, + "step_88400": { + "perplexity": 2.6579541370942192, + "loss": 0.9775567054748535, + "unpadded_tokens_per_sec": 10041.8994140625 + }, + "step_88500": { + "perplexity": 2.655644491944399, + "loss": 0.9766873717308044, + "unpadded_tokens_per_sec": 10036.5849609375 + }, + "step_88600": { + "perplexity": 2.6553097323463377, + "loss": 0.9765613079071045, + "unpadded_tokens_per_sec": 10042.7255859375 + }, + "step_88700": { + "perplexity": 2.6590080400473983, + "loss": 0.9779531359672546, + "unpadded_tokens_per_sec": 10035.1396484375 + }, + "step_88800": { + "perplexity": 2.658226486048992, + "loss": 0.9776591658592224, + "unpadded_tokens_per_sec": 10037.9609375 + }, + "step_88900": { + "perplexity": 2.6549617220771857, + "loss": 0.9764302372932434, + "unpadded_tokens_per_sec": 10038.5126953125 + }, + "step_89000": { + "perplexity": 2.6627741899180446, + "loss": 0.9793685078620911, + "unpadded_tokens_per_sec": 10038.869140625 + }, + "step_89100": { + "perplexity": 2.6557670102603694, + "loss": 0.9767335057258606, + "unpadded_tokens_per_sec": 10040.5185546875 + }, + "step_89200": { + "perplexity": 2.661309823895564, + "loss": 0.978818416595459, + "unpadded_tokens_per_sec": 10038.9267578125 + }, + "step_89300": { + "perplexity": 2.658677293597063, + "loss": 0.9778287410736084, + 
"unpadded_tokens_per_sec": 10038.6845703125 + }, + "step_89400": { + "perplexity": 2.65546499857196, + "loss": 0.9766197800636292, + "unpadded_tokens_per_sec": 10038.3154296875 + }, + "step_89500": { + "perplexity": 2.657236087176401, + "loss": 0.9772865176200867, + "unpadded_tokens_per_sec": 10041.4033203125 + }, + "step_89600": { + "perplexity": 2.654612333373556, + "loss": 0.9762986302375793, + "unpadded_tokens_per_sec": 10045.46875 + }, + "step_89700": { + "perplexity": 2.658682523096223, + "loss": 0.977830708026886, + "unpadded_tokens_per_sec": 10035.7978515625 + }, + "step_89800": { + "perplexity": 2.6524755440114505, + "loss": 0.9754933714866638, + "unpadded_tokens_per_sec": 10035.6533203125 + }, + "step_89900": { + "perplexity": 2.6532561978881346, + "loss": 0.9757876396179199, + "unpadded_tokens_per_sec": 10036.197265625 + }, + "step_90000": { + "perplexity": 2.6593182215602953, + "loss": 0.9780697822570801, + "unpadded_tokens_per_sec": 10044.810546875 + }, + "step_90100": { + "perplexity": 2.653579468807735, + "loss": 0.9759094715118408, + "unpadded_tokens_per_sec": 7304.01513671875 + }, + "step_90200": { + "perplexity": 2.656381903288154, + "loss": 0.9769650101661682, + "unpadded_tokens_per_sec": 10043.8740234375 + }, + "step_90300": { + "perplexity": 2.6563057463512907, + "loss": 0.9769363403320312, + "unpadded_tokens_per_sec": 10038.625 + }, + "step_90400": { + "perplexity": 2.656600411400291, + "loss": 0.9770472645759583, + "unpadded_tokens_per_sec": 9674.8994140625 + }, + "step_90500": { + "perplexity": 2.654440187734466, + "loss": 0.9762337803840637, + "unpadded_tokens_per_sec": 10040.3857421875 + }, + "step_90600": { + "perplexity": 2.650200198666391, + "loss": 0.9746351838111877, + "unpadded_tokens_per_sec": 10044.9443359375 + }, + "step_90700": { + "perplexity": 2.6538162533669496, + "loss": 0.9759986996650696, + "unpadded_tokens_per_sec": 10042.494140625 + }, + "step_90800": { + "perplexity": 2.655898557535136, + "loss": 0.976783037185669, + 
"unpadded_tokens_per_sec": 10044.67578125 + }, + "step_90900": { + "perplexity": 2.654218376818305, + "loss": 0.9761502146720886, + "unpadded_tokens_per_sec": 10034.2978515625 + }, + "step_91000": { + "perplexity": 2.6541900584992852, + "loss": 0.9761395454406738, + "unpadded_tokens_per_sec": 10038.05078125 + }, + "step_91100": { + "perplexity": 2.653728623218379, + "loss": 0.975965678691864, + "unpadded_tokens_per_sec": 10034.1259765625 + }, + "step_91200": { + "perplexity": 2.653616321662796, + "loss": 0.9759233593940735, + "unpadded_tokens_per_sec": 10039.0947265625 + }, + "step_91300": { + "perplexity": 2.656296246678652, + "loss": 0.9769327640533447, + "unpadded_tokens_per_sec": 10038.0068359375 + }, + "step_91400": { + "perplexity": 2.652301639863701, + "loss": 0.9754278063774109, + "unpadded_tokens_per_sec": 10040.6875 + }, + "step_91500": { + "perplexity": 2.6538951862485485, + "loss": 0.9760284423828125, + "unpadded_tokens_per_sec": 9662.6640625 + }, + "step_91600": { + "perplexity": 2.653263630779025, + "loss": 0.9757904410362244, + "unpadded_tokens_per_sec": 10043.806640625 + }, + "step_91700": { + "perplexity": 2.6509191914419636, + "loss": 0.9749064445495605, + "unpadded_tokens_per_sec": 10038.8076171875 + }, + "step_91800": { + "perplexity": 2.6527199776609516, + "loss": 0.9755855202674866, + "unpadded_tokens_per_sec": 10045.3759765625 + }, + "step_91900": { + "perplexity": 2.6507624530358482, + "loss": 0.9748473167419434, + "unpadded_tokens_per_sec": 10030.6259765625 + }, + "step_92000": { + "perplexity": 2.653396477447089, + "loss": 0.9758405089378357, + "unpadded_tokens_per_sec": 10033.8466796875 + }, + "step_92100": { + "perplexity": 2.6528401473510335, + "loss": 0.9756308197975159, + "unpadded_tokens_per_sec": 10026.0126953125 + }, + "step_92200": { + "perplexity": 2.652164579811204, + "loss": 0.9753761291503906, + "unpadded_tokens_per_sec": 10025.6416015625 + }, + "step_92300": { + "perplexity": 2.6486857544669804, + "loss": 0.9740635752677917, 
+ "unpadded_tokens_per_sec": 10029.9326171875 + }, + "step_92400": { + "perplexity": 2.6506709739145835, + "loss": 0.9748128056526184, + "unpadded_tokens_per_sec": 10032.41015625 + }, + "step_92500": { + "perplexity": 2.6487662714172595, + "loss": 0.9740939736366272, + "unpadded_tokens_per_sec": 10039.1455078125 + }, + "step_92600": { + "perplexity": 2.652526294552843, + "loss": 0.9755125045776367, + "unpadded_tokens_per_sec": 10026.3154296875 + }, + "step_92700": { + "perplexity": 2.6462622354054797, + "loss": 0.9731481671333313, + "unpadded_tokens_per_sec": 10035.8212890625 + }, + "step_92800": { + "perplexity": 2.6506449053129733, + "loss": 0.9748029708862305, + "unpadded_tokens_per_sec": 10041.5859375 + }, + "step_92900": { + "perplexity": 2.6508564633667318, + "loss": 0.9748827815055847, + "unpadded_tokens_per_sec": 10035.533203125 + }, + "step_93000": { + "perplexity": 2.6561171842052342, + "loss": 0.9768653512001038, + "unpadded_tokens_per_sec": 10034.1572265625 + }, + "step_93100": { + "perplexity": 2.646991361679201, + "loss": 0.9734236598014832, + "unpadded_tokens_per_sec": 10031.4873046875 + }, + "step_93200": { + "perplexity": 2.651927626486768, + "loss": 0.9752867817878723, + "unpadded_tokens_per_sec": 10030.6708984375 + }, + "step_93300": { + "perplexity": 2.6486557585818624, + "loss": 0.9740522503852844, + "unpadded_tokens_per_sec": 10039.904296875 + }, + "step_93400": { + "perplexity": 2.6554770277308504, + "loss": 0.9766243100166321, + "unpadded_tokens_per_sec": 10035.7265625 + }, + "step_93500": { + "perplexity": 2.649426286904311, + "loss": 0.9743431210517883, + "unpadded_tokens_per_sec": 10039.0478515625 + }, + "step_93600": { + "perplexity": 2.650713632179328, + "loss": 0.9748288989067078, + "unpadded_tokens_per_sec": 10034.56640625 + }, + "step_93700": { + "perplexity": 2.647775609544119, + "loss": 0.9737198948860168, + "unpadded_tokens_per_sec": 10041.3056640625 + }, + "step_93800": { + "perplexity": 2.6484950495726607, + "loss": 
0.9739915728569031, + "unpadded_tokens_per_sec": 10044.82421875 + }, + "step_93900": { + "perplexity": 2.651607401666572, + "loss": 0.9751660227775574, + "unpadded_tokens_per_sec": 10032.7783203125 + }, + "step_94000": { + "perplexity": 2.651292588506629, + "loss": 0.9750472903251648, + "unpadded_tokens_per_sec": 10035.029296875 + }, + "step_94100": { + "perplexity": 2.651395783682354, + "loss": 0.9750862121582031, + "unpadded_tokens_per_sec": 10031.734375 + }, + "step_94200": { + "perplexity": 2.649585315216331, + "loss": 0.9744031429290771, + "unpadded_tokens_per_sec": 9517.7314453125 + }, + "step_94300": { + "perplexity": 2.64769386018877, + "loss": 0.9736890196800232, + "unpadded_tokens_per_sec": 7434.31640625 + }, + "step_94400": { + "perplexity": 2.6468941753078026, + "loss": 0.9733869433403015, + "unpadded_tokens_per_sec": 10036.4111328125 + }, + "step_94500": { + "perplexity": 2.649385228513158, + "loss": 0.9743276238441467, + "unpadded_tokens_per_sec": 10037.6572265625 + }, + "step_94600": { + "perplexity": 2.6466736259692856, + "loss": 0.9733036160469055, + "unpadded_tokens_per_sec": 9665.2138671875 + }, + "step_94700": { + "perplexity": 2.647834319132569, + "loss": 0.9737420678138733, + "unpadded_tokens_per_sec": 10039.6279296875 + }, + "step_94800": { + "perplexity": 2.6442609317518335, + "loss": 0.9723916053771973, + "unpadded_tokens_per_sec": 10026.4462890625 + }, + "step_94900": { + "perplexity": 2.6476409927411653, + "loss": 0.9736690521240234, + "unpadded_tokens_per_sec": 10031.6748046875 + }, + "step_95000": { + "perplexity": 2.6426826212622596, + "loss": 0.9717945456504822, + "unpadded_tokens_per_sec": 10042.30078125 + }, + "step_95100": { + "perplexity": 2.6460297523050396, + "loss": 0.9730603098869324, + "unpadded_tokens_per_sec": 9969.9755859375 + }, + "step_95200": { + "perplexity": 2.6434583446578857, + "loss": 0.9720880389213562, + "unpadded_tokens_per_sec": 10038.279296875 + }, + "step_95300": { + "perplexity": 2.6470640960221887, + 
"loss": 0.9734511375427246, + "unpadded_tokens_per_sec": 10031.6259765625 + }, + "step_95400": { + "perplexity": 2.6458379770082314, + "loss": 0.9729878306388855, + "unpadded_tokens_per_sec": 10033.5546875 + }, + "step_95500": { + "perplexity": 2.6443487221092123, + "loss": 0.9724248051643372, + "unpadded_tokens_per_sec": 9642.7529296875 + }, + "step_95600": { + "perplexity": 2.648689069822499, + "loss": 0.974064826965332, + "unpadded_tokens_per_sec": 10042.0791015625 + }, + "step_95700": { + "perplexity": 2.6451028620155888, + "loss": 0.9727099537849426, + "unpadded_tokens_per_sec": 10036.1806640625 + }, + "step_95800": { + "perplexity": 2.6425125092860227, + "loss": 0.9717301726341248, + "unpadded_tokens_per_sec": 10035.021484375 + }, + "step_95900": { + "perplexity": 2.645426243295936, + "loss": 0.972832202911377, + "unpadded_tokens_per_sec": 10028.1201171875 + }, + "step_96000": { + "perplexity": 2.6510802055632543, + "loss": 0.9749671816825867, + "unpadded_tokens_per_sec": 10036.7568359375 + }, + "step_96100": { + "perplexity": 2.6426756905603566, + "loss": 0.9717919230461121, + "unpadded_tokens_per_sec": 10035.9599609375 + }, + "step_96200": { + "perplexity": 2.642367607721114, + "loss": 0.9716753363609314, + "unpadded_tokens_per_sec": 10034.513671875 + }, + "step_96300": { + "perplexity": 2.6430994418870313, + "loss": 0.9719522595405579, + "unpadded_tokens_per_sec": 10034.9384765625 + }, + "step_96400": { + "perplexity": 2.644559777640637, + "loss": 0.9725046157836914, + "unpadded_tokens_per_sec": 10039.8486328125 + }, + "step_96500": { + "perplexity": 2.6441759812005667, + "loss": 0.9723594784736633, + "unpadded_tokens_per_sec": 10033.1787109375 + }, + "step_96600": { + "perplexity": 2.6453472469499864, + "loss": 0.9728023409843445, + "unpadded_tokens_per_sec": 10035.47265625 + }, + "step_96700": { + "perplexity": 2.6417680832085533, + "loss": 0.9714484214782715, + "unpadded_tokens_per_sec": 10034.376953125 + }, + "step_96800": { + "perplexity": 
2.6433400179471356, + "loss": 0.9720432758331299, + "unpadded_tokens_per_sec": 10032.98828125 + }, + "step_96900": { + "perplexity": 2.646682460210343, + "loss": 0.9733069539070129, + "unpadded_tokens_per_sec": 10042.8173828125 + }, + "step_97000": { + "perplexity": 2.6474408950661235, + "loss": 0.9735934734344482, + "unpadded_tokens_per_sec": 10036.236328125 + }, + "step_97100": { + "perplexity": 2.6352780983689574, + "loss": 0.9689887166023254, + "unpadded_tokens_per_sec": 10037.6806640625 + }, + "step_97200": { + "perplexity": 2.641810913124046, + "loss": 0.9714646339416504, + "unpadded_tokens_per_sec": 10040.8681640625 + }, + "step_97300": { + "perplexity": 2.6438519648249232, + "loss": 0.9722369313240051, + "unpadded_tokens_per_sec": 10030.615234375 + }, + "step_97400": { + "perplexity": 2.6407461973435997, + "loss": 0.9710615277290344, + "unpadded_tokens_per_sec": 10033.2705078125 + }, + "step_97500": { + "perplexity": 2.643936432193662, + "loss": 0.9722688794136047, + "unpadded_tokens_per_sec": 10031.9384765625 + }, + "step_97600": { + "perplexity": 2.6449633362270326, + "loss": 0.9726572036743164, + "unpadded_tokens_per_sec": 10043.197265625 + }, + "step_97700": { + "perplexity": 2.6404109550507338, + "loss": 0.9709345698356628, + "unpadded_tokens_per_sec": 10041.6591796875 + }, + "step_97800": { + "perplexity": 2.637769539470668, + "loss": 0.9699336886405945, + "unpadded_tokens_per_sec": 10038.3603515625 + }, + "step_97900": { + "perplexity": 2.640891009993968, + "loss": 0.9711163640022278, + "unpadded_tokens_per_sec": 10044.1689453125 + }, + "step_98000": { + "perplexity": 2.643792713206572, + "loss": 0.9722145199775696, + "unpadded_tokens_per_sec": 10037.8251953125 + }, + "step_98100": { + "perplexity": 2.6399874773965375, + "loss": 0.9707741737365723, + "unpadded_tokens_per_sec": 10039.671875 + }, + "step_98200": { + "perplexity": 2.6404230733968346, + "loss": 0.9709391593933105, + "unpadded_tokens_per_sec": 10035.9814453125 + }, + "step_98300": { + 
"perplexity": 2.6442421762005597, + "loss": 0.972384512424469, + "unpadded_tokens_per_sec": 10034.37890625 + }, + "step_98400": { + "perplexity": 2.6399910965758817, + "loss": 0.9707755446434021, + "unpadded_tokens_per_sec": 10032.9365234375 + }, + "step_98500": { + "perplexity": 2.6434047739862, + "loss": 0.9720677733421326, + "unpadded_tokens_per_sec": 9486.6943359375 + }, + "step_98600": { + "perplexity": 2.642280828088311, + "loss": 0.9716424942016602, + "unpadded_tokens_per_sec": 7728.10888671875 + }, + "step_98700": { + "perplexity": 2.6404471528729707, + "loss": 0.9709482789039612, + "unpadded_tokens_per_sec": 10036.697265625 + }, + "step_98800": { + "perplexity": 2.640418509337878, + "loss": 0.9709374308586121, + "unpadded_tokens_per_sec": 10036.486328125 + }, + "step_98900": { + "perplexity": 2.641885709678363, + "loss": 0.9714929461479187, + "unpadded_tokens_per_sec": 9637.29296875 + }, + "step_99000": { + "perplexity": 2.6414054739209645, + "loss": 0.9713111519813538, + "unpadded_tokens_per_sec": 10032.4912109375 + }, + "step_99100": { + "perplexity": 2.6432339853281444, + "loss": 0.972003161907196, + "unpadded_tokens_per_sec": 10024.765625 + }, + "step_99200": { + "perplexity": 2.641430664446679, + "loss": 0.9713206887245178, + "unpadded_tokens_per_sec": 10032.541015625 + }, + "step_99300": { + "perplexity": 2.6393878636305184, + "loss": 0.9705470204353333, + "unpadded_tokens_per_sec": 10030.0966796875 + }, + "step_99400": { + "perplexity": 2.640956493103954, + "loss": 0.9711411595344543, + "unpadded_tokens_per_sec": 10039.1298828125 + }, + "step_99500": { + "perplexity": 2.6420003493495208, + "loss": 0.9715363383293152, + "unpadded_tokens_per_sec": 10036.5322265625 + }, + "step_99600": { + "perplexity": 2.6365096955470793, + "loss": 0.9694559574127197, + "unpadded_tokens_per_sec": 10040.1416015625 + }, + "step_99700": { + "perplexity": 2.6372180577342594, + "loss": 0.9697245955467224, + "unpadded_tokens_per_sec": 10028.6337890625 + }, + "step_99800": { 
+ "perplexity": 2.639142456189469, + "loss": 0.9704540371894836, + "unpadded_tokens_per_sec": 10038.9775390625 + }, + "step_99900": { + "perplexity": 2.6386686956100465, + "loss": 0.9702745079994202, + "unpadded_tokens_per_sec": 9628.3271484375 + }, + "step_100000": { + "perplexity": 2.635730512665898, + "loss": 0.9691603779792786, + "unpadded_tokens_per_sec": 10036.4873046875 + }, + "step_100100": { + "perplexity": 2.6350855316818635, + "loss": 0.9689156413078308, + "unpadded_tokens_per_sec": 9971.54296875 + }, + "step_100200": { + "perplexity": 2.638791059883387, + "loss": 0.9703208804130554, + "unpadded_tokens_per_sec": 10031.2939453125 + }, + "step_100300": { + "perplexity": 2.6396174031643542, + "loss": 0.9706339836120605, + "unpadded_tokens_per_sec": 10039.7529296875 + }, + "step_100400": { + "perplexity": 2.6399371241116887, + "loss": 0.9707551002502441, + "unpadded_tokens_per_sec": 10039.9208984375 + }, + "step_100500": { + "perplexity": 2.6385246338929944, + "loss": 0.9702199101448059, + "unpadded_tokens_per_sec": 10031.4375 + }, + "step_100600": { + "perplexity": 2.638135895236819, + "loss": 0.9700725674629211, + "unpadded_tokens_per_sec": 10024.796875 + }, + "step_100700": { + "perplexity": 2.6388992736343737, + "loss": 0.9703618884086609, + "unpadded_tokens_per_sec": 10027.84375 + }, + "step_100800": { + "perplexity": 2.6355474954454907, + "loss": 0.9690909385681152, + "unpadded_tokens_per_sec": 10042.3232421875 + }, + "step_100900": { + "perplexity": 2.6405074312172925, + "loss": 0.9709711074829102, + "unpadded_tokens_per_sec": 10042.2451171875 + }, + "step_101000": { + "perplexity": 2.635611589300707, + "loss": 0.9691152572631836, + "unpadded_tokens_per_sec": 10041.4267578125 + }, + "step_101100": { + "perplexity": 2.636615458422998, + "loss": 0.9694960713386536, + "unpadded_tokens_per_sec": 10042.9580078125 + }, + "step_101200": { + "perplexity": 2.6390465017932208, + "loss": 0.9704176783561707, + "unpadded_tokens_per_sec": 10041.9306640625 + }, + 
"step_101300": { + "perplexity": 2.6367509291061273, + "loss": 0.96954745054245, + "unpadded_tokens_per_sec": 10041.0791015625 + }, + "step_101400": { + "perplexity": 2.6367685313763602, + "loss": 0.9695541262626648, + "unpadded_tokens_per_sec": 10035.8876953125 + }, + "step_101500": { + "perplexity": 2.6386211984106316, + "loss": 0.970256507396698, + "unpadded_tokens_per_sec": 10037.61328125 + }, + "step_101600": { + "perplexity": 2.635398106260006, + "loss": 0.9690342545509338, + "unpadded_tokens_per_sec": 10031.234375 + }, + "step_101700": { + "perplexity": 2.636672977607573, + "loss": 0.9695178866386414, + "unpadded_tokens_per_sec": 10034.6201171875 + }, + "step_101800": { + "perplexity": 2.6351511849746223, + "loss": 0.9689405560493469, + "unpadded_tokens_per_sec": 10035.185546875 + }, + "step_101900": { + "perplexity": 2.632249866412991, + "loss": 0.9678389430046082, + "unpadded_tokens_per_sec": 10037.357421875 + }, + "step_102000": { + "perplexity": 2.6345039903248, + "loss": 0.9686949253082275, + "unpadded_tokens_per_sec": 10036.8310546875 + }, + "step_102100": { + "perplexity": 2.6325275840056652, + "loss": 0.9679444432258606, + "unpadded_tokens_per_sec": 10041.14453125 + }, + "step_102200": { + "perplexity": 2.6352232798287147, + "loss": 0.9689679145812988, + "unpadded_tokens_per_sec": 10033.109375 + }, + "step_102300": { + "perplexity": 2.634540264198331, + "loss": 0.9687086939811707, + "unpadded_tokens_per_sec": 10045.59375 + }, + "step_102400": { + "perplexity": 2.6300109958060855, + "loss": 0.9669880270957947, + "unpadded_tokens_per_sec": 10042.5888671875 + }, + "step_102500": { + "perplexity": 2.631495940309455, + "loss": 0.9675524830818176, + "unpadded_tokens_per_sec": 10045.015625 + }, + "step_102600": { + "perplexity": 2.6365639122417512, + "loss": 0.9694765210151672, + "unpadded_tokens_per_sec": 10035.46875 + }, + "step_102700": { + "perplexity": 2.6332836891466997, + "loss": 0.9682316184043884, + "unpadded_tokens_per_sec": 10029.50390625 + }, + 
"step_102800": { + "perplexity": 2.636674863503722, + "loss": 0.9695186018943787, + "unpadded_tokens_per_sec": 10039.0205078125 + }, + "step_102900": { + "perplexity": 2.631222409206521, + "loss": 0.9674485325813293, + "unpadded_tokens_per_sec": 10035.6298828125 + }, + "step_103000": { + "perplexity": 2.6329719931009214, + "loss": 0.9681132435798645, + "unpadded_tokens_per_sec": 10031.1240234375 + }, + "step_103100": { + "perplexity": 2.633534987594996, + "loss": 0.9683270454406738, + "unpadded_tokens_per_sec": 7072.4296875 + }, + "step_103200": { + "perplexity": 2.6294708533179527, + "loss": 0.9667826294898987, + "unpadded_tokens_per_sec": 10043.9404296875 + }, + "step_103300": { + "perplexity": 2.6371152571865544, + "loss": 0.9696856141090393, + "unpadded_tokens_per_sec": 10037.9296875 + }, + "step_103400": { + "perplexity": 2.630371884089652, + "loss": 0.9671252369880676, + "unpadded_tokens_per_sec": 9626.2490234375 + }, + "step_103500": { + "perplexity": 2.6314278685586676, + "loss": 0.9675266146659851, + "unpadded_tokens_per_sec": 10031.6875 + }, + "step_103600": { + "perplexity": 2.631831933131196, + "loss": 0.9676801562309265, + "unpadded_tokens_per_sec": 10040.7587890625 + }, + "step_103700": { + "perplexity": 2.632388878449519, + "loss": 0.9678917527198792, + "unpadded_tokens_per_sec": 10044.869140625 + }, + "step_103800": { + "perplexity": 2.6321353360523756, + "loss": 0.9677954316139221, + "unpadded_tokens_per_sec": 10035.91796875 + }, + "step_103900": { + "perplexity": 2.62986677975879, + "loss": 0.9669331908226013, + "unpadded_tokens_per_sec": 10043.1611328125 + }, + "step_104000": { + "perplexity": 2.6328858358997547, + "loss": 0.9680805206298828, + "unpadded_tokens_per_sec": 10039.7197265625 + }, + "step_104100": { + "perplexity": 2.6334767520285194, + "loss": 0.9683049321174622, + "unpadded_tokens_per_sec": 10039.22265625 + }, + "step_104200": { + "perplexity": 2.6346472043684286, + "loss": 0.9687492847442627, + "unpadded_tokens_per_sec": 
10047.6455078125 + }, + "step_104300": { + "perplexity": 2.627683813566515, + "loss": 0.9661027789115906, + "unpadded_tokens_per_sec": 9600.9892578125 + }, + "step_104400": { + "perplexity": 2.6319417440072512, + "loss": 0.9677218794822693, + "unpadded_tokens_per_sec": 10036.7197265625 + }, + "step_104500": { + "perplexity": 2.6314055966170042, + "loss": 0.967518150806427, + "unpadded_tokens_per_sec": 10037.2734375 + }, + "step_104600": { + "perplexity": 2.6297495316706594, + "loss": 0.9668886065483093, + "unpadded_tokens_per_sec": 10037.74609375 + }, + "step_104700": { + "perplexity": 2.6282260954337926, + "loss": 0.966309130191803, + "unpadded_tokens_per_sec": 10037.1064453125 + }, + "step_104800": { + "perplexity": 2.632139728905809, + "loss": 0.9677971005439758, + "unpadded_tokens_per_sec": 10036.240234375 + }, + "step_104900": { + "perplexity": 2.6304041814585806, + "loss": 0.9671375155448914, + "unpadded_tokens_per_sec": 10034.4638671875 + }, + "step_105000": { + "perplexity": 2.6294199169915595, + "loss": 0.9667632579803467, + "unpadded_tokens_per_sec": 10031.2919921875 + }, + "step_105100": { + "perplexity": 2.6272794462653457, + "loss": 0.9659488797187805, + "unpadded_tokens_per_sec": 9964.0576171875 + }, + "step_105200": { + "perplexity": 2.631675382118827, + "loss": 0.9676206707954407, + "unpadded_tokens_per_sec": 10038.8369140625 + }, + "step_105300": { + "perplexity": 2.6337070333400376, + "loss": 0.9683923721313477, + "unpadded_tokens_per_sec": 10040.8935546875 + }, + "step_105400": { + "perplexity": 2.6340833440912736, + "loss": 0.9685352444648743, + "unpadded_tokens_per_sec": 10036.30078125 + }, + "step_105500": { + "perplexity": 2.629211950291833, + "loss": 0.9666841626167297, + "unpadded_tokens_per_sec": 10038.7626953125 + }, + "step_105600": { + "perplexity": 2.6342232380153776, + "loss": 0.9685883522033691, + "unpadded_tokens_per_sec": 10040.4765625 + }, + "step_105700": { + "perplexity": 2.631975315675243, + "loss": 0.9677346348762512, + 
"unpadded_tokens_per_sec": 10035.8662109375 + }, + "step_105800": { + "perplexity": 2.629867720272609, + "loss": 0.96693354845047, + "unpadded_tokens_per_sec": 10039.6376953125 + }, + "step_105900": { + "perplexity": 2.6276872592563008, + "loss": 0.9661040902137756, + "unpadded_tokens_per_sec": 10029.0546875 + }, + "step_106000": { + "perplexity": 2.6266327758830874, + "loss": 0.9657027125358582, + "unpadded_tokens_per_sec": 10038.93359375 + }, + "step_106100": { + "perplexity": 2.628425680822683, + "loss": 0.9663850665092468, + "unpadded_tokens_per_sec": 10039.7109375 + }, + "step_106200": { + "perplexity": 2.626185053836813, + "loss": 0.9655322432518005, + "unpadded_tokens_per_sec": 10040.2998046875 + }, + "step_106300": { + "perplexity": 2.6257500851352766, + "loss": 0.9653666019439697, + "unpadded_tokens_per_sec": 10031.4208984375 + }, + "step_106400": { + "perplexity": 2.630516754999739, + "loss": 0.9671803116798401, + "unpadded_tokens_per_sec": 10038.177734375 + }, + "step_106500": { + "perplexity": 2.625004122719926, + "loss": 0.9650824666023254, + "unpadded_tokens_per_sec": 10042.8271484375 + }, + "step_106600": { + "perplexity": 2.630673393888968, + "loss": 0.9672398567199707, + "unpadded_tokens_per_sec": 10044.8056640625 + }, + "step_106700": { + "perplexity": 2.6264093749581447, + "loss": 0.9656176567077637, + "unpadded_tokens_per_sec": 10033.7392578125 + }, + "step_106800": { + "perplexity": 2.6283730414464417, + "loss": 0.9663650393486023, + "unpadded_tokens_per_sec": 10044.37890625 + }, + "step_106900": { + "perplexity": 2.630288790739795, + "loss": 0.9670936465263367, + "unpadded_tokens_per_sec": 10038.20703125 + }, + "step_107000": { + "perplexity": 2.624218642360528, + "loss": 0.9647831916809082, + "unpadded_tokens_per_sec": 10043.7001953125 + }, + "step_107100": { + "perplexity": 2.633432487580665, + "loss": 0.9682881236076355, + "unpadded_tokens_per_sec": 10040.431640625 + }, + "step_107200": { + "perplexity": 2.6257464854790187, + "loss": 
0.9653652310371399, + "unpadded_tokens_per_sec": 10044.138671875 + }, + "step_107300": { + "perplexity": 2.6264583743751366, + "loss": 0.9656363129615784, + "unpadded_tokens_per_sec": 10038.3935546875 + }, + "step_107400": { + "perplexity": 2.625882806341555, + "loss": 0.9654171466827393, + "unpadded_tokens_per_sec": 10038.1083984375 + }, + "step_107500": { + "perplexity": 2.6259508911673772, + "loss": 0.9654430747032166, + "unpadded_tokens_per_sec": 10036.9658203125 + }, + "step_107600": { + "perplexity": 2.6241943980514226, + "loss": 0.964773952960968, + "unpadded_tokens_per_sec": 10038.671875 + }, + "step_107700": { + "perplexity": 2.6247298583944105, + "loss": 0.9649779796600342, + "unpadded_tokens_per_sec": 10036.87890625 + }, + "step_107800": { + "perplexity": 2.62852093570712, + "loss": 0.9664213061332703, + "unpadded_tokens_per_sec": 7228.13671875 + }, + "step_107900": { + "perplexity": 2.62230606379675, + "loss": 0.9640541076660156, + "unpadded_tokens_per_sec": 10054.5361328125 + }, + "step_108000": { + "perplexity": 2.626657355841725, + "loss": 0.9657120704650879, + "unpadded_tokens_per_sec": 10042.65625 + }, + "step_108100": { + "perplexity": 2.6284097009005993, + "loss": 0.9663789868354797, + "unpadded_tokens_per_sec": 10044.7470703125 + }, + "step_108200": { + "perplexity": 2.6275121612856362, + "loss": 0.9660374522209167, + "unpadded_tokens_per_sec": 9608.7138671875 + }, + "step_108300": { + "perplexity": 2.624877704114024, + "loss": 0.9650343060493469, + "unpadded_tokens_per_sec": 10035.1328125 + }, + "step_108400": { + "perplexity": 2.6242206757643753, + "loss": 0.9647839665412903, + "unpadded_tokens_per_sec": 10034.2861328125 + }, + "step_108500": { + "perplexity": 2.623838267142686, + "loss": 0.9646382331848145, + "unpadded_tokens_per_sec": 10033.96875 + }, + "step_108600": { + "perplexity": 2.624563248595828, + "loss": 0.9649145007133484, + "unpadded_tokens_per_sec": 10040.4501953125 + }, + "step_108700": { + "perplexity": 2.630104113411967, + 
"loss": 0.9670234322547913, + "unpadded_tokens_per_sec": 10040.75390625 + }, + "step_108800": { + "perplexity": 2.623272342104192, + "loss": 0.9644225239753723, + "unpadded_tokens_per_sec": 10037.7822265625 + }, + "step_108900": { + "perplexity": 2.6222777733558775, + "loss": 0.9640433192253113, + "unpadded_tokens_per_sec": 9591.3017578125 + }, + "step_109000": { + "perplexity": 2.624741748324245, + "loss": 0.9649825096130371, + "unpadded_tokens_per_sec": 10032.38671875 + }, + "step_109100": { + "perplexity": 2.6263179535697705, + "loss": 0.9655828475952148, + "unpadded_tokens_per_sec": 10042.8466796875 + }, + "step_109200": { + "perplexity": 2.6242355353018594, + "loss": 0.964789628982544, + "unpadded_tokens_per_sec": 10046.470703125 + }, + "step_109300": { + "perplexity": 2.6258083063479485, + "loss": 0.9653887748718262, + "unpadded_tokens_per_sec": 10032.5615234375 + }, + "step_109400": { + "perplexity": 2.6219514393985004, + "loss": 0.9639188647270203, + "unpadded_tokens_per_sec": 10033.4775390625 + }, + "step_109500": { + "perplexity": 2.6248287341861425, + "loss": 0.9650156497955322, + "unpadded_tokens_per_sec": 10033.046875 + }, + "step_109600": { + "perplexity": 2.622110225178262, + "loss": 0.9639794230461121, + "unpadded_tokens_per_sec": 10039.9267578125 + }, + "step_109700": { + "perplexity": 2.6279670016909566, + "loss": 0.9662105441093445, + "unpadded_tokens_per_sec": 10038.95703125 + }, + "step_109800": { + "perplexity": 2.6229930994077373, + "loss": 0.9643160700798035, + "unpadded_tokens_per_sec": 10032.4189453125 + }, + "step_109900": { + "perplexity": 2.6218557974866177, + "loss": 0.9638823866844177, + "unpadded_tokens_per_sec": 10020.4521484375 + }, + "step_110000": { + "perplexity": 2.6186335178805002, + "loss": 0.9626526236534119, + "unpadded_tokens_per_sec": 10026.4716796875 + }, + "step_110100": { + "perplexity": 2.6227967404875256, + "loss": 0.9642412066459656, + "unpadded_tokens_per_sec": 9975.26953125 + }, + "step_110200": { + "perplexity": 
2.6241502896247955, + "loss": 0.9647571444511414, + "unpadded_tokens_per_sec": 10037.0478515625 + }, + "step_110300": { + "perplexity": 2.6234629509131677, + "loss": 0.9644951820373535, + "unpadded_tokens_per_sec": 10037.8974609375 + }, + "step_110400": { + "perplexity": 2.6255771507046326, + "loss": 0.9653007388114929, + "unpadded_tokens_per_sec": 10038.2548828125 + }, + "step_110500": { + "perplexity": 2.626054195653558, + "loss": 0.9654824137687683, + "unpadded_tokens_per_sec": 10039.4697265625 + }, + "step_110600": { + "perplexity": 2.625603755260299, + "loss": 0.9653108716011047, + "unpadded_tokens_per_sec": 10034.986328125 + }, + "step_110700": { + "perplexity": 2.6207114907610753, + "loss": 0.9634458422660828, + "unpadded_tokens_per_sec": 10034.8525390625 + }, + "step_110800": { + "perplexity": 2.61970321510546, + "loss": 0.9630610346794128, + "unpadded_tokens_per_sec": 10043.9140625 + }, + "step_110900": { + "perplexity": 2.620559818563497, + "loss": 0.9633879661560059, + "unpadded_tokens_per_sec": 10037.095703125 + }, + "step_111000": { + "perplexity": 2.6205960566426585, + "loss": 0.9634017944335938, + "unpadded_tokens_per_sec": 10035.396484375 + }, + "step_111100": { + "perplexity": 2.6191283468515825, + "loss": 0.9628415703773499, + "unpadded_tokens_per_sec": 10039.642578125 + }, + "step_111200": { + "perplexity": 2.6197408466773155, + "loss": 0.9630753993988037, + "unpadded_tokens_per_sec": 10040.5146484375 + }, + "step_111300": { + "perplexity": 2.6251147439945823, + "loss": 0.9651246070861816, + "unpadded_tokens_per_sec": 10038.4931640625 + }, + "step_111400": { + "perplexity": 2.6204898430011556, + "loss": 0.9633612632751465, + "unpadded_tokens_per_sec": 10031.9267578125 + }, + "step_111500": { + "perplexity": 2.6205196761039224, + "loss": 0.9633726477622986, + "unpadded_tokens_per_sec": 10035.5615234375 + }, + "step_111600": { + "perplexity": 2.6202516589429377, + "loss": 0.963270366191864, + "unpadded_tokens_per_sec": 10030.5009765625 + }, + 
"step_111700": { + "perplexity": 2.6201132878525915, + "loss": 0.963217556476593, + "unpadded_tokens_per_sec": 10035.2958984375 + }, + "step_111800": { + "perplexity": 2.6202493162564444, + "loss": 0.9632694721221924, + "unpadded_tokens_per_sec": 10042.2802734375 + }, + "step_111900": { + "perplexity": 2.6211146909507943, + "loss": 0.963599681854248, + "unpadded_tokens_per_sec": 10040.6044921875 + }, + "step_112000": { + "perplexity": 2.620851768021774, + "loss": 0.9634993672370911, + "unpadded_tokens_per_sec": 10037.587890625 + }, + "step_112100": { + "perplexity": 2.6188573500675587, + "loss": 0.9627380967140198, + "unpadded_tokens_per_sec": 10039.6259765625 + }, + "step_112200": { + "perplexity": 2.6183277696815535, + "loss": 0.9625358581542969, + "unpadded_tokens_per_sec": 10039.9560546875 + }, + "step_112300": { + "perplexity": 2.6189603754949724, + "loss": 0.9627774357795715, + "unpadded_tokens_per_sec": 10035.1533203125 + }, + "step_112400": { + "perplexity": 2.615953545133575, + "loss": 0.9616286754608154, + "unpadded_tokens_per_sec": 10037.8544921875 + }, + "step_112500": { + "perplexity": 2.617905025091205, + "loss": 0.9623743891716003, + "unpadded_tokens_per_sec": 10035.3486328125 + }, + "step_112600": { + "perplexity": 2.6203584876724415, + "loss": 0.9633111357688904, + "unpadded_tokens_per_sec": 10030.7451171875 + }, + "step_112700": { + "perplexity": 2.617053345125623, + "loss": 0.9620490074157715, + "unpadded_tokens_per_sec": 9033.5673828125 + }, + "step_112800": { + "perplexity": 2.621039232652322, + "loss": 0.9635708928108215, + "unpadded_tokens_per_sec": 7482.61865234375 + }, + "step_112900": { + "perplexity": 2.6197627075896857, + "loss": 0.9630837440490723, + "unpadded_tokens_per_sec": 10038.3251953125 + }, + "step_113000": { + "perplexity": 2.62286834100239, + "loss": 0.9642685055732727, + "unpadded_tokens_per_sec": 10035.38671875 + }, + "step_113100": { + "perplexity": 2.6132643213806697, + "loss": 0.9606001377105713, + 
"unpadded_tokens_per_sec": 9594.45703125 + }, + "step_113200": { + "perplexity": 2.6179114227102844, + "loss": 0.9623768329620361, + "unpadded_tokens_per_sec": 10036.0654296875 + }, + "step_113300": { + "perplexity": 2.6188083363627377, + "loss": 0.9627193808555603, + "unpadded_tokens_per_sec": 10042.7900390625 + }, + "step_113400": { + "perplexity": 2.6198134568395433, + "loss": 0.9631031155586243, + "unpadded_tokens_per_sec": 10032.8681640625 + }, + "step_113500": { + "perplexity": 2.6182541082753277, + "loss": 0.9625077247619629, + "unpadded_tokens_per_sec": 10041.1552734375 + }, + "step_113600": { + "perplexity": 2.6143858976108705, + "loss": 0.9610292315483093, + "unpadded_tokens_per_sec": 10039.5244140625 + }, + "step_113700": { + "perplexity": 2.615434061345338, + "loss": 0.9614300727844238, + "unpadded_tokens_per_sec": 9577.8701171875 + }, + "step_113800": { + "perplexity": 2.6181400308230534, + "loss": 0.9624641537666321, + "unpadded_tokens_per_sec": 10042.89453125 + }, + "step_113900": { + "perplexity": 2.619390003992693, + "loss": 0.9629414677619934, + "unpadded_tokens_per_sec": 10042.2666015625 + }, + "step_114000": { + "perplexity": 2.6165818342651654, + "loss": 0.9618688225746155, + "unpadded_tokens_per_sec": 10033.4541015625 + }, + "step_114100": { + "perplexity": 2.6187964733109195, + "loss": 0.9627148509025574, + "unpadded_tokens_per_sec": 10040.203125 + }, + "step_114200": { + "perplexity": 2.61904373539792, + "loss": 0.9628092646598816, + "unpadded_tokens_per_sec": 10036.3857421875 + }, + "step_114300": { + "perplexity": 2.6156106929659955, + "loss": 0.9614976048469543, + "unpadded_tokens_per_sec": 10036.4296875 + }, + "step_114400": { + "perplexity": 2.617746493963581, + "loss": 0.9623138308525085, + "unpadded_tokens_per_sec": 10041.78515625 + }, + "step_114500": { + "perplexity": 2.616076415328728, + "loss": 0.9616756439208984, + "unpadded_tokens_per_sec": 10036.49609375 + }, + "step_114600": { + "perplexity": 2.6167426344100018, + "loss": 
0.9619302749633789, + "unpadded_tokens_per_sec": 10037.0703125 + }, + "step_114700": { + "perplexity": 2.61955534856257, + "loss": 0.9630045890808105, + "unpadded_tokens_per_sec": 10037.3681640625 + }, + "step_114800": { + "perplexity": 2.6145108758912823, + "loss": 0.9610770344734192, + "unpadded_tokens_per_sec": 10049.0771484375 + }, + "step_114900": { + "perplexity": 2.617865391409259, + "loss": 0.9623592495918274, + "unpadded_tokens_per_sec": 10035.0498046875 + }, + "step_115000": { + "perplexity": 2.615814465531083, + "loss": 0.9615755081176758, + "unpadded_tokens_per_sec": 10033.708984375 + }, + "step_115100": { + "perplexity": 2.614525836285318, + "loss": 0.9610827565193176, + "unpadded_tokens_per_sec": 9974.0595703125 + }, + "step_115200": { + "perplexity": 2.6186580229822973, + "loss": 0.9626619815826416, + "unpadded_tokens_per_sec": 10038.9521484375 + }, + "step_115300": { + "perplexity": 2.616535046554254, + "loss": 0.9618509411811829, + "unpadded_tokens_per_sec": 10037.65234375 + }, + "step_115400": { + "perplexity": 2.6184838388293477, + "loss": 0.9625954627990723, + "unpadded_tokens_per_sec": 10037.048828125 + }, + "step_115500": { + "perplexity": 2.612688530152437, + "loss": 0.9603797793388367, + "unpadded_tokens_per_sec": 10037.03515625 + }, + "step_115600": { + "perplexity": 2.6125522713804123, + "loss": 0.9603276252746582, + "unpadded_tokens_per_sec": 10032.9951171875 + }, + "step_115700": { + "perplexity": 2.6152053777528916, + "loss": 0.9613426327705383, + "unpadded_tokens_per_sec": 10028.1572265625 + }, + "step_115800": { + "perplexity": 2.6133721113862034, + "loss": 0.9606413841247559, + "unpadded_tokens_per_sec": 10030.935546875 + }, + "step_115900": { + "perplexity": 2.6157793849606135, + "loss": 0.9615620970726013, + "unpadded_tokens_per_sec": 10036.505859375 + }, + "step_116000": { + "perplexity": 2.615681473679658, + "loss": 0.9615246653556824, + "unpadded_tokens_per_sec": 10031.9921875 + }, + "step_116100": { + "perplexity": 
2.612939264861447, + "loss": 0.960475742816925, + "unpadded_tokens_per_sec": 10032.2548828125 + }, + "step_116200": { + "perplexity": 2.616525689112467, + "loss": 0.9618473649024963, + "unpadded_tokens_per_sec": 10041.1875 + }, + "step_116300": { + "perplexity": 2.6129297645363994, + "loss": 0.9604721069335938, + "unpadded_tokens_per_sec": 10035.3330078125 + }, + "step_116400": { + "perplexity": 2.616980499493628, + "loss": 0.9620211720466614, + "unpadded_tokens_per_sec": 10039.2353515625 + }, + "step_116500": { + "perplexity": 2.6115913427608235, + "loss": 0.9599597454071045, + "unpadded_tokens_per_sec": 10039.2216796875 + }, + "step_116600": { + "perplexity": 2.612909362315744, + "loss": 0.9604642987251282, + "unpadded_tokens_per_sec": 10032.2509765625 + }, + "step_116700": { + "perplexity": 2.615482076527667, + "loss": 0.9614484310150146, + "unpadded_tokens_per_sec": 10032.3115234375 + }, + "step_116800": { + "perplexity": 2.613559041010886, + "loss": 0.9607129096984863, + "unpadded_tokens_per_sec": 10030.2255859375 + }, + "step_116900": { + "perplexity": 2.6147265631844414, + "loss": 0.9611595273017883, + "unpadded_tokens_per_sec": 10038.5009765625 + }, + "step_117000": { + "perplexity": 2.611103851829612, + "loss": 0.959773063659668, + "unpadded_tokens_per_sec": 10034.0732421875 + }, + "step_117100": { + "perplexity": 2.612788353945678, + "loss": 0.9604179859161377, + "unpadded_tokens_per_sec": 10032.0029296875 + }, + "step_117200": { + "perplexity": 2.613000316954833, + "loss": 0.960499107837677, + "unpadded_tokens_per_sec": 10026.6826171875 + }, + "step_117300": { + "perplexity": 2.611368287244782, + "loss": 0.9598743319511414, + "unpadded_tokens_per_sec": 10037.4482421875 + }, + "step_117400": { + "perplexity": 2.613666064239496, + "loss": 0.960753858089447, + "unpadded_tokens_per_sec": 10033.8994140625 + }, + "step_117500": { + "perplexity": 2.6181072598337836, + "loss": 0.9624516367912292, + "unpadded_tokens_per_sec": 10034.4345703125 + }, + 
"step_117600": { + "perplexity": 2.612689775979708, + "loss": 0.9603802561759949, + "unpadded_tokens_per_sec": 10037.9267578125 + }, + "step_117700": { + "perplexity": 2.61494460621261, + "loss": 0.9612429141998291, + "unpadded_tokens_per_sec": 10037.4814453125 + }, + "step_117800": { + "perplexity": 2.6148032424339718, + "loss": 0.9611888527870178, + "unpadded_tokens_per_sec": 10035.802734375 + }, + "step_117900": { + "perplexity": 2.6146441199146184, + "loss": 0.9611279964447021, + "unpadded_tokens_per_sec": 6464.87353515625 + }, + "step_118000": { + "perplexity": 2.612333493589881, + "loss": 0.9602438807487488, + "unpadded_tokens_per_sec": 10046.6806640625 + }, + "step_118100": { + "perplexity": 2.6112084399168007, + "loss": 0.959813117980957, + "unpadded_tokens_per_sec": 10027.0673828125 + }, + "step_118200": { + "perplexity": 2.609928924540458, + "loss": 0.959322988986969, + "unpadded_tokens_per_sec": 10032.7568359375 + }, + "step_118300": { + "perplexity": 2.6089363046239327, + "loss": 0.9589425921440125, + "unpadded_tokens_per_sec": 9124.5126953125 + }, + "step_118400": { + "perplexity": 2.612857033684194, + "loss": 0.9604442715644836, + "unpadded_tokens_per_sec": 10030.4404296875 + }, + "step_118500": { + "perplexity": 2.6109331270036416, + "loss": 0.9597076773643494, + "unpadded_tokens_per_sec": 10032.462890625 + }, + "step_118600": { + "perplexity": 2.6129243135458196, + "loss": 0.9604700207710266, + "unpadded_tokens_per_sec": 10031.5087890625 + }, + "step_118700": { + "perplexity": 2.6106544197590256, + "loss": 0.9596009254455566, + "unpadded_tokens_per_sec": 9562.0029296875 + }, + "step_118800": { + "perplexity": 2.6110289929883663, + "loss": 0.959744393825531, + "unpadded_tokens_per_sec": 10047.578125 + }, + "step_118900": { + "perplexity": 2.6098368323444774, + "loss": 0.959287703037262, + "unpadded_tokens_per_sec": 10038.482421875 + }, + "step_119000": { + "perplexity": 2.613358871044853, + "loss": 0.96063631772995, + "unpadded_tokens_per_sec": 
10037.5224609375 + }, + "step_119100": { + "perplexity": 2.612190869679109, + "loss": 0.9601892828941345, + "unpadded_tokens_per_sec": 10035.3515625 + }, + "step_119200": { + "perplexity": 2.609406126686258, + "loss": 0.9591226577758789, + "unpadded_tokens_per_sec": 10032.95703125 + }, + "step_119300": { + "perplexity": 2.6103214417390594, + "loss": 0.9594733715057373, + "unpadded_tokens_per_sec": 10029.4013671875 + }, + "step_119400": { + "perplexity": 2.6108248151261764, + "loss": 0.9596661925315857, + "unpadded_tokens_per_sec": 10035.7548828125 + }, + "step_119500": { + "perplexity": 2.607540401298122, + "loss": 0.9584074020385742, + "unpadded_tokens_per_sec": 10035.6328125 + }, + "step_119600": { + "perplexity": 2.609448587465714, + "loss": 0.9591389298439026, + "unpadded_tokens_per_sec": 10036.4208984375 + }, + "step_119700": { + "perplexity": 2.6129292973081895, + "loss": 0.9604719281196594, + "unpadded_tokens_per_sec": 10041.619140625 + }, + "step_119800": { + "perplexity": 2.607724582302974, + "loss": 0.9584780335426331, + "unpadded_tokens_per_sec": 10034.2001953125 + }, + "step_119900": { + "perplexity": 2.6123085805550885, + "loss": 0.9602343440055847, + "unpadded_tokens_per_sec": 10030.9580078125 + }, + "step_120000": { + "perplexity": 2.611021055898193, + "loss": 0.9597413539886475, + "unpadded_tokens_per_sec": 10029.5576171875 + }, + "step_120100": { + "perplexity": 2.6096761454669775, + "loss": 0.959226131439209, + "unpadded_tokens_per_sec": 9972.4443359375 + }, + "step_120200": { + "perplexity": 2.610843644885645, + "loss": 0.9596734046936035, + "unpadded_tokens_per_sec": 10037.6025390625 + }, + "step_120300": { + "perplexity": 2.6125894887852743, + "loss": 0.9603418707847595, + "unpadded_tokens_per_sec": 10029.0498046875 + }, + "step_120400": { + "perplexity": 2.6070584837053477, + "loss": 0.9582225680351257, + "unpadded_tokens_per_sec": 10030.3388671875 + }, + "step_120500": { + "perplexity": 2.6124206910826935, + "loss": 0.960277259349823, + 
"unpadded_tokens_per_sec": 10037.7744140625 + }, + "step_120600": { + "perplexity": 2.6077468092448277, + "loss": 0.9584865570068359, + "unpadded_tokens_per_sec": 10035.7548828125 + }, + "step_120700": { + "perplexity": 2.6062067595632334, + "loss": 0.957895815372467, + "unpadded_tokens_per_sec": 10036.4072265625 + }, + "step_120800": { + "perplexity": 2.610654264151901, + "loss": 0.9596008658409119, + "unpadded_tokens_per_sec": 10034.6787109375 + }, + "step_120900": { + "perplexity": 2.6126698428146584, + "loss": 0.9603726267814636, + "unpadded_tokens_per_sec": 10045.3310546875 + }, + "step_121000": { + "perplexity": 2.605425816872915, + "loss": 0.9575961232185364, + "unpadded_tokens_per_sec": 10033.654296875 + }, + "step_121100": { + "perplexity": 2.61049430493318, + "loss": 0.9595395922660828, + "unpadded_tokens_per_sec": 10042.7666015625 + }, + "step_121200": { + "perplexity": 2.606066955487801, + "loss": 0.9578421711921692, + "unpadded_tokens_per_sec": 10035.873046875 + }, + "step_121300": { + "perplexity": 2.6099233242465547, + "loss": 0.9593208432197571, + "unpadded_tokens_per_sec": 10032.9580078125 + }, + "step_121400": { + "perplexity": 2.6155692232174483, + "loss": 0.9614817500114441, + "unpadded_tokens_per_sec": 10040.69921875 + }, + "step_121500": { + "perplexity": 2.6082206145729603, + "loss": 0.9586682319641113, + "unpadded_tokens_per_sec": 10034.22265625 + }, + "step_121600": { + "perplexity": 2.6042015997136536, + "loss": 0.9571261405944824, + "unpadded_tokens_per_sec": 10026.9892578125 + }, + "step_121700": { + "perplexity": 2.608568095428831, + "loss": 0.9588014483451843, + "unpadded_tokens_per_sec": 10040.2998046875 + }, + "step_121800": { + "perplexity": 2.6123903273397286, + "loss": 0.9602656364440918, + "unpadded_tokens_per_sec": 10032.4462890625 + }, + "step_121900": { + "perplexity": 2.6081103942990858, + "loss": 0.9586259722709656, + "unpadded_tokens_per_sec": 10039.9365234375 + }, + "step_122000": { + "perplexity": 2.6123897044974127, + 
"loss": 0.9602653980255127, + "unpadded_tokens_per_sec": 10039.5390625 + }, + "step_122100": { + "perplexity": 2.6072365699304143, + "loss": 0.9582908749580383, + "unpadded_tokens_per_sec": 10037.1650390625 + }, + "step_122200": { + "perplexity": 2.6078905894939646, + "loss": 0.9585416913032532, + "unpadded_tokens_per_sec": 10032.8916015625 + }, + "step_122300": { + "perplexity": 2.6074230606912794, + "loss": 0.9583624005317688, + "unpadded_tokens_per_sec": 10043.8310546875 + }, + "step_122400": { + "perplexity": 2.6086746033038004, + "loss": 0.9588422775268555, + "unpadded_tokens_per_sec": 10038.41015625 + }, + "step_122500": { + "perplexity": 2.6059593114602557, + "loss": 0.9578008651733398, + "unpadded_tokens_per_sec": 10040.6357421875 + }, + "step_122600": { + "perplexity": 2.6024058290272802, + "loss": 0.9564363360404968, + "unpadded_tokens_per_sec": 10044.171875 + }, + "step_122700": { + "perplexity": 2.606381059146979, + "loss": 0.957962691783905, + "unpadded_tokens_per_sec": 10043.0986328125 + }, + "step_122800": { + "perplexity": 2.6069697559294, + "loss": 0.958188533782959, + "unpadded_tokens_per_sec": 10046.4716796875 + }, + "step_122900": { + "perplexity": 2.60535546897007, + "loss": 0.9575691223144531, + "unpadded_tokens_per_sec": 10050.1728515625 + }, + "step_123000": { + "perplexity": 2.6125247090415376, + "loss": 0.960317075252533, + "unpadded_tokens_per_sec": 10042.4189453125 + }, + "step_123100": { + "perplexity": 2.6050997167736383, + "loss": 0.9574709534645081, + "unpadded_tokens_per_sec": 10035.2568359375 + }, + "step_123200": { + "perplexity": 2.6060226857606987, + "loss": 0.9578251838684082, + "unpadded_tokens_per_sec": 10044.46484375 + }, + "step_123300": { + "perplexity": 2.6078736463282537, + "loss": 0.9585351943969727, + "unpadded_tokens_per_sec": 6609.46923828125 + }, + "step_123400": { + "perplexity": 2.608140241924644, + "loss": 0.9586374163627625, + "unpadded_tokens_per_sec": 10045.9306640625 + }, + "step_123500": { + "perplexity": 
2.6077026664129344, + "loss": 0.9584696292877197, + "unpadded_tokens_per_sec": 10036.80078125 + }, + "step_123600": { + "perplexity": 2.6089258858283837, + "loss": 0.9589385986328125, + "unpadded_tokens_per_sec": 10038.2822265625 + }, + "step_123700": { + "perplexity": 2.600554409142667, + "loss": 0.9557246565818787, + "unpadded_tokens_per_sec": 9108.7216796875 + }, + "step_123800": { + "perplexity": 2.606628391929721, + "loss": 0.9580575823783875, + "unpadded_tokens_per_sec": 10037.1591796875 + }, + "step_123900": { + "perplexity": 2.6083667530063046, + "loss": 0.9587242603302002, + "unpadded_tokens_per_sec": 9534.45703125 + }, + "step_124000": { + "perplexity": 2.609232402415352, + "loss": 0.9590560793876648, + "unpadded_tokens_per_sec": 10039.3955078125 + }, + "step_124100": { + "perplexity": 2.6057238459437704, + "loss": 0.9577105045318604, + "unpadded_tokens_per_sec": 10032.5126953125 + }, + "step_124200": { + "perplexity": 2.602796283966104, + "loss": 0.9565863609313965, + "unpadded_tokens_per_sec": 10037.732421875 + }, + "step_124300": { + "perplexity": 2.60516120681244, + "loss": 0.9574945569038391, + "unpadded_tokens_per_sec": 10030.6796875 + }, + "step_124400": { + "perplexity": 2.603858270152436, + "loss": 0.9569942951202393, + "unpadded_tokens_per_sec": 10043.8251953125 + }, + "step_124500": { + "perplexity": 2.6049087342413793, + "loss": 0.9573976397514343, + "unpadded_tokens_per_sec": 10032.7109375 + }, + "step_124600": { + "perplexity": 2.607126857431655, + "loss": 0.9582487940788269, + "unpadded_tokens_per_sec": 10050.5029296875 + }, + "step_124700": { + "perplexity": 2.60893817068126, + "loss": 0.9589433073997498, + "unpadded_tokens_per_sec": 10036.90234375 + }, + "step_124800": { + "perplexity": 2.6072737116098583, + "loss": 0.9583051204681396, + "unpadded_tokens_per_sec": 10047.2001953125 + }, + "step_124900": { + "perplexity": 2.6000661888433183, + "loss": 0.9555369019508362, + "unpadded_tokens_per_sec": 10040.4931640625 + }, + "step_125000": { 
+ "perplexity": 2.600790647677748, + "loss": 0.9558154940605164, + "unpadded_tokens_per_sec": 10036.0234375 + }, + "step_125100": { + "perplexity": 2.605920169279899, + "loss": 0.9577858448028564, + "unpadded_tokens_per_sec": 9965.36328125 + }, + "step_125200": { + "perplexity": 2.6051444366579193, + "loss": 0.9574881196022034, + "unpadded_tokens_per_sec": 10048.6171875 + }, + "step_125300": { + "perplexity": 2.6077381049653097, + "loss": 0.9584832191467285, + "unpadded_tokens_per_sec": 10047.341796875 + }, + "step_125400": { + "perplexity": 2.602689550696003, + "loss": 0.956545352935791, + "unpadded_tokens_per_sec": 10037.67578125 + }, + "step_125500": { + "perplexity": 2.6013086184063408, + "loss": 0.9560146331787109, + "unpadded_tokens_per_sec": 10040.642578125 + }, + "step_125600": { + "perplexity": 2.604389580959084, + "loss": 0.9571983218193054, + "unpadded_tokens_per_sec": 10043.5703125 + }, + "step_125700": { + "perplexity": 2.602247771158294, + "loss": 0.9563755989074707, + "unpadded_tokens_per_sec": 10041.7978515625 + }, + "step_125800": { + "perplexity": 2.602508827740882, + "loss": 0.9564759135246277, + "unpadded_tokens_per_sec": 10036.2763671875 + }, + "step_125900": { + "perplexity": 2.6057578597662525, + "loss": 0.9577235579490662, + "unpadded_tokens_per_sec": 10039.41015625 + }, + "step_126000": { + "perplexity": 2.605257326725061, + "loss": 0.9575314521789551, + "unpadded_tokens_per_sec": 10041.2490234375 + }, + "step_126100": { + "perplexity": 2.6063580670906488, + "loss": 0.9579538702964783, + "unpadded_tokens_per_sec": 10036.796875 + }, + "step_126200": { + "perplexity": 2.601143030195554, + "loss": 0.9559509754180908, + "unpadded_tokens_per_sec": 10038.6689453125 + }, + "step_126300": { + "perplexity": 2.606007618691778, + "loss": 0.957819402217865, + "unpadded_tokens_per_sec": 10037.4521484375 + }, + "step_126400": { + "perplexity": 2.6017534951119665, + "loss": 0.9561856389045715, + "unpadded_tokens_per_sec": 10041.7568359375 + }, + 
"step_126500": { + "perplexity": 2.601598888342739, + "loss": 0.9561262130737305, + "unpadded_tokens_per_sec": 10039.8779296875 + }, + "step_126600": { + "perplexity": 2.6031321810276626, + "loss": 0.9567154049873352, + "unpadded_tokens_per_sec": 10044.5009765625 + }, + "step_126700": { + "perplexity": 2.605562946390616, + "loss": 0.957648754119873, + "unpadded_tokens_per_sec": 10042.888671875 + }, + "step_126800": { + "perplexity": 2.6062637707111063, + "loss": 0.9579176902770996, + "unpadded_tokens_per_sec": 10042.89453125 + }, + "step_126900": { + "perplexity": 2.6037283692792434, + "loss": 0.9569444060325623, + "unpadded_tokens_per_sec": 10042.595703125 + }, + "step_127000": { + "perplexity": 2.6004893078063964, + "loss": 0.955699622631073, + "unpadded_tokens_per_sec": 10041.5322265625 + }, + "step_127100": { + "perplexity": 2.6042930273776665, + "loss": 0.9571612477302551, + "unpadded_tokens_per_sec": 10037.2001953125 + }, + "step_127200": { + "perplexity": 2.6032488630368973, + "loss": 0.9567602276802063, + "unpadded_tokens_per_sec": 10034.8935546875 + }, + "step_127300": { + "perplexity": 2.599931983072352, + "loss": 0.9554852843284607, + "unpadded_tokens_per_sec": 10035.931640625 + }, + "step_127400": { + "perplexity": 2.6026264125806877, + "loss": 0.9565210938453674, + "unpadded_tokens_per_sec": 10035.3095703125 + }, + "step_127500": { + "perplexity": 2.601659210251936, + "loss": 0.9561493992805481, + "unpadded_tokens_per_sec": 10038.74609375 + }, + "step_127600": { + "perplexity": 2.5980855751620404, + "loss": 0.9547748565673828, + "unpadded_tokens_per_sec": 10036.6005859375 + }, + "step_127700": { + "perplexity": 2.5990943609015362, + "loss": 0.9551630616188049, + "unpadded_tokens_per_sec": 10042.1357421875 + }, + "step_127800": { + "perplexity": 2.600365619757336, + "loss": 0.9556520581245422, + "unpadded_tokens_per_sec": 10035.5263671875 + }, + "step_127900": { + "perplexity": 2.6004226581266994, + "loss": 0.9556739926338196, + 
"unpadded_tokens_per_sec": 10039.6962890625 + }, + "step_128000": { + "perplexity": 2.6018242109993293, + "loss": 0.9562128186225891, + "unpadded_tokens_per_sec": 10043.1005859375 + }, + "step_128100": { + "perplexity": 2.6013652122997533, + "loss": 0.956036388874054, + "unpadded_tokens_per_sec": 10040.0263671875 + }, + "step_128200": { + "perplexity": 2.604856100053479, + "loss": 0.9573774337768555, + "unpadded_tokens_per_sec": 10036.02734375 + }, + "step_128300": { + "perplexity": 2.5992466498513864, + "loss": 0.9552216529846191, + "unpadded_tokens_per_sec": 10030.6259765625 + }, + "step_128400": { + "perplexity": 2.599566594153687, + "loss": 0.9553447365760803, + "unpadded_tokens_per_sec": 10034.046875 + }, + "step_128500": { + "perplexity": 2.6022409465008667, + "loss": 0.9563729763031006, + "unpadded_tokens_per_sec": 10037.4150390625 + }, + "step_128600": { + "perplexity": 2.5985286615863705, + "loss": 0.9549453854560852, + "unpadded_tokens_per_sec": 10044.1689453125 + }, + "step_128700": { + "perplexity": 2.6021882111151644, + "loss": 0.956352710723877, + "unpadded_tokens_per_sec": 10037.5830078125 + }, + "step_128800": { + "perplexity": 2.598627015027557, + "loss": 0.9549832344055176, + "unpadded_tokens_per_sec": 10039.5771484375 + }, + "step_128900": { + "perplexity": 2.596617936312469, + "loss": 0.9542098045349121, + "unpadded_tokens_per_sec": 9385.08984375 + }, + "step_129000": { + "perplexity": 2.5952939029136872, + "loss": 0.9536997675895691, + "unpadded_tokens_per_sec": 6639.15869140625 + }, + "step_129100": { + "perplexity": 2.5984620621573864, + "loss": 0.9549197554588318, + "unpadded_tokens_per_sec": 10045.3974609375 + }, + "step_129200": { + "perplexity": 2.602695290600619, + "loss": 0.9565475583076477, + "unpadded_tokens_per_sec": 10029.6923828125 + }, + "step_129300": { + "perplexity": 2.6029203979235307, + "loss": 0.9566340446472168, + "unpadded_tokens_per_sec": 10037.0048828125 + }, + "step_129400": { + "perplexity": 2.599154159974546, + 
"loss": 0.9551860690116882, + "unpadded_tokens_per_sec": 8625.8486328125 + }, + "step_129500": { + "perplexity": 2.5980618820010015, + "loss": 0.9547657370567322, + "unpadded_tokens_per_sec": 10040.8134765625 + }, + "step_129600": { + "perplexity": 2.598825901706426, + "loss": 0.9550597667694092, + "unpadded_tokens_per_sec": 10041.6162109375 + }, + "step_129700": { + "perplexity": 2.6047308070181745, + "loss": 0.9573293328285217, + "unpadded_tokens_per_sec": 10041.5009765625 + }, + "step_129800": { + "perplexity": 2.596313984861837, + "loss": 0.9540927410125732, + "unpadded_tokens_per_sec": 10034.908203125 + }, + "step_129900": { + "perplexity": 2.6027917849463025, + "loss": 0.956584632396698, + "unpadded_tokens_per_sec": 10042.0390625 + }, + "step_130000": { + "perplexity": 2.5984295374752246, + "loss": 0.954907238483429, + "unpadded_tokens_per_sec": 10037.6748046875 + }, + "step_130100": { + "perplexity": 2.5988358154593962, + "loss": 0.9550635814666748, + "unpadded_tokens_per_sec": 9969.9638671875 + }, + "step_130200": { + "perplexity": 2.6006923673602285, + "loss": 0.9557777047157288, + "unpadded_tokens_per_sec": 10035.5556640625 + }, + "step_130300": { + "perplexity": 2.600959934156308, + "loss": 0.9558805823326111, + "unpadded_tokens_per_sec": 10032.52734375 + }, + "step_130400": { + "perplexity": 2.596233360127479, + "loss": 0.9540616869926453, + "unpadded_tokens_per_sec": 10033.6357421875 + }, + "step_130500": { + "perplexity": 2.5965178017364434, + "loss": 0.9541712403297424, + "unpadded_tokens_per_sec": 10041.626953125 + }, + "step_130600": { + "perplexity": 2.5927310017200775, + "loss": 0.9527117609977722, + "unpadded_tokens_per_sec": 10039.4189453125 + }, + "step_130700": { + "perplexity": 2.597721219914758, + "loss": 0.9546346068382263, + "unpadded_tokens_per_sec": 10033.4658203125 + }, + "step_130800": { + "perplexity": 2.598995834859906, + "loss": 0.9551251530647278, + "unpadded_tokens_per_sec": 10039.0654296875 + }, + "step_130900": { + 
"perplexity": 2.5959789675911984, + "loss": 0.9539636969566345, + "unpadded_tokens_per_sec": 10038.2119140625 + }, + "step_131000": { + "perplexity": 2.597634203399395, + "loss": 0.9546011090278625, + "unpadded_tokens_per_sec": 10033.638671875 + }, + "step_131100": { + "perplexity": 2.5971932819544197, + "loss": 0.9544313549995422, + "unpadded_tokens_per_sec": 10036.1875 + }, + "step_131200": { + "perplexity": 2.597517463400455, + "loss": 0.9545561671257019, + "unpadded_tokens_per_sec": 10034.51953125 + }, + "step_131300": { + "perplexity": 2.5942448420674857, + "loss": 0.9532954692840576, + "unpadded_tokens_per_sec": 10036.0439453125 + }, + "step_131400": { + "perplexity": 2.5965151707409158, + "loss": 0.9541702270507812, + "unpadded_tokens_per_sec": 10045.224609375 + }, + "step_131500": { + "perplexity": 2.597822329954041, + "loss": 0.9546735286712646, + "unpadded_tokens_per_sec": 10041.9248046875 + }, + "step_131600": { + "perplexity": 2.5999468600450557, + "loss": 0.9554910063743591, + "unpadded_tokens_per_sec": 10037.0810546875 + }, + "step_131700": { + "perplexity": 2.5993137341830828, + "loss": 0.9552474617958069, + "unpadded_tokens_per_sec": 10028.1279296875 + }, + "step_131800": { + "perplexity": 2.5968258013997407, + "loss": 0.9542898535728455, + "unpadded_tokens_per_sec": 10033.1796875 + }, + "step_131900": { + "perplexity": 2.596273749556679, + "loss": 0.9540772438049316, + "unpadded_tokens_per_sec": 10040.2490234375 + }, + "step_132000": { + "perplexity": 2.595818205601211, + "loss": 0.9539017677307129, + "unpadded_tokens_per_sec": 10042.19140625 + }, + "step_132100": { + "perplexity": 2.59638996938875, + "loss": 0.954122006893158, + "unpadded_tokens_per_sec": 10041.28125 + }, + "step_132200": { + "perplexity": 2.5956320807193545, + "loss": 0.9538300633430481, + "unpadded_tokens_per_sec": 10036.3857421875 + }, + "step_132300": { + "perplexity": 2.5993236497969896, + "loss": 0.9552512764930725, + "unpadded_tokens_per_sec": 10040.0380859375 + }, + 
"step_132400": { + "perplexity": 2.5960529307340474, + "loss": 0.9539921879768372, + "unpadded_tokens_per_sec": 10034.0419921875 + }, + "step_132500": { + "perplexity": 2.5931628191116496, + "loss": 0.9528782963752747, + "unpadded_tokens_per_sec": 10032.9833984375 + }, + "step_132600": { + "perplexity": 2.5968586155775037, + "loss": 0.9543024897575378, + "unpadded_tokens_per_sec": 10040.8544921875 + }, + "step_132700": { + "perplexity": 2.595959780843981, + "loss": 0.9539563059806824, + "unpadded_tokens_per_sec": 10035.58984375 + }, + "step_132800": { + "perplexity": 2.5963896598749647, + "loss": 0.9541218876838684, + "unpadded_tokens_per_sec": 10030.4921875 + }, + "step_132900": { + "perplexity": 2.5955671026068687, + "loss": 0.9538050293922424, + "unpadded_tokens_per_sec": 10039.53125 + }, + "step_133000": { + "perplexity": 2.600903039028085, + "loss": 0.9558587074279785, + "unpadded_tokens_per_sec": 10043.6337890625 + }, + "step_133100": { + "perplexity": 2.596399254819463, + "loss": 0.9541255831718445, + "unpadded_tokens_per_sec": 10039.8623046875 + }, + "step_133200": { + "perplexity": 2.5949291658209557, + "loss": 0.9535592198371887, + "unpadded_tokens_per_sec": 10035.5419921875 + }, + "step_133300": { + "perplexity": 2.596707085644884, + "loss": 0.9542441368103027, + "unpadded_tokens_per_sec": 10047.4091796875 + }, + "step_133400": { + "perplexity": 2.597026562554413, + "loss": 0.9543671607971191, + "unpadded_tokens_per_sec": 10037.8466796875 + }, + "step_133500": { + "perplexity": 2.597755438951504, + "loss": 0.9546477794647217, + "unpadded_tokens_per_sec": 10035.3935546875 + }, + "step_133600": { + "perplexity": 2.5941049065584787, + "loss": 0.9532415270805359, + "unpadded_tokens_per_sec": 10031.6494140625 + }, + "step_133700": { + "perplexity": 2.5957078905731987, + "loss": 0.953859269618988, + "unpadded_tokens_per_sec": 10032.2060546875 + }, + "step_133800": { + "perplexity": 2.5942255135092065, + "loss": 0.9532880187034607, + "unpadded_tokens_per_sec": 
10039.8623046875 + }, + "step_133900": { + "perplexity": 2.599004974697164, + "loss": 0.9551286697387695, + "unpadded_tokens_per_sec": 10034.4765625 + }, + "step_134000": { + "perplexity": 2.594783470930262, + "loss": 0.9535030722618103, + "unpadded_tokens_per_sec": 10041.0029296875 + }, + "step_134100": { + "perplexity": 2.593189713482583, + "loss": 0.9528886675834656, + "unpadded_tokens_per_sec": 10039.8154296875 + }, + "step_134200": { + "perplexity": 2.5950634227073737, + "loss": 0.9536109566688538, + "unpadded_tokens_per_sec": 10033.3330078125 + }, + "step_134300": { + "perplexity": 2.5958836541800028, + "loss": 0.9539269804954529, + "unpadded_tokens_per_sec": 10025.3974609375 + }, + "step_134400": { + "perplexity": 2.593497318367126, + "loss": 0.9530072808265686, + "unpadded_tokens_per_sec": 10040.80859375 + }, + "step_134500": { + "perplexity": 2.5938485581024113, + "loss": 0.9531427025794983, + "unpadded_tokens_per_sec": 10031.474609375 + }, + "step_134600": { + "perplexity": 2.5902660438683798, + "loss": 0.9517605900764465, + "unpadded_tokens_per_sec": 10038.66015625 + }, + "step_134700": { + "perplexity": 2.595180361784167, + "loss": 0.953656017780304, + "unpadded_tokens_per_sec": 10029.5791015625 + }, + "step_134800": { + "perplexity": 2.59706201081601, + "loss": 0.9543808102607727, + "unpadded_tokens_per_sec": 9287.5732421875 + }, + "step_134900": { + "perplexity": 2.5917986822048618, + "loss": 0.9523521065711975, + "unpadded_tokens_per_sec": 6417.08984375 + }, + "step_135000": { + "perplexity": 2.592130842453104, + "loss": 0.9524802565574646, + "unpadded_tokens_per_sec": 10042.736328125 + }, + "step_135100": { + "perplexity": 2.5931467444484095, + "loss": 0.952872097492218, + "unpadded_tokens_per_sec": 9434.8818359375 + }, + "step_135200": { + "perplexity": 2.595788653710417, + "loss": 0.9538903832435608, + "unpadded_tokens_per_sec": 9013.8857421875 + }, + "step_135300": { + "perplexity": 2.5885885018945154, + "loss": 0.9511127471923828, + 
"unpadded_tokens_per_sec": 10041.3857421875 + }, + "step_135400": { + "perplexity": 2.594806051555983, + "loss": 0.9535117745399475, + "unpadded_tokens_per_sec": 10036.765625 + }, + "step_135500": { + "perplexity": 2.596025387727488, + "loss": 0.9539815783500671, + "unpadded_tokens_per_sec": 10031.8759765625 + }, + "step_135600": { + "perplexity": 2.5861701809705107, + "loss": 0.9501780867576599, + "unpadded_tokens_per_sec": 10039.052734375 + }, + "step_135700": { + "perplexity": 2.586146442325196, + "loss": 0.9501689076423645, + "unpadded_tokens_per_sec": 10033.9248046875 + }, + "step_135800": { + "perplexity": 2.592435540350989, + "loss": 0.9525977969169617, + "unpadded_tokens_per_sec": 10035.7353515625 + }, + "step_135900": { + "perplexity": 2.591054179939894, + "loss": 0.9520648121833801, + "unpadded_tokens_per_sec": 10038.50390625 + }, + "step_136000": { + "perplexity": 2.5954402452658054, + "loss": 0.9537561535835266, + "unpadded_tokens_per_sec": 10031.7578125 + }, + "step_136100": { + "perplexity": 2.5946622194240376, + "loss": 0.9534563422203064, + "unpadded_tokens_per_sec": 10032.7958984375 + }, + "step_136200": { + "perplexity": 2.597366359468501, + "loss": 0.9544979929924011, + "unpadded_tokens_per_sec": 10027.1591796875 + }, + "step_136300": { + "perplexity": 2.5930031588481843, + "loss": 0.9528167247772217, + "unpadded_tokens_per_sec": 10035.0869140625 + }, + "step_136400": { + "perplexity": 2.5931294333840498, + "loss": 0.9528654217720032, + "unpadded_tokens_per_sec": 10032.76953125 + }, + "step_136500": { + "perplexity": 2.5907431587362697, + "loss": 0.9519447684288025, + "unpadded_tokens_per_sec": 10032.6455078125 + }, + "step_136600": { + "perplexity": 2.5920168217188078, + "loss": 0.9524362683296204, + "unpadded_tokens_per_sec": 10041.1298828125 + }, + "step_136700": { + "perplexity": 2.590656067136474, + "loss": 0.9519111514091492, + "unpadded_tokens_per_sec": 10035.0712890625 + }, + "step_136800": { + "perplexity": 2.5926942217440923, + "loss": 
0.9526975750923157, + "unpadded_tokens_per_sec": 10044.2490234375 + }, + "step_136900": { + "perplexity": 2.5923341764258367, + "loss": 0.952558696269989, + "unpadded_tokens_per_sec": 10030.8720703125 + }, + "step_137000": { + "perplexity": 2.5917744284496984, + "loss": 0.9523427486419678, + "unpadded_tokens_per_sec": 10024.560546875 + }, + "step_137100": { + "perplexity": 2.590133887784184, + "loss": 0.9517095685005188, + "unpadded_tokens_per_sec": 10028.5908203125 + }, + "step_137200": { + "perplexity": 2.591486644845743, + "loss": 0.9522317051887512, + "unpadded_tokens_per_sec": 10030.06640625 + }, + "step_137300": { + "perplexity": 2.5915797886980694, + "loss": 0.9522676467895508, + "unpadded_tokens_per_sec": 10044.51171875 + }, + "step_137400": { + "perplexity": 2.591019585865301, + "loss": 0.9520514607429504, + "unpadded_tokens_per_sec": 10031.2265625 + }, + "step_137500": { + "perplexity": 2.591190553045484, + "loss": 0.9521174430847168, + "unpadded_tokens_per_sec": 10038.35546875 + }, + "step_137600": { + "perplexity": 2.5888151666160097, + "loss": 0.9512003064155579, + "unpadded_tokens_per_sec": 10036.0966796875 + }, + "step_137700": { + "perplexity": 2.590890325486377, + "loss": 0.9520015716552734, + "unpadded_tokens_per_sec": 10038.1083984375 + }, + "step_137800": { + "perplexity": 2.590206140508745, + "loss": 0.9517374634742737, + "unpadded_tokens_per_sec": 10039.4228515625 + }, + "step_137900": { + "perplexity": 2.586539853705309, + "loss": 0.9503210186958313, + "unpadded_tokens_per_sec": 10032.8720703125 + }, + "step_138000": { + "perplexity": 2.590065650950493, + "loss": 0.9516832232475281, + "unpadded_tokens_per_sec": 10041.5205078125 + }, + "step_138100": { + "perplexity": 2.5931597278225187, + "loss": 0.9528771042823792, + "unpadded_tokens_per_sec": 10040.005859375 + }, + "step_138200": { + "perplexity": 2.5929061001045217, + "loss": 0.9527792930603027, + "unpadded_tokens_per_sec": 10039.5205078125 + }, + "step_138300": { + "perplexity": 
2.5941639723388694, + "loss": 0.9532642960548401, + "unpadded_tokens_per_sec": 10035.2724609375 + }, + "step_138400": { + "perplexity": 2.5884595170813327, + "loss": 0.9510629177093506, + "unpadded_tokens_per_sec": 10035.5068359375 + }, + "step_138500": { + "perplexity": 2.5897643188316954, + "loss": 0.9515668749809265, + "unpadded_tokens_per_sec": 10038.603515625 + }, + "step_138600": { + "perplexity": 2.5924598002928194, + "loss": 0.9526071548461914, + "unpadded_tokens_per_sec": 10035.638671875 + }, + "step_138700": { + "perplexity": 2.59371049913491, + "loss": 0.9530894756317139, + "unpadded_tokens_per_sec": 10043.0146484375 + }, + "step_138800": { + "perplexity": 2.5908680877917707, + "loss": 0.9519929885864258, + "unpadded_tokens_per_sec": 10045.2041015625 + }, + "step_138900": { + "perplexity": 2.593491753331587, + "loss": 0.9530051350593567, + "unpadded_tokens_per_sec": 10036.7587890625 + }, + "step_139000": { + "perplexity": 2.590274381043718, + "loss": 0.9517638087272644, + "unpadded_tokens_per_sec": 10037.6806640625 + }, + "step_139100": { + "perplexity": 2.590682626675772, + "loss": 0.9519214034080505, + "unpadded_tokens_per_sec": 10045.73046875 + }, + "step_139200": { + "perplexity": 2.59238454885665, + "loss": 0.9525781273841858, + "unpadded_tokens_per_sec": 10042.6357421875 + }, + "step_139300": { + "perplexity": 2.5913190561294868, + "loss": 0.9521670341491699, + "unpadded_tokens_per_sec": 10037.6611328125 + }, + "step_139400": { + "perplexity": 2.590135122856561, + "loss": 0.951710045337677, + "unpadded_tokens_per_sec": 10038.869140625 + }, + "step_139500": { + "perplexity": 2.585918778202442, + "loss": 0.9500808715820312, + "unpadded_tokens_per_sec": 10026.65625 + }, + "step_139600": { + "perplexity": 2.588426346188719, + "loss": 0.9510501027107239, + "unpadded_tokens_per_sec": 10035.6826171875 + }, + "step_139700": { + "perplexity": 2.5885280201770313, + "loss": 0.9510893821716309, + "unpadded_tokens_per_sec": 10033.7548828125 + }, + 
"step_139800": { + "perplexity": 2.5900116185340227, + "loss": 0.9516623616218567, + "unpadded_tokens_per_sec": 10038.537109375 + }, + "step_139900": { + "perplexity": 2.589853232859788, + "loss": 0.9516012072563171, + "unpadded_tokens_per_sec": 10043.3095703125 + }, + "step_140000": { + "perplexity": 2.5874025775217757, + "loss": 0.9506545066833496, + "unpadded_tokens_per_sec": 10028.9775390625 + }, + "step_140100": { + "perplexity": 2.590397897896048, + "loss": 0.9518114924430847, + "unpadded_tokens_per_sec": 9962.3271484375 + }, + "step_140200": { + "perplexity": 2.5886543853734176, + "loss": 0.9511381983757019, + "unpadded_tokens_per_sec": 10045.884765625 + }, + "step_140300": { + "perplexity": 2.593063745128598, + "loss": 0.9528400897979736, + "unpadded_tokens_per_sec": 10035.2607421875 + }, + "step_140400": { + "perplexity": 2.590206603673737, + "loss": 0.951737642288208, + "unpadded_tokens_per_sec": 10035.337890625 + }, + "step_140500": { + "perplexity": 2.5843660486365323, + "loss": 0.9494802355766296, + "unpadded_tokens_per_sec": 10035.27734375 + }, + "step_140600": { + "perplexity": 2.587283984128056, + "loss": 0.9506086707115173, + "unpadded_tokens_per_sec": 10039.4482421875 + }, + "step_140700": { + "perplexity": 2.5858095003774153, + "loss": 0.9500386118888855, + "unpadded_tokens_per_sec": 10042.45703125 + }, + "step_140800": { + "perplexity": 2.589841809706115, + "loss": 0.9515967965126038, + "unpadded_tokens_per_sec": 10039.064453125 + }, + "step_140900": { + "perplexity": 2.586627115277913, + "loss": 0.9503547549247742, + "unpadded_tokens_per_sec": 8843.6611328125 + }, + "step_141000": { + "perplexity": 2.5843853037358, + "loss": 0.9494876861572266, + "unpadded_tokens_per_sec": 6082.7265625 + }, + "step_141100": { + "perplexity": 2.5865074782522033, + "loss": 0.9503085017204285, + "unpadded_tokens_per_sec": 10043.1875 + }, + "step_141200": { + "perplexity": 2.587951663288692, + "loss": 0.95086669921875, + "unpadded_tokens_per_sec": 10037.9794921875 
+ }, + "step_141300": { + "perplexity": 2.587173106535266, + "loss": 0.9505658149719238, + "unpadded_tokens_per_sec": 10040.8427734375 + }, + "step_141400": { + "perplexity": 2.584769049271105, + "loss": 0.9496361613273621, + "unpadded_tokens_per_sec": 8985.0283203125 + }, + "step_141500": { + "perplexity": 2.5875271912613966, + "loss": 0.9507026672363281, + "unpadded_tokens_per_sec": 10037.0244140625 + }, + "step_141600": { + "perplexity": 2.58779741394614, + "loss": 0.9508070945739746, + "unpadded_tokens_per_sec": 10039.7177734375 + }, + "step_141700": { + "perplexity": 2.5881119380947184, + "loss": 0.9509286284446716, + "unpadded_tokens_per_sec": 10042.5048828125 + }, + "step_141800": { + "perplexity": 2.5883285331012233, + "loss": 0.9510123133659363, + "unpadded_tokens_per_sec": 10035.1748046875 + }, + "step_141900": { + "perplexity": 2.5878475439740085, + "loss": 0.9508264660835266, + "unpadded_tokens_per_sec": 10041.6708984375 + }, + "step_142000": { + "perplexity": 2.5859513004214616, + "loss": 0.9500934481620789, + "unpadded_tokens_per_sec": 10039.6318359375 + }, + "step_142100": { + "perplexity": 2.5866385242523617, + "loss": 0.9503591656684875, + "unpadded_tokens_per_sec": 10032.0537109375 + }, + "step_142200": { + "perplexity": 2.586863938903885, + "loss": 0.9504463076591492, + "unpadded_tokens_per_sec": 10031.7509765625 + }, + "step_142300": { + "perplexity": 2.5890253391131908, + "loss": 0.9512814879417419, + "unpadded_tokens_per_sec": 10039.7861328125 + }, + "step_142400": { + "perplexity": 2.586052414767153, + "loss": 0.9501325488090515, + "unpadded_tokens_per_sec": 10034.2666015625 + }, + "step_142500": { + "perplexity": 2.5892410845758507, + "loss": 0.9513648152351379, + "unpadded_tokens_per_sec": 10038.7353515625 + }, + "step_142600": { + "perplexity": 2.5869209895022514, + "loss": 0.9504683613777161, + "unpadded_tokens_per_sec": 10039.9892578125 + }, + "step_142700": { + "perplexity": 2.58629781846122, + "loss": 0.9502274394035339, + 
"unpadded_tokens_per_sec": 10030.5126953125 + }, + "step_142800": { + "perplexity": 2.5862484892155813, + "loss": 0.9502083659172058, + "unpadded_tokens_per_sec": 10039.4609375 + }, + "step_142900": { + "perplexity": 2.5845894166083085, + "loss": 0.949566662311554, + "unpadded_tokens_per_sec": 10043.89453125 + }, + "step_143000": { + "perplexity": 2.586865480795406, + "loss": 0.9504469037055969, + "unpadded_tokens_per_sec": 10034.9267578125 + }, + "step_143100": { + "perplexity": 2.584684315327456, + "loss": 0.9496033787727356, + "unpadded_tokens_per_sec": 10030.357421875 + }, + "step_143200": { + "perplexity": 2.58758441071915, + "loss": 0.9507247805595398, + "unpadded_tokens_per_sec": 10033.7490234375 + }, + "step_143300": { + "perplexity": 2.587067168128435, + "loss": 0.9505248665809631, + "unpadded_tokens_per_sec": 10034.9306640625 + }, + "step_143400": { + "perplexity": 2.5883720394124436, + "loss": 0.9510291218757629, + "unpadded_tokens_per_sec": 10029.134765625 + }, + "step_143500": { + "perplexity": 2.586797021697336, + "loss": 0.9504204392433167, + "unpadded_tokens_per_sec": 10025.064453125 + }, + "step_143600": { + "perplexity": 2.583884563765394, + "loss": 0.9492939114570618, + "unpadded_tokens_per_sec": 10034.7861328125 + }, + "step_143700": { + "perplexity": 2.5840088540525126, + "loss": 0.9493420124053955, + "unpadded_tokens_per_sec": 10030.8125 + }, + "step_143800": { + "perplexity": 2.586289185775313, + "loss": 0.9502241015434265, + "unpadded_tokens_per_sec": 10032.7236328125 + }, + "step_143900": { + "perplexity": 2.5885269401592046, + "loss": 0.9510889649391174, + "unpadded_tokens_per_sec": 10038.951171875 + }, + "step_144000": { + "perplexity": 2.586513953310405, + "loss": 0.950311005115509, + "unpadded_tokens_per_sec": 10037.4384765625 + }, + "step_144100": { + "perplexity": 2.5882915070293886, + "loss": 0.9509980082511902, + "unpadded_tokens_per_sec": 10032.2294921875 + }, + "step_144200": { + "perplexity": 2.584087712947877, + "loss": 
0.9493725299835205, + "unpadded_tokens_per_sec": 10035.1669921875 + }, + "step_144300": { + "perplexity": 2.585938661406238, + "loss": 0.9500885605812073, + "unpadded_tokens_per_sec": 10032.162109375 + }, + "step_144400": { + "perplexity": 2.5838439050435977, + "loss": 0.9492781758308411, + "unpadded_tokens_per_sec": 10031.9453125 + }, + "step_144500": { + "perplexity": 2.5833311056396595, + "loss": 0.949079692363739, + "unpadded_tokens_per_sec": 10031.7890625 + }, + "step_144600": { + "perplexity": 2.586721163731776, + "loss": 0.9503911137580872, + "unpadded_tokens_per_sec": 10037.2177734375 + }, + "step_144700": { + "perplexity": 2.5878484694605755, + "loss": 0.9508268237113953, + "unpadded_tokens_per_sec": 10042.1552734375 + }, + "step_144800": { + "perplexity": 2.585309254942553, + "loss": 0.9498451352119446, + "unpadded_tokens_per_sec": 10037.9267578125 + }, + "step_144900": { + "perplexity": 2.58332448457123, + "loss": 0.9490771293640137, + "unpadded_tokens_per_sec": 10037.3115234375 + }, + "step_145000": { + "perplexity": 2.5839794366043645, + "loss": 0.9493306279182434, + "unpadded_tokens_per_sec": 10038.6748046875 + }, + "step_145100": { + "perplexity": 2.5828769089003543, + "loss": 0.9489038586616516, + "unpadded_tokens_per_sec": 9969.396484375 + }, + "step_145200": { + "perplexity": 2.5832485744643945, + "loss": 0.9490477442741394, + "unpadded_tokens_per_sec": 10038.5927734375 + }, + "step_145300": { + "perplexity": 2.5853391498247107, + "loss": 0.949856698513031, + "unpadded_tokens_per_sec": 10037.765625 + }, + "step_145400": { + "perplexity": 2.584810030684089, + "loss": 0.9496520161628723, + "unpadded_tokens_per_sec": 10035.9501953125 + }, + "step_145500": { + "perplexity": 2.5853265138014154, + "loss": 0.9498518109321594, + "unpadded_tokens_per_sec": 10045.4365234375 + }, + "step_145600": { + "perplexity": 2.583218549784221, + "loss": 0.9490361213684082, + "unpadded_tokens_per_sec": 10037.2314453125 + }, + "step_145700": { + "perplexity": 
2.5890062037598844, + "loss": 0.9512740969657898, + "unpadded_tokens_per_sec": 10040.1923828125 + }, + "step_145800": { + "perplexity": 2.5836547887923316, + "loss": 0.9492049813270569, + "unpadded_tokens_per_sec": 10036.400390625 + }, + "step_145900": { + "perplexity": 2.5828188698517867, + "loss": 0.9488813877105713, + "unpadded_tokens_per_sec": 10035.1787109375 + }, + "step_146000": { + "perplexity": 2.587000399861971, + "loss": 0.9504990577697754, + "unpadded_tokens_per_sec": 10040.61328125 + }, + "step_146100": { + "perplexity": 2.5849952254719266, + "loss": 0.9497236609458923, + "unpadded_tokens_per_sec": 10031.546875 + }, + "step_146200": { + "perplexity": 2.5825891896468085, + "loss": 0.9487924575805664, + "unpadded_tokens_per_sec": 10041.95703125 + }, + "step_146300": { + "perplexity": 2.584506229038678, + "loss": 0.9495344758033752, + "unpadded_tokens_per_sec": 10035.9052734375 + }, + "step_146400": { + "perplexity": 2.584325382338329, + "loss": 0.9494644999504089, + "unpadded_tokens_per_sec": 10041.21875 + }, + "step_146500": { + "perplexity": 2.5863326578078563, + "loss": 0.9502409100532532, + "unpadded_tokens_per_sec": 10040.9951171875 + }, + "step_146600": { + "perplexity": 2.5819821440783897, + "loss": 0.9485573768615723, + "unpadded_tokens_per_sec": 10038.009765625 + }, + "step_146700": { + "perplexity": 2.5837555053335004, + "loss": 0.94924396276474, + "unpadded_tokens_per_sec": 10034.4794921875 + }, + "step_146800": { + "perplexity": 2.579107235880766, + "loss": 0.9474433064460754, + "unpadded_tokens_per_sec": 10041.6328125 + }, + "step_146900": { + "perplexity": 2.583224246747992, + "loss": 0.9490383267402649, + "unpadded_tokens_per_sec": 10041.7548828125 + }, + "step_147000": { + "perplexity": 2.5831675856132055, + "loss": 0.9490163922309875, + "unpadded_tokens_per_sec": 10034.09765625 + }, + "step_147100": { + "perplexity": 2.5834977157847168, + "loss": 0.949144184589386, + "unpadded_tokens_per_sec": 10036.9892578125 + }, + "step_147200": { + 
"perplexity": 2.5890173145929727, + "loss": 0.9512783885002136, + "unpadded_tokens_per_sec": 9426.4541015625 + }, + "step_147300": { + "perplexity": 2.5810156908533, + "loss": 0.948183000087738, + "unpadded_tokens_per_sec": 10028.2275390625 + }, + "step_147400": { + "perplexity": 2.579839694204298, + "loss": 0.9477272629737854, + "unpadded_tokens_per_sec": 9299.0224609375 + }, + "step_147500": { + "perplexity": 2.5846645958262995, + "loss": 0.9495957493782043, + "unpadded_tokens_per_sec": 5914.12158203125 + }, + "step_147600": { + "perplexity": 2.584228802444696, + "loss": 0.9494271278381348, + "unpadded_tokens_per_sec": 10033.2236328125 + }, + "step_147700": { + "perplexity": 2.5812275379476595, + "loss": 0.9482650756835938, + "unpadded_tokens_per_sec": 10034.2607421875 + }, + "step_147800": { + "perplexity": 2.5842052356500673, + "loss": 0.9494180083274841, + "unpadded_tokens_per_sec": 10040.6240234375 + }, + "step_147900": { + "perplexity": 2.5821956096062904, + "loss": 0.9486400485038757, + "unpadded_tokens_per_sec": 8926.30859375 + }, + "step_148000": { + "perplexity": 2.5792047004948975, + "loss": 0.947481095790863, + "unpadded_tokens_per_sec": 10031.62109375 + }, + "step_148100": { + "perplexity": 2.578684675608163, + "loss": 0.9472794532775879, + "unpadded_tokens_per_sec": 10039.4482421875 + }, + "step_148200": { + "perplexity": 2.5796041286631755, + "loss": 0.9476359486579895, + "unpadded_tokens_per_sec": 10030.3974609375 + }, + "step_148300": { + "perplexity": 2.5784060296921893, + "loss": 0.9471713900566101, + "unpadded_tokens_per_sec": 10041.4716796875 + }, + "step_148400": { + "perplexity": 2.5828687494858276, + "loss": 0.9489006996154785, + "unpadded_tokens_per_sec": 10033.7255859375 + }, + "step_148500": { + "perplexity": 2.585282134111407, + "loss": 0.9498346447944641, + "unpadded_tokens_per_sec": 10034.6064453125 + }, + "step_148600": { + "perplexity": 2.5804385457472523, + "loss": 0.9479593634605408, + "unpadded_tokens_per_sec": 10032.7314453125 + 
}, + "step_148700": { + "perplexity": 2.5834487479173602, + "loss": 0.9491252303123474, + "unpadded_tokens_per_sec": 10027.8974609375 + }, + "step_148800": { + "perplexity": 2.5868876841352137, + "loss": 0.9504554867744446, + "unpadded_tokens_per_sec": 10030.466796875 + }, + "step_148900": { + "perplexity": 2.5826526113617674, + "loss": 0.9488170146942139, + "unpadded_tokens_per_sec": 10029.2646484375 + }, + "step_149000": { + "perplexity": 2.5813923199313304, + "loss": 0.9483289122581482, + "unpadded_tokens_per_sec": 10028.2177734375 + }, + "step_149100": { + "perplexity": 2.583223938803683, + "loss": 0.9490382075309753, + "unpadded_tokens_per_sec": 10032.2958984375 + }, + "step_149200": { + "perplexity": 2.5852086319256085, + "loss": 0.9498062133789062, + "unpadded_tokens_per_sec": 10031.4111328125 + }, + "step_149300": { + "perplexity": 2.581304311811477, + "loss": 0.9482948184013367, + "unpadded_tokens_per_sec": 10035.75390625 + }, + "step_149400": { + "perplexity": 2.5840883290424714, + "loss": 0.9493727684020996, + "unpadded_tokens_per_sec": 10029.365234375 + }, + "step_149500": { + "perplexity": 2.583610283803961, + "loss": 0.9491877555847168, + "unpadded_tokens_per_sec": 10038.7119140625 + }, + "step_149600": { + "perplexity": 2.5803365743026414, + "loss": 0.9479198455810547, + "unpadded_tokens_per_sec": 10037.271484375 + }, + "step_149700": { + "perplexity": 2.578923846363741, + "loss": 0.9473721981048584, + "unpadded_tokens_per_sec": 10037.0712890625 + }, + "step_149800": { + "perplexity": 2.579137673960962, + "loss": 0.947455108165741, + "unpadded_tokens_per_sec": 10034.7900390625 + }, + "step_149900": { + "perplexity": 2.582950036879367, + "loss": 0.9489321708679199, + "unpadded_tokens_per_sec": 10036.8779296875 + }, + "step_150000": { + "perplexity": 2.579225915678186, + "loss": 0.947489321231842, + "unpadded_tokens_per_sec": 10027.3720703125 + }, + "step_150100": { + "perplexity": 2.5788265460727384, + "loss": 0.9473344683647156, + 
"unpadded_tokens_per_sec": 9973.7890625 + }, + "step_150200": { + "perplexity": 2.582344291769748, + "loss": 0.9486976265907288, + "unpadded_tokens_per_sec": 10035.125 + }, + "step_150300": { + "perplexity": 2.581605890571182, + "loss": 0.9484116435050964, + "unpadded_tokens_per_sec": 10046.1826171875 + }, + "step_150400": { + "perplexity": 2.5813858576945843, + "loss": 0.9483264088630676, + "unpadded_tokens_per_sec": 10029.8447265625 + }, + "step_150500": { + "perplexity": 2.5826716997556516, + "loss": 0.948824405670166, + "unpadded_tokens_per_sec": 10042.7099609375 + }, + "step_150600": { + "perplexity": 2.5811056891285498, + "loss": 0.9482178688049316, + "unpadded_tokens_per_sec": 10041.4541015625 + }, + "step_150700": { + "perplexity": 2.582072022149721, + "loss": 0.9485921859741211, + "unpadded_tokens_per_sec": 10040.8564453125 + }, + "step_150800": { + "perplexity": 2.5787822779611136, + "loss": 0.9473173022270203, + "unpadded_tokens_per_sec": 10046.0322265625 + }, + "step_150900": { + "perplexity": 2.5780183116527753, + "loss": 0.9470210075378418, + "unpadded_tokens_per_sec": 10041.4228515625 + }, + "step_151000": { + "perplexity": 2.582782076541557, + "loss": 0.94886714220047, + "unpadded_tokens_per_sec": 10040.4228515625 + }, + "step_151100": { + "perplexity": 2.5797657316884166, + "loss": 0.9476985931396484, + "unpadded_tokens_per_sec": 10043.0048828125 + }, + "step_151200": { + "perplexity": 2.5802487559714207, + "loss": 0.9478858113288879, + "unpadded_tokens_per_sec": 10037.779296875 + }, + "step_151300": { + "perplexity": 2.5805040679874542, + "loss": 0.9479847550392151, + "unpadded_tokens_per_sec": 10037.76171875 + }, + "step_151400": { + "perplexity": 2.58121276808747, + "loss": 0.9482593536376953, + "unpadded_tokens_per_sec": 10037.1357421875 + }, + "step_151500": { + "perplexity": 2.58102984422026, + "loss": 0.9481884837150574, + "unpadded_tokens_per_sec": 10045.234375 + }, + "step_151600": { + "perplexity": 2.5793347615367277, + "loss": 
0.947531521320343, + "unpadded_tokens_per_sec": 10036.4404296875 + }, + "step_151700": { + "perplexity": 2.579176567586194, + "loss": 0.9474701881408691, + "unpadded_tokens_per_sec": 10039.7216796875 + }, + "step_151800": { + "perplexity": 2.5823950857749787, + "loss": 0.9487172961235046, + "unpadded_tokens_per_sec": 10035.4111328125 + }, + "step_151900": { + "perplexity": 2.577158716854473, + "loss": 0.9466875195503235, + "unpadded_tokens_per_sec": 10036.4560546875 + }, + "step_152000": { + "perplexity": 2.5769605667599467, + "loss": 0.9466106295585632, + "unpadded_tokens_per_sec": 10037.380859375 + }, + "step_152100": { + "perplexity": 2.5807758646202354, + "loss": 0.9480900764465332, + "unpadded_tokens_per_sec": 10043.0517578125 + }, + "step_152200": { + "perplexity": 2.5819629068839896, + "loss": 0.9485499262809753, + "unpadded_tokens_per_sec": 10032.9345703125 + }, + "step_152300": { + "perplexity": 2.58102984422026, + "loss": 0.9481884837150574, + "unpadded_tokens_per_sec": 10035.6162109375 + }, + "step_152400": { + "perplexity": 2.57858308086245, + "loss": 0.9472400546073914, + "unpadded_tokens_per_sec": 10029.9423828125 + }, + "step_152500": { + "perplexity": 2.5824118634083315, + "loss": 0.9487237930297852, + "unpadded_tokens_per_sec": 10030.7158203125 + }, + "step_152600": { + "perplexity": 2.5806157364841065, + "loss": 0.948028028011322, + "unpadded_tokens_per_sec": 10036.75390625 + }, + "step_152700": { + "perplexity": 2.581298772939263, + "loss": 0.9482926726341248, + "unpadded_tokens_per_sec": 10039.42578125 + }, + "step_152800": { + "perplexity": 2.5762498094605855, + "loss": 0.9463347792625427, + "unpadded_tokens_per_sec": 10038.1162109375 + }, + "step_152900": { + "perplexity": 2.582801473786229, + "loss": 0.9488746523857117, + "unpadded_tokens_per_sec": 10032.2177734375 + }, + "step_153000": { + "perplexity": 2.5821046498948266, + "loss": 0.9486048221588135, + "unpadded_tokens_per_sec": 10038.5556640625 + }, + "step_153100": { + "perplexity": 
2.575836161583126, + "loss": 0.9461742043495178, + "unpadded_tokens_per_sec": 10039.1123046875 + }, + "step_153200": { + "perplexity": 2.5766500086622597, + "loss": 0.9464901089668274, + "unpadded_tokens_per_sec": 10039.8671875 + }, + "step_153300": { + "perplexity": 2.578602907661858, + "loss": 0.9472477436065674, + "unpadded_tokens_per_sec": 10038.7802734375 + }, + "step_153400": { + "perplexity": 2.576108541395562, + "loss": 0.9462799429893494, + "unpadded_tokens_per_sec": 10039.92578125 + }, + "step_153500": { + "perplexity": 2.5808381649947996, + "loss": 0.9481142163276672, + "unpadded_tokens_per_sec": 10038.205078125 + }, + "step_153600": { + "perplexity": 2.579627499739983, + "loss": 0.9476450085639954, + "unpadded_tokens_per_sec": 10033.43359375 + }, + "step_153700": { + "perplexity": 2.57587331654655, + "loss": 0.9461886286735535, + "unpadded_tokens_per_sec": 9407.005859375 + }, + "step_153800": { + "perplexity": 2.5780885360818244, + "loss": 0.9470482468605042, + "unpadded_tokens_per_sec": 10029.15234375 + }, + "step_153900": { + "perplexity": 2.5762433600975587, + "loss": 0.9463322758674622, + "unpadded_tokens_per_sec": 10036.046875 + }, + "step_154000": { + "perplexity": 2.5820357011825625, + "loss": 0.9485781192779541, + "unpadded_tokens_per_sec": 10033.8271484375 + }, + "step_154100": { + "perplexity": 2.578152922961365, + "loss": 0.947073221206665, + "unpadded_tokens_per_sec": 9275.544921875 + }, + "step_154200": { + "perplexity": 2.579701612054914, + "loss": 0.9476737380027771, + "unpadded_tokens_per_sec": 6261.025390625 + }, + "step_154300": { + "perplexity": 2.578527136296946, + "loss": 0.9472183585166931, + "unpadded_tokens_per_sec": 9218.732421875 + }, + "step_154400": { + "perplexity": 2.57833133987591, + "loss": 0.9471424221992493, + "unpadded_tokens_per_sec": 10035.7470703125 + }, + "step_154500": { + "perplexity": 2.576081517083236, + "loss": 0.9462694525718689, + "unpadded_tokens_per_sec": 10036.310546875 + }, + "step_154600": { + 
"perplexity": 2.5807481760476105, + "loss": 0.9480793476104736, + "unpadded_tokens_per_sec": 10042.9306640625 + }, + "step_154700": { + "perplexity": 2.5754084575029075, + "loss": 0.9460081458091736, + "unpadded_tokens_per_sec": 8887.7939453125 + }, + "step_154800": { + "perplexity": 2.5789920970997025, + "loss": 0.9473986625671387, + "unpadded_tokens_per_sec": 10033.6044921875 + }, + "step_154900": { + "perplexity": 2.578525599375464, + "loss": 0.9472177624702454, + "unpadded_tokens_per_sec": 10030.8115234375 + }, + "step_155000": { + "perplexity": 2.5761156046148326, + "loss": 0.946282684803009, + "unpadded_tokens_per_sec": 10032.51171875 + }, + "step_155100": { + "perplexity": 2.5753488977447883, + "loss": 0.9459850192070007, + "unpadded_tokens_per_sec": 9979.2900390625 + }, + "step_155200": { + "perplexity": 2.575220726132912, + "loss": 0.9459352493286133, + "unpadded_tokens_per_sec": 10043.8974609375 + }, + "step_155300": { + "perplexity": 2.5773000425087838, + "loss": 0.9467423558235168, + "unpadded_tokens_per_sec": 10038.4931640625 + }, + "step_155400": { + "perplexity": 2.577942096494007, + "loss": 0.9469914436340332, + "unpadded_tokens_per_sec": 10038.0673828125 + }, + "step_155500": { + "perplexity": 2.5782492757823046, + "loss": 0.9471105933189392, + "unpadded_tokens_per_sec": 10036.2255859375 + }, + "step_155600": { + "perplexity": 2.575775824292649, + "loss": 0.9461507797241211, + "unpadded_tokens_per_sec": 10043.486328125 + }, + "step_155700": { + "perplexity": 2.5759318136701053, + "loss": 0.9462113380432129, + "unpadded_tokens_per_sec": 10041.3740234375 + }, + "step_155800": { + "perplexity": 2.5801743203568117, + "loss": 0.9478569626808167, + "unpadded_tokens_per_sec": 10037.6650390625 + }, + "step_155900": { + "perplexity": 2.5785783163054665, + "loss": 0.9472382068634033, + "unpadded_tokens_per_sec": 10037.154296875 + }, + "step_156000": { + "perplexity": 2.57924682356578, + "loss": 0.9474974274635315, + "unpadded_tokens_per_sec": 
10034.4931640625 + }, + "step_156100": { + "perplexity": 2.5781363266667543, + "loss": 0.9470667839050293, + "unpadded_tokens_per_sec": 10034.615234375 + }, + "step_156200": { + "perplexity": 2.57876045160245, + "loss": 0.9473088383674622, + "unpadded_tokens_per_sec": 10036.3173828125 + }, + "step_156300": { + "perplexity": 2.577769852389486, + "loss": 0.94692462682724, + "unpadded_tokens_per_sec": 10034.3193359375 + }, + "step_156400": { + "perplexity": 2.5752764454630204, + "loss": 0.9459568858146667, + "unpadded_tokens_per_sec": 10040.720703125 + }, + "step_156500": { + "perplexity": 2.5765784412324337, + "loss": 0.9464623332023621, + "unpadded_tokens_per_sec": 10036.9287109375 + }, + "step_156600": { + "perplexity": 2.5756018827120015, + "loss": 0.9460832476615906, + "unpadded_tokens_per_sec": 10031.4248046875 + }, + "step_156700": { + "perplexity": 2.576161669563338, + "loss": 0.9463005661964417, + "unpadded_tokens_per_sec": 10035.0380859375 + }, + "step_156800": { + "perplexity": 2.57421077286473, + "loss": 0.9455429911613464, + "unpadded_tokens_per_sec": 10034.087890625 + }, + "step_156900": { + "perplexity": 2.5792317575708927, + "loss": 0.9474915862083435, + "unpadded_tokens_per_sec": 10043.9296875 + }, + "step_157000": { + "perplexity": 2.576732021852173, + "loss": 0.9465219378471375, + "unpadded_tokens_per_sec": 10034.9482421875 + }, + "step_157100": { + "perplexity": 2.5769602595623264, + "loss": 0.9466105103492737, + "unpadded_tokens_per_sec": 10037.41015625 + }, + "step_157200": { + "perplexity": 2.575443457178555, + "loss": 0.9460217356681824, + "unpadded_tokens_per_sec": 10046.501953125 + }, + "step_157300": { + "perplexity": 2.580449927425447, + "loss": 0.9479637742042542, + "unpadded_tokens_per_sec": 10037.0732421875 + }, + "step_157400": { + "perplexity": 2.5762276974259968, + "loss": 0.9463261961936951, + "unpadded_tokens_per_sec": 10035.115234375 + }, + "step_157500": { + "perplexity": 2.573887505790536, + "loss": 0.9454174041748047, + 
"unpadded_tokens_per_sec": 10038.646484375 + }, + "step_157600": { + "perplexity": 2.5739870744735103, + "loss": 0.9454560875892639, + "unpadded_tokens_per_sec": 10040.3642578125 + }, + "step_157700": { + "perplexity": 2.579935033631617, + "loss": 0.9477642178535461, + "unpadded_tokens_per_sec": 10040.6884765625 + }, + "step_157800": { + "perplexity": 2.580107729984259, + "loss": 0.9478311538696289, + "unpadded_tokens_per_sec": 10040.9814453125 + }, + "step_157900": { + "perplexity": 2.577471333445316, + "loss": 0.9468088150024414, + "unpadded_tokens_per_sec": 10039.5634765625 + }, + "step_158000": { + "perplexity": 2.5807950929704484, + "loss": 0.9480975270271301, + "unpadded_tokens_per_sec": 10037.2470703125 + }, + "step_158100": { + "perplexity": 2.574937696692207, + "loss": 0.9458253383636475, + "unpadded_tokens_per_sec": 10042.40234375 + }, + "step_158200": { + "perplexity": 2.5769151019105374, + "loss": 0.9465929865837097, + "unpadded_tokens_per_sec": 10040.1572265625 + }, + "step_158300": { + "perplexity": 2.575781965428091, + "loss": 0.9461531639099121, + "unpadded_tokens_per_sec": 10043.0537109375 + }, + "step_158400": { + "perplexity": 2.5788720446460065, + "loss": 0.9473521113395691, + "unpadded_tokens_per_sec": 10045.3037109375 + }, + "step_158500": { + "perplexity": 2.577490383544358, + "loss": 0.9468162059783936, + "unpadded_tokens_per_sec": 10048.4345703125 + }, + "step_158600": { + "perplexity": 2.5779388696922454, + "loss": 0.9469901919364929, + "unpadded_tokens_per_sec": 10041.595703125 + }, + "step_158700": { + "perplexity": 2.570930584750972, + "loss": 0.9442679286003113, + "unpadded_tokens_per_sec": 10044.0595703125 + }, + "step_158800": { + "perplexity": 2.5786974328711976, + "loss": 0.9472844004631042, + "unpadded_tokens_per_sec": 10039.859375 + }, + "step_158900": { + "perplexity": 2.5739586916367294, + "loss": 0.9454450607299805, + "unpadded_tokens_per_sec": 10043.0419921875 + }, + "step_159000": { + "perplexity": 2.576289273771719, + 
"loss": 0.94635009765625, + "unpadded_tokens_per_sec": 10040.921875 + }, + "step_159100": { + "perplexity": 2.575619383804685, + "loss": 0.946090042591095, + "unpadded_tokens_per_sec": 10046.4306640625 + }, + "step_159200": { + "perplexity": 2.57699312991535, + "loss": 0.9466232657432556, + "unpadded_tokens_per_sec": 10043.4013671875 + }, + "step_159300": { + "perplexity": 2.5737281118653157, + "loss": 0.9453554749488831, + "unpadded_tokens_per_sec": 10039.7880859375 + }, + "step_159400": { + "perplexity": 2.577669676455279, + "loss": 0.9468857645988464, + "unpadded_tokens_per_sec": 10038.7197265625 + }, + "step_159500": { + "perplexity": 2.575220265647603, + "loss": 0.945935070514679, + "unpadded_tokens_per_sec": 10044.796875 + }, + "step_159600": { + "perplexity": 2.5770373673139493, + "loss": 0.9466404318809509, + "unpadded_tokens_per_sec": 10041.7724609375 + }, + "step_159700": { + "perplexity": 2.5711507992056295, + "loss": 0.9443535804748535, + "unpadded_tokens_per_sec": 10042.20703125 + }, + "step_159800": { + "perplexity": 2.5716984288070557, + "loss": 0.944566547870636, + "unpadded_tokens_per_sec": 10044.5712890625 + }, + "step_159900": { + "perplexity": 2.575687393565728, + "loss": 0.9461164474487305, + "unpadded_tokens_per_sec": 10038.6474609375 + }, + "step_160000": { + "perplexity": 2.576689018355907, + "loss": 0.9465052485466003, + "unpadded_tokens_per_sec": 10037.4658203125 + }, + "step_160100": { + "perplexity": 2.573030208523836, + "loss": 0.945084273815155, + "unpadded_tokens_per_sec": 9968.2900390625 + }, + "step_160200": { + "perplexity": 2.577615595360549, + "loss": 0.9468647837638855, + "unpadded_tokens_per_sec": 10045.9560546875 + }, + "step_160300": { + "perplexity": 2.575734064959937, + "loss": 0.9461345672607422, + "unpadded_tokens_per_sec": 10032.9912109375 + }, + "step_160400": { + "perplexity": 2.574570142526665, + "loss": 0.9456825852394104, + "unpadded_tokens_per_sec": 9389.3837890625 + }, + "step_160500": { + "perplexity": 
2.576087812494302, + "loss": 0.9462718963623047, + "unpadded_tokens_per_sec": 10035.1318359375 + }, + "step_160600": { + "perplexity": 2.572820721074728, + "loss": 0.9450028538703918, + "unpadded_tokens_per_sec": 10042.1162109375 + }, + "step_160700": { + "perplexity": 2.5760624773970537, + "loss": 0.9462620615959167, + "unpadded_tokens_per_sec": 10037.9716796875 + }, + "step_160800": { + "perplexity": 2.5753789844607518, + "loss": 0.9459967017173767, + "unpadded_tokens_per_sec": 10035.8701171875 + }, + "step_160900": { + "perplexity": 2.576263322464042, + "loss": 0.946340024471283, + "unpadded_tokens_per_sec": 10035.4501953125 + }, + "step_161000": { + "perplexity": 2.5732473818934736, + "loss": 0.945168673992157, + "unpadded_tokens_per_sec": 10037.8271484375 + }, + "step_161100": { + "perplexity": 2.5780448952925994, + "loss": 0.9470313191413879, + "unpadded_tokens_per_sec": 10034.296875 + }, + "step_161200": { + "perplexity": 2.573723049467349, + "loss": 0.9453535079956055, + "unpadded_tokens_per_sec": 8629.7939453125 + }, + "step_161300": { + "perplexity": 2.574301607939188, + "loss": 0.9455782771110535, + "unpadded_tokens_per_sec": 5833.98681640625 + }, + "step_161400": { + "perplexity": 2.575200004375541, + "loss": 0.9459272027015686, + "unpadded_tokens_per_sec": 10035.2705078125 + }, + "step_161500": { + "perplexity": 2.5756424117393952, + "loss": 0.9460989832878113, + "unpadded_tokens_per_sec": 10037.96875 + }, + "step_161600": { + "perplexity": 2.57464902029308, + "loss": 0.945713222026825, + "unpadded_tokens_per_sec": 10035.8427734375 + }, + "step_161700": { + "perplexity": 2.5722549141787105, + "loss": 0.9447829127311707, + "unpadded_tokens_per_sec": 8829.3583984375 + }, + "step_161800": { + "perplexity": 2.5745491190940872, + "loss": 0.9456744194030762, + "unpadded_tokens_per_sec": 10045.8095703125 + }, + "step_161900": { + "perplexity": 2.5748844402913393, + "loss": 0.9458046555519104, + "unpadded_tokens_per_sec": 10050.3623046875 + }, + "step_162000": 
{ + "perplexity": 2.5793725819357283, + "loss": 0.9475461840629578, + "unpadded_tokens_per_sec": 10044.6201171875 + }, + "step_162100": { + "perplexity": 2.5689322005578394, + "loss": 0.9434903264045715, + "unpadded_tokens_per_sec": 10036.4296875 + }, + "step_162200": { + "perplexity": 2.5770779189297697, + "loss": 0.9466561675071716, + "unpadded_tokens_per_sec": 10036.96484375 + }, + "step_162300": { + "perplexity": 2.5746755691899286, + "loss": 0.9457235336303711, + "unpadded_tokens_per_sec": 10035.2255859375 + }, + "step_162400": { + "perplexity": 2.5792966342161368, + "loss": 0.9475167393684387, + "unpadded_tokens_per_sec": 10039.6015625 + }, + "step_162500": { + "perplexity": 2.577952391555199, + "loss": 0.9469954371452332, + "unpadded_tokens_per_sec": 10034.87890625 + }, + "step_162600": { + "perplexity": 2.577602843449552, + "loss": 0.9468598365783691, + "unpadded_tokens_per_sec": 10038.9130859375 + }, + "step_162700": { + "perplexity": 2.5751998508816643, + "loss": 0.9459271430969238, + "unpadded_tokens_per_sec": 10036.46484375 + }, + "step_162800": { + "perplexity": 2.57517636642628, + "loss": 0.9459180235862732, + "unpadded_tokens_per_sec": 10034.3994140625 + }, + "step_162900": { + "perplexity": 2.572206619354855, + "loss": 0.9447641372680664, + "unpadded_tokens_per_sec": 10036.580078125 + }, + "step_163000": { + "perplexity": 2.576000138764051, + "loss": 0.9462378621101379, + "unpadded_tokens_per_sec": 10032.6318359375 + }, + "step_163100": { + "perplexity": 2.5786669999865732, + "loss": 0.9472725987434387, + "unpadded_tokens_per_sec": 10038.5986328125 + }, + "step_163200": { + "perplexity": 2.5753558053780825, + "loss": 0.9459877014160156, + "unpadded_tokens_per_sec": 10038.2890625 + }, + "step_163300": { + "perplexity": 2.5746006805191954, + "loss": 0.9456944465637207, + "unpadded_tokens_per_sec": 10039.3798828125 + }, + "step_163400": { + "perplexity": 2.5763591439763447, + "loss": 0.9463772177696228, + "unpadded_tokens_per_sec": 10028.7451171875 + 
}, + "step_163500": { + "perplexity": 2.5744335700076286, + "loss": 0.9456295371055603, + "unpadded_tokens_per_sec": 10035.015625 + }, + "step_163600": { + "perplexity": 2.575695990864276, + "loss": 0.9461197853088379, + "unpadded_tokens_per_sec": 10033.1181640625 + }, + "step_163700": { + "perplexity": 2.57641442723924, + "loss": 0.9463986754417419, + "unpadded_tokens_per_sec": 10045.8447265625 + }, + "step_163800": { + "perplexity": 2.5765773662003593, + "loss": 0.9464619159698486, + "unpadded_tokens_per_sec": 10039.046875 + }, + "step_163900": { + "perplexity": 2.5816923701641685, + "loss": 0.9484451413154602, + "unpadded_tokens_per_sec": 10037.2314453125 + }, + "step_164000": { + "perplexity": 2.573414262020266, + "loss": 0.9452335238456726, + "unpadded_tokens_per_sec": 10036.73828125 + }, + "step_164100": { + "perplexity": 2.5760762965101187, + "loss": 0.9462674260139465, + "unpadded_tokens_per_sec": 10032.9140625 + }, + "step_164200": { + "perplexity": 2.575993690026048, + "loss": 0.9462353587150574, + "unpadded_tokens_per_sec": 10039.115234375 + }, + "step_164300": { + "perplexity": 2.5709903488132575, + "loss": 0.9442911744117737, + "unpadded_tokens_per_sec": 10036.9091796875 + }, + "step_164400": { + "perplexity": 2.5736698181885416, + "loss": 0.9453328251838684, + "unpadded_tokens_per_sec": 10030.5380859375 + }, + "step_164500": { + "perplexity": 2.576891755407688, + "loss": 0.9465839266777039, + "unpadded_tokens_per_sec": 10040.2998046875 + }, + "step_164600": { + "perplexity": 2.578586462166295, + "loss": 0.9472413659095764, + "unpadded_tokens_per_sec": 10034.4755859375 + }, + "step_164700": { + "perplexity": 2.5759669739977293, + "loss": 0.9462249875068665, + "unpadded_tokens_per_sec": 10038.2705078125 + }, + "step_164800": { + "perplexity": 2.573283119098225, + "loss": 0.9451825618743896, + "unpadded_tokens_per_sec": 10041.2587890625 + }, + "step_164900": { + "perplexity": 2.577571962577064, + "loss": 0.9468478560447693, + "unpadded_tokens_per_sec": 
10043.5478515625 + }, + "step_165000": { + "perplexity": 2.5718693474525693, + "loss": 0.9446330070495605, + "unpadded_tokens_per_sec": 10039.6005859375 + }, + "step_165100": { + "perplexity": 2.576206353350405, + "loss": 0.9463179111480713, + "unpadded_tokens_per_sec": 9980.59375 + }, + "step_165200": { + "perplexity": 2.575316048364461, + "loss": 0.9459722638130188, + "unpadded_tokens_per_sec": 10041.4013671875 + }, + "step_165300": { + "perplexity": 2.5799064314334785, + "loss": 0.9477531313896179, + "unpadded_tokens_per_sec": 10036.7041015625 + }, + "step_165400": { + "perplexity": 2.572068792447396, + "loss": 0.9447105526924133, + "unpadded_tokens_per_sec": 10039.4443359375 + }, + "step_165500": { + "perplexity": 2.573858050154192, + "loss": 0.9454059600830078, + "unpadded_tokens_per_sec": 10034.423828125 + }, + "step_165600": { + "perplexity": 2.570094646493562, + "loss": 0.9439427256584167, + "unpadded_tokens_per_sec": 10039.1796875 + }, + "step_165700": { + "perplexity": 2.572173350114793, + "loss": 0.9447512030601501, + "unpadded_tokens_per_sec": 10038.4931640625 + }, + "step_165800": { + "perplexity": 2.5750489708253337, + "loss": 0.9458685517311096, + "unpadded_tokens_per_sec": 10044.1728515625 + }, + "step_165900": { + "perplexity": 2.574527175110352, + "loss": 0.9456658959388733, + "unpadded_tokens_per_sec": 10045.6904296875 + }, + "step_166000": { + "perplexity": 2.57643500520116, + "loss": 0.9464066624641418, + "unpadded_tokens_per_sec": 10039.2802734375 + }, + "step_166100": { + "perplexity": 2.5749020899851556, + "loss": 0.9458115100860596, + "unpadded_tokens_per_sec": 10047.6357421875 + }, + "step_166200": { + "perplexity": 2.5780132408161944, + "loss": 0.9470190405845642, + "unpadded_tokens_per_sec": 10050.451171875 + }, + "step_166300": { + "perplexity": 2.575020576279236, + "loss": 0.9458575248718262, + "unpadded_tokens_per_sec": 10045.09375 + }, + "step_166400": { + "perplexity": 2.571900160004276, + "loss": 0.9446449875831604, + 
"unpadded_tokens_per_sec": 10050.5322265625 + }, + "step_166500": { + "perplexity": 2.5763058581762546, + "loss": 0.9463565349578857, + "unpadded_tokens_per_sec": 10042.4853515625 + }, + "step_166600": { + "perplexity": 2.571716209948399, + "loss": 0.9445734620094299, + "unpadded_tokens_per_sec": 10046.5009765625 + }, + "step_166700": { + "perplexity": 2.576343327038823, + "loss": 0.9463710784912109, + "unpadded_tokens_per_sec": 10040.1767578125 + }, + "step_166800": { + "perplexity": 2.5764862972544713, + "loss": 0.9464265704154968, + "unpadded_tokens_per_sec": 10043.099609375 + }, + "step_166900": { + "perplexity": 2.572881295853541, + "loss": 0.9450263977050781, + "unpadded_tokens_per_sec": 10034.74609375 + }, + "step_167000": { + "perplexity": 2.574141114354145, + "loss": 0.9455159306526184, + "unpadded_tokens_per_sec": 10045.921875 + }, + "step_167100": { + "perplexity": 2.5710801507599106, + "loss": 0.9443261027336121, + "unpadded_tokens_per_sec": 10038.0244140625 + }, + "step_167200": { + "perplexity": 2.577036752900435, + "loss": 0.9466401934623718, + "unpadded_tokens_per_sec": 10034.11328125 + }, + "step_167300": { + "perplexity": 2.5728017054889216, + "loss": 0.9449954628944397, + "unpadded_tokens_per_sec": 10040.6708984375 + }, + "step_167400": { + "perplexity": 2.572569695990887, + "loss": 0.9449052810668945, + "unpadded_tokens_per_sec": 10036.54296875 + }, + "step_167500": { + "perplexity": 2.572886663307789, + "loss": 0.9450284838676453, + "unpadded_tokens_per_sec": 9345.4423828125 + }, + "step_167600": { + "perplexity": 2.5727213508555513, + "loss": 0.9449642300605774, + "unpadded_tokens_per_sec": 10027.4990234375 + }, + "step_167700": { + "perplexity": 2.574609120731775, + "loss": 0.9456977248191833, + "unpadded_tokens_per_sec": 10033.6015625 + }, + "step_167800": { + "perplexity": 2.5676720235080785, + "loss": 0.9429996609687805, + "unpadded_tokens_per_sec": 10029.4638671875 + }, + "step_167900": { + "perplexity": 2.5768636477276425, + "loss": 
0.94657301902771, + "unpadded_tokens_per_sec": 10032.0478515625 + }, + "step_168000": { + "perplexity": 2.5737936171248936, + "loss": 0.9453809261322021, + "unpadded_tokens_per_sec": 10036.0703125 + }, + "step_168100": { + "perplexity": 2.573673499855382, + "loss": 0.945334255695343, + "unpadded_tokens_per_sec": 10035.7724609375 + }, + "step_168200": { + "perplexity": 2.5708786371177603, + "loss": 0.9442477226257324, + "unpadded_tokens_per_sec": 10040.34765625 + }, + "step_168300": { + "perplexity": 2.571516179196442, + "loss": 0.944495677947998, + "unpadded_tokens_per_sec": 10040.6416015625 + }, + "step_168400": { + "perplexity": 2.5726457523181754, + "loss": 0.9449348449707031, + "unpadded_tokens_per_sec": 10036.7119140625 + }, + "step_168500": { + "perplexity": 2.569392215094917, + "loss": 0.9436693787574768, + "unpadded_tokens_per_sec": 9217.5029296875 + }, + "step_168600": { + "perplexity": 2.5751875714011536, + "loss": 0.9459223747253418, + "unpadded_tokens_per_sec": 7779.82568359375 + }, + "step_168700": { + "perplexity": 2.5737592535021387, + "loss": 0.9453675746917725, + "unpadded_tokens_per_sec": 6315.25390625 + }, + "step_168800": { + "perplexity": 2.570517637724189, + "loss": 0.9441072940826416, + "unpadded_tokens_per_sec": 10042.2841796875 + }, + "step_168900": { + "perplexity": 2.5734710159999725, + "loss": 0.9452555775642395, + "unpadded_tokens_per_sec": 10033.4970703125 + }, + "step_169000": { + "perplexity": 2.57122788638382, + "loss": 0.9443835616111755, + "unpadded_tokens_per_sec": 10034.2412109375 + }, + "step_169100": { + "perplexity": 2.573444479523935, + "loss": 0.9452452659606934, + "unpadded_tokens_per_sec": 8787.9580078125 + }, + "step_169200": { + "perplexity": 2.5764471370616917, + "loss": 0.9464113712310791, + "unpadded_tokens_per_sec": 10038.4541015625 + }, + "step_169300": { + "perplexity": 2.5739913702814814, + "loss": 0.9454577565193176, + "unpadded_tokens_per_sec": 10030.0244140625 + }, + "step_169400": { + "perplexity": 
2.5738572830848323, + "loss": 0.9454056620597839, + "unpadded_tokens_per_sec": 10037.4873046875 + }, + "step_169500": { + "perplexity": 2.5715603225761448, + "loss": 0.9445128440856934, + "unpadded_tokens_per_sec": 10034.9853515625 + }, + "step_169600": { + "perplexity": 2.568806032536184, + "loss": 0.9434412121772766, + "unpadded_tokens_per_sec": 10036.7255859375 + }, + "step_169700": { + "perplexity": 2.5729621155639624, + "loss": 0.9450578093528748, + "unpadded_tokens_per_sec": 10032.44140625 + }, + "step_169800": { + "perplexity": 2.5698621152333403, + "loss": 0.9438522458076477, + "unpadded_tokens_per_sec": 10033.7490234375 + }, + "step_169900": { + "perplexity": 2.573157810994692, + "loss": 0.9451338648796082, + "unpadded_tokens_per_sec": 10033.9716796875 + }, + "step_170000": { + "perplexity": 2.5727054029061898, + "loss": 0.9449580311775208, + "unpadded_tokens_per_sec": 10041.6416015625 + }, + "step_170100": { + "perplexity": 2.569039693251172, + "loss": 0.9435321688652039, + "unpadded_tokens_per_sec": 9971.1376953125 + }, + "step_170200": { + "perplexity": 2.5727658216211413, + "loss": 0.9449815154075623, + "unpadded_tokens_per_sec": 10041.6103515625 + }, + "step_170300": { + "perplexity": 2.5760816706296645, + "loss": 0.9462695121765137, + "unpadded_tokens_per_sec": 10043.2890625 + }, + "step_170400": { + "perplexity": 2.572681941198847, + "loss": 0.9449489116668701, + "unpadded_tokens_per_sec": 10042.1513671875 + }, + "step_170500": { + "perplexity": 2.5653076464812714, + "loss": 0.9420784115791321, + "unpadded_tokens_per_sec": 10033.7373046875 + }, + "step_170600": { + "perplexity": 2.5754712423474593, + "loss": 0.9460325241088867, + "unpadded_tokens_per_sec": 10031.33984375 + }, + "step_170700": { + "perplexity": 2.569847410406435, + "loss": 0.9438465237617493, + "unpadded_tokens_per_sec": 10034.4599609375 + }, + "step_170800": { + "perplexity": 2.5711019121133116, + "loss": 0.9443345665931702, + "unpadded_tokens_per_sec": 10037.7021484375 + }, + 
"step_170900": { + "perplexity": 2.571315091166602, + "loss": 0.9444174766540527, + "unpadded_tokens_per_sec": 10043.5771484375 + }, + "step_171000": { + "perplexity": 2.572826088402607, + "loss": 0.945004940032959, + "unpadded_tokens_per_sec": 10033.2080078125 + }, + "step_171100": { + "perplexity": 2.5697723558298806, + "loss": 0.9438173174858093, + "unpadded_tokens_per_sec": 10033.76171875 + }, + "step_171200": { + "perplexity": 2.5727259512439313, + "loss": 0.9449660181999207, + "unpadded_tokens_per_sec": 10036.2294921875 + }, + "step_171300": { + "perplexity": 2.569221001659559, + "loss": 0.9436027407646179, + "unpadded_tokens_per_sec": 10031.6728515625 + }, + "step_171400": { + "perplexity": 2.572932670517808, + "loss": 0.9450463652610779, + "unpadded_tokens_per_sec": 10037.1650390625 + }, + "step_171500": { + "perplexity": 2.576112840744116, + "loss": 0.9462816119194031, + "unpadded_tokens_per_sec": 10035.90234375 + }, + "step_171600": { + "perplexity": 2.5762946483355704, + "loss": 0.9463521838188171, + "unpadded_tokens_per_sec": 10039.0166015625 + }, + "step_171700": { + "perplexity": 2.5725026885496067, + "loss": 0.9448792338371277, + "unpadded_tokens_per_sec": 10036.77734375 + }, + "step_171800": { + "perplexity": 2.578971037558315, + "loss": 0.9473904967308044, + "unpadded_tokens_per_sec": 10040.4189453125 + }, + "step_171900": { + "perplexity": 2.5743625244720763, + "loss": 0.9456019401550293, + "unpadded_tokens_per_sec": 10032.0009765625 + }, + "step_172000": { + "perplexity": 2.572343840353272, + "loss": 0.9448174834251404, + "unpadded_tokens_per_sec": 10036.3642578125 + }, + "step_172100": { + "perplexity": 2.5744173045499776, + "loss": 0.9456232190132141, + "unpadded_tokens_per_sec": 10044.2578125 + }, + "step_172200": { + "perplexity": 2.573003983319166, + "loss": 0.9450740814208984, + "unpadded_tokens_per_sec": 10035.5419921875 + }, + "step_172300": { + "perplexity": 2.571623013608566, + "loss": 0.9445372223854065, + "unpadded_tokens_per_sec": 
10041.6396484375 + }, + "step_172400": { + "perplexity": 2.5754363958021846, + "loss": 0.9460189938545227, + "unpadded_tokens_per_sec": 10044.849609375 + }, + "step_172500": { + "perplexity": 2.575758168609812, + "loss": 0.9461439251899719, + "unpadded_tokens_per_sec": 10034.2421875 + }, + "step_172600": { + "perplexity": 2.5728187274986536, + "loss": 0.9450020790100098, + "unpadded_tokens_per_sec": 10042.3154296875 + }, + "step_172700": { + "perplexity": 2.572877155253629, + "loss": 0.9450247883796692, + "unpadded_tokens_per_sec": 10034.8544921875 + }, + "step_172800": { + "perplexity": 2.575579315689077, + "loss": 0.9460744857788086, + "unpadded_tokens_per_sec": 10039.0966796875 + }, + "step_172900": { + "perplexity": 2.5761701148932468, + "loss": 0.9463038444519043, + "unpadded_tokens_per_sec": 10027.873046875 + }, + "step_173000": { + "perplexity": 2.5742643222083195, + "loss": 0.945563793182373, + "unpadded_tokens_per_sec": 10037.0537109375 + }, + "step_173100": { + "perplexity": 2.5722852713892492, + "loss": 0.9447947144508362, + "unpadded_tokens_per_sec": 10044.8681640625 + }, + "step_173200": { + "perplexity": 2.5735027680967835, + "loss": 0.945267915725708, + "unpadded_tokens_per_sec": 10038.7802734375 + }, + "step_173300": { + "perplexity": 2.576247506114794, + "loss": 0.9463338851928711, + "unpadded_tokens_per_sec": 10039.0859375 + }, + "step_173400": { + "perplexity": 2.5729811323353604, + "loss": 0.9450652003288269, + "unpadded_tokens_per_sec": 10031.7607421875 + }, + "step_173500": { + "perplexity": 2.573030208523836, + "loss": 0.945084273815155, + "unpadded_tokens_per_sec": 10037.1572265625 + }, + "step_173600": { + "perplexity": 2.5717634225409642, + "loss": 0.9445918202400208, + "unpadded_tokens_per_sec": 10042.6318359375 + }, + "step_173700": { + "perplexity": 2.574486356729059, + "loss": 0.945650041103363, + "unpadded_tokens_per_sec": 10030.4169921875 + }, + "step_173800": { + "perplexity": 2.574982666211175, + "loss": 0.9458428025245667, + 
"unpadded_tokens_per_sec": 10039.06640625 + }, + "step_173900": { + "perplexity": 2.568376127852573, + "loss": 0.9432738423347473, + "unpadded_tokens_per_sec": 10039.61328125 + }, + "step_174000": { + "perplexity": 2.57018809383522, + "loss": 0.9439790844917297, + "unpadded_tokens_per_sec": 10034.1220703125 + }, + "step_174100": { + "perplexity": 2.5734624261280667, + "loss": 0.9452522397041321, + "unpadded_tokens_per_sec": 10036.5048828125 + }, + "step_174200": { + "perplexity": 2.572418356722007, + "loss": 0.9448464512825012, + "unpadded_tokens_per_sec": 10046.8837890625 + }, + "step_174300": { + "perplexity": 2.5735831471364095, + "loss": 0.9452991485595703, + "unpadded_tokens_per_sec": 10038.125 + }, + "step_174400": { + "perplexity": 2.572765514923574, + "loss": 0.9449813961982727, + "unpadded_tokens_per_sec": 10041.96875 + }, + "step_174500": { + "perplexity": 2.5706947601242813, + "loss": 0.944176197052002, + "unpadded_tokens_per_sec": 10034.08203125 + }, + "step_174600": { + "perplexity": 2.5715028443661865, + "loss": 0.9444904923439026, + "unpadded_tokens_per_sec": 10041.5146484375 + }, + "step_174700": { + "perplexity": 2.5719154897695056, + "loss": 0.9446509480476379, + "unpadded_tokens_per_sec": 10035.65625 + }, + "step_174800": { + "perplexity": 2.568346735288583, + "loss": 0.9432623982429504, + "unpadded_tokens_per_sec": 10037.400390625 + }, + "step_174900": { + "perplexity": 2.572121990612147, + "loss": 0.9447312355041504, + "unpadded_tokens_per_sec": 9329.8828125 + }, + "step_175000": { + "perplexity": 2.570439652578723, + "loss": 0.9440769553184509, + "unpadded_tokens_per_sec": 10040.6044921875 + }, + "step_175100": { + "perplexity": 2.5704937363279643, + "loss": 0.9440979957580566, + "unpadded_tokens_per_sec": 9969.7158203125 + }, + "step_175200": { + "perplexity": 2.570138305894245, + "loss": 0.9439597129821777, + "unpadded_tokens_per_sec": 10035.0498046875 + }, + "step_175300": { + "perplexity": 2.570851207964966, + "loss": 0.9442370533943176, + 
"unpadded_tokens_per_sec": 10037.59375 + }, + "step_175400": { + "perplexity": 2.568289481989073, + "loss": 0.9432401061058044, + "unpadded_tokens_per_sec": 10041.5185546875 + }, + "step_175500": { + "perplexity": 2.5706032862190358, + "loss": 0.944140613079071, + "unpadded_tokens_per_sec": 10040.3076171875 + }, + "step_175600": { + "perplexity": 2.5703327141238734, + "loss": 0.9440353512763977, + "unpadded_tokens_per_sec": 10040.9658203125 + }, + "step_175700": { + "perplexity": 2.5709158738102524, + "loss": 0.9442622065544128, + "unpadded_tokens_per_sec": 10038.2919921875 + }, + "step_175800": { + "perplexity": 2.5717177428111677, + "loss": 0.9445740580558777, + "unpadded_tokens_per_sec": 10029.85546875 + }, + "step_175900": { + "perplexity": 2.5717283195891714, + "loss": 0.9445781707763672, + "unpadded_tokens_per_sec": 10039.5751953125 + }, + "step_176000": { + "perplexity": 2.573993825032112, + "loss": 0.945458710193634, + "unpadded_tokens_per_sec": 10034.5224609375 + }, + "step_176100": { + "perplexity": 2.5728231747089407, + "loss": 0.9450038075447083, + "unpadded_tokens_per_sec": 10038.533203125 + }, + "step_176200": { + "perplexity": 2.574288872423061, + "loss": 0.9455733299255371, + "unpadded_tokens_per_sec": 8531.8955078125 + }, + "step_176300": { + "perplexity": 2.572959661797246, + "loss": 0.9450568556785583, + "unpadded_tokens_per_sec": 5946.361328125 + }, + "step_176400": { + "perplexity": 2.5690657249217748, + "loss": 0.9435423016548157, + "unpadded_tokens_per_sec": 8570.6044921875 + }, + "step_176500": { + "perplexity": 2.5723006034499285, + "loss": 0.9448006749153137, + "unpadded_tokens_per_sec": 10039.3095703125 + }, + "step_176600": { + "perplexity": 2.5757483428909103, + "loss": 0.9461401104927063, + "unpadded_tokens_per_sec": 10033.275390625 + }, + "step_176700": { + "perplexity": 2.572977144936164, + "loss": 0.9450636506080627, + "unpadded_tokens_per_sec": 10031.3671875 + }, + "step_176800": { + "perplexity": 2.5709554096543137, + "loss": 
0.9442775845527649, + "unpadded_tokens_per_sec": 8734.3388671875 + }, + "step_176900": { + "perplexity": 2.5667366287517606, + "loss": 0.9426352977752686, + "unpadded_tokens_per_sec": 10041.40234375 + }, + "step_177000": { + "perplexity": 2.573438957517074, + "loss": 0.9452431201934814, + "unpadded_tokens_per_sec": 10032.16015625 + }, + "step_177100": { + "perplexity": 2.566840051686636, + "loss": 0.9426755905151367, + "unpadded_tokens_per_sec": 10037.4580078125 + }, + "step_177200": { + "perplexity": 2.5708092220074095, + "loss": 0.9442207217216492, + "unpadded_tokens_per_sec": 10035.0185546875 + }, + "step_177300": { + "perplexity": 2.5714009194996734, + "loss": 0.944450855255127, + "unpadded_tokens_per_sec": 10025.3896484375 + }, + "step_177400": { + "perplexity": 2.571228805926733, + "loss": 0.9443839192390442, + "unpadded_tokens_per_sec": 10033.91015625 + }, + "step_177500": { + "perplexity": 2.570628720847557, + "loss": 0.9441505074501038, + "unpadded_tokens_per_sec": 10033.22265625 + }, + "step_177600": { + "perplexity": 2.576412123746269, + "loss": 0.9463977813720703, + "unpadded_tokens_per_sec": 10035.5322265625 + }, + "step_177700": { + "perplexity": 2.5665516712012364, + "loss": 0.9425632357597351, + "unpadded_tokens_per_sec": 10027.587890625 + }, + "step_177800": { + "perplexity": 2.571643093455545, + "loss": 0.9445450305938721, + "unpadded_tokens_per_sec": 10036.0146484375 + }, + "step_177900": { + "perplexity": 2.5692687810054333, + "loss": 0.9436213374137878, + "unpadded_tokens_per_sec": 10037.2314453125 + }, + "step_178000": { + "perplexity": 2.571877165527754, + "loss": 0.9446360468864441, + "unpadded_tokens_per_sec": 10036.279296875 + }, + "step_178100": { + "perplexity": 2.56823314843743, + "loss": 0.9432181715965271, + "unpadded_tokens_per_sec": 10037.4775390625 + }, + "step_178200": { + "perplexity": 2.568281521738332, + "loss": 0.9432370066642761, + "unpadded_tokens_per_sec": 10044.7998046875 + }, + "step_178300": { + "perplexity": 
2.5719335790100004, + "loss": 0.9446579813957214, + "unpadded_tokens_per_sec": 10035.67578125 + }, + "step_178400": { + "perplexity": 2.5709137284717642, + "loss": 0.944261372089386, + "unpadded_tokens_per_sec": 10040.9619140625 + }, + "step_178500": { + "perplexity": 2.5740754468236062, + "loss": 0.9454904198646545, + "unpadded_tokens_per_sec": 10039.2607421875 + }, + "step_178600": { + "perplexity": 2.5743508627572416, + "loss": 0.9455974102020264, + "unpadded_tokens_per_sec": 10045.51171875 + }, + "step_178700": { + "perplexity": 2.566461874450959, + "loss": 0.942528247833252, + "unpadded_tokens_per_sec": 10037.8828125 + }, + "step_178800": { + "perplexity": 2.5739226382142637, + "loss": 0.9454310536384583, + "unpadded_tokens_per_sec": 10042.740234375 + }, + "step_178900": { + "perplexity": 2.5748101594277535, + "loss": 0.9457758069038391, + "unpadded_tokens_per_sec": 10034.9755859375 + }, + "step_179000": { + "perplexity": 2.5679110912061813, + "loss": 0.9430927634239197, + "unpadded_tokens_per_sec": 10036.7119140625 + }, + "step_179100": { + "perplexity": 2.573955776660404, + "loss": 0.9454439282417297, + "unpadded_tokens_per_sec": 10040.7763671875 + }, + "step_179200": { + "perplexity": 2.569055924732471, + "loss": 0.94353848695755, + "unpadded_tokens_per_sec": 10031.890625 + }, + "step_179300": { + "perplexity": 2.5768207956251343, + "loss": 0.9465563893318176, + "unpadded_tokens_per_sec": 10036.8408203125 + }, + "step_179400": { + "perplexity": 2.569133102235264, + "loss": 0.9435685276985168, + "unpadded_tokens_per_sec": 10034.7216796875 + }, + "step_179500": { + "perplexity": 2.571700114944493, + "loss": 0.9445672035217285, + "unpadded_tokens_per_sec": 10038.8115234375 + }, + "step_179600": { + "perplexity": 2.570246155452249, + "loss": 0.9440016746520996, + "unpadded_tokens_per_sec": 10032.6201171875 + }, + "step_179700": { + "perplexity": 2.5710412259815185, + "loss": 0.9443109631538391, + "unpadded_tokens_per_sec": 10042.5078125 + }, + "step_179800": { 
+ "perplexity": 2.569522087635385, + "loss": 0.9437199234962463, + "unpadded_tokens_per_sec": 10032.15234375 + }, + "step_179900": { + "perplexity": 2.5735961859577285, + "loss": 0.9453042149543762, + "unpadded_tokens_per_sec": 10033.8408203125 + }, + "step_180000": { + "perplexity": 2.571978955969751, + "loss": 0.944675624370575, + "unpadded_tokens_per_sec": 10037.5908203125 + }, + "step_180100": { + "perplexity": 2.5748253530664282, + "loss": 0.9457817077636719, + "unpadded_tokens_per_sec": 9964.892578125 + }, + "step_180200": { + "perplexity": 2.572781156546132, + "loss": 0.9449874758720398, + "unpadded_tokens_per_sec": 10046.5439453125 + }, + "step_180300": { + "perplexity": 2.571742575315307, + "loss": 0.9445837140083313, + "unpadded_tokens_per_sec": 10034.9775390625 + }, + "step_180400": { + "perplexity": 2.569917871799722, + "loss": 0.943873941898346, + "unpadded_tokens_per_sec": 10028.7177734375 + }, + "step_180500": { + "perplexity": 2.5741104283837033, + "loss": 0.9455040097236633, + "unpadded_tokens_per_sec": 10035.3974609375 + }, + "step_180600": { + "perplexity": 2.570895033454994, + "loss": 0.9442541003227234, + "unpadded_tokens_per_sec": 10039.18359375 + }, + "step_180700": { + "perplexity": 2.564303567913473, + "loss": 0.9416869282722473, + "unpadded_tokens_per_sec": 10033.5205078125 + }, + "step_180800": { + "perplexity": 2.5676867158872843, + "loss": 0.943005383014679, + "unpadded_tokens_per_sec": 10034.3505859375 + }, + "step_180900": { + "perplexity": 2.572503915214771, + "loss": 0.9448797106742859, + "unpadded_tokens_per_sec": 10037.8408203125 + }, + "step_181000": { + "perplexity": 2.5733222312001467, + "loss": 0.9451977610588074, + "unpadded_tokens_per_sec": 10037.5908203125 + }, + "step_181100": { + "perplexity": 2.5732390995220125, + "loss": 0.9451654553413391, + "unpadded_tokens_per_sec": 10053.5751953125 + }, + "step_181200": { + "perplexity": 2.5755588980768094, + "loss": 0.9460665583610535, + "unpadded_tokens_per_sec": 10042.4560546875 
+ }, + "step_181300": { + "perplexity": 2.5706671797095937, + "loss": 0.9441654682159424, + "unpadded_tokens_per_sec": 10036.4658203125 + }, + "step_181400": { + "perplexity": 2.5708242388039735, + "loss": 0.9442265629768372, + "unpadded_tokens_per_sec": 10031.2392578125 + }, + "step_181500": { + "perplexity": 2.5671125512949535, + "loss": 0.9427817463874817, + "unpadded_tokens_per_sec": 10031.2236328125 + }, + "step_181600": { + "perplexity": 2.5709125025648603, + "loss": 0.9442608952522278, + "unpadded_tokens_per_sec": 10030.6767578125 + }, + "step_181700": { + "perplexity": 2.5701222207656014, + "loss": 0.9439534544944763, + "unpadded_tokens_per_sec": 10035.5771484375 + }, + "step_181800": { + "perplexity": 2.5724642022271573, + "loss": 0.9448642730712891, + "unpadded_tokens_per_sec": 10040.2626953125 + }, + "step_181900": { + "perplexity": 2.5738114127527103, + "loss": 0.9453878402709961, + "unpadded_tokens_per_sec": 10037.3916015625 + }, + "step_182000": { + "perplexity": 2.56977786996906, + "loss": 0.9438194632530212, + "unpadded_tokens_per_sec": 10042.5302734375 + }, + "step_182100": { + "perplexity": 2.571889582519542, + "loss": 0.9446408748626709, + "unpadded_tokens_per_sec": 10034.6279296875 + }, + "step_182200": { + "perplexity": 2.567871908291452, + "loss": 0.9430775046348572, + "unpadded_tokens_per_sec": 10035.6455078125 + }, + "step_182300": { + "perplexity": 2.571363675781865, + "loss": 0.9444363713264465, + "unpadded_tokens_per_sec": 10035.6943359375 + } +} diff --git a/bionemo-recipes/recipes/opengenome2_llama_native_te/baseline_bf16_1node.json b/bionemo-recipes/recipes/opengenome2_llama_native_te/baseline_bf16_1node.json new file mode 100644 index 0000000000..87a87361b8 --- /dev/null +++ b/bionemo-recipes/recipes/opengenome2_llama_native_te/baseline_bf16_1node.json @@ -0,0 +1,47 @@ +{ + "step_5100": { + "perplexity": 3.3374, + "loss": 1.2052, + "unpadded_tokens_per_sec": 10538.7 + }, + "step_5200": { + "perplexity": 3.3256, + "loss": 1.2017, + 
"unpadded_tokens_per_sec": 10527.2 + }, + "step_5300": { + "perplexity": 3.3493, + "loss": 1.2087, + "unpadded_tokens_per_sec": 10520.4 + }, + "step_5400": { + "perplexity": 3.3755, + "loss": 1.2166, + "unpadded_tokens_per_sec": 10301.1 + }, + "step_5500": { + "perplexity": 3.3248, + "loss": 1.2014, + "unpadded_tokens_per_sec": 10754.5 + }, + "step_5600": { + "perplexity": 3.3476, + "loss": 1.2083, + "unpadded_tokens_per_sec": 10614.0 + }, + "step_5700": { + "perplexity": 3.3143, + "loss": 1.1982, + "unpadded_tokens_per_sec": 10669.2 + }, + "step_5800": { + "perplexity": 3.3342, + "loss": 1.2042, + "unpadded_tokens_per_sec": 10588.3 + }, + "step_5900": { + "perplexity": 3.3293, + "loss": 1.2028, + "unpadded_tokens_per_sec": 10685.3 + } +} diff --git a/bionemo-recipes/recipes/opengenome2_llama_native_te/checkpoint.py b/bionemo-recipes/recipes/opengenome2_llama_native_te/checkpoint.py index 58f897e689..b40386fa8f 100644 --- a/bionemo-recipes/recipes/opengenome2_llama_native_te/checkpoint.py +++ b/bionemo-recipes/recipes/opengenome2_llama_native_te/checkpoint.py @@ -160,9 +160,12 @@ def state_dict(self): def load_state_dict(self, state_dict: dict): """Load the state dict for the model, optimizer, scheduler, and step.""" - # Use strict=False to handle checkpoints saved without TransformerEngine - # _extra_state keys (FP8 metadata). These keys are registered by newer TE - # versions even when FP8 is disabled, and are safe to skip. + # Save optimizer param group hyperparameters before set_state_dict, + # which can strip them in certain PyTorch versions. 
+ saved_hyperparams = [ + {k: v for k, v in group.items() if k != "params"} for group in self.optimizer.param_groups + ] + incompatible = set_state_dict( self.model, self.optimizer, @@ -175,6 +178,12 @@ def load_state_dict(self, state_dict: dict): logger.warning(f"Missing keys when loading checkpoint: {incompatible.missing_keys}") if incompatible.unexpected_keys: logger.warning(f"Unexpected keys when loading checkpoint: {incompatible.unexpected_keys}") + + for group, saved in zip(self.optimizer.param_groups, saved_hyperparams): + for key, value in saved.items(): + if key not in group: + group[key] = value + self.scheduler.load_state_dict(state_dict["scheduler"]) self.step = state_dict["step"] self.epoch = state_dict["epoch"] @@ -200,11 +209,29 @@ def load_checkpoint_fsdp2( dataloader: The dataloader to load. process_group: The process group to use for checkpointing. """ - checkpoint_path, _ = get_latest_checkpoint(ckpt_path) + checkpoint_path, step = get_latest_checkpoint(ckpt_path) if not checkpoint_path: logger.info("No FSDP2 checkpoint found, starting from scratch") return CheckpointOutput(model, optimizer, scheduler, dataloader, 0, 0) + # Validate checkpoint before attempting distributed load + logger.info(f"Found checkpoint at {checkpoint_path} (step {step})") + resolved = checkpoint_path.resolve() + if checkpoint_path.is_symlink(): + logger.info(f"Checkpoint is a symlink -> {resolved}") + if not resolved.is_dir(): + raise FileNotFoundError( + f"Checkpoint path {checkpoint_path} does not resolve to a directory " + f"(resolved: {resolved}). If this is a symlink, ensure the target exists." + ) + metadata_file = resolved / ".metadata" + if not metadata_file.exists(): + raise FileNotFoundError( + f"Checkpoint at {resolved} is missing .metadata file. 
" + f"Contents: {list(resolved.iterdir()) if resolved.is_dir() else 'N/A'}" + ) + logger.info(f"Checkpoint validated: {resolved} has .metadata and {len(list(resolved.iterdir()))} files") + app_state = AppState( model=model, optimizer=optimizer, diff --git a/bionemo-recipes/recipes/opengenome2_llama_native_te/claude_agent_prompt.txt b/bionemo-recipes/recipes/opengenome2_llama_native_te/claude_agent_prompt.txt new file mode 100644 index 0000000000..836859f9aa --- /dev/null +++ b/bionemo-recipes/recipes/opengenome2_llama_native_te/claude_agent_prompt.txt @@ -0,0 +1,72 @@ +You are an autonomous FP8 Precision Agent running on a Lepton GPU cluster. + +## Environment + +- Working directory: {code_path} +- Nodes: {num_nodes} (each with {gpus_per_node}x H100 GPUs) +- MASTER_ADDR and MASTER_PORT are set by the Lepton environment +- NFS is mounted at /data (shared across all nodes) +- You are running on rank 0 node only; torchrun handles multi-node coordination +- When running multi-node (NNODES > 1): **`$TORCHRUN_PREFIX` is pre-set in your environment** with the correct multi-node torchrun flags. ALWAYS use `$TORCHRUN_PREFIX train_fsdp2.py ...` to launch training. +- When running single-node (NNODES = 1): Use `torchrun --nproc_per_node={gpus_per_node} train_fsdp2.py ...` directly. No multi-node coordination needed. + +## Your Mission + +Read and execute the FP8 Precision Agent guide at: + {code_path}/{guide_filename} + +This guide contains the complete specification for your agent loop. 
It tells you:
+- How to launch training (torchrun command template)
+- How to monitor metrics (WandB local files)
+- How to perform check-ins (perplexity vs BF16 baseline)
+- How to demote layers on failure (promotion strategies)
+- How to save/resume checkpoints (LKG recovery)
+- What artifacts to produce (history.json, state.json, report.md)
+
+## Run Variables (override guide defaults)
+
+Apply these values to the Run Variables section of the guide:
+
+```
+BASELINE_LOGFILE = {code_path}/{baseline_logfile}
+NUM_TRAIN_STEPS = {num_train_steps}
+CHECKIN_INTERVAL = {checkin_interval}
+TOLERANCE_PCT = {tolerance_pct}
+PROMOTION_STRATEGY = {promotion_strategy}
+WORKSPACE_ROOT = {workspace_root}
+CHECKPOINT_ROOT = {checkpoint_root}
+RESULTS_FOLDER = {workspace_root}/results/{promotion_strategy}
+WANDB_PROJECT = {wandb_project}
+NPROC_PER_NODE = {gpus_per_node}
+NNODES = {num_nodes}
+LAUNCH_DIR = {launch_dir}
+```
+
+All other Run Variables (LAYERS_PER_PROMOTION, NUM_LAYERS, etc.) use the guide defaults.
+
+**CRITICAL PATH DISTINCTION**: `CHECKPOINT_ROOT` and `WORKSPACE_ROOT` are DIFFERENT directories:
+- `CHECKPOINT_ROOT = {checkpoint_root}` — for model checkpoints (`checkpoint.ckpt_dir=$CHECKPOINT_ROOT/`)
+- `WORKSPACE_ROOT = {workspace_root}` — for agent artifacts (logs, history.json, state.json, report.md)
+Do NOT confuse them. The `checkpoint.ckpt_dir` CLI argument must ALWAYS use `$CHECKPOINT_ROOT/`, never `$WORKSPACE_ROOT`.
+
+{warm_start_section}
+
+## Instructions
+
+1. Read `{guide_filename}` thoroughly before starting.
+2. Also read the strategy document for your assigned strategy (if applicable):
+   - ends_in: `OG2_STRATEGY_ENDS_IN.md`
+   - tail_in: `OG2_STRATEGY_TAIL_IN.md`
+   - gradual: described in `OG2_FP8_1NODE_DEMO_GUIDE.md`
+   - research_guided: described in `OG2_FP8_AGENT_GUIDE.md`
+3. Execute Milestone 1 (Agent Loop) and Milestone 2 (Persistence & Recovery) as described.
+4. Produce all required artifacts under $WORKSPACE_ROOT/.
+5. 
At the end, produce the final report under $RESULTS_FOLDER/. +6. Exit cleanly. + +## Important Notes + +- Do NOT modify training scripts or config files (unless Milestone 3 is explicitly enabled). +- The baseline logfile is already extracted and available at the path above. +- If training fails for non-precision reasons (OOM, NCCL timeout, etc.), log the error and retry. +- All checkpoints, logs, and agent state go on NFS so they persist across container restarts. diff --git a/bionemo-recipes/recipes/opengenome2_llama_native_te/control_plane.py b/bionemo-recipes/recipes/opengenome2_llama_native_te/control_plane.py new file mode 100644 index 0000000000..e87ff3b1f4 --- /dev/null +++ b/bionemo-recipes/recipes/opengenome2_llama_native_te/control_plane.py @@ -0,0 +1,237 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-Apache2 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""NFS-based control plane for live hot-reload interventions during training. + +Polls a YAML control file on shared storage every N training steps. Only rank 0 +reads the file; updates are broadcast to all ranks via torch.distributed. A +monotonic version number prevents re-applying the same intervention. + +Tier 1 (hot-reload) changes — learning rate, grad clip, FP8 recipe knobs, +logging toggles — flow through this module. 
Tier 2 changes (checkpoint-and- +restart) are signaled via the ``request_checkpoint_and_stop`` field. +""" + +import logging +import os +import time +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any + +import torch.distributed as dist +import yaml + + +logger = logging.getLogger(__name__) + + +# Hard safety bounds for Tier 1 hot-reload parameters +_DEFAULT_BOUNDS: dict[str, tuple[float, float]] = { + "learning_rate": (0.0, 1.0), + "grad_clip_norm": (0.1, 100.0), + "logging_frequency": (1, 10_000), +} + + +@dataclass +class ControlPlaneConfig: + """Configuration for the control plane. + + Attributes: + enabled: Master switch for the control plane. + control_file: Path to the YAML control file on shared NFS. + poll_every_n_steps: How often (in optimizer steps) to check the file. + bounds: Per-parameter (min, max) safety bounds for Tier 1 interventions. + cooldown_steps: Minimum steps between successive interventions. + """ + + enabled: bool = False + control_file: str = "/data/agent/control.yaml" + poll_every_n_steps: int = 1 + bounds: dict[str, tuple[float, float]] = field(default_factory=lambda: dict(_DEFAULT_BOUNDS)) + cooldown_steps: int = 500 + + +class ControlPlane: + """Reads a shared YAML control file and distributes interventions to all ranks. + + The control file must contain a monotonically increasing ``version`` field. + When the version changes the new ``interventions`` block is consumed. + + Args: + config: ControlPlaneConfig instance. + rank: The global rank of this process. 
+ """ + + def __init__(self, config: ControlPlaneConfig, rank: int = 0): + """Initialize the control plane.""" + self._config = config + self._rank = rank + self._last_version: int = -1 + self._last_mtime: float = 0.0 + self._last_poll_step: int = -999_999 + self._last_intervention_step: int = -999_999 + self._pending: dict[str, Any] | None = None + + # ------------------------------------------------------------------ + # Public API called from the training loop + # ------------------------------------------------------------------ + + def poll(self, step: int) -> None: + """Check for a new control file version (respects poll frequency). + + Call once per optimizer step. Actual I/O only happens every + ``poll_every_n_steps`` steps on rank 0. + + Args: + step: Current global training step. + """ + if not self._config.enabled: + return + + if step - self._last_poll_step < self._config.poll_every_n_steps: + return + + self._last_poll_step = step + interventions = self._read_and_broadcast() + + if interventions is not None: + if step - self._last_intervention_step < self._config.cooldown_steps: + logger.warning( + "Control plane update at step %d ignored — cooldown (%d steps) not elapsed since last " + "intervention at step %d", + step, + self._config.cooldown_steps, + self._last_intervention_step, + ) + return + self._pending = interventions + self._last_intervention_step = step + + def has_update(self) -> bool: + """Return True if there is an unconsumed intervention.""" + return self._pending is not None + + def consume(self) -> dict[str, Any]: + """Return and clear the pending intervention dict. + + Raises: + RuntimeError: If there is no pending update. 
+ """ + if self._pending is None: + raise RuntimeError("No pending control-plane update to consume") + result = self._pending + self._pending = None + return result + + # ------------------------------------------------------------------ + # Internal helpers + # ------------------------------------------------------------------ + + def _read_and_broadcast(self) -> dict[str, Any] | None: + """Rank 0 reads the file; result is broadcast to all ranks.""" + payload: list[dict[str, Any] | None] = [None] + + if self._rank == 0: + payload[0] = self._try_read_file() + + if dist.is_initialized() and dist.get_world_size() > 1: + dist.broadcast_object_list(payload, src=0) + + return payload[0] + + def _try_read_file(self) -> dict[str, Any] | None: + """Read the control YAML if it changed on disk. Returns validated interventions or None.""" + path = Path(self._config.control_file) + if not path.exists(): + return None + + try: + st = path.stat() + except OSError: + return None + + if st.st_mtime == self._last_mtime: + return None + + self._last_mtime = st.st_mtime + + try: + with open(path) as f: + data = yaml.safe_load(f) + except Exception: + logger.exception("Failed to parse control file %s", path) + return None + + if not isinstance(data, dict): + return None + + version = data.get("version", 0) + if version <= self._last_version: + return None + + self._last_version = version + interventions = data.get("interventions", {}) + if not interventions: + return None + + validated = self._validate(interventions) + logger.info("Control plane v%d consumed: %s", version, validated) + return validated + + def _validate(self, raw: dict[str, Any]) -> dict[str, Any]: + """Clamp numeric parameters to configured safety bounds.""" + result: dict[str, Any] = {} + for key, value in raw.items(): + if key in self._config.bounds and isinstance(value, (int, float)): + lo, hi = self._config.bounds[key] + clamped = max(lo, min(hi, float(value))) + if clamped != float(value): + 
logger.warning("Clamped %s from %s to %s (bounds [%s, %s])", key, value, clamped, lo, hi) + result[key] = clamped + else: + result[key] = value + return result + + +def write_control_file( + path: str | os.PathLike, + version: int, + interventions: dict[str, Any], + agent_id: str = "manual", +) -> None: + """Write a control file atomically (for use by the agent or manual debugging). + + Args: + path: Destination path for the YAML control file. + version: Monotonically increasing version number. + interventions: Dict of parameter overrides. + agent_id: Identifier for the writing agent. + """ + path = Path(path) + path.parent.mkdir(parents=True, exist_ok=True) + + payload = { + "version": version, + "timestamp": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()), + "agent_id": agent_id, + "interventions": interventions, + } + + tmp = path.with_suffix(".tmp") + with open(tmp, "w") as f: + yaml.safe_dump(payload, f, default_flow_style=False) + tmp.rename(path) diff --git a/bionemo-recipes/recipes/opengenome2_llama_native_te/extract_baseline_metrics.py b/bionemo-recipes/recipes/opengenome2_llama_native_te/extract_baseline_metrics.py new file mode 100644 index 0000000000..db0624e987 --- /dev/null +++ b/bionemo-recipes/recipes/opengenome2_llama_native_te/extract_baseline_metrics.py @@ -0,0 +1,69 @@ +#!/usr/bin/env python3 + +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-Apache2 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +"""Extract BF16 baseline metrics from a WandB run into a JSON file for the FP8 precision agent.""" + +import json +import math + +import wandb + + +ENTITY = "clara-discovery" +PROJECT = "llama3-metagenome-7b" +RUN_ID = "8mfsb27t" + +OUTPUT_FILE = "baseline_bf16.json" + + +def main(): + """Extract BF16 baseline metrics from a WandB run and save to JSON.""" + api = wandb.Api() + run = api.run(f"{ENTITY}/{PROJECT}/{RUN_ID}") + + print(f"Run: {run.name} ({run.id})") + print(f"State: {run.state}") + + baseline = {} + for row in run.scan_history(keys=["train/loss", "train/unpadded_tokens_per_second_per_gpu", "train/global_step"]): + step = row.get("train/global_step") + loss = row.get("train/loss") + tps = row.get("train/unpadded_tokens_per_second_per_gpu") + + if step is None or loss is None: + continue + + step = int(step) + baseline[f"step_{step}"] = { + "perplexity": math.exp(loss), + "loss": loss, + "unpadded_tokens_per_sec": tps, + } + + # Sort by step number + baseline = dict(sorted(baseline.items(), key=lambda x: int(x[0].split("_")[1]))) + + with open(OUTPUT_FILE, "w") as f: + json.dump(baseline, f, indent=2) + + steps = sorted(int(k.split("_")[1]) for k in baseline) + print(f"Extracted {len(baseline)} steps: {steps[0]} .. {steps[-1]}") + print(f"Saved to {OUTPUT_FILE}") + + +if __name__ == "__main__": + main() diff --git a/bionemo-recipes/recipes/opengenome2_llama_native_te/fix_checkpoint_dirs.py b/bionemo-recipes/recipes/opengenome2_llama_native_te/fix_checkpoint_dirs.py new file mode 100644 index 0000000000..da3171a834 --- /dev/null +++ b/bionemo-recipes/recipes/opengenome2_llama_native_te/fix_checkpoint_dirs.py @@ -0,0 +1,158 @@ +#!/usr/bin/env python3 + +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: LicenseRef-Apache2 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Quick Lepton job to rename old checkpoint directories so current runs can save cleanly. + +Usage: + python fix_checkpoint_dirs.py +""" + +from leptonai.api.v1.types.affinity import LeptonResourceAffinity +from leptonai.api.v1.types.common import Metadata +from leptonai.api.v1.types.deployment import LeptonContainer +from leptonai.api.v1.types.job import LeptonJob, LeptonJobUserSpec +from leptonai.api.v2.client import APIClient + + +NODE_GROUP = "yo-bom-lepton-001" +RESOURCE_SHAPE = "gpu.8xh100-sxm" +JOB_NAME = "fix-checkpoint-dirs" + +SCRIPT = r"""#!/bin/bash +set -e + +echo "==========================================" +echo "Fixing checkpoint directories on NFS" +echo "==========================================" + +# --- fl2: move old run checkpoints, create fresh dir --- +FL2="/data/savithas/checkpoints/og2-7b-fp8-refactor-bf16-fl2-fp32mw" # pragma: allowlist secret +FL2_OLD="${FL2}-old-run" + +echo "" +echo "=== fl2 directory ===" +echo "Current contents:" +ls -la ${FL2}/train_fsdp2/ 2>/dev/null || echo "(directory empty or missing)" + +if [ -d "${FL2}/train_fsdp2/step_15000" ]; then + echo "Old run checkpoints detected (step_15000+). Moving..." + mv "${FL2}" "${FL2_OLD}" + echo "Moved to: ${FL2_OLD}" + mkdir -p "${FL2}/train_fsdp2" + echo "Created fresh: ${FL2}/train_fsdp2/" +else + echo "No old run checkpoints found (no step_15000). Checking what's there..." 
+ ls -la ${FL2}/train_fsdp2/ 2>/dev/null || echo "(empty)" +fi + +echo "" +echo "fl2 after fix:" +ls -la ${FL2}/train_fsdp2/ 2>/dev/null || echo "(empty - ready for new checkpoints)" + +# --- fl4: check if 5k checkpoint exists --- +FL4="/data/savithas/checkpoints/og2-7b-fp8-refactor-bf16-fl4-fp32mw" # pragma: allowlist secret + +echo "" +echo "=== fl4 directory ===" +echo "Current contents:" +ls -la ${FL4}/train_fsdp2/ 2>/dev/null || echo "(directory empty or missing)" + +if [ -d "${FL4}/train_fsdp2/step_5000" ]; then + echo "step_5000 checkpoint EXISTS in fl4!" +else + echo "step_5000 NOT found in fl4." + # Check if old run checkpoints are clobbering this one too + if [ -d "${FL4}/train_fsdp2/step_15000" ] || [ -d "${FL4}/train_fsdp2/step_20000" ]; then + FL4_OLD="${FL4}-old-run" + echo "Old run checkpoints detected. Moving..." + mv "${FL4}" "${FL4_OLD}" + echo "Moved to: ${FL4_OLD}" + mkdir -p "${FL4}/train_fsdp2" + echo "Created fresh: ${FL4}/train_fsdp2/" + fi +fi + +echo "" +echo "==========================================" +echo "Done! 
Summary:" +echo "==========================================" +echo "fl2: $(ls ${FL2}/train_fsdp2/ 2>/dev/null | tr '\n' ' ' || echo 'empty')" +echo "fl4: $(ls ${FL4}/train_fsdp2/ 2>/dev/null | tr '\n' ' ' || echo 'empty')" +if [ -d "${FL2_OLD}" ]; then + echo "fl2 old run: $(ls ${FL2_OLD}/train_fsdp2/ 2>/dev/null | tr '\n' ' ')" +fi +if [ -d "${FL4_OLD}" ]; then + echo "fl4 old run: $(ls ${FL4_OLD}/train_fsdp2/ 2>/dev/null | tr '\n' ' ')" +fi +echo "==========================================" +""" + + +def main(): + """Submit a quick Lepton job to fix checkpoint directories.""" + client = APIClient() + + node_groups = client.nodegroup.list_all() + node_group_map = {ng.metadata.name: ng for ng in node_groups} + + if NODE_GROUP not in node_group_map: + available = ", ".join(sorted(node_group_map.keys())) + raise SystemExit(f"Node group '{NODE_GROUP}' not found.\nAvailable: {available}") + + chosen_group = node_group_map[NODE_GROUP] + valid_node_ids = {n.metadata.id_ for n in client.nodegroup.list_nodes(chosen_group.metadata.id_)} + + job_spec = LeptonJobUserSpec( + resource_shape=RESOURCE_SHAPE, + affinity=LeptonResourceAffinity( + allowed_dedicated_node_groups=[chosen_group.metadata.id_], + allowed_nodes_in_node_group=valid_node_ids, + ), + container=LeptonContainer( + image="nvcr.io/nvidia/pytorch:26.02-py3", + command=["bash", "-c", SCRIPT], + ), + completions=1, + parallelism=1, + envs=[], + image_pull_secrets=["lepton-nvidia-cvai-bnmo-trng"], + mounts=[ + { + "path": "/BioNeMo", + "mount_path": "/data", + "from": "node-nfs:fs1", + }, + ], + ) + + job = LeptonJob(spec=job_spec, metadata=Metadata(id=JOB_NAME)) + + try: + launched_job = client.job.create(job) + if launched_job.status: + print(f"Job launched: {JOB_NAME}") + print( + f"View at: https://dashboard.dgxc-lepton.nvidia.com/workspace/" + f"vfco61g2/compute/jobs/detail/{launched_job.metadata.id_}/replicas/list" + ) + except Exception as e: + print(f"ERROR: {e}") + exit(1) + + +if __name__ == "__main__": + 
main() diff --git a/bionemo-recipes/recipes/opengenome2_llama_native_te/fp8_debugging_stats.yaml b/bionemo-recipes/recipes/opengenome2_llama_native_te/fp8_debugging_stats.yaml index 7544bbedcf..dfbd7d3b37 100644 --- a/bionemo-recipes/recipes/opengenome2_llama_native_te/fp8_debugging_stats.yaml +++ b/bionemo-recipes/recipes/opengenome2_llama_native_te/fp8_debugging_stats.yaml @@ -1,8 +1,8 @@ example_fp8_tensor_stat_collection: enabled: True layers: - # Match the actual linear layers within attention that support FP8 stats - layer_types: [layernorm_qkv] + # Match all TE linear layer types (proven working in ESM2 recipe) + layer_types: [layernorm_qkv, proj, fc1, fc2, layernorm_mlp] transformer_engine: LogFp8TensorStats: enabled: True @@ -16,3 +16,8 @@ example_fp8_tensor_stat_collection: - tensor: weight stats: [underflows%, scale_inv_min, scale_inv_max, mse] freq: 10 + LogTensorStats: + enabled: True + stats: [max, min, mean, std, l1_norm] + tensors: [dgrad, wgrad] + freq: 1 diff --git a/bionemo-recipes/recipes/opengenome2_llama_native_te/hydra_config/agent.yaml b/bionemo-recipes/recipes/opengenome2_llama_native_te/hydra_config/agent.yaml new file mode 100644 index 0000000000..6aff1295eb --- /dev/null +++ b/bionemo-recipes/recipes/opengenome2_llama_native_te/hydra_config/agent.yaml @@ -0,0 +1,43 @@ +# Agent control plane configuration +# +# Enable the AI-in-the-loop FP8 debugger by adding `agent.enabled=true` to +# your Hydra overrides. The agent sidecar job writes interventions to +# `control_file`; the training loop polls it every `poll_every_n_steps`. +# +# Usage: +# torchrun ... 
train_fsdp2.py --config-name og2_7b_thd_gqa_fp8 agent.enabled=true + +agent: + enabled: false + + # Shared NFS paths for agent <-> training communication + control_file: /data/agent/control.yaml + metrics_file: /data/agent/metrics.jsonl + journal_file: /data/agent/journal.jsonl + + # How often the training loop checks for new control-plane instructions + poll_every_n_steps: 1 + + # Minimum steps between successive Tier-1 interventions (observe before acting) + cooldown_steps: 500 + + # Safety bounds for Tier-1 hot-reload parameters [min, max] + bounds: + learning_rate: [0.0, 1.0] + grad_clip_norm: [0.1, 100.0] + logging_frequency: [1, 10000] + + # --- Agent daemon settings (used by agent_daemon.py, not the training loop) --- + # Monitor loop poll interval in seconds + monitor_interval_seconds: 30 + + # Anomaly detection thresholds + loss_spike_threshold_pct: 50.0 + grad_norm_spike_multiplier: 3.0 + nan_halt: true + + # Tier-2 restart limits + max_tier2_restarts: 3 + + # Start in observation-only mode (no interventions until toggled) + observation_only: true diff --git a/bionemo-recipes/recipes/opengenome2_llama_native_te/hydra_config/defaults.yaml b/bionemo-recipes/recipes/opengenome2_llama_native_te/hydra_config/defaults.yaml index 4295017f26..1b020f2fdb 100644 --- a/bionemo-recipes/recipes/opengenome2_llama_native_te/hydra_config/defaults.yaml +++ b/bionemo-recipes/recipes/opengenome2_llama_native_te/hydra_config/defaults.yaml @@ -18,10 +18,7 @@ use_weight_decay_grouping: true skip_embedding_weight_decay: true use_fp32_master_weights: true -config_kwargs: - fp8_first_last_bf16: false - num_layers_at_start_in_bf16: 1 - num_layers_at_end_in_bf16: 1 +config_kwargs: {} num_train_steps: ??? 
grad_acc_steps: 1 @@ -65,6 +62,17 @@ fp8_config: quantized_model_init_kwargs: enabled: false +# FP4 config +fp4_config: + enabled: false + fp4_recipe: transformer_engine.common.recipe.NVFP4BlockScaling + fp4_format: "E2M1" + fp4_recipe_kwargs: {} + +# Layer-wise quantization: 1-indexed layer lists (null = all layers use the enabled format) +fp8_layers: null +fp4_layers: null + # Optimizer config adamw_kwargs: lr: 3e-3 @@ -98,10 +106,10 @@ validation: num_batches: 10 data_path: null -fp8_stats_config: +quant_stats_config: enabled: false - fp8_stats_file: ./fp8_debugging_stats.yaml - fp8_log_dir: ./log_fp8_stats + quant_stats_file: ./fp8_debugging_stats.yaml + quant_log_dir: ./log_quant_stats profiler: enabled: false diff --git a/bionemo-recipes/recipes/opengenome2_llama_native_te/hydra_config/og2_7b_bf16_1k_from_5k.yaml b/bionemo-recipes/recipes/opengenome2_llama_native_te/hydra_config/og2_7b_bf16_1k_from_5k.yaml new file mode 100644 index 0000000000..04f07e2623 --- /dev/null +++ b/bionemo-recipes/recipes/opengenome2_llama_native_te/hydra_config/og2_7b_bf16_1k_from_5k.yaml @@ -0,0 +1,53 @@ +# OpenGenome2 7B - BF16 baseline, 1000 steps from 5k checkpoint (1 node) +# +# Resumes from the existing 5k BF16 checkpoint and trains for 1000 more steps +# (steps 5001-6000) in pure BF16. Single-node on Lepton (8x H100). +# Used as the baseline reference for the FP8 precision agent demo. 
+# +# Usage: +# torchrun --nproc_per_node=8 train_fsdp2.py --config-name og2_7b_bf16_1k_from_5k \ +# checkpoint.ckpt_dir=/data/savithas/checkpoints/og2-7b-bf16-baseline-1node + +defaults: + - og2_7b_thd_gqa + - _self_ + +# Dataset: JSON metagenomes on Lepton NFS, 10k buffer, 8 workers +dataset: + micro_batch_size: 1 + buffer_size: 10_000 + num_workers: 8 + load_dataset_kwargs: + path: "json" + data_files: "/data/opengenome2/json/pretraining_or_both_phases/metagenomes/data_metagenomics_train_*.jsonl.gz" + data_dir: null + split: "train" + streaming: true + +# Train for 6000 total steps (resumes from 5001, runs 1000 new steps) +num_train_steps: 6000 +grad_acc_steps: 8 + +# FP8 disabled (all BF16) +fp8_config: + enabled: false + +# Checkpoint: resume from existing 5k checkpoint +checkpoint: + ckpt_dir: ??? + save_final_model: true + resume_from_checkpoint: true + save_every_n_steps: 100 + max_checkpoints: 5 + async_save: true + +# Validation disabled for this short run +validation: + enabled: false + +logger: + frequency: 1 + +wandb: + name: og2-7b-bf16-baseline-1node-from-5k + project: llama3-metagenome-7b diff --git a/bionemo-recipes/recipes/opengenome2_llama_native_te/hydra_config/og2_7b_fp8_fl2_quant_stats.yaml b/bionemo-recipes/recipes/opengenome2_llama_native_te/hydra_config/og2_7b_fp8_fl2_quant_stats.yaml new file mode 100644 index 0000000000..bac2db33ce --- /dev/null +++ b/bionemo-recipes/recipes/opengenome2_llama_native_te/hydra_config/og2_7b_fp8_fl2_quant_stats.yaml @@ -0,0 +1,24 @@ +# OpenGenome2 7B - FP8 Block Scaling, FL2 (first/last 2 layers BF16) +# Quant stats logging enabled to diagnose fl2 vs fl4 convergence difference. +# +# Layers 3-30 in FP8 (1-indexed), layers 1-2 and 31-32 in BF16. +# Resume from step 25000 checkpoint. 
+ +defaults: + - og2_7b_thd_gqa_fp8 + - _self_ + +# FL2: layers 3-30 in FP8 (1-indexed), layers 1-2 and 31-32 in BF16 +fp8_layers: [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30] + +# Enable quant stats logging +quant_stats_config: + enabled: true + quant_stats_file: ./fp8_debugging_stats.yaml + quant_log_dir: /data/savithas/checkpoints/og2-7b-fp8-fl2-quant-stats/quant_logs + +num_train_steps: 182314 + +wandb: + name: og2-7b-fp8-fl2-quant-stats-25k + project: llama3-metagenome-7b diff --git a/bionemo-recipes/recipes/opengenome2_llama_native_te/hydra_config/og2_7b_fp8_fl4_quant_stats.yaml b/bionemo-recipes/recipes/opengenome2_llama_native_te/hydra_config/og2_7b_fp8_fl4_quant_stats.yaml new file mode 100644 index 0000000000..b6d2d41a39 --- /dev/null +++ b/bionemo-recipes/recipes/opengenome2_llama_native_te/hydra_config/og2_7b_fp8_fl4_quant_stats.yaml @@ -0,0 +1,24 @@ +# OpenGenome2 7B - FP8 Block Scaling, FL4 (first/last 4 layers BF16) +# Quant stats logging enabled to diagnose fl2 vs fl4 convergence difference. +# Resume from step 15000 checkpoint. +# +# Layers 5-28 in FP8 (1-indexed), layers 1-4 and 29-32 in BF16. 
+ +defaults: + - og2_7b_thd_gqa_fp8 + - _self_ + +# FL4: layers 5-28 in FP8 (1-indexed), layers 1-4 and 29-32 in BF16 +fp8_layers: [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28] + +# Enable quant stats logging +quant_stats_config: + enabled: true + quant_stats_file: ./fp8_debugging_stats.yaml + quant_log_dir: /data/savithas/checkpoints/og2-7b-fp8-fl4-quant-stats/quant_logs + +num_train_steps: 182314 + +wandb: + name: og2-7b-fp8-fl4-quant-stats-15k + project: llama3-metagenome-7b diff --git a/bionemo-recipes/recipes/opengenome2_llama_native_te/hydra_config/og2_7b_thd_gqa.yaml b/bionemo-recipes/recipes/opengenome2_llama_native_te/hydra_config/og2_7b_thd_gqa.yaml index 48df3cf681..1543dfef23 100644 --- a/bionemo-recipes/recipes/opengenome2_llama_native_te/hydra_config/og2_7b_thd_gqa.yaml +++ b/bionemo-recipes/recipes/opengenome2_llama_native_te/hydra_config/og2_7b_thd_gqa.yaml @@ -59,21 +59,21 @@ use_meta_device: false # Sequence packing use_sequence_packing: true -# Dataset +# Dataset: pre-chunked parquet2 shards (globally shuffled) dataset: tokenizer_name_or_path: ./tokenizers/nucleotide_fast_tokenizer micro_batch_size: 2 - num_workers: 1 + num_workers: 8 max_seq_length: 8192 stride: 200 - buffer_size: 50_000 + buffer_size: 10_000 mask_degenerate_bases: true uppercase_labels: false load_dataset_kwargs: - path: "json" - data_files: "/data/opengenome2/json/pretraining_or_both_phases/metagenomes/data_metagenomics_train_*.jsonl.gz" + path: "/data/opengenome2/parquet2" + data_files: null split: "train" - streaming: True + streaming: true # Training config num_train_steps: 182314 diff --git a/bionemo-recipes/recipes/opengenome2_llama_native_te/hydra_config/og2_7b_thd_gqa_fp8.yaml b/bionemo-recipes/recipes/opengenome2_llama_native_te/hydra_config/og2_7b_thd_gqa_fp8.yaml index 1070e88a0d..be66c6f671 100644 --- a/bionemo-recipes/recipes/opengenome2_llama_native_te/hydra_config/og2_7b_thd_gqa_fp8.yaml +++ 
b/bionemo-recipes/recipes/opengenome2_llama_native_te/hydra_config/og2_7b_thd_gqa_fp8.yaml @@ -15,7 +15,8 @@ # OpenGenome2 7B - THD + GQA + FP8 + FP32 Master Weights # -# Same as og2_7b_thd_gqa but with FP8 enabled (first/last layers in BF16 for stability). +# Same as og2_7b_thd_gqa but with FP8 enabled. +# First and last layers kept in BF16 for stability via fp8_layers list. # Uses Float8BlockScaling recipe with E4M3 format. # # Usage: @@ -25,7 +26,7 @@ defaults: - og2_7b_thd_gqa - _self_ -# Enable FP8 first/last BF16 for numerical stability +# Model architecture config config_kwargs: vocab_size: 256 num_hidden_layers: 32 @@ -43,7 +44,9 @@ config_kwargs: low_freq_factor: 1.0 high_freq_factor: 4.0 original_max_position_embeddings: 8192 - fp8_first_last_bf16: true + +# First/last layers in BF16 for numerical stability (layers 2-31 in FP8, 1-indexed) +fp8_layers: [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] # FP8 configuration - Float8BlockScaling with E4M3 format fp8_config: @@ -51,12 +54,9 @@ fp8_config: fp8_recipe: transformer_engine.common.recipe.Float8BlockScaling fp8_format: E4M3 fp8_recipe_kwargs: {} - quantized_model_init_kwargs: - enabled: false dataset: micro_batch_size: 1 - buffer_size: 500_000 grad_acc_steps: 8 diff --git a/bionemo-recipes/recipes/opengenome2_llama_native_te/hydra_config/og2_7b_thd_gqa_global_shuffle.yaml b/bionemo-recipes/recipes/opengenome2_llama_native_te/hydra_config/og2_7b_thd_gqa_global_shuffle.yaml index 80e3baa4e6..f22dcb5d66 100644 --- a/bionemo-recipes/recipes/opengenome2_llama_native_te/hydra_config/og2_7b_thd_gqa_global_shuffle.yaml +++ b/bionemo-recipes/recipes/opengenome2_llama_native_te/hydra_config/og2_7b_thd_gqa_global_shuffle.yaml @@ -37,8 +37,6 @@ fp8_config: fp8_recipe: transformer_engine.common.recipe.DelayedScaling fp8_format: "HYBRID" fp8_recipe_kwargs: {} - quantized_model_init_kwargs: - enabled: false # Dataset: pre-chunked globally shuffled 
shards dataset: diff --git a/bionemo-recipes/recipes/opengenome2_llama_native_te/lepton_configs/claude_agent_demo.yaml b/bionemo-recipes/recipes/opengenome2_llama_native_te/lepton_configs/claude_agent_demo.yaml new file mode 100644 index 0000000000..10d91f84cc --- /dev/null +++ b/bionemo-recipes/recipes/opengenome2_llama_native_te/lepton_configs/claude_agent_demo.yaml @@ -0,0 +1,39 @@ +defaults: + - _self_ + +job_name: "claude-agent-demo" +node_group: "yo-bom-lepton-001" +resource_shape: "gpu.8xh100-sxm" + +num_nodes: 1 +gpus_per_node: 8 + +# Code paths on NFS +repo_root: "/data/savithas/bionemo-framework" +code_path: "/data/savithas/bionemo-framework/bionemo-recipes/recipes/opengenome2_llama_native_te" +git_branch: "savitha/claude-lepton-dev-experiment" + +# Training config (tiny model for demo) +hydra_config: "L0_sanity" +train_script: "train_fsdp2.py" +num_train_steps: 250 + +# Agent config +checkin_interval: 50 +promotion_strategy: "ends_in" +workspace_root: "/data/savithas/agent_runs/demo" +checkpoint_root: "/data/savithas/checkpoints" + +# Claude Code config +claude_model: "aws/anthropic/bedrock-claude-opus-4-6" +anthropic_secret: "anthropic.nim.savithas" # pragma: allowlist secret +anthropic_base_url: "https://inference-api.nvidia.com" + +# WandB +wandb_project: "claude-agent-demo" +wandb_secret: "wandb.savithas" # pragma: allowlist secret + +# Container +container: + image: "nvcr.io/nvidia/pytorch:26.02-py3" + registry_auth: "lepton-nvidia-cvai-bnmo-trng" diff --git a/bionemo-recipes/recipes/opengenome2_llama_native_te/lepton_configs/og2_bf16_baseline_1node.yaml b/bionemo-recipes/recipes/opengenome2_llama_native_te/lepton_configs/og2_bf16_baseline_1node.yaml new file mode 100644 index 0000000000..4152a0dc75 --- /dev/null +++ b/bionemo-recipes/recipes/opengenome2_llama_native_te/lepton_configs/og2_bf16_baseline_1node.yaml @@ -0,0 +1,39 @@ +defaults: + - _self_ + +job_name: "og2-bf16-baseline-continuous" +node_group: "yo-bom-lepton-001" +resource_shape: 
"gpu.8xh100-sxm" + +num_nodes: 1 +gpus_per_node: 8 + +# Code paths on NFS +repo_root: "/data/savithas/bionemo-framework" +code_path: "/data/savithas/bionemo-framework/bionemo-recipes/recipes/opengenome2_llama_native_te" +git_branch: "savitha/claude-lepton-dev-experiment" + +# Training config +hydra_config: "og2_7b_bf16_1k_from_5k" +train_script: "train_fsdp2.py" + +# Resume from external 5k checkpoint (symlinked into ckpt_dir) +resume_from: + external_checkpoint: "/data/savithas/checkpoints/og2-7b-fp32mw-orig-ws-w1-b50k" # pragma: allowlist secret + step: 5000 + +# Hydra CLI overrides (passed after --config-name) +hydra_overrides: + checkpoint.ckpt_dir: "/data/savithas/checkpoints/og2-7b-bf16-baseline-continuous" + checkpoint.async_save: false + checkpoint.save_every_n_steps: 0 + checkpoint.save_final_model: false + wandb.name: "og2-7b-bf16-baseline-continuous" + +# WandB +wandb_secret: "wandb.savithas" # pragma: allowlist secret + +# Container +container: + image: "nvcr.io/nvidia/pytorch:26.02-py3" + registry_auth: "lepton-nvidia-cvai-bnmo-trng" diff --git a/bionemo-recipes/recipes/opengenome2_llama_native_te/lepton_configs/og2_fp8_agent.yaml b/bionemo-recipes/recipes/opengenome2_llama_native_te/lepton_configs/og2_fp8_agent.yaml new file mode 100644 index 0000000000..4185169c01 --- /dev/null +++ b/bionemo-recipes/recipes/opengenome2_llama_native_te/lepton_configs/og2_fp8_agent.yaml @@ -0,0 +1,39 @@ +defaults: + - _self_ + +job_name: "og2-fp8-agent" +node_group: "yo-bom-lepton-001" +resource_shape: "gpu.8xh100-sxm" + +num_nodes: 6 +gpus_per_node: 8 + +# Code paths on NFS +repo_root: "/data/savithas/bionemo-framework" +code_path: "/data/savithas/bionemo-framework/bionemo-recipes/recipes/opengenome2_llama_native_te" +git_branch: "savitha/claude-lepton-dev-experiment" + +# Training config (OG2-7B FP8 Block Scaling) +hydra_config: "og2_7b_thd_gqa_fp8" +train_script: "train_fsdp2.py" +num_train_steps: 182300 + +# Agent config +checkin_interval: 100 +promotion_strategy: 
"ends_in" +workspace_root: "/data/savithas/agent_runs" +checkpoint_root: "/data/savithas/checkpoints" + +# Claude Code config +claude_model: "aws/anthropic/bedrock-claude-opus-4-6" +anthropic_secret: "anthropic.nim.savithas" # pragma: allowlist secret +anthropic_base_url: "https://inference-api.nvidia.com" + +# WandB +wandb_project: "opengenome2-7b" +wandb_secret: "wandb.savithas" # pragma: allowlist secret + +# Container +container: + image: "nvcr.io/nvidia/pytorch:26.02-py3" + registry_auth: "lepton-nvidia-cvai-bnmo-trng" diff --git a/bionemo-recipes/recipes/opengenome2_llama_native_te/lepton_configs/og2_fp8_agent_1node_demo.yaml b/bionemo-recipes/recipes/opengenome2_llama_native_te/lepton_configs/og2_fp8_agent_1node_demo.yaml new file mode 100644 index 0000000000..ae31c4811b --- /dev/null +++ b/bionemo-recipes/recipes/opengenome2_llama_native_te/lepton_configs/og2_fp8_agent_1node_demo.yaml @@ -0,0 +1,56 @@ +defaults: + - _self_ + +job_name: "og2-fp8-agent-1node-demo" +node_group: "yo-bom-lepton-001" +resource_shape: "gpu.8xh100-sxm" + +num_nodes: 1 +gpus_per_node: 8 + +# Code paths on NFS +repo_root: "/data/savithas/bionemo-framework" +code_path: "/data/savithas/bionemo-framework/bionemo-recipes/recipes/opengenome2_llama_native_te" +git_branch: "savitha/claude-lepton-dev-experiment" + +# Training config (OG2-7B, agent controls fp8_layers via CLI) +hydra_config: "og2_7b_bf16_1k_from_5k" +train_script: "train_fsdp2.py" +num_train_steps: 6000 + +# Agent config — gradual strategy (expand FP8 from center outward) +checkin_interval: 100 +tolerance_pct: 3.0 +promotion_strategy: "gradual" +workspace_root: "/data/savithas/agent_runs/demo_1node" +checkpoint_root: "/data/savithas/checkpoints" + +# Claude Code config +claude_model: "aws/anthropic/bedrock-claude-opus-4-6" +anthropic_secret: "anthropic.nim.savithas" # pragma: allowlist secret +anthropic_base_url: "https://inference-api.nvidia.com" + +# WandB +wandb_project: "llama3-metagenome-7b" +wandb_secret: 
"wandb.savithas" # pragma: allowlist secret + +# Baseline (1-node specific) +baseline_logfile: "baseline_bf16_1node.json" + +# Warm start from the same 5k BF16 checkpoint +# Agent starts all BF16 (warmup), then gradually adds FP8 from center outward +warm_start: + enabled: true + external_checkpoint: "/data/savithas/checkpoints/og2-7b-fp32mw-orig-ws-w1-b50k" # pragma: allowlist secret + lkg_step: 5000 + # All layers start in BF16 — agent will expand FP8 from center after warmup + fp8_layers: [] + demotion_round: 0 + # Gradual strategy pointers (expand from center) + center_low: 16 + center_high: 17 + +# Container +container: + image: "nvcr.io/nvidia/pytorch:26.02-py3" + registry_auth: "lepton-nvidia-cvai-bnmo-trng" diff --git a/bionemo-recipes/recipes/opengenome2_llama_native_te/lepton_configs/og2_fp8_agent_fl2_research.yaml b/bionemo-recipes/recipes/opengenome2_llama_native_te/lepton_configs/og2_fp8_agent_fl2_research.yaml new file mode 100644 index 0000000000..95d36b1c10 --- /dev/null +++ b/bionemo-recipes/recipes/opengenome2_llama_native_te/lepton_configs/og2_fp8_agent_fl2_research.yaml @@ -0,0 +1,19 @@ +defaults: + - og2_fp8_agent + - _self_ + +# Warm-start from fl2 (first/last 2 layers BF16) 10K checkpoint +# Strategy: research_guided — uses runtime quant stats to pick demotion order + +job_name: "og2-fp8-research-fl2-10k" +promotion_strategy: "research_guided" +wandb_project: "llama3-metagenome-7b" +tolerance_pct: 1.0 + +warm_start: + enabled: true + external_checkpoint: "/data/savithas/checkpoints/og2-7b-fp8-refactor-bf16-fl2-fp32mw" # pragma: allowlist secret + lkg_step: 10000 + # Layers 3-30 in FP8, layers 1-2 and 31-32 in BF16 (1-indexed) + fp8_layers: [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30] + demotion_round: 2 diff --git a/bionemo-recipes/recipes/opengenome2_llama_native_te/lepton_configs/og2_fp8_agent_fl2_warmstart.yaml 
b/bionemo-recipes/recipes/opengenome2_llama_native_te/lepton_configs/og2_fp8_agent_fl2_warmstart.yaml new file mode 100644 index 0000000000..db087fd7ff --- /dev/null +++ b/bionemo-recipes/recipes/opengenome2_llama_native_te/lepton_configs/og2_fp8_agent_fl2_warmstart.yaml @@ -0,0 +1,22 @@ +defaults: + - og2_fp8_agent + - _self_ + +# Warm-start from fl2 (first/last 2 layers BF16) 10K checkpoint +# Strategy: ends_in, rounds 1-2 already done + +job_name: "og2-fp8-ends-in-fl2-10k" +promotion_strategy: "ends_in" +wandb_project: "llama3-metagenome-7b" +tolerance_pct: 1.0 + +warm_start: + enabled: true + external_checkpoint: "/data/savithas/checkpoints/og2-7b-fp8-refactor-bf16-fl2-fp32mw" # pragma: allowlist secret + lkg_step: 10000 + # Layers 3-30 in FP8, layers 1-2 and 31-32 in BF16 (1-indexed) + fp8_layers: [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30] + demotion_round: 2 + # ends_in pointers: next demotion would be layers 3 and 30 + bottom_ptr: 3 + top_ptr: 30 diff --git a/bionemo-recipes/recipes/opengenome2_llama_native_te/lepton_configs/og2_fp8_agent_fl4_warmstart.yaml b/bionemo-recipes/recipes/opengenome2_llama_native_te/lepton_configs/og2_fp8_agent_fl4_warmstart.yaml new file mode 100644 index 0000000000..6a9301fcf0 --- /dev/null +++ b/bionemo-recipes/recipes/opengenome2_llama_native_te/lepton_configs/og2_fp8_agent_fl4_warmstart.yaml @@ -0,0 +1,21 @@ +defaults: + - og2_fp8_agent + - _self_ + +# Warm-start from fl4 (first/last 4 layers BF16) 5K checkpoint +# Strategy: ends_in, rounds 1-4 already done + +job_name: "og2-fp8-ends-in-fl4-5k" +promotion_strategy: "ends_in" +wandb_project: "llama3-metagenome-7b" + +warm_start: + enabled: true + external_checkpoint: "/data/savithas/checkpoints/og2-7b-fp8-refactor-bf16-fl4-fp32mw" # pragma: allowlist secret + lkg_step: 5000 + # Layers 5-28 in FP8, layers 1-4 and 29-32 in BF16 (1-indexed) + fp8_layers: [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 
22, 23, 24, 25, 26, 27, 28] + demotion_round: 4 + # ends_in pointers: next demotion would be layers 5 and 28 + bottom_ptr: 5 + top_ptr: 28 diff --git a/bionemo-recipes/recipes/opengenome2_llama_native_te/lepton_configs/og2_fp8_fl2_quant_stats.yaml b/bionemo-recipes/recipes/opengenome2_llama_native_te/lepton_configs/og2_fp8_fl2_quant_stats.yaml new file mode 100644 index 0000000000..a9459164ce --- /dev/null +++ b/bionemo-recipes/recipes/opengenome2_llama_native_te/lepton_configs/og2_fp8_fl2_quant_stats.yaml @@ -0,0 +1,73 @@ +# OpenGenome2 7B - FP8 Block Scaling, FL2 (first/last 2 layers BF16) +# Quant stats logging enabled to diagnose fl2 vs fl4 convergence difference. +# Resume from fl2 checkpoint at step 25000. +# +# Layers 3-30 in FP8 (1-indexed), layers 1-2 and 31-32 in BF16. +# GBS = mbs * grad_acc * dp_size = 1 * 8 * 48 = 384 +defaults: + - _self_ + +job_name: "og2-fp8-fl2-quant-stats" +node_group: "yo-bom-lepton-001" +resource_shape: "gpu.8xh100-sxm" + +num_nodes: 6 +gpus_per_node: 8 +num_train_steps: 182314 +micro_batch_size: 1 +grad_acc_steps: 8 + +# Match original fl2 run: jsonl.gz metagenome files (not parquet2) +dataset_path: "json" +data_dir: "" +num_workers: 8 +buffer_size: 10000 + +repo_root: "/data/savithas/bionemo-framework" +code_path: "/data/savithas/bionemo-framework/bionemo-recipes/recipes/opengenome2_llama_native_te" +train_script: "train_fsdp2.py" +hydra_config: "og2_7b_fp8_fl2_quant_stats" + +git_branch: "savitha/claude-lepton-dev-experiment" + +validation_enabled: false + +spike_no_more_embedding_init: true +skip_embedding_weight_decay: true +use_megatron_scaled_init: true +use_weight_decay_grouping: true +use_meta_device: false + +# FP8 with first/last 2 layers in BF16 (block scaling) +fp8_enabled: true +fp8_recipe: transformer_engine.common.recipe.Float8BlockScaling +fp8_format: E4M3 +use_fp32_master_weights: true + +extra_hydra_overrides: + - "++quant_stats_config.enabled=true" + - 
"++quant_stats_config.quant_stats_file=./fp8_debugging_stats.yaml" + - "++quant_stats_config.quant_log_dir=/data/savithas/checkpoints/og2-7b-fp8-fl2-quant-stats/quant_logs" + - 'dataset.load_dataset_kwargs.data_files="/data/opengenome2/json/pretraining_or_both_phases/metagenomes/data_metagenomics_train_*.jsonl.gz"' + +logger_frequency: 1 + +# New checkpoint dir for this run's saves and logs +checkpoint_dir: "/data/savithas/checkpoints/og2-7b-fp8-fl2-quant-stats" # pragma: allowlist secret +save_every_n_steps: 5000 +resume_from_checkpoint: true +async_save: false + +# Copy step_25000 from original fl2 run into new dir so it resumes from there +pre_training_commands: + - 'if [ ! -d /data/savithas/checkpoints/og2-7b-fp8-fl2-quant-stats/train_fsdp2/step_25000 ]; then echo "Copying step_25000 checkpoint..."; cp -a /data/savithas/checkpoints/og2-7b-fp8-refactor-bf16-fl2-fp32mw/train_fsdp2/step_25000 /data/savithas/checkpoints/og2-7b-fp8-fl2-quant-stats/train_fsdp2/step_25000; echo "Done."; fi' + +wandb_project: "llama3-metagenome-7b" +wandb_name: "og2-7b-fp8-fl2-quant-stats-25k" +wandb_secret: "wandb.savithas" # pragma: allowlist secret + +hf_secret: "HUGGING_FACE_HUB_TOKEN.savithas" # pragma: allowlist secret + +container: + image: "nvcr.io/nvidia/pytorch:26.02-py3" + registry_auth: "lepton-nvidia-cvai-bnmo-trng" diff --git a/bionemo-recipes/recipes/opengenome2_llama_native_te/lepton_configs/og2_fp8_fl4_quant_stats.yaml b/bionemo-recipes/recipes/opengenome2_llama_native_te/lepton_configs/og2_fp8_fl4_quant_stats.yaml new file mode 100644 index 0000000000..cdeea4daf6 --- /dev/null +++ b/bionemo-recipes/recipes/opengenome2_llama_native_te/lepton_configs/og2_fp8_fl4_quant_stats.yaml @@ -0,0 +1,73 @@ +# OpenGenome2 7B - FP8 Block Scaling, FL4 (first/last 4 layers BF16) +# Quant stats logging enabled to diagnose fl2 vs fl4 convergence difference. +# Resume from fl4 checkpoint at step 15000. +# +# Layers 5-28 in FP8 (1-indexed), layers 1-4 and 29-32 in BF16. 
+# GBS = mbs * grad_acc * dp_size = 1 * 8 * 48 = 384 +defaults: + - _self_ + +job_name: "og2-fp8-fl4-quant-stats" +node_group: "yo-bom-lepton-001" +resource_shape: "gpu.8xh100-sxm" + +num_nodes: 6 +gpus_per_node: 8 +num_train_steps: 182314 +micro_batch_size: 1 +grad_acc_steps: 8 + +# Match original fl4 run: jsonl.gz metagenome files +dataset_path: "json" +data_dir: "" +num_workers: 8 +buffer_size: 10000 + +repo_root: "/data/savithas/bionemo-framework" +code_path: "/data/savithas/bionemo-framework/bionemo-recipes/recipes/opengenome2_llama_native_te" +train_script: "train_fsdp2.py" +hydra_config: "og2_7b_fp8_fl4_quant_stats" + +git_branch: "savitha/claude-lepton-dev-experiment" + +validation_enabled: false + +spike_no_more_embedding_init: true +skip_embedding_weight_decay: true +use_megatron_scaled_init: true +use_weight_decay_grouping: true +use_meta_device: false + +# FP8 with first/last 4 layers in BF16 (block scaling) +fp8_enabled: true +fp8_recipe: transformer_engine.common.recipe.Float8BlockScaling +fp8_format: E4M3 +use_fp32_master_weights: true + +extra_hydra_overrides: + - "++quant_stats_config.enabled=true" + - "++quant_stats_config.quant_stats_file=./fp8_debugging_stats.yaml" + - "++quant_stats_config.quant_log_dir=/data/savithas/checkpoints/og2-7b-fp8-fl4-quant-stats/quant_logs" + - 'dataset.load_dataset_kwargs.data_files="/data/opengenome2/json/pretraining_or_both_phases/metagenomes/data_metagenomics_train_*.jsonl.gz"' + +logger_frequency: 1 + +# New checkpoint dir for this run +checkpoint_dir: "/data/savithas/checkpoints/og2-7b-fp8-fl4-quant-stats" # pragma: allowlist secret +save_every_n_steps: 5000 +resume_from_checkpoint: true +async_save: false + +# Copy step_15000 from original fl4 checkpoint into new dir +pre_training_commands: + - 'if [ ! -d /data/savithas/checkpoints/og2-7b-fp8-fl4-quant-stats/train_fsdp2/step_15000 ]; then mkdir -p /data/savithas/checkpoints/og2-7b-fp8-fl4-quant-stats/train_fsdp2 && echo "Copying step_15000 checkpoint..." 
&& cp -a /data/savithas/checkpoints/og2-7b-fp8-refactor-bf16-fl4-fp32mw/train_fsdp2/step_15000 /data/savithas/checkpoints/og2-7b-fp8-fl4-quant-stats/train_fsdp2/step_15000 && echo "Done."; fi' + +wandb_project: "llama3-metagenome-7b" +wandb_name: "og2-7b-fp8-fl4-quant-stats-15k" +wandb_secret: "wandb.savithas" # pragma: allowlist secret + +hf_secret: "HUGGING_FACE_HUB_TOKEN.savithas" # pragma: allowlist secret + +container: + image: "nvcr.io/nvidia/pytorch:26.02-py3" + registry_auth: "lepton-nvidia-cvai-bnmo-trng" diff --git a/bionemo-recipes/recipes/opengenome2_llama_native_te/opengenome_modeling_llama_te.py b/bionemo-recipes/recipes/opengenome2_llama_native_te/opengenome_modeling_llama_te.py index edf6090bd9..4d3bd620b2 100644 --- a/bionemo-recipes/recipes/opengenome2_llama_native_te/opengenome_modeling_llama_te.py +++ b/bionemo-recipes/recipes/opengenome2_llama_native_te/opengenome_modeling_llama_te.py @@ -18,7 +18,7 @@ Extends the base NVLlama model (modeling_llama_te.py) with OG2-specific features: - Megatron-style scaled initialization for residual output layers (proj/fc2) - Spike-No-More embedding initialization (std=1.0) -- FP8 training with configurable first/last layer BF16 override +- Layer-wise FP8/FP4 quantization with per-layer autocast - RoPE theta fix for transformers >=5.0 compatibility The base modeling_llama_te.py is kept as an exact CI-synced copy of models/llama3/modeling_llama_te.py. 
@@ -27,12 +27,14 @@ import logging import math +import warnings from collections import OrderedDict from contextlib import nullcontext -from typing import ClassVar, Unpack +from typing import ClassVar, ContextManager, Unpack import torch import torch.nn as nn +import transformer_engine.common.recipe import transformer_engine.pytorch import transformers from transformer_engine.pytorch.attention import InferenceParams @@ -74,19 +76,38 @@ class NVLlamaConfig(LlamaConfig): use_megatron_scaled_init: Whether to use Megatron's scaled initialization for residual output layers (attention proj, MLP fc2). Scaled init uses std / sqrt(2 * num_layers) for these layers. - fp8_first_last_bf16: When True, keeps first and last N transformer layers - in bf16 for FP8 numerical stability. The lm_head is always kept in bf16. - num_layers_at_start_in_bf16: Number of layers at the start to keep in BF16. - num_layers_at_end_in_bf16: Number of layers at the end to keep in BF16. """ attn_input_format: str = "thd" self_attn_mask_type: str = "padding_causal" embedding_init_std: float | None = None # None means use initializer_range use_megatron_scaled_init: bool = False # Use scaled init for proj/fc2 (std/sqrt(2*n)) - fp8_first_last_bf16: bool = False # Keep first/last transformer layers in bf16 for FP8 stability - num_layers_at_start_in_bf16: int = 1 # Number of layers at start to keep in BF16 - num_layers_at_end_in_bf16: int = 1 # Number of layers at end to keep in BF16 + + def __init__( + self, + layer_precision: list[str | None] | None = None, + use_quantized_model_init: bool = False, + **kwargs, + ): + """Initialize the NVLlamaConfig with additional TE-related config options. + + Args: + layer_precision: Per-layer quantization precision, a list of length ``num_hidden_layers`` + where each element is ``"fp8"``, ``"fp4"``, or ``None`` (BF16 fallback). ``None`` + (the default) means no quantization is configured. 
+ use_quantized_model_init: Whether to use `quantized_model_init` for layer initialization. + **kwargs: Additional config options to pass to LlamaConfig. + """ + super().__init__(**kwargs) + self.layer_precision = layer_precision + self.use_quantized_model_init = use_quantized_model_init + + if layer_precision is not None: + if len(layer_precision) != self.num_hidden_layers: + raise ValueError(f"layer_precision must be a list of length {self.num_hidden_layers}") + for precision in layer_precision: + if precision not in {"fp8", "fp4", None}: + raise ValueError(f'layer_precision element must be "fp8", "fp4", or None, got {precision!r}') class NVLlamaPreTrainedModel(PreTrainedModel): @@ -245,12 +266,29 @@ def _init_weights(self, module): class NVLlamaModel(NVLlamaPreTrainedModel): """OpenGenome2 Llama3 model implemented in Transformer Engine.""" - def __init__(self, config: LlamaConfig): - """Initialize the OG2 NVLlama model.""" + def __init__( + self, + config: LlamaConfig, + fp8_recipe: transformer_engine.common.recipe.Recipe | None = None, + fp4_recipe: transformer_engine.common.recipe.Recipe | None = None, + ): + """Initialize the OG2 NVLlama model. + + Args: + config: The configuration of the model. + fp8_recipe: The FP8 recipe for the model (used during init for quantized_model_init). + fp4_recipe: The FP4 recipe for the model (used during init for quantized_model_init). 
+ """ super().__init__(config) self.config = config self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size + self._fp8_recipe: transformer_engine.common.recipe.Recipe | None = fp8_recipe + self._fp4_recipe: transformer_engine.common.recipe.Recipe | None = fp4_recipe + + if self.config.layer_precision is None and fp8_recipe is not None: + warnings.warn("No layer precision provided, using FP8 recipe for all layers.", UserWarning) + self.config.layer_precision = ["fp8"] * self.config.num_hidden_layers self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx, dtype=config.dtype) @@ -261,31 +299,32 @@ def __init__(self, config: LlamaConfig): def _init_method(x): torch.nn.init.normal_(x, mean=0.0, std=config.initializer_range) - self.layers = nn.ModuleList( - [ - transformer_engine.pytorch.TransformerLayer( - hidden_size=config.hidden_size, - ffn_hidden_size=config.intermediate_size, - num_attention_heads=config.num_attention_heads, - bias=False, - layernorm_epsilon=config.rms_norm_eps, - hidden_dropout=0, - attention_dropout=0, - fuse_qkv_params=True, - qkv_weight_interleaved=True, - normalization="RMSNorm", - activation="swiglu", - attn_input_format=config.attn_input_format, - self_attn_mask_type=config.self_attn_mask_type, - num_gqa_groups=config.num_key_value_heads, - layer_number=layer_idx + 1, - params_dtype=config.dtype, - device="meta" if torch.get_default_device() == torch.device("meta") else "cuda", - init_method=_init_method, - ) - for layer_idx in range(config.num_hidden_layers) - ] - ) + layers: list[transformer_engine.pytorch.TransformerLayer] = [] + for layer_idx in range(config.num_hidden_layers): + with self.get_autocast_context(layer_idx, init=True): + layers += [ + transformer_engine.pytorch.TransformerLayer( + hidden_size=config.hidden_size, + ffn_hidden_size=config.intermediate_size, + num_attention_heads=config.num_attention_heads, + bias=False, + layernorm_epsilon=config.rms_norm_eps, + 
hidden_dropout=0, + attention_dropout=0, + fuse_qkv_params=True, + qkv_weight_interleaved=True, + normalization="RMSNorm", + activation="swiglu", + attn_input_format=config.attn_input_format, + self_attn_mask_type=config.self_attn_mask_type, + num_gqa_groups=config.num_key_value_heads, + layer_number=layer_idx + 1, + params_dtype=config.dtype, + device="meta" if torch.get_default_device() == torch.device("meta") else "cuda", + init_method=_init_method, + ) + ] + self.layers = nn.ModuleList(layers) self.norm = transformer_engine.pytorch.RMSNorm( config.hidden_size, eps=config.rms_norm_eps, @@ -304,6 +343,65 @@ def _init_method(x): # Initialize weights and apply final processing self.post_init() + def set_recipes( + self, + fp8_recipe: transformer_engine.common.recipe.Recipe | None = None, + fp4_recipe: transformer_engine.common.recipe.Recipe | None = None, + ) -> None: + """Set quantization recipes after FSDP wrapping. + + Recipes are not serializable, so they cannot be passed through FSDP's ``__init__``. + Call this after ``fully_shard()`` to attach recipes for the forward pass. + + Args: + fp8_recipe: The FP8 recipe for the model. + fp4_recipe: The FP4 recipe for the model. + """ + self._fp8_recipe = fp8_recipe + self._fp4_recipe = fp4_recipe + + def get_autocast_context(self, layer_number: int | None, init: bool = False) -> ContextManager: + """Return the appropriate TE context manager for layer initialization. + + Args: + layer_number: The 0-indexed layer number. + init: Whether to return a `quantized_model_init` context for layer initialization. 
+ """ + if self.config.layer_precision is None: + return nullcontext() + + precision = self.config.layer_precision[layer_number] + + if init and self.config.use_quantized_model_init: + if precision == "fp8": + return transformer_engine.pytorch.quantized_model_init(recipe=self._fp8_recipe) + if precision == "fp4": + return transformer_engine.pytorch.quantized_model_init(recipe=self._fp4_recipe) + return nullcontext() + + return nullcontext() + + def get_layer_autocast(self, layer_number: int) -> ContextManager: + """Return the appropriate TE autocast context manager for a given layer. + + The context interacts with the outer FP8 autocast in the forward method: + - FP8 layer: nullcontext() -- lets the outer FP8 autocast take effect. + - FP4 layer: te.autocast(enabled=True, recipe=fp4_recipe) -- enables FP4 compute. + - BF16 layer: te.autocast(enabled=False) -- disables quantized compute. + + Args: + layer_number: The 0-indexed layer number. + + Returns: + A context manager for the layer's quantization mode. 
+ """ + precision = self.config.layer_precision[layer_number] if self.config.layer_precision is not None else None + if precision == "fp8": + return nullcontext() + if precision == "fp4": + return transformer_engine.pytorch.autocast(enabled=True, recipe=self._fp4_recipe) + return transformer_engine.pytorch.autocast(enabled=False) + def forward( self, input_ids: torch.Tensor | None = None, @@ -370,32 +468,27 @@ def forward( te_rope_emb = self.rotary_emb(max_seq_len=self.config.max_position_embeddings) assert te_rope_emb.dtype == torch.float32, "RoPE embeddings should be float32 for optimal performance" - num_layers = self.config.num_hidden_layers - for layer_idx, decoder_layer in enumerate(self.layers[:num_layers]): - if output_hidden_states: - all_hidden_states = (*all_hidden_states, hidden_states) - - # Optionally keep first and last N layers in bf16 for FP8 numerical stability - num_start_bf16 = getattr(self.config, "num_layers_at_start_in_bf16", 1) - num_end_bf16 = getattr(self.config, "num_layers_at_end_in_bf16", 1) - use_bf16_for_layer = getattr(self.config, "fp8_first_last_bf16", False) and ( - layer_idx < num_start_bf16 or layer_idx >= num_layers - num_end_bf16 - ) - - with transformer_engine.pytorch.autocast(enabled=False) if use_bf16_for_layer else nullcontext(): - hidden_states = decoder_layer( - hidden_states, - attention_mask=None if self.config.attn_input_format == "thd" else attention_mask, - rotary_pos_emb=te_rope_emb, - inference_params=past_key_values, - cu_seqlens_q=kwargs.get("cu_seq_lens_q", None), - cu_seqlens_kv=kwargs.get("cu_seq_lens_k", None), - cu_seqlens_q_padded=kwargs.get("cu_seq_lens_q_padded", None), - cu_seqlens_kv_padded=kwargs.get("cu_seq_lens_k_padded", None), - max_seqlen_q=kwargs.get("max_length_q", None), - max_seqlen_kv=kwargs.get("max_length_k", None), - pad_between_seqs=kwargs.get("pad_between_seqs", None), - ) + # Outer FP8 autocast enables FP8 compute for the decoder stack. 
Per-layer overrides (BF16) are handled + # by get_layer_autocast(), which nests inside this context. + with transformer_engine.pytorch.autocast(enabled=self._fp8_recipe is not None, recipe=self._fp8_recipe): + for layer_idx, decoder_layer in enumerate(self.layers[: self.config.num_hidden_layers]): + if output_hidden_states: + all_hidden_states = (*all_hidden_states, hidden_states) + + with self.get_layer_autocast(layer_idx): + hidden_states = decoder_layer( + hidden_states, + attention_mask=None if self.config.attn_input_format == "thd" else attention_mask, + rotary_pos_emb=te_rope_emb, + inference_params=past_key_values, + cu_seqlens_q=kwargs.get("cu_seq_lens_q", None), + cu_seqlens_kv=kwargs.get("cu_seq_lens_k", None), + cu_seqlens_q_padded=kwargs.get("cu_seq_lens_q_padded", None), + cu_seqlens_kv_padded=kwargs.get("cu_seq_lens_k_padded", None), + max_seqlen_q=kwargs.get("max_length_q", None), + max_seqlen_kv=kwargs.get("max_length_k", None), + pad_between_seqs=kwargs.get("pad_between_seqs", None), + ) hidden_states = self.norm(hidden_states) @@ -417,10 +510,21 @@ class NVLlamaForCausalLM(NVLlamaPreTrainedModel, transformers.GenerationMixin): _tied_weights_keys: ClassVar[dict[str, str]] = {"lm_head.weight": "model.embed_tokens.weight"} - def __init__(self, config): - """Initialize the OG2 NVLlamaForCausalLM model.""" + def __init__( + self, + config, + fp8_recipe: transformer_engine.common.recipe.Recipe | None = None, + fp4_recipe: transformer_engine.common.recipe.Recipe | None = None, + ): + """Initialize the OG2 NVLlamaForCausalLM model. + + Args: + config: The configuration of the model. + fp8_recipe: The FP8 recipe for the model. + fp4_recipe: The FP4 recipe for the model. 
+ """ super().__init__(config) - self.model = NVLlamaModel(config) + self.model = NVLlamaModel(config, fp8_recipe=fp8_recipe, fp4_recipe=fp4_recipe) self.vocab_size = config.vocab_size with transformer_engine.pytorch.quantized_model_init(enabled=False): self.lm_head = transformer_engine.pytorch.Linear( diff --git a/bionemo-recipes/recipes/opengenome2_llama_native_te/perf_logger.py b/bionemo-recipes/recipes/opengenome2_llama_native_te/perf_logger.py index 081103beb5..a99f66eee6 100644 --- a/bionemo-recipes/recipes/opengenome2_llama_native_te/perf_logger.py +++ b/bionemo-recipes/recipes/opengenome2_llama_native_te/perf_logger.py @@ -99,7 +99,11 @@ def __init__(self, dist_config: DistributedConfig, args: DictConfig): self.grad_acc_step_count = 0 # Whether to step debug_api.step() after each step - self.fp8_stats_enabled = args.fp8_stats_config.enabled + quant_stats_cfg = getattr(args, "quant_stats_config", None) + fp8_stats_cfg = getattr(args, "fp8_stats_config", None) + self.quant_stats_enabled = (quant_stats_cfg is not None and getattr(quant_stats_cfg, "enabled", False)) or ( + fp8_stats_cfg is not None and getattr(fp8_stats_cfg, "enabled", False) + ) @nvtx.annotate("PerfLogger.log_micro_step", color="pink") def log_micro_step(self, step: int, batch: dict[str, torch.Tensor], outputs: CausalLMOutputWithPast): @@ -158,7 +162,7 @@ def log_step( if self._profiler is not None: self._profiler.step(step) - if self.fp8_stats_enabled and HAS_NVDLFW_INSPECT: + if self.quant_stats_enabled and HAS_NVDLFW_INSPECT: debug_api.step() if step % self.logging_frequency == 0 and step > 0: @@ -236,7 +240,7 @@ def finish(self): wandb.finish() self._progress_bar.close() - if self.fp8_stats_enabled and HAS_NVDLFW_INSPECT: + if self.quant_stats_enabled and HAS_NVDLFW_INSPECT: debug_api.end_debug() diff --git a/bionemo-recipes/recipes/opengenome2_llama_native_te/quantization.py b/bionemo-recipes/recipes/opengenome2_llama_native_te/quantization.py new file mode 100644 index 
0000000000..b41dd7798d --- /dev/null +++ b/bionemo-recipes/recipes/opengenome2_llama_native_te/quantization.py @@ -0,0 +1,219 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-Apache2 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Utilities for layer-wise quantization configuration (FP8/FP4).""" + +import logging +import tempfile +from pathlib import Path + +import yaml + + +logger = logging.getLogger(__name__) + + +def generate_layer_regex(layer_numbers: list[int] | None) -> str: + """Generate a regex pattern to match specific layer numbers (1-indexed). + + The debug API (nvdlfw_inspect) uses 1-indexed layer names after ``infer_and_assign_layer_names``. + + Args: + layer_numbers: List of layer numbers (1-indexed, as shown in debug logs). + If empty or None, returns a pattern that matches nothing. + + Returns: + Regex pattern string for matching those layers' linear sublayers. + """ + if not layer_numbers: + return r"model\.model\.layers\.DISABLED_NO_LAYERS_SPECIFIED" + layer_pattern = "|".join(str(n) for n in sorted(layer_numbers)) + return rf"model\.model\.layers\.({layer_pattern})\..*(layernorm_qkv|proj|fc1|fc2)" + + +def update_quant_stats_config( + config_file: str, + fp4_layers: list[int] | None, + fp8_layers: list[int] | None, +) -> str: + """Update the quant stats YAML config with layer-specific regex patterns. 
+ + Args: + config_file: Path to the original YAML config file. + fp4_layers: List of layer numbers for FP4 (1-indexed). + fp8_layers: List of layer numbers for FP8 (1-indexed). + + Returns: + Path to the updated config file (a temp file). + """ + with open(config_file) as f: + config = yaml.safe_load(f) + + if "example_fp4_tensor_stat_collection" in config: + config["example_fp4_tensor_stat_collection"]["enabled"] = False + if fp4_layers: + logger.warning( + "NVFP4 quant stats logging is not yet supported (requires a future TransformerEngine release). " + f"Disabling FP4 stats collection for layers {fp4_layers}. FP8 stats will still be collected." + ) + else: + logger.info("FP4 stats section disabled (no FP4 layers and feature not yet supported)") + + if "example_fp8_tensor_stat_collection" in config: + fp8_regex = generate_layer_regex(fp8_layers) + config["example_fp8_tensor_stat_collection"]["layers"]["layer_name_regex_pattern"] = fp8_regex + if fp8_layers: + logger.info(f"Updated FP8 layer regex to match layers: {fp8_layers}") + else: + logger.info("FP8 layers empty - regex set to match nothing") + + temp_file = tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) + yaml.dump(config, temp_file, default_flow_style=False) + temp_file.close() + + config_str = yaml.dump(config, default_flow_style=False) + logger.info(f"Created updated quant stats config at: {temp_file.name}") + logger.info(f"Updated quant stats config contents:\n{config_str}") + + return temp_file.name + + +def initialize_quant_stats_logging( + quant_stats_file: str, + quant_log_dir: str, + rank: int, + layer_precision: list[str | None], +) -> None: + """Set up quantization stats logging via nvdlfw_inspect. + + Updates the quant stats YAML config with resolved layer regex patterns, creates + the per-rank log directory, and initializes the debug API. + + Args: + quant_stats_file: Path to the base quant stats YAML config file. 
+ quant_log_dir: Base directory for quant stats logs (a rank subdirectory will be created). + rank: The global rank of this process. + layer_precision: Per-layer precision list (0-indexed by position). Each element is + ``"fp8"``, ``"fp4"``, or ``None``. + """ + import nvdlfw_inspect.api as debug_api + import transformer_engine + + # Derive 1-indexed layer lists for the debug API, which uses 1-indexed layer names. + fp8_layers_1indexed = [i + 1 for i, p in enumerate(layer_precision) if p == "fp8"] or None + fp4_layers_1indexed = [i + 1 for i, p in enumerate(layer_precision) if p == "fp4"] or None + updated_config = update_quant_stats_config( + config_file=quant_stats_file, + fp4_layers=fp4_layers_1indexed, + fp8_layers=fp8_layers_1indexed, + ) + + rank_log_dir = Path(quant_log_dir) / f"rank_{rank}" + rank_log_dir.mkdir(parents=True, exist_ok=True) + logger.info(f"Logging quant stats to {rank_log_dir}") + + te_features_dir = str(Path(transformer_engine.__file__).parent / "debug" / "features") + debug_api.initialize( + config_file=updated_config, + feature_dirs=[te_features_dir], + log_dir=rank_log_dir, + default_logging_enabled=True, + ) + + +def resolve_layer_precision( + num_layers: int, + fp8_enabled: bool, + fp4_enabled: bool, + fp8_layers: list[int] | None, + fp4_layers: list[int] | None, +) -> list[str | None]: + """Resolve layer-wise quantization assignments from user config. + + Takes 1-indexed layer lists (as specified by the user in YAML config) and returns a per-layer + precision list (0-indexed by position). When a quantization format is enabled but no layer list + is provided, all layers default to that format. When one format has explicit layers and the other + is enabled without a layer list, the unspecified format defaults to the remaining (unclaimed) layers. + + Args: + num_layers: Total number of transformer layers in the model. + fp8_enabled: Whether FP8 quantization is enabled. + fp4_enabled: Whether FP4 quantization is enabled. 
+ fp8_layers: 1-indexed list of layers for FP8, or None if not specified. + fp4_layers: 1-indexed list of layers for FP4, or None if not specified. + + Returns: + A list of length ``num_layers`` where each element is ``"fp8"``, ``"fp4"``, or ``None`` + (BF16 fallback), indexed by layer position (0-indexed). + + Raises: + ValueError: If both formats are enabled with no layer lists, or if layer lists overlap. + """ + all_layers = set(range(1, num_layers + 1)) + + if fp8_enabled and fp4_enabled and fp8_layers is None and fp4_layers is None: + raise ValueError( + "Both fp8_config and fp4_config are enabled but neither fp8_layers nor fp4_layers is specified. " + "When both are enabled, you must explicitly provide layer lists to indicate which layers use which format." + ) + + # When one format has explicit layers and the other defaults, fill in the remaining layers. + if fp8_enabled and fp8_layers is None: + claimed_by_fp4 = set(fp4_layers) if fp4_layers is not None else set() + fp8_layers = sorted(all_layers - claimed_by_fp4) + if claimed_by_fp4: + logger.warning( + f"fp8_config.enabled=True with no fp8_layers specified, but fp4_layers={sorted(claimed_by_fp4)} " + f"are already claimed by FP4. Defaulting FP8 to the remaining layers: {fp8_layers}" + ) + else: + logger.info( + f"fp8_config.enabled=True with no fp8_layers specified, defaulting all {num_layers} layers to FP8" + ) + + if fp4_enabled and fp4_layers is None: + claimed_by_fp8 = set(fp8_layers) if fp8_layers is not None else set() + fp4_layers = sorted(all_layers - claimed_by_fp8) + if claimed_by_fp8: + logger.warning( + f"fp4_config.enabled=True with no fp4_layers specified, but fp8_layers={sorted(claimed_by_fp8)} " + f"are already claimed by FP8. Defaulting FP4 to the remaining layers: {fp4_layers}" + ) + else: + logger.info( + f"fp4_config.enabled=True with no fp4_layers specified, defaulting all {num_layers} layers to FP4" + ) + + # Disable layer lists when corresponding config is not enabled. 
+ if not fp8_enabled: + fp8_layers = None + if not fp4_enabled: + fp4_layers = None + + # Validate no overlap between FP8 and FP4 layer assignments. + if fp8_layers is not None and fp4_layers is not None: + overlap = set(fp8_layers) & set(fp4_layers) + if overlap: + raise ValueError( + f"fp8_layers and fp4_layers cannot have overlapping layer numbers. Found overlap: {sorted(overlap)}" + ) + + # Build per-layer precision list (0-indexed by position, 1-indexed for lookup). + fp8_set = set(fp8_layers) if fp8_layers is not None else set() + fp4_set = set(fp4_layers) if fp4_layers is not None else set() + return [ + "fp8" if layer_1indexed in fp8_set else "fp4" if layer_1indexed in fp4_set else None + for layer_1indexed in range(1, num_layers + 1) + ] diff --git a/bionemo-recipes/recipes/opengenome2_llama_native_te/references/NVIDIA-Nemotron-3-Super-Technical-Report.pdf b/bionemo-recipes/recipes/opengenome2_llama_native_te/references/NVIDIA-Nemotron-3-Super-Technical-Report.pdf new file mode 100644 index 0000000000..366b2bc63d Binary files /dev/null and b/bionemo-recipes/recipes/opengenome2_llama_native_te/references/NVIDIA-Nemotron-3-Super-Technical-Report.pdf differ diff --git a/bionemo-recipes/recipes/opengenome2_llama_native_te/scripts/evaluate_fasta_lm_loss.py b/bionemo-recipes/recipes/opengenome2_llama_native_te/scripts/evaluate_fasta_lm_loss.py index 6681df900f..dbd0d701d0 100644 --- a/bionemo-recipes/recipes/opengenome2_llama_native_te/scripts/evaluate_fasta_lm_loss.py +++ b/bionemo-recipes/recipes/opengenome2_llama_native_te/scripts/evaluate_fasta_lm_loss.py @@ -82,7 +82,6 @@ import numpy as np import torch import torch.distributed as dist -import transformer_engine.pytorch from torch.distributed.checkpoint.state_dict_loader import load as dcp_load from torch.distributed.device_mesh import init_device_mesh from torch.distributed.fsdp import MixedPrecisionPolicy, fully_shard @@ -395,8 +394,7 @@ def compute_per_sequence_log_probs( seq_names = batch["seq_name"] # 
Forward — no labels ⇒ model returns logits only, no loss - with transformer_engine.pytorch.fp8_autocast(enabled=False): - outputs = model(input_ids=input_ids, attention_mask=attention_mask) + outputs = model(input_ids=input_ids, attention_mask=attention_mask) logits = outputs.logits # (B, S, V) diff --git a/bionemo-recipes/recipes/opengenome2_llama_native_te/submit_claude_agent_lepton.py b/bionemo-recipes/recipes/opengenome2_llama_native_te/submit_claude_agent_lepton.py new file mode 100644 index 0000000000..f416569406 --- /dev/null +++ b/bionemo-recipes/recipes/opengenome2_llama_native_te/submit_claude_agent_lepton.py @@ -0,0 +1,579 @@ +#!/usr/bin/env python3 + +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-Apache2 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Lepton job submission script that runs Claude Code as an autonomous FP8 Precision Agent. + +On rank 0, Claude Code is installed and given the OG2 FP8 Agent Guide. Claude autonomously +manages the training loop: launching torchrun, monitoring metrics, adjusting layer precision, +and producing reports. On other ranks, nodes wait for torchrun commands from rank 0. 
+ +Usage: + # MVP demo (single node, tiny model) + python submit_claude_agent_lepton.py --config-name=claude_agent_demo + + # Full FP8 agent (multi-node, OG2-7B) + python submit_claude_agent_lepton.py --config-name=og2_fp8_agent + + # Override strategy + python submit_claude_agent_lepton.py --config-name=og2_fp8_agent promotion_strategy=tail_in +""" + +import hydra +from leptonai.api.v1.types.affinity import LeptonResourceAffinity +from leptonai.api.v1.types.common import Metadata +from leptonai.api.v1.types.deployment import EnvValue, EnvVar, LeptonContainer +from leptonai.api.v1.types.job import LeptonJob, LeptonJobUserSpec +from leptonai.api.v2.client import APIClient +from omegaconf import DictConfig, OmegaConf + + +def _resolve_scheduling_target(client, cfg: DictConfig): + """Resolve node group and resource shape.""" + desired_node_group = str(cfg.node_group).strip() + resource_shape = str(cfg.resource_shape).strip() + + node_groups = client.nodegroup.list_all() + node_group_map = {ng.metadata.name: ng for ng in node_groups} + + if desired_node_group not in node_group_map: + available = ", ".join(sorted(node_group_map.keys())) + raise SystemExit(f"Node group '{desired_node_group}' not found.\nAvailable: {available}") + + chosen_group = node_group_map[desired_node_group] + valid_node_ids = {n.metadata.id_ for n in client.nodegroup.list_nodes(chosen_group.metadata.id_)} + + return chosen_group, valid_node_ids, resource_shape + + +def _build_warm_start_section(cfg: DictConfig) -> str: + """Build the warm-start section of the agent prompt, or a fresh-start message.""" + warm_start = cfg.get("warm_start", None) + if warm_start is None or not getattr(warm_start, "enabled", False): + return "## Start Mode\n\nFresh start — begin from scratch with all layers in $INITIAL_PRECISION." 
+ + fp8_layers = list(OmegaConf.to_container(warm_start.fp8_layers, resolve=True)) + strategy = cfg.get("promotion_strategy", "ends_in") + + # Compute which layers are BF16 (not in fp8_layers) + num_layers = 32 # OG2-7B has 32 transformer layers + bf16_layers = sorted(set(range(1, num_layers + 1)) - set(fp8_layers)) + + lines = [ + "## Warm Start from Existing Checkpoint", + "", + "This run resumes from an existing checkpoint with a pre-configured precision schedule.", + f"Layers {bf16_layers} are already in BF16; layers {fp8_layers} are in FP8.", + "Do NOT start from scratch. Follow the warm-start procedure below.", + "", + "```", + f"EXTERNAL_CHECKPOINT = {warm_start.external_checkpoint}", + f"LKG_STEP = {warm_start.lkg_step}", + f"INITIAL_FP8_LAYERS = {fp8_layers}", + f"DEMOTION_ROUND = {warm_start.demotion_round}", + ] + + if strategy == "ends_in": + lines += [ + f"BOTTOM_PTR = {warm_start.bottom_ptr}", + f"TOP_PTR = {warm_start.top_ptr}", + ] + elif strategy == "tail_in": + lines += [ + f"TAIL_PTR = {warm_start.tail_ptr}", + ] + elif strategy == "gradual": + lines += [ + f"CENTER_LOW = {warm_start.get('center_low', 16)}", + f"CENTER_HIGH = {warm_start.get('center_high', 17)}", + ] + + checkpoint_root = cfg.get("checkpoint_root", "/data/savithas/checkpoints") + + lines += [ + "```", + "", + "### Warm-Start Procedure", + "", + "Before your first training launch:", + "", + f"**IMPORTANT**: Use `CHECKPOINT_ROOT={checkpoint_root}` for checkpoint paths,", + "NOT `WORKSPACE_ROOT`. The `checkpoint.ckpt_dir` CLI argument must be", + f"`{checkpoint_root}/` (e.g., `{checkpoint_root}/ends_in_20260318_143000`).", + "", + "1. Create your checkpoint directory:", + " ```", + f" mkdir -p {checkpoint_root}//train_fsdp2", + " ```", + "2. Symlink the external checkpoint into your checkpoint directory:", + " ```", + f" ln -s {warm_start.external_checkpoint}/train_fsdp2/step_{warm_start.lkg_step}" + f" {checkpoint_root}//train_fsdp2/step_{warm_start.lkg_step}", + " ```", + "3. 
Verify the symlink resolves correctly:", + " ```", + f" ls -la {checkpoint_root}//train_fsdp2/step_{warm_start.lkg_step}/", + f" ls {checkpoint_root}//train_fsdp2/step_{warm_start.lkg_step}/.metadata", + " ```", + f"4. Set `fp8_layers` to `{fp8_layers}` for the first launch.", + f"5. Initialize `state.json` with `demotion_round={warm_start.demotion_round}`, " + f"`lkg_step={warm_start.lkg_step}`, and the pointer values above.", + "", + f"The agent picks up at round {warm_start.demotion_round + 1} of `{strategy}`.", + f"On the first check-in (step {warm_start.lkg_step + cfg.get('checkin_interval', 100)}), " + "compare against the BF16 baseline at that step.", + "If the check-in passes, training continues. If it fails, demote the next layers per the strategy.", + "", + "### Important", + "", + "- The external checkpoint was trained with a DIFFERENT precision schedule. " + "The optimizer state is matched to that schedule. Only further demotions (FP8 -> BF16) are safe. " + "Do NOT promote layers back to FP8 that were already in BF16.", + f"- `checkpoint.ckpt_dir` stays FIXED at `{checkpoint_root}/` for the entire session " + "(same rule as fresh start). 
NEVER use WORKSPACE_ROOT for checkpoint.ckpt_dir.", + ] + + return "\n".join(lines) + + +def _build_agent_prompt(cfg: DictConfig) -> str: + """Read claude_agent_prompt.txt and fill in config values.""" + import pathlib + + prompt_path = pathlib.Path(__file__).parent / "claude_agent_prompt.txt" + template = prompt_path.read_text() + + warm_start_section = _build_warm_start_section(cfg) + + workspace_root = cfg.get("workspace_root", "/data/savithas/agent_runs") + launch_dir = f"{workspace_root}/.launches/{cfg.job_name}" + + # Choose guide based on strategy + strategy = cfg.get("promotion_strategy", "ends_in") + guide_filename = cfg.get("guide_filename", None) + if guide_filename is None: + guide_filename = "OG2_FP8_1NODE_DEMO_GUIDE.md" if strategy == "gradual" else "OG2_FP8_AGENT_GUIDE.md" + + return template.format( + code_path=cfg.code_path, + num_nodes=cfg.num_nodes, + gpus_per_node=cfg.gpus_per_node, + num_train_steps=cfg.num_train_steps, + checkin_interval=cfg.get("checkin_interval", 100), + tolerance_pct=cfg.get("tolerance_pct", 5.0), + promotion_strategy=strategy, + workspace_root=workspace_root, + checkpoint_root=cfg.get("checkpoint_root", "/data/savithas/checkpoints"), + wandb_project=cfg.get("wandb_project", "opengenome2-7b"), + launch_dir=launch_dir, + warm_start_section=warm_start_section, + guide_filename=guide_filename, + baseline_logfile=cfg.get("baseline_logfile", "baseline_bf16.json"), + ) + + +def launch_claude_agent_job(client, cfg: DictConfig): + """Launch a multi-node job where rank 0 runs Claude Code as the FP8 Precision Agent.""" + chosen_group, valid_node_ids, resource_shape = _resolve_scheduling_target(client, cfg) + + agent_prompt = _build_agent_prompt(cfg) + num_nodes = cfg.get("num_nodes", 1) + workspace_root = cfg.get("workspace_root", "/data/savithas/agent_runs") + launch_dir = f"{workspace_root}/.launches/{cfg.job_name}" + + # Git branch checkout logic (rank 0 only for NFS safety) + git_branch = cfg.get("git_branch", "") + repo_root = 
cfg.get("repo_root", "/data/savithas/bionemo-framework") + + git_sync_script = "" + if git_branch: + git_sync_script = f""" +# Git sync to specified branch (only on rank 0 to avoid NFS race conditions) +if [ "$NODE_RANK" = "0" ]; then + echo "==========================================" + echo "[Rank 0] Syncing to branch: {git_branch}" + echo "==========================================" + cd {repo_root} + find .git -name "*.lock" -delete 2>/dev/null || true + git fetch origin + git checkout {git_branch} 2>/dev/null || git checkout -b {git_branch} origin/{git_branch} + git reset --hard origin/{git_branch} + echo "Git sync complete! Commit: $(git rev-parse HEAD)" + echo "==========================================" +else + echo "[Rank $NODE_RANK] Waiting for rank 0 to complete git sync..." + sleep 30 + cd {repo_root} + echo "[Rank $NODE_RANK] Current commit: $(git rev-parse HEAD)" +fi +""" + + container_script = f"""#!/bin/bash +set -e + +echo "==========================================" +echo "Claude Code FP8 Precision Agent - Lepton" +echo "Node rank: $NODE_RANK / $NNODES" +echo "GPUs: {cfg.gpus_per_node}x H100" +echo "==========================================" + +# 1. Initialize Lepton environment (sets MASTER_ADDR, MASTER_PORT, NODE_RANK, NNODES) +wget -O init.sh https://raw.githubusercontent.com/leptonai/scripts/main/lepton_env_to_pytorch.sh +chmod +x init.sh +source init.sh + +export MASTER_PORT=29400 +export NCCL_TIMEOUT_MS=1800000 +export NCCL_DEBUG=WARN +export HF_HOME=/data/savithas/cache + +# Write env vars to a file so they survive the `su` to claude-agent on rank 0. +# Without this, Claude Code's torchrun would get empty MASTER_ADDR/NODE_RANK/NNODES. 
+cat > /tmp/training_env.sh << ENV_EOF +export MASTER_ADDR=$MASTER_ADDR +export MASTER_PORT=$MASTER_PORT +export NODE_RANK=$NODE_RANK +export NNODES=$NNODES +export NCCL_TIMEOUT_MS=$NCCL_TIMEOUT_MS +export NCCL_DEBUG=$NCCL_DEBUG +export HF_HOME=$HF_HOME +export WANDB_API_KEY=$WANDB_API_KEY +export PATH=$PATH +export LD_LIBRARY_PATH=${{LD_LIBRARY_PATH:-}} +export CUDA_HOME=${{CUDA_HOME:-}} + +# Pre-built torchrun prefix — Claude MUST use this instead of constructing its own. +# This ensures correct --nnodes, --node_rank, --master_addr for multi-node training. +export TORCHRUN_PREFIX="torchrun --nproc_per_node={cfg.gpus_per_node} --nnodes=$NNODES --node_rank=$NODE_RANK --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT" +ENV_EOF +chmod 644 /tmp/training_env.sh +echo "Env vars written to /tmp/training_env.sh" +echo " MASTER_ADDR=$MASTER_ADDR NODE_RANK=$NODE_RANK NNODES=$NNODES" +echo " TORCHRUN_PREFIX will be: torchrun --nproc_per_node={cfg.gpus_per_node} --nnodes=$NNODES --node_rank=$NODE_RANK --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT" +{git_sync_script} + +# 2. Install Python training requirements (all nodes) +cd {cfg.code_path} +pip install -r requirements.txt + +# 3. Login to wandb (all nodes, needed for distributed logging) +wandb login ${{WANDB_API_KEY}} + +# 4. Create workspace and launch coordination directories +mkdir -p {workspace_root} +mkdir -p {launch_dir} + +# Clean stale round files from previous job runs (same job_name reuses the launch dir on NFS). +# Without this, workers immediately pick up old round_1_ready and start torchrun before rank 0 is ready. +if [ "$NODE_RANK" = "0" ]; then + echo "Cleaning stale launch files from {launch_dir}..." + rm -f {launch_dir}/round_* {launch_dir}/done 2>/dev/null || true + echo "Launch dir cleaned." 
+fi +# Workers wait for rank 0 to finish cleanup before polling +sleep 5 + +# ============================================================ +# RANK 0: Run Claude Code as the FP8 Precision Agent +# OTHER RANKS: Poll for barrier-based round files and run torchrun +# ============================================================ + +if [ "$NODE_RANK" = "0" ]; then + echo "==========================================" + echo "[Rank 0] Setting up Claude Code agent..." + echo "==========================================" + + # Install Node.js 22 LTS + echo "Installing Node.js 22 LTS..." + curl -fsSL https://deb.nodesource.com/setup_22.x | bash - + apt-get install -y nodejs + echo "Node.js version: $(node --version)" + echo "npm version: $(npm --version)" + + # Install Claude Code CLI + echo "Installing Claude Code..." + npm install -g @anthropic-ai/claude-code + echo "Claude Code installed: $(claude --version)" + + # Create non-root user (Claude Code refuses --dangerously-skip-permissions as root) + echo "Creating non-root user for Claude Code..." + useradd -m -s /bin/bash claude-agent + chown -R claude-agent:claude-agent {workspace_root} + + # Ensure claude-agent can access CUDA devices, NCCL, and shared memory + # Without this, torchrun launched by Claude fails at init_process_group + echo "Setting CUDA/NCCL device permissions for claude-agent..." 
+ chmod a+rw /dev/nvidia* 2>/dev/null || true + chmod a+rw /dev/infiniband/* 2>/dev/null || true + # Add claude-agent to video group (for GPU access) + usermod -aG video claude-agent 2>/dev/null || true + # Ensure checkpoint directory is writable + mkdir -p {cfg.get("checkpoint_root", "/data/savithas/checkpoints")} + chown -R claude-agent:claude-agent {cfg.get("checkpoint_root", "/data/savithas/checkpoints")} + # Ensure NFS code path is readable/writable + chmod -R a+rw {cfg.code_path} 2>/dev/null || true + + # Write the agent prompt to a file + cat > /tmp/agent_prompt.txt << 'AGENT_PROMPT_EOF' +{agent_prompt} +AGENT_PROMPT_EOF + chmod 644 /tmp/agent_prompt.txt + + # Create a torchrun wrapper that FORCES correct multi-node flags. + # This is a failsafe: even if Claude ignores $TORCHRUN_PREFIX and writes + # "torchrun --nnodes=1 ...", the wrapper strips wrong flags and injects correct ones. + # We replace the actual torchrun binary so it works regardless of PATH or .bashrc. + REAL_TORCHRUN=$(which torchrun) + cp "$REAL_TORCHRUN" "${{REAL_TORCHRUN}}.real" + cat > "$REAL_TORCHRUN" << TORCHRUN_WRAPPER_EOF +#!/bin/bash +# Torchrun wrapper: strips any --nnodes/--node_rank/--master_addr/--master_port/--nproc_per_node +# flags and replaces them with the correct values from the environment. +# This prevents Claude from accidentally launching single-node training. 
+# Real torchrun is at: ${{REAL_TORCHRUN}}.real + +ARGS=() +SKIP_NEXT=false +for arg in "\\$@"; do + if \\$SKIP_NEXT; then + SKIP_NEXT=false + continue + fi + case "\\$arg" in + --nnodes=*|--node_rank=*|--master_addr=*|--master_port=*|--nproc_per_node=*) + echo "[torchrun-wrapper] Stripped flag: \\$arg" >&2 + ;; + --nnodes|--node_rank|--master_addr|--master_port|--nproc_per_node) + echo "[torchrun-wrapper] Stripped flag: \\$arg (and next arg)" >&2 + SKIP_NEXT=true + ;; + *) + ARGS+=("\\$arg") + ;; + esac +done + +echo "[torchrun-wrapper] Injecting: --nproc_per_node={cfg.gpus_per_node} --nnodes=\\$NNODES --node_rank=\\$NODE_RANK --master_addr=\\$MASTER_ADDR --master_port=\\$MASTER_PORT" >&2 +exec ${{REAL_TORCHRUN}}.real \\ + --nproc_per_node={cfg.gpus_per_node} \\ + --nnodes=\\$NNODES \\ + --node_rank=\\$NODE_RANK \\ + --master_addr=\\$MASTER_ADDR \\ + --master_port=\\$MASTER_PORT \\ + "\\${{ARGS[@]}}" +TORCHRUN_WRAPPER_EOF + chmod 755 "$REAL_TORCHRUN" + echo "Torchrun wrapper installed (replaced $REAL_TORCHRUN, original at ${{REAL_TORCHRUN}}.real)" + + # Write a wrapper script for the non-root user + cat > /tmp/run_claude.sh << 'WRAPPER_EOF' +#!/bin/bash +set -e + +# Source env vars from root shell (MASTER_ADDR, NODE_RANK, NNODES, TORCHRUN_PREFIX, etc.) +source /tmp/training_env.sh +echo "Env check: MASTER_ADDR=$MASTER_ADDR NODE_RANK=$NODE_RANK NNODES=$NNODES" +echo "TORCHRUN_PREFIX=$TORCHRUN_PREFIX" + +cd {cfg.code_path} + +# Verify CUDA access as claude-agent user +echo "CUDA sanity check (as claude-agent)..." +python3 -c "import torch; print('CUDA available:', torch.cuda.is_available(), 'devices:', torch.cuda.device_count())" || echo "WARNING: CUDA not accessible!" +echo "" + +echo "Testing Claude Code authentication..." +claude --dangerously-skip-permissions \\ + --model {cfg.claude_model} \\ + -p "Say OK if you can read this." 2>&1 | head -5 +echo "Auth check complete." 
+ +echo "==========================================" +echo "Starting FP8 Precision Agent..." +echo "Strategy: {cfg.get("promotion_strategy", "ends_in")}" +echo "==========================================" + +AGENT_LOG="{workspace_root}/claude_agent_output.log" +echo "Agent output will be logged to: $AGENT_LOG" +echo "Timestamp before launch: $(date -u '+%Y-%m-%dT%H:%M:%SZ')" + +PROMPT=$(cat /tmp/agent_prompt.txt) +echo "Prompt size: $(echo "$PROMPT" | wc -c) bytes" + +# Redirect directly to file (avoids pipe buffering from tee which blocks output for 10-20 min). +# Then tail -f streams the file to container logs in real time. +claude --dangerously-skip-permissions \\ + --model {cfg.claude_model} \\ + -p "$PROMPT" > "$AGENT_LOG" 2>&1 & +CLAUDE_PID=$! +echo "Claude Code launched (PID: $CLAUDE_PID)" + +# Give Claude a moment to start writing, then tail the log +sleep 3 +tail -f "$AGENT_LOG" & +TAIL_PID=$! + +# Wait for Claude to finish +wait $CLAUDE_PID +AGENT_EXIT=$? +kill $TAIL_PID 2>/dev/null || true + +echo "==========================================" +echo "FP8 Precision Agent finished (exit code: $AGENT_EXIT)." +echo "Timestamp after exit: $(date -u '+%Y-%m-%dT%H:%M:%SZ')" +echo "Log file size: $(wc -c < "$AGENT_LOG") bytes" +echo "==========================================" +WRAPPER_EOF + chmod 755 /tmp/run_claude.sh + + # Run as non-root user, preserving full environment (no dash = keep env) + su claude-agent -c "bash /tmp/run_claude.sh" + +else + echo "==========================================" + echo "[Rank $NODE_RANK] Worker node — barrier-based round polling" + echo "Launch dir: {launch_dir}" + echo "MASTER_ADDR=$MASTER_ADDR MASTER_PORT=$MASTER_PORT" + echo "==========================================" + + # Worker nodes poll NFS for barrier files written by the Claude agent on rank 0. + # All workers block on the SAME round_N_ready file, ensuring they start torchrun together. 
+ # This prevents the desync bug where independent counters caused workers to be on different rounds. + ROUND=1 + while true; do + echo "[Rank $NODE_RANK] Waiting for round $ROUND (polling for round_${{ROUND}}_ready)..." + while [ ! -f "{launch_dir}/round_${{ROUND}}_ready" ] && \ + [ ! -f "{launch_dir}/done" ]; do + sleep 5 + done + + # Check for completion signal + if [ -f "{launch_dir}/done" ]; then + echo "[Rank $NODE_RANK] Done signal received. Exiting." + break + fi + + # Source training args written by Claude (contains TRAIN_CMD variable) + source "{launch_dir}/round_${{ROUND}}_args.env" + echo "==========================================" + echo "[Rank $NODE_RANK] Starting round $ROUND" + echo "TRAIN_CMD=$TRAIN_CMD" + echo "==========================================" + + # Run torchrun (exits when rank 0 dies/kills or training completes) + cd {cfg.code_path} + torchrun \\ + --nproc_per_node={cfg.gpus_per_node} \\ + --nnodes=$NNODES \\ + --node_rank=$NODE_RANK \\ + --master_addr=$MASTER_ADDR \\ + --master_port=$MASTER_PORT \\ + $TRAIN_CMD || true + + echo "[Rank $NODE_RANK] Round $ROUND finished" + ROUND=$((ROUND + 1)) + done + + echo "[Rank $NODE_RANK] All rounds complete. Exiting." 
+fi +""" + + command = ["bash", "-c", container_script] + + env_vars = [ + EnvVar(name="ANTHROPIC_AUTH_TOKEN", value_from=EnvValue(secret_name_ref=cfg.anthropic_secret)), + EnvVar(name="ANTHROPIC_BASE_URL", value=cfg.anthropic_base_url), + EnvVar(name="WANDB_API_KEY", value_from=EnvValue(secret_name_ref=cfg.wandb_secret)), + ] + + nfs_source_path = cfg.get("nfs", {}).get("source_path", "/BioNeMo") + nfs_mount_path = cfg.get("nfs", {}).get("mount_path", "/data") + nfs_source = cfg.get("nfs", {}).get("nfs_source", "node-nfs:fs1") + + mounts = [ + { + "path": nfs_source_path, + "mount_path": nfs_mount_path, + "from": nfs_source, + }, + ] + + job_spec = LeptonJobUserSpec( + resource_shape=resource_shape, + affinity=LeptonResourceAffinity( + allowed_dedicated_node_groups=[chosen_group.metadata.id_], + allowed_nodes_in_node_group=valid_node_ids, + ), + container=LeptonContainer( + image=cfg.container.image, + command=command, + ), + completions=num_nodes, + parallelism=num_nodes, + envs=env_vars, + image_pull_secrets=[cfg.container.registry_auth], + mounts=mounts, + ) + + job = LeptonJob(spec=job_spec, metadata=Metadata(id=cfg.job_name)) + + try: + launched_job = client.job.create(job) + if launched_job.status: + print(f" Job launched: {cfg.job_name}") + workspace_id = cfg.get("workspace_id", "vfco61g2") + print( + f" View at: https://dashboard.dgxc-lepton.nvidia.com/workspace/" + f"{workspace_id}/compute/jobs/detail/{launched_job.metadata.id_}/replicas/list" + ) + return True + except Exception as e: + print(f" ERROR submitting job {cfg.job_name}: {e}") + return False + + +@hydra.main(version_base=None, config_path="lepton_configs", config_name="og2_fp8_agent") +def main(cfg: DictConfig): + """Submit a Lepton job that runs Claude Code as the FP8 Precision Agent.""" + print("=" * 60) + print(f"FP8 Precision Agent - Job: {cfg.job_name}") + print("=" * 60) + print(f" Claude model: {cfg.claude_model}") + print(f" Nodes: {cfg.num_nodes} x {cfg.gpus_per_node} GPUs") + 
print(f" Training config: {cfg.get('hydra_config', 'N/A')}") + print(f" Steps: {cfg.num_train_steps:,}") + print(f" Strategy: {cfg.get('promotion_strategy', 'ends_in')}") + print(f" Workspace: {cfg.get('workspace_root', 'N/A')}") + + if cfg.get("git_branch"): + print(f" Git branch: {cfg.git_branch}") + + print() + + client = APIClient() + OmegaConf.resolve(cfg) + + success = launch_claude_agent_job(client, cfg) + if not success: + print("\nJob submission failed!") + exit(1) + + print("\nFP8 Precision Agent job submitted successfully!") + print(f"Check {cfg.get('workspace_root', '')}/*/report.md for results after completion.") + + +if __name__ == "__main__": + main() diff --git a/bionemo-recipes/recipes/opengenome2_llama_native_te/submit_training_lepton.py b/bionemo-recipes/recipes/opengenome2_llama_native_te/submit_training_lepton.py new file mode 100644 index 0000000000..623d1630d7 --- /dev/null +++ b/bionemo-recipes/recipes/opengenome2_llama_native_te/submit_training_lepton.py @@ -0,0 +1,221 @@ +#!/usr/bin/env python3 + +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-Apache2 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Simple Lepton job submission for non-agent training runs. + +Submits a single- or multi-node torchrun job without the Claude agent. +Used for running BF16 baselines or other standard training jobs. 
+ +Usage: + python submit_training_lepton.py --config-name=og2_bf16_baseline_1node +""" + +import hydra +from leptonai.api.v1.types.affinity import LeptonResourceAffinity +from leptonai.api.v1.types.common import Metadata +from leptonai.api.v1.types.deployment import EnvValue, EnvVar, LeptonContainer +from leptonai.api.v1.types.job import LeptonJob, LeptonJobUserSpec +from leptonai.api.v2.client import APIClient +from omegaconf import DictConfig, OmegaConf + + +def _resolve_scheduling_target(client, cfg: DictConfig): + """Resolve node group and resource shape.""" + desired_node_group = str(cfg.node_group).strip() + resource_shape = str(cfg.resource_shape).strip() + + node_groups = client.nodegroup.list_all() + node_group_map = {ng.metadata.name: ng for ng in node_groups} + + if desired_node_group not in node_group_map: + available = ", ".join(sorted(node_group_map.keys())) + raise SystemExit(f"Node group '{desired_node_group}' not found.\nAvailable: {available}") + + chosen_group = node_group_map[desired_node_group] + valid_node_ids = {n.metadata.id_ for n in client.nodegroup.list_nodes(chosen_group.metadata.id_)} + + return chosen_group, valid_node_ids, resource_shape + + +def launch_training_job(client, cfg: DictConfig): + """Launch a standard training job (no Claude agent).""" + chosen_group, valid_node_ids, resource_shape = _resolve_scheduling_target(client, cfg) + + num_nodes = cfg.get("num_nodes", 1) + git_branch = cfg.get("git_branch", "") + repo_root = cfg.get("repo_root", "/data/savithas/bionemo-framework") + + git_sync_script = "" + if git_branch: + git_sync_script = f""" +# Git sync to specified branch +cd {repo_root} +find .git -name "*.lock" -delete 2>/dev/null || true +git fetch origin +git checkout {git_branch} 2>/dev/null || git checkout -b {git_branch} origin/{git_branch} +git reset --hard origin/{git_branch} +echo "Git sync complete! 
Commit: $(git rev-parse HEAD)" +""" + + # Build checkpoint symlink setup if resuming from external checkpoint + resume_setup_script = "" + resume_from = cfg.get("resume_from", None) + if resume_from and resume_from.get("external_checkpoint"): + ckpt_dir = cfg.hydra_overrides.get("checkpoint.ckpt_dir", "") + ext_ckpt = resume_from.external_checkpoint + step = resume_from.step + resume_setup_script = f""" +# Symlink external checkpoint for resume +mkdir -p {ckpt_dir}/train_fsdp2 +if [ ! -e {ckpt_dir}/train_fsdp2/step_{step} ]; then + ln -s {ext_ckpt}/train_fsdp2/step_{step} {ckpt_dir}/train_fsdp2/step_{step} + echo "Symlinked checkpoint: {ext_ckpt}/train_fsdp2/step_{step}" +else + echo "Checkpoint already exists at {ckpt_dir}/train_fsdp2/step_{step}" +fi +ls -la {ckpt_dir}/train_fsdp2/ +""" + + # Build the torchrun command from Hydra overrides + hydra_overrides = OmegaConf.to_container(cfg.get("hydra_overrides", {}), resolve=True) or {} + override_str = " \\\n ".join(f"{k}={v}" for k, v in hydra_overrides.items()) + + container_script = f"""#!/bin/bash +set -e + +echo "==========================================" +echo "OpenGenome2 Training Job" +echo "Node rank: $NODE_RANK / $NNODES" +echo "GPUs: {cfg.gpus_per_node}x H100" +echo "==========================================" + +# Initialize Lepton environment +wget -O init.sh https://raw.githubusercontent.com/leptonai/scripts/main/lepton_env_to_pytorch.sh +chmod +x init.sh +source init.sh + +export MASTER_PORT=29400 +export NCCL_TIMEOUT_MS=1800000 +export NCCL_DEBUG=WARN +export HF_HOME=/data/savithas/cache +{git_sync_script} + +# Install requirements +cd {cfg.code_path} +pip install -r requirements.txt + +# Login to wandb +wandb login ${{WANDB_API_KEY}} + +echo "==========================================" +echo "Launching torchrun training..." 
+echo "Config: {cfg.hydra_config}" +echo "==========================================" +{resume_setup_script} +cd {cfg.code_path} +torchrun \\ + --nproc_per_node={cfg.gpus_per_node} \\ + --nnodes=$NNODES \\ + --node_rank=$NODE_RANK \\ + --master_addr=$MASTER_ADDR \\ + --master_port=$MASTER_PORT \\ + {cfg.train_script} \\ + --config-name {cfg.hydra_config} \\ + {override_str} + +echo "==========================================" +echo "Training complete!" +echo "==========================================" +""" + + command = ["bash", "-c", container_script] + + env_vars = [ + EnvVar(name="WANDB_API_KEY", value_from=EnvValue(secret_name_ref=cfg.wandb_secret)), + ] + + nfs_source_path = cfg.get("nfs", {}).get("source_path", "/BioNeMo") + nfs_mount_path = cfg.get("nfs", {}).get("mount_path", "/data") + nfs_source = cfg.get("nfs", {}).get("nfs_source", "node-nfs:fs1") + + mounts = [ + { + "path": nfs_source_path, + "mount_path": nfs_mount_path, + "from": nfs_source, + }, + ] + + job_spec = LeptonJobUserSpec( + resource_shape=resource_shape, + affinity=LeptonResourceAffinity( + allowed_dedicated_node_groups=[chosen_group.metadata.id_], + allowed_nodes_in_node_group=valid_node_ids, + ), + container=LeptonContainer( + image=cfg.container.image, + command=command, + ), + completions=num_nodes, + parallelism=num_nodes, + envs=env_vars, + image_pull_secrets=[cfg.container.registry_auth], + mounts=mounts, + ) + + job = LeptonJob(spec=job_spec, metadata=Metadata(id=cfg.job_name)) + + try: + launched_job = client.job.create(job) + if launched_job.status: + print(f" Job launched: {cfg.job_name}") + workspace_id = cfg.get("workspace_id", "vfco61g2") + print( + f" View at: https://dashboard.dgxc-lepton.nvidia.com/workspace/" + f"{workspace_id}/compute/jobs/detail/{launched_job.metadata.id_}/replicas/list" + ) + return True + except Exception as e: + print(f" ERROR submitting job {cfg.job_name}: {e}") + return False + + +@hydra.main(version_base=None, config_path="lepton_configs", 
config_name="og2_bf16_baseline_1node") +def main(cfg: DictConfig): + """Submit a standard training job to Lepton.""" + print("=" * 60) + print(f"Training Job: {cfg.job_name}") + print("=" * 60) + print(f" Nodes: {cfg.num_nodes} x {cfg.gpus_per_node} GPUs") + print(f" Config: {cfg.hydra_config}") + if cfg.get("git_branch"): + print(f" Git branch: {cfg.git_branch}") + print() + + client = APIClient() + OmegaConf.resolve(cfg) + + success = launch_training_job(client, cfg) + if not success: + print("\nJob submission failed!") + exit(1) + + print("\nTraining job submitted successfully!") + + +if __name__ == "__main__": + main() diff --git a/bionemo-recipes/recipes/opengenome2_llama_native_te/tests/test_quantization.py b/bionemo-recipes/recipes/opengenome2_llama_native_te/tests/test_quantization.py new file mode 100644 index 0000000000..390e43c218 --- /dev/null +++ b/bionemo-recipes/recipes/opengenome2_llama_native_te/tests/test_quantization.py @@ -0,0 +1,315 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-Apache2 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
"""Tests for quantization utilities (resolve_layer_precision, generate_layer_regex, update_quant_stats_config)."""

import re
import sys
from pathlib import Path

import pytest
import yaml


sys.path.append(Path(__file__).parent.parent.as_posix())

from quantization import generate_layer_regex, resolve_layer_precision, update_quant_stats_config


# -- resolve_layer_precision --


def test_fp8_enabled_no_layers_defaults_all():
    """With FP8 on and no explicit layer list, every layer falls back to FP8."""
    got = resolve_layer_precision(num_layers=6, fp8_enabled=True, fp4_enabled=False, fp8_layers=None, fp4_layers=None)
    assert got == ["fp8"] * 6


def test_fp4_enabled_no_layers_defaults_all():
    """With FP4 on and no explicit layer list, every layer falls back to FP4."""
    got = resolve_layer_precision(num_layers=6, fp8_enabled=False, fp4_enabled=True, fp8_layers=None, fp4_layers=None)
    assert got == ["fp4"] * 6


def test_fp8_explicit_layers():
    """A 1-indexed fp8_layers list marks exactly those positions as fp8."""
    got = resolve_layer_precision(num_layers=6, fp8_enabled=True, fp4_enabled=False, fp8_layers=[1, 3, 5], fp4_layers=None)
    assert got == ["fp8", None, "fp8", None, "fp8", None]


def test_fp4_explicit_layers():
    """A 1-indexed fp4_layers list marks exactly those positions as fp4."""
    got = resolve_layer_precision(num_layers=6, fp8_enabled=False, fp4_enabled=True, fp8_layers=None, fp4_layers=[2, 4, 6])
    assert got == [None, "fp4", None, "fp4", None, "fp4"]


def test_mixed_fp8_fp4_explicit():
    """Non-overlapping explicit FP8 and FP4 assignments coexist correctly."""
    got = resolve_layer_precision(num_layers=6, fp8_enabled=True, fp4_enabled=True, fp8_layers=[1, 3, 4], fp4_layers=[2, 5])
    assert got == ["fp8", "fp4", "fp8", "fp8", "fp4", None]


def test_both_enabled_no_layers_raises():
    """Enabling both formats without any layer lists is ambiguous and must fail."""
    with pytest.raises(ValueError, match="Both fp8_config and fp4_config are enabled"):
        resolve_layer_precision(num_layers=6, fp8_enabled=True, fp4_enabled=True, fp8_layers=None, fp4_layers=None)


def test_overlapping_layers_raises():
    """A layer listed in both fp8_layers and fp4_layers must be rejected."""
    with pytest.raises(ValueError, match="fp8_layers and fp4_layers cannot have overlapping"):
        resolve_layer_precision(
            num_layers=6, fp8_enabled=True, fp4_enabled=True, fp8_layers=[1, 2, 3], fp4_layers=[3, 4, 5]
        )


def test_disabled_ignores_layers():
    """Layer lists for a disabled format have no effect."""
    got = resolve_layer_precision(
        num_layers=6, fp8_enabled=False, fp4_enabled=False, fp8_layers=[1, 2, 3], fp4_layers=[4, 5, 6]
    )
    assert got == [None] * 6


def test_both_disabled():
    """With everything off, every layer resolves to None."""
    got = resolve_layer_precision(num_layers=6, fp8_enabled=False, fp4_enabled=False, fp8_layers=None, fp4_layers=None)
    assert got == [None] * 6


def test_large_model_defaults_all():
    """Default population scales to bigger depths (36 layers)."""
    got = resolve_layer_precision(num_layers=36, fp8_enabled=True, fp4_enabled=False, fp8_layers=None, fp4_layers=None)
    assert got == ["fp8"] * 36


def test_fp8_enabled_empty_list():
    """An explicit empty list means 'no layers', not 'all layers'."""
    got = resolve_layer_precision(num_layers=6, fp8_enabled=True, fp4_enabled=False, fp8_layers=[], fp4_layers=None)
    assert got == [None] * 6


def test_both_enabled_fp8_specified_fp4_defaults_to_remaining():
    """With both on and only FP8 layers given, FP4 claims the leftover layers."""
    got = resolve_layer_precision(num_layers=6, fp8_enabled=True, fp4_enabled=True, fp8_layers=[1, 2, 3], fp4_layers=None)
    assert got == ["fp8"] * 3 + ["fp4"] * 3


def test_both_enabled_fp4_specified_fp8_defaults_to_remaining():
    """With both on and only FP4 layers given, FP8 claims the leftover layers."""
    got = resolve_layer_precision(num_layers=6, fp8_enabled=True, fp4_enabled=True, fp8_layers=None, fp4_layers=[4, 5, 6])
    assert got == ["fp8"] * 3 + ["fp4"] * 3


def test_returns_correct_length():
    """The result always has exactly num_layers entries."""
    for count in (1, 6, 48):
        got = resolve_layer_precision(
            num_layers=count, fp8_enabled=False, fp4_enabled=False, fp8_layers=None, fp4_layers=None
        )
        assert len(got) == count


# -- generate_layer_regex --


def test_single_layer():
    """A one-element layer list yields a regex that matches only that layer."""
    pat = re.compile(generate_layer_regex([3]))
    assert pat.search("model.model.layers.3.self_attention.layernorm_qkv")
    assert pat.search("model.model.layers.2.self_attention.layernorm_qkv") is None


def test_multiple_layers():
    """A multi-layer list yields a regex matching any listed layer."""
    pat = re.compile(generate_layer_regex([1, 2, 3]))
    assert pat.search("model.model.layers.1.self_attention.layernorm_qkv")
    assert pat.search("model.model.layers.2.layernorm_mlp.fc1")
    assert pat.search("model.model.layers.3.layernorm_mlp.fc2")
    assert pat.search("model.model.layers.4.self_attention.proj") is None


def test_matches_correct_sublayers():
    """Only the quantized sublayers (layernorm_qkv, proj, fc1, fc2) match."""
    pat = re.compile(generate_layer_regex([1]))
    matching = [
        "model.model.layers.1.self_attention.layernorm_qkv_something",
        "model.model.layers.1.self_attention.proj_something",
        "model.model.layers.1.layernorm_mlp.fc1_something",
        "model.model.layers.1.layernorm_mlp.fc2_something",
    ]
    for name in matching:
        assert pat.search(name)
    # Unrelated sublayer names must not match.
    assert pat.search("model.model.layers.1.self_attention.some_other_thing") is None


def test_none_returns_disabled_pattern():
    """None yields a sentinel pattern that can never match a real layer."""
    pattern = generate_layer_regex(None)
    assert "DISABLED" in pattern
    assert re.search(pattern, "model.model.layers.1.self_attention.layernorm_qkv") is None


def test_empty_list_returns_disabled_pattern():
    """An empty list also yields the match-nothing sentinel pattern."""
    assert "DISABLED" in generate_layer_regex([])


def test_1indexed_layer_names():
    """Layer numbers in the regex are 1-indexed (debug API naming)."""
    pat = re.compile(generate_layer_regex([1]))
    # layers.1 is the 1-indexed first layer and should match...
    assert pat.search("model.model.layers.1.self_attention.layernorm_qkv")
    # ...while the 0-indexed spelling should not.
    assert pat.search("model.model.layers.0.self_attention.layernorm_qkv") is None


# -- update_quant_stats_config --


@pytest.fixture
def fp8_only_config(tmp_path):
    """Write an FP8-only stats config to disk and return its path as a string."""
    config = {
        "example_fp8_tensor_stat_collection": {
            "enabled": True,
            "layers": {
                "layer_name_regex_pattern": "PLACEHOLDER",
            },
            "transformer_engine": {
                "LogFp8TensorStats": {
                    "enabled": True,
                    "tensors_struct": [{"tensor": "activation", "stats": ["underflows%"], "freq": 10}],
                }
            },
        }
    }
    path = tmp_path / "fp8_stats.yaml"
    path.write_text(yaml.dump(config))
    return str(path)


@pytest.fixture
def fp4_fp8_config(tmp_path):
    """Write a combined FP4+FP8 stats config to disk and return its path as a string."""
    config = {
        "example_fp4_tensor_stat_collection": {
            "enabled": True,
            "layers": {
                "layer_name_regex_pattern": "PLACEHOLDER",
            },
            "transformer_engine": {
                "LogNvfp4TensorStats": {"enabled": True},
            },
        },
        "example_fp8_tensor_stat_collection": {
            "enabled": True,
            "layers": {
                "layer_name_regex_pattern": "PLACEHOLDER",
            },
            "transformer_engine": {
                "LogFp8TensorStats": {"enabled": True},
            },
        },
    }
    path = tmp_path / "fp4_fp8_stats.yaml"
    path.write_text(yaml.dump(config))
    return str(path)


def test_fp8_layers_updates_regex(fp8_only_config):
    """The emitted config's FP8 regex matches exactly the requested layers."""
    out = update_quant_stats_config(config_file=fp8_only_config, fp4_layers=None, fp8_layers=[1, 2, 3])
    loaded = yaml.safe_load(Path(out).read_text())
    pattern = loaded["example_fp8_tensor_stat_collection"]["layers"]["layer_name_regex_pattern"]
    assert re.search(pattern, "model.model.layers.1.self_attention.layernorm_qkv")
    assert re.search(pattern, "model.model.layers.3.layernorm_mlp.fc2")
    assert re.search(pattern, "model.model.layers.4.self_attention.proj") is None


def test_none_layers_disables_matching(fp8_only_config):
    """Passing None layers writes the match-nothing sentinel into the config."""
    out = update_quant_stats_config(config_file=fp8_only_config, fp4_layers=None, fp8_layers=None)
    loaded = yaml.safe_load(Path(out).read_text())
    assert "DISABLED" in loaded["example_fp8_tensor_stat_collection"]["layers"]["layer_name_regex_pattern"]


def test_fp4_section_disabled_fp8_still_updated(fp4_fp8_config):
    """FP4 stats section is disabled (not yet supported) while FP8 is still updated."""
    out = update_quant_stats_config(config_file=fp4_fp8_config, fp4_layers=[1, 2, 3], fp8_layers=[4, 5, 6])
    loaded = yaml.safe_load(Path(out).read_text())

    # The FP4 section is force-disabled.
    assert loaded["example_fp4_tensor_stat_collection"]["enabled"] is False

    # The FP8 regex still targets layers 4-6 only.
    fp8_pattern = loaded["example_fp8_tensor_stat_collection"]["layers"]["layer_name_regex_pattern"]
    assert re.search(fp8_pattern, "model.model.layers.5.self_attention.proj")
    assert re.search(fp8_pattern, "model.model.layers.2.self_attention.proj") is None


def test_original_file_not_modified(fp8_only_config):
    """The helper writes to a fresh temp file and leaves the input untouched."""
    before = Path(fp8_only_config).read_text()

    out = update_quant_stats_config(config_file=fp8_only_config, fp4_layers=None, fp8_layers=[1, 2])

    assert out != fp8_only_config
    assert Path(fp8_only_config).read_text() == before


def test_preserves_other_config_fields(fp8_only_config):
    """Fields unrelated to the layer regex survive the rewrite."""
    out = update_quant_stats_config(config_file=fp8_only_config, fp4_layers=None, fp8_layers=[1])
    loaded = yaml.safe_load(Path(out).read_text())
    # The transformer_engine subtree must come through intact.
    assert loaded["example_fp8_tensor_stat_collection"]["transformer_engine"]["LogFp8TensorStats"]["enabled"] is True


def test_missing_section_is_skipped(fp8_only_config):
    """Passing fp4_layers when the config has no FP4 section must not error."""
    # fp8_only_config has no fp4 section -- fp4_layers should be silently ignored.
    out = update_quant_stats_config(config_file=fp8_only_config, fp4_layers=[1, 2], fp8_layers=[3, 4])
    loaded = yaml.safe_load(Path(out).read_text())
    # Only the FP8 section exists, and it was updated.
    assert "example_fp4_tensor_stat_collection" not in loaded
    pattern = loaded["example_fp8_tensor_stat_collection"]["layers"]["layer_name_regex_pattern"]
    assert re.search(pattern, "model.model.layers.3.self_attention.layernorm_qkv")
"example_fp4_tensor_stat_collection" not in result + regex = result["example_fp8_tensor_stat_collection"]["layers"]["layer_name_regex_pattern"] + assert re.search(regex, "model.model.layers.3.self_attention.layernorm_qkv") diff --git a/bionemo-recipes/recipes/opengenome2_llama_native_te/train_fsdp2.py b/bionemo-recipes/recipes/opengenome2_llama_native_te/train_fsdp2.py index 15d173b955..6f7d3de665 100644 --- a/bionemo-recipes/recipes/opengenome2_llama_native_te/train_fsdp2.py +++ b/bionemo-recipes/recipes/opengenome2_llama_native_te/train_fsdp2.py @@ -20,7 +20,7 @@ - Megatron-style scaled initialization for residual output layers - Spike-No-More embedding initialization (std=1.0) - Weight decay grouping (skip bias and 1D params) -- FP8 training with configurable first/last layer BF16 override +- Layer-wise FP8/FP4 quantization via resolve_layer_precision() - Validation with per-token and per-batch loss metrics - Checkpoint resume with LenientLoadPlanner for missing TE keys """ @@ -43,8 +43,6 @@ except ImportError: debug_api = None HAS_NVDLFW_INSPECT = False -import transformer_engine -import transformer_engine.pytorch from omegaconf import DictConfig, OmegaConf from torch.distributed.device_mesh import init_device_mesh from torch.distributed.fsdp import MixedPrecisionPolicy, fully_shard @@ -62,10 +60,10 @@ ) from dataset import create_bshd_dataloader, create_thd_dataloader from distributed_config import DistributedConfig -from fp8_debugging import initialize_fp8_debugging from opengenome_modeling_llama_te import NVLlamaConfig, NVLlamaForCausalLM from optimizer import get_parameter_groups_with_weight_decay from perf_logger import PerfLogger +from quantization import initialize_quant_stats_logging, resolve_layer_precision from scheduler import get_cosine_annealing_schedule_with_warmup from validation import run_validation @@ -110,16 +108,25 @@ def main(args: DictConfig) -> float | None: seed = getattr(args, "seed", 42) set_seed(seed) - # TE Debug feature logging - 
MUST be done BEFORE FSDP wrapping - if args.fp8_stats_config.enabled: - initialize_fp8_debugging(dist_config, **args.fp8_stats_config, fp8_enabled=args.fp8_config.enabled) + # Quant stats logging is initialized later, after layer_precision is resolved. device_mesh = init_device_mesh("cuda", mesh_shape=(dist_config.world_size,), mesh_dim_names=("dp",)) - # Create an FP8 recipe -- this is only used if FP8 is enabled in the config. - fp8_recipe = hydra.utils.get_class(args.fp8_config.fp8_recipe)( - fp8_format=Format[args.fp8_config.fp8_format], **args.fp8_config.fp8_recipe_kwargs - ) + # Create FP8 recipe -- only used if FP8 is enabled in the config. + fp8_recipe = None + if args.fp8_config.enabled: + fp8_recipe = hydra.utils.get_class(args.fp8_config.fp8_recipe)( + fp8_format=Format[args.fp8_config.fp8_format], **args.fp8_config.fp8_recipe_kwargs + ) + + # Create FP4 recipe -- only used if FP4 is enabled in the config. + fp4_recipe = None + fp4_config = getattr(args, "fp4_config", None) + fp4_enabled = fp4_config is not None and getattr(fp4_config, "enabled", False) + if fp4_enabled: + fp4_recipe = hydra.utils.get_class(fp4_config.fp4_recipe)( + **OmegaConf.to_container(fp4_config.fp4_recipe_kwargs, resolve=True) + ) if args.use_te: config_class = NVLlamaConfig @@ -155,8 +162,53 @@ def main(args: DictConfig) -> float | None: config_kwargs["use_megatron_scaled_init"] = True logger.info("Megatron scaled init enabled: proj/fc2 use std/sqrt(2*num_layers)") + # Handle quantized model init for FP8 layers + quantized_init_cfg = getattr(args.fp8_config, "quantized_model_init_kwargs", None) + if quantized_init_cfg is not None and getattr(quantized_init_cfg, "enabled", False): + config_kwargs["use_quantized_model_init"] = True + logger.info("Quantized model init enabled for FP8 layers") + config = config_class.from_pretrained(args.config_name_or_path, dtype=model_dtype, **config_kwargs) + # Resolve layer-wise quantization precision (FP8/FP4/BF16) from config + fp8_layers_cfg = 
getattr(args, "fp8_layers", None) + fp4_layers_cfg = getattr(args, "fp4_layers", None) + + def _parse_layers_cfg(cfg): + """Parse layer config from OmegaConf list or CLI string like '[1,2,3]'.""" + if cfg is None: + return None + if isinstance(cfg, str): + import ast + + return ast.literal_eval(cfg.strip("'\"")) + return OmegaConf.to_container(cfg, resolve=True) + + layer_precision = resolve_layer_precision( + num_layers=config.num_hidden_layers, + fp8_enabled=args.fp8_config.enabled, + fp4_enabled=fp4_enabled, + fp8_layers=_parse_layers_cfg(fp8_layers_cfg), + fp4_layers=_parse_layers_cfg(fp4_layers_cfg), + ) + config.layer_precision = layer_precision + logger.info(f"Layer precision: {layer_precision}") + + # Initialize quant stats logging (debug API) BEFORE model creation. + # TEDebugState.initialize() is called during TE module __init__ and checks if debug_api + # is already initialized. If not, it sets debug_enabled=False permanently. + quant_stats_config = getattr(args, "quant_stats_config", None) + if quant_stats_config is not None and getattr(quant_stats_config, "enabled", False): + if HAS_NVDLFW_INSPECT: + initialize_quant_stats_logging( + quant_stats_file=quant_stats_config.quant_stats_file, + quant_log_dir=quant_stats_config.quant_log_dir, + rank=dist_config.rank, + layer_precision=layer_precision, + ) + else: + logger.warning("quant_stats_config.enabled=True but nvdlfw_inspect is not installed, skipping") + # Log initialization settings std = getattr(config, "initializer_range", 0.02) num_layers = getattr(config, "num_hidden_layers", 32) @@ -168,13 +220,11 @@ def main(args: DictConfig) -> float | None: f"embedding_std={embedding_init_std}" ) - with ( - torch.device("meta") if args.use_meta_device else nullcontext(), - transformer_engine.pytorch.quantized_model_init( - recipe=fp8_recipe, **args.fp8_config.quantized_model_init_kwargs - ), - ): - model = model_class(config) + with torch.device("meta") if args.use_meta_device else nullcontext(): + if 
args.use_te: + model = model_class(config, fp8_recipe=fp8_recipe, fp4_recipe=fp4_recipe) + else: + model = model_class(config) logger.info("Initialized Model:\n%s", model) @@ -198,6 +248,10 @@ def main(args: DictConfig) -> float | None: fully_shard(layer, mesh=device_mesh["dp"], mp_policy=mp_policy) fully_shard(model, mesh=device_mesh["dp"], mp_policy=mp_policy) + # Set recipes after FSDP wrapping (recipes are not serializable through FSDP init) + if args.use_te: + model.model.set_recipes(fp8_recipe=fp8_recipe, fp4_recipe=fp4_recipe) + # If using meta device, move sharded weights to cuda and initialize parameters. # WARNING: meta-device init breaks Megatron-style scaled init for proj/fc2. # Use use_meta_device=false when using use_megatron_scaled_init or spike_no_more_embedding_init. @@ -207,8 +261,8 @@ def main(args: DictConfig) -> float | None: model.to_empty(device=device) model.apply(model._init_weights) - # Assign names to layers so debug API can identify them - if args.fp8_stats_config.enabled and HAS_NVDLFW_INSPECT: + # Assign layer names for debug API (must happen after model creation) + if quant_stats_config is not None and getattr(quant_stats_config, "enabled", False) and HAS_NVDLFW_INSPECT: debug_api.infer_and_assign_layer_names(model) # Create optimizer @@ -311,8 +365,7 @@ def main(args: DictConfig) -> float | None: micro_step += 1 - with transformer_engine.pytorch.autocast(enabled=args.fp8_config.enabled, recipe=fp8_recipe): - outputs = model(**batch) + outputs = model(**batch) loss = outputs.loss / args.grad_acc_steps loss.backward() diff --git a/bionemo-recipes/recipes/opengenome2_llama_native_te/train_fsdp2_cp.py b/bionemo-recipes/recipes/opengenome2_llama_native_te/train_fsdp2_cp.py index 3319fb5d25..90ec182a5e 100644 --- a/bionemo-recipes/recipes/opengenome2_llama_native_te/train_fsdp2_cp.py +++ b/bionemo-recipes/recipes/opengenome2_llama_native_te/train_fsdp2_cp.py @@ -51,8 +51,6 @@ HAS_NVDLFW_INSPECT = False import random -import 
transformer_engine -import transformer_engine.pytorch from omegaconf import DictConfig, OmegaConf from torch.distributed.device_mesh import init_device_mesh from torch.distributed.fsdp import MixedPrecisionPolicy, fully_shard @@ -69,10 +67,10 @@ from collator import ContextParallelDataLoaderWrapper, DataCollatorForContextParallel from dataset import create_bshd_dataloader, create_thd_dataloader from distributed_config import DistributedConfig -from fp8_debugging import initialize_fp8_debugging from opengenome_modeling_llama_te import NVLlamaConfig, NVLlamaForCausalLM from optimizer import get_parameter_groups_with_weight_decay from perf_logger import PerfLogger +from quantization import initialize_quant_stats_logging, resolve_layer_precision from scheduler import get_cosine_annealing_schedule_with_warmup @@ -117,9 +115,7 @@ def main(args: DictConfig) -> float | None: seed = getattr(args, "seed", 42) set_seed(seed) - # TE Debug feature logging - MUST be done BEFORE FSDP wrapping - if args.fp8_stats_config.enabled: - initialize_fp8_debugging(dist_config, **args.fp8_stats_config, fp8_enabled=args.fp8_config.enabled) + # Quant stats logging is initialized later, after layer_precision is resolved. device_mesh = init_device_mesh( "cuda", @@ -129,9 +125,20 @@ def main(args: DictConfig) -> float | None: logger.info("Created device mesh: %s", device_mesh) # --- Model Configuration --- - fp8_recipe = hydra.utils.get_class(args.fp8_config.fp8_recipe)( - fp8_format=Format[args.fp8_config.fp8_format], **args.fp8_config.fp8_recipe_kwargs - ) + fp8_recipe = None + if args.fp8_config.enabled: + fp8_recipe = hydra.utils.get_class(args.fp8_config.fp8_recipe)( + fp8_format=Format[args.fp8_config.fp8_format], **args.fp8_config.fp8_recipe_kwargs + ) + + # Create FP4 recipe -- only used if FP4 is enabled in the config. 
+ fp4_recipe = None + fp4_config = getattr(args, "fp4_config", None) + fp4_enabled = fp4_config is not None and getattr(fp4_config, "enabled", False) + if fp4_enabled: + fp4_recipe = hydra.utils.get_class(fp4_config.fp4_recipe)( + **OmegaConf.to_container(fp4_config.fp4_recipe_kwargs, resolve=True) + ) # Validate config: meta-device init breaks custom initialization if getattr(args, "use_meta_device", False): @@ -160,8 +167,38 @@ def main(args: DictConfig) -> float | None: config_kwargs["use_megatron_scaled_init"] = True logger.info("Megatron scaled init enabled: proj/fc2 use std/sqrt(2*num_layers)") + # Handle quantized model init for FP8 layers + quantized_init_cfg = getattr(args.fp8_config, "quantized_model_init_kwargs", None) + if quantized_init_cfg is not None and getattr(quantized_init_cfg, "enabled", False): + config_kwargs["use_quantized_model_init"] = True + logger.info("Quantized model init enabled for FP8 layers") + config = NVLlamaConfig.from_pretrained(args.config_name_or_path, dtype=model_dtype, **config_kwargs) + # Resolve layer-wise quantization precision (FP8/FP4/BF16) from config + fp8_layers_cfg = getattr(args, "fp8_layers", None) + fp4_layers_cfg = getattr(args, "fp4_layers", None) + + def _parse_layers_cfg(cfg): + """Parse layer config from OmegaConf list or CLI string like '[1,2,3]'.""" + if cfg is None: + return None + if isinstance(cfg, str): + import ast + + return ast.literal_eval(cfg.strip("'\"")) + return OmegaConf.to_container(cfg, resolve=True) + + layer_precision = resolve_layer_precision( + num_layers=config.num_hidden_layers, + fp8_enabled=args.fp8_config.enabled, + fp4_enabled=fp4_enabled, + fp8_layers=_parse_layers_cfg(fp8_layers_cfg), + fp4_layers=_parse_layers_cfg(fp4_layers_cfg), + ) + config.layer_precision = layer_precision + logger.info(f"Layer precision: {layer_precision}") + # Log initialization settings std = getattr(config, "initializer_range", 0.02) num_layers = getattr(config, "num_hidden_layers", 32) @@ -174,13 +211,8 
@@ def main(args: DictConfig) -> float | None: ) # --- Model Initialization --- - with ( - torch.device("meta") if args.use_meta_device else nullcontext(), - transformer_engine.pytorch.quantized_model_init( - recipe=fp8_recipe, **args.fp8_config.quantized_model_init_kwargs - ), - ): - model = NVLlamaForCausalLM(config) + with torch.device("meta") if args.use_meta_device else nullcontext(): + model = NVLlamaForCausalLM(config, fp8_recipe=fp8_recipe, fp4_recipe=fp4_recipe) logger.info("Initialized Model:\n%s", model) @@ -217,15 +249,28 @@ def main(args: DictConfig) -> float | None: torch.cuda.Stream(), ) + # Set recipes after FSDP wrapping (recipes are not serializable through FSDP init) + model.model.set_recipes(fp8_recipe=fp8_recipe, fp4_recipe=fp4_recipe) + # If using meta device, move sharded weights to cuda and initialize parameters. # WARNING: meta-device init breaks Megatron-style scaled init for proj/fc2. # Use use_meta_device=false when using use_megatron_scaled_init or spike_no_more_embedding_init. 
if args.use_meta_device: model.init_empty_weights() - # Assign names to layers so debug API can identify them - if args.fp8_stats_config.enabled and HAS_NVDLFW_INSPECT: - debug_api.infer_and_assign_layer_names(model) + # Initialize quant stats logging (debug API) if enabled + quant_stats_config = getattr(args, "quant_stats_config", None) + if quant_stats_config is not None and getattr(quant_stats_config, "enabled", False): + if HAS_NVDLFW_INSPECT: + initialize_quant_stats_logging( + quant_stats_file=quant_stats_config.quant_stats_file, + quant_log_dir=quant_stats_config.quant_log_dir, + rank=dist_config.rank, + layer_precision=layer_precision, + ) + debug_api.infer_and_assign_layer_names(model) + else: + logger.warning("quant_stats_config.enabled=True but nvdlfw_inspect is not installed, skipping") # --- Optimizer & Scheduler --- adamw_kwargs = OmegaConf.to_container(args.adamw_kwargs, resolve=True) @@ -316,8 +361,7 @@ def main(args: DictConfig) -> float | None: # Forward pass with mixed precision. with nvtx.annotate("Forward pass", color="green"): - with transformer_engine.pytorch.autocast(enabled=args.fp8_config.enabled, recipe=fp8_recipe): - outputs = model(**batch) + outputs = model(**batch) # Backward pass - scale loss by grad_acc_steps for proper gradient averaging loss = outputs.loss / args.grad_acc_steps