
Commit 49079ee

pszemraj (Peter Szemraj) authored
add train script (#2)
* add enwiki8
* add train script
* 🎨
* upd base_decoding for samba shapes
* save config to json
* ignore outputs
* 🚧 prefer rotary-embedding-torch for rope impl
* fix loss reporting
* clean up, improve IO
* 📝 pay homage

Signed-off-by: Peter Szemraj <[email protected]>
Co-authored-by: Peter Szemraj <[email protected]>
1 parent 6f9376d commit 49079ee

File tree

9 files changed, +345 / -135 lines


.gitignore

Lines changed: 3 additions & 0 deletions
@@ -6,6 +6,9 @@ samba_pytorch/_version.py
 *.pyd
 *.pt*
 
+# outputs
+out/*
+
 # <<< END CUSTOM
 # Byte-compiled / optimized / DLL files
 __pycache__/

README.md

Lines changed: 11 additions & 1 deletion
@@ -7,7 +7,7 @@ This aims to be a simpler implementation of the [original repo](https://github.c
 ## Installation
 
 > [!TIP]
-> While the `pip install` command _should_ install all deps and the package, in practice some of the more CUDA-heavy deps are better installed separately from source. See section below for more details.
+> The pip install command _should_ install all dependencies and the package, but some CUDA-heavy dependencies are better installed separately. See below for more details.
 
 ```bash
 git clone https://github.com/pszemraj/samba-pytorch.git
@@ -40,6 +40,16 @@ model = GPT(cfg)
 model
 ```
 
+### Training
+
+A minimalist training script for a character-level language model on enwiki8:
+
+```python
+python train.py
+```
+
+Credit to [nGPT-pytorch](https://github.com/lucidrains/nGPT-pytorch) for the enwik8 data set and the training script, which has been adapted for this repo.
+
 ## repo structure
 
 ```text
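For readers unfamiliar with the setup, the sketch below shows what one step of character/byte-level language-model training looks like in plain PyTorch. It is only an illustration of the general recipe a script like the new `train.py` follows, not the repo's actual code; the model class (`TinyByteLM`), shapes, and hyperparameters are all invented for the example.

```python
import torch
import torch.nn.functional as F
from torch import nn


class TinyByteLM(nn.Module):
    """Stand-in byte-level model (hypothetical, not the repo's GPT/Samba model)."""

    def __init__(self, vocab_size: int = 256, dim: int = 128):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, dim)
        self.rnn = nn.GRU(dim, dim, batch_first=True)
        self.head = nn.Linear(dim, vocab_size)

    def forward(self, x):
        h, _ = self.rnn(self.embed(x))
        return self.head(h)


model = TinyByteLM()
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)

# One step: predict the next byte at every position of a (random) batch.
batch = torch.randint(0, 256, (4, 129))   # (batch, seq_len + 1) of byte ids
inputs, targets = batch[:, :-1], batch[:, 1:]

optimizer.zero_grad()
logits = model(inputs)                     # (batch, seq_len, 256)
loss = F.cross_entropy(logits.reshape(-1, 256), targets.reshape(-1))
loss.backward()
optimizer.step()
print(loss.item())
```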

data/README.md

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+# Data source
+
+Credit to [nGPT-pytorch](https://github.com/lucidrains/nGPT-pytorch) for the enwik8 dataset. The enwik8 data was (_originally_) downloaded from the Hutter prize page: <http://prize.hutter1.net/>
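As a rough sketch of how a gzipped byte-level corpus like this is typically consumed (the 95 MB read and the 90M/5M train/validation split below follow the usual enwik8 convention and are assumptions, not something this commit spells out):

```python
import gzip

import numpy as np
import torch

# Read the raw bytes and split into train/validation chunks.
# The 95M read and 90M/5M split mirror the common enwik8 convention (assumption).
with gzip.open("data/enwik8.gz", "rb") as f:
    data = np.frombuffer(f.read(95_000_000), dtype=np.uint8).copy()

train_bytes, valid_bytes = np.split(data, [90_000_000])
train_data = torch.from_numpy(train_bytes).long()  # byte ids 0..255 as token ids
valid_data = torch.from_numpy(valid_bytes).long()

print(train_data.shape, valid_data.shape)
```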

data/enwik8.gz

34.9 MB
Binary file not shown.

pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -19,6 +19,7 @@ dependencies = [
     "flash-attn>=2.0.0.post1",
     "mamba-ssm",
     "numpy",
+    "rotary-embedding-torch",
     "sentencepiece",
     "torch>=2.0.0",
     "tqdm",

samba_pytorch/config.py

Lines changed: 1 addition & 0 deletions
@@ -101,6 +101,7 @@ def from_name(cls, name: str, **kwargs: Any) -> Self:
     @property
     def mlp_class(self) -> Type:
         from samba_pytorch import samba
+
         # `self._mlp_class` cannot be the type to keep the config json serializable
         return getattr(samba, self._mlp_class)
 
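The comment in that hunk points at a small pattern worth spelling out: the config stores the MLP class as a string so the config itself stays JSON-serializable, and resolves the string to a real class lazily. A generic sketch of the same idea follows; `ToyConfig` and `ToyMLP` are invented names, and the real code resolves the name against the `samba` module rather than `globals()`.

```python
import json
from dataclasses import asdict, dataclass


class ToyMLP:  # stand-in for a module class looked up by name
    pass


@dataclass
class ToyConfig:
    # store the class *name*, not the class object, so asdict()/json.dumps() work
    _mlp_class: str = "ToyMLP"

    @property
    def mlp_class(self) -> type:
        # resolve the string to the actual class only when it is needed
        return globals()[self._mlp_class]


cfg = ToyConfig()
print(json.dumps(asdict(cfg)))  # {"_mlp_class": "ToyMLP"}
print(cfg.mlp_class)            # <class '__main__.ToyMLP'>
```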

samba_pytorch/modules/rmsnorm.py

Lines changed: 17 additions & 21 deletions
@@ -1,13 +1,14 @@
 import torch
 from torch import nn
-from torch.nn import functional as F
 from einops import rearrange
 from typing import Optional, Tuple, Union
 
+
 def maybe_align(x: torch.Tensor, alignment_in_bytes: int = 16) -> torch.Tensor:
     """Ensures memory alignment by cloning if necessary."""
     return x if x.data_ptr() % alignment_in_bytes == 0 else x.clone()
 
+
 def dropout_add_layer_norm(
     x0: torch.Tensor,
     residual: Optional[torch.Tensor],
@@ -54,7 +55,7 @@ def dropout_add_layer_norm(
 
     # Apply row scaling if provided
     if rowscale is not None:
-        x0 = x0 * rearrange(rowscale, 'b -> b 1')
+        x0 = x0 * rearrange(rowscale, "b -> b 1")
 
     # Compute normalization (either LayerNorm or RMSNorm)
     if is_rms_norm:
@@ -74,21 +75,23 @@
         return output, mask
     return output
 
+
 class DropoutAddLayerNorm(nn.Module):
     """
     Module that combines dropout, residual connection, and layer normalization.
     """
+
     def __init__(
         self,
         hidden_size: int,
         prenorm: bool = False,
         p: float = 0.0,
         eps: float = 1e-5,
         residual_in_fp32: bool = False,
-        device = None,
-        dtype = None,
+        device=None,
+        dtype=None,
     ):
-        factory_kwargs = {'device': device, 'dtype': dtype}
+        factory_kwargs = {"device": device, "dtype": dtype}
         super().__init__()
         self.prenorm = prenorm
         self.p = p
@@ -101,7 +104,7 @@ def forward(
         self,
         x0: torch.Tensor,
         residual: Optional[torch.Tensor] = None,
-        rowscale: Optional[torch.Tensor] = None
+        rowscale: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         return dropout_add_layer_norm(
             x0,
@@ -112,28 +115,24 @@ def forward(
             self.eps,
             rowscale=rowscale,
             prenorm=self.prenorm,
-            residual_in_fp32=self.residual_in_fp32
+            residual_in_fp32=self.residual_in_fp32,
         )
 
     def reset_parameters(self):
         """Reset parameters to default initialization."""
         nn.init.ones_(self.weight)
         nn.init.zeros_(self.bias)
 
+
 class RMSNorm(nn.Module):
     """
     Root Mean Square Layer Normalization.
 
     Implementation follows the paper: https://arxiv.org/abs/1910.07467
     """
-    def __init__(
-        self,
-        hidden_size: int,
-        eps: float = 1e-5,
-        device = None,
-        dtype = None
-    ):
-        factory_kwargs = {'device': device, 'dtype': dtype}
+
+    def __init__(self, hidden_size: int, eps: float = 1e-5, device=None, dtype=None):
+        factory_kwargs = {"device": device, "dtype": dtype}
         super().__init__()
         self.weight = nn.Parameter(torch.ones(hidden_size, **factory_kwargs))
         self.eps = eps
@@ -145,14 +144,11 @@ def reset_parameters(self):
         """Reset parameters to default initialization."""
         nn.init.ones_(self.weight)
 
-def rms_norm(
-    x: torch.Tensor,
-    weight: torch.Tensor,
-    epsilon: float
-) -> torch.Tensor:
+
+def rms_norm(x: torch.Tensor, weight: torch.Tensor, epsilon: float) -> torch.Tensor:
     """
     Applies RMS normalization to the input tensor.
     """
     norm_x = torch.mean(x * x, dim=-1, keepdim=True)
     x_normed = x * torch.rsqrt(norm_x + epsilon)
-    return x_normed * weight
+    return x_normed * weight
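For reference, the `rms_norm` function in this file is small enough to sanity-check standalone: scale each vector by the reciprocal root of its mean square, then apply a learned per-feature weight. The sketch below reuses the function body exactly as it appears in the diff; the input shapes are arbitrary.

```python
import torch


def rms_norm(x: torch.Tensor, weight: torch.Tensor, epsilon: float) -> torch.Tensor:
    # mean of squares over the last (feature) dimension, then rescale
    norm_x = torch.mean(x * x, dim=-1, keepdim=True)
    x_normed = x * torch.rsqrt(norm_x + epsilon)
    return x_normed * weight


x = torch.randn(2, 4, 8)   # (batch, seq, hidden)
weight = torch.ones(8)     # learned scale, initialized to ones
out = rms_norm(x, weight, 1e-5)

# each position now has (approximately) unit root-mean-square norm
print(out.pow(2).mean(dim=-1))
```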
