# sorting.py

# -*- coding: utf-8 -*-
"""sorting.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1dN33JZ_RYRDiBcaVEULX3tYoA9z3Nb7r
"""

# !nvidia-smi
# !pip install einops

# Commented out IPython magic to ensure Python compatibility.
# Import stuff
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import einops
import tqdm.notebook as tqdm

import random
import time

# from google.colab import drive
from pathlib import Path
import pickle
import os

import matplotlib.pyplot as plt
# %matplotlib inline
import plotly.express as px
import plotly.io as pio
import wandb.integration
pio.renderers.default = "colab"
import plotly.graph_objects as go

from torch.utils.data import DataLoader

from functools import *
import pandas as pd
import gc

# import comet_ml
import itertools
import wandb


# Set to True only inside a Colab runtime to mount Google Drive.
use_drive = False


if use_drive:
    # Imported lazily: the module-level import of google.colab is commented
    # out, so without this line `drive` would be an undefined name here.
    from google.colab import drive
    drive.mount('/content/gdrive')
    drive_root = Path('/content/gdrive/MyDrive/Colab Notebooks/Sort/')
# Directory under which per-run checkpoint folders are created.
root = Path('/scratch/herocharge/Grokking/saved_runs')
large_root = Path('./Grokking/large_files')

# A helper class to get access to intermediate activations (inspired by Garcon)
# It's a dummy module that is the identity function by default
# I can wrap any intermediate activation in a HookPoint and get a convenient
# way to add PyTorch hooks
class HookPoint(nn.Module):
    """Identity module that exposes a convenient place to attach hooks.

    User hooks have the signature ``hook(activation, name=hook_name)``; they
    are adapted to PyTorch's ``(module, input, output)`` hook signature and the
    returned handles are remembered so they can be removed later.
    """

    def __init__(self):
        super().__init__()
        # Handles returned by PyTorch, kept per direction for later removal.
        self.fwd_hooks = []
        self.bwd_hooks = []

    def give_name(self, name):
        """Record the name passed to hooks (set by the model at initialisation)."""
        self.name = name

    def add_hook(self, hook, dir='fwd'):
        """Register ``hook`` in direction ``dir`` ('fwd' or 'bwd').

        Raises:
            ValueError: if ``dir`` is neither 'fwd' nor 'bwd'.
        """
        if dir not in ('fwd', 'bwd'):
            raise ValueError(f"Invalid direction {dir}")

        def adapter(module, module_input, module_output):
            # Only the third hook argument is forwarded to the user hook.
            return hook(module_output, name=self.name)

        if dir == 'fwd':
            self.fwd_hooks.append(self.register_forward_hook(adapter))
        else:
            # NOTE(review): register_backward_hook is the legacy backward-hook
            # API in recent PyTorch releases - confirm before upgrading.
            self.bwd_hooks.append(self.register_backward_hook(adapter))

    def remove_hooks(self, dir='fwd'):
        """Remove hooks for ``dir`` ('fwd', 'bwd' or 'both').

        Raises:
            ValueError: if ``dir`` is not one of the three accepted values.
        """
        if dir not in ['fwd', 'bwd', 'both']:
            raise ValueError(f"Invalid direction {dir}")
        if dir in ('fwd', 'both'):
            for handle in self.fwd_hooks:
                handle.remove()
            self.fwd_hooks = []
        if dir in ('bwd', 'both'):
            for handle in self.bwd_hooks:
                handle.remove()
            self.bwd_hooks = []

    def forward(self, x):
        """Return the input unchanged."""
        return x

# Define network architecture
# I defined my own transformer from scratch so I'd fully understand each component
# - I expect this wasn't necessary or particularly important, and a bunch of this
# replicates existing PyTorch functionality

# Embed & Unembed
class Embed(nn.Module):
    """Token embedding stored as a (d_model, d_vocab) matrix ``W_E``."""

    def __init__(self, d_vocab, d_model):
        super().__init__()
        initial = torch.randn(d_model, d_vocab) / np.sqrt(d_model)
        self.W_E = nn.Parameter(initial)

    def forward(self, x):
        """Map token ids of shape (batch, pos) to (batch, pos, d_model)."""
        # Column lookup yields (d_model, batch, pos); move d_model to the end.
        columns = self.W_E[:, x]
        return columns.permute(1, 2, 0)

class Unembed(nn.Module):
    """Linear read-out through a (d_model, d_vocab) matrix ``W_U``."""

    def __init__(self, d_vocab, d_model):
        super().__init__()
        initial = torch.randn(d_model, d_vocab) / np.sqrt(d_vocab)
        self.W_U = nn.Parameter(initial)

    def forward(self, x):
        """Right-multiply ``x`` (last dim d_model) by ``W_U`` to get d_vocab logits."""
        return torch.matmul(x, self.W_U)

# Positional Embeddings
class PosEmbed(nn.Module):
    """Learned positional embedding table of shape (max_ctx, d_model)."""

    def __init__(self, max_ctx, d_model):
        super().__init__()
        initial = torch.randn(max_ctx, d_model) / np.sqrt(d_model)
        self.W_pos = nn.Parameter(initial)

    def forward(self, x):
        """Add the first ``x.shape[-2]`` rows of ``W_pos`` to ``x``."""
        seq_len = x.shape[-2]
        positional = self.W_pos[:seq_len]
        return x + positional

# LayerNorm
class LayerNorm(nn.Module):
    """Layer normalisation over the last dimension with learned scale and bias.

    Args:
        d_model: size of the normalised (last) dimension.
        epsilon: constant added to the standard deviation before dividing.
        model: optional one-element list holding the owning model. When the
            owner is present its ``use_ln`` flag decides whether normalisation
            is applied; when it is omitted (or holds ``None``) the layer
            always normalises.
    """

    def __init__(self, d_model, epsilon = 1e-4, model=None):
        super().__init__()
        # Build a fresh holder per instance instead of sharing one mutable
        # default list between every LayerNorm created without `model`.
        self.model = [None] if model is None else model
        self.w_ln = nn.Parameter(torch.ones(d_model))
        self.b_ln = nn.Parameter(torch.zeros(d_model))
        self.epsilon = epsilon

    def forward(self, x):
        """Return normalised ``x``, or ``x`` unchanged if the owner disables LN."""
        owner = self.model[0]
        # Without an owner there is no flag to consult; previously this path
        # crashed with AttributeError on `None.use_ln`.
        if owner is not None and not owner.use_ln:
            return x
        centred = x - x.mean(dim=-1, keepdim=True)
        normalised = centred / (centred.std(dim=-1, keepdim=True) + self.epsilon)
        return normalised * self.w_ln + self.b_ln

# Attention
class Attention(nn.Module):
    """Multi-head causal self-attention with a HookPoint on each intermediate.

    Weights are stored per head: ``W_K``, ``W_Q`` and ``W_V`` have shape
    (num_heads, d_head, d_model) and ``W_O`` has shape
    (d_model, num_heads * d_head). A lower-triangular ``mask`` buffer of size
    (n_ctx, n_ctx) provides the causal mask.
    """

    def __init__(self, d_model, num_heads, d_head, n_ctx, model):
        super().__init__()
        self.model = model
        init_scale = np.sqrt(d_model)
        self.W_K = nn.Parameter(torch.randn(num_heads, d_head, d_model) / init_scale)
        self.W_Q = nn.Parameter(torch.randn(num_heads, d_head, d_model) / init_scale)
        self.W_V = nn.Parameter(torch.randn(num_heads, d_head, d_model) / init_scale)
        self.W_O = nn.Parameter(torch.randn(d_model, d_head * num_heads) / init_scale)
        lower_triangle = torch.tril(torch.ones((n_ctx, n_ctx)))
        self.register_buffer('mask', lower_triangle)
        self.d_head = d_head
        self.hook_k = HookPoint()
        self.hook_q = HookPoint()
        self.hook_v = HookPoint()
        self.hook_z = HookPoint()
        self.hook_attn = HookPoint()
        self.hook_attn_pre = HookPoint()

    def forward(self, x):
        """Apply attention to ``x`` of shape (batch, pos, d_model); same shape out."""
        seq_len = x.shape[-2]
        # Per-head projections, each (batch, head, pos, d_head).
        keys = self.hook_k(torch.einsum('ihd,bpd->biph', self.W_K, x))
        queries = self.hook_q(torch.einsum('ihd,bpd->biph', self.W_Q, x))
        values = self.hook_v(torch.einsum('ihd,bpd->biph', self.W_V, x))
        # Scores indexed (batch, head, query_pos, key_pos).
        raw_scores = torch.einsum('biph,biqh->biqp', keys, queries)
        # Entries above the diagonal are zeroed and then pushed to -1e10 so
        # the softmax gives them no weight.
        causal = self.mask[:seq_len, :seq_len]
        masked_scores = torch.tril(raw_scores) - 1e10 * (1 - causal)
        scaled_scores = self.hook_attn_pre(masked_scores / np.sqrt(self.d_head))
        pattern = self.hook_attn(F.softmax(scaled_scores, dim=-1))
        # Weighted sum of values per query position.
        mixed = self.hook_z(torch.einsum('biph,biqp->biqh', values, pattern))
        # Concatenate heads, then project back to d_model.
        merged = einops.rearrange(mixed, 'b i q h -> b q (i h)')
        return torch.einsum('df,bqf->bqd', self.W_O, merged)

# MLP Layers
class MLP(nn.Module):
    """Two-layer feed-forward block with hook points around the activation.

    ``act_type`` must be 'ReLU' or 'GeLU'; anything else fails the assertion
    at the end of construction.
    """

    def __init__(self, d_model, d_mlp, act_type, model):
        super().__init__()
        self.model = model
        init_scale = np.sqrt(d_model)
        self.W_in = nn.Parameter(torch.randn(d_mlp, d_model) / init_scale)
        self.b_in = nn.Parameter(torch.zeros(d_mlp))
        self.W_out = nn.Parameter(torch.randn(d_model, d_mlp) / init_scale)
        self.b_out = nn.Parameter(torch.zeros(d_model))
        self.act_type = act_type
        self.hook_pre = HookPoint()
        self.hook_post = HookPoint()
        assert act_type in ['ReLU', 'GeLU']

    def forward(self, x):
        """Map ``x`` (batch, pos, d_model) through d_mlp hidden units and back."""
        hidden = self.hook_pre(torch.einsum('md,bpd->bpm', self.W_in, x) + self.b_in)
        # An unrecognised act_type leaves the pre-activation untouched, as in
        # an if/elif chain with no else branch.
        activation = {'ReLU': F.relu, 'GeLU': F.gelu}.get(self.act_type)
        if activation is not None:
            hidden = activation(hidden)
        hidden = self.hook_post(hidden)
        return torch.einsum('dm,bpm->bpd', self.W_out, hidden) + self.b_out

# Transformer Block
class TransformerBlock(nn.Module):
    """Attention sub-layer followed by an MLP sub-layer, each with a residual add.

    No layer normalisation is applied inside the block. Hook points expose the
    residual stream before, between and after the two sub-layers, as well as
    each sub-layer's output.
    """

    def __init__(self, d_model, d_mlp, d_head, num_heads, n_ctx, act_type, model):
        super().__init__()
        self.model = model
        self.attn = Attention(d_model, num_heads, d_head, n_ctx, model=self.model)
        self.mlp = MLP(d_model, d_mlp, act_type, model=self.model)
        self.hook_attn_out = HookPoint()
        self.hook_mlp_out = HookPoint()
        self.hook_resid_pre = HookPoint()
        self.hook_resid_mid = HookPoint()
        self.hook_resid_post = HookPoint()

    def forward(self, x):
        """Return the residual stream after the attention and MLP updates."""
        # Attention reads the hooked residual and its output is added back.
        attn_input = self.hook_resid_pre(x)
        attn_out = self.hook_attn_out(self.attn(attn_input))
        resid_mid = self.hook_resid_mid(x + attn_out)
        # The MLP reads the post-attention residual and is added on top of it.
        mlp_out = self.hook_mlp_out(self.mlp(resid_mid))
        resid_post = self.hook_resid_post(resid_mid + mlp_out)
        return resid_post

# Full transformer
class Transformer(nn.Module):
    """Decoder-only transformer: embed -> positional embed -> blocks -> unembed.

    Args:
        num_layers: number of TransformerBlock layers.
        d_vocab: vocabulary size (embedding columns and output logits).
        d_model: residual stream width.
        d_mlp: hidden width of each MLP.
        d_head: per-head dimension.
        num_heads: number of attention heads per block.
        n_ctx: maximum sequence length (positional table and causal mask size).
        act_type: 'ReLU' or 'GeLU', forwarded to each MLP.
        use_cache: stored on the instance; not read by the code in this class.
        use_ln: stored on the instance for LayerNorm modules that consult it.
    """

    def __init__(self, num_layers, d_vocab, d_model, d_mlp, d_head, num_heads, n_ctx, act_type, use_cache=False, use_ln=True):
        super().__init__()
        self.cache = {}
        self.use_cache = use_cache

        self.embed = Embed(d_vocab, d_model)
        self.pos_embed = PosEmbed(n_ctx, d_model)
        # The model is handed to sub-modules wrapped in a list so it is not
        # registered as a child module of its own blocks.
        self.blocks = nn.ModuleList([TransformerBlock(d_model, d_mlp, d_head, num_heads, n_ctx, act_type, model=[self]) for i in range(num_layers)])
        self.unembed = Unembed(d_vocab, d_model)
        self.use_ln = use_ln

        # Give every hook point its qualified module name.
        for name, module in self.named_modules():
            if isinstance(module, HookPoint):
                module.give_name(name)

    def forward(self, x):
        """Return logits with a trailing d_vocab dimension for token ids ``x``."""
        x = self.embed(x)
        x = self.pos_embed(x)
        for block in self.blocks:
            x = block(x)
        x = self.unembed(x)
        return x

    def set_use_cache(self, use_cache):
        """Update the stored ``use_cache`` flag."""
        self.use_cache = use_cache

    def hook_points(self):
        """Return every HookPoint in the model, in module registration order."""
        # Select by type rather than by 'hook' appearing in the name, so that
        # unrelated modules with such names are never treated as hook points.
        return [module for module in self.modules() if isinstance(module, HookPoint)]

    def remove_all_hooks(self):
        """Remove all forward and backward hooks from every hook point."""
        for hp in self.hook_points():
            hp.remove_hooks('both')

    def cache_all(self, cache, incl_bwd=False):
        """Attach hooks that store every hooked activation into ``cache``.

        Forward activations are stored detached under the hook name. With
        ``incl_bwd`` the first element of the value passed to the backward
        hook is stored detached under ``name + '_grad'``.
        """
        def save_hook(tensor, name):
            cache[name] = tensor.detach()

        def save_hook_back(tensor, name):
            # Backward hooks receive a sequence; keep its first entry.
            cache[name+'_grad'] = tensor[0].detach()

        for hp in self.hook_points():
            hp.add_hook(save_hook, 'fwd')
            if incl_bwd:
                hp.add_hook(save_hook_back, 'bwd')

# Helper functions
def cuda_memory():
    """Print ``torch.cuda.memory_allocated()`` divided by 1e9."""
    allocated = torch.cuda.memory_allocated() / 1e9
    print(allocated)

def cross_entropy_high_precision(logits, labels):
    """Mean cross-entropy computed in float64.

    Args:
        logits: tensor of shape (batch, seq, vocab).
        labels: integer tensor of shape (batch, seq) indexing the vocab axis.

    Returns:
        Scalar float64 tensor: the negated mean log-probability of the labels.
    """
    # float32 log_softmax underflows on very confident predictions (it can
    # only resolve steps of about 1.2e-7), which causes loss spikes and
    # unreliable gradients - hence the upcast before the softmax.
    log_probs = F.log_softmax(logits.double(), dim=-1)
    picked = log_probs.gather(-1, labels.unsqueeze(-1))
    return -picked.mean()

def full_loss(model, data, arr_len, device='cuda'):
    """
    Calculate the loss and accuracies of the model on a whole dataset.

    Each row of ``data`` is expected to hold ``arr_len`` input tokens, one
    separator token, and then the target tokens. The logits at positions
    ``arr_len`` up to (but excluding) the last one are scored against the
    tokens that follow them, i.e. ``data[:, arr_len + 1:]``.

    Parameters:
    model (nn.Module): The PyTorch model; called with the token tensor.
    data (array-like or Tensor): Integer token ids of shape (batch, seq).
    arr_len (int): The length of the unsorted input array in each row.
    device (str): Device the tokens are placed on; must match the model's.

    Returns:
    tuple: (loss, per-token accuracy, exact-match accuracy), where the
    exact-match accuracy is the fraction of rows with every token correct.
    """
    # as_tensor accepts NumPy arrays and tensors alike (no copy-construct
    # warning for tensor input) and int64 is what gather() needs for labels.
    tokens = torch.as_tensor(data, dtype=torch.long, device=device)

    # Drop the last position: it has no following token to predict.
    logits = model(tokens)[:, arr_len:-1]
    labels = tokens[:, arr_len + 1:]

    loss = cross_entropy_high_precision(logits, labels)

    predictions = torch.argmax(logits, dim=2)
    correct = predictions == labels
    # Normalise by the number of scored tokens rather than assuming the label
    # width equals arr_len.
    accuracy = torch.sum(correct) / labels.numel()
    exact_match_accuracy = torch.sum(torch.all(correct, dim=-1)) / labels.shape[0]
    return loss, accuracy, exact_match_accuracy

# def test_logits(logits, bias_correction=False, original_logits=None, mode='all'):
#     # Calculates cross entropy loss of logits representing a batch of all p^2
#     # possible inputs
#     # Batch dimension is assumed to be first
#     if logits.shape[1]==p*p:
#         logits = logits.T
#     if logits.shape==torch.Size([p*p, p+1]):
#         logits = logits[:, :-1]
#     logits = logits.reshape(p*p, p)
#     if bias_correction:
#         # Applies bias correction - we correct for any missing bias terms,
#         # independent of the input, by centering the new logits along the batch
#         # dimension, and then adding the average original logits across all inputs
#         logits = einops.reduce(original_logits - logits, 'batch ... -> ...', 'mean') + logits
#     if mode=='train':
#         return cross_entropy_high_precision(logits[is_train], labels[is_train])
#     elif mode=='test':
#         return cross_entropy_high_precision(logits[is_test], labels[is_test])
#     elif mode=='all':
#         return cross_entropy_high_precision(logits, labels)

# ---- Hyperparameters (Colab form fields are marked with #@param) ----
lr=1e-4 #@param
weight_decay = 1.0 #@param
# NOTE(review): p and fn_name only feed d_vocab, random_answers, fns_dict and
# fn below; none of those are referenced by the model construction or the
# training loop later in this file - presumably leftovers, confirm before
# removing.
p=113 #@param
d_model = 128 #@param
fn_name = 'add' #@param ['add', 'subtract', 'x2xyy2','rand']
# Fraction of the generated examples used for training; the rest is test.
frac_train = 0.3 #@param
dataset_size = 1000 #@param
num_epochs = 20000 #@param
save_models = True #@param
# A checkpoint is written every `save_every` epochs when save_models is set.
save_every = 1000 #@param
# Stop training when test loss is <stopping_thresh
# (the loss is a negated mean log-probability, so -1 is never reached)
stopping_thresh = -1 #@param
seed = 0 #@param

# ---- Sorting task: arrays of arr_len integers drawn from [start, end] ----
arr_len = 3 #@param
start = 1 #@param
end = 10 #@param

# ---- Model shape ----
num_layers = 1
batch_style = 'full'
# NOTE: the Transformer created further down is given its own d_vocab
# (end - start + 2) and n_ctx (2 * arr_len + 1); these two values are not
# passed to it.
d_vocab = p+1
n_ctx = 3
d_mlp = 4*d_model
num_heads = 1
assert d_model % num_heads == 0
d_head = d_model//num_heads
act_type = 'ReLU' #@param ['ReLU', 'GeLU']
# batch_size = 512
use_ln = False
# Lookup table backing the 'rand' entry of fns_dict.
random_answers = np.random.randint(low=0, high=p, size=(p, p))
fns_dict = {'add': lambda x,y:(x+y)%p, 'subtract': lambda x,y:(x-y)%p, 'x2xyy2':lambda x,y:(x**2+x*y+y**2)%p, 'rand':lambda x,y:random_answers[x][y]}
fn = fns_dict[fn_name]


# Run configuration recorded with the wandb run.
config = {
    'lr':lr,
    'weight_decay':weight_decay,
    'fn_name': fn_name,
    'dataset_size':dataset_size,
    'frac_train':frac_train,
    'num_epochs':num_epochs,
    'arr_len': arr_len,
    'num_layers':num_layers,
    'operation':'sort'
}


# When False, the model construction and training loop are skipped.
train_model = True #@param


# Starts a wandb run as soon as this file is executed; the run name is the
# operation plus the current Unix timestamp.
wandb.init(project='grok-x', config=config, name=f'{config["operation"]}_{int(time.time())}')

def gen_train_test(frac_train, dataset_size, arr_len=5, start=1, end=100, seed=0):
    """Generate a shuffled train/test split for the sorting task.

    Each example is ``[x_1..x_n, 0, sorted(x)_1..sorted(x)_n]`` where the
    ``x_i`` are integers drawn uniformly from ``[start, end]`` and ``0`` is
    the separator token.

    Both the arrays and the shuffle are driven by ``seed`` through local
    generators, so equal arguments always give the same split and the global
    ``random`` / ``np.random`` state is left untouched. The values match what
    ``np.random.seed(seed)`` followed by the global-RNG draws would produce.

    Note: arrays are sampled independently, so the same array can occur more
    than once and can appear in both the train and the test split.

    Parameters:
    frac_train (float): Fraction of examples placed in the training split.
    dataset_size (int): Total number of examples to generate.
    arr_len (int): Number of integers in each unsorted array.
    start (int): Smallest value that can be drawn (inclusive).
    end (int): Largest value that can be drawn (inclusive).
    seed (int): Seed for both data generation and shuffling.

    Returns:
    tuple: (train, test) NumPy arrays with ``2 * arr_len + 1`` columns.
    """
    rng = np.random.RandomState(seed)
    arrays = [rng.randint(start, end + 1, size=arr_len) for _ in range(dataset_size)]
    examples = [np.concatenate([x, [0], np.sort(x)]) for x in arrays]
    # A private Random(seed) yields the same permutation as seeding the
    # global module, without the side effect on other users of `random`.
    random.Random(seed).shuffle(examples)
    n_train = int(frac_train * len(examples))
    return np.array(examples[:n_train]), np.array(examples[n_train:])

# Seed the Python, NumPy, torch CPU and torch CUDA generators before any data
# or parameters are created.
for _seed_rng in (random.seed, np.random.seed, torch.random.manual_seed, torch.cuda.random.manual_seed):
    _seed_rng(seed)


# Build the dataset once; both splits are kept as NumPy arrays.
train, test = gen_train_test(frac_train, dataset_size, arr_len=arr_len, start=start, end=end, seed=seed)
print(len(train), len(test))


if train_model:
    # Full-batch training: every epoch evaluates the whole train and test
    # sets, logs the metrics, and takes one optimiser step on the train loss.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(device)
    # Vocabulary: the values start..end plus the 0 separator token.
    # Context: arr_len inputs + separator + arr_len sorted outputs.
    model = Transformer(num_layers=num_layers, d_vocab=(end - start + 1 + 1), d_model=d_model, d_mlp=d_mlp, d_head=d_head, num_heads=num_heads, n_ctx=2 * arr_len + 1, act_type=act_type, use_cache=False, use_ln=use_ln)
    model.to(device)
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay, betas=(0.9, 0.98))
    # Linear warm-up of the learning rate over the first 10 steps.
    scheduler = optim.lr_scheduler.LambdaLR(optimizer, lambda step: min(step/10, 1))
    run_name = f"grok_{int(time.time())}"
    print(f'Run name {run_name}')
    if save_models:
        # makedirs also creates `root` itself when it does not exist yet,
        # where os.mkdir would raise FileNotFoundError.
        os.makedirs(root/run_name, exist_ok=True)
        save_dict = {'model':model.state_dict(), 'train_data':train, 'test_data':test}
        torch.save(save_dict, root/run_name/'init.pth')
    train_losses = []
    train_accs = []
    train_exact_accs = []
    test_losses = []
    test_accs = []
    test_exact_accs = []
    for epoch in range(num_epochs):
        train_loss, train_acc, train_exact_acc = full_loss(model, train, arr_len, device=device)
        # The test metrics are never back-propagated, so skip building an
        # autograd graph for them.
        with torch.no_grad():
            test_loss, test_acc, test_exact_acc = full_loss(model, test, arr_len, device=device)
        train_losses.append(train_loss.item())
        test_losses.append(test_loss.item())
        test_accs.append(test_acc.item())
        train_accs.append(train_acc.item())
        test_exact_accs.append(test_exact_acc.item())
        train_exact_accs.append(train_exact_acc.item())

        if epoch%100 == 0:
            wandb.log({
                'Train loss': train_loss.item(),
                'Test loss': test_loss.item(),
                'Test acc': test_acc.item(),
                'Train acc': train_acc.item(),
                'Test exact acc': test_exact_acc.item(),
                'Train exact acc': train_exact_acc.item(),
            })
            # Losses are printed as natural logs, accuracies as fractions.
            print(f"{epoch}_ \
            {np.log(train_loss.item()):.4f}_{np.log(test_loss.item()):.4f}\
            {train_acc.item():.4f}_{test_acc.item():.4f}\
            {train_exact_acc.item():.4f}_{test_exact_acc.item():.4f}    ")
        train_loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        # Early stop; this runs before the checkpoint branch, so no second
        # check is needed there.
        if test_loss.item() < stopping_thresh:
            break
        if (save_models) and (epoch%save_every == 0):
            save_dict = {
                'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'scheduler': scheduler.state_dict(),
                'train_loss': train_loss,
                'test_loss': test_loss,
                'train_acc' : train_acc,
                'test_acc' : test_acc,
                'train_exact_acc' : train_exact_acc,
                'test_exact_acc' : test_exact_acc,
                'epoch': epoch,
            }
            torch.save(save_dict, root/run_name/f"{epoch}.pth")
            
import matplotlib.pyplot as plt


def _plot_train_test_curves(train_series, test_series, title, ylabel):
    """Show one figure with the per-epoch training and validation curves."""
    plt.plot(train_series, label=f'Training {title}')
    plt.plot(test_series, label=f'Validation {title}')
    plt.legend()
    plt.title(title)
    plt.xlabel('Epoch')
    plt.ylabel(ylabel)
    plt.show()


# The metric histories only exist when the training branch ran; without this
# guard a run with train_model = False died here with NameError.
if train_model:
    _plot_train_test_curves(train_losses, test_losses, 'Loss', 'Loss')
    _plot_train_test_curves(train_accs, test_accs, 'Accuracy', 'Accuracy')
    _plot_train_test_curves(train_exact_accs, test_exact_accs, 'Exact Match Accuracy', 'Accuracy')


'''
!mkdir -p /content/Grokking/saved_runs

!ls gdrive/MyDrive/'Colab Notebooks'/Sort

# prompt: generate 5 random integers between 1 and 100, use numpy

random_integers = np.random.randint(1, 101, size=5)
random_integers

import gc
gc.collect()
'''