Dev #18

Merged: 4 commits, Mar 11, 2024

Changes from all commits
3 changes: 3 additions & 0 deletions .gitignore
@@ -3,3 +3,6 @@ __pycache__/
dist/
output*
.DS_Store
env/
build/
.eggs/
Binary file added data/mnist/raw/t10k-images-idx3-ubyte
Binary file added data/mnist/raw/t10k-images-idx3-ubyte.gz
Binary file added data/mnist/raw/t10k-labels-idx1-ubyte
Binary file added data/mnist/raw/t10k-labels-idx1-ubyte.gz
Binary file added data/mnist/raw/train-images-idx3-ubyte
Binary file added data/mnist/raw/train-images-idx3-ubyte.gz
Binary file added data/mnist/raw/train-labels-idx1-ubyte
Binary file added data/mnist/raw/train-labels-idx1-ubyte.gz
88 changes: 0 additions & 88 deletions example/language_model/lm.py

This file was deleted.

482 changes: 482 additions & 0 deletions example/language_model/makemore_part2_mlp.ipynb

Large diffs are not rendered by default.

105 changes: 105 additions & 0 deletions example/language_model/rnn.py
@@ -0,0 +1,105 @@
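# Character-level MLP language model: an embedding table, one tanh hidden
# layer, and a linear output layer, trained on names.txt.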
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
from math import sqrt

def generate_mapping(data):
    chars = sorted(list(set(''.join(data))))
    stoi = {char: index + 1 for index, char in enumerate(chars)}
    # marks beginning or end of a word
    stoi['.'] = 0
    return stoi

def generate_learning_rates(size):
    lre = torch.linspace(-6, 0, size)
    return 10 ** lre  # we want the learning rates to be spaced exponentially
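
# For example, generate_learning_rates(3) returns roughly
# tensor([1e-06, 1e-03, 1e+00]): the exponents are evenly spaced, so the
# rates span six orders of magnitude. Useful for a learning-rate sweep.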

def load_data(context_size):
    data, label = [], []
    with open('./names.txt', 'r') as f:
        words = f.read().splitlines()
    stoi = generate_mapping(words)
    # itos = {v: k for k, v in stoi.items()}

    for w in words:
        context = [0] * context_size
        for ch in w + '.':
            ix = stoi[ch]
            data.append(context)
            label.append(ix)
            context = context[1:] + [ix]  # crop and append

    data = torch.tensor(data)
    label = torch.tensor(label)
    return data, label
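
# With context_size=3, the word "emma" yields the (context -> target) pairs
# [0,0,0] -> 'e', [0,0,'e'] -> 'm', [0,'e','m'] -> 'm', ['e','m','m'] -> 'a',
# ['m','m','a'] -> '.', where each character is stored as its stoi index.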

def main():
    # How many tokens to keep as context when making the prediction for the next one
    CONTEXT_SIZE = 3
    # Size of the vector that represents a single token
    EMBEDDING_SIZE = 10
    VOCAB_SIZE = 27  # There are 27 possible chars in our dataset

    data, label = load_data(CONTEXT_SIZE)
    # Embedding table: each of the VOCAB_SIZE tokens is represented by a
    # vector of length EMBEDDING_SIZE
    C = torch.randn((VOCAB_SIZE, EMBEDDING_SIZE))

    NUMBER_OF_NEURONS = 200

    # Hidden layer, scaled with Kaiming init for tanh: gain (5/3) over sqrt(fan_in)
    # https://pytorch.org/docs/stable/nn.init.html
    w1 = torch.randn((CONTEXT_SIZE * EMBEDDING_SIZE, NUMBER_OF_NEURONS)) * ((5 / 3) / sqrt(CONTEXT_SIZE * EMBEDDING_SIZE))
    b1 = torch.randn(NUMBER_OF_NEURONS) * 0.01

    # Output layer
    w2 = torch.randn((NUMBER_OF_NEURONS, VOCAB_SIZE)) * ((5 / 3) / sqrt(NUMBER_OF_NEURONS))
    b2 = torch.randn(VOCAB_SIZE) * 0.01

    parameters = [C, w1, b1, w2, b2]
    print("Number of parameters:", sum(p.nelement() for p in parameters))

    for p in parameters:
        p.requires_grad = True

    steps = []
    losses = []

    EPOCHS = 200000  # training steps (minibatch updates), not full passes over the data
    MINIBATCH_SIZE = 32
    avgs = []
    for i in range(EPOCHS):
        # Minibatching
        minibatch_indexes = torch.randint(0, data.shape[0], (MINIBATCH_SIZE,))
        embedding = C[data[minibatch_indexes]]

        # Forward pass
        h = torch.tanh(embedding.view(-1, EMBEDDING_SIZE * CONTEXT_SIZE) @ w1 + b1)
        logits = h @ w2 + b2
        loss = F.cross_entropy(logits, label[minibatch_indexes])

        # Backward pass
        for p in parameters:
            p.grad = None
        loss.backward()

        # track stats
        if i % 1000 == 0:  # print every once in a while
            print(f'{i:7d}/{EPOCHS:7d}: {loss.item():.4f}')
        if i > EPOCHS / 2:
            avgs.append(loss.item())

        steps.append(i)
        losses.append(loss.item())

        # Step decay: drop the learning rate halfway through training
        lr = 0.1 if i < EPOCHS / 2 else 0.01
        for p in parameters:
            p.data -= lr * p.grad

    print("Average loss", sum(avgs) / len(avgs))
    plt.plot(steps, losses, label="minibatch loss")
    plt.legend()
    plt.show()

if __name__ == "__main__":
    main()
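
A quick sanity check on the Kaiming scaling used above (a standalone sketch;
the shapes mirror the script's defaults, and the numbers are illustrative):
scaling unit-variance weights by gain / sqrt(fan_in) keeps the pre-activation
standard deviation near the gain of 5/3, which tanh then squashes back toward 1.

    import torch
    from math import sqrt

    fan_in = 30                    # CONTEXT_SIZE * EMBEDDING_SIZE
    gain = 5 / 3                   # recommended gain for tanh
    x = torch.randn(1000, fan_in)  # unit-variance inputs
    w = torch.randn(fan_in, 200) * (gain / sqrt(fan_in))
    print((x @ w).std())           # ~1.67, i.e. roughly the gain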
2 changes: 1 addition & 1 deletion gigatorch/activation_fn.py
@@ -1,2 +1,2 @@
def relu(x):
def relu(x: int) -> int:
    return max(0, x)
119 changes: 54 additions & 65 deletions gigatorch/cnn.py
@@ -10,47 +10,56 @@
from abc import ABC, abstractmethod
from os import listdir
from os.path import join
import numpy as np


class Compute(ABC):
    @abstractmethod
    def compute(self, data) -> List[List[Tensor]]:
    def compute(self, input: Tensor) -> Tensor:
        pass


"""
The MaxPool2D layer extracts the maximum value over the window defined by pool_size
for each dimension along the features axis. The window is shifted by strides in each dimension.

MaxPool2D accepts a 4-dimensional tensor as input. The dimensions represent:
Batch size: The number of samples in a batch. We can do parallel processing if it's more than 1 batch.
Channels: The number of input channels. For example, an RGB image would have 3 channels.
Height: The height of the input.
Width: The width of the input.
"""
class MaxPool2D(Compute):
    def __init__(self, kernel_size, stride=None):
        self.kernel_size = kernel_size
        self.stride = stride if stride is not None else kernel_size

    def compute(self, data_list) -> List[List[Tensor]]:
    def compute(self, input: Tensor) -> Tensor:
        assert len(input.shape) == 4, f"can't 2d pool {input.shape}"
        (batch_size, channels, height, width) = input.shape
        assert (height - self.kernel_size) % self.stride == 0, f"Height does not fit the kernel size {self.kernel_size} and stride {self.stride}"
        assert (width - self.kernel_size) % self.stride == 0, f"Width does not fit the kernel size {self.kernel_size} and stride {self.stride}"

        print("Computing maxpool")
        print("Size of data", len(data_list[0]))
        print("Number of input", len(data_list))
        output = []
        for data in data_list:
            if len(data) < self.kernel_size or len(data[0]) < self.kernel_size:
                raise Exception("Received data is smaller than the kernel_size")

            new_data = []
            for row_index in range(0, len(data) - self.kernel_size + 1, self.stride):
                row = []
                for column_index in range(
                    0, len(data[row_index]) - self.kernel_size + 1, self.stride
                ):
                    current_max = 0
                    for i in range(self.kernel_size):
                        for j in range(self.kernel_size):
                            current_max = max(
                                current_max, data[row_index + i][column_index + j]
                            )
                    row.append(current_max)
                new_data.append(row)
            output.append(new_data)
        print("Size of data", len(output[0]))
        print("Number of output", len(output))
        print("Input shape: ", input.shape)

        pooled_height = (height - self.kernel_size) // self.stride + 1
        pooled_width = (width - self.kernel_size) // self.stride + 1
        output = np.zeros((batch_size, channels, pooled_height, pooled_width))

        for b in range(batch_size):
            for c in range(channels):
                for i in range(pooled_height):
                    for j in range(pooled_width):
                        h_start = i * self.stride
                        h_end = h_start + self.kernel_size
                        w_start = j * self.stride
                        w_end = w_start + self.kernel_size
                        output[b, c, i, j] = np.max(input.data[b, c, h_start:h_end, w_start:w_end])

        print("\n")
        return output
        return Tensor(output)
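
# A minimal usage sketch (hypothetical data; assumes gigatorch's Tensor wraps
# a NumPy array and exposes .shape and .data, as the code above does):
#   pool = MaxPool2D(kernel_size=2)          # stride defaults to kernel_size
#   x = Tensor(np.random.rand(1, 3, 4, 4))   # (batch, channels, height, width)
#   y = pool.compute(x)                      # y.shape == (1, 3, 2, 2)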


class Conv2D(Compute):
@@ -88,45 +88,97 @@ def __init__(self, in_channels, out_channels, kernel_size, activation_fn, stride
        self.activation_fn = activation_fn
        self.stride = stride

    def compute(self, data_list):
        print("computing conv2d")
        print("Size of data", data_list.shape)
        output = Tensor([])
        print("Number of kernels", self.kernels.shape)
        # Iterate for out_channels number of times
        for i in range(self.kernels.shape[0]):
            print("output", i)
            for layer_index in range(data_list.shape[0]):
                print("layer index", layer_index)
                data = data_list[layer_index]
                kernel = self.kernels[layer_index]
                print("data", data.shape)
                print("kernel", kernel.shape)

                if data.shape[0] < self.kernel_size or data.shape[1] < self.kernel_size:
                    raise Exception("Received data is smaller than the kernel_size")

                new_data = []
                for row_index in range(
                    0, len(data) - self.kernel_size + 1, self.stride
                ):
                    row = []
                    for column_index in range(len(data[0]) - self.kernel_size + 1):
                        sum = Tensor(0)
                        for i in range(self.kernel_size):
                            for j in range(self.kernel_size):
                                sum += (
                                    data[row_index + i][column_index + j] * kernel[i][j]
                                )
                        row.append(self.activation_fn(sum))
                    new_data.append(row)
                output.append(new_data)

        print("Size of data", len(output[0]))
        print("Number of output", len(output))
        print("\n")
        return output
    def compute(self, input):
        (batch_size, _, height, width) = input.shape
        output_height = (height - self.kernel_size) // self.stride + 1
        output_width = (width - self.kernel_size) // self.stride + 1
        output = Tensor(np.zeros((batch_size, self.out_channels, output_height, output_width)))

        for b in range(batch_size):
            for k in range(self.out_channels):
                for i in range(output_height):
                    for j in range(output_width):
                        h_start = i * self.stride
                        h_end = h_start + self.kernel_size
                        w_start = j * self.stride
                        w_end = w_start + self.kernel_size
                        output[b, k, i, j] = self.activation_fn(
                            np.sum(input[b, :, h_start:h_end, w_start:w_end] * self.kernels[k])
                        )

        return output
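
# Shape check (hypothetical numbers): a 28x28 input with kernel_size=3 and
# stride=1 gives output_height = (28 - 3) // 1 + 1 = 26, so an input of shape
# (2, in_channels, 28, 28) produces an output of shape (2, out_channels, 26, 26).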

class CNN:
    def __init__(self, train_data_dir, test_data_dir, categories):
1 change: 0 additions & 1 deletion gigatorch/nn.py
@@ -1,5 +1,4 @@
import random
from typing import List
from gigatorch.tensor import Tensor


1 change: 0 additions & 1 deletion gigatorch/tensor.py
@@ -108,7 +108,6 @@ def _backprop():
            self.grad += (out.data > 0) * out.grad

        out._backprop = _backprop

        return out

    def to(self, new_type):
21 changes: 21 additions & 0 deletions requirements.txt
@@ -0,0 +1,21 @@
black==24.2.0
click==8.1.7
filelock==3.13.1
fsspec==2024.2.0
iniconfig==2.0.0
Jinja2==3.1.3
MarkupSafe==2.1.5
mpmath==1.3.0
mypy-extensions==1.0.0
networkx==3.2.1
numpy==1.26.4
packaging==23.2
pathspec==0.12.1
pillow==10.2.0
platformdirs==4.2.0
pluggy==1.4.0
pytest==8.0.2
setuptools-black==0.1.5
sympy==1.12
torch==2.2.1
typing_extensions==4.10.0
Binary file added temp/0/1.png
Binary file added temp/0/108.png
Binary file added temp/0/114.png
Binary file added temp/0/118.png
Binary file added temp/0/21.png
Binary file added temp/0/34.png
Binary file added temp/0/37.png
Binary file added temp/0/51.png
Binary file added temp/0/56.png
Binary file added temp/0/63.png
Binary file added temp/0/68.png
Binary file added temp/0/69.png
Binary file added temp/0/75.png
Binary file added temp/0/81.png
Binary file added temp/0/88.png
Binary file added temp/0/95.png