diff --git a/Deep Learning from Scratch in Python/Gradient Descent Optimization Algorithms.ipynb b/Deep Learning from Scratch in Python/Gradient Descent Optimization Algorithms.ipynb index d8dc501..81b0980 100644 --- a/Deep Learning 
from Scratch in Python/Gradient Descent Optimization Algorithms.ipynb +++ b/Deep Learning from Scratch in Python/Gradient Descent Optimization Algorithms.ipynb @@ -11,7 +11,8 @@ "- [Adagrad](https://youtu.be/EGt-UOIIdDk)\n", "- [RMSprop](https://youtu.be/nLCuzsQaAKE)\n", "- [AdaDelta](https://youtu.be/6gvh0IySNMs)\n", - "- Adam\n", + "- [Adam](https://youtu.be/6nqV58NA_Ew)\n", + "- [Nesterov](https://youtu.be/6FrBXv9OcqE)\n", "\n", "## Tests\n", "In order to demonstrate the algorithms capabilities to optimize a function we used these simple test setup:\n", @@ -20,18 +21,22 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", + "import copy\n", "from numpy.random import permutation\n", "\n", "class Line():\n", + " \"\"\"\n", + " Linear Model with two weights w0 (intercept) and w1 (slope)\n", + " \"\"\"\n", " def __init__(self):\n", - " self.w0 = np.random.uniform(0,1,1)\n", - " self.w1 = np.random.uniform(0,1,1)\n", - " \n", + " self.weights = [np.random.uniform(0,1,1) for _ in range(2)]\n", + " self.derivative_funcs = [self.dx_w0, self.dx_w1]\n", + " \n", " def evaluate(self,x):\n", " \"\"\"\n", " evaluate: will evaluate the line yhate given x\n", @@ -39,34 +44,51 @@ "\n", " return the result of the function evalutation\n", " \"\"\"\n", - " return self.w0 + self.w1*x\n", + " return self.weights[0] + self.weights[1]*x\n", + "\n", + " def derivate(self, x, y):\n", + " \"\"\"\n", + " derivate: will calculate all partial derivatives and return them\n", + " input:\n", + " x: a point in the plane\n", + " y: the response of the point x\n", + " \n", + " output:\n", + " partial_derivatives: an array of partial derivatives\n", + " \"\"\"\n", + " partial_derivatives = []\n", + " \n", + " yhat = self.evaluate(x)\n", + " partial_derivatives.append(self.dx_w0(x, y, yhat))\n", + " partial_derivatives.append(self.dx_w1(x, y, yhat))\n", + " \n", + " return partial_derivatives\n", " \n", - " def dx_w0(self, x, y):\n", + " def dx_w0(self, x, y, yhat):\n", " \"\"\"\n", " dx_w0: partial derivative of the weight w0\n", " x: a point in the plane\n", " y: the response of the point x\n", + " yhat: the current approximation of y given x and the weights\n", "\n", " return the gradient at that point for this x and y for w0\n", " \"\"\"\n", - " yhat = self.evaluate(x)\n", " return 2*(yhat - y)\n", - " \n", " \n", - " def dx_w1(self, x, y):\n", + " def dx_w1(self, x, y, yhat):\n", " \"\"\"\n", " dx_w1: partial derivative of the weight w1 for a linear function\n", " x: a point in the plane\n", " y: the response of the point x\n", + " yhat: the current approximation of y given x and the weights\n", "\n", " return the gradient at that point for this x and y for w1\n", " \"\"\" \n", - " yhat = self.evaluate(x)\n", " return 2*x*(yhat - y)\n", "\n", " def __str__(self):\n", - " return f\"y = {self.w0[0]} + {self.w1[0]}*x\"\n", - " \n", + " return f\"y = {self.weights[0]} + {self.weights[1]}*x\"\n", + " \n", " \n", "#################### Helper functions ######################\n", "def stochastic_sample(xs, ys):\n", @@ -84,9 +106,10 @@ " return x, y\n", " \n", " \n", - "def gradient(dx, xs, ys):\n", + "def gradient(dx, evaluate, xs, ys):\n", " \"\"\"\n", " gradient: estimate mean gradient over all point for w1\n", + " evaluate: the evaulation function from the model\n", " dx: partial derivative function used to evaluate the gradient\n", " xs: all point on the plane\n", " ys: all response on the plane\n", @@ -97,7 +120,8 @@ " \n", " 
total = 0\n", " for x,y in zip(xs,ys):\n", - " total = total + dx(x, y)\n", + " yhat = evaluate(x)\n", + " total = total + dx(x, y, yhat)\n", " \n", " gradient = total/N\n", " return gradient\n", @@ -115,8 +139,8 @@ " \"\"\" \n", "\n", " for i in range(max_num_iteration):\n", - " model.w0 = model.w0 - learning_rate*gradient(model.dx_w0, xs, ys)\n", - " model.w1 = model.w1 - learning_rate*gradient(model.dx_w1, xs, ys)\n", + " # Updating the model parameters\n", + " model.weights = [weight - learning_rate*gradient(derivative_func, model.evaluate, xs, ys) for weight, derivative_func in zip(model.weights, model.derivative_funcs)]\n", " \n", " if i % 100 == 0:\n", " print(f\"Iteration {i}\")\n", @@ -139,9 +163,7 @@ " x, y = stochastic_sample(xs, ys)\n", " \n", " # Updating the model parameters\n", - " model.w0 = model.w0 - learning_rate*model.dx_w0(x, y)\n", - " model.w1 = model.w1 - learning_rate*model.dx_w1(x, y)\n", - " \n", + " model.weights = [weight - learning_rate*derivative for weight, derivative in zip(model.weights, model.derivate(x,y))] \n", " \n", " if i % 100 == 0:\n", " print(f\"Iteration {i}\")\n", @@ -159,31 +181,26 @@ " max_num_iteration: the number of iteration before we stop updating\n", " \"\"\"\n", " \n", - " # These are needed to keep track of the previous gradient\n", - " prev_g0 = 0\n", - " prev_g1 = 0\n", + " # Create the gradient that we keep track as an array of 0 of the same size as the number of weights\n", + " gradients = [0 for _ in range(len(model.weights))]\n", " \n", " for i in range(max_num_iteration):\n", " \n", " # Select a random x and y\n", " x, y = stochastic_sample(xs, ys)\n", "\n", - " g0 = decay_factor*prev_g0 - learning_rate*model.dx_w0(x,y)\n", - " g1 = decay_factor*prev_g1 - learning_rate*model.dx_w1(x,y)\n", + " # Calculate the new gradients\n", + " gradients = [decay_factor*g + learning_rate*derivative for g, derivative in zip(gradients, model.derivate(x,y))]\n", " \n", " # Updating the model parameters\n", - " model.w0 = model.w0 + g0\n", - " model.w1 = model.w1 + g1\n", - " \n", - " # swap previous gradient\n", - " prev_g0, prev_g1 = g0, g1\n", + " model.weights = [weight - g for weight, g in zip(model.weights, gradients)]\n", " \n", " if i % 100 == 0:\n", " print(f\"Iteration {i}\")\n", " print(model)\n", " \n", " \n", - "def adagrad(model, xs, ys, learning_rate = 0.1, max_num_iteration = 10000, eps=0.0000001):\n", + "def adagrad(model, xs, ys, learning_rate = 0.1, max_num_iteration = 1000, eps=0.0000001):\n", " \"\"\"\n", " adagrad: will estimate the parameters w0 and w1 \n", " (here it uses least square cost function)\n", @@ -194,31 +211,28 @@ " max_num_iteration: the number of iteration before we stop updating\n", " eps: is a numerical safety to avoid division by 0\n", " \"\"\" \n", + " \n", " # Here only the diagonal matter\n", - " G = [[0,0],\n", - " [0,0]]\n", + " num_param = len(model.weights)\n", + " G = [[0 for _ in range(num_param)] for _ in range(num_param)]\n", " \n", " for i in range(max_num_iteration):\n", " \n", " # Select a random x and y\n", " x, y = stochastic_sample(xs, ys)\n", " \n", - " g0 = model.dx_w0(x, y)\n", - " g1 = model.dx_w1(x, y)\n", - " \n", - " G[0][0] = G[0][0] + g0*g0\n", - " G[1][1] = G[1][1] + g1*g1\n", - " \n", - " model.w0 = model.w0 - (learning_rate/np.sqrt(G[0][0] + eps)) * g0\n", - " model.w1 = model.w1 - (learning_rate/np.sqrt(G[1][1] + eps)) * g1\n", + " # Update G and the model weights iteratively (Note: speed up could be gained from vectorized implementation)\n", + " for idx, gradient in 
enumerate(model.derivate(x, y)):\n", + " G[idx][idx] = G[idx][idx] + gradient**2\n", + " model.weights[idx] = model.weights[idx] - (learning_rate / np.sqrt(G[idx][idx] + eps)) * gradient\n", " \n", " if i % 100 == 0:\n", " print(f\"Iteration {i}\")\n", " print(model)\n", " \n", - "def RMSprop(model, xs, ys, learning_rate = 0.01, decay_factor = 0.9, max_num_iteration = 10000, eps=0.0000001):\n", + "def rmsprop(model, xs, ys, learning_rate = 0.01, decay_factor = 0.9, max_num_iteration = 10000, eps=0.0000001):\n", " \"\"\"\n", - " RMSprop: will estimate the parameters w0 and w1 \n", + " rmsprop: will estimate the parameters w0 and w1 \n", " (here it uses least square cost function)\n", " model: the model we are trying to optimize using sgd\n", " xs: all point on the plane\n", @@ -230,21 +244,18 @@ " \"\"\" \n", " \n", " # Running average\n", - " E = [0,0]\n", + " E = [0 for _ in range(len(model.weights))]\n", " \n", " for i in range(max_num_iteration):\n", " \n", " # Select a random x and y\n", " x, y = stochastic_sample(xs, ys)\n", " \n", - " g0 = model.dx_w0(x, y)\n", - " g1 = model.dx_w1(x, y)\n", - " \n", - " E[0] = decay_factor*E[0] + (1-decay_factor)*g0*g0\n", - " E[1] = decay_factor*E[1] + (1-decay_factor)*g1*g1\n", - " \n", - " model.w0 = model.w0 - (learning_rate/np.sqrt(E[0] + eps)) * g0\n", - " model.w1 = model.w1 - (learning_rate/np.sqrt(E[1] + eps)) * g1\n", + " # Update E and the model weights iteratively (Note: speed up could be gained from vectorized implementation)\n", + " for idx, gradient in enumerate(model.derivate(x, y)): \n", + " E[idx] = decay_factor*E[idx] + (1 - decay_factor)*(gradient**2)\n", + " model.weights[idx] = model.weights[idx] - (learning_rate/np.sqrt(E[idx] + eps))*gradient\n", + "\n", " \n", " if i % 100 == 0:\n", " print(f\"Iteration {i}\")\n", @@ -261,35 +272,31 @@ " eps: is a numerical safety to avoid division by 0\n", " \"\"\" \n", " \n", - " # Running average\n", - " E_g = [0,0] # for gradient\n", - " E_p = [0,0] # for parameters\n", - " delta_p = [0,0] #delta for parameter\n", + " # Init Running Averages\n", + " num_param = len(model.weights)\n", + " E_g = [0 for _ in range(num_param)]\n", + " E_p = [0 for _ in range(num_param)]\n", + " delta_p = [0 for _ in range(num_param)]\n", + " \n", " \n", " for i in range(max_num_iteration):\n", " \n", " # Select a random x and y\n", " x, y = stochastic_sample(xs, ys)\n", " \n", - " g0 = model.dx_w0(x, y)\n", - " g1 = model.dx_w1(x, y)\n", - " \n", - " # Get the running average for the gradient\n", - " E_g[0] = decay_factor*E_g[0] + (1-decay_factor)*g0*g0\n", - " E_g[1] = decay_factor*E_g[1] + (1-decay_factor)*g1*g1\n", - " \n", - " # Get the running average for the parameters\n", - " E_p[0] = decay_factor*E_p[0] + (1-decay_factor)*delta_p[0]*delta_p[0]\n", - " E_p[1] = decay_factor*E_p[1] + (1-decay_factor)*delta_p[1]*delta_p[1]\n", - " \n", - " # Calculate the gradient difference\n", - " delta_p[0] = - np.sqrt(E_p[0] + eps) / np.sqrt(E_g[0] + eps) * g0\n", - " delta_p[1] = - np.sqrt(E_p[1] + eps) / np.sqrt(E_g[1] + eps) * g1\n", - " \n", - " # update the models\n", - " model.w0 = model.w0 + delta_p[0]\n", - " model.w1 = model.w1 + delta_p[1]\n", - " \n", + " for idx, gradient in enumerate(model.derivate(x, y)):\n", + " # Get the running average for the gradient\n", + " E_g[idx] = decay_factor*E_g[idx] + (1 - decay_factor)*(gradient**2)\n", + " \n", + " # Get the running average for the parameters\n", + " E_p[idx] = decay_factor*E_p[idx] + (1 - decay_factor)*(delta_p[idx]**2)\n", + " \n", + " # Calculate the 
parameter update (delta)\n", + " delta_p[idx] = - np.sqrt(E_p[idx] + eps) / np.sqrt(E_g[idx] + eps) * gradient\n", + " \n", + " # update the model weight\n", + " model.weights[idx] = model.weights[idx] + delta_p[idx]\n", + " \n", " \n", " if i % 100 == 0:\n", " print(f\"Iteration {i}\")\n", @@ -311,16 +318,18 @@ " \n", " \n", " # Variable Initialization\n", - " m = [0, 0] # two m for each parameter\n", - " v = [0, 0] # two v for each parameter\n", - " g = [0, 0] # two gradient\n", - " t = 1 # time steps\n", + " num_param = len(model.weights)\n", + " m = [0 for _ in range(num_param)] # one m per parameter\n", + " v = [0 for _ in range(num_param)] # one v per parameter\n", + " g = [0 for _ in range(num_param)] # one gradient per parameter\n", " \n", - " for i in range(max_iteration):\n", + " for t in range(1, max_iteration + 1):\n", + " \n", " # Calculate the gradients \n", " x, y = stochastic_sample(xs, ys)\n", - " g[0] = model.dx_w0(x, y)\n", - " g[1] = model.dx_w1(x, y)\n", + " \n", + " # Get the partial derivatives\n", + " g = model.derivate(x, y)\n", "\n", " # Update the m and v parameter\n", " m = [b1*m_i + (1 - b1)*g_i for m_i, g_i in zip(m, g)]\n", @@ -331,49 +340,83 @@ " v_cor = [v_i / (1 - (b2**t)) for v_i in v]\n", "\n", " # Update the parameter\n", - " model.w0 = model.w0 - (learning_rate / (np.sqrt(v_cor[0]) + epsilon))*m_cor[0]\n", - " model.w1 = model.w1 - (learning_rate / (np.sqrt(v_cor[1]) + epsilon))*m_cor[1]\n", - "\n", - " t = t + 1\n", + " model.weights = [weight - (learning_rate / (np.sqrt(v_cor_i) + epsilon))*m_cor_i for weight, v_cor_i, m_cor_i in zip(model.weights, v_cor, m_cor)]\n", " \n", - " if i % 100 == 0:\n", - " print(f\"Iteration {i}\")\n", + " if t % 100 == 0:\n", + " print(f\"Iteration {t}\")\n", " print(model)\n", " \n", - " " + "def nesterov(model, xs, ys, learning_rate = 0.01, decay_factor = 0.9, max_num_iteration = 1000):\n", + " \"\"\"\n", + " nesterov: the Nesterov accelerated gradient (NAG) optimizer, which builds upon momentum\n", + " model: the model we want to optimize the parameter on (this is a line right now)\n", + " xs: the features of the dataset\n", + " ys: the continuous values (target)\n", + " learning_rate: the step size used for the weight updates\n", + " decay_factor: determines the relative contribution of the current gradient and earlier gradients to the weight change\n", + " max_num_iteration: the number of iteration before we stop updating\n", + " \"\"\"\n", + " \n", + " # These are needed to keep track of the previous gradient\n", + " g = [0 for _ in range(len(model.weights))] \n", + " g0 = 0\n", + " g1 = 0\n", + " \n", + " for i in range(max_num_iteration):\n", + " \n", + " # Select a random x and y\n", + " x, y = stochastic_sample(xs, ys)\n", + "\n", + " # Calculate the gradient for each weight by predicting (approximately) where the ball will be\n", + " for idx in range(len(model.weights)):\n", + " \n", + " # Here we need to do a bit of gymnastics because of how the code is set up\n", + " # We save the parameter state, shift it to the look-ahead position, evaluate the gradient there and then reset it\n", + " # The actual update happens below\n", + " prev_weight = model.weights[idx]\n", + " model.weights[idx] = model.weights[idx] - decay_factor*g[idx] # look-ahead position\n", + " g[idx] = decay_factor*g[idx] + learning_rate*model.derivate(x, y)[idx] # gradient at the look-ahead point\n", + " model.weights[idx] = prev_weight\n", + " \n", + " # Update the model parameter\n", + " model.weights[idx] = model.weights[idx] - g[idx]\n", + " \n", + " if i % 100 == 0:\n", + " print(f\"Iteration {i}\")\n", + 
" print(model)" ] }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Adam\n", + "Nesterov Accelerated Gradient\n", "Iteration 0\n", - "y = 0.8575880576520295 + 0.8405747654640998*x\n", + "y = [0.89010029] + [0.35356173]*x\n", "Iteration 100\n", - "y = -0.001590081073315357 + 0.9984342651063493*x\n", + "y = [0.3723535] + [0.9655646]*x\n", "Iteration 200\n", - "y = 2.70122392056152e-05 + 1.0001344335031852*x\n", + "y = [0.01159183] + [0.9884176]*x\n", "Iteration 300\n", - "y = -1.4876611515003004e-06 + 1.0000033575642253*x\n", + "y = [0.00013029] + [1.00053685]*x\n", "Iteration 400\n", - "y = 1.030624925582267e-05 + 0.9999970769162957*x\n", + "y = [0.00110482] + [1.00002591]*x\n", "Iteration 500\n", - "y = 2.567876097403007e-06 + 0.9999995341571982*x\n", + "y = [1.20330159e-05] + [0.99999153]*x\n", "Iteration 600\n", - "y = 3.001062476619204e-08 + 0.9999999996624379*x\n", + "y = [4.34325924e-05] + [1.00000075]*x\n", "Iteration 700\n", - "y = 3.5509149827671966e-09 + 0.9999999957864142*x\n", + "y = [-1.87460485e-05] + [1.00003432]*x\n", "Iteration 800\n", - "y = 2.013987394989773e-08 + 0.9999999952565595*x\n", + "y = [1.26114336e-05] + [0.99998661]*x\n", "Iteration 900\n", - "y = -2.4103486747254733e-08 + 0.9999999984873583*x\n", - "y = 4.7124723585610135e-07 + 0.9999992952944199*x\n" + "y = [-1.84626241e-06] + [1.0000026]*x\n", + "y = [4.67870626e-06] + [0.99999889]*x\n" ] } ], @@ -382,7 +425,7 @@ "xs = [1,2,3,4,5,6,7]\n", "ys = [1,2,3,4,5,6,7]\n", "\n", - "\n", + "'''\n", "# Gradient Descent\n", "model = Line()\n", "print(\"Gradient Descent: \")\n", @@ -401,6 +444,7 @@ "sgd_momentum(model, xs, ys)\n", "print(model)\n", "\n", + "\n", "# Adagrad\n", "model = Line()\n", "print(\"Adagrad\")\n", @@ -410,7 +454,7 @@ "# RMSprop\n", "model = Line()\n", "print(\"RMSprop\")\n", - "RMSprop(model, xs, ys)\n", + "rmsprop(model, xs, ys)\n", "print(model)\n", "\n", "# Adadelta\n", @@ -424,40 +468,48 @@ "model = Line()\n", "print(\"Adam\")\n", "adam(model, xs, ys)\n", + "print(model)\n", + "\n", + "'''\n", + "\n", + "# Nesterov Accelerated Gradient\n", + "model = Line()\n", + "print(\"Nesterov Accelerated Gradient\")\n", + "nesterov(model, xs, ys)\n", "print(model)" ] }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Adam\n", + "Nesterov Accelerated Gradient\n", "Iteration 0\n", - "y = 0.7770901717569257 + 0.9433753832651581*x\n", + "y = [0.66878322] + [0.90014342]*x\n", "Iteration 100\n", - "y = 0.32725296586754515 + 1.900221970247115*x\n", + "y = [-0.17903726] + [2.40553571]*x\n", "Iteration 200\n", - "y = 0.004147083459202631 + 1.99892111204854*x\n", + "y = [0.05899865] + [2.04887923]*x\n", "Iteration 300\n", - "y = -3.4688751132678575e-05 + 2.000013509462116*x\n", + "y = [0.00894693] + [1.97536479]*x\n", "Iteration 400\n", - "y = -9.06655931188767e-08 + 1.9999999764669432*x\n", + "y = [0.01374569] + [2.05300722]*x\n", "Iteration 500\n", - "y = 1.8387422421644338e-10 + 2.0000000000594307*x\n", + "y = [0.10548793] + [1.91493233]*x\n", "Iteration 600\n", - "y = 4.361065425337935e-12 + 1.9999999999993485*x\n", + "y = [0.00385495] + [2.01632264]*x\n", "Iteration 700\n", - "y = 1.1991224399817034e-13 + 1.9999999999999527*x\n", + "y = [0.05427682] + [2.13589417]*x\n", "Iteration 800\n", - "y = -2.5150603804494032e-17 + 2.000000000000001*x\n", + "y = [0.0113579] + [2.00570216]*x\n", 
"Iteration 900\n", - "y = -8.298690846122729e-17 + 2.0*x\n", - "y = 9.891579815200104e-17 + 2.0*x\n" + "y = [-0.02275247] + [1.96719449]*x\n", + "y = [-0.00900316] + [2.00493823]*x\n" ] } ], @@ -466,7 +518,7 @@ "xs = [1,2,3,4,5,6,7]\n", "ys = [2,4,6,8,10,12,14]\n", "\n", - "\n", + "'''\n", "# Gradient Descent\n", "model = Line()\n", "print(\"Gradient Descent: \")\n", @@ -494,7 +546,7 @@ "# RMSprop\n", "model = Line()\n", "print(\"RMSprop\")\n", - "RMSprop(model, xs, ys)\n", + "rmsprop(model, xs, ys)\n", "print(model)\n", "\n", "# Adadelta\n", @@ -507,40 +559,47 @@ "model = Line()\n", "print(\"Adam\")\n", "adam(model, xs, ys)\n", + "print(model)\n", + "'''\n", + "\n", + "# Nesterov Accelerated Gradient\n", + "model = Line()\n", + "print(\"Nesterov Accelerated Gradient\")\n", + "nesterov(model, xs, ys)\n", "print(model)" ] }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Adam\n", + "Nesterov Accelerated Gradient\n", "Iteration 0\n", - "y = 0.7139491962653317 + 0.6462402260655279*x\n", + "y = [0.30475578] + [0.9422567]*x\n", "Iteration 100\n", - "y = 1.5012704143511661 + 1.913754894630153*x\n", + "y = [-0.39695759] + [2.51820116]*x\n", "Iteration 200\n", - "y = 1.147275006702479 + 1.9626892285633653*x\n", + "y = [-1.48096144] + [3.46703259]*x\n", "Iteration 300\n", - "y = 1.0187321764566213 + 1.9959358177402393*x\n", + "y = [1.90685378] + [2.11784826]*x\n", "Iteration 400\n", - "y = 1.0018116802276642 + 1.9995771873157036*x\n", + "y = [1.03772983] + [1.98962059]*x\n", "Iteration 500\n", - "y = 1.0001550519477989 + 1.9999713016943046*x\n", + "y = [1.00457709] + [2.05050135]*x\n", "Iteration 600\n", - "y = 1.0000046235977667 + 1.9999989555915936*x\n", + "y = [0.99465739] + [1.99190331]*x\n", "Iteration 700\n", - "y = 0.9999999689046957 + 2.0000000091643306*x\n", + "y = [1.00337418] + [1.99869932]*x\n", "Iteration 800\n", - "y = 1.0000000007114835 + 1.999999999822266*x\n", + "y = [0.99671896] + [1.98843029]*x\n", "Iteration 900\n", - "y = 1.000000000010868 + 2.0000000000003593*x\n", - "y = 0.9999999999999609 + 2.000000000000028*x\n" + "y = [1.00465637] + [1.99875298]*x\n", + "y = [0.99462305] + [2.00330215]*x\n" ] } ], @@ -548,7 +607,7 @@ "# Here we have a simple line with intercept = 1 and slope = 2\n", "xs = [1,2,3,4,5,6,7]\n", "ys = [3,5,7,9,11,13,15]\n", - "\n", + "'''\n", "# Gradient Descent\n", "model = Line()\n", "print(\"Gradient Descent: \")\n", @@ -576,7 +635,7 @@ "# RMSprop\n", "model = Line()\n", "print(\"RMSprop\")\n", - "RMSprop(model, xs, ys)\n", + "rmsprop(model, xs, ys)\n", "print(model)\n", "\n", "# Adadelta\n", @@ -589,6 +648,13 @@ "model = Line()\n", "print(\"Adam\")\n", "adam(model, xs, ys)\n", + "print(model)\n", + "'''\n", + "\n", + "# Nesterov Accelerated Gradient\n", + "model = Line()\n", + "print(\"Nesterov Accelerated Gradient\")\n", + "nesterov(model, xs, ys)\n", "print(model)" ] },