diff --git a/Deep Learning from Scratch in Python/Gradient Descent Optimization Algorithms.ipynb b/Deep Learning from Scratch in Python/Gradient Descent Optimization Algorithms.ipynb index d8dc501..81b0980 100644 --- a/Deep Learning 
from Scratch in Python/Gradient Descent Optimization Algorithms.ipynb +++ b/Deep Learning from Scratch in Python/Gradient Descent Optimization Algorithms.ipynb @@ -11,7 +11,8 @@ "- [Adagrad](https://youtu.be/EGt-UOIIdDk)\n", "- [RMSprop](https://youtu.be/nLCuzsQaAKE)\n", "- [AdaDelta](https://youtu.be/6gvh0IySNMs)\n", - "- Adam\n", + "- [Adam](https://youtu.be/6nqV58NA_Ew)\n", + "- [Nesterov](https://youtu.be/6FrBXv9OcqE)\n", "\n", "## Tests\n", "In order to demonstrate the algorithms capabilities to optimize a function we used these simple test setup:\n", @@ -20,18 +21,22 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", + "import copy\n", "from numpy.random import permutation\n", "\n", "class Line():\n", + " \"\"\"\n", + " Linear Model with two weights w0 (intercept) and w1 (slope)\n", + " \"\"\"\n", " def __init__(self):\n", - " self.w0 = np.random.uniform(0,1,1)\n", - " self.w1 = np.random.uniform(0,1,1)\n", - " \n", + " self.weights = [np.random.uniform(0,1,1) for _ in range(2)]\n", + " self.derivative_funcs = [self.dx_w0, self.dx_w1]\n", + " \n", " def evaluate(self,x):\n", " \"\"\"\n", " evaluate: will evaluate the line yhate given x\n", @@ -39,34 +44,51 @@ "\n", " return the result of the function evalutation\n", " \"\"\"\n", - " return self.w0 + self.w1*x\n", + " return self.weights[0] + self.weights[1]*x\n", + "\n", + " def derivate(self, x, y):\n", + " \"\"\"\n", + " derivate: will calculate all partial derivatives and return them\n", + " input:\n", + " x: a point in the plane\n", + " y: the response of the point x\n", + " \n", + " output:\n", + " partial_derivatives: an array of partial derivatives\n", + " \"\"\"\n", + " partial_derivatives = []\n", + " \n", + " yhat = self.evaluate(x)\n", + " partial_derivatives.append(self.dx_w0(x, y, yhat))\n", + " partial_derivatives.append(self.dx_w1(x, y, yhat))\n", + " \n", + " return partial_derivatives\n", " \n", - " def dx_w0(self, x, y):\n", + " def dx_w0(self, x, y, yhat):\n", " \"\"\"\n", " dx_w0: partial derivative of the weight w0\n", " x: a point in the plane\n", " y: the response of the point x\n", + " yhat: the current approximation of y given x and the weights\n", "\n", " return the gradient at that point for this x and y for w0\n", " \"\"\"\n", - " yhat = self.evaluate(x)\n", " return 2*(yhat - y)\n", - " \n", " \n", - " def dx_w1(self, x, y):\n", + " def dx_w1(self, x, y, yhat):\n", " \"\"\"\n", " dx_w1: partial derivative of the weight w1 for a linear function\n", " x: a point in the plane\n", " y: the response of the point x\n", + " yhat: the current approximation of y given x and the weights\n", "\n", " return the gradient at that point for this x and y for w1\n", " \"\"\" \n", - " yhat = self.evaluate(x)\n", " return 2*x*(yhat - y)\n", "\n", " def __str__(self):\n", - " return f\"y = {self.w0[0]} + {self.w1[0]}*x\"\n", - " \n", + " return f\"y = {self.weights[0]} + {self.weights[1]}*x\"\n", + " \n", " \n", "#################### Helper functions ######################\n", "def stochastic_sample(xs, ys):\n", @@ -84,9 +106,10 @@ " return x, y\n", " \n", " \n", - "def gradient(dx, xs, ys):\n", + "def gradient(dx, evaluate, xs, ys):\n", " \"\"\"\n", " gradient: estimate mean gradient over all point for w1\n", + " evaluate: the evaulation function from the model\n", " dx: partial derivative function used to evaluate the gradient\n", " xs: all point on the plane\n", " ys: all response on the plane\n", @@ -97,7 +120,8 @@ " \n", " 
total = 0\n", " for x,y in zip(xs,ys):\n", - " total = total + dx(x, y)\n", + " yhat = evaluate(x)\n", + " total = total + dx(x, y, yhat)\n", " \n", " gradient = total/N\n", " return gradient\n", @@ -115,8 +139,8 @@ " \"\"\" \n", "\n", " for i in range(max_num_iteration):\n", - " model.w0 = model.w0 - learning_rate*gradient(model.dx_w0, xs, ys)\n", - " model.w1 = model.w1 - learning_rate*gradient(model.dx_w1, xs, ys)\n", + " # Updating the model parameters\n", + " model.weights = [weight - learning_rate*gradient(derivative_func, model.evaluate, xs, ys) for weight, derivative_func in zip(model.weights, model.derivative_funcs)]\n", " \n", " if i % 100 == 0:\n", " print(f\"Iteration {i}\")\n", @@ -139,9 +163,7 @@ " x, y = stochastic_sample(xs, ys)\n", " \n", " # Updating the model parameters\n", - " model.w0 = model.w0 - learning_rate*model.dx_w0(x, y)\n", - " model.w1 = model.w1 - learning_rate*model.dx_w1(x, y)\n", - " \n", + " model.weights = [weight - learning_rate*derivative for weight, derivative in zip(model.weights, model.derivate(x,y))] \n", " \n", " if i % 100 == 0:\n", " print(f\"Iteration {i}\")\n", @@ -159,31 +181,26 @@ " max_num_iteration: the number of iteration before we stop updating\n", " \"\"\"\n", " \n", - " # These are needed to keep track of the previous gradient\n", - " prev_g0 = 0\n", - " prev_g1 = 0\n", + " # Create the gradient that we keep track as an array of 0 of the same size as the number of weights\n", + " gradients = [0 for _ in range(len(model.weights))]\n", " \n", " for i in range(max_num_iteration):\n", " \n", " # Select a random x and y\n", " x, y = stochastic_sample(xs, ys)\n", "\n", - " g0 = decay_factor*prev_g0 - learning_rate*model.dx_w0(x,y)\n", - " g1 = decay_factor*prev_g1 - learning_rate*model.dx_w1(x,y)\n", + " # Calculate the new gradients\n", + " gradients = [decay_factor*g + learning_rate*derivative for g, derivative in zip(gradients, model.derivate(x,y))]\n", " \n", " # Updating the model parameters\n", - " model.w0 = model.w0 + g0\n", - " model.w1 = model.w1 + g1\n", - " \n", - " # swap previous gradient\n", - " prev_g0, prev_g1 = g0, g1\n", + " model.weights = [weight - g for weight, g in zip(model.weights, gradients)]\n", " \n", " if i % 100 == 0:\n", " print(f\"Iteration {i}\")\n", " print(model)\n", " \n", " \n", - "def adagrad(model, xs, ys, learning_rate = 0.1, max_num_iteration = 10000, eps=0.0000001):\n", + "def adagrad(model, xs, ys, learning_rate = 0.1, max_num_iteration = 1000, eps=0.0000001):\n", " \"\"\"\n", " adagrad: will estimate the parameters w0 and w1 \n", " (here it uses least square cost function)\n", @@ -194,31 +211,28 @@ " max_num_iteration: the number of iteration before we stop updating\n", " eps: is a numerical safety to avoid division by 0\n", " \"\"\" \n", + " \n", " # Here only the diagonal matter\n", - " G = [[0,0],\n", - " [0,0]]\n", + " num_param = len(model.weights)\n", + " G = [[0 for _ in range(num_param)] for _ in range(num_param)]\n", " \n", " for i in range(max_num_iteration):\n", " \n", " # Select a random x and y\n", " x, y = stochastic_sample(xs, ys)\n", " \n", - " g0 = model.dx_w0(x, y)\n", - " g1 = model.dx_w1(x, y)\n", - " \n", - " G[0][0] = G[0][0] + g0*g0\n", - " G[1][1] = G[1][1] + g1*g1\n", - " \n", - " model.w0 = model.w0 - (learning_rate/np.sqrt(G[0][0] + eps)) * g0\n", - " model.w1 = model.w1 - (learning_rate/np.sqrt(G[1][1] + eps)) * g1\n", + " # Update G and the model weights iteratively (Note: speed up could be gained from vectorized implementation)\n", + " for idx, gradient in 
enumerate(model.derivate(x, y)):\n", + " G[idx][idx] = G[idx][idx] + gradient**2\n", + " model.weights[idx] = model.weights[idx] - (learning_rate / np.sqrt(G[idx][idx] + eps)) * gradient\n", " \n", " if i % 100 == 0:\n", " print(f\"Iteration {i}\")\n", " print(model)\n", " \n", - "def RMSprop(model, xs, ys, learning_rate = 0.01, decay_factor = 0.9, max_num_iteration = 10000, eps=0.0000001):\n", + "def rmsprop(model, xs, ys, learning_rate = 0.01, decay_factor = 0.9, max_num_iteration = 10000, eps=0.0000001):\n", " \"\"\"\n", - " RMSprop: will estimate the parameters w0 and w1 \n", + " rmsprop: will estimate the parameters w0 and w1 \n", " (here it uses least square cost function)\n", " model: the model we are trying to optimize using sgd\n", " xs: all point on the plane\n", @@ -230,21 +244,18 @@ " \"\"\" \n", " \n", " # Running average\n", - " E = [0,0]\n", + " E = [0 for _ in range(len(model.weights))]\n", " \n", " for i in range(max_num_iteration):\n", " \n", " # Select a random x and y\n", " x, y = stochastic_sample(xs, ys)\n", " \n", - " g0 = model.dx_w0(x, y)\n", - " g1 = model.dx_w1(x, y)\n", - " \n", - " E[0] = decay_factor*E[0] + (1-decay_factor)*g0*g0\n", - " E[1] = decay_factor*E[1] + (1-decay_factor)*g1*g1\n", - " \n", - " model.w0 = model.w0 - (learning_rate/np.sqrt(E[0] + eps)) * g0\n", - " model.w1 = model.w1 - (learning_rate/np.sqrt(E[1] + eps)) * g1\n", + " # Update E and the model weights iteratively (Note: speed up could be gained from vectorized implementation)\n", + " for idx, gradient in enumerate(model.derivate(x, y)): \n", + " E[idx] = decay_factor*E[idx] + (1 - decay_factor)*(gradient**2)\n", + " model.weights[idx] = model.weights[idx] - (learning_rate/np.sqrt(E[idx] + eps))*gradient\n", + "\n", " \n", " if i % 100 == 0:\n", " print(f\"Iteration {i}\")\n", @@ -261,35 +272,31 @@ " eps: is a numerical safety to avoid division by 0\n", " \"\"\" \n", " \n", - " # Running average\n", - " E_g = [0,0] # for gradient\n", - " E_p = [0,0] # for parameters\n", - " delta_p = [0,0] #delta for parameter\n", + " # Init Running Averages\n", + " num_param = len(model.weights)\n", + " E_g = [0 for _ in range(num_param)]\n", + " E_p = [0 for _ in range(num_param)]\n", + " delta_p = [0 for _ in range(num_param)]\n", + " \n", " \n", " for i in range(max_num_iteration):\n", " \n", " # Select a random x and y\n", " x, y = stochastic_sample(xs, ys)\n", " \n", - " g0 = model.dx_w0(x, y)\n", - " g1 = model.dx_w1(x, y)\n", - " \n", - " # Get the running average for the gradient\n", - " E_g[0] = decay_factor*E_g[0] + (1-decay_factor)*g0*g0\n", - " E_g[1] = decay_factor*E_g[1] + (1-decay_factor)*g1*g1\n", - " \n", - " # Get the running average for the parameters\n", - " E_p[0] = decay_factor*E_p[0] + (1-decay_factor)*delta_p[0]*delta_p[0]\n", - " E_p[1] = decay_factor*E_p[1] + (1-decay_factor)*delta_p[1]*delta_p[1]\n", - " \n", - " # Calculate the gradient difference\n", - " delta_p[0] = - np.sqrt(E_p[0] + eps) / np.sqrt(E_g[0] + eps) * g0\n", - " delta_p[1] = - np.sqrt(E_p[1] + eps) / np.sqrt(E_g[1] + eps) * g1\n", - " \n", - " # update the models\n", - " model.w0 = model.w0 + delta_p[0]\n", - " model.w1 = model.w1 + delta_p[1]\n", - " \n", + " for idx, gradient in enumerate(model.derivate(x, y)):\n", + " # Get the running average for the gradient\n", + " E_g[idx] = decay_factor*E_g[idx] + (1 - decay_factor)*(gradient**2)\n", + " \n", + " # Get the running average for the parameters\n", + " E_p[idx] = decay_factor*E_p[idx] + (1 - decay_factor)*(delta_p[idx]**2)\n", + " \n", + " # Calculate the 
parameter update (delta)\n", + " delta_p[idx] = - np.sqrt(E_p[idx] + eps) / np.sqrt(E_g[idx] + eps) * gradient\n", + " \n", + " # update the model weight\n", + " model.weights[idx] = model.weights[idx] + delta_p[idx]\n", + " \n", " \n", " if i % 100 == 0:\n", " print(f\"Iteration {i}\")\n", @@ -311,16 +318,18 @@ " \n", " \n", " # Variable Initialization\n", - " m = [0, 0] # two m for each parameter\n", - " v = [0, 0] # two v for each parameter\n", - " g = [0, 0] # two gradient\n", - " t = 1 # time steps\n", + " num_param = len(model.weights)\n", + " m = [0 for _ in range(num_param)] # one m per parameter\n", + " v = [0 for _ in range(num_param)] # one v per parameter\n", + " g = [0 for _ in range(num_param)] # one gradient per parameter\n", " \n", - " for i in range(max_iteration):\n", + " for t in range(1, max_iteration + 1):\n", + " \n", " # Calculate the gradients \n", " x, y = stochastic_sample(xs, ys)\n", - " g[0] = model.dx_w0(x, y)\n", - " g[1] = model.dx_w1(x, y)\n", + " \n", + " # Get the partial derivatives\n", + " g = model.derivate(x, y)\n", "\n", " # Update the m and v parameter\n", " m = [b1*m_i + (1 - b1)*g_i for m_i, g_i in zip(m, g)]\n", @@ -331,49 +340,83 @@ " v_cor = [v_i / (1 - (b2**t)) for v_i in v]\n", "\n", " # Update the parameter\n", - " model.w0 = model.w0 - (learning_rate / (np.sqrt(v_cor[0]) + epsilon))*m_cor[0]\n", - " model.w1 = model.w1 - (learning_rate / (np.sqrt(v_cor[1]) + epsilon))*m_cor[1]\n", - "\n", - " t = t + 1\n", + " model.weights = [weight - (learning_rate / (np.sqrt(v_cor_i) + epsilon))*m_cor_i for weight, v_cor_i, m_cor_i in zip(model.weights, v_cor, m_cor)]\n", " \n", - " if i % 100 == 0:\n", - " print(f\"Iteration {i}\")\n", + " if t % 100 == 0:\n", + " print(f\"Iteration {t}\")\n", " print(model)\n", " \n", - " " + "def nesterov(model, xs, ys, learning_rate = 0.01, decay_factor = 0.9, max_num_iteration = 1000):\n", + " \"\"\"\n", + " nesterov: the Nesterov accelerated gradient (NAG) optimizer, which builds upon momentum\n", + " model: the model we want to optimize the parameter on (this is a line right now)\n", + " xs: the features of the dataset\n", + " ys: the continuous values (target)\n", + " learning_rate: the step size used for the weight updates\n", + " decay_factor: determines the relative contribution of the current gradient and earlier gradients to the weight change\n", + " max_num_iteration: the number of iteration before we stop updating\n", + " \"\"\"\n", + " \n", + " # These are needed to keep track of the previous gradient\n", + " g = [0 for _ in range(len(model.weights))] \n", + " g0 = 0\n", + " g1 = 0\n", + " \n", + " for i in range(max_num_iteration):\n", + " \n", + " # Select a random x and y\n", + " x, y = stochastic_sample(xs, ys)\n", + "\n", + " # Calculate the gradient for each weight by predicting (approximately) where the ball will be\n", + " for idx in range(len(model.weights)):\n", + " \n", + " # Here we need to do a bit of gymnastics because of how the code is set up\n", + " # We save the parameter state, shift it to the look-ahead position, evaluate the gradient there and then reset it\n", + " # The actual update happens below\n", + " prev_weight = model.weights[idx]\n", + " model.weights[idx] = model.weights[idx] - decay_factor*g[idx] # look-ahead position\n", + " g[idx] = decay_factor*g[idx] + learning_rate*model.derivate(x, y)[idx] # gradient at the look-ahead point\n", + " model.weights[idx] = prev_weight\n", + " \n", + " # Update the model parameter\n", + " model.weights[idx] = model.weights[idx] - g[idx]\n", + " \n", + " if i % 100 == 0:\n", + " print(f\"Iteration {i}\")\n", + 
" print(model)" ] }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Adam\n", + "Nesterov Accelerated Gradient\n", "Iteration 0\n", - "y = 0.8575880576520295 + 0.8405747654640998*x\n", + "y = [0.89010029] + [0.35356173]*x\n", "Iteration 100\n", - "y = -0.001590081073315357 + 0.9984342651063493*x\n", + "y = [0.3723535] + [0.9655646]*x\n", "Iteration 200\n", - "y = 2.70122392056152e-05 + 1.0001344335031852*x\n", + "y = [0.01159183] + [0.9884176]*x\n", "Iteration 300\n", - "y = -1.4876611515003004e-06 + 1.0000033575642253*x\n", + "y = [0.00013029] + [1.00053685]*x\n", "Iteration 400\n", - "y = 1.030624925582267e-05 + 0.9999970769162957*x\n", + "y = [0.00110482] + [1.00002591]*x\n", "Iteration 500\n", - "y = 2.567876097403007e-06 + 0.9999995341571982*x\n", + "y = [1.20330159e-05] + [0.99999153]*x\n", "Iteration 600\n", - "y = 3.001062476619204e-08 + 0.9999999996624379*x\n", + "y = [4.34325924e-05] + [1.00000075]*x\n", "Iteration 700\n", - "y = 3.5509149827671966e-09 + 0.9999999957864142*x\n", + "y = [-1.87460485e-05] + [1.00003432]*x\n", "Iteration 800\n", - "y = 2.013987394989773e-08 + 0.9999999952565595*x\n", + "y = [1.26114336e-05] + [0.99998661]*x\n", "Iteration 900\n", - "y = -2.4103486747254733e-08 + 0.9999999984873583*x\n", - "y = 4.7124723585610135e-07 + 0.9999992952944199*x\n" + "y = [-1.84626241e-06] + [1.0000026]*x\n", + "y = [4.67870626e-06] + [0.99999889]*x\n" ] } ], @@ -382,7 +425,7 @@ "xs = [1,2,3,4,5,6,7]\n", "ys = [1,2,3,4,5,6,7]\n", "\n", - "\n", + "'''\n", "# Gradient Descent\n", "model = Line()\n", "print(\"Gradient Descent: \")\n", @@ -401,6 +444,7 @@ "sgd_momentum(model, xs, ys)\n", "print(model)\n", "\n", + "\n", "# Adagrad\n", "model = Line()\n", "print(\"Adagrad\")\n", @@ -410,7 +454,7 @@ "# RMSprop\n", "model = Line()\n", "print(\"RMSprop\")\n", - "RMSprop(model, xs, ys)\n", + "rmsprop(model, xs, ys)\n", "print(model)\n", "\n", "# Adadelta\n", @@ -424,40 +468,48 @@ "model = Line()\n", "print(\"Adam\")\n", "adam(model, xs, ys)\n", + "print(model)\n", + "\n", + "'''\n", + "\n", + "# Nesterov Accelerated Gradient\n", + "model = Line()\n", + "print(\"Nesterov Accelerated Gradient\")\n", + "nesterov(model, xs, ys)\n", "print(model)" ] }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Adam\n", + "Nesterov Accelerated Gradient\n", "Iteration 0\n", - "y = 0.7770901717569257 + 0.9433753832651581*x\n", + "y = [0.66878322] + [0.90014342]*x\n", "Iteration 100\n", - "y = 0.32725296586754515 + 1.900221970247115*x\n", + "y = [-0.17903726] + [2.40553571]*x\n", "Iteration 200\n", - "y = 0.004147083459202631 + 1.99892111204854*x\n", + "y = [0.05899865] + [2.04887923]*x\n", "Iteration 300\n", - "y = -3.4688751132678575e-05 + 2.000013509462116*x\n", + "y = [0.00894693] + [1.97536479]*x\n", "Iteration 400\n", - "y = -9.06655931188767e-08 + 1.9999999764669432*x\n", + "y = [0.01374569] + [2.05300722]*x\n", "Iteration 500\n", - "y = 1.8387422421644338e-10 + 2.0000000000594307*x\n", + "y = [0.10548793] + [1.91493233]*x\n", "Iteration 600\n", - "y = 4.361065425337935e-12 + 1.9999999999993485*x\n", + "y = [0.00385495] + [2.01632264]*x\n", "Iteration 700\n", - "y = 1.1991224399817034e-13 + 1.9999999999999527*x\n", + "y = [0.05427682] + [2.13589417]*x\n", "Iteration 800\n", - "y = -2.5150603804494032e-17 + 2.000000000000001*x\n", + "y = [0.0113579] + [2.00570216]*x\n", 
"Iteration 900\n", - "y = -8.298690846122729e-17 + 2.0*x\n", - "y = 9.891579815200104e-17 + 2.0*x\n" + "y = [-0.02275247] + [1.96719449]*x\n", + "y = [-0.00900316] + [2.00493823]*x\n" ] } ], @@ -466,7 +518,7 @@ "xs = [1,2,3,4,5,6,7]\n", "ys = [2,4,6,8,10,12,14]\n", "\n", - "\n", + "'''\n", "# Gradient Descent\n", "model = Line()\n", "print(\"Gradient Descent: \")\n", @@ -494,7 +546,7 @@ "# RMSprop\n", "model = Line()\n", "print(\"RMSprop\")\n", - "RMSprop(model, xs, ys)\n", + "rmsprop(model, xs, ys)\n", "print(model)\n", "\n", "# Adadelta\n", @@ -507,40 +559,47 @@ "model = Line()\n", "print(\"Adam\")\n", "adam(model, xs, ys)\n", + "print(model)\n", + "'''\n", + "\n", + "# Nesterov Accelerated Gradient\n", + "model = Line()\n", + "print(\"Nesterov Accelerated Gradient\")\n", + "nesterov(model, xs, ys)\n", "print(model)" ] }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Adam\n", + "Nesterov Accelerated Gradient\n", "Iteration 0\n", - "y = 0.7139491962653317 + 0.6462402260655279*x\n", + "y = [0.30475578] + [0.9422567]*x\n", "Iteration 100\n", - "y = 1.5012704143511661 + 1.913754894630153*x\n", + "y = [-0.39695759] + [2.51820116]*x\n", "Iteration 200\n", - "y = 1.147275006702479 + 1.9626892285633653*x\n", + "y = [-1.48096144] + [3.46703259]*x\n", "Iteration 300\n", - "y = 1.0187321764566213 + 1.9959358177402393*x\n", + "y = [1.90685378] + [2.11784826]*x\n", "Iteration 400\n", - "y = 1.0018116802276642 + 1.9995771873157036*x\n", + "y = [1.03772983] + [1.98962059]*x\n", "Iteration 500\n", - "y = 1.0001550519477989 + 1.9999713016943046*x\n", + "y = [1.00457709] + [2.05050135]*x\n", "Iteration 600\n", - "y = 1.0000046235977667 + 1.9999989555915936*x\n", + "y = [0.99465739] + [1.99190331]*x\n", "Iteration 700\n", - "y = 0.9999999689046957 + 2.0000000091643306*x\n", + "y = [1.00337418] + [1.99869932]*x\n", "Iteration 800\n", - "y = 1.0000000007114835 + 1.999999999822266*x\n", + "y = [0.99671896] + [1.98843029]*x\n", "Iteration 900\n", - "y = 1.000000000010868 + 2.0000000000003593*x\n", - "y = 0.9999999999999609 + 2.000000000000028*x\n" + "y = [1.00465637] + [1.99875298]*x\n", + "y = [0.99462305] + [2.00330215]*x\n" ] } ], @@ -548,7 +607,7 @@ "# Here we have a simple line with intercept = 1 and slope = 2\n", "xs = [1,2,3,4,5,6,7]\n", "ys = [3,5,7,9,11,13,15]\n", - "\n", + "'''\n", "# Gradient Descent\n", "model = Line()\n", "print(\"Gradient Descent: \")\n", @@ -576,7 +635,7 @@ "# RMSprop\n", "model = Line()\n", "print(\"RMSprop\")\n", - "RMSprop(model, xs, ys)\n", + "rmsprop(model, xs, ys)\n", "print(model)\n", "\n", "# Adadelta\n", @@ -589,6 +648,13 @@ "model = Line()\n", "print(\"Adam\")\n", "adam(model, xs, ys)\n", + "print(model)\n", + "'''\n", + "\n", + "# Nesterov Accelerated Gradient\n", + "model = Line()\n", + "print(\"Nesterov Accelerated Gradient\")\n", + "nesterov(model, xs, ys)\n", "print(model)" ] },