From dd9dee429f5a0d6bf136f47e2c7149ccf8151558 Mon Sep 17 00:00:00 2001
From: Yacine Mahdid
Date: Thu, 10 Sep 2020 18:26:57 -0400
Subject: [PATCH] Add adamax

---
 ...t Optimization Algorithms-checkpoint.ipynb | 150 ++++++++++++------
 ...ient Descent Optimization Algorithms.ipynb | 150 ++++++++++++------
 2 files changed, 204 insertions(+), 96 deletions(-)

diff --git a/Deep Learning from Scratch in Python/.ipynb_checkpoints/Gradient Descent Optimization Algorithms-checkpoint.ipynb b/Deep Learning from Scratch in Python/.ipynb_checkpoints/Gradient Descent Optimization Algorithms-checkpoint.ipynb
index 81b0980..0909376 100644
--- a/Deep Learning from Scratch in Python/.ipynb_checkpoints/Gradient Descent Optimization Algorithms-checkpoint.ipynb
+++ b/Deep Learning from Scratch in Python/.ipynb_checkpoints/Gradient Descent Optimization Algorithms-checkpoint.ipynb
@@ -13,6 +13,7 @@
     "- [AdaDelta](https://youtu.be/6gvh0IySNMs)\n",
     "- [Adam](https://youtu.be/6nqV58NA_Ew)\n",
     "- [Nesterov](https://youtu.be/6FrBXv9OcqE)\n",
+    "- Adamax\n",
     "\n",
     "## Tests\n",
     "In order to demonstrate the algorithms' ability to optimize a function, we use these simple test setups:\n",
@@ -21,7 +22,7 @@
  },
  {
   "cell_type": "code",
-  "execution_count": 15,
+  "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -383,40 +384,80 @@
     "    \n",
     "        if i % 100 == 0:\n",
     "            print(f\"Iteration {i}\")\n",
+    "            print(model)\n",
+    "    \n",
+    "    \n",
+    "def adamax(model, xs, ys, learning_rate = 0.1, b1 = 0.9, b2 = 0.999, max_iteration = 1000):\n",
+    "    \"\"\"\n",
+    "    Adamax: the Adamax optimizer, which builds upon Adam by using the L_inf norm\n",
+    "    model: the model whose parameters we want to optimize\n",
+    "    xs: the features of the dataset\n",
+    "    ys: the continuous target values\n",
+    "    learning_rate: the step size used at each time step (default is 0.1; the effective step is scaled by the optimizer)\n",
+    "    b1: the exponential decay rate for the first moment estimate, with proposed default value of 0.9 (for deep learning purposes)\n",
+    "    b2: the exponential decay rate for the infinity norm estimate, with proposed default value of 0.999 (for deep learning purposes)\n",
+    "    max_iteration: the number of SGD rounds to run before stopping the optimization\n",
+    "    \"\"\"\n",
+    "    \n",
+    "    \n",
+    "    # Variable Initialization\n",
+    "    num_param = len(model.weights)\n",
+    "    m = [0 for _ in range(num_param)] # one first moment estimate per parameter\n",
+    "    v = [0 for _ in range(num_param)] # one infinity norm estimate per parameter\n",
+    "    g = [0 for _ in range(num_param)] # gradient placeholder, overwritten at each step\n",
+    "    \n",
+    "    for t in range(1, max_iteration):\n",
+    "    \n",
+    "        # Draw a random training sample\n",
+    "        x, y = stochastic_sample(xs, ys)\n",
+    "        \n",
+    "        # Get the partial derivatives\n",
+    "        g = model.derivate(x, y)\n",
+    "\n",
+    "        # Update the first moment m and the infinity norm v\n",
+    "        m = [b1*m_i + (1 - b1)*g_i for m_i, g_i in zip(m, g)]\n",
+    "        v = [np.maximum(b2*v_i, np.absolute(g_i)) for v_i, g_i in zip(v, g)]\n",
+    "\n",
+    "        # Bias correction for m only\n",
+    "        m_cor = [m_i / (1 - (b1**t)) for m_i in m]\n",
+    "\n",
+    "        # Update the parameters (Adamax divides by the infinity norm itself, not its square root)\n",
+    "        model.weights = [weight - (learning_rate / v_i)*m_cor_i for weight, v_i, m_cor_i in zip(model.weights, v, m_cor)]\n",
+    "    \n",
+    "        if t % 100 == 0:\n",
+    "            print(f\"Iteration {t}\")\n",
     "            print(model)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-    "Nesterov Accelerated Gradient\n",
-    "Iteration 0\n",
-    "y = [0.89010029] + [0.35356173]*x\n",
+    "Adamax\n",
     "Iteration 100\n",
-    "y = [0.3723535] + [0.9655646]*x\n",
+    "y = [-0.00777475] + [1.00552222]*x\n",
     "Iteration 200\n",
-    "y = [0.01159183] + [0.9884176]*x\n",
+    "y = [0.00039666] + [1.00003985]*x\n",
     "Iteration 300\n",
-    "y = [0.00013029] + [1.00053685]*x\n",
+    "y = [9.6917012e-06] + [0.99999651]*x\n",
     "Iteration 400\n",
-    "y = [0.00110482] + [1.00002591]*x\n",
+    "y = [2.99389693e-07] + [0.99999994]*x\n",
     "Iteration 500\n",
-    "y = [1.20330159e-05] + [0.99999153]*x\n",
+    "y = [-3.59743715e-09] + [1.]*x\n",
     "Iteration 600\n",
-    "y = [4.34325924e-05] + [1.00000075]*x\n",
+    "y = [2.77363348e-11] + [1.]*x\n",
     "Iteration 700\n",
-    "y = [-1.87460485e-05] + [1.00003432]*x\n",
+    "y = [-6.49186838e-14] + [1.]*x\n",
     "Iteration 800\n",
-    "y = [1.26114336e-05] + [0.99998661]*x\n",
+    "y = [-3.5671997e-15] + [1.]*x\n",
     "Iteration 900\n",
-    "y = [-1.84626241e-06] + [1.0000026]*x\n",
-    "y = [4.67870626e-06] + [0.99999889]*x\n"
+    "y = [6.75723728e-17] + [1.]*x\n",
+    "y = [4.19651576e-17] + [1.]*x\n"
     ]
    }
   ],
@@ -463,57 +504,60 @@
     "adadelta(model, xs, ys)\n",
     "print(model)\n",
     "\n",
-    "\n",
     "# Adam\n",
     "model = Line()\n",
     "print(\"Adam\")\n",
     "adam(model, xs, ys)\n",
     "print(model)\n",
     "\n",
-    "'''\n",
-    "\n",
     "# Nesterov Accelerated Gradient\n",
     "model = Line()\n",
     "print(\"Nesterov Accelerated Gradient\")\n",
     "nesterov(model, xs, ys)\n",
+    "print(model)\n",
+    "'''\n",
+    "\n",
+    "# Adamax\n",
+    "model = Line()\n",
+    "print(\"Adamax\")\n",
+    "adamax(model, xs, ys)\n",
     "print(model)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-    "Nesterov Accelerated Gradient\n",
-    "Iteration 0\n",
-    "y = [0.66878322] + [0.90014342]*x\n",
+    "Adamax\n",
     "Iteration 100\n",
-    "y = [-0.17903726] + [2.40553571]*x\n",
+    "y = [0.4369314] + [1.89897329]*x\n",
     "Iteration 200\n",
-    "y = [0.05899865] + [2.04887923]*x\n",
+    "y = [0.13822549] + [1.96756579]*x\n",
     "Iteration 300\n",
-    "y = [0.00894693] + [1.97536479]*x\n",
+    "y = [0.03852642] + [1.99071418]*x\n",
     "Iteration 400\n",
-    "y = [0.01374569] + [2.05300722]*x\n",
+    "y = [0.01035516] + [1.99740938]*x\n",
     "Iteration 500\n",
-    "y = [0.10548793] + [1.91493233]*x\n",
+    "y = [0.0021898] + [1.99963751]*x\n",
     "Iteration 600\n",
-    "y = [0.00385495] + [2.01632264]*x\n",
+    "y = [0.00051143] + [1.99991838]*x\n",
     "Iteration 700\n",
-    "y = [0.05427682] + [2.13589417]*x\n",
+    "y = [9.66789006e-05] + [1.99998553]*x\n",
     "Iteration 800\n",
-    "y = [0.0113579] + [2.00570216]*x\n",
+    "y = [1.43786098e-05] + [1.99999787]*x\n",
     "Iteration 900\n",
-    "y = [-0.02275247] + [1.96719449]*x\n",
-    "y = [-0.00900316] + [2.00493823]*x\n"
+    "y = [2.96576946e-06] + [1.99999894]*x\n",
+    "y = [5.99822337e-07] + [1.99999991]*x\n"
     ]
    }
   ],
   "source": [
+    "\n",
     "# Here we have a simple line with intercept = 0 and slope = 2\n",
     "xs = [1,2,3,4,5,6,7]\n",
     "ys = [2,4,6,8,10,12,14]\n",
@@ -560,46 +604,50 @@
     "print(\"Adam\")\n",
     "adam(model, xs, ys)\n",
     "print(model)\n",
-    "'''\n",
     "\n",
     "# Nesterov Accelerated Gradient\n",
     "model = Line()\n",
     "print(\"Nesterov Accelerated Gradient\")\n",
     "nesterov(model, xs, ys)\n",
+    "print(model)\n",
+    "'''\n",
+    "\n",
+    "# Adamax\n",
+    "model = Line()\n",
+    "print(\"Adamax\")\n",
+    "adamax(model, xs, ys)\n",
     "print(model)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 12,
    "metadata": {},
    "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-    "Nesterov Accelerated Gradient\n",
-    "Iteration 0\n",
-    "y = [0.30475578] + [0.9422567]*x\n",
+    "Adamax\n",
"Iteration 100\n", - "y = [-0.39695759] + [2.51820116]*x\n", + "y = [1.11731293] + [1.9747599]*x\n", "Iteration 200\n", - "y = [-1.48096144] + [3.46703259]*x\n", + "y = [1.05362733] + [1.99324401]*x\n", "Iteration 300\n", - "y = [1.90685378] + [2.11784826]*x\n", + "y = [1.02088689] + [1.99784093]*x\n", "Iteration 400\n", - "y = [1.03772983] + [1.98962059]*x\n", + "y = [1.00584065] + [1.99878137]*x\n", "Iteration 500\n", - "y = [1.00457709] + [2.05050135]*x\n", + "y = [1.00157963] + [1.99971604]*x\n", "Iteration 600\n", - "y = [0.99465739] + [1.99190331]*x\n", + "y = [1.00050744] + [1.99986608]*x\n", "Iteration 700\n", - "y = [1.00337418] + [1.99869932]*x\n", + "y = [1.00011891] + [1.99998385]*x\n", "Iteration 800\n", - "y = [0.99671896] + [1.98843029]*x\n", + "y = [1.00003134] + [1.99999406]*x\n", "Iteration 900\n", - "y = [1.00465637] + [1.99875298]*x\n", - "y = [0.99462305] + [2.00330215]*x\n" + "y = [1.00000663] + [1.99999862]*x\n", + "y = [1.00000132] + [1.99999974]*x\n" ] } ], @@ -649,12 +697,18 @@ "print(\"Adam\")\n", "adam(model, xs, ys)\n", "print(model)\n", - "'''\n", "\n", "# Nesterov Accelerated Gradient\n", "model = Line()\n", "print(\"Nesterov Accelerated Gradient\")\n", "nesterov(model, xs, ys)\n", + "print(model)\n", + "'''\n", + "\n", + "# Adamax\n", + "model = Line()\n", + "print(\"Adamax\")\n", + "adamax(model, xs, ys)\n", "print(model)" ] }, diff --git a/Deep Learning from Scratch in Python/Gradient Descent Optimization Algorithms.ipynb b/Deep Learning from Scratch in Python/Gradient Descent Optimization Algorithms.ipynb index 81b0980..0909376 100644 --- a/Deep Learning from Scratch in Python/Gradient Descent Optimization Algorithms.ipynb +++ b/Deep Learning from Scratch in Python/Gradient Descent Optimization Algorithms.ipynb @@ -13,6 +13,7 @@ "- [AdaDelta](https://youtu.be/6gvh0IySNMs)\n", "- [Adam](https://youtu.be/6nqV58NA_Ew)\n", "- [Nesterov](https://youtu.be/6FrBXv9OcqE)\n", + "- Adamax\n", "\n", "## Tests\n", "In order to demonstrate the algorithms capabilities to optimize a function we used these simple test setup:\n", @@ -21,7 +22,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -383,40 +384,80 @@ " \n", " if i % 100 == 0:\n", " print(f\"Iteration {i}\")\n", + " print(model)\n", + " \n", + " \n", + "def adamax(model, xs, ys, learning_rate = 0.1, b1 = 0.9, b2 = 0.999, max_iteration = 1000):\n", + " \"\"\"\n", + " Adamax: This is the adamax optimizer that build upong adam with L_inf norm\n", + " model: The model we want to optimize the parameter on\n", + " xs: the feature of my dataset\n", + " ys: the continous value (target)\n", + " learning_rate: the amount of learning we want to happen at each time step (default is 0.1 and will be updated by the optimizer)\n", + " b1: this is the first decaying average with proposed default value of 0.9 (deep learning purposes)\n", + " b2: this is the second decaying average with proposed default value of 0.999 (deep learning purposes)\n", + " max_iteration: the number of sgd round we want to do before stopping the optimization\n", + " \"\"\"\n", + " \n", + " \n", + " # Variable Initialization\n", + " num_param = len(model.weights)\n", + " m = [0 for _ in range(num_param)] # two m for each parameter\n", + " v = [0 for _ in range(num_param)] # two v for each parameter\n", + " g = [0 for _ in range(num_param)] # two gradient\n", + " \n", + " for t in range(1,max_iteration):\n", + " \n", + " # Calculate the gradients \n", + " x, y = stochastic_sample(xs, 
ys)\n", + " \n", + " # Get the partial derivatives\n", + " g = model.derivate(x, y)\n", + "\n", + " # Update the m and v parameter\n", + " m = [b1*m_i + (1 - b1)*g_i for m_i, g_i in zip(m, g)]\n", + " v = [np.maximum(b2*v_i, np.absolute(g_i)) for v_i, g_i in zip(v, g)]\n", + "\n", + " # Bias correction for m only\n", + " m_cor = [m_i / (1 - (b1**t)) for m_i in m]\n", + "\n", + " # Update the parameter\n", + " model.weights = [weight - (learning_rate / np.sqrt(v_i))*m_cor_i for weight, v_i, m_cor_i in zip(model.weights, v, m_cor)]\n", + " \n", + " if t % 100 == 0:\n", + " print(f\"Iteration {t}\")\n", " print(model)" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Nesterov Accelerated Gradient\n", - "Iteration 0\n", - "y = [0.89010029] + [0.35356173]*x\n", + "Adamax\n", "Iteration 100\n", - "y = [0.3723535] + [0.9655646]*x\n", + "y = [-0.00777475] + [1.00552222]*x\n", "Iteration 200\n", - "y = [0.01159183] + [0.9884176]*x\n", + "y = [0.00039666] + [1.00003985]*x\n", "Iteration 300\n", - "y = [0.00013029] + [1.00053685]*x\n", + "y = [9.6917012e-06] + [0.99999651]*x\n", "Iteration 400\n", - "y = [0.00110482] + [1.00002591]*x\n", + "y = [2.99389693e-07] + [0.99999994]*x\n", "Iteration 500\n", - "y = [1.20330159e-05] + [0.99999153]*x\n", + "y = [-3.59743715e-09] + [1.]*x\n", "Iteration 600\n", - "y = [4.34325924e-05] + [1.00000075]*x\n", + "y = [2.77363348e-11] + [1.]*x\n", "Iteration 700\n", - "y = [-1.87460485e-05] + [1.00003432]*x\n", + "y = [-6.49186838e-14] + [1.]*x\n", "Iteration 800\n", - "y = [1.26114336e-05] + [0.99998661]*x\n", + "y = [-3.5671997e-15] + [1.]*x\n", "Iteration 900\n", - "y = [-1.84626241e-06] + [1.0000026]*x\n", - "y = [4.67870626e-06] + [0.99999889]*x\n" + "y = [6.75723728e-17] + [1.]*x\n", + "y = [4.19651576e-17] + [1.]*x\n" ] } ], @@ -463,57 +504,60 @@ "adadelta(model, xs, ys)\n", "print(model)\n", "\n", - "\n", "# Adam\n", "model = Line()\n", "print(\"Adam\")\n", "adam(model, xs, ys)\n", "print(model)\n", "\n", - "'''\n", - "\n", "# Nesterov Accelerated Gradient\n", "model = Line()\n", "print(\"Nesterov Accelerated Gradient\")\n", "nesterov(model, xs, ys)\n", + "print(model)\n", + "'''\n", + "\n", + "# Adamax\n", + "model = Line()\n", + "print(\"Adamax\")\n", + "adamax(model, xs, ys)\n", "print(model)" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Nesterov Accelerated Gradient\n", - "Iteration 0\n", - "y = [0.66878322] + [0.90014342]*x\n", + "Adamax\n", "Iteration 100\n", - "y = [-0.17903726] + [2.40553571]*x\n", + "y = [0.4369314] + [1.89897329]*x\n", "Iteration 200\n", - "y = [0.05899865] + [2.04887923]*x\n", + "y = [0.13822549] + [1.96756579]*x\n", "Iteration 300\n", - "y = [0.00894693] + [1.97536479]*x\n", + "y = [0.03852642] + [1.99071418]*x\n", "Iteration 400\n", - "y = [0.01374569] + [2.05300722]*x\n", + "y = [0.01035516] + [1.99740938]*x\n", "Iteration 500\n", - "y = [0.10548793] + [1.91493233]*x\n", + "y = [0.0021898] + [1.99963751]*x\n", "Iteration 600\n", - "y = [0.00385495] + [2.01632264]*x\n", + "y = [0.00051143] + [1.99991838]*x\n", "Iteration 700\n", - "y = [0.05427682] + [2.13589417]*x\n", + "y = [9.66789006e-05] + [1.99998553]*x\n", "Iteration 800\n", - "y = [0.0113579] + [2.00570216]*x\n", + "y = [1.43786098e-05] + [1.99999787]*x\n", "Iteration 900\n", - "y = [-0.02275247] + [1.96719449]*x\n", - "y = [-0.00900316] + 
+    "y = [2.96576946e-06] + [1.99999894]*x\n",
+    "y = [5.99822337e-07] + [1.99999991]*x\n"
     ]
    }
   ],
   "source": [
+    "\n",
     "# Here we have a simple line with intercept = 0 and slope = 2\n",
     "xs = [1,2,3,4,5,6,7]\n",
     "ys = [2,4,6,8,10,12,14]\n",
@@ -560,46 +604,50 @@
     "print(\"Adam\")\n",
     "adam(model, xs, ys)\n",
     "print(model)\n",
-    "'''\n",
     "\n",
     "# Nesterov Accelerated Gradient\n",
     "model = Line()\n",
     "print(\"Nesterov Accelerated Gradient\")\n",
     "nesterov(model, xs, ys)\n",
+    "print(model)\n",
+    "'''\n",
+    "\n",
+    "# Adamax\n",
+    "model = Line()\n",
+    "print(\"Adamax\")\n",
+    "adamax(model, xs, ys)\n",
     "print(model)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 12,
    "metadata": {},
    "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-    "Nesterov Accelerated Gradient\n",
-    "Iteration 0\n",
-    "y = [0.30475578] + [0.9422567]*x\n",
+    "Adamax\n",
     "Iteration 100\n",
-    "y = [-0.39695759] + [2.51820116]*x\n",
+    "y = [1.11731293] + [1.9747599]*x\n",
     "Iteration 200\n",
-    "y = [-1.48096144] + [3.46703259]*x\n",
+    "y = [1.05362733] + [1.99324401]*x\n",
     "Iteration 300\n",
-    "y = [1.90685378] + [2.11784826]*x\n",
+    "y = [1.02088689] + [1.99784093]*x\n",
     "Iteration 400\n",
-    "y = [1.03772983] + [1.98962059]*x\n",
+    "y = [1.00584065] + [1.99878137]*x\n",
     "Iteration 500\n",
-    "y = [1.00457709] + [2.05050135]*x\n",
+    "y = [1.00157963] + [1.99971604]*x\n",
     "Iteration 600\n",
-    "y = [0.99465739] + [1.99190331]*x\n",
+    "y = [1.00050744] + [1.99986608]*x\n",
     "Iteration 700\n",
-    "y = [1.00337418] + [1.99869932]*x\n",
+    "y = [1.00011891] + [1.99998385]*x\n",
     "Iteration 800\n",
-    "y = [0.99671896] + [1.98843029]*x\n",
+    "y = [1.00003134] + [1.99999406]*x\n",
     "Iteration 900\n",
-    "y = [1.00465637] + [1.99875298]*x\n",
-    "y = [0.99462305] + [2.00330215]*x\n"
+    "y = [1.00000663] + [1.99999862]*x\n",
+    "y = [1.00000132] + [1.99999974]*x\n"
     ]
    }
   ],
@@ -649,12 +697,18 @@
     "print(\"Adam\")\n",
     "adam(model, xs, ys)\n",
     "print(model)\n",
-    "'''\n",
     "\n",
     "# Nesterov Accelerated Gradient\n",
     "model = Line()\n",
     "print(\"Nesterov Accelerated Gradient\")\n",
     "nesterov(model, xs, ys)\n",
+    "print(model)\n",
+    "'''\n",
+    "\n",
+    "# Adamax\n",
+    "model = Line()\n",
+    "print(\"Adamax\")\n",
+    "adamax(model, xs, ys)\n",
     "print(model)"
    ]
   },
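For a quick end-to-end check of the update rule this patch adds, here is a minimal, self-contained sketch of the same Adamax loop fitted to the notebook's y = 2x test line. It is illustrative only: `fit_line_adamax`, the `seed` parameter, and the small epsilon guard are stand-ins introduced here, not part of the repository's `Line`, `stochastic_sample`, or `model.derivate` helpers.

```python
import numpy as np

def fit_line_adamax(xs, ys, learning_rate=0.1, b1=0.9, b2=0.999, max_iteration=1000, seed=0):
    """Minimal Adamax sketch: fits y = w0 + w1*x to (xs, ys).

    Hypothetical helper mirroring the update rule in the notebook's
    adamax(); names and the epsilon guard are assumptions.
    """
    rng = np.random.default_rng(seed)
    w = np.zeros(2)  # [intercept, slope]
    m = np.zeros(2)  # first moment estimate per parameter
    v = np.zeros(2)  # exponentially weighted infinity norm per parameter

    for t in range(1, max_iteration + 1):
        # Draw one random training sample (stochastic gradient)
        i = rng.integers(len(xs))
        x, y = xs[i], ys[i]

        # Gradient of the squared error 0.5*(w0 + w1*x - y)**2
        err = (w[0] + w[1] * x) - y
        g = np.array([err, err * x])

        # Adamax moment updates: decaying first moment, running infinity norm
        m = b1 * m + (1 - b1) * g
        v = np.maximum(b2 * v, np.abs(g))

        # Bias-correct m only, then divide by the infinity norm v
        m_cor = m / (1 - b1 ** t)
        w = w - learning_rate * m_cor / (v + 1e-12)  # epsilon guards v == 0

    return w

xs = np.array([1, 2, 3, 4, 5, 6, 7], dtype=float)
ys = 2.0 * xs  # line with intercept 0 and slope 2
w0, w1 = fit_line_adamax(xs, ys)
print(f"y = {w0:.6f} + {w1:.6f}*x")  # should approach y = 0 + 2*x
```

Note the division by the infinity norm `v` with no square root: in the original paper (Kingma & Ba, 2015), Adamax divides the bias-corrected first moment by the exponentially weighted infinity norm directly, since `v` is a running max of absolute gradients rather than an average of squared gradients as in Adam.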