From dd9dee429f5a0d6bf136f47e2c7149ccf8151558 Mon Sep 17 00:00:00 2001
From: Yacine Mahdid
Date: Thu, 10 Sep 2020 18:26:57 -0400
Subject: [PATCH] Add adamax

---
 ...t Optimization Algorithms-checkpoint.ipynb | 150 ++++++++++++------
 ...ient Descent Optimization Algorithms.ipynb | 150 ++++++++++++------
 2 files changed, 204 insertions(+), 96 deletions(-)

diff --git a/Deep Learning from Scratch in Python/.ipynb_checkpoints/Gradient Descent Optimization Algorithms-checkpoint.ipynb b/Deep Learning from Scratch in Python/.ipynb_checkpoints/Gradient Descent Optimization Algorithms-checkpoint.ipynb
index 81b0980..0909376 100644
--- a/Deep Learning from Scratch in Python/.ipynb_checkpoints/Gradient Descent Optimization Algorithms-checkpoint.ipynb
+++ b/Deep Learning from Scratch in Python/.ipynb_checkpoints/Gradient Descent Optimization Algorithms-checkpoint.ipynb
@@ -13,6 +13,7 @@
     "- [AdaDelta](https://youtu.be/6gvh0IySNMs)\n",
     "- [Adam](https://youtu.be/6nqV58NA_Ew)\n",
     "- [Nesterov](https://youtu.be/6FrBXv9OcqE)\n",
+    "- Adamax\n",
     "\n",
     "## Tests\n",
     "In order to demonstrate the algorithms' ability to optimize a function, we use these simple test setups:\n",
@@ -21,7 +22,7 @@
  },
  {
   "cell_type": "code",
-  "execution_count": 15,
+  "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -383,40 +384,80 @@
     "    \n",
     "        if i % 100 == 0:\n",
     "            print(f\"Iteration {i}\")\n",
+    "            print(model)\n",
+    "    \n",
+    "    \n",
+    "def adamax(model, xs, ys, learning_rate = 0.1, b1 = 0.9, b2 = 0.999, max_iteration = 1000):\n",
+    "    \"\"\"\n",
+    "    Adamax: the Adamax optimizer, which builds upon Adam by using the L_inf norm\n",
+    "    model: the model whose parameters we want to optimize\n",
+    "    xs: the features of the dataset\n",
+    "    ys: the continuous target values\n",
+    "    learning_rate: the step size used at each time step (default is 0.1; the effective step is scaled by the optimizer)\n",
+    "    b1: the exponential decay rate for the first moment estimate, with proposed default value of 0.9 (for deep learning purposes)\n",
+    "    b2: the exponential decay rate for the infinity norm estimate, with proposed default value of 0.999 (for deep learning purposes)\n",
+    "    max_iteration: the number of SGD rounds to run before stopping the optimization\n",
+    "    \"\"\"\n",
+    "    \n",
+    "    \n",
+    "    # Variable Initialization\n",
+    "    num_param = len(model.weights)\n",
+    "    m = [0 for _ in range(num_param)] # one first moment estimate per parameter\n",
+    "    v = [0 for _ in range(num_param)] # one infinity norm estimate per parameter\n",
+    "    g = [0 for _ in range(num_param)] # gradient placeholder, overwritten at each step\n",
+    "    \n",
+    "    for t in range(1, max_iteration):\n",
+    "    \n",
+    "        # Draw a random training sample\n",
+    "        x, y = stochastic_sample(xs, ys)\n",
+    "        \n",
+    "        # Get the partial derivatives\n",
+    "        g = model.derivate(x, y)\n",
+    "\n",
+    "        # Update the first moment m and the infinity norm v\n",
+    "        m = [b1*m_i + (1 - b1)*g_i for m_i, g_i in zip(m, g)]\n",
+    "        v = [np.maximum(b2*v_i, np.absolute(g_i)) for v_i, g_i in zip(v, g)]\n",
+    "\n",
+    "        # Bias correction for m only\n",
+    "        m_cor = [m_i / (1 - (b1**t)) for m_i in m]\n",
+    "\n",
+    "        # Update the parameters (Adamax divides by the infinity norm itself, not its square root)\n",
+    "        model.weights = [weight - (learning_rate / v_i)*m_cor_i for weight, v_i, m_cor_i in zip(model.weights, v, m_cor)]\n",
+    "    \n",
+    "        if t % 100 == 0:\n",
+    "            print(f\"Iteration {t}\")\n",
     "            print(model)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-    "Nesterov Accelerated Gradient\n",
-    "Iteration 0\n",
-    "y = [0.89010029] + [0.35356173]*x\n",
+    "Adamax\n",
     "Iteration 100\n",
-    "y = [0.3723535] + [0.9655646]*x\n",
+    "y = [-0.00777475] + [1.00552222]*x\n",
     "Iteration 200\n",
-    "y = [0.01159183] + [0.9884176]*x\n",
+    "y = [0.00039666] + [1.00003985]*x\n",
     "Iteration 300\n",
-    "y = [0.00013029] + [1.00053685]*x\n",
+    "y = [9.6917012e-06] + [0.99999651]*x\n",
     "Iteration 400\n",
-    "y = [0.00110482] + [1.00002591]*x\n",
+    "y = [2.99389693e-07] + [0.99999994]*x\n",
     "Iteration 500\n",
-    "y = [1.20330159e-05] + [0.99999153]*x\n",
+    "y = [-3.59743715e-09] + [1.]*x\n",
     "Iteration 600\n",
-    "y = [4.34325924e-05] + [1.00000075]*x\n",
+    "y = [2.77363348e-11] + [1.]*x\n",
     "Iteration 700\n",
-    "y = [-1.87460485e-05] + [1.00003432]*x\n",
+    "y = [-6.49186838e-14] + [1.]*x\n",
     "Iteration 800\n",
-    "y = [1.26114336e-05] + [0.99998661]*x\n",
+    "y = [-3.5671997e-15] + [1.]*x\n",
     "Iteration 900\n",
-    "y = [-1.84626241e-06] + [1.0000026]*x\n",
-    "y = [4.67870626e-06] + [0.99999889]*x\n"
+    "y = [6.75723728e-17] + [1.]*x\n",
+    "y = [4.19651576e-17] + [1.]*x\n"
     ]
    }
   ],
@@ -463,57 +504,60 @@
     "adadelta(model, xs, ys)\n",
     "print(model)\n",
     "\n",
-    "\n",
     "# Adam\n",
     "model = Line()\n",
     "print(\"Adam\")\n",
     "adam(model, xs, ys)\n",
     "print(model)\n",
     "\n",
-    "'''\n",
-    "\n",
     "# Nesterov Accelerated Gradient\n",
     "model = Line()\n",
     "print(\"Nesterov Accelerated Gradient\")\n",
     "nesterov(model, xs, ys)\n",
+    "print(model)\n",
+    "'''\n",
+    "\n",
+    "# Adamax\n",
+    "model = Line()\n",
+    "print(\"Adamax\")\n",
+    "adamax(model, xs, ys)\n",
     "print(model)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-    "Nesterov Accelerated Gradient\n",
-    "Iteration 0\n",
-    "y = [0.66878322] + [0.90014342]*x\n",
+    "Adamax\n",
     "Iteration 100\n",
-    "y = [-0.17903726] + [2.40553571]*x\n",
+    "y = [0.4369314] + [1.89897329]*x\n",
     "Iteration 200\n",
-    "y = [0.05899865] + [2.04887923]*x\n",
+    "y = [0.13822549] + [1.96756579]*x\n",
     "Iteration 300\n",
-    "y = [0.00894693] + [1.97536479]*x\n",
+    "y = [0.03852642] + [1.99071418]*x\n",
     "Iteration 400\n",
-    "y = [0.01374569] + [2.05300722]*x\n",
+    "y = [0.01035516] + [1.99740938]*x\n",
     "Iteration 500\n",
-    "y = [0.10548793] + [1.91493233]*x\n",
+    "y = [0.0021898] + [1.99963751]*x\n",
     "Iteration 600\n",
-    "y = [0.00385495] + [2.01632264]*x\n",
+    "y = [0.00051143] + [1.99991838]*x\n",
     "Iteration 700\n",
-    "y = [0.05427682] + [2.13589417]*x\n",
+    "y = [9.66789006e-05] + [1.99998553]*x\n",
     "Iteration 800\n",
-    "y = [0.0113579] + [2.00570216]*x\n",
+    "y = [1.43786098e-05] + [1.99999787]*x\n",
     "Iteration 900\n",
-    "y = [-0.02275247] + [1.96719449]*x\n",
-    "y = [-0.00900316] + [2.00493823]*x\n"
+    "y = [2.96576946e-06] + [1.99999894]*x\n",
+    "y = [5.99822337e-07] + [1.99999991]*x\n"
     ]
    }
   ],
   "source": [
+    "\n",
     "# Here we have a simple line with intercept = 0 and slope = 2\n",
     "xs = [1,2,3,4,5,6,7]\n",
     "ys = [2,4,6,8,10,12,14]\n",
@@ -560,46 +604,50 @@
     "print(\"Adam\")\n",
     "adam(model, xs, ys)\n",
     "print(model)\n",
-    "'''\n",
     "\n",
     "# Nesterov Accelerated Gradient\n",
     "model = Line()\n",
     "print(\"Nesterov Accelerated Gradient\")\n",
     "nesterov(model, xs, ys)\n",
+    "print(model)\n",
+    "'''\n",
+    "\n",
+    "# Adamax\n",
+    "model = Line()\n",
+    "print(\"Adamax\")\n",
+    "adamax(model, xs, ys)\n",
     "print(model)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 12,
    "metadata": {},
    "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-    "Nesterov Accelerated Gradient\n",
-    "Iteration 0\n",
-    "y = [0.30475578] + [0.9422567]*x\n",
+    "Adamax\n",
"Iteration 100\n", - "y = [-0.39695759] + [2.51820116]*x\n", + "y = [1.11731293] + [1.9747599]*x\n", "Iteration 200\n", - "y = [-1.48096144] + [3.46703259]*x\n", + "y = [1.05362733] + [1.99324401]*x\n", "Iteration 300\n", - "y = [1.90685378] + [2.11784826]*x\n", + "y = [1.02088689] + [1.99784093]*x\n", "Iteration 400\n", - "y = [1.03772983] + [1.98962059]*x\n", + "y = [1.00584065] + [1.99878137]*x\n", "Iteration 500\n", - "y = [1.00457709] + [2.05050135]*x\n", + "y = [1.00157963] + [1.99971604]*x\n", "Iteration 600\n", - "y = [0.99465739] + [1.99190331]*x\n", + "y = [1.00050744] + [1.99986608]*x\n", "Iteration 700\n", - "y = [1.00337418] + [1.99869932]*x\n", + "y = [1.00011891] + [1.99998385]*x\n", "Iteration 800\n", - "y = [0.99671896] + [1.98843029]*x\n", + "y = [1.00003134] + [1.99999406]*x\n", "Iteration 900\n", - "y = [1.00465637] + [1.99875298]*x\n", - "y = [0.99462305] + [2.00330215]*x\n" + "y = [1.00000663] + [1.99999862]*x\n", + "y = [1.00000132] + [1.99999974]*x\n" ] } ], @@ -649,12 +697,18 @@ "print(\"Adam\")\n", "adam(model, xs, ys)\n", "print(model)\n", - "'''\n", "\n", "# Nesterov Accelerated Gradient\n", "model = Line()\n", "print(\"Nesterov Accelerated Gradient\")\n", "nesterov(model, xs, ys)\n", + "print(model)\n", + "'''\n", + "\n", + "# Adamax\n", + "model = Line()\n", + "print(\"Adamax\")\n", + "adamax(model, xs, ys)\n", "print(model)" ] }, diff --git a/Deep Learning from Scratch in Python/Gradient Descent Optimization Algorithms.ipynb b/Deep Learning from Scratch in Python/Gradient Descent Optimization Algorithms.ipynb index 81b0980..0909376 100644 --- a/Deep Learning from Scratch in Python/Gradient Descent Optimization Algorithms.ipynb +++ b/Deep Learning from Scratch in Python/Gradient Descent Optimization Algorithms.ipynb @@ -13,6 +13,7 @@ "- [AdaDelta](https://youtu.be/6gvh0IySNMs)\n", "- [Adam](https://youtu.be/6nqV58NA_Ew)\n", "- [Nesterov](https://youtu.be/6FrBXv9OcqE)\n", + "- Adamax\n", "\n", "## Tests\n", "In order to demonstrate the algorithms capabilities to optimize a function we used these simple test setup:\n", @@ -21,7 +22,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -383,40 +384,80 @@ " \n", " if i % 100 == 0:\n", " print(f\"Iteration {i}\")\n", + " print(model)\n", + " \n", + " \n", + "def adamax(model, xs, ys, learning_rate = 0.1, b1 = 0.9, b2 = 0.999, max_iteration = 1000):\n", + " \"\"\"\n", + " Adamax: This is the adamax optimizer that build upong adam with L_inf norm\n", + " model: The model we want to optimize the parameter on\n", + " xs: the feature of my dataset\n", + " ys: the continous value (target)\n", + " learning_rate: the amount of learning we want to happen at each time step (default is 0.1 and will be updated by the optimizer)\n", + " b1: this is the first decaying average with proposed default value of 0.9 (deep learning purposes)\n", + " b2: this is the second decaying average with proposed default value of 0.999 (deep learning purposes)\n", + " max_iteration: the number of sgd round we want to do before stopping the optimization\n", + " \"\"\"\n", + " \n", + " \n", + " # Variable Initialization\n", + " num_param = len(model.weights)\n", + " m = [0 for _ in range(num_param)] # two m for each parameter\n", + " v = [0 for _ in range(num_param)] # two v for each parameter\n", + " g = [0 for _ in range(num_param)] # two gradient\n", + " \n", + " for t in range(1,max_iteration):\n", + " \n", + " # Calculate the gradients \n", + " x, y = stochastic_sample(xs, 
ys)\n", + " \n", + " # Get the partial derivatives\n", + " g = model.derivate(x, y)\n", + "\n", + " # Update the m and v parameter\n", + " m = [b1*m_i + (1 - b1)*g_i for m_i, g_i in zip(m, g)]\n", + " v = [np.maximum(b2*v_i, np.absolute(g_i)) for v_i, g_i in zip(v, g)]\n", + "\n", + " # Bias correction for m only\n", + " m_cor = [m_i / (1 - (b1**t)) for m_i in m]\n", + "\n", + " # Update the parameter\n", + " model.weights = [weight - (learning_rate / np.sqrt(v_i))*m_cor_i for weight, v_i, m_cor_i in zip(model.weights, v, m_cor)]\n", + " \n", + " if t % 100 == 0:\n", + " print(f\"Iteration {t}\")\n", " print(model)" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Nesterov Accelerated Gradient\n", - "Iteration 0\n", - "y = [0.89010029] + [0.35356173]*x\n", + "Adamax\n", "Iteration 100\n", - "y = [0.3723535] + [0.9655646]*x\n", + "y = [-0.00777475] + [1.00552222]*x\n", "Iteration 200\n", - "y = [0.01159183] + [0.9884176]*x\n", + "y = [0.00039666] + [1.00003985]*x\n", "Iteration 300\n", - "y = [0.00013029] + [1.00053685]*x\n", + "y = [9.6917012e-06] + [0.99999651]*x\n", "Iteration 400\n", - "y = [0.00110482] + [1.00002591]*x\n", + "y = [2.99389693e-07] + [0.99999994]*x\n", "Iteration 500\n", - "y = [1.20330159e-05] + [0.99999153]*x\n", + "y = [-3.59743715e-09] + [1.]*x\n", "Iteration 600\n", - "y = [4.34325924e-05] + [1.00000075]*x\n", + "y = [2.77363348e-11] + [1.]*x\n", "Iteration 700\n", - "y = [-1.87460485e-05] + [1.00003432]*x\n", + "y = [-6.49186838e-14] + [1.]*x\n", "Iteration 800\n", - "y = [1.26114336e-05] + [0.99998661]*x\n", + "y = [-3.5671997e-15] + [1.]*x\n", "Iteration 900\n", - "y = [-1.84626241e-06] + [1.0000026]*x\n", - "y = [4.67870626e-06] + [0.99999889]*x\n" + "y = [6.75723728e-17] + [1.]*x\n", + "y = [4.19651576e-17] + [1.]*x\n" ] } ], @@ -463,57 +504,60 @@ "adadelta(model, xs, ys)\n", "print(model)\n", "\n", - "\n", "# Adam\n", "model = Line()\n", "print(\"Adam\")\n", "adam(model, xs, ys)\n", "print(model)\n", "\n", - "'''\n", - "\n", "# Nesterov Accelerated Gradient\n", "model = Line()\n", "print(\"Nesterov Accelerated Gradient\")\n", "nesterov(model, xs, ys)\n", + "print(model)\n", + "'''\n", + "\n", + "# Adamax\n", + "model = Line()\n", + "print(\"Adamax\")\n", + "adamax(model, xs, ys)\n", "print(model)" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Nesterov Accelerated Gradient\n", - "Iteration 0\n", - "y = [0.66878322] + [0.90014342]*x\n", + "Adamax\n", "Iteration 100\n", - "y = [-0.17903726] + [2.40553571]*x\n", + "y = [0.4369314] + [1.89897329]*x\n", "Iteration 200\n", - "y = [0.05899865] + [2.04887923]*x\n", + "y = [0.13822549] + [1.96756579]*x\n", "Iteration 300\n", - "y = [0.00894693] + [1.97536479]*x\n", + "y = [0.03852642] + [1.99071418]*x\n", "Iteration 400\n", - "y = [0.01374569] + [2.05300722]*x\n", + "y = [0.01035516] + [1.99740938]*x\n", "Iteration 500\n", - "y = [0.10548793] + [1.91493233]*x\n", + "y = [0.0021898] + [1.99963751]*x\n", "Iteration 600\n", - "y = [0.00385495] + [2.01632264]*x\n", + "y = [0.00051143] + [1.99991838]*x\n", "Iteration 700\n", - "y = [0.05427682] + [2.13589417]*x\n", + "y = [9.66789006e-05] + [1.99998553]*x\n", "Iteration 800\n", - "y = [0.0113579] + [2.00570216]*x\n", + "y = [1.43786098e-05] + [1.99999787]*x\n", "Iteration 900\n", - "y = [-0.02275247] + [1.96719449]*x\n", - "y = [-0.00900316] + 
+    "y = [2.96576946e-06] + [1.99999894]*x\n",
+    "y = [5.99822337e-07] + [1.99999991]*x\n"
     ]
    }
   ],
   "source": [
+    "\n",
     "# Here we have a simple line with intercept = 0 and slope = 2\n",
     "xs = [1,2,3,4,5,6,7]\n",
     "ys = [2,4,6,8,10,12,14]\n",
@@ -560,46 +604,50 @@
     "print(\"Adam\")\n",
     "adam(model, xs, ys)\n",
     "print(model)\n",
-    "'''\n",
     "\n",
     "# Nesterov Accelerated Gradient\n",
     "model = Line()\n",
     "print(\"Nesterov Accelerated Gradient\")\n",
     "nesterov(model, xs, ys)\n",
+    "print(model)\n",
+    "'''\n",
+    "\n",
+    "# Adamax\n",
+    "model = Line()\n",
+    "print(\"Adamax\")\n",
+    "adamax(model, xs, ys)\n",
     "print(model)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 12,
    "metadata": {},
    "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-    "Nesterov Accelerated Gradient\n",
-    "Iteration 0\n",
-    "y = [0.30475578] + [0.9422567]*x\n",
+    "Adamax\n",
     "Iteration 100\n",
-    "y = [-0.39695759] + [2.51820116]*x\n",
+    "y = [1.11731293] + [1.9747599]*x\n",
     "Iteration 200\n",
-    "y = [-1.48096144] + [3.46703259]*x\n",
+    "y = [1.05362733] + [1.99324401]*x\n",
     "Iteration 300\n",
-    "y = [1.90685378] + [2.11784826]*x\n",
+    "y = [1.02088689] + [1.99784093]*x\n",
     "Iteration 400\n",
-    "y = [1.03772983] + [1.98962059]*x\n",
+    "y = [1.00584065] + [1.99878137]*x\n",
     "Iteration 500\n",
-    "y = [1.00457709] + [2.05050135]*x\n",
+    "y = [1.00157963] + [1.99971604]*x\n",
     "Iteration 600\n",
-    "y = [0.99465739] + [1.99190331]*x\n",
+    "y = [1.00050744] + [1.99986608]*x\n",
     "Iteration 700\n",
-    "y = [1.00337418] + [1.99869932]*x\n",
+    "y = [1.00011891] + [1.99998385]*x\n",
     "Iteration 800\n",
-    "y = [0.99671896] + [1.98843029]*x\n",
+    "y = [1.00003134] + [1.99999406]*x\n",
     "Iteration 900\n",
-    "y = [1.00465637] + [1.99875298]*x\n",
-    "y = [0.99462305] + [2.00330215]*x\n"
+    "y = [1.00000663] + [1.99999862]*x\n",
+    "y = [1.00000132] + [1.99999974]*x\n"
     ]
    }
   ],
@@ -649,12 +697,18 @@
     "print(\"Adam\")\n",
     "adam(model, xs, ys)\n",
     "print(model)\n",
-    "'''\n",
     "\n",
     "# Nesterov Accelerated Gradient\n",
     "model = Line()\n",
     "print(\"Nesterov Accelerated Gradient\")\n",
     "nesterov(model, xs, ys)\n",
+    "print(model)\n",
+    "'''\n",
+    "\n",
+    "# Adamax\n",
+    "model = Line()\n",
+    "print(\"Adamax\")\n",
+    "adamax(model, xs, ys)\n",
     "print(model)"
    ]
   },
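For a quick end-to-end check of the update rule this patch adds, here is a minimal, self-contained sketch of the same Adamax loop fitted to the notebook's y = 2x test line. It is illustrative only: `fit_line_adamax`, the `seed` parameter, and the small epsilon guard are stand-ins introduced here, not part of the repository's `Line`, `stochastic_sample`, or `model.derivate` helpers.

```python
import numpy as np

def fit_line_adamax(xs, ys, learning_rate=0.1, b1=0.9, b2=0.999, max_iteration=1000, seed=0):
    """Minimal Adamax sketch: fits y = w0 + w1*x to (xs, ys).

    Hypothetical helper mirroring the update rule in the notebook's
    adamax(); names and the epsilon guard are assumptions.
    """
    rng = np.random.default_rng(seed)
    w = np.zeros(2)  # [intercept, slope]
    m = np.zeros(2)  # first moment estimate per parameter
    v = np.zeros(2)  # exponentially weighted infinity norm per parameter

    for t in range(1, max_iteration + 1):
        # Draw one random training sample (stochastic gradient)
        i = rng.integers(len(xs))
        x, y = xs[i], ys[i]

        # Gradient of the squared error 0.5*(w0 + w1*x - y)**2
        err = (w[0] + w[1] * x) - y
        g = np.array([err, err * x])

        # Adamax moment updates: decaying first moment, running infinity norm
        m = b1 * m + (1 - b1) * g
        v = np.maximum(b2 * v, np.abs(g))

        # Bias-correct m only, then divide by the infinity norm v
        m_cor = m / (1 - b1 ** t)
        w = w - learning_rate * m_cor / (v + 1e-12)  # epsilon guards v == 0

    return w

xs = np.array([1, 2, 3, 4, 5, 6, 7], dtype=float)
ys = 2.0 * xs  # line with intercept 0 and slope 2
w0, w1 = fit_line_adamax(xs, ys)
print(f"y = {w0:.6f} + {w1:.6f}*x")  # should approach y = 0 + 2*x
```

Note the division by the infinity norm `v` with no square root: in the original paper (Kingma & Ba, 2015), Adamax divides the bias-corrected first moment by the exponentially weighted infinity norm directly, since `v` is a running max of absolute gradients rather than an average of squared gradients as in Adam.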