From 1e936b0eed6f9115b91f224f237975b9fbab0634 Mon Sep 17 00:00:00 2001 From: mmamedli Date: Mon, 11 Jan 2021 20:38:54 +0300 Subject: [PATCH 1/3] Proofread recaps and week 1 assignments --- week1_intro/crossentropy_method.ipynb | 30 ++-- week1_intro/deep_crossentropy_method.ipynb | 36 ++--- week1_intro/gym_interface.ipynb | 38 +++--- week1_intro/primer/recap_ml.ipynb | 152 ++++++++++++--------- week1_intro/primer/recap_pytorch.ipynb | 84 +++++++----- 5 files changed, 187 insertions(+), 153 deletions(-) diff --git a/week1_intro/crossentropy_method.ipynb b/week1_intro/crossentropy_method.ipynb index 1f21258b9..f91ad4efb 100644 --- a/week1_intro/crossentropy_method.ipynb +++ b/week1_intro/crossentropy_method.ipynb @@ -6,7 +6,7 @@ "source": [ "# Crossentropy method\n", "\n", - "This notebook will teach you to solve reinforcement learning problems with crossentropy method. We'll follow-up by scaling everything up and using neural network policy." + "This notebook will teach you to solve reinforcement learning problems with crossentropy method. After that we'll scale everything up using neural network policy." ] }, { @@ -24,8 +24,8 @@ "\n", " !touch .setup_complete\n", "\n", - "# This code creates a virtual display to draw game images on.\n", - "# It will have no effect if your machine has a monitor.\n", + "# This code creates a virtual display for drawing game images on.\n", + "# It won't have any effect if your machine has a monitor.\n", "if type(os.environ.get(\"DISPLAY\")) is not str or len(os.environ.get(\"DISPLAY\")) == 0:\n", " !bash ../xvfb start\n", " os.environ['DISPLAY'] = ':1'" @@ -69,7 +69,7 @@ "\n", "Since we still use integer state and action representations, you can use a 2-dimensional array to represent the policy.\n", "\n", - "Please initialize the policy __uniformly__, that is, probabililities of all actions should be equal." + "Please initialize the policy __uniformly__, that is, the probabililities of all actions should be equal." ] }, { @@ -114,9 +114,9 @@ "source": [ "def generate_session(env, policy, t_max=10**4):\n", " \"\"\"\n", - " Play game until end or for t_max ticks.\n", - " :param policy: an array of shape [n_states,n_actions] with action probabilities\n", - " :returns: list of states, list of actions and sum of rewards\n", + " Play the game until the end or for t_max ticks.\n", + " :param policy: an array of shape [n_states,n_actions] with the action probabilities\n", + " :returns: list of states, list of actions and the sum of rewards\n", " \"\"\"\n", " states, actions = [], []\n", " total_reward = 0.\n", @@ -198,7 +198,7 @@ " [i.e. sorted by session number and timestep within session]\n", "\n", " If you are confused, see examples below. 
Please don't assume that states are integers\n", - " (they will become different later).\n", + " (their type will change later).\n", " \"\"\"\n", "\n", " reward_threshold = \n", @@ -267,7 +267,7 @@ " policy[s_i,a_i] ~ #[occurrences of s_i and a_i in elite states/actions]\n", "\n", " Don't forget to normalize the policy to get valid probabilities and handle the 0/0 case.\n", - " For states that you never visited, use a uniform distribution (1/n_actions for all states).\n", + " For states, that you never visited, use a uniform distribution (1/n_actions for all states).\n", "\n", " :param elite_states: 1D list of states from elite sessions\n", " :param elite_actions: 1D list of actions from elite sessions\n", @@ -387,7 +387,7 @@ "\n", " policy = learning_rate * new_policy + (1 - learning_rate) * policy\n", "\n", - " # display results on chart\n", + " # display results on the chart\n", " show_progress(rewards_batch, log, percentile)" ] }, @@ -395,13 +395,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Reflecting on results\n", + "### Reflecting on the results\n", "\n", - "You may have noticed that the taxi problem quickly converges from less than -1000 to a near-optimal score and then descends back into -50/-100. This is in part because the environment has some innate randomness. Namely, the starting points of passenger/driver change from episode to episode.\n", + "You may have noticed that the taxi problem quickly converges from less than -1000 to a near-optimal score and then descends back to -50/-100. This is in part because the environment has some innate randomness. Namely, the starting points of passenger/driver change from episode to episode.\n", "\n", - "In case CEM failed to learn how to win from one distinct starting point, it will simply discard it because no sessions from that starting point will make it into the \"elites\".\n", + "In case CEM failed to learn, how to win from one distinct starting point, it will simply discard it because no sessions from that starting point will make it into the \"elites\".\n", "\n", - "To mitigate that problem, you can either reduce the threshold for elite sessions (duct tape way) or change the way you evaluate strategy (theoretically correct way). For each starting state, you can sample an action randomly, and then evaluate this action by running _several_ games starting from it and averaging the total reward. Choosing elite sessions with this kind of sampling (where each session's reward is counted as the average of the rewards of all sessions with the same starting state and action) should improve the performance of your policy." + "To mitigate that problem, you can either reduce the threshold for elite sessions (duct tape way) or change the way you evaluate the strategy (theoretically correct way). For each starting state, you can sample an action randomly, and then evaluate this action by running _several_ games starting from it and averaging the total reward. Choosing elite sessions with this kind of sampling (where each session reward is counted as the average of the rewards of all sessions with the same starting state and action) should improve the performance of your policy." 
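To make that evaluation scheme concrete, here is a minimal sketch of the averaging step, assuming the sessions were already collected into `states_batch`, `actions_batch` and `rewards_batch` as above (the helper name and the assumption that initial states are hashable are illustrative, not part of the assignment code):

```python
from collections import defaultdict
import numpy as np

def average_rewards_by_start(states_batch, actions_batch, rewards_batch):
    """Replace each session's reward with the mean reward of all sessions
    that share the same (initial state, first action) pair."""
    groups = defaultdict(list)
    for i, (states, actions) in enumerate(zip(states_batch, actions_batch)):
        groups[(states[0], actions[0])].append(i)   # initial state must be hashable here

    averaged = np.array(rewards_batch, dtype=float)
    for indices in groups.values():
        averaged[indices] = averaged[indices].mean()
    return averaged
```

The resulting array can then be passed to `select_elites` in place of the raw `rewards_batch`.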
] }, { @@ -429,5 +429,5 @@ } }, "nbformat": 4, - "nbformat_minor": 1 + "nbformat_minor": 0 } diff --git a/week1_intro/deep_crossentropy_method.ipynb b/week1_intro/deep_crossentropy_method.ipynb index 9e3681f50..446ddebaa 100644 --- a/week1_intro/deep_crossentropy_method.ipynb +++ b/week1_intro/deep_crossentropy_method.ipynb @@ -27,8 +27,8 @@ "\n", " !touch .setup_complete\n", "\n", - "# This code creates a virtual display to draw game images on.\n", - "# It will have no effect if your machine has a monitor.\n", + "# This code creates a virtual display for drawing game images on.\n", + "# It won't have any effect if your machine has a monitor.\n", "if type(os.environ.get(\"DISPLAY\")) is not str or len(os.environ.get(\"DISPLAY\")) == 0:\n", " !bash ../xvfb start\n", " os.environ['DISPLAY'] = ':1'" @@ -65,8 +65,8 @@ "\n", "For this assignment we'll utilize the simplified neural network implementation from __[Scikit-learn](https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html)__. Here's what you'll need:\n", "\n", - "* `agent.partial_fit(states, actions)` - make a single training pass over the data. Maximize the probabilitity of :actions: from :states:\n", - "* `agent.predict_proba(states)` - predict probabilities of all actions, a matrix of shape __[len(states), n_actions]__\n" + "* `agent.partial_fit(states, actions)` - makes a single training pass over the data. Maximize the probabilitity of :actions: from :states:\n", + "* `agent.predict_proba(states)` - predicts probabilities of all actions, a matrix of shape __[len(states), n_actions]__\n" ] }, { @@ -82,7 +82,7 @@ " activation='tanh',\n", ")\n", "\n", - "# initialize agent to the dimension of state space and number of actions\n", + "# initialize agent to the dimension of state space and a number of actions\n", "agent.partial_fit([env.reset()] * n_actions, range(n_actions), range(n_actions))" ] }, @@ -107,7 +107,7 @@ " # use agent to predict a vector of action probabilities for state :s:\n", " probs = \n", "\n", - " assert probs.shape == (env.action_space.n,), \"make sure probabilities are a vector (hint: np.reshape)\"\n", + " assert probs.shape == (env.action_space.n,), \"make sure that the probabilities are a vector (hint: np.reshape)\"\n", " \n", " # use the probabilities you predicted to pick an action\n", " # sample proportionally to the probabilities, don't just take the most likely action\n", @@ -168,7 +168,7 @@ " [i.e. sorted by session number and timestep within session]\n", "\n", " If you are confused, see examples below. Please don't assume that states are integers\n", - " (they will become different later).\n", + " (their type will change later).\n", " \"\"\"\n", "\n", " \n", @@ -274,7 +274,7 @@ "metadata": {}, "outputs": [], "source": [ - "# Show video. This may not work in some setups. If it doesn't\n", + "# Show video. In some setups this may not work. If it doesn't\n", "# work for you, you can download the videos and view them locally.\n", "\n", "from pathlib import Path\n", @@ -297,23 +297,23 @@ "\n", "By this moment you should have got enough score on [CartPole-v0](https://gym.openai.com/envs/CartPole-v0) to consider it solved (see the link). 
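One quick way to double-check that is to run a batch of fresh evaluation sessions with the trained agent and look at the mean total reward (the commonly quoted bar for CartPole-v0 is an average of 195+ over 100 episodes; treat the exact threshold as a convention rather than something enforced here):

```python
import numpy as np

# generate_session and agent are the ones defined above; env is still CartPole-v0 at this point
eval_rewards = [generate_session(env, agent)[-1] for _ in range(100)]
print("mean reward over 100 evaluation sessions:", np.mean(eval_rewards))
```
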
It's time to try something harder.\n", "\n", - "_if you have any trouble with CartPole-v0 and feel stuck, take a look at the forums_\n", + "_if you have any trouble with CartPole-v0 and feel stuck, take a look on forums_\n", "\n", - "Your assignment is to obtain average reward of __at least -150__ on `MountainCar-v0`.\n", + "Your assignment is to obtain an average reward of __at least -150__ on `MountainCar-v0`.\n", "\n", "See the tips section below, it's kinda important.\n", " \n", "* Bonus quest: Devise a way to speed up training against the default version\n", " * Obvious improvement: use [joblib](https://www.google.com/search?client=ubuntu&channel=fs&q=joblib&ie=utf-8&oe=utf-8)\n", - " * Try re-using samples from 3-5 last iterations when computing threshold and training\n", - " * Experiment with amount of training iterations and learning rate of the neural network (see params)\n", + " * Try re-using samples from 3-5 last iterations when computing threshold and during training\n", + " * Experiment with an amount of training iterations and the learning rate of the neural network (see params)\n", " \n", " \n", "### Tips\n", "* Gym page: [MountainCar](https://gym.openai.com/envs/MountainCar-v0)\n", "* Sessions for MountainCar may last for 10k+ ticks. Make sure ```t_max``` param is at least 10k.\n", - " * Also it may be a good idea to cut rewards via \">\" and not \">=\". If 90% of your sessions get reward of -10k and 10% are better, than if you use percentile 20% as threshold, R >= threshold __fails cut off bad sessions__ whule R > threshold works alright.\n", - "* _issue with gym_: Some versions of gym limit game time by 200 ticks. This will prevent cem training in most cases. Make sure your agent is able to play for the specified __t_max__, and if it isn't, try `env = gym.make(\"MountainCar-v0\").env` or otherwise get rid of TimeLimit wrapper.\n", + " * Also it may be a good idea to cut rewards via \">\" and not \">=\". If 90% of your sessions get reward of -10k and 10% are better, than if you use percentile 20% as the threshold, R >= threshold __fails cut off bad sessions__ whule R > threshold works alright.\n", + "* _issue with gym_: Some versions of gym limit game time by 200 ticks. This will prevent the training in most cases. Make sure your agent is able to play for the specified __t_max__, and if it isn't, try `env = gym.make(\"MountainCar-v0\").env` or otherwise get rid of TimeLimit wrapper.\n", "* If it won't train it's a good idea to plot reward distribution and record sessions: they may give you some clue. If they don't, call course staff :)\n", "* 20-neuron network is probably not enough, feel free to experiment.\n", "\n", @@ -332,7 +332,9 @@ "
" ] }, - "metadata": {}, + "metadata": { + "tags": [] + }, "output_type": "display_data" } ], @@ -359,7 +361,7 @@ " ax.set_xlabel('position (x)')\n", " ax.set_ylabel('velocity (v)')\n", " \n", - " # Sample a trajectory and draw it\n", + " # Sample the trajectory and draw it\n", " states, actions, _ = generate_session(env, agent)\n", " states = np.array(states)\n", " ax.plot(states[:, 0], states[:, 1], color='white')\n", @@ -416,5 +418,5 @@ } }, "nbformat": 4, - "nbformat_minor": 1 + "nbformat_minor": 0 } diff --git a/week1_intro/gym_interface.ipynb b/week1_intro/gym_interface.ipynb index 4ef90609f..db6295a90 100644 --- a/week1_intro/gym_interface.ipynb +++ b/week1_intro/gym_interface.ipynb @@ -15,8 +15,8 @@ "\n", " !touch .setup_complete\n", "\n", - "# This code creates a virtual display to draw game images on.\n", - "# It will have no effect if your machine has a monitor.\n", + "# This code creates a virtual display for drawing game images.\n", + "# It has no effect if your machine has a monitor.\n", "if type(os.environ.get(\"DISPLAY\")) is not str or len(os.environ.get(\"DISPLAY\")) == 0:\n", " !bash ../xvfb start\n", " os.environ['DISPLAY'] = ':1'" @@ -39,9 +39,9 @@ "source": [ "### OpenAI Gym\n", "\n", - "We're gonna spend several next weeks learning algorithms that solve decision processes. We are then in need of some interesting decision problems to test our algorithms.\n", + "We're gonna spend several next weeks learning algorithms that solve decision processes. So we need a few interesting decision problems to test our algorithms.\n", "\n", - "That's where OpenAI Gym comes into play. It's a Python library that wraps many classical decision problems including robot control, videogames and board games.\n", + "That's where OpenAI Gym comes into play. It's a Python library that wraps many classical decision problems, including robot control, videogames and board games.\n", "\n", "So here's how it works:" ] @@ -66,7 +66,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Note: if you're running this on your local machine, you'll see a window pop up with the image above. Don't close it, just alt-tab away." + "Note: if you're running this on your local machine, you'll see a window popping up with the image above. Don't close it, just alt-tab away." ] }, { @@ -75,13 +75,13 @@ "source": [ "### Gym interface\n", "\n", - "The three main methods of an environment are\n", - "* `reset()`: reset environment to the initial state, _return first observation_\n", - "* `render()`: show current environment state (a more colorful version :) )\n", - "* `step(a)`: commit action `a` and return `(new_observation, reward, is_done, info)`\n", + "The three main methods of this environment are:\n", + "* `reset()`: resets an environment to the initial state, _return first observation_\n", + "* `render()`: shows the current environment state (a more colorful version :) )\n", + "* `step(a)`: commits an action `a` and returns `(new_observation, reward, is_done, info)`\n", " * `new_observation`: an observation right after committing the action `a`\n", - " * `reward`: a number representing your reward for committing action `a`\n", - " * `is_done`: True if the MDP has just finished, False if still in progress\n", + " * `reward`: a number which represents your reward for committing action `a`\n", + " * `is_done`: True if the MDP has just finished, False if it is still in progress\n", " * `info`: some auxiliary stuff about what just happened. For now, ignore it." 
] }, @@ -94,7 +94,7 @@ "obs0 = env.reset()\n", "print(\"initial observation code:\", obs0)\n", "\n", - "# Note: in MountainCar, observation is just two numbers: car position and velocity" + "# Note: in MountainCar, an observation is just two numbers: car position and velocity" ] }, { @@ -110,7 +110,7 @@ "print(\"reward:\", reward)\n", "print(\"is game over?:\", is_done)\n", "\n", - "# Note: as you can see, the car has moved to the right slightly (around 0.0005)" + "# Note: as you can see, the car has moved slightly to the right (around 0.0005)" ] }, { @@ -119,7 +119,7 @@ "source": [ "### Play with it\n", "\n", - "Below is the code that drives the car to the right. However, if you simply use the default policy, the car will not reach the flag at the far right due to gravity.\n", + "Below is the code that drives the car to the right. However, if you simply use the default policy, the car won't reach the flag at the far right due to the gravity.\n", "\n", "__Your task__ is to fix it. Find a strategy that reaches the flag. \n", "\n", @@ -151,14 +151,14 @@ "source": [ "def policy(obs, t):\n", " # Write the code for your policy here. You can use the observation\n", - " # (a tuple of position and velocity), the current time step, or both,\n", + " # (a tuple of the position and the velocity), the current time step, or both,\n", " # if you want.\n", " position, velocity = obs\n", " \n", - " # This is an example policy. You can try running it, but it will not work.\n", + " # This is an example policy. You can try running it, but it won't work.\n", " # Your goal is to fix that. You don't need anything sophisticated here,\n", " # and you can hard-code any policy that seems to work.\n", - " # Hint: think how you would make a swing go farther and faster.\n", + " # Hint: think how you would make a swing go faster and faster.\n", " return actions['right']" ] }, @@ -178,7 +178,7 @@ " action = policy(obs, t) # Call your policy\n", " obs, reward, done, _ = env.step(action) # Pass the action chosen by the policy to the environment\n", " \n", - " # We don't do anything with reward here because MountainCar is a very simple environment,\n", + " # We won't do anything with reward here because MountainCar is a very simple environment,\n", " # and reward is a constant -1. Therefore, your goal is to end the episode as quickly as possible.\n", "\n", " # Draw game image on display.\n", @@ -214,5 +214,5 @@ } }, "nbformat": 4, - "nbformat_minor": 1 + "nbformat_minor": 0 } diff --git a/week1_intro/primer/recap_ml.ipynb b/week1_intro/primer/recap_ml.ipynb index 920699541..c67c789c3 100644 --- a/week1_intro/primer/recap_ml.ipynb +++ b/week1_intro/primer/recap_ml.ipynb @@ -14,7 +14,7 @@ "\n", "---\n", "\n", - "This notebook is going to teach you to use the basic data science stack for Python: Jupyter, Numpy, matplotlib, and sklearn." + "This notebook will teach you how to use the basic data science stack in Python: Jupyter, Numpy, matplotlib and sklearn." ] }, { @@ -23,13 +23,13 @@ "source": [ "### Part I: Jupyter notebooks in a nutshell\n", "* You are reading this line in a jupyter notebook.\n", - "* A notebook consists of cells. A cell can contain either code or hypertext. \n", - " * This cell contains hypertext. The next cell contains code.\n", - "* You can __run a cell__ with code by selecting it (click) and pressing `Ctrl + Enter` to execute the code and display output(if any).\n", - "* If you're running this on a device with no keyboard, ~~you are doing it wrong~~ use the top bar (esp. 
play/stop/restart buttons) to run code.\n", - "* Behind the curtains, there's a Python interpreter that runs that code and remembers anything you defined.\n", + "* A notebook consists of cells, which can contain either code or hypertext. \n", + " * This cell contains hypertext, while the next one contains code.\n", + "* You can __run a cell__ with code by selecting it (click) and pressing `Ctrl + Enter` to execute the code and display the output (if any).\n", + "* If you're running this on a device with no keyboard, ~~you are doing it wrong~~ use the top bar (esp. play/stop/restart buttons) to run the code.\n", + "* Behind the curtains, there's a Python interpreter, that runs the code and remembers everything, that was defined.\n", "\n", - "Run these cells to get started" + "Run these cells to get started:" ] }, { @@ -62,9 +62,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "* `Ctrl + S` to save changes (or use the button that looks like a floppy disk)\n", - "* Top menu → Kernel → Interrupt (or Stop button) if you want it to stop running cell midway.\n", - "* Top menu → Kernel → Restart (or cyclic arrow button) if interrupt doesn't fix the problem (you will lose all variables).\n", + "* Press `Ctrl + S` to save changes (or use the button, that looks like a floppy disk)\n", + "* Use top menu → Kernel → Interrupt (or Stop button), if you want to stop running the cell midway.\n", + "* Use top menu → Kernel → Restart (or cyclic arrow button), if interruption doesn't fix the problem (be aware, you will lose all defined variables).\n", "* For shortcut junkies like us: Top menu → Help → Keyboard Shortcuts\n", "\n", "\n", @@ -72,7 +72,7 @@ "\n", "Now __the most important feature__ of jupyter notebooks for this course: \n", "* if you're typing something, press `Tab` to see automatic suggestions, use arrow keys + enter to pick one.\n", - "* if you move your cursor inside some function and press `Shift + Tab`, you'll get a help window. `Shift + (Tab , Tab)` (press `Tab` twice) will expand it." + "* if you move your cursor inside a function and press `Shift + Tab`, a help window will appear. `Shift + (Tab, Tab)` (press `Tab` twice) expands it." ] }, { @@ -92,7 +92,7 @@ "outputs": [], "source": [ "# place your cursor at the end of the unfinished line below to find a function\n", - "# that computes arctangent from two parameters (should have 2 in it's name)\n", + "# that computes arctangent from two parameters (has '2' in it's name)\n", "# once you chose it, press shift + tab + tab(again) to see the docs\n", "\n", "math.a # <---" @@ -103,9 +103,9 @@ "metadata": {}, "source": [ "### Part II: Loading data with Pandas\n", - "Pandas is a library that helps you load the data, prepare it and perform some lightweight analysis. The god object here is the `pandas.DataFrame` - a 2D table with batteries included. \n", + "Pandas is the library that helps you to load the data, prepare it and perform some lightweight analysis. The god object here is the `pandas.DataFrame` - a 2D table with included batteries. 
\n", "\n", - "In the cells below we use it to read the data on the infamous titanic shipwreck.\n", + "We use it in the cells below to read the data on the infamous titanic shipwreck.\n", "\n", "__please keep running all the code cells as you read__" ] @@ -116,8 +116,8 @@ "metadata": {}, "outputs": [], "source": [ - "# If you are running in Google Colab, this cell will download the dataset from our repository.\n", - "# Otherwise, this cell will do nothing.\n", + "# In Google Colab this cell will download the dataset from our repository.\n", + "# Otherwise, this cell won't do anything.\n", "\n", "import sys\n", "if 'google.colab' in sys.modules:\n", @@ -131,7 +131,7 @@ "outputs": [], "source": [ "import pandas as pd\n", - "# this yields a pandas.DataFrame\n", + "# this creates a pandas.DataFrame\n", "data = pd.read_csv(\"train.csv\", index_col='PassengerId')" ] }, @@ -375,7 +375,9 @@ ] }, "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [] + }, "output_type": "execute_result" } ], @@ -391,17 +393,17 @@ "metadata": {}, "source": [ "#### About the data\n", - "Here's some of the columns\n", - "* Name - a string with person's full name\n", - "* Survived - 1 if a person survived the shipwreck, 0 otherwise.\n", - "* Pclass - passenger class. Pclass == 3 is cheap'n'cheerful, Pclass == 1 is for moneybags.\n", - "* Sex - a person's gender (in those good ol' times when there were just 2 of them)\n", - "* Age - age in years, if available\n", - "* Sibsp - number of siblings on a ship\n", - "* Parch - number of parents on a ship\n", - "* Fare - ticket cost\n", - "* Embarked - port where the passenger embarked\n", - " * C = Cherbourg; Q = Queenstown; S = Southampton" + "Here are some of the columns:\n", + "* Name - a string with person's full name;\n", + "* Survived - 1 - if a person survived the shipwreck, 0 - otherwise;\n", + "* Pclass - passenger class. Pclass == 3 is cheap'n'cheerful, Pclass == 1 is for moneybags;\n", + "* Sex - a person's gender (in those good ol' times, when there were just 2 of them);\n", + "* Age - age in years, if available;\n", + "* Sibsp - number of siblings on a ship;\n", + "* Parch - number of parents on a ship;\n", + "* Fare - ticket cost;\n", + "* Embarked - port, where the passenger embarked;\n", + " * C = Cherbourg; Q = Queenstown; S = Southampton." ] }, { @@ -507,7 +509,7 @@ } ], "source": [ - "# select a single column.\n", + "# select a single column\n", "ages = data[\"Age\"]\n", "print(ages[:10]) # alternatively: data.Age" ] @@ -594,7 +596,9 @@ ] }, "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [] + }, "output_type": "execute_result" } ], @@ -617,7 +621,7 @@ "metadata": {}, "outputs": [], "source": [ - "# Select passengers number 13 and 666 (with these PassengerId values). Did they survive?\n", + "# Select passengers under number 13 and 666 (with these PassengerId values). Did they survive?\n", "\n", "" ] @@ -644,7 +648,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Pandas also has some basic data analysis tools. For one, you can quickly display statistical aggregates for each column using `.describe()`" + "Pandas also has some basic data analysis tools. For one, you can quickly display statistical aggregates for each column using `.describe()`." ] }, { @@ -660,9 +664,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Some columns contain __NaN__ values - this means that there is no data there. For example, passenger `#6` has unknown age. 
To simplify the future data analysis, we'll replace NaN values by using pandas `fillna` function.\n", + "Some columns contain __NaN__ values - this means that the data is missing. For example, age of the passenger `#6` is unknown. To simplify the future data analysis, we'll replace all NaN values by using pandas `fillna` function.\n", "\n", - "_Note: we do this so easily because it's a tutorial. In general, you think twice before you modify data like this._" + "_Note: we do this so easy because it's a tutorial. In general, you should think twice before you modify the data like this._" ] }, { @@ -697,9 +701,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "More pandas: \n", - "* A neat [tutorial](http://pandas.pydata.org/) from pydata\n", - "* Official [tutorials](https://pandas.pydata.org/pandas-docs/stable/tutorials.html), including this [10 minutes to pandas](https://pandas.pydata.org/pandas-docs/stable/10min.html#min)\n", + "More on pandas: \n", + "* A neat [tutorial](http://pandas.pydata.org/) from pydata;\n", + "* Official [tutorials](https://pandas.pydata.org/pandas-docs/stable/tutorials.html), including this [10 minutes to pandas](https://pandas.pydata.org/pandas-docs/stable/10min.html#min);\n", "* Bunch of cheat sheets awaits just one google query away from you (e.g. [basics](http://blog.yhat.com/static/img/datacamp-cheat.png), [combining datasets](https://pbs.twimg.com/media/C65MaMpVwAA3v0A.jpg) and so on). " ] }, @@ -709,9 +713,9 @@ "source": [ "### Part III: Numpy and vectorized computing\n", "\n", - "Almost any machine learning model requires some computational heavy lifting usually involving linear algebra problems. Unfortunately, raw Python is terrible at this because each operation is interpreted at runtime. \n", + "Almost any machine learning model requires some computational heavy lifting, usually involving linear algebra. Unfortunately, raw Python is terrible at this, because each operation is interpreted at runtime. \n", "\n", - "So instead, we'll use `numpy` - a library that lets you run blazing fast computation with vectors, matrices and other tensors. Again, the god object here is `numpy.ndarray`:" + "So instead, we'll use `numpy` - the library, that lets you run blazing fast computation with vectors, matrices and other tensors. Again, the god object here is `numpy.ndarray`:" ] }, { @@ -781,8 +785,8 @@ "### How fast is it, Harry?\n", "![img](https://img.buzzfeed.com/buzzfeed-static/static/2015-11/6/7/enhanced/webdr10/enhanced-buzz-22847-1446811476-0.jpg)\n", "\n", - "Let's compare computation time for Python and Numpy\n", - "* Two arrays of $10^6$ elements\n", + "Let's compare the computation time for Python and Numpy:\n", + "* Two arrays of $10^6$ elements:\n", " * first one: from 0 to 1 000 000\n", " * second one: from 99 to 1 000 099\n", " \n", @@ -801,7 +805,7 @@ "outputs": [], "source": [ "%%time\n", - "# ^-- this \"magic\" measures and prints cell computation time\n", + "# ^-- this \"magic\" measures and prints the cell computation time\n", "\n", "# Option I: pure Python\n", "arr_1 = range(1000000)\n", @@ -872,7 +876,7 @@ "---\n", "\n", "\n", - "There's also a bunch of pre-implemented operations including logarithms, trigonometry, vector/matrix products and aggregations." + "There's also a bunch of pre-implemented operations: logarithms, trigonometry, vector/matrix products, aggregations and others." 
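A few of those operations in action, on toy arrays, just to show the calling style:

```python
import numpy as np

x = np.array([1.0, 2.0, 4.0, 8.0])
y = np.array([0.5, 0.5, 0.5, 0.5])

print(np.log(x))                       # elementwise logarithm
print(np.dot(x, y))                    # vector dot product
print(x.mean(), x.max(), x.argmax())   # aggregations: mean, max and index of the max
```
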
] }, { @@ -954,7 +958,7 @@ "metadata": {}, "outputs": [], "source": [ - "# your code: compute mean passenger age and the oldest guy on the ship\n", + "# your code: compute the mean passenger age and define the oldest guy on the ship\n", "" ] }, @@ -2079,7 +2083,9 @@ ] }, "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [] + }, "output_type": "execute_result" } ], @@ -2087,11 +2093,11 @@ "a = np.array([0, 1, 4, 9, 16, 25])\n", "ix = np.array([1, 2, 5])\n", "print(\"a =\", a)\n", - "print(\"Select by element index\")\n", + "print(\"Select by the element index\")\n", "print(\"a[[1,2,5]] =\", a[ix])\n", "\n", "print(\"\\nSelect by boolean mask\")\n", - "# select all elementts in a that are greater than 5\n", + "# select all elements in 'a' that are greater than 5\n", "print(\"a[a > 5] =\", a[a > 5])\n", "print(\"(a % 2 == 0) =\", a % 2 == 0) # True for even, False for odd\n", "print(\"a[a % 2 == 0] =\", a[a % 2 == 0]) # select all elements in a that are even\n", @@ -2106,9 +2112,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Your turn\n", + "It's your turn\n", "\n", - "Use numpy and pandas to answer a few questions about data" + "Use numpy and pandas to answer a few questions about the data" ] }, { @@ -2117,7 +2123,7 @@ "metadata": {}, "outputs": [], "source": [ - "# who on average paid more for their ticket, men or women?\n", + "# who, men or women, paid more on average for their ticket?\n", "\n", "mean_fare_men = \n", "mean_fare_women = \n", @@ -2147,7 +2153,7 @@ "\n", "Using Python to visualize the data is covered by yet another library: matplotlib.\n", "\n", - "Just like Python itself, matplotlib has an awesome tendency of keeping simple things simple while still allowing you to write complicated stuff with convenience (e.g. super-detailed plots or custom animations)." + "Just like Python itself, matplotlib has an awesome tendency of keeping simple things simple, while still allowing you to write complicated stuff with convenience (e.g. super-detailed plots or custom animations)." 
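A tiny preview of the calling style, with labeled axes (the data here is synthetic and has nothing to do with the Titanic dataset):

```python
import numpy as np
import matplotlib.pyplot as plt

values = np.random.normal(size=1000)

plt.hist(values, bins=30)
plt.title("histogram of synthetic data")
plt.xlabel("value")
plt.ylabel("count")
plt.show()
```
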
] }, { @@ -2162,7 +2168,9 @@ ] }, "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [] + }, "output_type": "execute_result" }, { @@ -2172,14 +2180,16 @@ "" ] }, - "metadata": {}, + "metadata": { + "tags": [] + }, "output_type": "display_data" } ], "source": [ "import matplotlib.pyplot as plt\n", "%matplotlib inline\n", - "# ^-- this \"magic\" tells all future matplotlib plots to be drawn inside notebook and not in a separate window.\n", + "# ^-- this \"magic\" tells all future matplotlib plots to be drawn inside a notebook, not in a separate window.\n", "\n", "# line plot\n", "plt.plot([0, 1, 2, 3, 4, 5], [0, 1, 4, 9, 16, 25])" @@ -2197,7 +2207,9 @@ "" ] }, - "metadata": {}, + "metadata": { + "tags": [] + }, "output_type": "display_data" } ], @@ -2205,7 +2217,7 @@ "# scatter-plot\n", "plt.scatter([0, 1, 2, 3, 4, 5], [0, 1, 4, 9, 16, 25])\n", "\n", - "plt.show() # show the first plot and begin drawing next one" + "plt.show() # shows the first plot and begins to draw the next one" ] }, { @@ -2220,7 +2232,9 @@ ] }, "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [] + }, "output_type": "execute_result" }, { @@ -2230,7 +2244,9 @@ "" ] }, - "metadata": {}, + "metadata": { + "tags": [] + }, "output_type": "display_data" } ], @@ -2262,7 +2278,9 @@ "" ] }, - "metadata": {}, + "metadata": { + "tags": [] + }, "output_type": "display_data" }, { @@ -2274,7 +2292,9 @@ ] }, "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [] + }, "output_type": "execute_result" }, { @@ -2284,7 +2304,9 @@ "" ] }, - "metadata": {}, + "metadata": { + "tags": [] + }, "output_type": "display_data" } ], @@ -2303,7 +2325,7 @@ "metadata": {}, "outputs": [], "source": [ - "# plot a histogram of age and a histogram of ticket fares on separate plots\n", + "# plot the histogram of age and the histogram of ticket fares on separate plots\n", "\n", "\n", "\n", @@ -2342,7 +2364,7 @@ "\n", "Scikit-learn is _the_ tool for simple machine learning pipelines. \n", "\n", - "It's a single library that unites a whole bunch of models under the common interface:\n", + "It's a single library that unites a whole bunch of models under a common interface:\n", "* Create: `model = sklearn.whatever.ModelNameHere(parameters_if_any)`\n", "* Train: `model.fit(X, y)`\n", "* Predict: `model.predict(X_test)`\n", @@ -2418,5 +2440,5 @@ } }, "nbformat": 4, - "nbformat_minor": 1 + "nbformat_minor": 0 } diff --git a/week1_intro/primer/recap_pytorch.ipynb b/week1_intro/primer/recap_pytorch.ipynb index d11679fa7..fb0f7c823 100644 --- a/week1_intro/primer/recap_pytorch.ipynb +++ b/week1_intro/primer/recap_pytorch.ipynb @@ -8,11 +8,11 @@ "\n", "![img](https://pytorch.org/tutorials/_static/pytorch-logo-dark.svg)\n", "\n", - "__This notebook__ will teach you to use PyTorch low-level core. If you're running this notebook outside the course environment, you can install it [here](https://pytorch.org).\n", + "__This notebook__ will teach you how to use PyTorch low-level core. If you're running this notebook outside the course environment, you can install it [here](https://pytorch.org).\n", "\n", - "__PyTorch feels__ differently than tensorflow/theano on almost every level. TensorFlow makes your code live in two \"worlds\" simultaneously: symbolic graphs and actual tensors. First you declare a symbolic \"recipe\" of how to get from inputs to outputs, then feed it with actual minibatches of data. 
In PyTorch, __there's only one world__: all tensors have a numeric value.\n", + "__PyTorch feels__ differently than tensorflow/theano at almost every level. TensorFlow makes your code live in two \"worlds\" simultaneously: symbolic graphs and actual tensors. First you declare a symbolic \"recipe\" of how to get from inputs to outputs, then feed it with actual minibatches of data. In PyTorch, __there's only one world__: all tensors have a numeric value.\n", "\n", - "You compute outputs on the fly without pre-declaring anything. The code looks exactly as in pure numpy with one exception: PyTorch computes gradients for you. And can run stuff on GPU. And has a number of pre-implemented building blocks for your neural nets. [And a few more things.](https://medium.com/towards-data-science/pytorch-vs-tensorflow-spotting-the-difference-25c75777377b)\n", + "You compute outputs on the fly without pre-declaring anything. The code looks exactly as in pure numpy with one exception: PyTorch computes gradients for you, can run stuff on GPU and has a number of pre-implemented building blocks for your neural nets. [And a few more things.](https://medium.com/towards-data-science/pytorch-vs-tensorflow-spotting-the-difference-25c75777377b)\n", "\n", "And now we finally shut up and let PyTorch do the talking." ] @@ -159,9 +159,9 @@ "source": [ "## NumPy and PyTorch\n", "\n", - "As you can notice, PyTorch allows you to hack stuff much the same way you did with NumPy. No graph declaration, no placeholders, no sessions. This means that you can _see the numeric value of any tensor at any moment of time_. Debugging such code can be done with by printing tensors or using any debug tool you want (e.g. [PyCharm debugger](https://www.jetbrains.com/help/pycharm/part-1-debugging-python-code.html) or [gdb](https://wiki.python.org/moin/DebuggingWithGdb)).\n", + "As you can notice, PyTorch allows you to hack stuff more or less the same way you did with NumPy. No graph declaration, no placeholders, no sessions. This means that you can _see the numeric value of any tensor at any moment of time_. Debugging such code can be done by printing tensors or using any debug tool you want (e.g. [PyCharm debugger](https://www.jetbrains.com/help/pycharm/part-1-debugging-python-code.html) or [gdb](https://wiki.python.org/moin/DebuggingWithGdb)).\n", "\n", - "You could also notice the a few new method names and a different API. So no, there's no compatibility with NumPy [yet](https://github.com/pytorch/pytorch/issues/2228) and yes, you'll have to memorize all the names again. Get excited!\n", + "You could also notice a few new method names and a different API. So no, there's no compatibility with NumPy [yet](https://github.com/pytorch/pytorch/issues/2228) and yes, you'll need to memorize all the names again. Get excited!\n", "\n", "![img](http://i0.kym-cdn.com/entries/icons/original/000/017/886/download.jpg)\n", "\n", @@ -170,14 +170,14 @@ " * `x.reshape([1,2,8]) -> x.view(1,2,8)`\n", "* You should swap `axis` for `dim` in operations like `mean` or `cumsum`\n", " * `x.sum(axis=-1) -> x.sum(dim=-1)`\n", - "* Most mathematical operations are the same, but types an shaping is different\n", + "* Most mathematical operations are the same, but types and shaping is different\n", " * `x.astype('int64') -> x.type(torch.LongTensor)`\n", "\n", - "To help you acclimatize, there's a [table](https://github.com/torch/torch7/wiki/Torch-for-NumPy-users) covering most new things. 
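A small side-by-side sketch of those conversions (nothing here is specific to this notebook):

```python
import numpy as np
import torch

x_np = np.arange(6).reshape(2, 3).astype('float32')
x_pt = torch.from_numpy(x_np)        # numpy -> torch (shares the underlying memory)

print(x_pt.view(3, 2))               # reshape -> view
print(x_pt.sum(dim=-1))              # axis   -> dim
print(x_pt.type(torch.LongTensor))   # astype -> type
print(x_pt.numpy())                  # and back to numpy
```
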
There's also a neat [documentation page](http://pytorch.org/docs/master/).\n", + "To help you acclimatize, there's a [table](https://github.com/torch/torch7/wiki/Torch-for-NumPy-users) covering most of new things. There's also a neat [documentation page](http://pytorch.org/docs/master/).\n", "\n", - "Finally, if you're stuck with a technical problem, we recommend searching [PyTorch forums](https://discuss.pytorch.org/). Or just googling, which usually works just as efficiently. \n", + "Finally, if you're stuck with a technical problem, we recommend searching [PyTorch forums](https://discuss.pytorch.org/) or googling, which usually works just as efficiently. \n", "\n", - "If you feel like you almost give up, remember two things: __GPU__ and __free gradients__. Besides you can always jump back to NumPy with `x.numpy()`." + "If you feel like you are ready to give up, remember two things: __GPU__ and __free gradients__. Besides you can always jump back to NumPy with `x.numpy()`." ] }, { @@ -215,7 +215,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "If you're done early, try adjusting the formula and seeing how it affects the function." + "If you're done, try adjusting the formula and seeing how it affects the function." ] }, { @@ -231,7 +231,7 @@ "source": [ "## Automatic gradients\n", "\n", - "Any self-respecting DL framework must do your backprop for you. Torch handles this with the `autograd` module.\n", + "Any self-respecting DL framework must do backprop for you. Torch handles this with the `autograd` module.\n", "\n", "The general pipeline looks like this:\n", "* When creating a tensor, you mark it as `requires_grad`:\n", @@ -256,7 +256,9 @@ ] }, "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [] + }, "output_type": "execute_result" }, { @@ -267,7 +269,8 @@ ] }, "metadata": { - "needs_background": "light" + "needs_background": "light", + "tags": [] }, "output_type": "display_data" } @@ -308,7 +311,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The gradients are now stored in `.grad` of those variables that require them." + "The gradients are now stored in `.grad` of the variables that require them." ] }, { @@ -336,7 +339,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "If you compute gradient from multiple losses, the gradients will add up at variables, therefore it's useful to __zero the gradients__ between iteratons." + "If you compute gradient from multiple losses, the gradients will be added to variables, therefore, it's useful to __zero the gradients__ between iteratons." 
] }, { @@ -352,7 +355,8 @@ ] }, "metadata": { - "needs_background": "light" + "needs_background": "light", + "tags": [] }, "output_type": "display_data" }, @@ -379,7 +383,7 @@ " w.grad.data.zero_()\n", " b.grad.data.zero_()\n", "\n", - " # the rest of code is just bells and whistles\n", + " # the rest of the code are just bells and whistles\n", " if (i + 1) % 5 == 0:\n", " clear_output(True)\n", " plt.scatter(x.numpy(), y.numpy())\n", @@ -437,7 +441,9 @@ "" ] }, - "metadata": {}, + "metadata": { + "tags": [] + }, "output_type": "display_data" } ], @@ -501,7 +507,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "There's a vast library of popular layers and architectures already built for ya'.\n", + "There's a detailed library of popular layers and architectures already built for you.\n", "\n", "This is a binary classification problem, so we'll train __Logistic Regression__.\n", "$$P(y_i | X_i) = \\sigma(W \\cdot X_i + b) ={ 1 \\over {1+e^{- [W \\cdot X_i + b]}} }$$\n" @@ -553,7 +559,9 @@ ] }, "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [] + }, "output_type": "execute_result" } ], @@ -562,7 +570,7 @@ "x = torch.tensor(X_train[:3], dtype=torch.float32)\n", "y = torch.tensor(y_train[:3], dtype=torch.float32)\n", "\n", - "# compute outputs given inputs, both are variables\n", + "# compute outputs with given inputs, both are variables\n", "y_predicted = model(x)[:, 0]\n", "\n", "y_predicted # display what we've got" @@ -590,19 +598,19 @@ "loss = \n", "\n", "assert tuple(crossentropy.size()) == (\n", - " 3,), \"Crossentropy must be a vector with element per sample\"\n", + " 3,), \"Crossentropy must be a vector with an element per sample\"\n", "assert tuple(loss.size()) == tuple(\n", "), \"Loss must be scalar. Did you forget the mean/sum?\"\n", - "assert loss.data.numpy() > 0, \"Crossentropy must non-negative, zero only for perfect prediction\"\n", + "assert loss.data.numpy() > 0, \"Crossentropy must be non-negative, zero only for perfect prediction\"\n", "assert loss.data.numpy() <= np.log(\n", - " 3), \"Loss is too large even for untrained model. Please double-check it.\"" + " 3), \"Loss is too large even for an untrained model. Please double-check it.\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "__Note:__ you can also find many such functions in `torch.nn.functional`, just type __`F.`__." + "__Note:__ you can also find many other functions in `torch.nn.functional`, just type __`F.`__." ] }, { @@ -611,7 +619,7 @@ "source": [ "__Torch optimizers__\n", "\n", - "When we trained Linear Regression above, we had to manually `.zero_()` gradients on both our variables. Imagine that code for a 50-layer network.\n", + "When we trained Linear Regression above, we had to manually zero-out (`.zero_()`) gradients of both variables. Imagine this code for a 50-layer network.\n", "\n", "Again, to keep it from getting dirty, there's `torch.optim` module with pre-implemented algorithms:" ] @@ -723,10 +731,10 @@ "source": [ "__Debugging tips:__\n", "* Make sure your model predicts probabilities correctly. Just print them and see what's inside.\n", - "* Don't forget the _minus_ sign in the loss function! It's a mistake 99% people do at some point.\n", + "* Don't forget the _minus_ sign in the loss function! This is the mistake 99% people do at some point.\n", "* Make sure you zero-out gradients after each step. 
Seriously:)\n", - "* In general, PyTorch's error messages are quite helpful, read 'em before you google 'em.\n", - "* if you see nan/inf, print what happens at each iteration to find our where exactly it occurs.\n", + "* In general, PyTorch's error messages are quite helpful, read them before you google them.\n", + "* if you see nan/inf, print what happens at each iteration to find out, where exactly it occurs.\n", " * If loss goes down and then turns nan midway through, try smaller learning rate. (Our current loss formula is unstable)." ] }, @@ -774,8 +782,8 @@ "## More about PyTorch:\n", "* Using torch on GPU and multi-GPU - [link](http://pytorch.org/docs/master/notes/cuda.html)\n", "* More tutorials on PyTorch - [link](http://pytorch.org/tutorials/beginner/deep_learning_60min_blitz.html)\n", - "* PyTorch examples - a repo that implements many cool DL models in PyTorch - [link](https://github.com/pytorch/examples)\n", - "* Practical PyTorch - a repo that implements some... other cool DL models... yes, in PyTorch - [link](https://github.com/spro/practical-pytorch)\n", + "* PyTorch examples - a repo, that implements many cool DL models in PyTorch - [link](https://github.com/pytorch/examples)\n", + "* Practical PyTorch - a repo, that implements some... other cool DL models... yes, in PyTorch - [link](https://github.com/spro/practical-pytorch)\n", "* And some more - [link](https://www.reddit.com/r/pytorch/comments/6z0yeo/pytorch_and_pytorch_tricks_for_kaggle/)\n", "\n", "---" @@ -787,7 +795,7 @@ "source": [ "# Bonus tasks\n", "\n", - "If you get stuck with no progress, try switching to the next task and returning later." + "If you get stuck with no progress, try switching to the next task and returning to them later." ] }, { @@ -1034,7 +1042,9 @@ "" ] }, - "metadata": {}, + "metadata": { + "tags": [] + }, "output_type": "display_data" } ], @@ -1078,7 +1088,7 @@ " - Softmax (exp over sum of exps) can be implemented manually or as `nn.Softmax` (layer) or `F.softmax` (function)\n", " - Probably better to use STOCHASTIC gradient descent (minibatch) for greater speed\n", " - You can also try momentum/rmsprop/adawhatever\n", - " - in which case the dataset should probably be shuffled (or use random subsamples on each iteration)\n", + " - in this case the dataset should probably be shuffled (or use random subsamples on each iteration)\n", "* Add a hidden layer. Now your logistic regression uses hidden neurons instead of inputs.\n", " - Hidden layer uses the same math as output layer (ex-logistic regression), but uses some nonlinearity (e.g. sigmoid) instead of softmax\n", " - You need to train both layers, not just the output layer :)\n", @@ -1086,9 +1096,9 @@ " - In ideal case this totals to 2 `torch.matmul`'s, 1 softmax and 1 ReLU/sigmoid\n", " - __Make sure this neural network works better than logistic regression!__\n", " \n", - "* Now's the time to try improving the network. Consider layers (size, neuron count), nonlinearities, optimization methods, initialization — whatever you want, but please avoid convolutions for now.\n", + "* Now's the time to try improving the network. 
Consider layers (size, neuron count), nonlinearities, optimization methods, initialization — whatever you want, but, please, avoid convolutions for now.\n", " \n", - "* If anything seems wrong, try going through one step of training and printing everything you compute.\n", + "* If something seems wrong, try going through each step of training and print everything you compute.\n", "* If you see NaNs midway through optimization, you can estimate $\\log P(y \\mid x)$ as `F.log_softmax(layer_before_softmax)`." ] } @@ -1100,5 +1110,5 @@ } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 0 } From cd9f5e9ebce6e1404f270f2eed50ad390a53db9d Mon Sep 17 00:00:00 2001 From: mmamedli Date: Thu, 28 Jan 2021 20:07:53 +0300 Subject: [PATCH 2/3] Proofread weeks from 2 to 6 --- week2_model_based/practice_vi.ipynb | 42 +++--- week3_model_free/experience_replay.ipynb | 40 +++--- week3_model_free/qlearning.ipynb | 51 ++++---- week3_model_free/sarsa.ipynb | 24 ++-- week4_approx/dqn_atari_pytorch.ipynb | 122 +++++++++--------- .../practice_approx_qlearning_pytorch.ipynb | 38 +++--- .../practice_actorcritic_pytorch.ipynb | 47 ++++--- .../practice_reinforce_pytorch.ipynb | 36 +++--- week6_outro/bandits.ipynb | 20 +-- week6_outro/practice_mcts.ipynb | 94 +++++++------- week6_outro/seq2seq/practice_pytorch.ipynb | 64 ++++----- 11 files changed, 292 insertions(+), 286 deletions(-) diff --git a/week2_model_based/practice_vi.ipynb b/week2_model_based/practice_vi.ipynb index 9d9c03b6d..973579327 100644 --- a/week2_model_based/practice_vi.ipynb +++ b/week2_model_based/practice_vi.ipynb @@ -6,9 +6,9 @@ "source": [ "### Markov decision process\n", "\n", - "This week's methods are all built to solve __M__arkov __D__ecision __P__rocesses. In the broadest sense, an MDP is defined by how it changes states and how rewards are computed.\n", + "This week methods are all built to solve __M__arkov __D__ecision __P__rocesses. In the broadest sense, the MDP is defined by how it changes the states and how rewards are computed.\n", "\n", - "State transition is defined by $P(s' |s,a)$ - how likely are you to end at state $s'$ if you take action $a$ from state $s$. Now there's more than one way to define rewards, but we'll use $r(s,a,s')$ function for convenience.\n", + "State transition is defined by $P(s' |s,a)$ - how likely you are to end at the state $s'$ if you take an action $a$ from the state $s$. 
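For a feel of what such a process contains, a small MDP with three states and two actions can be written down as nested dictionaries (this encoding is purely an illustration; the `mdp` object used below exposes the same information through its methods):

```python
# transition_probs[s][a][s'] = P(s' | s, a)
transition_probs = {
    's0': {'a0': {'s0': 0.5, 's2': 0.5},
           'a1': {'s2': 1.0}},
    's1': {'a0': {'s0': 0.7, 's1': 0.1, 's2': 0.2},
           'a1': {'s1': 0.95, 's2': 0.05}},
    's2': {'a0': {'s0': 0.4, 's2': 0.6},
           'a1': {'s0': 0.3, 's1': 0.3, 's2': 0.4}},
}
```
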
Now there's more than one way to define rewards, but for convenience we'll use $r(s,a,s')$ function.\n", "\n", "_This notebook is inspired by the awesome_ [CS294](https://github.com/berkeleydeeprlcourse/homework/blob/36a0b58261acde756abd55306fbe63df226bf62b/hw2/HW2.ipynb) _by Berkeley_" ] @@ -39,7 +39,7 @@ " !touch .setup_complete\n", "\n", "# This code creates a virtual display to draw game images on.\n", - "# It will have no effect if your machine has a monitor.\n", + "# It won't have any effect if your machine has a monitor.\n", "if type(os.environ.get(\"DISPLAY\")) is not str or len(os.environ.get(\"DISPLAY\")) == 0:\n", " !bash ../xvfb start\n", " os.environ['DISPLAY'] = ':1'" @@ -78,7 +78,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We can now use MDP just as any other gym environment:" + "We can now use the MDP just as any other gym environment:" ] }, { @@ -105,7 +105,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "but it also has other methods that you'll need for Value Iteration" + "but it also has other methods that you'll need for Value Iteration:" ] }, { @@ -352,7 +352,9 @@ "" ] }, - "metadata": {}, + "metadata": { + "tags": [] + }, "output_type": "display_data" } ], @@ -387,9 +389,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "First, let's write a function to compute the state-action value function $Q^{\\pi}$, defined as follows\n", + "First, let's write a function to compute the state-action value function $Q^{\\pi}$, defined as follows:\n", "\n", - "$$Q_i(s, a) = \\sum_{s'} P(s' | s,a) \\cdot [ r(s,a,s') + \\gamma V_{i}(s')]$$\n" + "$$Q_i(s, a) = \\sum_{s'} P(s' | s,a) \\cdot [ r(s,a,s') + \\gamma V_{i}(s')].$$\n" ] }, { @@ -399,7 +401,7 @@ "outputs": [], "source": [ "def get_action_value(mdp, state_values, state, action, gamma):\n", - " \"\"\" Computes Q(s,a) as in formula above \"\"\"\n", + " \"\"\" Computes Q(s,a) according to the formula above \"\"\"\n", "\n", " \n", "\n", @@ -422,7 +424,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Using $Q(s,a)$ we can now define the \"next\" V(s) for value iteration.\n", + "Using $Q(s,a)$ we now can define the \"next\" V(s) for value iteration.\n", " $$V_{(i+1)}(s) = \\max_a \\sum_{s'} P(s' | s,a) \\cdot [ r(s,a,s') + \\gamma V_{i}(s')] = \\max_a Q_i(s,a)$$" ] }, @@ -433,7 +435,7 @@ "outputs": [], "source": [ "def get_new_state_value(mdp, state_values, state, gamma):\n", - " \"\"\" Computes next V(s) as in formula above. Please do not change state_values in process. \"\"\"\n", + " \"\"\" Computes the next V(s) according to the formula above. Please do not change state_values in process. 
\"\"\"\n", " if mdp.is_terminal(state):\n", " return 0\n", "\n", @@ -470,9 +472,9 @@ "outputs": [], "source": [ "# parameters\n", - "gamma = 0.9 # discount for MDP\n", + "gamma = 0.9 # discount for the MDP\n", "num_iter = 100 # maximum iterations, excluding initialization\n", - "# stop VI if new values are this close to old values (or closer)\n", + "# stop VI if new values are as close to old values (or closer)\n", "min_difference = 0.001\n", "\n", "# initialize V(s)\n", @@ -528,11 +530,11 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now let's use those $V^{*}(s)$ to find optimal actions in each state\n", + "Now let's use those $V^{*}(s)$ to find optimal actions in each state:\n", "\n", - " $$\\pi^*(s) = argmax_a \\sum_{s'} P(s' | s,a) \\cdot [ r(s,a,s') + \\gamma V_{i}(s')] = argmax_a Q_i(s,a)$$\n", + " $$\\pi^*(s) = argmax_a \\sum_{s'} P(s' | s,a) \\cdot [ r(s,a,s') + \\gamma V_{i}(s')] = argmax_a Q_i(s,a).$$\n", " \n", - "The only difference vs V(s) is that here we take not max but argmax: find action such with maximum Q(s,a)." + "The only difference vs V(s) is that here instead of max we take argmax: find the action that leads to the maximum of Q(s,a)." ] }, { @@ -622,7 +624,7 @@ "outputs": [], "source": [ "def value_iteration(mdp, state_values=None, gamma=0.9, num_iter=1000, min_difference=1e-5):\n", - " \"\"\" performs num_iter value iteration steps starting from state_values. Same as before but in a function \"\"\"\n", + " \"\"\" performs num_iter value iteration steps starting from state_values. The same as before but in a function \"\"\"\n", " state_values = state_values or {s: 0 for s in mdp.get_all_states()}\n", " for i in range(num_iter):\n", "\n", @@ -631,7 +633,7 @@ "\n", " assert isinstance(new_state_values, dict)\n", "\n", - " # Compute difference\n", + " # Compute the difference\n", " diff = max(abs(new_state_values[s] - state_values[s])\n", " for s in mdp.get_all_states())\n", "\n", @@ -677,7 +679,7 @@ "source": [ "### Let's visualize!\n", "\n", - "It's usually interesting to see what your algorithm actually learned under the hood. To do so, we'll plot state value functions and optimal actions at each VI step." + "It's usually interesting to see, what your algorithm actually learned under the hood. To do so, we'll plot the state value functions and optimal actions at each VI step." ] }, { @@ -903,5 +905,5 @@ } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 0 } diff --git a/week3_model_free/experience_replay.ipynb b/week3_model_free/experience_replay.ipynb index b73691c10..5aa6a5b75 100644 --- a/week3_model_free/experience_replay.ipynb +++ b/week3_model_free/experience_replay.ipynb @@ -6,9 +6,9 @@ "source": [ "### Honor Track: experience replay\n", "\n", - "There's a powerful technique that you can use to improve sample efficiency for off-policy algorithms: [spoiler] Experience replay :)\n", + "There's a powerful technique that you can use to improve the sample efficiency for off-policy algorithms: [spoiler] Experience replay :)\n", "\n", - "The catch is that you can train Q-learning and EV-SARSA on `` tuples even if they aren't sampled under current agent's policy. So here's what we're gonna do:\n", + "The catch is that you can train Q-learning and EV-SARSA on `` tuples even if they aren't sampled under the current agent's policy. So here's what we're gonna do:\n", "\n", "\n", "\n", @@ -16,11 +16,11 @@ "1. Play game, sample ``.\n", "2. Update q-values based on ``.\n", "3. Store `` transition in a buffer. \n", - " 3. 
If buffer is full, delete earliest data.\n", - "4. Sample K such transitions from that buffer and update q-values based on them.\n", + " 3. If buffer is full, delete the earliest data.\n", + "4. Sample K such transitions from that buffer and update the q-values based on them.\n", "\n", "\n", - "To enable such training, first we must implement a memory structure that would act like such a buffer." + "To enable such training, first, we must implement a memory structure, that would act as this buffer." ] }, { @@ -39,7 +39,7 @@ " !touch .setup_complete\n", "\n", "# This code creates a virtual display to draw game images on.\n", - "# It will have no effect if your machine has a monitor.\n", + "# It won't have any effect if your machine has a monitor.\n", "if type(os.environ.get(\"DISPLAY\")) is not str or len(os.environ.get(\"DISPLAY\")) == 0:\n", " !bash ../xvfb start\n", " os.environ['DISPLAY'] = ':1'" @@ -64,7 +64,7 @@ "metadata": {}, "outputs": [], "source": [ - "" + "" ] }, { @@ -83,12 +83,12 @@ " Parameters\n", " ----------\n", " size: int\n", - " Max number of transitions to store in the buffer. When the buffer\n", - " overflows the old memories are dropped.\n", + " Max number of transitions to store in the buffer. When the buffer is\n", + " overflowed, the old memories are dropped.\n", "\n", " Note: for this assignment you can pick any data structure you want.\n", " If you want to keep it simple, you can store a list of tuples of (s, a, r, s') in self._storage\n", - " However you may find out there are faster and/or more memory-efficient ways to do so.\n", + " However you may find, that there are faster and/or more memory-efficient ways to do so.\n", " \"\"\"\n", " self._storage = []\n", " self._maxsize = size\n", @@ -101,7 +101,7 @@ " def add(self, obs_t, action, reward, obs_tp1, done):\n", " '''\n", " Make sure, _storage will not exceed _maxsize. \n", - " Make sure, FIFO rule is being followed: the oldest examples has to be removed earlier\n", + " Make sure, FIFO rule is being followed: the oldest examples have to be removed earlier\n", " '''\n", " data = (obs_t, action, reward, obs_tp1, done)\n", "\n", @@ -121,9 +121,9 @@ " act_batch: np.array\n", " batch of actions executed given obs_batch\n", " rew_batch: np.array\n", - " rewards received as results of executing act_batch\n", + " rewards received as the results of executing act_batch\n", " next_obs_batch: np.array\n", - " next set of observations seen after executing act_batch\n", + " next set of observations, seen after executing act_batch\n", " done_mask: np.array\n", " done_mask[i] = 1 if executing act_batch[i] resulted in\n", " the end of an episode and 0 otherwise.\n", @@ -184,7 +184,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now let's use this buffer to improve training:" + "Now let's use this buffer to improve the training:" ] }, { @@ -212,7 +212,7 @@ " - train agent using agent.update(...) 
whenever possible\n", " - return total reward\n", " :param replay: ReplayBuffer where agent can store and sample (s,a,r,s',done) tuples.\n", - " If None, do not use experience replay\n", + " If None, do not use an experience replay\n", " \"\"\"\n", " total_reward = 0.0\n", " s = env.reset()\n", @@ -231,7 +231,7 @@ " \n", "\n", " # sample replay_batch_size random transitions from replay,\n", - " # then update agent on each of them in a loop\n", + " # then update the agent on each of them in a loop\n", " s_, a_, r_, next_s_, done_ = replay.sample(replay_batch_size)\n", " for i in range(replay_batch_size):\n", " \n", @@ -250,7 +250,7 @@ "metadata": {}, "outputs": [], "source": [ - "# Create two agents: first will use experience replay, second will not.\n", + "# Create two agents: first will use the experience replay, second will not.\n", "\n", "agent_baseline = QLearningAgent(\n", " alpha=0.5, epsilon=0.25, discount=0.99,\n", @@ -326,11 +326,11 @@ "\n", "### Outro\n", "\n", - "We will use the code you just wrote extensively in the next week of our course. If you're feeling that you need more examples to understand how experience replay works, try using it for binarized state spaces (CartPole or other __[classic control envs](https://gym.openai.com/envs/#classic_control)__).\n", + "We will use the code you just wrote extensively in the next week of our course. If you're feeling, that you need more examples to understand how the experience replay works, try using it for binarized state spaces (CartPole or other __[classic control envs](https://gym.openai.com/envs/#classic_control)__).\n", "\n", "__Next week__ we're gonna explore how q-learning and similar algorithms can be applied for large state spaces, with deep learning models to approximate the Q function.\n", "\n", - "However, __the code you've written__ for this week is already capable of solving many RL problems, and as an added benifit - it is very easy to detach. You can use Q-learning, SARSA and Experience Replay for any RL problems you want to solve - just thow 'em into a file and import the stuff you need." + "However, __the code you've written__ this week is already capable to solve many RL problems, and as an added benifit - it is very easy to detach. You can use Q-learning, SARSA and Experience Replay for any RL problems you want to solve - just throw them into a file and import the stuff you need." ] } ], @@ -341,5 +341,5 @@ } }, "nbformat": 4, - "nbformat_minor": 1 + "nbformat_minor": 0 } diff --git a/week3_model_free/qlearning.ipynb b/week3_model_free/qlearning.ipynb index 75b718f70..6b5e0d15d 100644 --- a/week3_model_free/qlearning.ipynb +++ b/week3_model_free/qlearning.ipynb @@ -6,9 +6,9 @@ "source": [ "## Q-learning\n", "\n", - "This notebook will guide you through implementation of vanilla Q-learning algorithm.\n", + "This notebook will guide you through the implementation of vanilla Q-learning algorithm.\n", "\n", - "You need to implement QLearningAgent (follow instructions for each method) and use it on a number of tests below." + "You need to implement QLearningAgent (follow instructions for each method) and use it in a number of tests below." 
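For reference, the update that "vanilla" Q-learning performs on every observed transition is just

$$Q(s,a) \leftarrow (1 - \alpha) \cdot Q(s,a) + \alpha \cdot \big(r + \gamma \max_{a'} Q(s',a')\big).$$

Below is a minimal standalone sketch of that rule plus epsilon-greedy action selection. It is not the assignment's `QLearningAgent` template: the `qvalues` dictionary layout, the helper names (`q_update`, `epsilon_greedy`) and the `alpha`/`discount`/`epsilon` defaults are illustrative assumptions only.

```python
from collections import defaultdict
import random

# q-values default to 0 for unseen (state, action) pairs
qvalues = defaultdict(lambda: defaultdict(float))

def q_update(s, a, r, next_s, next_actions, alpha=0.5, discount=0.99):
    """One vanilla Q-learning step on a single (s, a, r, s') transition."""
    # value of the next state = best q-value over the actions available there
    next_v = max((qvalues[next_s][a2] for a2 in next_actions), default=0.0)
    target = r + discount * next_v
    # move the old estimate towards the TD target by a fraction alpha
    qvalues[s][a] = (1 - alpha) * qvalues[s][a] + alpha * target

def epsilon_greedy(s, actions, epsilon=0.1):
    """Random action with probability epsilon, otherwise the greedy one."""
    if random.random() < epsilon:
        return random.choice(actions)
    return max(actions, key=lambda a: qvalues[s][a])
```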
] }, { @@ -26,8 +26,8 @@ "\n", " !touch .setup_complete\n", "\n", - "# This code creates a virtual display to draw game images on.\n", - "# It will have no effect if your machine has a monitor.\n", + "# This code creates a virtual display for drawing game images on.\n", + "# It won't have any effect if your machine has a monitor.\n", "if type(os.environ.get(\"DISPLAY\")) is not str or len(os.environ.get(\"DISPLAY\")) == 0:\n", " !bash ../xvfb start\n", " os.environ['DISPLAY'] = ':1'" @@ -61,12 +61,12 @@ " \"\"\"\n", " Q-Learning Agent\n", " based on https://inst.eecs.berkeley.edu/~cs188/sp19/projects.html\n", - " Instance variables you have access to\n", + " Instance variables you have access to:\n", " - self.epsilon (exploration prob)\n", " - self.alpha (learning rate)\n", " - self.discount (discount rate aka gamma)\n", "\n", - " Functions you should use\n", + " Functions that you should use:\n", " - self.get_legal_actions(state) {state, hashable -> list of actions, each is hashable}\n", " which returns legal actions for a state\n", " - self.get_qvalue(state,action)\n", @@ -92,7 +92,7 @@ " \"\"\" Sets the Qvalue for [state,action] to the given value \"\"\"\n", " self._qvalues[state][action] = value\n", "\n", - " #---------------------START OF YOUR CODE---------------------#\n", + " #---------------------BEGINNING OF YOUR CODE---------------------#\n", "\n", " def get_value(self, state):\n", " \"\"\"\n", @@ -126,7 +126,7 @@ "\n", " def get_best_action(self, state):\n", " \"\"\"\n", - " Compute the best action to take in a state (using current q-values). \n", + " Compute the best action to take in the state (using current q-values). \n", " \"\"\"\n", " possible_actions = self.get_legal_actions(state)\n", "\n", @@ -145,7 +145,7 @@ " otherwise - the best policy action (self.get_best_action).\n", "\n", " Note: To pick randomly from a list, use random.choice(list). \n", - " To pick True or False with a given probablity, generate uniform number in [0, 1]\n", + " To pick True or False with a given probablity, generate a uniform number in [0, 1]\n", " and compare it with your probability\n", " \"\"\"\n", "\n", @@ -172,7 +172,7 @@ "### Try it on taxi\n", "\n", "Here we use the qlearning agent on taxi env from openai gym.\n", - "You will need to insert a few agent functions here." + "You will need to add a few agent functions here." ] }, { @@ -209,18 +209,18 @@ " This function should \n", " - run a full game, actions given by agent's e-greedy policy\n", " - train agent using agent.update(...) whenever it is possible\n", - " - return total reward\n", + " - return the total reward\n", " \"\"\"\n", " total_reward = 0.0\n", " s = env.reset()\n", "\n", " for t in range(t_max):\n", - " # get agent to pick action given state s.\n", + " # get an agent to pick action given state s.\n", " a = \n", "\n", " next_s, r, done, _ = env.step(a)\n", "\n", - " # train (update) agent for state s\n", + " # train (update) an agent for state s\n", " \n", "\n", " s = next_s\n", @@ -243,7 +243,9 @@ "
" ] }, - "metadata": {}, + "metadata": { + "tags": [] + }, "output_type": "display_data" } ], @@ -285,11 +287,11 @@ "source": [ "# Binarized state spaces\n", "\n", - "Use agent to train efficiently on `CartPole-v0`. This environment has a continuous set of possible states, so you will have to group them into bins somehow.\n", + "Use an agent to train efficiently on `CartPole-v0`. This environment has a continuous set of possible states, so you will have to group them somwhow into bins.\n", "\n", - "The simplest way is to use `round(x, n_digits)` (or `np.round`) to round a real number to a given amount of digits. The tricky part is to get the `n_digits` right for each state to train effectively.\n", + "The simplest way is to use `round(x, n_digits)` (or `np.round`) to round the real number to a given amount of digits. The tricky part is to get the `n_digits` right for each state to train effectively.\n", "\n", - "Note that you don't need to convert state to integers, but to __tuples__ of any kind of values." + "Note that you don't need to convert a state to integers, but to __tuples__ for any kind of values." ] }, { @@ -353,7 +355,8 @@ ] }, "metadata": { - "needs_background": "light" + "needs_background": "light", + "tags": [] }, "output_type": "display_data" } @@ -429,16 +432,16 @@ "source": [ "## Learn binarized policy\n", "\n", - "Now let's train a policy that uses binarized state space.\n", + "Now let's train a policy, that uses binarized state space.\n", "\n", "__Tips:__\n", "\n", - "* Note that increasing the number of digits for one dimension of the observations increases your state space by a factor of $10$.\n", - "* If your binarization is too fine-grained, your agent will take much longer than 10000 steps to converge. You can either increase the number of iterations and reduce epsilon decay or change binarization. In practice we found that this kind of mistake is rather frequent.\n", - "* If your binarization is too coarse, your agent may fail to find the optimal policy. In practice we found that on this particular environment this kind of mistake is rare.\n", + "* Note, that increasing the number of digits for one dimension of the observations increases your state space by a factor of $10$.\n", + "* If your binarization is too fine-grained, your agent will take much longer than 10000 steps to converge. You can either increase the number of iterations and reduce epsilon decay or change binarization. In practice we found, that this mistake is rather frequent.\n", + "* If your binarization is too coarse, your agent may fail to find the optimal policy. In practice we found, that in this particular environment this kind of mistake is rare.\n", "* **Start with a coarse binarization** and make it more fine-grained if that seems necessary.\n", "* Having $10^3$–$10^4$ distinct states is recommended (`len(agent._qvalues)`), but not required.\n", - "* If things don't work without annealing $\\varepsilon$, consider adding that, but make sure that it doesn't go to zero too quickly.\n", + "* If things don't work without annealing $\\varepsilon$, consider adding that, but make sure, that it doesn't go to zero too quickly.\n", "\n", "A reasonable agent should attain an average reward of at least 50." 
] @@ -528,5 +531,5 @@ } }, "nbformat": 4, - "nbformat_minor": 1 + "nbformat_minor": 0 } diff --git a/week3_model_free/sarsa.ipynb b/week3_model_free/sarsa.ipynb index 55cb7202c..5c8494c6d 100644 --- a/week3_model_free/sarsa.ipynb +++ b/week3_model_free/sarsa.ipynb @@ -8,7 +8,7 @@ "\n", "_This notebook builds upon `qlearning.ipynb`, or to be exact your implementation of QLearningAgent._\n", "\n", - "The policy we're gonna use is epsilon-greedy policy, where agent takes optimal action with probability $(1-\\epsilon)$, otherwise samples action at random. Note that agent __can__ occasionally sample optimal action during random sampling by pure chance." + "The policy we're gonna use is epsilon-greedy policy, where an agent takes the optimal action with probability $(1-\\epsilon)$, otherwise samples an action at random. Note, that agent __can__ occasionally samples the optimal action during the random sampling by pure chance." ] }, { @@ -27,7 +27,7 @@ " !touch .setup_complete\n", "\n", "# This code creates a virtual display to draw game images on.\n", - "# It will have no effect if your machine has a monitor.\n", + "# It won't have any effect if your machine has a monitor.\n", "if type(os.environ.get(\"DISPLAY\")) is not str or len(os.environ.get(\"DISPLAY\")) == 0:\n", " !bash ../xvfb start\n", " os.environ['DISPLAY'] = ':1'" @@ -48,7 +48,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "You can copy your `QLearningAgent` implementation from previous notebook." + "You can copy your `QLearningAgent` implementation from the previous notebook." ] }, { @@ -105,8 +105,8 @@ " def get_value(self, state):\n", " \"\"\"\n", " Compute your agent's estimate of V(s) using current q-values\n", - " V(s) = max_over_action Q(state,action) over possible actions.\n", - " Note: please take into account that q-values can be negative.\n", + " V(s) = max_over_action Q(state,action) over all possible actions.\n", + " Note: please take into account, that q-values can be negative.\n", " \"\"\"\n", " possible_actions = self.get_legal_actions(state)\n", "\n", @@ -177,7 +177,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now we gonna implement Expected Value SARSA on top of it." + "Now we gonna to implement Expected Value SARSA on top of it." ] }, { @@ -188,8 +188,8 @@ "source": [ "class EVSarsaAgent(QLearningAgent):\n", " \"\"\" \n", - " An agent that changes some of q-learning functions to implement Expected Value SARSA. \n", - " Note: this demo assumes that your implementation of QLearningAgent.update uses get_value(next_state).\n", + " An agent, that changes some of q-learning functions to implement Expected Value SARSA. \n", + " Note: this demo assumes, that your implementation of QLearningAgent.update uses get_value(next_state).\n", " If it doesn't, please add\n", " def update(self, state, action, reward, next_state):\n", " and implement it for Expected Value SARSA's V(s')\n", @@ -220,7 +220,7 @@ "source": [ "### Cliff World\n", "\n", - "Let's now see how our algorithm compares against q-learning in case where we force agent to explore all the time.\n", + "Let's now see, how our algorithm compares against q-learning in case, where we force an agent to explore all the time.\n", "\n", "\n", "
image by cs188
" @@ -328,7 +328,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Let's now see what did the algorithms learn by visualizing their actions at every state." + "Let's now see, what did the algorithms learn by visualizing their actions at every state." ] }, { @@ -395,7 +395,7 @@ "\n", "Here are some of the things you can do if you feel like it:\n", "\n", - "* Play with epsilon. See learned how policies change if you set epsilon to higher/lower values (e.g. 0.75).\n", + "* Play with an epsilon. See how learned policies change if you set an epsilon to the higher/lower values (e.g. 0.75).\n", "* Expected Value SASRSA for softmax policy:\n", "$$ \\pi(a_i|s) = softmax({Q(s,a_i) \\over \\tau}) = {e ^ {Q(s,a_i)/ \\tau} \\over {\\sum_{a_j} e ^{Q(s,a_j) / \\tau }}} $$\n", "* Implement N-step algorithms and TD($\\lambda$): see [Sutton's book](http://incompleteideas.net/book/bookdraft2018jan1.pdf) chapter 7 and chapter 12.\n", @@ -410,5 +410,5 @@ } }, "nbformat": 4, - "nbformat_minor": 1 + "nbformat_minor": 0 } diff --git a/week4_approx/dqn_atari_pytorch.ipynb b/week4_approx/dqn_atari_pytorch.ipynb index 8bff8f7db..cf5305745 100644 --- a/week4_approx/dqn_atari_pytorch.ipynb +++ b/week4_approx/dqn_atari_pytorch.ipynb @@ -6,7 +6,7 @@ "source": [ "# Deep Q-Network implementation.\n", "\n", - "This homework shamelessly demands you to implement a DQN - an approximate q-learning algorithm with experience replay and target networks - and see if it works any better this way.\n", + "This homework shamelessly demands you to implement a DQN - an approximate q-learning algorithm with experience replay and target networks - and see if it works better this way.\n", "\n", "Original paper:\n", "https://arxiv.org/pdf/1312.5602.pdf" @@ -40,7 +40,7 @@ " !touch .setup_complete\n", "\n", "# This code creates a virtual display to draw game images on.\n", - "# It will have no effect if your machine has a monitor.\n", + "# It won't have any effect if your machine has a monitor.\n", "if type(os.environ.get(\"DISPLAY\")) is not str or len(os.environ.get(\"DISPLAY\")) == 0:\n", " !bash ../xvfb start\n", " os.environ['DISPLAY'] = ':1'" @@ -84,7 +84,7 @@ "### Let's play some old videogames\n", "![img](https://github.com/yandexdataschool/Practical_RL/raw/master/yet_another_week/_resource/nerd.png)\n", "\n", - "This time we're gonna apply approximate q-learning to an Atari game called Breakout. It's not the hardest thing out there, but it's definitely way more complex than anything we tried before.\n" + "This time we're going to apply the approximate q-learning to an Atari game called Breakout. It's not the hardest thing out there, but it's definitely way more complex, than anything we tried before.\n" ] }, { @@ -150,7 +150,7 @@ "source": [ "**Let's play a little.**\n", "\n", - "Pay attention to zoom and fps args of play function. Control: A, D, space." + "Pay attention to zoom and fps args of the play function. Control: A, D, space." ] }, { @@ -173,7 +173,7 @@ "source": [ "### Processing game image \n", "\n", - "Raw Atari images are large, 210x160x3 by default. However, we don't need that level of detail in order to learn them.\n", + "Raw Atari images are large, 210x160x3 by default. 
However, we don't need that level of detail in order to learn from them.\n", "\n", "We can thus save a lot of time by preprocessing game image, including\n", "* Resizing to a smaller shape, 64 x 64\n", @@ -182,7 +182,7 @@ "\n", "Also please keep one dimension for channel so that final shape would be 1 x 64 x 64.\n", "\n", - "Tip: You can implement your own grayscale converter and assign a huge weight to the red channel. This dirty trick is not necessary but it will speed up learning." + "Tip: You can implement your own grayscale converter and assign a huge weight to the red channel. This dirty trick is not necessary but it will speed up the learning." ] }, { @@ -197,7 +197,7 @@ "\n", "class PreprocessAtariObs(ObservationWrapper):\n", " def __init__(self, env):\n", - " \"\"\"A gym wrapper that crops, scales image into the desired shapes and grayscales it.\"\"\"\n", + " \"\"\"A gym wrapper that crops, scales an image into the desired shapes and grayscales it.\"\"\"\n", " ObservationWrapper.__init__(self, env)\n", "\n", " self.img_size = (1, 64, 64)\n", @@ -348,9 +348,9 @@ "source": [ "### Frame buffer\n", "\n", - "Our agent can only process one observation at a time, so we gotta make sure it contains enough information to find optimal actions. For instance, agent has to react to moving objects so he must be able to measure object's velocity.\n", + "Our agent can only process one observation at a time, so we have to make sure it has enough information to find the optimal actions. For instance, agent has to react to moving objects so he must be able to measure an object's velocity.\n", "\n", - "To do so, we introduce a buffer that stores 4 last images. This time everything is pre-implemented for you, not really by the staff of the course :)" + "To do so, we introduce a buffer, that stores 4 last images. This time everything is pre-implemented for you, not really by the staff of the course :)" ] }, { @@ -408,9 +408,9 @@ "source": [ "### Building a network\n", "\n", - "We now need to build a neural network that can map images to state q-values. This network will be called on every agent's step so it better not be resnet-152 unless you have an array of GPUs. Instead, you can use strided convolutions with a small number of features to save time and memory.\n", + "We now need to build a neural network, that can map images to state q-values. This network will be called on every agent's step so it is better not to be the resnet-152, unless you have an array of GPUs. Instead, you can use strided convolutions with a small number of features to save time and memory.\n", "\n", - "You can build any architecture you want, but for reference, here's something that will more or less work:" + "You can build any architecture you want, but for reference, here's something, that will more or less work:" ] }, { @@ -429,7 +429,7 @@ "import torch\n", "import torch.nn as nn\n", "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n", - "# those who have a GPU but feel unfair to use it can uncomment:\n", + "# those, who have a GPU but feel unfair to use it can uncomment:\n", "# device = torch.device('cpu')\n", "device" ] @@ -464,7 +464,7 @@ " self.n_actions = n_actions\n", " self.state_shape = state_shape\n", "\n", - " # Define your network body here. Please make sure agent is fully contained here\n", + " # Define your network body here. 
Please, make sure agent is fully contained here\n", " # nn.Flatten() can be useful\n", " \n", " \n", @@ -474,7 +474,7 @@ " takes agent's observation (tensor), returns qvalues (tensor)\n", " :param state_t: a batch of 4-frame buffers, shape = [batch_size, 4, h, w]\n", " \"\"\"\n", - " # Use your network to compute qvalues for given state\n", + " # Use your network to compute qvalues for a given state\n", " qvalues = \n", "\n", " assert qvalues.requires_grad, \"qvalues must be a torch tensor with grad\"\n", @@ -518,7 +518,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now let's try out our agent to see if it raises any errors." + "Now let's try out our agent to see if the code raises any errors." ] }, { @@ -528,7 +528,7 @@ "outputs": [], "source": [ "def evaluate(env, agent, n_games=1, greedy=False, t_max=10000):\n", - " \"\"\" Plays n_games full games. If greedy, picks actions as argmax(qvalues). Returns mean reward. \"\"\"\n", + " \"\"\" Plays n_games full games. If greedy, picks actions as argmax(qvalues). Returns the mean reward. \"\"\"\n", " rewards = []\n", " for _ in range(n_games):\n", " s = env.reset()\n", @@ -568,9 +568,9 @@ "metadata": {}, "source": [ "#### The interface is fairly simple:\n", - "* `exp_replay.add(obs, act, rw, next_obs, done)` - saves (s,a,r,s',done) tuple into the buffer\n", + "* `exp_replay.add(obs, act, rw, next_obs, done)` - saves (s,a,r,s',done) tuple into the buffer;\n", "* `exp_replay.sample(batch_size)` - returns observations, actions, rewards, next_observations and is_done for `batch_size` random samples.\n", - "* `len(exp_replay)` - returns number of elements stored in replay buffer." + "* `len(exp_replay)` - returns the number of elements stored in replay buffer." ] }, { @@ -589,7 +589,7 @@ "obs_batch, act_batch, reward_batch, next_obs_batch, is_done_batch = exp_replay.sample(\n", " 5)\n", "\n", - "assert len(exp_replay) == 10, \"experience replay size should be 10 because that's what maximum capacity is\"" + "assert len(exp_replay) == 10, \"experience replay size should be 10 because that's what the maximum capacity is\"" ] }, { @@ -602,11 +602,11 @@ " \"\"\"\n", " Play the game for exactly n steps, record every (s,a,r,s', done) to replay buffer. 
\n", " Whenever game ends, add record with done=True and reset the game.\n", - " It is guaranteed that env has done=False when passed to this function.\n", + " It is guaranteed, that env has done=False when passed to this function.\n", "\n", " PLEASE DO NOT RESET ENV UNLESS IT IS \"DONE\"\n", "\n", - " :returns: return sum of rewards over time and the state in which the env stays\n", + " :returns: return sum of rewards over time and the state, in which the env stays\n", " \"\"\"\n", " s = initial_state\n", " sum_rewards = 0\n", @@ -629,13 +629,13 @@ "state = env.reset()\n", "play_and_record(state, agent, env, exp_replay, n_steps=1000)\n", "\n", - "# if you're using your own experience replay buffer, some of those tests may need correction.\n", - "# just make sure you know what your code does\n", + "# if you're using your own experience replay buffer, some of those tests may need several corrections.\n", + "# just make sure you know, what your code does\n", "assert len(exp_replay) == 1000, \"play_and_record should have added exactly 1000 steps, \"\\\n", " \"but instead added %i\" % len(exp_replay)\n", "is_dones = list(zip(*exp_replay._storage))[-1]\n", "\n", - "assert 0 < np.mean(is_dones) < 0.1, \"Please make sure you restart the game whenever it is 'done' and record the is_done correctly into the buffer.\"\\\n", + "assert 0 < np.mean(is_dones) < 0.1, \"Please, make sure you restart the game whenever it is 'done' and record the is_done correctly into the buffer.\"\\\n", " \"Got %f is_done rate over %i steps. [If you think it's your tough luck, just re-run the test]\" % (\n", " np.mean(is_dones), len(exp_replay))\n", "\n", @@ -665,7 +665,7 @@ "\n", "We also employ the so called \"target network\" - a copy of neural network weights to be used for reference Q-values:\n", "\n", - "The network itself is an exact copy of agent network, but it's parameters are not trained. Instead, they are moved here from agent's actual network every so often.\n", + "The network itself is an exact copy of agent network, but its parameters are not trained. Instead, they are moved here from agent's actual network from time to time.\n", "\n", "$$ Q_{reference}(s,a) = r + \\gamma \\cdot \\max _{a'} Q_{target}(s',a') $$\n", "\n", @@ -679,7 +679,7 @@ "outputs": [], "source": [ "target_network = DQNAgent(agent.state_shape, agent.n_actions, epsilon=0.5).to(device)\n", - "# This is how you can load weights from agent into target network\n", + "# This is how you can load the weights from agent into target network\n", "target_network.load_state_dict(agent.state_dict())" ] }, @@ -703,9 +703,9 @@ "\n", "$$ Q_{reference}(s,a) = r(s,a) + \\gamma \\cdot max_{a'} Q_{target}(s', a') $$\n", "\n", - "Where\n", - "* $Q_{target}(s',a')$ denotes q-value of next state and next action predicted by __target_network__\n", - "* $s, a, r, s'$ are current state, action, reward and next state respectively\n", + "Where:\n", + "* $Q_{target}(s',a')$ denotes q-value of next state and next action predicted by __target_network__;\n", + "* $s, a, r, s'$ are current state, action, reward and next state respectively;\n", "* $\\gamma$ is a discount factor defined two cells above.\n", "\n", "\n", @@ -725,7 +725,7 @@ " gamma=0.99,\n", " check_shapes=False,\n", " device=device):\n", - " \"\"\" Compute td loss using torch operations only. Use the formulae above. \"\"\"\n", + " \"\"\" Compute td loss using torch operations only. Use the formula above. 
\"\"\"\n", " states = torch.tensor(states, device=device, dtype=torch.float) # shape: [batch_size, *state_shape]\n", "\n", " # for some torch reason should not make actions a tensor\n", @@ -756,7 +756,7 @@ " assert next_state_values.dim(\n", " ) == 1 and next_state_values.shape[0] == states.shape[0], \"must predict one value per state\"\n", "\n", - " # compute \"target q-values\" for loss - it's what's inside square parentheses in the above formula.\n", + " # compute \"target q-values\" for loss - that is what's inside the square parentheses in the formula above.\n", " # at the last state use the simplified formula: Q(s,a) = r(s,a) since s' doesn't exist\n", " # you can multiply next state values by is_not_done to achieve this.\n", " target_qvalues_for_actions = \n", @@ -769,7 +769,7 @@ " assert predicted_next_qvalues.data.dim(\n", " ) == 2, \"make sure you predicted q-values for all actions in next state\"\n", " assert next_state_values.data.dim(\n", - " ) == 1, \"make sure you computed V(s') as maximum over just the actions axis and not all axes\"\n", + " ) == 1, \"make sure you computed V(s') as the maximum over just the actions axis and not all axes\"\n", " assert target_qvalues_for_actions.data.dim(\n", " ) == 1, \"there's something wrong with target q-values, they must be a vector\"\n", "\n", @@ -811,7 +811,7 @@ "## Main loop\n", "\n", "\n", - "It's time to put everything together and see if it learns anything." + "It's time to put everything together and see if it learns something." ] }, { @@ -873,8 +873,8 @@ " if not utils.is_enough_ram(min_available_gb=0.1):\n", " print(\"\"\"\n", " Less than 100 Mb RAM available. \n", - " Make sure the buffer size in not too huge.\n", - " Also check, maybe other processes consume RAM heavily.\n", + " Make sure the buffer size in not too large.\n", + " Also check, whether other processes consume RAM heavily.\n", " \"\"\"\n", " )\n", " break\n", @@ -932,7 +932,7 @@ "for step in trange(step, total_steps + 1):\n", " if not utils.is_enough_ram():\n", " print('less that 100 Mb RAM available, freezing')\n", - " print('make sure everythin is ok and make KeyboardInterrupt to continue')\n", + " print('make sure, that everything is ok and make KeyboardInterrupt to continue')\n", " try:\n", " while True:\n", " pass\n", @@ -1005,9 +1005,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Agent is evaluated for 1 life, not for a whole episode of 5 lives. Rewards in evaluation are also truncated. Cuz this is what environment the agent is learning in and in this way mean rewards per life can be compared with initial state value\n", + "Agent is evaluated for 1 life, not for the whole episode of 5 lives. Rewards in evaluation are also truncated. Because this is the environment the agent is learning in and this way the mean rewards per life can be compared to initial state value.\n", "\n", - "**The goal is to get 10 points in the real env**. So 3 or more points in the preprocessed one will probably be enough. You can interrupt learning then." + "**The goal is to get 10 points in the real env**. So 3 or more points in the preprocessed one will probably be enough. You can interrupt the learning then." ] }, { @@ -1038,19 +1038,19 @@ "source": [ "## How to interpret plots:\n", "\n", - "This aint no supervised learning so don't expect anything to improve monotonously. \n", - "* **TD loss** is the MSE between agent's current Q-values and target Q-values. It may slowly increase or decrease, it's ok. 
The \"not ok\" behavior includes going NaN or stayng at exactly zero before agent has perfect performance.\n", - "* **grad norm** just shows the intensivity of training. Not ok is growing to values of about 100 (or maybe even 50) though it depends on network architecture.\n", - "* **mean reward** is the expected sum of r(s,a) agent gets over the full game session. It will oscillate, but on average it should get higher over time (after a few thousand iterations...). \n", + "This is not the supervised learning so don't expect anything to improve monotonously. \n", + "* **TD loss** is the MSE between agent's current Q-values and target Q-values. It may slowly increase or decrease, it's ok. The \"not ok\" behavior includes going NaN or staying at the exact zero before agent has perfect the performance.\n", + "* **grad norm** just shows the intensivity of training. Not ok is growing to values of about 100 (or maybe even 50), though it depends on the network architecture.\n", + "* **mean reward** is the expected sum of r(s,a) agent gets over the full game session. It will oscillate, but on average it should get higher over time (after a few thousand of iterations...). \n", " * In basic q-learning implementation it takes about 40k steps to \"warm up\" agent before it starts to get better.\n", - "* **Initial state V** is the expected discounted reward for episode in the oppinion of the agent. It should behave more smoothly than **mean reward**. It should get higher over time but sometimes can experience drawdowns because of the agaent's overestimates.\n", + "* **Initial state V** is the expected discounted reward for an episode in the oppinion of the agent. It should behave more smoothly than **mean reward**. It should get higher over time, but sometimes can experience drawdowns, because of the agent's overestimates.\n", "* **buffer size** - this one is simple. It should go up and cap at max size.\n", - "* **epsilon** - agent's willingness to explore. If you see that agent's already at 0.01 epsilon before it's average reward is above 0 - it means you need to increase epsilon. Set it back to some 0.2 - 0.5 and decrease the pace at which it goes down.\n", - "* Smoothing of plots is done with a gaussian kernel\n", + "* **epsilon** - agent's willingness to explore. If you see, that agent's already at 0.01 epsilon before its average reward is above 0 - it means, that you need to increase the epsilon. Set it back to 0.2 - 0.5 and decrease the pace, at which it goes down.\n", + "* Smoothing of plots is done with a gaussian kernel.\n", "\n", - "At first your agent will lose quickly. Then it will learn to suck less and at least hit the ball a few times before it loses. Finally it will learn to actually score points.\n", + "At first, your agent will lose quickly. Then it will learn to suck less and at least hit the ball a few times before it loses. Finally, it will learn to actually score some points.\n", "\n", - "**Training will take time.** A lot of it actually. Probably you will not see any improvment during first **150k** time steps (note that by default in this notebook agent is evaluated every 5000 time steps).\n", + "**Training will take time.** A lot of it, actually. 
Probably you will not see any improvment during first **150k** time steps (note, that by default in this notebook agent is evaluated every 5000 time steps).\n", "\n", "But hey, long training time isn't _that_ bad:\n", "![img](https://github.com/yandexdataschool/Practical_RL/raw/master/yet_another_week/_resource/training.png)" @@ -1062,19 +1062,19 @@ "source": [ "## About hyperparameters:\n", "\n", - "The task has something in common with supervised learning: loss is optimized through the buffer (instead of Train dataset). But the distribution of states and actions in the buffer **is not stationary** and depends on the policy that generated it. It can even happen that the mean TD error across the buffer is very low but the performance is extremely poor (imagine the agent collecting data to the buffer always manages to avoid the ball).\n", + "The task has something in common with the supervised learning: loss is optimized through the buffer (instead of Train dataset). But the distribution of states and actions in the buffer **is not stationary** and depends on the policy, that generated it. It can even happen, that the mean TD error across the buffer is very low, but the performance is extremely poor (imagine the agent collects data to the buffer and always manages to avoid the ball).\n", "\n", - "* Total timesteps and training time: It seems to be so huge, but actually it is normal for RL.\n", + "* Total timesteps and training time: It seems to be huge, but actually it is normal for RL.\n", "\n", - "* $\\epsilon$ decay shedule was taken from the original paper and is like traditional for epsilon-greedy policies. At the beginning of the training the agent's greedy policy is poor so many random actions should be taken.\n", + "* $\\epsilon$ decay shedule was taken from the original paper and is traditional for epsilon-greedy policies. At the beginning of the training the agent's greedy policy is poor so many random actions should be made.\n", "\n", - "* Optimizer: In the original paper RMSProp was used (they did not have Adam in 2013) and it can work not worse than Adam. For us Adam was default and it worked.\n", + "* Optimizer: In the original paper RMSProp was used (they did not have Adam in 2013) and it can work as good as Adam. For us Adam was a default and it worked.\n", "\n", "* lr: $10^{-3}$ would probably be too huge\n", "\n", - "* batch size: This one can be very important: if it is too small the agent can fail to learn. Huge batch takes more time to process. If batch of size 8 can not be processed on the hardware you use take 2 (or even 4) batches of size 4, divide the loss on them by 2 (or 4) and make optimization step after both backward() calls in torch.\n", + "* batch size: This one can be very important: if it is too small the agent can fail to learn. Huge batch takes more time to process. If batch of size 8 can not be processed on the hardware you use, take 2 (or even 4) batches of size 4, divide the loss on them by 2 (or 4) and make an optimization step after both backward() calls in torch.\n", "\n", - "* target network update frequency: has something in common with learning rate. Too frequent updates can lead to divergence. Too rare can lead to slow leraning. For millions of total timesteps thousands of inner steps seem ok. One iteration of target network updating is an iteration of the (this time approximate) $\\gamma$-compression that stands behind Q-learning. 
The more inner steps it makes the more accurate is the compression.\n", + "* target network update frequency: has something in common with the learning rate. Too frequent updates can lead to divergence. Too rare can lead to slow learning. For millions of total timesteps thousands of inner steps seem ok. One iteration of target network updating is an iteration of (this time approximate) $\\gamma$-compression, that stands behind Q-learning. The more inner steps it makes the more accurate is the compression.\n", "* max_grad_norm - just huge enough. In torch clip_grad_norm also evaluates the norm before clipping and it can be convenient for logging." ] }, @@ -1217,11 +1217,11 @@ "If you want to play with DQN a bit more, here's a list of things you can try with it:\n", "\n", "### Easy:\n", - "* Implementing __double q-learning__ shouldn't be a problem if you've already have target networks in place.\n", + "* Implementing __double q-learning__ shouldn't be a problem, if you already have target networks in place.\n", " * You will probably need `tf.argmax` to select best actions\n", " * Here's an original [article](https://arxiv.org/abs/1509.06461)\n", "\n", - "* __Dueling__ architecture is also quite straightforward if you have standard DQN.\n", + "* __Dueling__ architecture is also quite straightforward, if you have standard DQN.\n", " * You will need to change network architecture, namely the q-values layer\n", " * It must now contain two heads: V(s) and A(s,a), both dense layers\n", " * You should then add them up via elemwise sum layer.\n", @@ -1234,17 +1234,17 @@ "source": [ "### Hard: Prioritized experience replay\n", "\n", - "In this section, you're invited to implement prioritized experience replay\n", + "In this section, you're invited to implement prioritized experience replay:\n", "\n", - "* You will probably need to provide a custom data structure\n", - "* Once pool.update is called, collect the pool.experience_replay.observations, actions, rewards and is_alive and store them in your data structure\n", + "* You will probably need to provide a custom data structure;\n", + "* Once pool.update is called, collect the pool.experience_replay.observations, actions, rewards and is_alive and store them in your data structure;\n", "* You can now sample such transitions in proportion to the error (see [article](https://arxiv.org/abs/1511.05952)) for training.\n", "\n", "It's probably more convenient to explicitly declare inputs for \"sample observations\", \"sample actions\" and so on to plug them into q-learning.\n", "\n", - "Prioritized (and even normal) experience replay should greatly reduce amount of game sessions you need to play in order to achieve good performance. \n", + "Prioritized (and even normal) experience replay should greatly reduce an amount of game sessions you need to play in order to achieve good performance. \n", "\n", - "While it's effect on runtime is limited for atari, more complicated envs (further in the course) will certainly benefit for it.\n", + "While its effect on runtime is limited for atari, more complicated envs (further in the course) will certainly benefit for it.\n", "\n", "There is even more out there - see this [overview article](https://arxiv.org/abs/1710.02298)." 
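If you go for the double Q-learning bonus above, the only change relative to the TD loss used in this notebook is how the next-state value is formed: the online network chooses the action, the target network evaluates it. A minimal sketch, assuming the `agent` and `target_network` modules from this notebook, q-values of shape `[batch, n_actions]`, and `next_states` already converted to a float tensor on the right device:

```python
import torch

def double_q_next_values(agent, target_network, next_states):
    """Double DQN next-state values: argmax from the online net, value from the target net."""
    with torch.no_grad():
        next_q_online = agent(next_states)           # used only to pick actions
        next_q_target = target_network(next_states)  # used to evaluate them
        best_actions = next_q_online.argmax(dim=1, keepdim=True)              # [batch, 1]
        next_state_values = next_q_target.gather(1, best_actions).squeeze(1)  # [batch]
    return next_state_values  # plug into r + gamma * is_not_done * next_state_values
```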
] @@ -1257,5 +1257,5 @@ } }, "nbformat": 4, - "nbformat_minor": 1 + "nbformat_minor": 0 } diff --git a/week4_approx/practice_approx_qlearning_pytorch.ipynb b/week4_approx/practice_approx_qlearning_pytorch.ipynb index 273861e49..e6f5934fb 100644 --- a/week4_approx/practice_approx_qlearning_pytorch.ipynb +++ b/week4_approx/practice_approx_qlearning_pytorch.ipynb @@ -25,7 +25,7 @@ " !touch .setup_complete\n", "\n", "# This code creates a virtual display to draw game images on.\n", - "# It will have no effect if your machine has a monitor.\n", + "# It won't have any effect if your machine has a monitor.\n", "if type(os.environ.get(\"DISPLAY\")) is not str or len(os.environ.get(\"DISPLAY\")) == 0:\n", " !bash ../xvfb start\n", " os.environ['DISPLAY'] = ':1'" @@ -72,11 +72,11 @@ "\n", "![img](https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/yet_another_week/_resource/qlearning_scheme.png)\n", "\n", - "For your first run, please only use linear layers (`nn.Linear`) and activations. Stuff like batch normalization or dropout may ruin everything if used haphazardly. \n", + "For your first run, please, only use linear layers (`nn.Linear`) and activations. Stuff like batch normalization or dropout may ruin everything if used haphazardly. \n", "\n", - "Also please avoid using nonlinearities like sigmoid & tanh: since agent's observations are not normalized, sigmoids might be saturated at initialization. Instead, use non-saturating nonlinearities like ReLU.\n", + "Also, please, avoid using nonlinearities like sigmoid & tanh: since agent's observations are not normalized, sigmoids might be saturated at initialization. Instead, use non-saturating nonlinearities like ReLU.\n", "\n", - "Ideally you should start small with maybe 1-2 hidden layers with < 200 neurons and then increase network size if agent doesn't beat the target score." + "Ideally you should start small, with maybe 1-2 hidden layers with < 200 neurons and then increase the network size, if agent doesn't beat the target score." ] }, { @@ -132,9 +132,9 @@ "source": [ "s = env.reset()\n", "assert tuple(network(torch.tensor([s]*3, dtype=torch.float32)).size()) == (\n", - " 3, n_actions), \"please make sure your model maps state s -> [Q(s,a0), ..., Q(s, a_last)]\"\n", + " 3, n_actions), \"please, make sure your model maps state s -> [Q(s,a0), ..., Q(s, a_last)]\"\n", "assert isinstance(list(network.modules(\n", - "))[-1], nn.Linear), \"please make sure you predict q-values without nonlinearity (ignore if you know what you're doing)\"\n", + "))[-1], nn.Linear), \"please, make sure you predict q-values without nonlinearity (ignore, if you know, what you're doing)\"\n", "assert isinstance(get_action(\n", " s), int), \"get_action(s) must return int, not %s. try int(action)\" % (type(get_action(s)))\n", "\n", @@ -158,17 +158,17 @@ "source": [ "### Q-learning via gradient descent\n", "\n", - "We shall now train our agent's Q-function by minimizing the TD loss:\n", + "Now we shall train our agent's Q-function by minimizing the TD loss:\n", "$$ L = { 1 \\over N} \\sum_i (Q_{\\theta}(s,a) - [r(s,a) + \\gamma \\cdot max_{a'} Q_{-}(s', a')]) ^2 $$\n", "\n", "\n", "Where\n", - "* $s, a, r, s'$ are current state, action, reward and next state respectively\n", - "* $\\gamma$ is a discount factor defined two cells above.\n", + "* $s, a, r, s'$ are current state, action, reward and next state respectively;\n", + "* $\\gamma$ is the discount factor defined two cells above.\n", "\n", "The tricky part is with $Q_{-}(s',a')$. 
From an engineering standpoint, it's the same as $Q_{\\theta}$ - the output of your neural network policy. However, when doing gradient descent, __we won't propagate gradients through it__ to make training more stable (see lectures).\n", "\n", - "To do so, we shall use `x.detach()` function which basically says \"consider this thing constant when doingbackprop\"." + "To do so, we shall use `x.detach()` function, which basically says \"consider this thing constant when doing backprop\"." ] }, { @@ -215,9 +215,9 @@ "\n", " if check_shapes:\n", " assert predicted_next_qvalues.data.dim(\n", - " ) == 2, \"make sure you predicted q-values for all actions in next state\"\n", + " ) == 2, \"make sure, that you predicted q-values for all actions in next state\"\n", " assert next_state_values.data.dim(\n", - " ) == 1, \"make sure you computed V(s') as maximum over just the actions axis and not all axes\"\n", + " ) == 1, \"make sure, that you computed V(s') as maximum over just the actions axis and not all axes\"\n", " assert target_qvalues_for_actions.data.dim(\n", " ) == 1, \"there's something wrong with target q-values, they must be a vector\"\n", "\n", @@ -265,7 +265,7 @@ "outputs": [], "source": [ "def generate_session(env, t_max=1000, epsilon=0, train=False):\n", - " \"\"\"play env with approximate q-learning agent and train it at the same time\"\"\"\n", + " \"\"\"play env with an approximate q-learning agent and train it at the same time\"\"\"\n", " total_reward = 0\n", " s = env.reset()\n", "\n", @@ -317,15 +317,15 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### How to interpret results\n", + "### How to interpret the results\n", "\n", "\n", - "Welcome to the f.. world of deep f...n reinforcement learning. Don't expect agent's reward to smoothly go up. Hope for it to go increase eventually. If it deems you worthy.\n", + "Welcome to the f.. world of deep f...n reinforcement learning. Don't expect agent's reward to go up smoothly. Hope for it to increase eventually. If it deems you worthy.\n", "\n", "Seriously though,\n", - "* __ mean reward__ is the average reward per game. For a correct implementation it may stay low for some 10 epochs, then start growing while oscilating insanely and converges by ~50-100 steps depending on the network architecture. \n", + "* __ mean reward__ is the average reward per game. For the correct implementation it may stay low for some 10 epochs, then start growing, while oscilating insanely and converges by ~50-100 steps depending on the network architecture. \n", "* If it never reaches target score by the end of for loop, try increasing the number of hidden neurons or look at the epsilon.\n", - "* __ epsilon__ - agent's willingness to explore. If you see that agent's already at < 0.01 epsilon before it's is at least 200, just reset it back to 0.1 - 0.5." + "* __ epsilon__ - agent's willingness to explore. If you see, that agent's already at < 0.01 epsilon before it's is at least 200, just reset it back to 0.1 - 0.5." ] }, { @@ -334,7 +334,7 @@ "source": [ "### Record videos\n", "\n", - "As usual, we now use `gym.wrappers.Monitor` to record a video of our agent playing the game. Unlike our previous attempts with state binarization, this time we expect our agent to act ~~(or fail)~~ more smoothly since there's no more binarization error at play.\n", + "As usual, we now use `gym.wrappers.Monitor` to record the video of our agent playing the game. 
Unlike our previous attempts with state binarization, this time we expect our agent to act ~~(or fail)~~ more smoothly since there's no more binarization error at play.\n", "\n", "As you already did with tabular q-learning, we set epsilon=0 for final evaluation to prevent agent from exploring himself to death." ] @@ -399,5 +399,5 @@ } }, "nbformat": 4, - "nbformat_minor": 1 + "nbformat_minor": 0 } diff --git a/week5_policy_based/practice_actorcritic_pytorch.ipynb b/week5_policy_based/practice_actorcritic_pytorch.ipynb index 419954f19..e8836449a 100644 --- a/week5_policy_based/practice_actorcritic_pytorch.ipynb +++ b/week5_policy_based/practice_actorcritic_pytorch.ipynb @@ -118,9 +118,9 @@ "source": [ "### Build a network\n", "\n", - "We now have to build an agent for actor-critic training — a convolutional neural network that converts states into action probabilities $\\pi$ and state values $V$.\n", + "We now have to build an agent for actor-critic training — a convolutional neural network, that converts states into action probabilities $\\pi$ and state values $V$.\n", "\n", - "Your assignment here is to build and apply a neural network. You can use any framework you want, but in this notebook we prepared for you a template in PyTorch.\n", + "Your assignment here is to build and apply the neural network. You can use any framework you want, but in this notebook we prepared for you a template in PyTorch.\n", "\n", "For starters, we want you to implement this architecture:\n", "\n", @@ -129,7 +129,7 @@ "Notes:\n", "* This diagram was originally made for Tensorflow. In PyTorch, the input shape is `[batch_size, 4, 42, 42]`.\n", "* Use convolution kernel size 3x3 throughout.\n", - "* After your agent gets mean reward above 5000, we encourage you to experiment with model architecture to score even better." + "* After your agent gets mean reward above 5000, we encourage you to experiment with the model architecture to score even better." ] }, { @@ -167,7 +167,7 @@ " Common use case:\n", " cur_layer_img_w = conv2d_size_out(cur_layer_img_w, kernel_size, stride)\n", " cur_layer_img_h = conv2d_size_out(cur_layer_img_h, kernel_size, stride)\n", - " This can be used to understand the shape for dense layer's input.\n", + " This can be used to understand the shape for the dense layer's input.\n", " \"\"\"\n", " return (size - (kernel_size - 1) - 1) // stride + 1" ] @@ -184,7 +184,7 @@ " \n", " self.input_dims = input_dims\n", "\n", - " # Initialize layers as shown in image above\n", + " # Initialize layers as shown in the image above\n", " \n", "\n", " self.optimizer = torch.optim.Adam(self.parameters(), lr=lr)\n", @@ -193,9 +193,9 @@ " \n", " def forward(self, state):\n", " # Compute logits and values using network.\n", - " # Note that if you do so naively, your state_values will have shape\n", + " # Note, that if you do so naively, your state_values will have shape\n", " # ending in 1, since they come from a Linear(..., 1) layer. 
It is useful\n", - " # to .squeeze(dim=-1) them, since this will help avoid shape conflicts\n", + " # to .squeeze(dim=-1) them, since this will help to avoid shape conflicts\n", " # in the loss function part, after we add multiple environments.\n", " # If you don't do this here, don't forget to do that in the\n", " # loss function!\n", @@ -211,7 +211,7 @@ " observation = [observation]\n", "\n", " observation = torch.tensor(observation, dtype=torch.float32, device=device)\n", - " # Pass states into agent network and get back logits and states\n", + " # Pass states into the agent network and get back logits and states\n", " logits, _ = \n", " \n", " policy = F.softmax(logits, dim=-1)\n", @@ -322,7 +322,7 @@ "metadata": {}, "source": [ "### Let's play!\n", - "Let's build a function that measures the agent's average reward." + "Let's build a function, that measures the agent's average reward." ] }, { @@ -332,7 +332,7 @@ "outputs": [], "source": [ "def evaluate(agent, env, n_games=1):\n", - " \"\"\"Plays an a game from start till done, returns per-game rewards \"\"\"\n", + " \"\"\"Plays the game from start till done, returns per-game rewards \"\"\"\n", "\n", " game_rewards = []\n", " for _ in range(n_games):\n", @@ -394,7 +394,7 @@ "![img](https://s7.postimg.cc/4y36s2b2z/env_pool.png)\n", "\n", "\n", - "To make actor-critic training more stable, we shall play several games in parallel. This means ya'll have to initialize several parallel gym envs, send agent's actions there and .reset() each env if it becomes terminated. To minimize learner brain damage, we've taken care of them for ya - just make sure you read it before you use it." + "To make actor-critic training more stable, we shall play several games in parallel. This means you'll have to initialize several parallel gym envs, send agent's actions there and .reset() each env if it becomes terminated. To minimize learner brain damage, we've taken care of them for you - just make sure you read it before you use it." ] }, { @@ -520,7 +520,7 @@ "1. All rewards are multiples of 100, and even an untrained agent can get a score of 800. Therefore, even in the beginning of training, the critic will have to predict pretty large numbers. Neural networks require extra tinkering to output large numbers reliably. In this case, the easiest workaround is just to scale back those numbers.\n", "2. We have already tweaked the hyperparameters (loss coefficients) to work well with this scaling.\n", "\n", - "Please note however that we would not have needed to do this in plain REINFORCE without entropy regularization but with Adam optimizer.\n", + "Please note, that we would not need to do this in plain REINFORCE without entropy regularization but with Adam optimizer.\n", "\n", "In REINFORCE, there is only actor and no critic. Without entropy regularization, actor loss is just policy gradient. It is proportional to rewards, but it only affects the scale of the gradient. However, Adam maintains a running average of the variance of the gradient for each parameter it optimizes, and normalizes the gradient by its variance in each optimization step. This will negate any scaling of the gradient.\n", "\n", @@ -583,18 +583,18 @@ "\n", "__How to interpret plots:__\n", "\n", - "The session reward is the easy thing: it should in general go up over time, but it's okay if it fluctuates ~~like crazy~~. 
It's also OK if it doesn't increase substantially before some 10-20k initial steps, and some people who tried this assignment [told us](https://www.coursera.org/learn/practical-rl/discussions/all/threads/3OnFNVxEEemLZA644RFX2A) they didn't see improvements until around 60k steps. However, if reward reaches zero and doesn't seem to get up over 2-3 evaluations, there's something wrong happening.\n", + "The session reward is the easy thing: it should in general go up over time, but it's okay, if it fluctuates ~~like crazy~~. It's also OK, if it doesn't increase substantially before 10-20k initial steps, and some people, who tried this assignment [told us](https://www.coursera.org/learn/practical-rl/discussions/all/threads/3OnFNVxEEemLZA644RFX2A) they didn't see improvements until approximately 60k steps. However, if reward reaches zero and doesn't seem to get up over 2-3 evaluations, something wrong is happening.\n", "\n", "Since we use a policy-based method, we also keep track of __policy entropy__ — the same one you used as a regularizer. The only important thing about it is that your entropy shouldn't drop too low (`< 0.1`) before your agent gets the yellow belt. Or at least it can drop there, but _it shouldn't stay there for long_.\n", "\n", - "If it does, the culprit is likely:\n", - "* Some bug in entropy computation. Remember that it is $-\\sum p(a_i) \\cdot \\log p(a_i)$.\n", + "If it does, the likely culprit is:\n", + "* Some bug in entropy computation. Remember, that it is $-\\sum p(a_i) \\cdot \\log p(a_i)$.\n", "* Your model architecture is broken in some way: for example, if you create layers in `Agent.symbolic_step()` rather than in `Agent.__init__()`, then effectively you will be training two separate agents: one for `logits, state_values` and another one for `next_logits, next_state_values`.\n", - "* Your architecture is different from the one we suggest and it converges too quickly. Change your architecture or increase entropy coefficient in actor loss. \n", - "* Gradient explosion: just [clip gradients](https://stackoverflow.com/a/43486487) and maybe use a smaller network\n", + "* Your architecture is different from the one we suggest and it converges too quickly. Change your architecture or increase the entropy coefficient in actor loss. \n", + "* Gradient explosion: just [clip gradients](https://stackoverflow.com/a/43486487) and maybe use a smaller network.\n", "* Us. Or TF developers. Or aliens. Or lizardfolk. Contact us on forums before it's too late!\n", "\n", - "If you're debugging, just run `logits, values = agent.step(batch_states)` and manually look into logits and values. This will reveal the problem 9 times out of 10: you'll likely see some NaNs or insanely large numbers or zeros. Try to catch the moment when this happens for the first time and investigate from there." + "If you're debugging, just run `logits, values = agent.step(batch_states)` and manually look into logits and values. This will reveal the problem 9 times out of 10: you are likely to see some NaNs or insanely large numbers or zeros. Try to catch the moment, when this happens for the first time and investigate from there." ] }, { @@ -678,12 +678,12 @@ "Well, 5k reward is [just the beginning](https://www.buzzfeed.com/mattjayyoung/what-the-color-of-your-karate-belt-actually-means-lg3g). Can you get past 200? 
With recurrent neural network memory, chances are you can even beat 400!\n", "\n", "* Try n-step advantage and \"lambda\"-advantage (aka GAE) - see [this article](https://arxiv.org/abs/1506.02438)\n", - " * This change should improve early convergence a lot\n", - "* Try recurrent neural network \n", - " * RNN memory will slow things down initially, but in will reach better final reward at this game\n", + " * This change should improve the early convergence a lot\n", + "* Try recurrent a neural network \n", + " * RNN memory will slow things down initially, but it will reach better final reward at this game\n", "* Implement asynchronuous version\n", " * Remember [A3C](https://arxiv.org/abs/1602.01783)? The first \"A\" stands for asynchronuous. It means there are several parallel actor-learners out there.\n", - " * You can write custom code for synchronization, but we recommend using [redis](https://redis.io/)\n", + " * You can write the custom code for synchronization, but we recommend using [redis](https://redis.io/)\n", " * You can store full parameter set in redis, along with any other metadate\n", " * Here's a _quick_ way to (de)serialize parameters for redis\n", " ```\n", @@ -706,12 +706,11 @@ } ], "metadata": { - "accelerator": "GPU", "language_info": { "name": "python", "pygments_lexer": "ipython3" } }, "nbformat": 4, - "nbformat_minor": 1 + "nbformat_minor": 0 } diff --git a/week5_policy_based/practice_reinforce_pytorch.ipynb b/week5_policy_based/practice_reinforce_pytorch.ipynb index 67f8dd7c8..6588d93a2 100644 --- a/week5_policy_based/practice_reinforce_pytorch.ipynb +++ b/week5_policy_based/practice_reinforce_pytorch.ipynb @@ -27,7 +27,7 @@ " !touch .setup_complete\n", "\n", "# This code creates a virtual display to draw game images on.\n", - "# It will have no effect if your machine has a monitor.\n", + "# It won't have any effect if your machine has a monitor.\n", "if type(os.environ.get(\"DISPLAY\")) is not str or len(os.environ.get(\"DISPLAY\")) == 0:\n", " !bash ../xvfb start\n", " os.environ['DISPLAY'] = ':1'" @@ -82,10 +82,10 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "For REINFORCE algorithm, we'll need a model that predicts action probabilities given states.\n", + "For REINFORCE algorithm, we'll need a model, that predicts action probabilities given states.\n", "\n", - "For numerical stability, please __do not include the softmax layer into your network architecture__.\n", - "We'll use softmax or log-softmax where appropriate." + "For numerical stability, please, __do not include the softmax layer into your network architecture__.\n", + "We'll use softmax or log-softmax, where appropriate." ] }, { @@ -104,7 +104,7 @@ "metadata": {}, "outputs": [], "source": [ - "# Build a simple neural network that predicts policy logits. \n", + "# Build a simple neural network, that predicts policy logits. \n", "# Keep it simple: CartPole isn't worth deep architectures.\n", "model = nn.Sequential(\n", " \n", @@ -115,7 +115,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "#### Predict function" + "#### Prediction function" ] }, { @@ -123,7 +123,7 @@ "metadata": {}, "source": [ "Note: output value of this function is not a torch tensor, it's a numpy array.\n", - "So, here gradient calculation is not needed.\n", + "So, here the gradient calculation is not needed.\n", "
\n", "Use [no_grad](https://pytorch.org/docs/stable/autograd.html#torch.autograd.no_grad)\n", "to suppress gradient calculation.\n", @@ -133,7 +133,7 @@ "With `.detach()` computational graph is built but then disconnected from a particular tensor,\n", "so `.detach()` should be used if that graph is needed for backprop via some other (not detached) tensor;\n", "
\n", - "In contrast, no graph is built by any operation in `no_grad()` context, thus it's preferable here." + "In contrast, no graph is built by any operation in `no_grad()` context, thus, it's preferable here." ] }, { @@ -244,7 +244,7 @@ "outputs": [], "source": [ "def get_cumulative_rewards(rewards, # rewards at each step\n", - " gamma=0.99 # discount for reward\n", + " gamma=0.99 # discount for the reward\n", " ):\n", " \"\"\"\n", " Take a list of immediate rewards r(s,a) for the whole session \n", @@ -287,13 +287,13 @@ "source": [ "#### Loss function and updates\n", "\n", - "We now need to define objective and update over policy gradient.\n", + "We now need to define objective and update over the policy gradient.\n", "\n", - "Our objective function is\n", + "Our objective function is:\n", "\n", "$$ J \\approx { 1 \\over N } \\sum_{s_i,a_i} G(s_i,a_i) $$\n", "\n", - "REINFORCE defines a way to compute the gradient of the expected reward with respect to policy parameters. The formula is as follows:\n", + "REINFORCE defines the way to compute the gradient of the expected reward with respect to policy parameters. The formula is as follows:\n", "\n", "$$ \\nabla_\\theta \\hat J(\\theta) \\approx { 1 \\over N } \\sum_{s_i, a_i} \\nabla_\\theta \\log \\pi_\\theta (a_i \\mid s_i) \\cdot G_t(s_i, a_i) $$\n", "\n", @@ -301,7 +301,7 @@ "\n", "$$ \\hat J(\\theta) \\approx { 1 \\over N } \\sum_{s_i, a_i} \\log \\pi_\\theta (a_i \\mid s_i) \\cdot G_t(s_i, a_i) $$\n", "\n", - "When you compute the gradient of that function with respect to network weights $\\theta$, it will become exactly the policy gradient." + "When you compute the gradient of this function with respect to the network weights $\\theta$, it will become exactly the policy gradient." ] }, { @@ -324,7 +324,7 @@ "metadata": {}, "outputs": [], "source": [ - "# Your code: define optimizers\n", + "# Your code: define the optimizers\n", "optimizer = torch.optim.Adam(model.parameters(), 1e-3)\n", "\n", "\n", @@ -332,7 +332,7 @@ " \"\"\"\n", " Takes a sequence of states, actions and rewards produced by generate_session.\n", " Updates agent's weights by following the policy gradient above.\n", - " Please use Adam optimizer with default parameters.\n", + " Please use Adam optimizer with the default parameters.\n", " \"\"\"\n", "\n", " # cast everything into torch tensors\n", @@ -347,13 +347,13 @@ " log_probs = nn.functional.log_softmax(logits, -1)\n", "\n", " assert all(isinstance(v, torch.Tensor) for v in [logits, probs, log_probs]), \\\n", - " \"please use compute using torch tensors and don't use predict_probs function\"\n", + " \"please use compute with torch tensors and don't use predict_probs function\"\n", "\n", " # select log-probabilities for chosen actions, log pi(a_i|s_i)\n", " log_probs_for_actions = torch.sum(\n", " log_probs * to_one_hot(actions, env.action_space.n), dim=1)\n", " \n", - " # Compute loss here. Don't forgen entropy regularization with `entropy_coef` \n", + " # Compute loss here. 
Don't forget the entropy regularization with `entropy_coef` \n", " entropy = \n", " loss = \n", "\n", @@ -456,5 +456,5 @@ } }, "nbformat": 4, - "nbformat_minor": 1 + "nbformat_minor": 0 } diff --git a/week6_outro/bandits.ipynb b/week6_outro/bandits.ipynb index 6478750ce..aff24bff3 100644 --- a/week6_outro/bandits.ipynb +++ b/week6_outro/bandits.ipynb @@ -16,7 +16,7 @@ " !touch .setup_complete\n", "\n", "# This code creates a virtual display to draw game images on.\n", - "# It will have no effect if your machine has a monitor.\n", + "# It won't have any effect if your machine has a monitor.\n", "if type(os.environ.get(\"DISPLAY\")) is not str or len(os.environ.get(\"DISPLAY\")) == 0:\n", " !bash ../xvfb start\n", " os.environ['DISPLAY'] = ':1'" @@ -49,7 +49,7 @@ "\n", "We are going to implement several exploration strategies for simplest problem - bernoulli bandit.\n", "\n", - "The bandit has $K$ actions. Action produce 1.0 reward $r$ with probability $0 \\le \\theta_k \\le 1$ which is unknown to agent, but fixed over time. Agent's objective is to minimize regret over fixed number $T$ of action selections:\n", + "The bandit has $K$ actions. Action produce 1.0 reward $r$ with probability $0 \\le \\theta_k \\le 1$, which is unknown to the agent, but fixed over time. Agent's objective is to minimize the regret over fixed number $T$ of action selections:\n", "\n", "$$\\rho = T\\theta^* - \\sum_{t=1}^T r_t$$\n", "\n", @@ -57,7 +57,7 @@ "\n", "**Real-world analogy:**\n", "\n", - "Clinical trials - we have $K$ pills and $T$ ill patient. After taking pill, patient is cured with probability $\\theta_k$. Task is to find most efficient pill.\n", + "Clinical trials - we have $K$ pills and ill patient $T$. After taking the pill, a patient is cured with the probability $\\theta_k$. Task is to find the most efficient pill.\n", "\n", "A research on clinical trials - https://arxiv.org/pdf/1507.08025.pdf" ] @@ -152,7 +152,7 @@ "\n", "   **end for** \n", "\n", - "   $x_t \\leftarrow argmax_{k}\\hat\\theta$ with probability $1 - \\epsilon$ or random action with probability $\\epsilon$\n", + "   $x_t \\leftarrow argmax_{k}\\hat\\theta$ with the probability $1 - \\epsilon$ or random action with the probability $\\epsilon$\n", "\n", "   Apply $x_t$ and observe $r_t$\n", "\n", @@ -186,7 +186,7 @@ "metadata": {}, "source": [ "### UCB Agent\n", - "Epsilon-greedy strategy heve no preference for actions. It would be better to select among actions that are uncertain or have potential to be optimal. One can come up with idea of index for each action that represents otimality and uncertainty at the same time. One efficient way to do it is to use UCB1 algorithm:\n", + "Epsilon-greedy strategy heve no preference for actions. It would be better to select among actions, that are uncertain or have potential to be optimal. One can come up with an idea of index for each action, that represents optimality and uncertainty at the same time. 
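Before the UCB1 pseudocode below, here is a self-contained sketch of the ε-greedy rule from the previous section. It deliberately avoids the notebook's agent template (the class and attribute names are made up for illustration), but the bookkeeping is the same: count successes $\alpha_k$ and failures $\beta_k$, estimate $\hat\theta_k$, and act greedily with probability $1 - \epsilon$.

```python
import numpy as np

class SimpleEpsilonGreedy:
    """Minimal epsilon-greedy agent for a Bernoulli bandit (illustrative, not the notebook's template)."""

    def __init__(self, n_actions, epsilon=0.01, seed=0):
        self.epsilon = epsilon
        self._successes = np.zeros(n_actions)   # alpha_k: pulls of arm k that returned reward 1
        self._failures = np.zeros(n_actions)    # beta_k: pulls of arm k that returned reward 0
        self._rng = np.random.default_rng(seed)

    def get_action(self):
        if self._rng.random() < self.epsilon:                    # explore
            return int(self._rng.integers(len(self._successes)))
        pulls = self._successes + self._failures
        theta_hat = self._successes / np.maximum(pulls, 1)       # avoid 0/0 for unseen arms
        return int(np.argmax(theta_hat))                         # exploit

    def update(self, action, reward):
        if reward == 1:
            self._successes[action] += 1
        else:
            self._failures[action] += 1
```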
One efficient way to do it is to use UCB1 algorithm:\n", "\n", "**for** $t = 1,2,...$ **do**\n", "\n", @@ -205,7 +205,7 @@ "\n", "**end for**\n", "\n", - "__Note:__ in practice, one can multiply $\\sqrt{2log\\ t \\ / \\ (\\alpha_k + \\beta_k)}$ by some tunable parameter to regulate agent's optimism and wilingness to abandon non-promising actions.\n", + "__Note:__ in practice, one can multiply $\\sqrt{2log\\ t \\ / \\ (\\alpha_k + \\beta_k)}$ by some tunable parameter to regulate the agent's optimism and wilingness to abandon non-promising actions.\n", "\n", "More versions and optimality analysis - https://homes.di.unimi.it/~cesabian/Pubblicazioni/ml-02.pdf" ] @@ -227,7 +227,7 @@ "source": [ "### Thompson sampling\n", "\n", - "UCB1 algorithm does not take into account actual distribution of rewards. If we know the distribution - we can do much better by using Thompson sampling:\n", + "UCB1 algorithm does not take into account the actual distribution of rewards. If we know the distribution - we can do much better by using Thompson sampling:\n", "\n", "**for** $t = 1,2,...$ **do**\n", "\n", @@ -320,7 +320,9 @@ "" ] }, - "metadata": {}, + "metadata": { + "tags": [] + }, "output_type": "display_data" } ], @@ -362,5 +364,5 @@ } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 0 } diff --git a/week6_outro/practice_mcts.ipynb b/week6_outro/practice_mcts.ipynb index 7706b0c29..275e39a3f 100644 --- a/week6_outro/practice_mcts.ipynb +++ b/week6_outro/practice_mcts.ipynb @@ -6,9 +6,9 @@ "source": [ "## Seminar: Monte-carlo tree search (5 pts)\n", "\n", - "Monte Carlo tree search (MCTS) is a heuristic search algorithm, which shows cool results in challenging domains such as Go and chess. The algorithm builds a search tree, iteratively traverses it, and evaluates its nodes using a Monte-Carlo simulation.\n", + "Monte Carlo tree search (MCTS) is a heuristic search algorithm, which shows cool results in such challenging domains as Go and chess. The algorithm builds a search tree, iteratively traverses it, and evaluates its nodes using a Monte-Carlo simulation.\n", "\n", - "In this seminar, we'll implement a MCTS([[1]](#1), [[2]](#2)) planning and use it to solve some Gym envs.\n", + "In this seminar, we'll implement a MCTS([[1]](#1), [[2]](#2)) planning and use it to solve in some Gym envs.\n", "\n", "![image.png](https://i.postimg.cc/6QmwnjPS/image.png)" ] @@ -21,11 +21,11 @@ "We just start with an empty tree and expand it. There are several common procedures.\n", "\n", "__1) Selection__\n", - "Starting from the root, recursively select the node that corresponds to the tree policy. \n", + "Starting from the root, recursively select the node, that corresponds to the tree policy. \n", "\n", - "There are several options for tree policies, which we saw earlier as exploration strategies: epsilon-greedy, Thomson sampling, UCB-1. It was shown that in MCTS, UCB-1 achieves a good result. Further, we will consider the one, but you can try to use others.\n", + "There are several options for tree policies, which we saw earlier as exploration strategies: epsilon-greedy, Thomson sampling, UCB-1. It was shown, that in MCTS, UCB-1 achieves a good result. 
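As a concrete reference for the exploration strategies just mentioned, here is a hedged sketch of the Thompson sampling step described in the bandit section above (again with illustrative names rather than the course template): sample $\hat\theta_k \sim Beta(\alpha_k + 1, \beta_k + 1)$ for every arm and pull the arm with the largest sample.

```python
import numpy as np

def thompson_action(successes, failures, rng=np.random.default_rng()):
    """One Thompson sampling step for a Bernoulli bandit.

    successes[k] = alpha_k (pulls of arm k that returned 1)
    failures[k]  = beta_k  (pulls of arm k that returned 0)
    """
    # sample a plausible success probability for every arm from its Beta posterior
    theta_samples = rng.beta(np.asarray(successes) + 1, np.asarray(failures) + 1)
    return int(np.argmax(theta_samples))

# arm 1 usually wins with these counts, but arms 0 and 2 still get explored occasionally
print(thompson_action(successes=[1, 8, 2], failures=[5, 2, 1]))
```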
Further, we will consider this one, but you can try to use others.\n", "\n", - "Following the UCB-1 tree policy, we will choose an action that, on one hand, we expect to have the highest return, and on the other hand, we haven't explored much.\n", + "Following the UCB-1 tree policy, we will choose an action, that, on the one hand, is expected to have the highest return, and on the other hand, we haven't explored much.\n", "\n", "$$\n", "\\DeclareMathOperator*{\\argmax}{arg\\,max}\n", @@ -40,17 +40,17 @@ "$$\n", "\n", "where: \n", - "- $N$ - number of times we have visited state $s$,\n", - "- $n_a$ - number of times we have taken action $a$,\n", + "- $N$ - number of times we have visited the state $s$,\n", + "- $n_a$ - number of times we have taken the action $a$,\n", "- $C_p$ - exploration balance parameter, which is performed between exploration and exploitation. \n", "\n", - "Using Hoeffding inequality for rewards $R \\in [0,1]$ it can be shown [[3]](#3) that optimal $C_p = 1/\\sqrt{2}$. For rewards outside this range, the parameter should be tuned. We'll be using 10, but you can experiment with other values.\n", + "Using Hoeffding inequality for rewards $R \\in [0,1]$ it can be shown [[3]](#3), that optimal $C_p = 1/\\sqrt{2}$. For rewards outside this range, the parameter should be tuned. We'll be using 10, but you can experiment with other values.\n", "\n", "__2) Expansion__\n", - "After the selection procedure, we can achieve a leaf node or node in which we don't complete actions. In this case, we expand the tree by feasible actions and get new state nodes. \n", + "After the selection procedure, we can achieve a leaf node or node, in which we don't complete actions. In this case, we expand the tree by feasible actions and get new state nodes. \n", "\n", "__3) Simulation__\n", - "How we can estimate node Q-values? The idea is to estimate action values for a given _rollout policy_ by averaging the return of many simulated trajectories from the current node. Simply, we can play with random or some special policy or use some model that can estimate it.\n", + "How we can estimate node Q-values? The idea is to estimate action values for a given _rollout policy_ by averaging the return of many simulated trajectories from the current node. Simply, we can play with random or some special policy or use a model, that can estimate it.\n", "\n", "__4) Backpropagation__\n", "The reward of the last simulation is backed up through the traversed nodes and propagates Q-value estimations, upwards to the root.\n", @@ -59,7 +59,7 @@ "Q({\\text{parent}}, a) = r + \\gamma \\cdot Q({\\text{child}}, a)\n", "$$\n", "\n", - "There are a lot modifications of MCTS, more details about it you can find in this paper [[4]](#4)" + "There are a lot of modifications of MCTS, more details about it you can find in this paper [[4]](#4)" ] }, { @@ -78,7 +78,7 @@ " !touch .setup_complete\n", "\n", "# This code creates a virtual display to draw game images on.\n", - "# It will have no effect if your machine has a monitor.\n", + "# It won't have any effect, if your machine has a monitor.\n", "if type(os.environ.get(\"DISPLAY\")) is not str or len(os.environ.get(\"DISPLAY\")) == 0:\n", " !bash ../xvfb start\n", " os.environ['DISPLAY'] = ':1'" @@ -101,7 +101,7 @@ "source": [ "---\n", "\n", - "But before we do that, we first need to make a wrapper for Gym environments to allow saving and loading game states to facilitate backtracking." 
+ "But before we do that, we first need to make a wrapper for Gym environments to allow us to save and load the game states to facilitate backtracking." ] }, { @@ -122,7 +122,7 @@ "\n", "class WithSnapshots(Wrapper):\n", " \"\"\"\n", - " Creates a wrapper that supports saving and loading environemnt states.\n", + " Creates a wrapper, that supports saving and loading environemnt states.\n", " Required for planning algorithms.\n", "\n", " This class will have access to the core environment as self.env, e.g.:\n", @@ -134,20 +134,20 @@ " - s = self.reset() # same as self.env.reset()\n", " - s, r, done, _ = self.step(action) # same as self.env.step(action)\n", " \n", - " Note that while you may use self.render(), it will spawn a window that cannot be pickled.\n", - " Thus, you will need to call self.close() before pickling will work again.\n", + " Note, that while you may use self.render(), it will spawn a window, that cannot be pickled.\n", + " Thus, you will need to call self.close() before the pickling will work again.\n", " \"\"\"\n", "\n", " def get_snapshot(self, render=False):\n", " \"\"\"\n", - " :returns: environment state that can be loaded with load_snapshot \n", + " :returns: environment state, that can be loaded with load_snapshot \n", " Snapshots guarantee same env behaviour each time they are loaded.\n", "\n", " Warning! Snapshots can be arbitrary things (strings, integers, json, tuples)\n", - " Don't count on them being pickle strings when implementing MCTS.\n", + " Don't count on them being pickle strings, when implementing MCTS.\n", "\n", " Developer Note: Make sure the object you return will not be affected by \n", - " anything that happens to the environment after it's saved.\n", + " anything, that happens to the environment after it's saved.\n", " You shouldn't, for example, return self.env. \n", " In case of doubt, use pickle.dumps or deepcopy.\n", "\n", @@ -177,14 +177,14 @@ "\n", " def get_result(self, snapshot, action):\n", " \"\"\"\n", - " A convenience function that \n", + " A convenience function that: \n", " - loads snapshot, \n", " - commits action via self.step,\n", " - and takes snapshot again :)\n", "\n", " :returns: next snapshot, next_observation, reward, is_done, info\n", "\n", - " Basically it returns next snapshot and everything that env.step would have returned.\n", + " Basically it returns the next snapshot and everything, that env.step has returned.\n", " \"\"\"\n", "\n", " \n", @@ -203,7 +203,7 @@ "metadata": {}, "source": [ "### Try out snapshots:\n", - "Let`s check our wrapper. At first, reset environment and save it, further randomly play some actions and restore our environment from the snapshot. It should be the same as our previous initial state." + "Let`s check our wrapper. At first, reset environment and save it, then randomly play some actions and restore our environment from the snapshot. It should be the same as our previous initial state." ] }, { @@ -287,7 +287,7 @@ "# MCTS: Monte-Carlo tree search\n", "\n", "\n", - "We will start by implementing the `Node` class - a simple class that acts like MCTS node and supports some of the MCTS algorithm steps.\n", + "We will start by implementing the `Node` class - a simple class, that acts like MCTS node and supports some of the MCTS algorithm steps.\n", "\n", "This MCTS implementation makes some assumptions about the environment, you can find those _in the notes section at the end of the notebook_." 
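Before the `Node` class itself, here is a standalone sketch of the UCB-1 score it relies on, using the same formula as in the selection section above. The class in the next cell wraps this computation around its own visit-count bookkeeping, so treat the function and argument names below as illustrative rather than as the required interface.

```python
import numpy as np

def ucb1(mean_value, n_visits, n_parent_visits, scale=10.0, max_value=1e100):
    """UCB-1 priority of one child node.

    mean_value      - current Q estimate of the child (average of backed-up returns)
    n_visits        - how many times the child itself was visited
    n_parent_visits - how many times its parent was visited (N in the formula above)
    scale           - exploration constant; Hoeffding's bound assumes rewards in [0, scale]
    """
    if n_visits == 0:
        return max_value                                    # unvisited children are tried first
    exploration = np.sqrt(2 * np.log(n_parent_visits) / n_visits)
    return mean_value + scale * exploration

# the rarely visited child outranks the better-looking but well-explored one
print(ucb1(mean_value=1.0, n_visits=2, n_parent_visits=10))   # ~ 1.0 + 10 * 1.52
print(ucb1(mean_value=1.5, n_visits=8, n_parent_visits=10))   # ~ 1.5 + 10 * 0.76
```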
] @@ -311,7 +311,7 @@ " \"\"\"A tree node for MCTS.\n", " \n", " Each Node corresponds to the result of performing a particular action (self.action)\n", - " in a particular state (self.parent), and is essentially one arm in the multi-armed bandit that\n", + " in a particular state (self.parent), and is essentially one arm in the multi-armed bandit, that\n", " we model in that state.\"\"\"\n", "\n", " # metadata:\n", @@ -347,7 +347,7 @@ "\n", " def ucb_score(self, scale=10, max_value=1e100):\n", " \"\"\"\n", - " Computes ucb1 upper bound using current value and visit counts for node and it's parent.\n", + " Computes ucb1 upper bound using the current value and visit counts for node and its parent.\n", "\n", " :param scale: Multiplies upper bound by that. From Hoeffding inequality,\n", " assumes reward range to be [0, scale].\n", @@ -371,7 +371,7 @@ " def select_best_leaf(self):\n", " \"\"\"\n", " Picks the leaf with the highest priority to expand.\n", - " Does so by recursively picking nodes with the best UCB-1 score until it reaches a leaf.\n", + " Does so by recursively picking nodes with the best UCB-1 score until it reaches the leaf.\n", " \"\"\"\n", " if self.is_leaf():\n", " return self\n", @@ -403,9 +403,9 @@ " \"\"\"\n", " Play the game from this state to the end (done) or for t_max steps.\n", "\n", - " On each step, pick action at random (hint: env.action_space.sample()).\n", + " On each step, pick the action at random (hint: env.action_space.sample()).\n", "\n", - " Compute sum of rewards from the current state until the end of the episode.\n", + " Compute the sum of rewards from the current state until the end of the episode.\n", " Note 1: use env.action_space.sample() for picking a random action.\n", " Note 2: if the node is terminal (self.is_done is True), just return self.immediate_reward.\n", "\n", @@ -452,7 +452,7 @@ "class Root(Node):\n", " def __init__(self, snapshot, observation):\n", " \"\"\"\n", - " creates special node that acts like tree root\n", + " creates special node, that acts like tree root\n", " :snapshot: snapshot (from env.get_snapshot) to start planning from\n", " :observation: last environment observation\n", " \"\"\"\n", @@ -502,11 +502,11 @@ " node = \n", "\n", " if node.is_done:\n", - " # All rollouts from a terminal node are empty, and thus have 0 reward.\n", + " # All rollouts from a terminal node are empty, and, thus, have 0 reward.\n", " node.propagate(0)\n", " else:\n", " # Expand the best leaf. Perform a rollout from it. Propagate the results upwards.\n", - " # Note that here you have some leeway in choosing where to propagate from.\n", + " # Note, that here you have some leeway in choosing, where to propagate from.\n", " # Any reasonable choice should work.\n", " \n", " " @@ -591,7 +591,7 @@ " if child != best_child:\n", " child.safe_delete()\n", "\n", - " # declare best child a new root\n", + " # declare best child in a new root\n", " root = Root.from_node(best_child)\n", "\n", " assert not root.is_leaf(), \\\n", @@ -618,7 +618,7 @@ "source": [ "## Bonus assignments (10+pts each)\n", "\n", - "There's a few things you might want to try if you want to dig deeper:\n", + "There are a few things you might want to try if you want to dig deeper:\n", "\n", "### Node selection and expansion\n", "\n", @@ -626,10 +626,10 @@ "\n", "UCB-1 is a weak bound as it relies on a very general bounds (Hoeffding Inequality, to be exact). \n", "* Try playing with the exploration parameter $C_p$. 
The theoretically optimal $C_p$ you can get from a max reward of the environment (max reward for CartPole is 200).\n", - "* Use using a different exploration strategy (bayesian UCB, for example)\n", - "* Expand not all but several random actions per `expand` call. See __the notes below__ for details.\n", + "* Try using a different exploration strategy (bayesian UCB, for example)\n", + "* Expand not all, but several random actions per `expand` call. See __the notes below__ for details.\n", "\n", - "The goal is to find out what gives the optimal performance for `CartPole-v0` for different time budgets (i.e. different n_iter in plan_mcts.)\n", + "The goal is to define, what gives the optimal performance for `CartPole-v0` for different time budgets (i.e. different n_iter in plan_mcts.)\n", "\n", "Evaluate your results on `Acrobot-v1` - do the results change and if so, how can you explain it?\n", "\n", @@ -660,9 +660,9 @@ " \n", "### Integrate learning into planning\n", "\n", - "Planning on each iteration is a costly thing to do. You can speed things up drastically if you train a classifier to predict which action will turn out to be best according to MCTS.\n", + "Planning on each iteration is a costly thing to do. You can speed things up drastically if you train a classifier to predict, which action will turn out to be best according to MCTS.\n", "\n", - "To do so, just record which action did the MCTS agent take on each step and fit something to [state, mcts_optimal_action]\n", + "To do so, just record, which action did the MCTS agent take on each step and fit something to [state, mcts_optimal_action]\n", "* You can also use optimal actions from discarded states to get more (dirty) samples. Just don't forget to fine-tune without them.\n", "* It's also worth a try to use P(best_action|state) from your model to select best nodes in addition to UCB\n", "* If your model is lightweight enough, try using it as a rollout policy.\n", @@ -673,11 +673,11 @@ "* Also consider what [AlphaGo Zero](https://deepmind.com/blog/alphago-zero-learning-scratch/) did in this area.\n", "\n", "### Integrate planning into learning \n", - "_(this will likely take long time, better consider this as side project when all other deadlines are met)_\n", + "_(this will likely take long time, better consider this as side project, when all other deadlines are met)_\n", "\n", - "Incorporate planning into the agent architecture. The goal is to implement [Value Iteration Networks](https://arxiv.org/abs/1602.02867).\n", + "Incorporate planning into the agent's architecture. The goal is to implement [Value Iteration Networks](https://arxiv.org/abs/1602.02867).\n", "\n", - "Remember [week5 assignment](https://github.com/yandexdataschool/Practical_RL/blob/coursera/week5_policy_based/practice_a3c.ipynb)? You will need to switch it into a maze-like game, like MsPacman, and implement a special layer that performs value iteration-like update to a recurrent memory. This can be implemented the same way you did in the POMDP assignment." + "Do you remember [week5 assignment](https://github.com/yandexdataschool/Practical_RL/blob/coursera/week5_policy_based/practice_a3c.ipynb)? You will need to switch it into a maze-like game, like MsPacman, and implement a special layer, that performs value iteration-like update to a recurrent memory. This can be implemented the same way you did in the POMDP assignment." 
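For the "integrate learning into planning" idea above, the bookkeeping can be as small as the sketch below. It assumes you record observations and MCTS-chosen actions inside your own planning loop (those hooks are not part of this notebook), and sklearn's `LogisticRegression` is just one arbitrary choice of cheap classifier.

```python
import numpy as np
from sklearn.linear_model import LogisticRegression

# assumed to be filled inside your own planning loop:
# after each plan_mcts(...) call, store the current observation and the action you took
recorded_states = []    # list of 1D observation arrays
recorded_actions = []   # list of int actions chosen by MCTS

def fit_mcts_imitator(states, actions):
    """Fit a cheap classifier that imitates the MCTS action choice."""
    X = np.stack(states)
    y = np.asarray(actions)
    return LogisticRegression(max_iter=1000).fit(X, y)

# usage sketch, once the lists above have been filled:
# imitator = fit_mcts_imitator(recorded_states, recorded_actions)
# fast_action = imitator.predict(observation[None, :])[0]
```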
] }, { @@ -689,11 +689,11 @@ "\n", "#### Assumptions\n", "\n", - "The full list of assumptions is:\n", + " Here is the full list of assumptions:\n", "\n", "* __Finite number of actions__: we enumerate all actions in `expand`.\n", "* __Episodic (finite) MDP__: while technically it works for infinite MDPs, we perform a rollout for $10^4$ steps. If you are knowingly infinite, please adjust `t_max` to something more reasonable.\n", - "* __Deterministic MDP__: `Node` represents the single outcome of taking `self.action` in `self.parent`, and does not support the situation where taking an action in a state may lead to different rewards and next states.\n", + "* __Deterministic MDP__: `Node` represents the single outcome of taking `self.action` in `self.parent`, and does not support the situation, where taking an action in a state may lead to different rewards and next states.\n", "* __No discounted rewards__: we assume $\\gamma=1$. If that isn't the case, you only need to change two lines in `rollout()` and use `my_qvalue = self.immediate_reward + gamma * child_qvalue` for `propagate()`.\n", "* __pickleable env__: won't work if e.g. your env is connected to a web-browser surfing the internet. For custom envs, you may need to modify get_snapshot/load_snapshot from `WithSnapshots`.\n", "\n", @@ -702,13 +702,13 @@ "This MCTS implementation only selects leaf nodes for expansion.\n", "This doesn't break things down because `expand` adds all possible actions. Hence, all non-leaf nodes are by design fully expanded and shouldn't be selected.\n", "\n", - "If you want to only add a few random action on each expand, you will also have to modify `get_best_leaf` to consider returning non-leafs.\n", + "If you want to add only a few random actions on each expand, you will also have to modify `get_best_leaf` to consider returning non-leafs.\n", "\n", "#### Rollout policy\n", "\n", - "We use a simple uniform policy for rollouts. This introduces a negative bias to good situations that can be messed up completely with random bad action. As a simple example, if you tend to rollout with uniform policy, you better don't use sharp knives and walk near cliffs.\n", + "We use a simple uniform policy for rollouts. This introduces a negative bias to good situations, that can be messed up completely with random bad action. As a simple example, if you tend to rollout with uniform policy, you better don't use sharp knives and walk near cliffs.\n", "\n", - "You can improve that by integrating a reinforcement _learning_ algorithm with a computationally light agent. You can even train this agent on optimal policy found by the tree search.\n", + "You can improve that by integrating a reinforcement _learning_ algorithm with a computationally light agent. 
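For instance, a policy-guided rollout could look roughly like the sketch below. It assumes you have some `policy(observation) -> action_probabilities` callable and the usual Gym `step` interface; that callable is your own agent, not something defined in this notebook.

```python
import numpy as np

def policy_guided_rollout(env, observation, policy, t_max=10**4, rng=np.random.default_rng()):
    """Rollout that samples from a lightweight policy instead of env.action_space.sample().

    `policy(observation)` is assumed to return a probability vector over actions.
    """
    total_reward = 0.0
    for _ in range(t_max):
        probs = policy(observation)
        action = int(rng.choice(len(probs), p=probs))
        observation, reward, done, _ = env.step(action)
        total_reward += reward
        if done:
            break
    return total_reward
```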
You can even train this agent on the optimal policy found by the tree search.\n", "\n", "#### Contributions\n", "* Reusing some code from 5vision [solution for deephack.RL](https://github.com/5vision/uct_atari), code by Mikhail Pavlov\n", @@ -739,5 +739,5 @@ } }, "nbformat": 4, - "nbformat_minor": 1 + "nbformat_minor": 0 } diff --git a/week6_outro/seq2seq/practice_pytorch.ipynb b/week6_outro/seq2seq/practice_pytorch.ipynb index 8d84660b5..c1d0acd25 100644 --- a/week6_outro/seq2seq/practice_pytorch.ipynb +++ b/week6_outro/seq2seq/practice_pytorch.ipynb @@ -10,12 +10,12 @@ "\n", " * word (sequence of letters in source language) -> translation (sequence of letters in target language)\n", "\n", - "Unlike what most deep learning practicioners do, we won't only train it to maximize likelihood of correct translation, but also employ reinforcement learning to actually teach it to translate with as few errors as possible.\n", + "Unlike what most deep learning practicioners do, we won't only train it to maximize likelihood of the correct translation, but also employ reinforcement learning to actually teach it to translate with as few errors as possible.\n", "\n", "\n", "### About the task\n", "\n", - "One notable property of Hebrew is that it's consonant language. That is, there are no vowels in the written language. One could represent vowels with diacritics above consonants, but you don't expect people to do that in everyay life.\n", + "One notable property of Hebrew is that it's consonant language. That is, there are no vowels in the written language. One could represent vowels with diacritics above consonants, but you don't expect people to do that in everyday life.\n", "\n", "Therefore, some hebrew characters will correspond to several english letters and others --- to none, so we should use encoder-decoder architecture to figure that out.\n", "\n", @@ -67,7 +67,7 @@ "MODE = \"he-to-en\"\n", "# maximal length of _generated_ output, does not affect training\n", "MAX_OUTPUT_LENGTH = 50 if not EASY_MODE else 20\n", - "REPORT_FREQ = 100 # how often to evaluate validation score" + "REPORT_FREQ = 100 # how often to evaluate the validation score" ] }, { @@ -76,7 +76,7 @@ "source": [ "## Preprocessing\n", "\n", - "We shall store dataset as a dictionary\n", + "We shall store our dataset as a dictionary\n", "`{ word1:[translation1,translation2,...], word2:[...],...}`.\n", "\n", "This is mostly due to the fact that many words have several correct translations.\n", @@ -153,7 +153,7 @@ "source": [ "## Building vocabularies\n", "\n", - "We now need to build vocabularies that map strings to token ids and vice versa. We're gonna need these fellas when we feed training data into model or convert output matrices into english words." + "We now need to build vocabularies, that map strings to token ids and vice versa. We're gonna need these fellas, when we feed training data into model or convert output matrices into english words." 
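The notebook ships its own vocabulary helper for this, so the sketch below is only an illustration of what such a mapping does; the class name, special tokens and padding convention are made-up assumptions rather than the course interface.

```python
import numpy as np

class MiniVocab:
    """Toy character vocabulary: strings <-> a fixed-size matrix of token ids."""

    def __init__(self, lines, bos="_", eos=";", unk="#"):
        chars = sorted(set("".join(lines)) | {bos, eos, unk})
        self.bos, self.eos, self.unk = bos, eos, unk
        self.token_to_id = {c: i for i, c in enumerate(chars)}
        self.id_to_token = {i: c for c, i in self.token_to_id.items()}

    def to_matrix(self, lines, max_len=None):
        """Encode strings as an int matrix [batch, time], padded with the EOS id."""
        lines = [self.bos + line + self.eos for line in lines]
        max_len = max_len or max(map(len, lines))
        unk_id, eos_id = self.token_to_id[self.unk], self.token_to_id[self.eos]
        matrix = np.full((len(lines), max_len), eos_id, dtype=np.int64)
        for i, line in enumerate(lines):
            ids = [self.token_to_id.get(c, unk_id) for c in line[:max_len]]
            matrix[i, :len(ids)] = ids
        return matrix

    def to_lines(self, matrix):
        """Decode a matrix of ids back into strings, cutting each row at the first EOS."""
        rows = ["".join(self.id_to_token[int(i)] for i in row) for row in matrix]
        return [row[1:].split(self.eos)[0] for row in rows]   # drop BOS, cut at EOS

voc = MiniVocab(["hello", "hola"])
print(voc.to_matrix(["hello", "he"]))
print(voc.to_lines(voc.to_matrix(["hello", "he"])))
```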
] }, { @@ -217,13 +217,13 @@ "source": [ "## Deploy encoder-decoder\n", "\n", - "__The assignment starts here__\n", + "__The assignment starts here:__\n", "\n", "Our architecture consists of two main blocks:\n", "* Encoder reads words character by character and outputs code vector (usually a function of last RNN state)\n", "* Decoder takes that code vector and produces translations character by character\n", "\n", - "Than it gets fed into a model that follows this simple interface:\n", + "Than it gets fed into a model, that follows this simple interface:\n", "* __`model(inp, out, **flags) -> logp`__ - takes symbolic int32 matrices of hebrew words and their english translations. Computes the log-probabilities of all possible english characters given english prefices and hebrew word.\n", "* __`model.translate(inp, **flags) -> out, logp`__ - takes symbolic int32 matrix of hebrew words, produces output tokens sampled from the model and output log-probabilities for all possible tokens at each tick.\n", " * if given flag __`greedy=True`__, takes most likely next token at each iteration. Otherwise samples with next token probabilities predicted by model.\n", @@ -331,12 +331,12 @@ "source": [ "### Scoring function\n", "\n", - "LogLikelihood is a poor estimator of model performance.\n", - "* If we predict zero probability once, it shouldn't ruin entire model.\n", + "LogLikelihood is a poor estimator of the model performance.\n", + "* If we predict zero probability once, it shouldn't ruin the entire model.\n", "* It is enough to learn just one translation if there are several correct ones.\n", - "* What matters is how many mistakes model's gonna make when it translates!\n", + "* What matters, is how many mistakes model's gonna make when it translates!\n", "\n", - "Therefore, we will use minimal Levenshtein distance. It measures how many characters do we need to add/remove/replace from model translation to make it perfect. Alternatively, one could use character-level BLEU/RougeL or other similar metrics.\n", + "Therefore, we will use minimal Levenshtein distance. It measures, how many characters do we need to add/remove/replace from model translation to make it perfect. Alternatively, one could use character-level BLEU/RougeL or other similar metrics.\n", "\n", "The catch here is that Levenshtein distance is not differentiable: it isn't even continuous. We can't train our neural network to maximize it by gradient descent." ] @@ -352,8 +352,8 @@ "\n", "def get_distance(word, trans):\n", " \"\"\"\n", - " A function that takes word and predicted translation\n", - " and evaluates (Levenshtein's) edit distance to closest correct translation\n", + " A function, that takes a word and predicted translation\n", + " and evaluates (Levenshtein's) edit distance to the closest correct translation\n", " \"\"\"\n", " references = word_to_translation[word]\n", " assert len(references) != 0, \"wrong/unknown word\"\n", @@ -361,7 +361,7 @@ "\n", "\n", "def score(words, bsize=100):\n", - " \"\"\"a function that computes levenshtein distance for bsize random samples\"\"\"\n", + " \"\"\"a function, that computes levenshtein distance for bsize random samples\"\"\"\n", " assert isinstance(words, np.ndarray)\n", "\n", " batch_words = np.random.choice(words, size=bsize, replace=False)\n", @@ -388,7 +388,7 @@ "source": [ "## Supervised pre-training\n", "\n", - "Here we define a function that trains our model through maximizing log-likelihood a.k.a. minimizing crossentropy." 
+ "Here we define a function, that trains our model through maximizing log-likelihood a.k.a. minimizing crossentropy." ] }, { @@ -450,7 +450,7 @@ " torch.sum(logprobs_seq *\n", " to_one_hot(reference_answers, len(out_voc)), dim=-1)\n", " assert crossentropy.dim(\n", - " ) == 2, \"please return elementwise crossentropy, don't compute mean just yet\"\n", + " ) == 2, \"please, return elementwise crossentropy, don't compute the mean just yet\"\n", "\n", " # average with mask\n", " mask = infer_mask(reference_answers, out_voc.eos_ix)\n", @@ -544,10 +544,10 @@ "__How to interpret the plots:__\n", "\n", "* __Train loss__ - that's your model's crossentropy over minibatches. It should go down steadily. Most importantly, it shouldn't be NaN :)\n", - "* __Val score distribution__ - distribution of translation edit distance (score) within batch. It should move to the left over time.\n", - "* __Val score / training time__ - it's your current mean edit distance. This plot is much whimsier than loss, but make sure it goes below 8 by 2500 steps. \n", + "* __Val score distribution__ - distribution of translation edit distance (score) within the batch. It should move to the left over time.\n", + "* __Val score / training time__ - it's your current mean edit distance. This plot is much whimsier, than loss, but make sure it goes below 8 by 2500 steps. \n", "\n", - "If it doesn't, first try to re-create both model and opt. You may have changed it's weight too much while debugging. If that doesn't help, it's debugging time." + "If it doesn't, first, try to re-create both model and opt. You may have changed it's weight too much while debugging. If that doesn't help, it's time for debugging." ] }, { @@ -588,14 +588,14 @@ "\n", "$$ \\nabla J = E_{x \\sim p(s)} E_{y \\sim \\pi(y|x)} \\nabla log \\pi(y|x) \\cdot (R(x,y) - b(x)) $$\n", "\n", - "Here reward R(x,y) is a __negative levenshtein distance__ (since we minimize it). The baseline __b(x)__ represents how well model fares on word __x__.\n", + "Here reward R(x,y) is a __negative levenshtein distance__ (since we minimize it). The baseline __b(x)__ represents, how well model fares on word __x__.\n", "\n", - "In practice, this means that we compute baseline as a score of greedy translation, $b(x) = R(x,y_{greedy}(x)) $.\n", + "In practice, this means, that we compute the baseline as a score of greedy translation, $b(x) = R(x,y_{greedy}(x)) $.\n", "\n", "![img](https://github.com/yandexdataschool/Practical_RL/raw/master/yet_another_week/_resource/scheme.png)\n", "\n", "\n", - "Luckily, we already obtained the required outputs: `model.greedy_translations, model.greedy_mask` and we only need to compute levenshtein using `compute_levenshtein` function." + "Luckily, we've already obtained the required outputs: `model.greedy_translations, model.greedy_mask` and we only need to compute levenshtein using `compute_levenshtein` function." ] }, { @@ -644,7 +644,7 @@ " # policy gradient pseudo-loss. 
Gradient of J is exactly policy gradient.\n", " J = logp_sample * advantage[:, None]\n", "\n", - " assert J.dim() == 2, \"please return elementwise objective, don't compute mean just yet\"\n", + " assert J.dim() == 2, \"please return elementwise objective, don't compute the mean just yet\"\n", "\n", " # average with mask\n", " mask = infer_mask(sample_translations, out_voc.eos_ix)\n", @@ -726,13 +726,13 @@ "__Debugging tips:__\n", "\n", "\n", - " * As usual, don't expect improvements right away, but in general the model should be able to show some positive changes by 5k steps.\n", - " * Entropy is a good indicator of many problems. \n", + " * As usual, don't expect the improvements right away, but in general the model should be able to show some positive changes by 5k steps.\n", + " * Entropy is a good indicator of many problems: \n", " * If it reaches zero, you may need greater entropy regularizer.\n", " * If it has rapid changes time to time, you may need gradient clipping.\n", " * If it oscillates up and down in an erratic manner... it's perfectly okay for entropy to do so. But it should decrease at the end.\n", " \n", - " * We don't show loss_history cuz it's uninformative for pseudo-losses in policy gradient. However, if something goes wrong you can check it to see if everything isn't a constant zero." + " * We don't show loss_history because it's uninformative for pseudo-losses in policy gradient. However, if something goes wrong you can check it to see if everything isn't a constant zero." ] }, { @@ -822,8 +822,8 @@ "* When leaving training for nighttime, try setting REPORT_FREQ to a larger value (e.g. 500) not to waste time on it.\n", "\n", "### Attention\n", - "There's more than one way to connect decoder to encoder\n", - " * __Vanilla:__ layer_i of encoder last state goes to layer_i of decoder initial state\n", + "There's more than one way to connect decoder to encoder:\n", + " * __Vanilla:__ layer_i of encoder last state goes to layer_i of decoder initial state.\n", " * __Every tick:__ feed encoder last state _on every iteration_ of decoder.\n", " * __Attention:__ allow decoder to \"peek\" at one (or several) positions of encoded sequence on every tick.\n", " \n", @@ -831,11 +831,11 @@ "You can read more about attention [in this nice blog post](https://distill.pub/2016/augmented-rnns/). The easiest way to begin is to use \"soft\" attention with \"additive\" or \"dot-product\" intermediate layers.\n", "\n", "__Tips__\n", - "* Model usually generalizes better if you no longer allow decoder to see final encoder state\n", - "* Once your model made it through several epochs, it is a good idea to visualize attention maps to understand what your model has actually learned\n", + "* Model usually generalizes better if you no longer allow decoder to see the final encoder state.\n", + "* Once your model made it through several epochs, it is a good idea to visualize attention maps to understand what your model has actually learned.\n", "\n", "* There's more stuff [here](https://github.com/yandexdataschool/Practical_RL/blob/master/week8_scst/bonus.ipynb)\n", - "* If you opted for hard attention, we recommend [gumbel-softmax](https://blog.evjang.com/2016/11/tutorial-categorical-variational.html) instead of sampling. Also please make sure soft attention works fine before you switch to hard.\n", + "* If you opted for hard attention, we recommend [gumbel-softmax](https://blog.evjang.com/2016/11/tutorial-categorical-variational.html) instead of sampling. 
Also please make sure soft attention works fine before you switch to hard one.\n", "\n", "### UREX\n", "* This is a way to improve exploration in policy-based settings. The main idea is that you find and upweight under-appreciated actions.\n", @@ -878,5 +878,5 @@ } }, "nbformat": 4, - "nbformat_minor": 1 + "nbformat_minor": 0 } From c2acfa50734fb96ce521ffc7591ba28e477e1d63 Mon Sep 17 00:00:00 2001 From: Lionel Miller Date: Sat, 13 Feb 2021 18:49:32 +0300 Subject: [PATCH 3/3] Remove empty tags from metadata --- week1_intro/deep_crossentropy_method.ipynb | 4 +- week1_intro/primer/recap_ml.ipynb | 44 ++++++---------------- week1_intro/primer/recap_pytorch.ipynb | 22 +++-------- week2_model_based/practice_vi.ipynb | 4 +- week3_model_free/qlearning.ipynb | 7 +--- week4_approx/dqn_atari_pytorch.ipynb | 4 +- week6_outro/bandits.ipynb | 4 +- 7 files changed, 23 insertions(+), 66 deletions(-) diff --git a/week1_intro/deep_crossentropy_method.ipynb b/week1_intro/deep_crossentropy_method.ipynb index 446ddebaa..3e7cfb4c9 100644 --- a/week1_intro/deep_crossentropy_method.ipynb +++ b/week1_intro/deep_crossentropy_method.ipynb @@ -332,9 +332,7 @@ "
" ] }, - "metadata": { - "tags": [] - }, + "metadata": {}, "output_type": "display_data" } ], diff --git a/week1_intro/primer/recap_ml.ipynb b/week1_intro/primer/recap_ml.ipynb index c67c789c3..f3b253878 100644 --- a/week1_intro/primer/recap_ml.ipynb +++ b/week1_intro/primer/recap_ml.ipynb @@ -375,9 +375,7 @@ ] }, "execution_count": null, - "metadata": { - "tags": [] - }, + "metadata": {}, "output_type": "execute_result" } ], @@ -596,9 +594,7 @@ ] }, "execution_count": null, - "metadata": { - "tags": [] - }, + "metadata": {}, "output_type": "execute_result" } ], @@ -2083,9 +2079,7 @@ ] }, "execution_count": null, - "metadata": { - "tags": [] - }, + "metadata": {}, "output_type": "execute_result" } ], @@ -2168,9 +2162,7 @@ ] }, "execution_count": null, - "metadata": { - "tags": [] - }, + "metadata": {}, "output_type": "execute_result" }, { @@ -2180,9 +2172,7 @@ "" ] }, - "metadata": { - "tags": [] - }, + "metadata": {}, "output_type": "display_data" } ], @@ -2207,9 +2197,7 @@ "" ] }, - "metadata": { - "tags": [] - }, + "metadata": {}, "output_type": "display_data" } ], @@ -2232,9 +2220,7 @@ ] }, "execution_count": null, - "metadata": { - "tags": [] - }, + "metadata": {}, "output_type": "execute_result" }, { @@ -2244,9 +2230,7 @@ "" ] }, - "metadata": { - "tags": [] - }, + "metadata": {}, "output_type": "display_data" } ], @@ -2278,9 +2262,7 @@ "" ] }, - "metadata": { - "tags": [] - }, + "metadata": {}, "output_type": "display_data" }, { @@ -2292,9 +2274,7 @@ ] }, "execution_count": null, - "metadata": { - "tags": [] - }, + "metadata": {}, "output_type": "execute_result" }, { @@ -2304,9 +2284,7 @@ "" ] }, - "metadata": { - "tags": [] - }, + "metadata": {}, "output_type": "display_data" } ], diff --git a/week1_intro/primer/recap_pytorch.ipynb b/week1_intro/primer/recap_pytorch.ipynb index fb0f7c823..b5772af1a 100644 --- a/week1_intro/primer/recap_pytorch.ipynb +++ b/week1_intro/primer/recap_pytorch.ipynb @@ -256,9 +256,7 @@ ] }, "execution_count": null, - "metadata": { - "tags": [] - }, + "metadata": {}, "output_type": "execute_result" }, { @@ -269,8 +267,7 @@ ] }, "metadata": { - "needs_background": "light", - "tags": [] + "needs_background": "light" }, "output_type": "display_data" } @@ -355,8 +352,7 @@ ] }, "metadata": { - "needs_background": "light", - "tags": [] + "needs_background": "light" }, "output_type": "display_data" }, @@ -441,9 +437,7 @@ "" ] }, - "metadata": { - "tags": [] - }, + "metadata": {}, "output_type": "display_data" } ], @@ -559,9 +553,7 @@ ] }, "execution_count": null, - "metadata": { - "tags": [] - }, + "metadata": {}, "output_type": "execute_result" } ], @@ -1042,9 +1034,7 @@ "" ] }, - "metadata": { - "tags": [] - }, + "metadata": {}, "output_type": "display_data" } ], diff --git a/week2_model_based/practice_vi.ipynb b/week2_model_based/practice_vi.ipynb index 973579327..4c4a5ee64 100644 --- a/week2_model_based/practice_vi.ipynb +++ b/week2_model_based/practice_vi.ipynb @@ -352,9 +352,7 @@ "" ] }, - "metadata": { - "tags": [] - }, + "metadata": {}, "output_type": "display_data" } ], diff --git a/week3_model_free/qlearning.ipynb b/week3_model_free/qlearning.ipynb index 6b5e0d15d..ee2509357 100644 --- a/week3_model_free/qlearning.ipynb +++ b/week3_model_free/qlearning.ipynb @@ -243,9 +243,7 @@ "
" ] }, - "metadata": { - "tags": [] - }, + "metadata": {}, "output_type": "display_data" } ], @@ -355,8 +353,7 @@ ] }, "metadata": { - "needs_background": "light", - "tags": [] + "needs_background": "light" }, "output_type": "display_data" } diff --git a/week4_approx/dqn_atari_pytorch.ipynb b/week4_approx/dqn_atari_pytorch.ipynb index cf5305745..1d65241c1 100644 --- a/week4_approx/dqn_atari_pytorch.ipynb +++ b/week4_approx/dqn_atari_pytorch.ipynb @@ -122,9 +122,7 @@ "
" ] }, - "metadata": { - "tags": [] - }, + "metadata": {}, "output_type": "display_data" } ], diff --git a/week6_outro/bandits.ipynb b/week6_outro/bandits.ipynb index aff24bff3..020192dad 100644 --- a/week6_outro/bandits.ipynb +++ b/week6_outro/bandits.ipynb @@ -320,9 +320,7 @@ "" ] }, - "metadata": { - "tags": [] - }, + "metadata": {}, "output_type": "display_data" } ],