diff --git a/Dockerfile b/Dockerfile index fe5977f15..158e80631 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,67 +1,46 @@ -FROM andrewosh/binder-base -MAINTAINER Alexander Panin -USER root +FROM python:3.7-slim +# install the notebook package +RUN pip install --no-cache --upgrade pip && \ + pip install --no-cache notebook -RUN echo "deb http://archive.ubuntu.com/ubuntu trusty-backports main restricted universe multiverse" >> /etc/apt/sources.list RUN apt-get -qq update - -RUN apt-get install -y gcc-4.9 g++-4.9 libstdc++6 wget unzip +# RUN apt-get install -y gcc-4.9 g++-4.9 libstdc++6 wget unzip +RUN apt-get install -y gcc g++ libstdc++6 wget curl unzip git RUN apt-get install -y libopenblas-dev liblapack-dev libsdl2-dev libboost-all-dev graphviz RUN apt-get install -y cmake zlib1g-dev libjpeg-dev RUN apt-get install -y xvfb libav-tools xorg-dev python-opengl python3-opengl RUN apt-get -y install swig3.0 RUN ln -s /usr/bin/swig3.0 /usr/bin/swig - -USER main RUN pip install --upgrade pip==9.0.3 RUN pip install --upgrade --ignore-installed setuptools #fix https://github.com/tensorflow/tensorflow/issues/622 -RUN pip install --upgrade sklearn tqdm nltk editdistance joblib graphviz +RUN pip install --upgrade sklearn tqdm nltk editdistance joblib graphviz pandas matplotlib # install all gym stuff except mujoco - it fails at "import importlib.util" (no module named util) RUN pip install --upgrade gym RUN pip install --upgrade gym[atari] RUN pip install --upgrade gym[box2d] -RUN pip install --upgrade http://download.pytorch.org/whl/cu80/torch-0.3.0.post4-cp27-cp27mu-linux_x86_64.whl +RUN pip install --upgrade https://download.pytorch.org/whl/cpu/torch-1.0.1.post2-cp37-cp37m-linux_x86_64.whl RUN pip install --upgrade torchvision RUN pip install --upgrade keras RUN pip install --upgrade https://github.com/Theano/Theano/archive/master.zip RUN pip install --upgrade https://github.com/Lasagne/Lasagne/archive/master.zip RUN pip install --upgrade 
https://github.com/yandexdataschool/AgentNet/archive/master.zip RUN pip install gym_pull -RUN pip install ppaquette-gym-doom - - - - -RUN /home/main/anaconda/envs/python3/bin/pip install --upgrade pip==9.0.3 - -# fix https://github.com/tensorflow/tensorflow/issues/622 -RUN /home/main/anaconda/envs/python3/bin/pip install --upgrade --ignore-installed setuptools - -# python3: fix `GLIBCXX_3.4.20' not found - conda's libgcc blocked system's gcc-4.9 and libstdc++6 -RUN bash -c "conda update -y conda && source activate python3 && conda uninstall -y libgcc && source deactivate" -RUN /home/main/anaconda/envs/python3/bin/pip install --upgrade matplotlib numpy scipy pandas graphviz - -RUN /home/main/anaconda/envs/python3/bin/pip install --upgrade sklearn tqdm nltk editdistance joblib -RUN /home/main/anaconda/envs/python3/bin/pip install --upgrade --ignore-installed setuptools #fix https://github.com/tensorflow/tensorflow/issues/622 - -# install all gym stuff except mujoco - it fails at "mjmodel.h: no such file or directory" -RUN /home/main/anaconda/envs/python3/bin/pip install --upgrade gym -RUN /home/main/anaconda/envs/python3/bin/pip install --upgrade gym[atari] -RUN /home/main/anaconda/envs/python3/bin/pip install --upgrade gym[box2d] - - - -RUN /home/main/anaconda/envs/python3/bin/pip install --upgrade http://download.pytorch.org/whl/cu80/torch-0.3.0.post4-cp35-cp35m-linux_x86_64.whl -RUN /home/main/anaconda/envs/python3/bin/pip install --upgrade torchvision -RUN /home/main/anaconda/envs/python3/bin/pip install --upgrade keras -RUN /home/main/anaconda/envs/python3/bin/pip install --upgrade https://github.com/Theano/Theano/archive/master.zip -RUN /home/main/anaconda/envs/python3/bin/pip install --upgrade https://github.com/Lasagne/Lasagne/archive/master.zip -RUN /home/main/anaconda/envs/python3/bin/pip install --upgrade https://github.com/yandexdataschool/AgentNet/archive/master.zip - -#install TF after everything else not to break python3's pyglet with python2's 
tensorflow -RUN pip install --upgrade tensorflow==1.4.0 -RUN /home/main/anaconda/envs/python3/bin/pip install --upgrade tensorflow==1.4.0 -#TODO py3 doom once it's no longer broken +# RUN pip install ppaquette-gym-doom + +# create user with a home directory +ARG NB_USER +ARG NB_UID +ENV USER ${NB_USER} +ENV HOME /home/${NB_USER} + +RUN adduser --disabled-password \ + --gecos "Default user" \ + --uid ${NB_UID} \ + ${NB_USER} +WORKDIR ${HOME} +USER ${USER} + +RUN cd ${HOME} && git clone https://github.com/yandexdataschool/Practical_RL diff --git a/README.md b/README.md index 8f8ed278f..5ba841f28 100644 --- a/README.md +++ b/README.md @@ -1,29 +1,33 @@ -# Practical_RL -** Announce - new HSE track will start in late january, YSDA soon after. Tons of changes incoming. We'll also fix all the issues :) ** -A course on reinforcement learning in the wild. +# Practical_RL [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/yandexdataschool/practical_rl/spring19) +An open course on reinforcement learning in the wild. Taught on-campus at [HSE](https://cs.hse.ru) and [YSDA](https://yandexdataschool.com/) and maintained to be friendly to online students (both english and russian). +__Note:__ this branch is an on-campus version of the course for __spring 2019 YSDA and HSE students__. For full course materials, switch to the [master branch](https://github.com/yandexdataschool/Practical_RL/tree/master). + #### Manifesto: * __Optimize for the curious.__ For all the materials that aren’t covered in detail there are links to more information and related materials (D.Silver/Sutton/blogs/whatever). Assignments will have bonus sections if you want to dig deeper. * __Practicality first.__ Everything essential to solving reinforcement learning problems is worth mentioning. We won't shun away from covering tricks and heuristics. For every major idea there should be a lab that makes you to “feel” it on a practical problem. 
* __Git-course.__ Know a way to make the course better? Noticed a typo in a formula? Found a useful link? Made the code more readable? Made a version for alternative framework? You're awesome! [Pull-request](https://help.github.com/articles/about-pull-requests/) it! +[![Github contributors](https://img.shields.io/github/contributors/yandexdataschool/Practical_RL.svg?logo=github&logoColor=white)](https://github.com/yandexdataschool/Practical_RL/graphs/contributors) + # Course info -* Lecture slides are [here](https://yadi.sk/d/loPpY45J3EAYfU). -* Telegram chat room for YSDA & HSE students is [here](https://t.me/rlspring18) -* Grading rules for YSDA & HSE students is [here](https://github.com/yandexdataschool/Practical_RL/wiki/Homeworks-and-grading) -* Online student __[survival guide](https://github.com/yandexdataschool/Practical_RL/wiki/Online-student's-survival-guide)__ -* Installing the libraries - [guide and issues thread](https://github.com/yandexdataschool/Practical_RL/issues/1) -* Magical button that launches you into course environment: - * [![Binder](https://mybinder.org/badge.svg)](https://mybinder.org/v2/gh/yandexdataschool/Practical_RL/master) - comes with all libraries pre-installed. May be down time to time. - * If it's down, try [__google colab__](https://colab.research.google.com/) or [__azure notebooks__](http://notebooks.azure.com/). Those last longer, but they will require you to run installer commands (see ./Dockerfile). -* Anonymous [feedback form](https://docs.google.com/forms/d/e/1FAIpQLSdurWw97Sm9xCyYwC8g3iB5EibITnoPJW2IkOVQYE_kcXPh6Q/viewform) for everything that didn't go through e-mail. 
-* [About the course](https://github.com/yandexdataschool/Practical_RL/wiki/Practical-RL) +* __Chat room__ for YSDA & HSE students is [here](https://t.me/joinchat/CDFcMVcoAQvEiI9WAo1pEQ) +* __Grading__ rules for YSDA & HSE students are [here](https://github.com/yandexdataschool/Practical_RL/wiki/Homeworks-and-grading) + +* __FAQ:__ [About the course](https://github.com/yandexdataschool/Practical_RL/wiki/Practical-RL), [Technical issues thread](https://github.com/yandexdataschool/Practical_RL/issues/1), [Lecture Slides](https://yadi.sk/d/loPpY45J3EAYfU), [Online Student Survival Guide](https://github.com/yandexdataschool/Practical_RL/wiki/Online-student's-survival-guide) + +* Anonymous [feedback form](https://docs.google.com/forms/d/e/1FAIpQLSdurWw97Sm9xCyYwC8g3iB5EibITnoPJW2IkOVQYE_kcXPh6Q/viewform). + +* Virtual course environment: + * [Installing dependencies](https://github.com/yandexdataschool/Practical_RL/issues/1) on your local machine (recommended). + * [__google colab__](https://colab.research.google.com/) - set open -> github -> yandexdataschool/practical_rl -> {branch name} and select any notebook you want. + * Alternatives: [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/yandexdataschool/practical_rl/spring19) and [Azure Notebooks](https://notebooks.azure.com/). + # Additional materials -* A large list of RL materials - [awesome rl](https://github.com/aikorea/awesome-rl) * [RL reading group](https://github.com/yandexdataschool/Practical_RL/wiki/RL-reading-group) @@ -31,75 +35,43 @@ Taught on-campus at [HSE](https://cs.hse.ru) and [YSDA](https://yandexdataschool The syllabus is approximate: the lectures may occur in a slightly different order and some topics may end up taking two weeks. -* [__week1__](https://github.com/yandexdataschool/Practical_RL/tree/master/week1_intro) RL as blackbox optimization +* [__week01_intro__](./week01_intro) Introduction * Lecture: RL problems around us. Decision processes. 
Stochastic optimization, Crossentropy method. Parameter space search vs action space search. * Seminar: Welcome into openai gym. Tabular CEM for Taxi-v0, deep CEM for box2d environments. * Homework description - see week1/README.md. - * **YSDA Deadline: 2018.02.26 23.59** - * **HSE Deadline: 2018.01.28 23:59** - -* [__week2__](https://github.com/yandexdataschool/Practical_RL/tree/master/week2_value_based) Value-based methods + +* [__week02_value_based__](./week02_value_based) Value-based methods * Lecture: Discounted reward MDP. Value-based approach. Value iteration. Policy iteration. Discounted reward fails. * Seminar: Value iteration. * Homework description - see week2/README.md. - * **HSE Deadline: 2018.02.11 23:59** - * **YSDA Deadline: part1 2018.03.05 23.59, part2 2018.03.12 23.59** - -* [__week3__](https://github.com/yandexdataschool/Practical_RL/tree/master/week3_model_free) Model-free reinforcement learning +* [__week03_model_free__](./week03_model_free) Model-free reinforcement learning * Lecture: Q-learning. SARSA. Off-policy Vs on-policy algorithms. N-step algorithms. TD(Lambda). * Seminar: Qlearning Vs SARSA Vs Expected Value SARSA * Homework description - see week3/README.md. - * **HSE Deadline: 2018.02.15 23:59** - * **YSDA Deadline: 2018.03.12 23.59** - -* [__week4_recap__](https://github.com/yandexdataschool/Practical_RL/tree/master/week4_%5Brecap%5D_deep_learning) - deep learning recap - * Lecture: Deep learning 101 - * Seminar: Simple image classification with convnets - -* [__week4__](https://github.com/yandexdataschool/Practical_RL/tree/master/week4_approx_rl) Approximate reinforcement learning - * Lecture: Infinite/continuous state space. Value function approximation. Convergence conditions. Multiple agents trick; experience replay, target networks, double/dueling/bootstrap DQN, etc. - * Seminar: Approximate Q-learning with experience replay. 
(CartPole, Atari) - * **HSE Deadline: 2018.03.04 23:30** - * **YSDA Deadline: 2018.03.20 23.30** - -* [__week5__](https://github.com/yandexdataschool/Practical_RL/tree/master/week5_explore) Exploration in reinforcement learning - * Lecture: Contextual bandits. Thompson Sampling, UCB, bayesian UCB. Exploration in model-based RL, MCTS. "Deep" heuristics for exploration. - * Seminar: bayesian exploration for contextual bandits. UCB for MCTS. - - * **YSDA Deadline: 2018.03.30 23.30** - -* [__week6__](https://github.com/yandexdataschool/Practical_RL/tree/master/week6_policy_based) Policy gradient methods I - * Lecture: Motivation for policy-based, policy gradient, logderivative trick, REINFORCE/crossentropy method, variance reduction(baseline), advantage actor-critic (incl. GAE) - * Seminar: REINFORCE, advantage actor-critic - -* [__week7_recap__](https://github.com/yandexdataschool/Practical_RL/tree/master/week7_%5Brecap%5D_rnn) Recurrent neural networks recap - * Lecture: Problems with sequential data. Recurrent neural netowks. Backprop through time. Vanishing & exploding gradients. LSTM, GRU. Gradient clipping - * Seminar: character-level RNN language model -* [__week7__](https://github.com/yandexdataschool/Practical_RL/tree/master/week7_pomdp) Partially observable MDPs - * Lecture: POMDP intro. POMDP learning (agents with memory). POMDP planning (POMCP, etc) - * Seminar: Deep kung-fu & doom with recurrent A3C and DRQN - -* [__week8__](https://github.com/yandexdataschool/Practical_RL/tree/master/week8_scst) Applications II - * Lecture: Reinforcement Learning as a general way to optimize non-differentiable loss. G2P, machine translation, conversation models, image captioning, discrete GANs. Self-critical sequence training. 
- * Seminar: Simple neural machine translation with self-critical sequence training +* __week04__ Approximate (deep) RL +* __week05__ Exploration +* __week06__ Policy Gradient methods +* __week07__ Applications I +* __week{++i}__ Partially Observed MDP +* __week{++i}__ Advanced policy-based methods +* __week{++i}__ Applications II +* __week{++i}__ Distributional reinforcement learning +* __week{++i}__ Inverse RL and Imitation Learning -* [__week9__](https://github.com/yandexdataschool/Practical_RL/tree/master/week9_policy_II) Policy gradient methods II - * Lecture: Trust region policy optimization. NPO/PPO. Deterministic policy gradient. DDPG. Bonus: DPG for discrete action spaces. - * Seminar: Approximate TRPO for simple robotic tasks. - -* [Some after-course bonus materials](https://github.com/yandexdataschool/Practical_RL/tree/master/yet_another_week) - # Course staff Course materials and teaching by: _[unordered]_ - [Pavel Shvechikov](https://github.com/bestxolodec) - lectures, seminars, hw checkups, reading group -- [Oleg Vasilev](https://github.com/Omrigan) - seminars, hw checkups, technical support -- [Alexander Fritsler](https://github.com/Fritz449) - lectures, seminars, hw checkups - [Nikita Putintsev](https://github.com/qwasser) - seminars, hw checkups, organizing our hot mess -- [Fedor Ratnikov](https://github.com/justheuristic/) - lectures, seminars, hw checkups -- [Alexey Umnov](https://github.com/alexeyum) - seminars, hw checkups +- [Alexander Fritsler](https://github.com/Fritz449) - lectures, seminars, hw checkups +- [Oleg Vasilev](https://github.com/Omrigan) - seminars, hw checkups, technical support +- [Dmitry Nikulin](https://github.com/pastafarianist) - tons of fixes, far and wide +- [Mikhail Konobeev](https://github.com/MichaelKonobeev) - seminars, hw checkups +- [Ivan Kharitonov](https://github.com/neer201) - seminars, hw checkups +- [Ravil Khisamov](https://github.com/zshrav) - seminars, hw checkups +- [Fedor 
Ratnikov](https://github.com/justheuristic) - admin stuff # Contributions * Using pictures from [Berkeley AI course](http://ai.berkeley.edu/home.html) @@ -107,4 +79,5 @@ Course materials and teaching by: _[unordered]_ * Several tensorflow assignments by [Scitator](https://github.com/Scitator) * A lot of fixes from [arogozhnikov](https://github.com/arogozhnikov) * Other awesome people: see github [contributors](https://github.com/yandexdataschool/Practical_RL/graphs/contributors) +* [Alexey Umnov](https://github.com/alexeyum) helped us a lot during spring2018 diff --git a/docker/Dockerfile b/docker/Dockerfile index ebed46e56..ee883dd6e 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -41,7 +41,7 @@ RUN pip install --upgrade pip==9.0.3 && \ https://github.com/Lasagne/Lasagne/archive/master.zip \ https://github.com/yandexdataschool/AgentNet/archive/master.zip \ tensorflow \ - http://download.pytorch.org/whl/cpu/torch-0.4.1-cp27-cp27mu-linux_x86_64.whl \ + https://download.pytorch.org/whl/cpu/torch-1.0.1.post2-cp27-cp27mu-linux_x86_64.whl \ torchvision \ keras @@ -60,7 +60,7 @@ RUN pip3 install --upgrade pip==9.0.3 && \ pip3 install --upgrade https://github.com/Theano/Theano/archive/master.zip \ https://github.com/Lasagne/Lasagne/archive/master.zip \ https://github.com/yandexdataschool/AgentNet/archive/master.zip \ - http://download.pytorch.org/whl/cpu/torch-0.4.1-cp35-cp35m-linux_x86_64.whl \ + https://download.pytorch.org/whl/cpu/torch-1.0.1.post2-cp35-cp35m-linux_x86_64.whl \ torchvision \ tensorflow \ keras && \ diff --git a/setup_colab.sh b/setup_colab.sh new file mode 100644 index 000000000..01340b7ea --- /dev/null +++ b/setup_colab.sh @@ -0,0 +1,8 @@ +#!/bin/bash +# a setup script for google colab. 
Will be updated +pip install gym +apt-get install -y xvfb +wget https://raw.githubusercontent.com/yandexdataschool/Practical_DL/fall18/xvfb -O ../xvfb +apt-get install -y python-opengl ffmpeg +pip install pyglet==1.2.4 + diff --git a/week1_intro/README.md b/week01_intro/README.md similarity index 75% rename from week1_intro/README.md rename to week01_intro/README.md index 423e6e70c..054ad0b99 100644 --- a/week1_intro/README.md +++ b/week01_intro/README.md @@ -1,5 +1,5 @@ ## Materials: -* [__Lecture slides__](https://yadi.sk/i/sbc0ZCKx3RRGbW) +* [__Lecture slides__](https://yadi.sk/i/-EUHXUXOTC5t9Q) * __Russian:__ * Intro to RL - [video](https://yadi.sk/i/bMo0qa-x3DoqkS) * Blackbox optimization - [video](https://yadi.sk/i/5yf_4oGI3EDJhJ) @@ -13,6 +13,7 @@ ## More materials: * __[recommended]__ - awesome openai post about evolution strategies - [blog post](https://blog.openai.com/evolution-strategies/), [article](https://arxiv.org/abs/1703.03864) +* __[recommended]__ - formal explanation of crossentropy method in [general](https://people.smp.uq.edu.au/DirkKroese/ps/CEEncycl.pdf) and for [optimization](https://people.smp.uq.edu.au/DirkKroese/ps/CEopt.pdf) * Deep learning course (if you want to learn in parallel) - https://github.com/yandexdataschool/HSE_deeplearning * Video on genetic algorithms (english) - [video](https://www.youtube.com/watch?v=ejxfTy4lI6I) * Another guide to genetic algorithm (english) - [video](https://www.youtube.com/watch?v=zwYV11a__HQ) @@ -21,9 +22,10 @@ * Longer video on Ant Colony Algorithm (english) - [video](https://www.youtube.com/watch?v=xpyKmjJuqhk) -## Homework description +## Practice assignment +Instant dive in: [__seminar_gym_interface__](https://colab.research.google.com/github/yandexdataschool/Practical_RL/blob/spring19/week01_intro/seminar_gym_interface.ipynb), [__crossentropy_method__](https://colab.research.google.com/github/yandexdataschool/Practical_RL/blob/spring19/week01_intro/crossentropy_method.ipynb) + * Open 
`gym_interface.ipynb` and follow instructions from there - * If you haven't installed everything yet, try [![Binder](https://mybinder.org/badge.svg)](https://mybinder.org/v2/gh/yandexdataschool/Practical_RL/master) * After you're done there, proceed to `crossentropy_method.ipynb` * You can find homework and bonus assignment descriptions at the end of that notebook. * Note: so far it's enough to say `pip install gym` on top of any data-science-stuffed python, but we'd appreciate if you gradually switch to [full installation](https://github.com/openai/gym#installing-everything). diff --git a/week1_intro/crossentropy_method.ipynb b/week01_intro/crossentropy_method.ipynb similarity index 85% rename from week1_intro/crossentropy_method.ipynb rename to week01_intro/crossentropy_method.ipynb index bb26b2be0..031b7e6e7 100644 --- a/week1_intro/crossentropy_method.ipynb +++ b/week01_intro/crossentropy_method.ipynb @@ -12,11 +12,12 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ + "# In Google Colab, uncomment this:\n", + "# !wget https://bit.ly/2FMJP5K -O setup.py && bash setup.py\n", + "\n", "# XVFB will be launched if you run on a server\n", "import os\n", "if type(os.environ.get(\"DISPLAY\")) is not str or len(os.environ.get(\"DISPLAY\")) == 0:\n", @@ -27,9 +28,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "import gym\n", @@ -44,9 +43,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "n_states = env.observation_space.n\n", @@ -73,20 +70,16 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ - "policy = " + "policy = " ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + 
"metadata": {}, "outputs": [], "source": [ "assert type(policy) in (np.ndarray, np.matrix)\n", @@ -106,9 +99,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "def generate_session(policy, t_max=10**4):\n", @@ -142,9 +133,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "s, a, r = generate_session(policy)\n", @@ -156,9 +145,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "# let's see the initial reward distribution\n", @@ -168,10 +155,8 @@ "sample_rewards = [generate_session(policy, t_max=1000)[-1] for _ in range(200)]\n", "\n", "plt.hist(sample_rewards, bins=20)\n", - "plt.vlines([np.percentile(sample_rewards, 50)], [0], [\n", - " 100], label=\"50'th percentile\", color='green')\n", - "plt.vlines([np.percentile(sample_rewards, 90)], [0], [\n", - " 100], label=\"90'th percentile\", color='red')\n", + "plt.vlines([np.percentile(sample_rewards, 50)], [0], [100], label=\"50'th percentile\", color='green')\n", + "plt.vlines([np.percentile(sample_rewards, 90)], [0], [100], label=\"90'th percentile\", color='red')\n", "plt.legend()" ] }, @@ -185,9 +170,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "def select_elites(states_batch, actions_batch, rewards_batch, percentile=50):\n", @@ -195,14 +178,15 @@ " Select states and actions from games that have rewards >= percentile\n", " :param states_batch: list of lists of states, states_batch[session_i][t]\n", " :param actions_batch: list of lists of actions, actions_batch[session_i][t]\n", - " :param rewards_batch: list of rewards, rewards_batch[session_i][t]\n", + " :param rewards_batch: list of rewards, rewards_batch[session_i]\n", "\n", " :returns: 
elite_states,elite_actions, both 1D lists of states and respective actions from elite sessions\n", "\n", " Please return elite states and actions in their original order \n", " [i.e. sorted by session number and timestep within session]\n", "\n", - " If you're confused, see examples below. Please don't assume that states are integers (they'll get different later).\n", + " If you are confused, see examples below. Please don't assume that states are integers\n", + " (they will become different later).\n", " \"\"\"\n", "\n", " reward_threshold = \n", @@ -216,21 +200,19 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "states_batch = [\n", - " [1, 2, 3], # game1\n", + " [1, 2, 3], # game1\n", " [4, 2, 0, 2], # game2\n", - " [3, 1] # game3\n", + " [3, 1], # game3\n", "]\n", "\n", "actions_batch = [\n", - " [0, 2, 4], # game1\n", + " [0, 2, 4], # game1\n", " [3, 2, 0, 1], # game2\n", - " [3, 3] # game3\n", + " [3, 3], # game3\n", "]\n", "rewards_batch = [\n", " 3, # game1\n", @@ -265,9 +247,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "def update_policy(elite_states, elite_actions):\n", @@ -296,15 +276,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ - "\n", - "elite_states, elite_actions = ([1, 2, 3, 4, 2, 0, 2, 3, 1], [\n", - " 0, 2, 4, 3, 2, 0, 1, 3, 3])\n", - "\n", + "elite_states = [1, 2, 3, 4, 2, 0, 2, 3, 1]\n", + "elite_actions = [0, 2, 4, 3, 2, 0, 1, 3, 3]\n", "\n", "new_policy = update_policy(elite_states, elite_actions)\n", "\n", @@ -334,15 +310,12 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "from IPython.display import clear_output\n", "\n", - "\n", - "def show_progress(rewards_batch, log, 
reward_range=[-990, +10]):\n", + "def show_progress(rewards_batch, log, percentile, reward_range=[-990, +10]):\n", " \"\"\"\n", " A convenience function that displays training progress. \n", " No cool math here, just charts.\n", @@ -374,21 +347,17 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "# reset policy just in case\n", - "policy = np.ones([n_states, n_actions])/n_actions" + "policy = np.ones([n_states, n_actions]) / n_actions" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "n_sessions = 250 # sample this many sessions\n", @@ -410,7 +379,7 @@ " policy = learning_rate*new_policy + (1-learning_rate)*policy\n", "\n", " # display results on chart\n", - " show_progress(rewards_batch, log)" + " show_progress(rewards_batch, log, percentile)" ] }, { @@ -419,7 +388,7 @@ "source": [ "# Digging deeper: approximate crossentropy with neural nets\n", "\n", - "![img](https://casd35.wikispaces.com/file/view/digging_deeper_final.jpg/359658499/503x260/digging_deeper_final.jpg)\n", + "![img](https://tip.duke.edu/independent_learning/greek/lesson/digging_deeper_final.jpg)\n", "\n", "In this section we will train a neural network policy for continuous state space game" ] @@ -427,9 +396,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "# if you see \" has no attribute .env\", remove .env or update gym\n", @@ -444,18 +411,17 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "# create agent\n", "from sklearn.neural_network import MLPClassifier\n", - "agent = MLPClassifier(hidden_layer_sizes=(20, 20),\n", - " activation='tanh',\n", - " warm_start=True, # keep progress between .fit(...) 
calls\n", - " max_iter=1 # make only 1 iteration on each .fit(...)\n", - " )\n", + "agent = MLPClassifier(\n", + " hidden_layer_sizes=(20, 20),\n", + " activation='tanh',\n", + " warm_start=True, # keep progress between .fit(...) calls\n", + " max_iter=1, # make only 1 iteration on each .fit(...)\n", + ")\n", "# initialize agent to the dimension of state an amount of actions\n", "agent.fit([env.reset()]*n_actions, range(n_actions))" ] @@ -463,9 +429,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "def generate_session(t_max=1000):\n", @@ -498,9 +462,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "n_sessions = 100\n", @@ -513,11 +475,11 @@ "\n", " states_batch, actions_batch, rewards_batch = map(np.array, zip(*sessions))\n", "\n", - " elite_states, elite_actions = \n", "\n", - " \n", + " \n", "\n", - " show_progress(rewards_batch, log, reward_range=[0, np.max(rewards_batch)])\n", + " show_progress(rewards_batch, log, percentile, reward_range=[0, np.max(rewards_batch)])\n", "\n", " if np.mean(rewards_batch) > 190:\n", " print(\"You Win! 
You may stop training now via KeyboardInterrupt.\")" @@ -533,9 +495,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "# record sessions\n", @@ -543,17 +503,13 @@ "env = gym.wrappers.Monitor(gym.make(\"CartPole-v0\"),\n", " directory=\"videos\", force=True)\n", "sessions = [generate_session() for _ in range(100)]\n", - "env.close()\n", - "# upload to gym\n", - "# gym.upload(\"./videos/\",api_key=\"\") #you'll need me later" + "env.close()" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "# show video\n", @@ -581,7 +537,7 @@ "You may have noticed that the taxi problem quickly converges from -100 to a near-optimal score and then descends back into -50/-100. This is in part because the environment has some innate randomness. Namely, the starting points of passenger/driver change from episode to episode.\n", "\n", "### Tasks\n", - "- __1.1__ (1 pts) Find out how the algorithm performance changes if you change different percentile and different n_samples.\n", + "- __1.1__ (1 pts) Find out how the algorithm performance changes if you use a different `percentile` and/or `n_sessions`.\n", "- __1.2__ (2 pts) Tune the algorithm to end up with positive average score.\n", "\n", "It's okay to modify the existing code.\n" @@ -602,7 +558,7 @@ "\n", "### Deep crossentropy method\n", "\n", - "By this moment you should have got enough score on [CartPole-v0](https://gym.openai.com/envs/CartPole-v0) to consider it solved (see the link). It's time to upload the result and get to something harder.\n", + "By this moment you should have got enough score on [CartPole-v0](https://gym.openai.com/envs/CartPole-v0) to consider it solved (see the link). 
It's time to try something harder.\n", "\n", "* if you have any trouble with CartPole-v0 and feel stuck, feel free to ask us or your peers for help.\n", "\n", @@ -611,7 +567,6 @@ "* __2.1__ (3 pts) Pick one of environments: MountainCar-v0 or LunarLander-v2.\n", " * For MountainCar, get average reward of __at least -150__\n", " * For LunarLander, get average reward of __at least +50__\n", - " * For any environment, upload it to gym and post url in your anytask form.\n", "\n", "See the tips section below, it's kinda important.\n", "__Note:__ If your agent is below the target score, you'll still get most of the points depending on the result, so don't be afraid to submit it.\n", @@ -625,15 +580,38 @@ " \n", " \n", "### Tips\n", - "* Gym page: [mountaincar](https://gym.openai.com/envs/MountainCar-v0), [lunarlander](https://gym.openai.com/envs/LunarLander-v2)\n", + "* Gym page: [MountainCar](https://gym.openai.com/envs/MountainCar-v0), [LunarLander](https://gym.openai.com/envs/LunarLander-v2)\n", "* Sessions for MountainCar may last for 10k+ ticks. Make sure ```t_max``` param is at least 10k.\n", " * Also it may be a good idea to cut rewards via \">\" and not \">=\". If 90% of your sessions get reward of -10k and 20% are better, than if you use percentile 20% as threshold, R >= threshold __fails cut off bad sessions__ whule R > threshold works alright.\n", "* _issue with gym_: Some versions of gym limit game time by 200 ticks. This will prevent cem training in most cases. Make sure your agent is able to play for the specified __t_max__, and if it isn't, try `env = gym.make(\"MountainCar-v0\").env` or otherwise get rid of TimeLimit wrapper.\n", "* If you use old _swig_ lib for LunarLander-v2, you may get an error. See this [issue](https://github.com/openai/gym/issues/100) for solution.\n", "* If it won't train it's a good idea to plot reward distribution and record sessions: they may give you some clue. 
If they don't, call course staff :)\n", "* 20-neuron network is probably not enough, feel free to experiment.\n", - "* __Please upload the results to openai gym and send links to all submissions in the e-mail__\n", "\n", + "You may find the following snippet useful:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def visualize_mountain_car(env, agent):\n", + " xs = np.linspace(env.min_position, env.max_position, 100)\n", + " vs = np.linspace(-env.max_speed, env.max_speed, 100)\n", + " grid = np.dstack(np.meshgrid(xs, vs)).transpose(1, 0, 2)\n", + " grid_flat = grid.reshape(len(xs) * len(vs), 2)\n", + " probs = agent.predict_proba(grid_flat).reshape(len(xs), len(vs), 3)\n", + " return probs\n", + "\n", + "plt.imshow(visualize_mountain_car(env, agent))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ "### Bonus tasks\n", "\n", "* __2.3 bonus__ Try to find a network architecture and training params that solve __both__ environments above (_Points depend on implementation. 
If you attempted this task, please mention it in anytask submission._)\n", @@ -650,22 +628,9 @@ } ], "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.0" + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/week1_intro/pong.py b/week01_intro/pong.py similarity index 100% rename from week1_intro/pong.py rename to week01_intro/pong.py diff --git a/week1_intro/primer_python_for_ml/recap_ml.ipynb b/week01_intro/primer_python_for_ml/recap_ml.ipynb similarity index 99% rename from week1_intro/primer_python_for_ml/recap_ml.ipynb rename to week01_intro/primer_python_for_ml/recap_ml.ipynb index b8b904f89..3726593e0 100644 --- a/week1_intro/primer_python_for_ml/recap_ml.ipynb +++ b/week01_intro/primer_python_for_ml/recap_ml.ipynb @@ -849,7 +849,7 @@ }, "outputs": [], "source": [ - "% % time", + "%%time", "\n", "# ^-- this \"magic\" measures and prints cell computation time", "\n", @@ -888,7 +888,7 @@ }, "outputs": [], "source": [ - "% % time", + "%% time", "\n", "\n", "# Option II: start from python, convert to numpy", @@ -919,7 +919,7 @@ }, "outputs": [], "source": [ - "% % time", + "%% time", "\n", "\n", "# Option III: pure numpy", @@ -2653,4 +2653,4 @@ }, "nbformat": 4, "nbformat_minor": 1 -} \ No newline at end of file +} diff --git a/week1_intro/primer_python_for_ml/train.csv b/week01_intro/primer_python_for_ml/train.csv similarity index 100% rename from week1_intro/primer_python_for_ml/train.csv rename to week01_intro/primer_python_for_ml/train.csv diff --git a/week01_intro/project_starter_evolution_strategies.ipynb b/week01_intro/project_starter_evolution_strategies.ipynb new file mode 100644 index 000000000..9b1bce4df --- /dev/null +++ 
b/week01_intro/project_starter_evolution_strategies.ipynb @@ -0,0 +1,165 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Project :: Evolution Strategies\n", + "\n", + "![img](https://t4.ftcdn.net/jpg/00/17/46/81/240_F_17468143_wY3hsHyfNYoMdG9BlC56HI4JA7pNu63h.jpg)\n", + "\n", + "Remember the idea behind Evolution Strategies? Here's a neat [blog post](https://blog.openai.com/evolution-strategies/) about 'em.\n", + "\n", + "Can you reproduce their success? You will have to implement evolutionary strategies and see how they work.\n", + "\n", + "This project is optional; it has several milestones, each worth a number of points [and swag].\n", + "\n", + "__Milestones:__\n", + "* [10pts] Basic prototype of evolutionary strategies that works in one thread on CartPole\n", + "* [+5pts] Modify the code to make them work in parallel\n", + "* [+5pts] if you can run ES distributedly on at least two PCs\n", + "* [+10pts] Apply ES to play Atari Pong at least better than random\n", + "* [++] Additional points for all kinds of cool stuff besides milestones\n", + "\n", + "__Rules:__\n", + "\n", + "* This is __not a mandatory assignment__, but it's a way to learn some cool things if you're getting bored with default assignments.\n", + "* Once you have decided to take on this project, please tell any of the course staff members so that we can help you if you get stuck.\n", + "* There's a default implementation of ES in this [openai repo](https://github.com/openai/evolution-strategies-starter). It's okay to look there if you get stuck or want to compare your solutions, but each copy-pasted chunk of code should be understood thoroughly. We'll test that with questions."
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Tips on implementation\n", + "\n", + "* It would be very convenient later if you implemented a function that takes policy weights, generates a session and returns policy changes -- so that you could then run a bunch of them in parallel.\n", + "\n", + "* The simplest way you can do multiprocessing is to use [joblib](https://joblib.readthedocs.io/)\n", + "\n", + "* For joblib, make sure random variables are independent in each job. Simply add `np.random.seed()` at the beginning of your \"job\" function.\n", + "\n", + "Later, once you go distributed, you may need storage that gathers gradients from all workers. In that case we recommend [Redis](https://redis.io/) due to its simplicity.\n", + "\n", + "Here's a speed-optimized saver/loader to store numpy arrays in Redis as strings.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import joblib\n", + "from six import BytesIO\n", + "\n", + "\n", + "def dumps(data):\n", + " \"\"\"converts whatever to string\"\"\"\n", + " s = BytesIO()\n", + " joblib.dump(data, s)\n", + " return s.getvalue()\n", + "\n", + "\n", + "def loads(self, string):\n", + " \"\"\"converts string to whatever was dumps'ed in it\"\"\"\n", + " return joblib.load(BytesIO(string))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Tips on atari games\n", + "* There's all the pre-processing and tuning done for you in the code below\n", + " * Images rescaled to 42x42 to speed up computation\n", + " * We use last 4 frames as observations to account for ball velocity\n", + " * The code below requires ```pip install Image``` and ```pip install gym[atari]``` \n", + " * You may also need some dependencies for gym[atari] - google \"gym install all\" dependencies or use our pre-built environment.\n", + "* The recommended agent
architecture is a convolutional neural network. Dense network will also do.\n", + "\n", + "\n", + "May the force be with you!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pong import make_pong\n", + "import numpy as np\n", + "\n", + "env = make_pong()\n", + "print(env.action_space)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# get the initial state\n", + "s = env.reset()\n", + "print(s.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "%matplotlib inline\n", + "# plot first observation. Only one frame\n", + "plt.imshow(s.swapaxes(1, 2).reshape(-1, s.shape[-1]).T)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# next frame\n", + "new_s, r, done, _ = env.step(env.action_space.sample())\n", + "plt.imshow(new_s.swapaxes(1, 2).reshape(-1, s.shape[-1]).T)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# after 10 frames\n", + "for _ in range(10):\n", + " new_s, r, done, _ = env.step(env.action_space.sample())\n", + "\n", + "plt.imshow(new_s.swapaxes(1, 2).reshape(-1, s.shape[-1]).T, vmin=0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "< tons of your code here or elsewhere >" + ] + } + ], + "metadata": { + "language_info": { + "name": "python", + "pygments_lexer": "ipython3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/week01_intro/seminar_gym_interface.ipynb b/week01_intro/seminar_gym_interface.ipynb new file mode 100644 index 000000000..1ba5d6df2 --- /dev/null +++ b/week01_intro/seminar_gym_interface.ipynb @@ -0,0 +1,181 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, 
+ "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline\n", + "\n", + "# In Google Colab, uncomment this:\n", + "# !wget https://bit.ly/2FMJP5K -O setup.py && bash setup.py\n", + "\n", + "# This code creates a virtual display to draw game images on.\n", + "# If you are running locally, just ignore it\n", + "import os\n", + "if type(os.environ.get(\"DISPLAY\")) is not str or len(os.environ.get(\"DISPLAY\")) == 0:\n", + " !bash ../xvfb start\n", + " %env DISPLAY = : 1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### OpenAI Gym\n", + "\n", + "We're gonna spend several next weeks learning algorithms that solve decision processes. We are then in need of some interesting decision problems to test our algorithms.\n", + "\n", + "That's where OpenAI gym comes into play. It's a python library that wraps many classical decision problems including robot control, videogames and board games.\n", + "\n", + "So here's how it works:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import gym\n", + "\n", + "env = gym.make(\"MountainCar-v0\")\n", + "env.reset()\n", + "\n", + "plt.imshow(env.render('rgb_array'))\n", + "print(\"Observation space:\", env.observation_space)\n", + "print(\"Action space:\", env.action_space)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note: if you're running this on your local machine, you'll see a window pop up with the image above. Don't close it, just alt-tab away." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Gym interface\n", + "\n", + "The three main methods of an environment are\n", + "* __reset()__ - reset environment to initial state, _return first observation_\n", + "* __render()__ - show current environment state (a more colorful version :) )\n", + "* __step(a)__ - commit action __a__ and return (new observation, reward, is done, info)\n", + " * _new observation_ - an observation right after committing the action __a__\n", + " * _reward_ - a number representing your reward for committing action __a__\n", + " * _is done_ - True if the MDP has just finished, False if still in progress\n", + " * _info_ - some auxiliary stuff about what just happened. Ignore it ~~for now~~." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "obs0 = env.reset()\n", + "print(\"initial observation code:\", obs0)\n", + "\n", + "# Note: in MountainCar, observation is just two numbers: car position and velocity" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"taking action 2 (right)\")\n", + "new_obs, reward, is_done, _ = env.step(2)\n", + "\n", + "print(\"new observation code:\", new_obs)\n", + "print(\"reward:\", reward)\n", + "print(\"is game over?:\", is_done)\n", + "\n", + "# Note: as you can see, the car has moved to the right slightly (around 0.0005)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Play with it\n", + "\n", + "Below is the code that drives the car to the right. \n", + "\n", + "However, it doesn't reach the flag at the far right due to gravity. \n", + "\n", + "__Your task__ is to fix it. Find a strategy that reaches the flag. \n", + "\n", + "You're not required to build any sophisticated algorithms for now, feel free to hard-code :)\n", + "\n", + "__Hint__: your action at each step should depend either on `t` or on `s`."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from IPython import display\n", + "\n", + "# create env manually to set time limit. Please don't change this.\n", + "TIME_LIMIT = 250\n", + "env = gym.wrappers.TimeLimit(\n", + " gym.envs.classic_control.MountainCarEnv(),\n", + " max_episode_steps=TIME_LIMIT + 1,\n", + ")\n", + "s = env.reset()\n", + "actions = {'left': 0, 'stop': 1, 'right': 2}\n", + "\n", + "plt.figure(figsize=(4, 3))\n", + "display.clear_output(wait=True)\n", + "\n", + "for t in range(TIME_LIMIT):\n", + " plt.gca().clear()\n", + " \n", + " # change the line below to reach the flag\n", + " s, r, done, _ = env.step(actions['right'])\n", + "\n", + " # draw game image on display\n", + " plt.imshow(env.render('rgb_array'))\n", + " \n", + " display.clear_output(wait=True)\n", + " display.display(plt.gcf())\n", + "\n", + " if done:\n", + " print(\"Well done!\")\n", + " break\n", + "else:\n", + " print(\"Time limit exceeded. 
Try again.\")\n", + "\n", + "display.clear_output(wait=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "assert s[0] > 0.47\n", + "print(\"You solved it!\")" + ] + } + ], + "metadata": { + "language_info": { + "name": "python", + "pygments_lexer": "ipython3" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/week02_value_based/README.md b/week02_value_based/README.md new file mode 100644 index 000000000..aca7e7a4d --- /dev/null +++ b/week02_value_based/README.md @@ -0,0 +1,14 @@ +## Materials +* [__Lecture slides__](https://docs.google.com/presentation/d/1lz2oIUTvd2MHWKEQSH8hquS66oe4MZ_eRvVViZs2uuE) +* Our videos: [lecture](https://yadi.sk/i/iAkDaxVftlAdnA) [seminar](https://yadi.sk/i/hrnHB9DK3SeZRC) (Russian) +* __[main]__ lecture by David Silver - [url](https://www.youtube.com/watch?v=Nd1-UUMVfz4) +* Alternative lecture by Pieter Abbeel (English): [part 1](https://www.youtube.com/watch?v=i0o-ui1N35U), [part 2](https://www.youtube.com/watch?v=Csiiv6WGzKM) +* Alternative lecture by John Schulman (English): [video](https://www.youtube.com/watch?v=IL3gVyJMmhg) +* Definitive guide to policy/value iteration from Sutton: start from page 81 [here](http://incompleteideas.net/book/the-book-2nd.html). + + +## Homework description: + +The main assignment is `seminar_vi.ipynb`[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/yandexdataschool/Practical_RL/blob/spring19/week02_value_based/seminar_vi.ipynb) notebook in this week's folder. It has no requirements besides the most basic data science libraries (e.g. numpy) so you should be able to run it locally. + +__Note:__ if you have any difficulty using graphviz, just set `has_graphviz=False`.
diff --git a/week2_value_based/mdp.py b/week02_value_based/mdp.py similarity index 96% rename from week2_value_based/mdp.py rename to week02_value_based/mdp.py index b14798672..2536e31b3 100644 --- a/week2_value_based/mdp.py +++ b/week02_value_based/mdp.py @@ -1,11 +1,10 @@ # most of this code was politely stolen from https://github.com/berkeleydeeprlcourse/homework/ -# all creadit goes to https://github.com/abhishekunique (if i got the author right) +# all credit goes to https://github.com/abhishekunique (if i got the author right) import sys import random import numpy as np try: - from IPython.display import display from graphviz import Digraph import graphviz has_graphviz = True @@ -13,17 +12,6 @@ has_graphviz = False -def weighted_choice(v, p): - total = sum(p) - r = random.uniform(0, total) - upto = 0 - for c, w in zip(v, p): - if upto + w >= r: - return c - upto += w - assert False, "Shouldn't get here" - - class MDP: def __init__(self, transition_probs, rewards, initial_state=None): """ @@ -114,7 +102,7 @@ def step(self, action): """ take action, return next_state, reward, is_done, empty_info """ possible_states, probs = zip( *self.get_next_states(self._current_state, action).items()) - next_state = weighted_choice(possible_states, p=probs) + next_state = possible_states[np.random.choice(np.arange(len(possible_states)), p=probs)] reward = self.get_reward(self._current_state, action, next_state) is_done = self.is_terminal(next_state) self._current_state = next_state @@ -290,7 +278,7 @@ def plot_graph(mdp, graph_size='10,10', s_node_size='1,5', :return: dot object """ s_node_attrs = {'shape': 'doublecircle', - 'color': 'lightgreen', + 'color': '#85ff75', 'style': 'filled', 'width': str(s_node_size), 'height': str(s_node_size), @@ -353,20 +341,24 @@ def plot_graph_with_state_values(mdp, state_values): value = state_values[state_node] graph.node(state_node, label=str(state_node) + '\n' + 'V =' + str(value)[:4]) - return display(graph) + return graph def 
get_optimal_action_for_plot(mdp, state_values, state, gamma=0.9): """ Finds optimal action using formula above. """ if mdp.is_terminal(state): return None next_actions = mdp.get_possible_actions(state) + try: + from mdp_get_action_value import get_action_value + except ImportError: + raise ImportError("Implement get_action_value(mdp, state_values, state, action, gamma) in the file \"mdp_get_action_value.py\".") q_values = [get_action_value(mdp, state_values, state, action, gamma) for action in next_actions] optimal_action = next_actions[np.argmax(q_values)] return optimal_action -def plot_graph_optimal_strategy_and_state_values(mdp, state_values): +def plot_graph_optimal_strategy_and_state_values(mdp, state_values, gamma=0.9): """ Plot graph with state values and """ graph = plot_graph(mdp) opt_s_a_edge_attrs = {'style': 'bold', @@ -385,4 +377,4 @@ def plot_graph_optimal_strategy_and_state_values(mdp, state_values): gamma): graph.edge(state_node, state_node + "-" + action, **opt_s_a_edge_attrs) - return display(graph) + return graph diff --git a/week02_value_based/seminar_vi.ipynb b/week02_value_based/seminar_vi.ipynb new file mode 100644 index 000000000..c4efc0851 --- /dev/null +++ b/week02_value_based/seminar_vi.ipynb @@ -0,0 +1,1367 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Markov decision process\n", + "\n", + "This week's methods are all built to solve __M__arkov __D__ecision __P__rocesses. In the broadest sense, an MDP is defined by how it changes states and how rewards are computed.\n", + "\n", + "State transition is defined by $P(s' |s,a)$ - how likely are you to end at state $s'$ if you take action $a$ from state $s$. 
Now there's more than one way to define rewards, but we'll use $r(s,a,s')$ function for convenience.\n", + "\n", + "_This notebook is inspired by the awesome_ [CS294](https://github.com/berkeleydeeprlcourse/homework/blob/36a0b58261acde756abd55306fbe63df226bf62b/hw2/HW2.ipynb) _by Berkeley_" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For starters, let's define a simple MDP from this picture:\n", + "\n", + "_img by MistWiz (Own work) [Public domain], via Wikimedia Commons_" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-02T13:44:32.642838Z", + "start_time": "2018-04-02T13:44:32.545142Z" + } + }, + "outputs": [], + "source": [ + "# If you Colab, uncomment this please\n", + "# !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/spring19/week02_value_based/mdp.py\n", + "\n", + "transition_probs = {\n", + " 's0': {\n", + " 'a0': {'s0': 0.5, 's2': 0.5},\n", + " 'a1': {'s2': 1}\n", + " },\n", + " 's1': {\n", + " 'a0': {'s0': 0.7, 's1': 0.1, 's2': 0.2},\n", + " 'a1': {'s1': 0.95, 's2': 0.05}\n", + " },\n", + " 's2': {\n", + " 'a0': {'s0': 0.4, 's1': 0.6},\n", + " 'a1': {'s0': 0.3, 's1': 0.3, 's2': 0.4}\n", + " }\n", + "}\n", + "rewards = {\n", + " 's1': {'a0': {'s0': +5}},\n", + " 's2': {'a1': {'s0': -1}}\n", + "}\n", + "\n", + "from mdp import MDP\n", + "mdp = MDP(transition_probs, rewards, initial_state='s0')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can now use MDP just as any other gym environment:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-02T13:44:34.203384Z", + "start_time": "2018-04-02T13:44:34.199297Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "initial state = s0\n", + "next_state = s2, reward = 0.0, done = False\n" + ] + } + ], + "source": [ + "print('initial state =', mdp.reset())\n", 
+ "next_state, reward, done, info = mdp.step('a1')\n", + "print('next_state = %s, reward = %s, done = %s' % (next_state, reward, done))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "but it also has other methods that you'll need for Value Iteration" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-02T13:44:34.956122Z", + "start_time": "2018-04-02T13:44:34.949856Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mdp.get_all_states = ('s0', 's1', 's2')\n", + "mdp.get_possible_actions('s1') = ('a0', 'a1')\n", + "mdp.get_next_states('s1', 'a0') = {'s0': 0.7, 's1': 0.1, 's2': 0.2}\n", + "mdp.get_reward('s1', 'a0', 's0') = 5\n", + "mdp.get_transition_prob('s1', 'a0', 's0') = 0.7\n" + ] + } + ], + "source": [ + "print(\"mdp.get_all_states =\", mdp.get_all_states())\n", + "print(\"mdp.get_possible_actions('s1') = \", mdp.get_possible_actions('s1'))\n", + "print(\"mdp.get_next_states('s1', 'a0') = \", mdp.get_next_states('s1', 'a0'))\n", + "print(\"mdp.get_reward('s1', 'a0', 's0') = \", mdp.get_reward('s1', 'a0', 's0'))\n", + "print(\"mdp.get_transition_prob('s1', 'a0', 's0') = \",\n", + " mdp.get_transition_prob('s1', 'a0', 's0'))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Optional: Visualizing MDPs\n", + "\n", + "You can also visualize any MDP with the drawing function donated by [neer201](https://github.com/neer201).\n", + "\n", + "You have to install graphviz both system-wide and for Python. On Ubuntu, just run:\n", + "\n", + "1. `sudo apt-get install graphviz`\n", + "2. `pip install graphviz`\n", + "3. restart the notebook\n", + "\n", + "__Note:__ Installing graphviz on some OS (esp. Windows) may be tricky. However, you can ignore this part altogether and use the standard visualization."
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-02T13:44:37.797182Z", + "start_time": "2018-04-02T13:44:37.794073Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Graphviz available: True\n" + ] + } + ], + "source": [ + "from mdp import has_graphviz\n", + "from IPython.display import display\n", + "print(\"Graphviz available:\", has_graphviz)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-02T13:44:38.715883Z", + "start_time": "2018-04-02T13:44:38.648684Z" + } + }, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "MDP\n", + "\n", + "\n", + "s0\n", + "\n", + "\n", + "s0\n", + "\n", + "\n", + "s0-a0\n", + "\n", + "a0\n", + "\n", + "\n", + "s0->s0-a0\n", + "\n", + "\n", + "\n", + "\n", + "s0-a1\n", + "\n", + "a1\n", + "\n", + "\n", + "s0->s0-a1\n", + "\n", + "\n", + "\n", + "\n", + "s0-a0->s0\n", + "\n", + "\n", + "p = 0.5\n", + "\n", + "\n", + "s2\n", + "\n", + "\n", + "s2\n", + "\n", + "\n", + "s0-a0->s2\n", + "\n", + "\n", + "p = 0.5\n", + "\n", + "\n", + "s2-a0\n", + "\n", + "a0\n", + "\n", + "\n", + "s2->s2-a0\n", + "\n", + "\n", + "\n", + "\n", + "s2-a1\n", + "\n", + "a1\n", + "\n", + "\n", + "s2->s2-a1\n", + "\n", + "\n", + "\n", + "\n", + "s0-a1->s2\n", + "\n", + "\n", + "p = 1\n", + "\n", + "\n", + "s1\n", + "\n", + "\n", + "s1\n", + "\n", + "\n", + "s1-a0\n", + "\n", + "a0\n", + "\n", + "\n", + "s1->s1-a0\n", + "\n", + "\n", + "\n", + "\n", + "s1-a1\n", + "\n", + "a1\n", + "\n", + "\n", + "s1->s1-a1\n", + "\n", + "\n", + "\n", + "\n", + "s1-a0->s0\n", + "\n", + "\n", + "p = 0.7  reward =5\n", + "\n", + "\n", + "s1-a0->s2\n", + "\n", + "\n", + "p = 0.2\n", + "\n", + "\n", + "s1-a0->s1\n", + "\n", + "\n", + "p = 0.1\n", + "\n", + "\n", + "s1-a1->s2\n", + "\n", + "\n", + "p = 0.05\n", + "\n", + "\n", + "s1-a1->s1\n", + "\n", + 
"\n", + "p = 0.95\n", + "\n", + "\n", + "s2-a0->s0\n", + "\n", + "\n", + "p = 0.4\n", + "\n", + "\n", + "s2-a0->s1\n", + "\n", + "\n", + "p = 0.6\n", + "\n", + "\n", + "s2-a1->s0\n", + "\n", + "\n", + "p = 0.3  reward =-1\n", + "\n", + "\n", + "s2-a1->s2\n", + "\n", + "\n", + "p = 0.4\n", + "\n", + "\n", + "s2-a1->s1\n", + "\n", + "\n", + "p = 0.3\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "if has_graphviz:\n", + " from mdp import plot_graph, plot_graph_with_state_values, \\\n", + " plot_graph_optimal_strategy_and_state_values\n", + "\n", + " display(plot_graph(mdp))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Value Iteration\n", + "\n", + "Now let's build something to solve this MDP. The simplest algorithm so far is __V__alue __I__teration\n", + "\n", + "Here's the pseudo-code for VI:\n", + "\n", + "---\n", + "\n", + "`1.` Initialize $V^{(0)}(s)=0$, for all $s$\n", + "\n", + "`2.` For $i=0, 1, 2, \\dots$\n", + " \n", + "`3.` $ \\quad V_{(i+1)}(s) = \\max_a \\sum_{s'} P(s' | s,a) \\cdot [ r(s,a,s') + \\gamma V_{i}(s')]$, for all $s$\n", + "\n", + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First, let's write a function to compute the state-action value function $Q^{\\pi}$, defined as follows\n", + "\n", + "$$Q_i(s, a) = \\sum_{s'} P(s' | s,a) \\cdot [ r(s,a,s') + \\gamma V_{i}(s')]$$\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-02T13:43:17.101416Z", + "start_time": "2018-04-02T13:43:17.095468Z" + }, + "collapsed": true + }, + "outputs": [], + "source": [ + "%%writefile mdp_get_action_value.py\n", + "\n", + "def get_action_value(mdp, state_values, state, action, gamma):\n", + " \"\"\" Computes Q(s,a) as in formula above \"\"\"\n", + "\n", + " # YOUR CODE HERE\n", + "\n", + " return < YOUR CODE >" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from mdp_get_action_value import get_action_value" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-02T13:43:17.102247Z", + "start_time": "2018-04-02T13:43:05.502Z" + }, + "collapsed": true + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "test_Vs = {s: i for i, s in enumerate(sorted(mdp.get_all_states()))}\n", + "assert np.allclose(get_action_value(mdp, test_Vs, 's2', 'a1', 0.9), 0.69)\n", + "assert np.allclose(get_action_value(mdp, test_Vs, 's1', 'a0', 0.9), 3.95)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Using $Q(s,a)$ we can now define the \"next\" V(s) for value iteration.\n", + " $$V_{(i+1)}(s) = \\max_a \\sum_{s'} P(s' | s,a) \\cdot [ r(s,a,s') + \\gamma V_{i}(s')] = \\max_a Q_i(s,a)$$" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-02T13:43:17.103358Z", + "start_time": "2018-04-02T13:43:05.506Z" + }, + "collapsed": true + }, + "outputs": [], + "source": [ + "def get_new_state_value(mdp, state_values, state, gamma):\n", + " \"\"\" Computes next V(s) as in formula above. Please do not change state_values in process. 
\"\"\"\n", + " if mdp.is_terminal(state):\n", + " return 0\n", + "\n", + " # \n", + " return" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-02T13:43:17.104340Z", + "start_time": "2018-04-02T13:43:05.510Z" + }, + "collapsed": true + }, + "outputs": [], + "source": [ + "test_Vs_copy = dict(test_Vs)\n", + "assert np.allclose(get_new_state_value(mdp, test_Vs, 's0', 0.9), 1.8)\n", + "assert np.allclose(get_new_state_value(mdp, test_Vs, 's2', 0.9), 0.69)\n", + "assert test_Vs == test_Vs_copy, \"please do not change state_values in get_new_state_value\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, let's combine everything we wrote into a working value iteration algo." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-02T13:44:09.793405Z", + "start_time": "2018-04-02T13:44:09.770623Z" + }, + "collapsed": true + }, + "outputs": [], + "source": [ + "# parameters\n", + "gamma = 0.9 # discount for MDP\n", + "num_iter = 100 # maximum iterations, excluding initialization\n", + "# stop VI if new values are this close to old values (or closer)\n", + "min_difference = 0.001\n", + "\n", + "# initialize V(s)\n", + "state_values = {s: 0 for s in mdp.get_all_states()}\n", + "\n", + "if has_graphviz:\n", + " display(plot_graph_with_state_values(mdp, state_values))\n", + "\n", + "for i in range(num_iter):\n", + "\n", + " # Compute new state values using the functions you defined above.\n", + " # It must be a dict {state : float V_new(state)}\n", + " new_state_values = \n", + "\n", + " assert isinstance(new_state_values, dict)\n", + "\n", + " # Compute difference\n", + " diff = max(abs(new_state_values[s] - state_values[s])\n", + " for s in mdp.get_all_states())\n", + " print(\"iter %4i | diff: %6.5f | \" % (i, diff), end=\"\")\n", + " print(' '.join(\"V(%s) = %.3f\" % (s, v)\n", + " for s, v in 
state_values.items()), end='\\n\\n')\n", + " state_values = new_state_values\n", + "\n", + " if diff < min_difference:\n", + " print(\"Terminated\")\n", + " break" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "if has_graphviz:\n", + " display(plot_graph_with_state_values(mdp, state_values))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-02T13:43:17.106395Z", + "start_time": "2018-04-02T13:43:05.522Z" + }, + "collapsed": true + }, + "outputs": [], + "source": [ + "print(\"Final state values:\", state_values)\n", + "\n", + "assert abs(state_values['s0'] - 8.032) < 0.01\n", + "assert abs(state_values['s1'] - 11.169) < 0.01\n", + "assert abs(state_values['s2'] - 8.921) < 0.01" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's use those $V^{*}(s)$ to find optimal actions in each state\n", + "\n", + " $$\\pi^*(s) = argmax_a \\sum_{s'} P(s' | s,a) \\cdot [ r(s,a,s') + \\gamma V_{i}(s')] = argmax_a Q_i(s,a)$$\n", + " \n", + "The only difference vs V(s) is that here we take argmax instead of max: find the action with the maximum Q(s,a)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-02T13:43:17.107338Z", + "start_time": "2018-04-02T13:43:05.525Z" + }, + "collapsed": true + }, + "outputs": [], + "source": [ + "def get_optimal_action(mdp, state_values, state, gamma=0.9):\n", + " \"\"\" Finds optimal action using formula above.
\"\"\"\n", + " if mdp.is_terminal(state):\n", + " return None\n", + "\n", + " # \n", + "\n", + " return < YOUR CODE >" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-02T13:43:17.108149Z", + "start_time": "2018-04-02T13:43:05.530Z" + }, + "collapsed": true + }, + "outputs": [], + "source": [ + "assert get_optimal_action(mdp, state_values, 's0', gamma) == 'a1'\n", + "assert get_optimal_action(mdp, state_values, 's1', gamma) == 'a0'\n", + "assert get_optimal_action(mdp, state_values, 's2', gamma) == 'a0'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-02T13:44:05.017823Z", + "start_time": "2018-04-02T13:44:04.962755Z" + }, + "collapsed": true + }, + "outputs": [], + "source": [ + "if has_graphviz:\n", + " try:\n", + " display(plot_graph_optimal_strategy_and_state_values(mdp, state_values))\n", + " except ImportError:\n", + " raise ImportError(\"Run the cell that starts with \\\"%%writefile mdp_get_action_value.py\\\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-02T13:43:17.110002Z", + "start_time": "2018-04-02T13:43:05.538Z" + }, + "collapsed": true + }, + "outputs": [], + "source": [ + "# Measure agent's average reward\n", + "\n", + "s = mdp.reset()\n", + "rewards = []\n", + "for _ in range(10000):\n", + " s, r, done, _ = mdp.step(get_optimal_action(mdp, state_values, s, gamma))\n", + " rewards.append(r)\n", + "\n", + "print(\"average reward: \", np.mean(rewards))\n", + "\n", + "assert(0.85 < np.mean(rewards) < 1.0)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Frozen lake" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-02T13:43:17.110991Z", + "start_time": "2018-04-02T13:43:05.541Z" + }, + "collapsed": true + }, + "outputs": [], + 
"source": [ + "from mdp import FrozenLakeEnv\n", + "mdp = FrozenLakeEnv(slip_chance=0)\n", + "\n", + "mdp.render()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-02T13:43:17.111919Z", + "start_time": "2018-04-02T13:43:05.545Z" + }, + "collapsed": true + }, + "outputs": [], + "source": [ + "def value_iteration(mdp, state_values=None, gamma=0.9, num_iter=1000, min_difference=1e-5):\n", + " \"\"\" performs num_iter value iteration steps starting from state_values. Same as before but in a function \"\"\"\n", + " state_values = state_values or {s: 0 for s in mdp.get_all_states()}\n", + " for i in range(num_iter):\n", + "\n", + " # Compute new state values using the functions you defined above. It must be a dict {state : new_V(state)}\n", + " new_state_values = \n", + "\n", + " assert isinstance(new_state_values, dict)\n", + "\n", + " # Compute difference\n", + " diff = max(abs(new_state_values[s] - state_values[s])\n", + " for s in mdp.get_all_states())\n", + "\n", + " print(\"iter %4i | diff: %6.5f | V(start): %.3f \" %\n", + " (i, diff, new_state_values[mdp._initial_state]))\n", + "\n", + " state_values = new_state_values\n", + " if diff < min_difference:\n", + " break\n", + "\n", + " return state_values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-02T13:43:17.112871Z", + "start_time": "2018-04-02T13:43:05.548Z" + }, + "collapsed": true + }, + "outputs": [], + "source": [ + "state_values = value_iteration(mdp)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-02T13:43:17.114062Z", + "start_time": "2018-04-02T13:43:05.552Z" + }, + "collapsed": true + }, + "outputs": [], + "source": [ + "s = mdp.reset()\n", + "mdp.render()\n", + "for t in range(100):\n", + " a = get_optimal_action(mdp, state_values, s, gamma)\n", + " print(a, end='\\n\\n')\n", + " s, 
r, done, _ = mdp.step(a)\n", + " mdp.render()\n", + " if done:\n", + " break" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Let's visualize!\n", + "\n", + "It's usually interesting to see what your algorithm actually learned under the hood. To do so, we'll plot state value functions and optimal actions at each VI step." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-02T13:43:17.115092Z", + "start_time": "2018-04-02T13:43:05.556Z" + }, + "collapsed": true + }, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "%matplotlib inline\n", + "\n", + "\n", + "def draw_policy(mdp, state_values):\n", + " plt.figure(figsize=(3, 3))\n", + " h, w = mdp.desc.shape\n", + " states = sorted(mdp.get_all_states())\n", + " V = np.array([state_values[s] for s in states])\n", + " Pi = {s: get_optimal_action(mdp, state_values, s, gamma) for s in states}\n", + " plt.imshow(V.reshape(h, w), cmap='gray', interpolation='none', clim=(0, 1))\n", + " ax = plt.gca()\n", + " ax.set_xticks(np.arange(w)-.5)\n", + " ax.set_yticks(np.arange(h)-.5)\n", + " ax.set_xticklabels([])\n", + " ax.set_yticklabels([])\n", + " Y, X = np.mgrid[0:4, 0:4]\n", + " a2uv = {'left': (-1, 0), 'down': (0, -1), 'right': (1, 0), 'up': (0, 1)} # (u, v); v is negated in plt.arrow below because the image y axis points down\n", + " for y in range(h):\n", + " for x in range(w):\n", + " plt.text(x, y, str(mdp.desc[y, x].item()),\n", + " color='g', size=12, verticalalignment='center',\n", + " horizontalalignment='center', fontweight='bold')\n", + " a = Pi[y, x]\n", + " if a is None:\n", + " continue\n", + " u, v = a2uv[a]\n", + " plt.arrow(x, y, u*.3, -v*.3, color='m',\n", + " head_width=0.1, head_length=0.1)\n", + " plt.grid(color='b', lw=2, ls='-')\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": 
"2018-04-02T13:43:17.116164Z", + "start_time": "2018-04-02T13:43:05.560Z" + }, + "collapsed": true + 
}, + "outputs": [], + "source": [ + "state_values = {s: 0 for s in mdp.get_all_states()}\n", + "\n", + "for i in range(10):\n", + " print(\"after iteration %i\" % i)\n", + " state_values = value_iteration(mdp, state_values, num_iter=1)\n", + " draw_policy(mdp, state_values)\n", + "# please ignore iter 0 at each step" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-02T13:43:17.117143Z", + "start_time": "2018-04-02T13:43:05.563Z" + }, + "collapsed": true + }, + "outputs": [], + "source": [ + "from IPython.display import clear_output\n", + "from time import sleep\n", + "mdp = FrozenLakeEnv(map_name='8x8', slip_chance=0.1)\n", + "state_values = {s: 0 for s in mdp.get_all_states()}\n", + "\n", + "for i in range(30):\n", + " clear_output(True)\n", + " print(\"after iteration %i\" % i)\n", + " state_values = value_iteration(mdp, state_values, num_iter=1)\n", + " draw_policy(mdp, state_values)\n", + " sleep(0.5)\n", + "# please ignore iter 0 at each step" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Massive tests" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-02T13:43:17.118218Z", + "start_time": "2018-04-02T13:43:05.568Z" + }, + "collapsed": true + }, + "outputs": [], + "source": [ + "mdp = FrozenLakeEnv(slip_chance=0)\n", + "state_values = value_iteration(mdp)\n", + "\n", + "total_rewards = []\n", + "for game_i in range(1000):\n", + " s = mdp.reset()\n", + " rewards = []\n", + " for t in range(100):\n", + " s, r, done, _ = mdp.step(\n", + " get_optimal_action(mdp, state_values, s, gamma))\n", + " rewards.append(r)\n", + " if done:\n", + " break\n", + " total_rewards.append(np.sum(rewards))\n", + "\n", + "print(\"average reward: \", np.mean(total_rewards))\n", + "assert(1.0 <= np.mean(total_rewards) <= 1.0)\n", + "print(\"Well done!\")" + ] + }, + { + "cell_type": "code", + "execution_count": 
null, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-02T13:43:17.119075Z", + "start_time": "2018-04-02T13:43:05.571Z" + }, + "collapsed": true + }, + "outputs": [], + "source": [ + "# Measure agent's average reward\n", + "mdp = FrozenLakeEnv(slip_chance=0.1)\n", + "state_values = value_iteration(mdp)\n", + "\n", + "total_rewards = []\n", + "for game_i in range(1000):\n", + " s = mdp.reset()\n", + " rewards = []\n", + " for t in range(100):\n", + " s, r, done, _ = mdp.step(\n", + " get_optimal_action(mdp, state_values, s, gamma))\n", + " rewards.append(r)\n", + " if done:\n", + " break\n", + " total_rewards.append(np.sum(rewards))\n", + "\n", + "print(\"average reward: \", np.mean(total_rewards))\n", + "assert(0.8 <= np.mean(total_rewards) <= 0.95)\n", + "print(\"Well done!\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-02T13:43:17.120316Z", + "start_time": "2018-04-02T13:43:05.574Z" + }, + "collapsed": true + }, + "outputs": [], + "source": [ + "# Measure agent's average reward\n", + "mdp = FrozenLakeEnv(slip_chance=0.25)\n", + "state_values = value_iteration(mdp)\n", + "\n", + "total_rewards = []\n", + "for game_i in range(1000):\n", + " s = mdp.reset()\n", + " rewards = []\n", + " for t in range(100):\n", + " s, r, done, _ = mdp.step(\n", + " get_optimal_action(mdp, state_values, s, gamma))\n", + " rewards.append(r)\n", + " if done:\n", + " break\n", + " total_rewards.append(np.sum(rewards))\n", + "\n", + "print(\"average reward: \", np.mean(total_rewards))\n", + "assert(0.6 <= np.mean(total_rewards) <= 0.7)\n", + "print(\"Well done!\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-02T13:43:17.121544Z", + "start_time": "2018-04-02T13:43:05.578Z" + }, + "collapsed": true + }, + "outputs": [], + "source": [ + "# Measure agent's average reward\n", + "mdp = FrozenLakeEnv(slip_chance=0.2, 
map_name='8x8')\n", + "state_values = value_iteration(mdp)\n", + "\n", + "total_rewards = []\n", + "for game_i in range(1000):\n", + " s = mdp.reset()\n", + " rewards = []\n", + " for t in range(100):\n", + " s, r, done, _ = mdp.step(\n", + " get_optimal_action(mdp, state_values, s, gamma))\n", + " rewards.append(r)\n", + " if done:\n", + " break\n", + " total_rewards.append(np.sum(rewards))\n", + "\n", + "print(\"average reward: \", np.mean(total_rewards))\n", + "assert(0.6 <= np.mean(total_rewards) <= 0.8)\n", + "print(\"Well done!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Bonus area" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Bonus 1 - find an MDP for which value iteration takes long to converge (2+ pts)\n", + "\n", + "When we ran value iteration on the small frozen lake problem, the last iteration where an action changed was iteration 6--i.e., value iteration computed the optimal policy at iteration 6. Are there any guarantees regarding how many iterations it'll take value iteration to compute the optimal policy? There are no such guarantees without additional assumptions--we can construct the MDP in such a way that the greedy policy will change after arbitrarily many iterations.\n", + "\n", + "Your task: define an MDP with at most 3 states and 2 actions, such that when you run value iteration, the optimal action changes at iteration >= 50. Use discount=0.95. (However, note that the discount doesn't matter here--you can construct an appropriate MDP with any discount.)\n", + "\n", + "Note: value function must change at least once after iteration >=50, not necessarily change on every iteration till >=50." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-02T13:43:17.122424Z", + "start_time": "2018-04-02T13:43:05.582Z" + }, + "collapsed": true + }, + "outputs": [], + "source": [ + "transition_probs = {\n", + " < YOUR CODE >\n", + "}\n", + "rewards = {\n", + " < YOUR CODE >\n", + "}\n", + "\n", + "from mdp import MDP\n", + "from numpy import random\n", + "mdp = MDP(transition_probs, rewards, initial_state=random.choice(tuple(transition_probs.keys())))\n", + "# Feel free to change the initial_state" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-02T13:43:17.123825Z", + "start_time": "2018-04-02T13:43:05.586Z" + }, + "collapsed": true + }, + "outputs": [], + "source": [ + "state_values = {s: 0 for s in mdp.get_all_states()}\n", + "policy = np.array([get_optimal_action(mdp, state_values, state, gamma)\n", + " for state in sorted(mdp.get_all_states())])\n", + "\n", + "for i in range(100):\n", + " print(\"after iteration %i\" % i)\n", + " state_values = value_iteration(mdp, state_values, num_iter=1)\n", + "\n", + " new_policy = np.array([get_optimal_action(mdp, state_values, state, gamma)\n", + " for state in sorted(mdp.get_all_states())])\n", + "\n", + " n_changes = (policy != new_policy).sum()\n", + " print(\"N actions changed = %i \\n\" % n_changes)\n", + " policy = new_policy\n", + "\n", + "# please ignore iter 0 at each step" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Bonus 2 - Policy Iteration (3+ points)\n", + "\n", + "Let's implement exact policy iteration (PI), which has the following pseudocode:\n", + "\n", + "---\n", + "Initialize $\\pi_0$ `// random or fixed action`\n", + "\n", + "For $n=0, 1, 2, \\dots$\n", + "- Compute the state-value function $V^{\\pi_{n}}$\n", + "- Using $V^{\\pi_{n}}$, compute the state-action-value function $Q^{\\pi_{n}}$\n", + "- Compute new policy 
$\\pi_{n+1}(s) = \\operatorname*{argmax}_a Q^{\\pi_{n}}(s,a)$\n", + "---\n", + "\n", + "Unlike VI, policy iteration has to maintain a policy - chosen actions from all states - and estimate $V^{\\pi_{n}}$ based on this policy. It only changes policy once values converged.\n", + "\n", + "\n", + "Below are a few helpers that you may or may not use in your implementation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-02T13:43:17.125320Z", + "start_time": "2018-04-02T13:43:05.590Z" + }, + "collapsed": true + }, + "outputs": [], + "source": [ + "transition_probs = {\n", + " 's0': {\n", + " 'a0': {'s0': 0.5, 's2': 0.5},\n", + " 'a1': {'s2': 1}\n", + " },\n", + " 's1': {\n", + " 'a0': {'s0': 0.7, 's1': 0.1, 's2': 0.2},\n", + " 'a1': {'s1': 0.95, 's2': 0.05}\n", + " },\n", + " 's2': {\n", + " 'a0': {'s0': 0.4, 's1': 0.6},\n", + " 'a1': {'s0': 0.3, 's1': 0.3, 's2': 0.4}\n", + " }\n", + "}\n", + "rewards = {\n", + " 's1': {'a0': {'s0': +5}},\n", + " 's2': {'a1': {'s0': -1}}\n", + "}\n", + "\n", + "from mdp import MDP\n", + "mdp = MDP(transition_probs, rewards, initial_state='s0')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's write a function called `compute_vpi` that computes the state-value function $V^{\\pi}$ for an arbitrary policy $\\pi$.\n", + "\n", + "Unlike VI, this time you must find the exact solution, not just a single iteration.\n", + "\n", + "Recall that $V^{\\pi}$ satisfies the following linear equation:\n", + "$$V^{\\pi}(s) = \\sum_{s'} P(s,\\pi(s),s')[ R(s,\\pi(s),s') + \\gamma V^{\\pi}(s')]$$\n", + "\n", + "You'll have to solve a linear system in your code. 
(Find an exact solution, e.g., with `np.linalg.solve`.)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-02T13:43:17.126518Z", + "start_time": "2018-04-02T13:43:05.593Z" + }, + "collapsed": true + }, + "outputs": [], + "source": [ + "def compute_vpi(mdp, policy, gamma):\n", + " \"\"\"\n", + " Computes V^pi(s) FOR ALL STATES under given policy.\n", + " :param policy: a dict of currently chosen actions {s : a}\n", + " :returns: a dict {state : V^pi(state) for all states}\n", + " \"\"\"\n", + " # YOUR CODE HERE\n", + " return < YOUR CODE >" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-02T13:43:17.127349Z", + "start_time": "2018-04-02T13:43:05.597Z" + }, + "collapsed": true + }, + "outputs": [], + "source": [ + "test_policy = {s: np.random.choice(\n", + " mdp.get_possible_actions(s)) for s in mdp.get_all_states()}\n", + "new_vpi = compute_vpi(mdp, test_policy, gamma)\n", + "\n", + "print(new_vpi)\n", + "\n", + "assert type(\n", + " new_vpi) is dict, \"compute_vpi must return a dict {state : V^pi(state) for all states}\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once we've got new state values, it's time to update our policy." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-02T13:43:17.128415Z", + "start_time": "2018-04-02T13:43:05.601Z" + }, + "collapsed": true + }, + "outputs": [], + "source": [ + "def compute_new_policy(mdp, vpi, gamma):\n", + " \"\"\"\n", + " Computes new policy as argmax of state values\n", + " :param vpi: a dict {state : V^pi(state) for all states}\n", + " :returns: a dict {state : optimal action for all states}\n", + " \"\"\"\n", + " \n", + " return < YOUR CODE >" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-02T13:43:17.129416Z", + "start_time": "2018-04-02T13:43:05.604Z" + }, + "collapsed": true + }, + "outputs": [], + "source": [ + "new_policy = compute_new_policy(mdp, new_vpi, gamma)\n", + "\n", + "print(new_policy)\n", + "\n", + "assert type(\n", + " new_policy) is dict, \"compute_new_policy must return a dict {state : optimal action for all states}\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "__Main loop__" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-02T13:43:17.130183Z", + "start_time": "2018-04-02T13:43:05.608Z" + }, + "collapsed": true + }, + "outputs": [], + "source": [ + "def policy_iteration(mdp, policy=None, gamma=0.9, num_iter=1000, min_difference=1e-5):\n", + " \"\"\" \n", + " Run the policy iteration loop for num_iter iterations or till difference between V(s) is below min_difference.\n", + " If policy is not given, initialize it at random.\n", + " \"\"\"\n", + " < A WHOLE LOT OF YOUR CODE >\n", + "\n", + " return state_values, policy" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "__Your PI Results__" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-02T13:43:17.130926Z", + "start_time": 
"2018-04-02T13:43:05.612Z" + }, + "collapsed": true + }, + "outputs": [], + "source": [ + "< Compare PI and VI on the MDP from bonus 1, then on small & large FrozenLake >" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + }, + "toc": { + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": false + }, + "varInspector": { + "cols": { + "lenName": 16, + "lenType": 16, + "lenVar": 40 + }, + "kernels_config": { + "python": { + "delete_cmd_postfix": "", + "delete_cmd_prefix": "del ", + "library": "var_list.py", + "varRefreshCmd": "print(var_dic_list())" + }, + "r": { + "delete_cmd_postfix": ") ", + "delete_cmd_prefix": "rm(", + "library": "var_list.r", + "varRefreshCmd": "cat(var_dic_list()) " + } + }, + "types_to_exclude": [ + "module", + "function", + "builtin_function_or_method", + "instance", + "_Feature" + ], + "window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/week3_model_free/README.md b/week03_model_free/README.md similarity index 74% rename from week3_model_free/README.md rename to week03_model_free/README.md index 920f87c3a..c197e9532 100644 --- a/week3_model_free/README.md +++ b/week03_model_free/README.md @@ -11,15 +11,24 @@ - Blog post on q-learning Vs SARSA - [url](https://studywolf.wordpress.com/2013/07/01/reinforcement-learning-sarsa-vs-q-learning/) ### More materials -* N-step temporal difference from Sutton's book - [suttonbook](http://incompleteideas.net/book/bookdraft2018jan1.pdf) __chapter 7__ 
-* Eligibility traces from Sutton's book - [suttonbook](http://incompleteideas.net/book/bookdraft2018jan1.pdf) __chapter 12__ +* N-step temporal difference from Sutton's book - [suttonbook](http://incompleteideas.net/book/RLbook2018.pdf) __chapter 7__ +* Eligibility traces from Sutton's book - [suttonbook](http://incompleteideas.net/book/RLbook2018.pdf) __chapter 12__ * Blog post on eligibility traces - [url](http://pierrelucbacon.com/traces/) ### Assignments -Just as usual, start with `seminar_qlearning.ipynb` and then proceed to `homework.ipynb`. +Just as usual, start with +- `seminar_qlearning.ipynb` +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/yandexdataschool/Practical_RL/blob/spring19/week03_model_free/seminar_qlearning.ipynb) +and then proceed to +- `homework.ipynb` +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/yandexdataschool/Practical_RL/blob/spring19/week03_model_free/homework.ipynb) + +If you use Colab, please remember to uncomment the first lines of code in each notebook. + +--- (optional) If you're running on a local machine (e.g. your pc) with python2, you can also try `seminar_py2`. It has some neat RL problems with cool visualizations. ### ./seminar_py2 @@ -29,7 +38,7 @@ This homework assignment works on __python2 only__. If you stick to py3, conside This homework also requires some physical display (e.g. laptop monitor). It won't work on binder VM / headless server. Please run it on laptop or consider ./seminar_alternative -* You need to implement **QLearining** algorithm. If you're running go to ```seminar_main/``` folder and open file ```qlearningAgent.py```. +* You need to implement **QLearning** algorithm. 
Once you're done, run use those commands: ``` @@ -40,7 +49,7 @@ python pacman.py -p PacmanQAgent -x 5000 -n 5010 -l smallGrid # example * Make sure you can tune agent to beat ./run_crawler.sh * on windows, just run `python crawler.py` from cmd in the project directory * other ./run* files are mostly for your amusement. - * ./run_pacman.sh will need more epochs to converge, see [comments](https://github.com/yandexdataschool/Practical_RL/blob/master/week3_model_free/seminar_py2/run_pacman.sh) + * ./run_pacman.sh will need more epochs to converge, see [comments](https://github.com/yandexdataschool/Practical_RL/blob/spring19/week03_model_free/seminar_py2/run_pacman.sh) * on windows, just copy the type `python pacman.py -p PacmanQAgent -x 2000 -n 2010 -l smallGrid` in cmd from assignemnt dir (YSDA/HSE) Please submit only qlearningAgents.py file and include a brief text report as comments in it. diff --git a/week03_model_free/homework.ipynb b/week03_model_free/homework.ipynb new file mode 100644 index 000000000..aaaeb0cd7 --- /dev/null +++ b/week03_model_free/homework.ipynb @@ -0,0 +1,819 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[Part I: On-policy learning and SARSA (3 points)](#Part-I:-On-policy-learning-and-SARSA-(3-points))\n", + "\n", + "[Part II: Experience replay (4 points)](#Part-II:-experience-replay-(4-points))\n", + "\n", + "[Bonus I: TD($ \\lambda $) (5+ points)](#Bonus-I:-TD($\\lambda$)-(5+-points))\n", + "\n", + "[Bonus II: More pacman (5+ points)](#Bonus-II:-More-pacman-(5+-points))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Part I: On-policy learning and SARSA (3 points)\n", + "\n", + "_This notebook builds upon `qlearning.ipynb`, or to be exact, generating qlearning.py._\n", + "\n", + "The policy we're gonna use is epsilon-greedy policy, where agent takes optimal action with probability $(1-\\epsilon)$, otherwise samples action at random. 
Note that agent __can__ occasionally sample optimal action during random sampling by pure chance." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# In Google Colab, uncomment this:\n", + "# !wget https://bit.ly/2FMJP5K -q -O setup.py\n", + "# !bash setup.py 2>&1 1>stdout.log | tee stderr.log\n", + "\n", + "# This code creates a virtual display to draw game images on.\n", + "# If you are running locally, just ignore it\n", + "import os\n", + "if type(os.environ.get(\"DISPLAY\")) is not str or len(os.environ.get(\"DISPLAY\")) == 0:\n", + " !bash ../xvfb start\n", + " %env DISPLAY=:1\n", + "\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline\n", + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now you can use code, generated from seminar `seminar_qlearning.ipynb`. Or just copy&paste it." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile qlearning.py\n", + "from collections import defaultdict\n", + "import random\n", + "import math\n", + "import numpy as np\n", + "\n", + "\n", + "class QLearningAgent:\n", + " def __init__(self, alpha, epsilon, discount, get_legal_actions):\n", + " \"\"\"\n", + " Q-Learning Agent\n", + " based on https://inst.eecs.berkeley.edu/~cs188/sp19/projects.html\n", + " Instance variables you have access to\n", + " - self.epsilon (exploration prob)\n", + " - self.alpha (learning rate)\n", + " - self.discount (discount rate aka gamma)\n", + "\n", + " Functions you should use\n", + " - self.get_legal_actions(state) {state, hashable -> list of actions, each is hashable}\n", + " which returns legal actions for a state\n", + " - self.get_qvalue(state,action)\n", + " which returns Q(state,action)\n", + " - self.set_qvalue(state,action,value)\n", + " which sets Q(state,action) := value\n", + "\n", + " !!!Important!!!\n", + " Note: please avoid using self._qValues directly. 
\n", + " There's a special self.get_qvalue/set_qvalue for that.\n", + " \"\"\"\n", + "\n", + " self.get_legal_actions = get_legal_actions\n", + " self._qvalues = defaultdict(lambda: defaultdict(lambda: 0))\n", + " self.alpha = alpha\n", + " self.epsilon = epsilon\n", + " self.discount = discount\n", + "\n", + " def get_qvalue(self, state, action):\n", + " \"\"\" Returns Q(state,action) \"\"\"\n", + " return self._qvalues[state][action]\n", + "\n", + " def set_qvalue(self, state, action, value):\n", + " \"\"\" Sets the Qvalue for [state,action] to the given value \"\"\"\n", + " self._qvalues[state][action] = value\n", + "\n", + " #---------------------START OF YOUR CODE---------------------#\n", + "\n", + " def get_value(self, state):\n", + " \"\"\"\n", + " Compute your agent's estimate of V(s) using current q-values\n", + " V(s) = max_over_action Q(state,action) over possible actions.\n", + " Note: please take into account that q-values can be negative.\n", + " \"\"\"\n", + " possible_actions = self.get_legal_actions(state)\n", + "\n", + " # If there are no legal actions, return 0.0\n", + " if len(possible_actions) == 0:\n", + " return 0.0\n", + "\n", + " \n", + "\n", + " return value\n", + "\n", + " def update(self, state, action, reward, next_state):\n", + " \"\"\"\n", + " You should do your Q-Value update here:\n", + " Q(s,a) := (1 - alpha) * Q(s,a) + alpha * (r + gamma * V(s'))\n", + " \"\"\"\n", + "\n", + " # agent parameters\n", + " gamma = self.discount\n", + " learning_rate = self.alpha\n", + "\n", + " \n", + "\n", + " self.set_qvalue(state, action, < YOUR_QVALUE > )\n", + "\n", + " def get_best_action(self, state):\n", + " \"\"\"\n", + " Compute the best action to take in a state (using current q-values). 
\n", + " \"\"\"\n", + " possible_actions = self.get_legal_actions(state)\n", + "\n", + " # If there are no legal actions, return None\n", + " if len(possible_actions) == 0:\n", + " return None\n", + "\n", + " \n", + "\n", + " return best_action\n", + "\n", + " def get_action(self, state):\n", + " \"\"\"\n", + " Compute the action to take in the current state, including exploration. \n", + " With probability self.epsilon, we should take a random action.\n", + " otherwise - the best policy action (self.getPolicy).\n", + "\n", + " Note: To pick randomly from a list, use random.choice(list). \n", + " To pick True or False with a given probablity, generate uniform number in [0, 1]\n", + " and compare it with your probability\n", + " \"\"\"\n", + "\n", + " # Pick Action\n", + " possible_actions = self.get_legal_actions(state)\n", + " action = None\n", + "\n", + " # If there are no legal actions, return None\n", + " if len(possible_actions) == 0:\n", + " return None\n", + "\n", + " # agent parameters:\n", + " epsilon = self.epsilon\n", + "\n", + " \n", + "\n", + " return chosen_action" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from qlearning import QLearningAgent\n", + "\n", + "\n", + "class EVSarsaAgent(QLearningAgent):\n", + " \"\"\" \n", + " An agent that changes some of q-learning functions to implement Expected Value SARSA. 
\n", + " Note: this demo assumes that your implementation of QLearningAgent.update uses get_value(next_state).\n", + " If it doesn't, please add\n", + " def update(self, state, action, reward, next_state):\n", + " and implement it for Expected Value SARSA's V(s')\n", + " \"\"\"\n", + "\n", + " def get_value(self, state):\n", + " \"\"\" \n", + " Returns Vpi for current state under epsilon-greedy policy:\n", + " V_{pi}(s) = sum _{over a_i} {pi(a_i | s) * Q(s, a_i)}\n", + "\n", + " Hint: all other methods from QLearningAgent are still accessible.\n", + " \"\"\"\n", + " epsilon = self.epsilon\n", + " possible_actions = self.get_legal_actions(state)\n", + "\n", + " # If there are no legal actions, return 0.0\n", + " if len(possible_actions) == 0:\n", + " return 0.0\n", + "\n", + " \n", + "\n", + " return state_value" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Cliff World\n", + "\n", + "Let's now see how our algorithm compares against q-learning in case where we force agent to explore all the time.\n", + "\n", + "\n", + "
image by cs188
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import gym\n", + "import gym.envs.toy_text\n", + "env = gym.envs.toy_text.CliffWalkingEnv()\n", + "n_actions = env.action_space.n\n", + "\n", + "print(env.__doc__)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Our cliffworld has one difference from what's on the image: there is no wall.\n", + "# Agent can choose to go as close to the cliff as it wishes. x:start, T:exit, C:cliff, o: flat ground\n", + "env.render()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def play_and_train(env, agent, t_max=10**4):\n", + " \"\"\"This function should \n", + " - run a full game, actions given by agent.getAction(s)\n", + " - train agent using agent.update(...) whenever possible\n", + " - return total reward\"\"\"\n", + " total_reward = 0.0\n", + " s = env.reset()\n", + "\n", + " for t in range(t_max):\n", + " a = agent.get_action(s)\n", + "\n", + " next_s, r, done, _ = env.step(a)\n", + " agent.update(s, a, r, next_s)\n", + "\n", + " s = next_s\n", + " total_reward += r\n", + " if done:\n", + " break\n", + "\n", + " return total_reward" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from qlearning import QLearningAgent\n", + "\n", + "agent_sarsa = EVSarsaAgent(alpha=0.25, epsilon=0.2, discount=0.99,\n", + " get_legal_actions=lambda s: range(n_actions))\n", + "\n", + "agent_ql = QLearningAgent(alpha=0.25, epsilon=0.2, discount=0.99,\n", + " get_legal_actions=lambda s: range(n_actions))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from IPython.display import clear_output\n", + 
"from pandas import DataFrame\n", + "\n", + "\n", + "def moving_average(x, span=100): return DataFrame(\n", + " {'x': np.asarray(x)}).x.ewm(span=span).mean().values\n", + "\n", + "\n", + "rewards_sarsa, rewards_ql = [], []\n", + "\n", + "for i in range(5000):\n", + " rewards_sarsa.append(play_and_train(env, agent_sarsa))\n", + " rewards_ql.append(play_and_train(env, agent_ql))\n", + " # Note: agent.epsilon stays constant\n", + "\n", + " if i % 100 == 0:\n", + " clear_output(True)\n", + " print('EVSARSA mean reward =', np.mean(rewards_sarsa[-100:]))\n", + " print('QLEARNING mean reward =', np.mean(rewards_ql[-100:]))\n", + " plt.title(\"epsilon = %s\" % agent_ql.epsilon)\n", + " plt.plot(moving_average(rewards_sarsa), label='ev_sarsa')\n", + " plt.plot(moving_average(rewards_ql), label='qlearning')\n", + " plt.grid()\n", + " plt.legend()\n", + " plt.ylim(-500, 0)\n", + " plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's now see what did the algorithms learn by visualizing their actions at every state." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def draw_policy(env, agent):\n", + " \"\"\" Prints CliffWalkingEnv policy with arrows. Hard-coded. 
\"\"\"\n", + " n_rows, n_cols = env._cliff.shape\n", + "\n", + " actions = '^>v<'\n", + "\n", + " for yi in range(n_rows):\n", + " for xi in range(n_cols):\n", + " if env._cliff[yi, xi]:\n", + " print(\" C \", end='')\n", + " elif (yi * n_cols + xi) == env.start_state_index:\n", + " print(\" X \", end='')\n", + " elif (yi * n_cols + xi) == n_rows * n_cols - 1:\n", + " print(\" T \", end='')\n", + " else:\n", + " print(\" %s \" %\n", + " actions[agent.get_best_action(yi * n_cols + xi)], end='')\n", + " print()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "print(\"Q-Learning\")\n", + "draw_policy(env, agent_ql)\n", + "\n", + "print(\"SARSA\")\n", + "draw_policy(env, agent_sarsa)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "### More on SARSA\n", + "\n", + "Here are some of the things you can do if you feel like it:\n", + "\n", + "* Play with epsilon. See learned how policies change if you set epsilon to higher/lower values (e.g. 0.75).\n", + "* Expected Value SASRSA for softmax policy __(2pts)__:\n", + "$$ \\pi(a_i|s) = softmax({Q(s,a_i) \\over \\tau}) = {e ^ {Q(s,a_i)/ \\tau} \\over {\\sum_{a_j} e ^{Q(s,a_j) / \\tau }}} $$\n", + "* Implement N-step algorithms and TD($\\lambda$): see [Sutton's book](http://incompleteideas.net/book/bookdraft2018jan1.pdf) chapter 7 and chapter 12.\n", + "* Use those algorithms to train on CartPole in previous / next assignment for this week." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Part II: experience replay (4 points)\n", + "\n", + "There's a powerful technique that you can use to improve sample efficiency for off-policy algorithms: [spoiler] Experience replay :)\n", + "\n", + "The catch is that you can train Q-learning and EV-SARSA on `` tuples even if they aren't sampled under current agent's policy. 
So here's what we're gonna do:\n", + "\n", + "\n", + "\n", + "#### Training with experience replay\n", + "1. Play game, sample `(s, a, r, s')`.\n", + "2. Update q-values based on `(s, a, r, s')`.\n", + "3. Store `(s, a, r, s')` transition in a buffer. \n", + " 3. If buffer is full, delete earliest data.\n", + "4. Sample K such transitions from that buffer and update q-values based on them.\n", + "\n", + "\n", + "To enable such training, first we must implement a memory structure that would act like such a buffer." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# In google collab, uncomment this:\n", + "# !wget https://bit.ly/2FMJP5K -q -O setup.py\n", + "# !bash setup.py 2>&1 1>stdout.log | tee stderr.log\n", + "\n", + "# This code creates a virtual display to draw game images on.\n", + "# If you are running locally, just ignore it\n", + "import os\n", + "if type(os.environ.get(\"DISPLAY\")) is not str or len(os.environ.get(\"DISPLAY\")) == 0:\n", + " !bash ../xvfb start\n", + " %env DISPLAY = : 1\n", + "\n", + "%load_ext autoreload\n", + "%autoreload 2\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline\n", + "from IPython.display import clear_output" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import random\n", + "\n", + "\n", + "class ReplayBuffer(object):\n", + " def __init__(self, size):\n", + " \"\"\"\n", + " Create Replay buffer.\n", + " Parameters\n", + " ----------\n", + " size: int\n", + " Max number of transitions to store in the buffer. 
When the buffer\n", + " overflows the old memories are dropped.\n", + "\n", + " Note: for this assignment you can pick any data structure you want.\n", + " If you want to keep it simple, you can store a list of tuples of (s, a, r, s') in self._storage\n", + " However you may find out there are faster and/or more memory-efficient ways to do so.\n", + " \"\"\"\n", + " self._storage = []\n", + " self._maxsize = size\n", + "\n", + " # OPTIONAL: YOUR CODE\n", + "\n", + " def __len__(self):\n", + " return len(self._storage)\n", + "\n", + " def add(self, obs_t, action, reward, obs_tp1, done):\n", + " '''\n", + " Make sure, _storage will not exceed _maxsize. \n", + " Make sure, FIFO rule is being followed: the oldest examples has to be removed earlier\n", + " '''\n", + " data = (obs_t, action, reward, obs_tp1, done)\n", + "\n", + " # add data to storage\n", + " \n", + "\n", + " def sample(self, batch_size):\n", + " \"\"\"Sample a batch of experiences.\n", + " Parameters\n", + " ----------\n", + " batch_size: int\n", + " How many transitions to sample.\n", + " Returns\n", + " -------\n", + " obs_batch: np.array\n", + " batch of observations\n", + " act_batch: np.array\n", + " batch of actions executed given obs_batch\n", + " rew_batch: np.array\n", + " rewards received as results of executing act_batch\n", + " next_obs_batch: np.array\n", + " next set of observations seen after executing act_batch\n", + " done_mask: np.array\n", + " done_mask[i] = 1 if executing act_batch[i] resulted in\n", + " the end of an episode and 0 otherwise.\n", + " \"\"\"\n", + " idxes = \n", + "\n", + " # collect for each index\n", + " \n", + "\n", + " return np.array( < states > ), np.array( < actions > ), np.array( < rewards > ), np.array( < next_states > ), np.array( < is_done > )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Some tests to make sure your buffer works right" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + 
"collapsed": true + }, + "outputs": [], + "source": [ + "def obj2arrays(obj):\n", + " for x in obj:\n", + " yield np.array([x])\n", + "\n", + "def obj2sampled(obj):\n", + " return tuple(obj2arrays(obj))\n", + "\n", + "replay = ReplayBuffer(2)\n", + "obj1 = (0, 1, 2, 3, True)\n", + "obj2 = (4, 5, 6, 7, False)\n", + "replay.add(*obj1)\n", + "assert replay.sample(\n", + " 1) == obj2sampled(obj1), \"If there's just one object in buffer, it must be retrieved by buf.sample(1)\"\n", + "replay.add(*obj2)\n", + "assert len(replay) == 2, \"Please make sure __len__ methods works as intended.\"\n", + "replay.add(*obj2)\n", + "assert len(replay) == 2, \"When buffer is at max capacity, replace objects instead of adding new ones.\"\n", + "assert tuple(np.unique(a) for a in replay.sample(100)) == obj2sampled(obj2)\n", + "replay.add(*obj1)\n", + "assert max(len(np.unique(a)) for a in replay.sample(100)) == 2\n", + "replay.add(*obj1)\n", + "assert tuple(np.unique(a) for a in replay.sample(100)) == obj2sampled(obj1)\n", + "print(\"Success!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's use this buffer to improve training:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import gym\n", + "from qlearning import QLearningAgent\n", + "\n", + "env = gym.make(\"Taxi-v2\")\n", + "n_actions = env.action_space.n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def play_and_train_with_replay(env, agent, replay=None,\n", + " t_max=10**4, replay_batch_size=32):\n", + " \"\"\"\n", + " This function should \n", + " - run a full game, actions given by agent.getAction(s)\n", + " - train agent using agent.update(...) 
whenever possible\n", + " - return total reward\n", + " :param replay: ReplayBuffer where agent can store and sample (s,a,r,s',done) tuples.\n", + " If None, do not use experience replay\n", + " \"\"\"\n", + " total_reward = 0.0\n", + " s = env.reset()\n", + "\n", + " for t in range(t_max):\n", + " # get agent to pick action given state s\n", + " a = \n", + "\n", + " next_s, r, done, _ = env.step(a)\n", + "\n", + " # update agent on current transition. Use agent.update\n", + " \n", + "\n", + " if replay is not None:\n", + " # store current transition in buffer\n", + " \n", + "\n", + " # sample replay_batch_size random transitions from replay,\n", + " # then update agent on each of them in a loop\n", + " s_, a_, r_, next_s_, done_ = replay.sample(replay_batch_size)\n", + " for i in range(replay_batch_size):\n", + " \n", + "\n", + " s = next_s\n", + " total_reward += r\n", + " if done:\n", + " break\n", + "\n", + " return total_reward" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Create two agents: first will use experience replay, second will not.\n", + "\n", + "agent_baseline = QLearningAgent(alpha=0.5, epsilon=0.25, discount=0.99,\n", + " get_legal_actions=lambda s: range(n_actions))\n", + "\n", + "agent_replay = QLearningAgent(alpha=0.5, epsilon=0.25, discount=0.99,\n", + " get_legal_actions=lambda s: range(n_actions))\n", + "\n", + "replay = ReplayBuffer(1000)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from IPython.display import clear_output\n", + "\n", + "rewards_replay, rewards_baseline = [], []\n", + "\n", + "for i in range(1000):\n", + " rewards_replay.append(\n", + " play_and_train_with_replay(env, agent_replay, replay))\n", + " rewards_baseline.append(play_and_train_with_replay(\n", + " env, agent_baseline, replay=None))\n", + "\n", + " agent_replay.epsilon *= 
0.99\n", + " agent_baseline.epsilon *= 0.99\n", + "\n", + " if i % 100 == 0:\n", + " clear_output(True)\n", + " print('Baseline : eps =', agent_replay.epsilon,\n", + " 'mean reward =', np.mean(rewards_baseline[-10:]))\n", + " print('ExpReplay: eps =', agent_baseline.epsilon,\n", + " 'mean reward =', np.mean(rewards_replay[-10:]))\n", + " plt.plot(moving_average(rewards_replay), label='exp. replay')\n", + " plt.plot(moving_average(rewards_baseline), label='baseline')\n", + " plt.grid()\n", + " plt.legend()\n", + " plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "#### What to expect:\n", + "\n", + "Experience replay, if implemented correctly, will improve algorithm's initial convergence a lot, but it shouldn't affect the final performance.\n", + "\n", + "### Outro\n", + "\n", + "We will use the code you just wrote extensively in the next week of our course. If you're feeling that you need more examples to understand how experience replay works, try using it for binarized state spaces (CartPole or other __[classic control envs](https://gym.openai.com/envs/#classic_control)__).\n", + "\n", + "__Next week__ we're gonna explore how q-learning and similar algorithms can be applied for large state spaces, with deep learning models to approximate the Q function.\n", + "\n", + "However, __the code you've written__ for this week is already capable of solving many RL problems, and as an added benefit - it is very easy to detach. You can use Q-learning, SARSA and Experience Replay for any RL problems you want to solve - just throw 'em into a file and import the stuff you need." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Bonus I: TD($\\lambda$) (5+ points)\n", + "\n", + "There's a number of advanced algorithms you can find in week 3 materials (Silver lecture II and/or reading about eligibility traces). One such algorithm is TD(lambda), which is based on the idea of eligibility traces. 
You can also view it as a combination of N-step updates for all N.\n", + "* N-step temporal difference from Sutton's book - [url](http://incompleteideas.net/book/the-book-2nd.html), page 142 / chapter 7 \n", + "* Eligibility traces from Sutton's book - same url, chapter 12 / page 278\n", + "* Blog post on eligibility traces - [url](http://pierrelucbacon.com/traces/)\n", + "\n", + "Here's a practical algorithm you can start with: [url](https://stackoverflow.com/questions/40862578/how-to-understand-watkinss-q%CE%BB-learning-algorithm-in-suttonbartos-rl-book/40892302)\n", + "\n", + "\n", + "Implementing this algorithm will prove more challenging than q-learning or sarsa, but doing so will earn you a deeper understanding of how value-based methods work [in addition to some bonus points].\n", + "\n", + "More kudos for comparing and analyzing TD($\\lambda$) against Q-learning and EV-SARSA in different setups (taxi vs cartpole, constant epsilon vs decreasing epsilon)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Bonus II: More pacman (5+ points)\n", + "\n", + "__see README.md for software requirements of seminar_py2__\n", + "\n", + "Remember seminar_py2 where your vanilla q-learning had a hard time solving Pacman even on a small grid? Now's the time to fix that issue.\n", + "\n", + "We'll focus on those grids for pacman setup.\n", + "* python pacman.py -p PacmanQAgent -x N_TRAIN_GAMES -n N_TOTAL_GAMES -l __mediumGrid__\n", + "* python pacman.py -p PacmanQAgent -x N_TRAIN_GAMES -n N_TOTAL_GAMES -l __mediumClassic__\n", + "\n", + "Even if you adjust N_TRAIN_GAMES to 10^5 and N_TOTAL_GAMES to 10^5+100 (100 last games are for test), pacman won't solve those environments\n", + "\n", + "The problem with those environments is that they have a large number of unique states. 
However, you can devise a smaller environment state by choosing different observation parameters, e.g.:\n", + " * distance and direction to nearest ghost\n", + " * where is nearest food\n", + " * 'center of mass' of all food points (and variance, and whatever)\n", + " * is there a wall in each direction\n", + " * and anything else you see fit \n", + " \n", + "Here's how to get this information from [state](https://github.com/yandexdataschool/Practical_RL/blob/master/week3_model_free/seminar_py2/pacman.py#L49),\n", + " * Get pacman position: [state.getPacmanPosition()](https://github.com/yandexdataschool/Practical_RL/blob/master/week3_model_free/seminar_py2/pacman.py#L128)\n", + " * Is there a wall at (x,y)?: [state.hasWall(x,y)](https://github.com/yandexdataschool/Practical_RL/blob/master/week3_model_free/seminar_py2/pacman.py#L189)\n", + " * Get ghost positions: [state.getGhostPositions()](https://github.com/yandexdataschool/Practical_RL/blob/master/week3_model_free/seminar_py2/pacman.py#L144)\n", + " * Get all food positions: [state.getCapsules()](https://github.com/yandexdataschool/Practical_RL/blob/master/week3_model_free/seminar_py2/pacman.py#L153)\n", + " \n", + "You can call those methods anywhere you see state.\n", + " * e.g. in [agent.getValue(state)](https://github.com/yandexdataschool/Practical_RL/blob/master/week3_model_free/seminar_py2/qlearningAgents.py#L52)\n", + " * Defining a function that extracts all features and calling it in [getQValue](https://github.com/yandexdataschool/Practical_RL/blob/master/week3_model_free/seminar_py2/qlearningAgents.py#L38) and [setQValue](https://github.com/yandexdataschool/Practical_RL/blob/master/week3_model_free/seminar_py2/qlearningAgents.py#L44) is probably enough.\n", + " * You can also change agent parameters. 
The simplest way is to hard-code them in [PacmanQAgent](https://github.com/yandexdataschool/Practical_RL/blob/master/week3_model_free/seminar_py2/qlearningAgents.py#L140)\n", + "\n", + "Also, don't forget to optimize ```learning_rate```, ```discount``` and ```epsilon``` params of model, this may also help to solve this env." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + }, + "latex_envs": { + "LaTeX_envs_menu_present": true, + "autoclose": false, + "autocomplete": true, + "bibliofile": "biblio.bib", + "cite_by": "apalike", + "current_citInitial": 1, + "eqLabelWithNumbers": true, + "eqNumInitial": 1, + "hotkeys": { + "equation": "Ctrl-E", + "itemize": "Ctrl-I" + }, + "labels_anchors": false, + "latex_user_defs": false, + "report_style_numbering": false, + "user_envs_cfg": false + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/week3_model_free/seminar_py2/analysis.py b/week03_model_free/seminar_py2/analysis.py similarity index 100% rename from week3_model_free/seminar_py2/analysis.py rename to week03_model_free/seminar_py2/analysis.py diff --git a/week3_model_free/seminar_py2/crawler.py b/week03_model_free/seminar_py2/crawler.py similarity index 100% rename from week3_model_free/seminar_py2/crawler.py rename to week03_model_free/seminar_py2/crawler.py diff --git a/week3_model_free/seminar_py2/environment.py b/week03_model_free/seminar_py2/environment.py similarity index 100% rename from week3_model_free/seminar_py2/environment.py rename to week03_model_free/seminar_py2/environment.py diff --git 
a/week3_model_free/seminar_py2/featureExtractors.py b/week03_model_free/seminar_py2/featureExtractors.py similarity index 100% rename from week3_model_free/seminar_py2/featureExtractors.py rename to week03_model_free/seminar_py2/featureExtractors.py diff --git a/week3_model_free/seminar_py2/game.py b/week03_model_free/seminar_py2/game.py similarity index 100% rename from week3_model_free/seminar_py2/game.py rename to week03_model_free/seminar_py2/game.py diff --git a/week3_model_free/seminar_py2/ghostAgents.py b/week03_model_free/seminar_py2/ghostAgents.py similarity index 100% rename from week3_model_free/seminar_py2/ghostAgents.py rename to week03_model_free/seminar_py2/ghostAgents.py diff --git a/week3_model_free/seminar_py2/graphicsCrawlerDisplay.py b/week03_model_free/seminar_py2/graphicsCrawlerDisplay.py similarity index 100% rename from week3_model_free/seminar_py2/graphicsCrawlerDisplay.py rename to week03_model_free/seminar_py2/graphicsCrawlerDisplay.py diff --git a/week3_model_free/seminar_py2/graphicsDisplay.py b/week03_model_free/seminar_py2/graphicsDisplay.py similarity index 100% rename from week3_model_free/seminar_py2/graphicsDisplay.py rename to week03_model_free/seminar_py2/graphicsDisplay.py diff --git a/week3_model_free/seminar_py2/graphicsGridworldDisplay.py b/week03_model_free/seminar_py2/graphicsGridworldDisplay.py similarity index 100% rename from week3_model_free/seminar_py2/graphicsGridworldDisplay.py rename to week03_model_free/seminar_py2/graphicsGridworldDisplay.py diff --git a/week3_model_free/seminar_py2/graphicsUtils.py b/week03_model_free/seminar_py2/graphicsUtils.py similarity index 100% rename from week3_model_free/seminar_py2/graphicsUtils.py rename to week03_model_free/seminar_py2/graphicsUtils.py diff --git a/week3_model_free/seminar_py2/gridworld.py b/week03_model_free/seminar_py2/gridworld.py similarity index 100% rename from week3_model_free/seminar_py2/gridworld.py rename to week03_model_free/seminar_py2/gridworld.py diff 
--git a/week3_model_free/seminar_py2/how2run b/week03_model_free/seminar_py2/how2run similarity index 100% rename from week3_model_free/seminar_py2/how2run rename to week03_model_free/seminar_py2/how2run diff --git a/week3_model_free/seminar_py2/keyboardAgents.py b/week03_model_free/seminar_py2/keyboardAgents.py similarity index 100% rename from week3_model_free/seminar_py2/keyboardAgents.py rename to week03_model_free/seminar_py2/keyboardAgents.py diff --git a/week3_model_free/seminar_py2/layout.py b/week03_model_free/seminar_py2/layout.py similarity index 100% rename from week3_model_free/seminar_py2/layout.py rename to week03_model_free/seminar_py2/layout.py diff --git a/week3_model_free/seminar_py2/layouts/capsuleClassic.lay b/week03_model_free/seminar_py2/layouts/capsuleClassic.lay similarity index 100% rename from week3_model_free/seminar_py2/layouts/capsuleClassic.lay rename to week03_model_free/seminar_py2/layouts/capsuleClassic.lay diff --git a/week3_model_free/seminar_py2/layouts/contestClassic.lay b/week03_model_free/seminar_py2/layouts/contestClassic.lay similarity index 100% rename from week3_model_free/seminar_py2/layouts/contestClassic.lay rename to week03_model_free/seminar_py2/layouts/contestClassic.lay diff --git a/week3_model_free/seminar_py2/layouts/mediumClassic.lay b/week03_model_free/seminar_py2/layouts/mediumClassic.lay similarity index 100% rename from week3_model_free/seminar_py2/layouts/mediumClassic.lay rename to week03_model_free/seminar_py2/layouts/mediumClassic.lay diff --git a/week3_model_free/seminar_py2/layouts/mediumGrid.lay b/week03_model_free/seminar_py2/layouts/mediumGrid.lay similarity index 100% rename from week3_model_free/seminar_py2/layouts/mediumGrid.lay rename to week03_model_free/seminar_py2/layouts/mediumGrid.lay diff --git a/week3_model_free/seminar_py2/layouts/minimaxClassic.lay b/week03_model_free/seminar_py2/layouts/minimaxClassic.lay similarity index 100% rename from 
week3_model_free/seminar_py2/layouts/minimaxClassic.lay rename to week03_model_free/seminar_py2/layouts/minimaxClassic.lay diff --git a/week3_model_free/seminar_py2/layouts/openClassic.lay b/week03_model_free/seminar_py2/layouts/openClassic.lay similarity index 100% rename from week3_model_free/seminar_py2/layouts/openClassic.lay rename to week03_model_free/seminar_py2/layouts/openClassic.lay diff --git a/week3_model_free/seminar_py2/layouts/originalClassic.lay b/week03_model_free/seminar_py2/layouts/originalClassic.lay similarity index 100% rename from week3_model_free/seminar_py2/layouts/originalClassic.lay rename to week03_model_free/seminar_py2/layouts/originalClassic.lay diff --git a/week3_model_free/seminar_py2/layouts/smallClassic.lay b/week03_model_free/seminar_py2/layouts/smallClassic.lay similarity index 100% rename from week3_model_free/seminar_py2/layouts/smallClassic.lay rename to week03_model_free/seminar_py2/layouts/smallClassic.lay diff --git a/week3_model_free/seminar_py2/layouts/smallGrid.lay b/week03_model_free/seminar_py2/layouts/smallGrid.lay similarity index 100% rename from week3_model_free/seminar_py2/layouts/smallGrid.lay rename to week03_model_free/seminar_py2/layouts/smallGrid.lay diff --git a/week3_model_free/seminar_py2/layouts/testClassic.lay b/week03_model_free/seminar_py2/layouts/testClassic.lay similarity index 100% rename from week3_model_free/seminar_py2/layouts/testClassic.lay rename to week03_model_free/seminar_py2/layouts/testClassic.lay diff --git a/week3_model_free/seminar_py2/layouts/trappedClassic.lay b/week03_model_free/seminar_py2/layouts/trappedClassic.lay similarity index 100% rename from week3_model_free/seminar_py2/layouts/trappedClassic.lay rename to week03_model_free/seminar_py2/layouts/trappedClassic.lay diff --git a/week3_model_free/seminar_py2/layouts/trickyClassic.lay b/week03_model_free/seminar_py2/layouts/trickyClassic.lay similarity index 100% rename from week3_model_free/seminar_py2/layouts/trickyClassic.lay 
rename to week03_model_free/seminar_py2/layouts/trickyClassic.lay diff --git a/week3_model_free/seminar_py2/learningAgents.py b/week03_model_free/seminar_py2/learningAgents.py similarity index 100% rename from week3_model_free/seminar_py2/learningAgents.py rename to week03_model_free/seminar_py2/learningAgents.py diff --git a/week3_model_free/seminar_py2/mdp.py b/week03_model_free/seminar_py2/mdp.py similarity index 100% rename from week3_model_free/seminar_py2/mdp.py rename to week03_model_free/seminar_py2/mdp.py diff --git a/week3_model_free/seminar_py2/pacman.py b/week03_model_free/seminar_py2/pacman.py similarity index 100% rename from week3_model_free/seminar_py2/pacman.py rename to week03_model_free/seminar_py2/pacman.py diff --git a/week3_model_free/seminar_py2/pacmanAgents.py b/week03_model_free/seminar_py2/pacmanAgents.py similarity index 100% rename from week3_model_free/seminar_py2/pacmanAgents.py rename to week03_model_free/seminar_py2/pacmanAgents.py diff --git a/week3_model_free/seminar_py2/qlearningAgents.py b/week03_model_free/seminar_py2/qlearningAgents.py similarity index 100% rename from week3_model_free/seminar_py2/qlearningAgents.py rename to week03_model_free/seminar_py2/qlearningAgents.py diff --git a/week3_model_free/seminar_py2/run_crawler.sh b/week03_model_free/seminar_py2/run_crawler.sh similarity index 100% rename from week3_model_free/seminar_py2/run_crawler.sh rename to week03_model_free/seminar_py2/run_crawler.sh diff --git a/week3_model_free/seminar_py2/run_grid.sh b/week03_model_free/seminar_py2/run_grid.sh similarity index 100% rename from week3_model_free/seminar_py2/run_grid.sh rename to week03_model_free/seminar_py2/run_grid.sh diff --git a/week3_model_free/seminar_py2/run_pacman.sh b/week03_model_free/seminar_py2/run_pacman.sh similarity index 100% rename from week3_model_free/seminar_py2/run_pacman.sh rename to week03_model_free/seminar_py2/run_pacman.sh diff --git a/week3_model_free/seminar_py2/textDisplay.py 
b/week03_model_free/seminar_py2/textDisplay.py similarity index 100% rename from week3_model_free/seminar_py2/textDisplay.py rename to week03_model_free/seminar_py2/textDisplay.py diff --git a/week3_model_free/seminar_py2/textGridworldDisplay.py b/week03_model_free/seminar_py2/textGridworldDisplay.py similarity index 100% rename from week3_model_free/seminar_py2/textGridworldDisplay.py rename to week03_model_free/seminar_py2/textGridworldDisplay.py diff --git a/week3_model_free/seminar_py2/util.py b/week03_model_free/seminar_py2/util.py similarity index 100% rename from week3_model_free/seminar_py2/util.py rename to week03_model_free/seminar_py2/util.py diff --git a/week3_model_free/seminar_qlearning.ipynb b/week03_model_free/seminar_qlearning.ipynb similarity index 98% rename from week3_model_free/seminar_qlearning.ipynb rename to week03_model_free/seminar_qlearning.ipynb index 2aa4be084..a908ff5e2 100644 --- a/week3_model_free/seminar_qlearning.ipynb +++ b/week03_model_free/seminar_qlearning.ipynb @@ -4,8 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Q-learning \n", - "(3 points)\n", + "## Q-learning (3 points)\n", "\n", "This notebook will guide you through implementation of vanilla Q-learning algorithm.\n", "\n", @@ -20,7 +19,12 @@ }, "outputs": [], "source": [ - "# XVFB will be launched if you run on a server\n", + "# In google collab, uncomment this:\n", + "# !wget https://bit.ly/2FMJP5K -q -O setup.py\n", + "# !bash setup.py 2>&1 1>stdout.log | tee stderr.log\n", + "\n", + "# This code creates a virtual display to draw game images on.\n", + "# If you are running locally, just ignore it\n", "import os\n", "if type(os.environ.get(\"DISPLAY\")) is not str or len(os.environ.get(\"DISPLAY\")) == 0:\n", " !bash ../xvfb start\n", @@ -58,7 +62,7 @@ " def __init__(self, alpha, epsilon, discount, get_legal_actions):\n", " \"\"\"\n", " Q-Learning Agent\n", - " based on http://inst.eecs.berkeley.edu/~cs188/sp09/pacman.html\n", + " based on 
https://inst.eecs.berkeley.edu/~cs188/sp19/projects.html\n", " Instance variables you have access to\n", " - self.epsilon (exploration prob)\n", " - self.alpha (learning rate)\n", @@ -71,7 +75,6 @@ " which returns Q(state,action)\n", " - self.set_qvalue(state,action,value)\n", " which sets Q(state,action) := value\n", - "\n", " !!!Important!!!\n", " Note: please avoid using self._qValues directly. \n", " There's a special self.get_qvalue/set_qvalue for that.\n", @@ -141,7 +144,7 @@ " \"\"\"\n", " Compute the action to take in the current state, including exploration. \n", " With probability self.epsilon, we should take a random action.\n", - " otherwise - the best policy action (self.getPolicy).\n", + " otherwise - the best policy action (self.get_best_action).\n", "\n", " Note: To pick randomly from a list, use random.choice(list). \n", " To pick True or False with a given probablity, generate uniform number in [0, 1]\n", @@ -362,7 +365,7 @@ "\n", "class Binarizer(ObservationWrapper):\n", "\n", - " def _observation(self, state):\n", + " def observation(self, state):\n", "\n", " # state = \n", " # hint: you can do that with round(x,n_digits)\n", @@ -472,7 +475,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.2" + "version": "3.6.8" } }, "nbformat": 4, diff --git a/week4_[recap]_deep_learning/README.md b/week04_[recap]_deep_learning/README.md similarity index 95% rename from week4_[recap]_deep_learning/README.md rename to week04_[recap]_deep_learning/README.md index 0fac1ceb4..ad6e8542e 100644 --- a/week4_[recap]_deep_learning/README.md +++ b/week04_[recap]_deep_learning/README.md @@ -30,6 +30,7 @@ __Note:__ This week's materials cover the basics of neural nets and deep learnin ### Practice +__[Colab url (pytorch)](https://colab.research.google.com/github/yandexdataschool/Practical_RL/blob/spring19/week04_%5Brecap%5D_deep_learning/seminar_pytorch.ipynb)__ From now on, we'll have two tracks: theano and tensorflow. 
We'll also add pytorch seminars as soon as they're ready. Please pick seminar_theano.ipynb, seminar_tensorflow.ipynb or seminar_pytorch.ipynb. diff --git a/week4_[recap]_deep_learning/fix_my_nn.ipynb b/week04_[recap]_deep_learning/fix_my_nn.ipynb similarity index 100% rename from week4_[recap]_deep_learning/fix_my_nn.ipynb rename to week04_[recap]_deep_learning/fix_my_nn.ipynb diff --git a/week4_[recap]_deep_learning/mnist.py b/week04_[recap]_deep_learning/mnist.py similarity index 100% rename from week4_[recap]_deep_learning/mnist.py rename to week04_[recap]_deep_learning/mnist.py diff --git a/week4_[recap]_deep_learning/notmnist.py b/week04_[recap]_deep_learning/notmnist.py similarity index 92% rename from week4_[recap]_deep_learning/notmnist.py rename to week04_[recap]_deep_learning/notmnist.py index fa733b771..5f7c98ac5 100644 --- a/week4_[recap]_deep_learning/notmnist.py +++ b/week04_[recap]_deep_learning/notmnist.py @@ -10,7 +10,7 @@ def load_notmnist(path='./notMNIST_small',letters='ABCDEFGHIJ', # download data if it's missing. If you have any problems, go to the urls and load it manually. 
if not os.path.exists(path): print("Downloading data...") - assert os.system('curl http://yaroslavvb.com/upload/notMNIST/notMNIST_small.tar.gz > notMNIST_small.tar.gz') == 0 + assert os.system('wget http://yaroslavvb.com/upload/notMNIST/notMNIST_small.tar.gz') == 0 print("Extracting ...") assert os.system('tar -zxvf notMNIST_small.tar.gz > untar_notmnist.log') == 0 diff --git a/week4_[recap]_deep_learning/practice_lasagne.ipynb b/week04_[recap]_deep_learning/practice_lasagne.ipynb similarity index 100% rename from week4_[recap]_deep_learning/practice_lasagne.ipynb rename to week04_[recap]_deep_learning/practice_lasagne.ipynb diff --git a/week4_[recap]_deep_learning/practice_tensorflow.ipynb b/week04_[recap]_deep_learning/practice_tensorflow.ipynb similarity index 100% rename from week4_[recap]_deep_learning/practice_tensorflow.ipynb rename to week04_[recap]_deep_learning/practice_tensorflow.ipynb diff --git a/week04_[recap]_deep_learning/seminar_pytorch.ipynb b/week04_[recap]_deep_learning/seminar_pytorch.ipynb new file mode 100644 index 000000000..8597565df --- /dev/null +++ b/week04_[recap]_deep_learning/seminar_pytorch.ipynb @@ -0,0 +1,1327 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Hello, pytorch\n", + "\n", + "![img](https://pytorch.org/tutorials/_static/pytorch-logo-dark.svg)\n", + "\n", + "__This notebook__ will teach you to use pytorch low-level core. You can install it [here](http://pytorch.org/). For high-level interface see the next notebook.\n", + "\n", + "__Pytorch feels__ differently than tensorflow/theano on almost every level. TensorFlow makes your code live in two \"worlds\" simultaneously: symbolic graphs and actual tensors. First you declare a symbolic \"recipe\" of how to get from inputs to outputs, then feed it with actual minibatches of data. In pytorch, __there's only one world__: all tensors have a numeric value.\n", + "\n", + "You compute outputs on the fly without pre-declaring anything. 
The code looks exactly as in pure numpy with one exception: pytorch computes gradients for you. And can run stuff on GPU. And has a number of pre-implemented building blocks for your neural nets. [And a few more things.](https://medium.com/towards-data-science/pytorch-vs-tensorflow-spotting-the-difference-25c75777377b)\n", + "\n", + "And now we finally shut up and let pytorch do the talking." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1.0.0\n" + ] + } + ], + "source": [ + "# if running in colab, execute this:\n", + "# !wget https://raw.githubusercontent.com/yandexdataschool/Practical_RL/bc1d4d4e8ca9cdf2b088bab0b4fb583f03acd530/week04_%5Brecap%5D_deep_learning/notmnist.py -O notmnist.py\n", + "# !pip3 install torch==1.0.0 torchvision\n", + "# !pip3 uninstall -y Pillow\n", + "# !pip3 install Pillow==5.3.0\n", + "\n", + "from __future__ import print_function\n", + "import numpy as np\n", + "import torch\n", + "print(torch.__version__)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "X :\n", + "[[ 0 1 2 3]\n", + " [ 4 5 6 7]\n", + " [ 8 9 10 11]\n", + " [12 13 14 15]]\n", + "\n", + "X.shape : (4, 4)\n", + "\n", + "add 5 :\n", + "[[ 5 6 7 8]\n", + " [ 9 10 11 12]\n", + " [13 14 15 16]\n", + " [17 18 19 20]]\n", + "\n", + "X*X^T :\n", + "[[ 14 38 62 86]\n", + " [ 38 126 214 302]\n", + " [ 62 214 366 518]\n", + " [ 86 302 518 734]]\n", + "\n", + "mean over cols :\n", + "[ 1.5 5.5 9.5 13.5]\n", + "\n", + "cumsum of cols :\n", + "[[ 0 1 2 3]\n", + " [ 4 6 8 10]\n", + " [12 15 18 21]\n", + " [24 28 32 36]]\n", + "\n" + ] + } + ], + "source": [ + "# numpy world\n", + "\n", + "x = np.arange(16).reshape(4, 4)\n", + "\n", + "print(\"X :\\n%s\\n\" % x)\n", + "print(\"X.shape : %s\\n\" % (x.shape,))\n", + "print(\"add 5 :\\n%s\\n\" % (x 
+ 5))\n", + "print(\"X*X^T :\\n%s\\n\" % np.dot(x, x.T))\n", + "print(\"mean over cols :\\n%s\\n\" % (x.mean(axis=-1)))\n", + "print(\"cumsum of cols :\\n%s\\n\" % (np.cumsum(x, axis=0)))" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "X :\n", + "tensor([[ 0., 1., 2., 3.],\n", + " [ 4., 5., 6., 7.],\n", + " [ 8., 9., 10., 11.],\n", + " [12., 13., 14., 15.]])\n", + "X.shape : torch.Size([4, 4])\n", + "\n", + "add 5 :\n", + "tensor([[ 5., 6., 7., 8.],\n", + " [ 9., 10., 11., 12.],\n", + " [13., 14., 15., 16.],\n", + " [17., 18., 19., 20.]])\n", + "X*X^T :\n", + "tensor([[ 14., 38., 62., 86.],\n", + " [ 38., 126., 214., 302.],\n", + " [ 62., 214., 366., 518.],\n", + " [ 86., 302., 518., 734.]])\n", + "mean over cols :\n", + "tensor([ 1.5000, 5.5000, 9.5000, 13.5000])\n", + "cumsum of cols :\n", + "tensor([[ 0., 1., 2., 3.],\n", + " [ 4., 6., 8., 10.],\n", + " [12., 15., 18., 21.],\n", + " [24., 28., 32., 36.]])\n" + ] + } + ], + "source": [ + "# pytorch world\n", + "\n", + "x = np.arange(16).reshape(4, 4)\n", + "\n", + "x = torch.tensor(x, dtype=torch.float32) # or torch.arange(0,16).view(4,4)\n", + "\n", + "print(\"X :\\n%s\" % x)\n", + "print(\"X.shape : %s\\n\" % (x.shape,))\n", + "print(\"add 5 :\\n%s\" % (x + 5))\n", + "print(\"X*X^T :\\n%s\" % torch.matmul(x, x.transpose(1, 0))) # short: x.mm(x.t())\n", + "print(\"mean over cols :\\n%s\" % torch.mean(x, dim=-1))\n", + "print(\"cumsum of cols :\\n%s\" % torch.cumsum(x, dim=0))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## NumPy and Pytorch\n", + "\n", + "As you can notice, pytorch allows you to hack stuff much the same way you did with numpy. No graph declaration, no placeholders, no sessions. This means that you can _see the numeric value of any tensor at any moment of time_. Debugging such code can be done by printing tensors or using any debug tool you want (e.g. 
[gdb](https://wiki.python.org/moin/DebuggingWithGdb)).\n", + "\n", + "You could also notice a few new method names and a different API. So no, there's no compatibility with numpy [yet](https://github.com/pytorch/pytorch/issues/2228) and yes, you'll have to memorize all the names again. Get excited!\n", + "\n", + "![img](http://i0.kym-cdn.com/entries/icons/original/000/017/886/download.jpg)\n", + "\n", + "For example, \n", + "* If something takes a list/tuple of axes in numpy, you can expect it to take *args in pytorch\n", + " * `x.reshape([1,2,8]) -> x.view(1,2,8)`\n", + "* You should swap _axis_ for _dim_ in operations like mean or cumsum\n", + " * `x.sum(axis=-1) -> x.sum(dim=-1)`\n", + "* most mathematical operations are the same, but types and shaping are different\n", + " * `x.astype('int64') -> x.type(torch.LongTensor)`\n", + "\n", + "To help you acclimatize, there's a [table](https://github.com/torch/torch7/wiki/Torch-for-Numpy-users) covering most new things. There's also a neat [documentation page](http://pytorch.org/docs/master/).\n", + "\n", + "Finally, if you're stuck with a technical problem, we recommend searching [pytorch forums](https://discuss.pytorch.org/). Or just googling, which usually works just as efficiently. \n", + "\n", + "If you feel like you're about to give up, remember two things: __GPU__ and __free gradients__. Besides, you can always jump back to numpy with x.numpy()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Warmup: trigonometric knotwork\n", + "_inspired by [this post](https://www.quora.com/What-are-the-most-interesting-equation-plots)_\n", + "\n", + "There are some simple mathematical functions with cool plots. 
For one, consider this:\n", + "\n", + "$$ x(t) = t - 1.5 * cos( 15 t) $$\n", + "$$ y(t) = t - 1.5 * sin( 16 t) $$\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "%matplotlib inline\n", + "\n", + "t = torch.linspace(-10, 10, steps=10000)\n", + "\n", + "# compute x(t) and y(t) as defined above\n", + "x = # YOUR CODE\n", + "y = # YOUR CODE\n", + "\n", + "plt.plot(x.numpy(), y.numpy())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "if you're done early, try adjusting the formula and seing how it affects the function" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```\n", + "\n", + "```\n", + "\n", + "```\n", + "\n", + "```\n", + "\n", + "```\n", + "\n", + "```\n", + "\n", + "```\n", + "\n", + "```\n", + "\n", + "```\n", + "\n", + "```\n", + "\n", + "```\n", + "\n", + "```\n", + "\n", + "```\n", + "\n", + "```\n", + "\n", + "```\n", + "\n", + "```\n", + "\n", + "```\n", + "\n", + "```\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Automatic gradients\n", + "\n", + "Any self-respecting DL framework must do your backprop for you. 
Torch handles this with the `autograd` module.\n", + "\n", + "The general pipeline looks like this:\n", + "* When creating a tensor, you mark it as `requires_grad`:\n", + " * __```torch.zeros(5, requires_grad=True)```__\n", + " * torch.tensor(np.arange(5), dtype=torch.float32, requires_grad=True)\n", + "* Define some differentiable `loss = arbitrary_function(a)`\n", + "* Call `loss.backward()`\n", + "* Gradients are now available as ```a.grads```\n", + "\n", + "__Here's an example:__ let's fit a linear regression on Boston house prices" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXQAAAD8CAYAAABn919SAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAIABJREFUeJztnX+MHOWZ57/PtMu4xyT0eBlypsHYyUX2LuvYA7OJVz5FwbeLsyGQSYA4CFacFC13pz0psNxcJicUm5WjzO4oC/8l4i65IMUBA84OJtbJidaO9tYnnBtnPCE+8OUHxKRB4MhuEpgG2jPv/dFd457q9616q7q6qrr6+5Gs6a6prnqqPP3U8z7v93leUUqBEEJI7zOQtgGEEELigQ6dEEJyAh06IYTkBDp0QgjJCXTohBCSE+jQCSEkJ9ChE0JITqBDJ4SQnECHTgghOWFFkie7/PLL1fr165M8JSGE9DwnTpz4rVJqOGi/RB36+vXrMTMzk+QpCSGk5xGRX9vsx5QLIYTkBDp0QgjJCXTohBCSE+jQCSEkJ9ChE0JITrBSuYjISwB+D2ABwAWl1KiIrAGwH8B6AC8B+KxS6nzcBk7PVjB1+DQq1RoKIlhQCuVSEeM7N2JspLy03wPTz+G7x89g0bNehwjwr4dX41dn57GgFAoiuOMjV2Pv2GZMz1aw5+ApVGt1AMDQoIPdN18LAJg6fBqvVGu4UnOuKPbbHsu0f9jjEEL6D7FZsajp0EeVUr9t2fb3AM4ppSZFZALAkFLqi37HGR0dVWFki9OzFXzpe8+hVl9o+13RKeCrn9mMsZEyHph+Dt959oz1cQFg+wfW4Mcvnkfd8wQYEKAwIKgvXNzeeq4w6Oz3O5Zp/1uvL+PAiYr1cQgh+UJETiilRoP26yTl8ikAjzZfPwpgrINjaZk6fFrrzAGgVl/A1OHTAIDHjr8c+tjHfnmuzZkDwKLCMmfuPVcYdPb7Hcu0/2PHXw51HEJIf2Lr0BWAH4jICRG5p7ntfUqpVwGg+fMK3QdF5B4RmRGRmbNnz4Yy7pVqzer3CwmsixpkS5jPhN1uur4oNhFC8outQ9+ulLoOwF8A+GsR+ajtCZRSjyilRpVSo8PDgZWry7iyVLT6fUEk1HGjEGRLmM+E3W66vig2EULyi5VDV0q90vz5OoB/BPBhAK+JyFoAaP58PW7jxnduRNEpaH9XdAoY37kRAHDHR64OfeztH1gDZ6DdUQ4I4BSWb289Vxh09
vsdy7T/HR+5OtRxCCH9SaBDF5HVIvIe9zWAGwH8DMBBAHc3d7sbwNNxGzc2UsZXP7MZZU8kXi4Vl00I7h3bjLu2rYPGP0ME+OAVq5c+WxDBXdvWYd9f/Smmbt+CUtFZ2ndo0ME/fHYrpm7bgnKpCPGca3q2gu2TR7Bh4hC2Tx7B9GzF2n7vscLsv3dsc6jjEEL6k0CVi4i8H42oHGjIHL+rlPqKiPwBgCcArANwBsDtSqlzfscKq3KJk05lf2EVK4QQEhe2Khcr2WJcpOXQdc5YANy5bR32jm22Osb2ySOoaCYhy6Uijk3siMtUQghpIwnZYs+gkwMqAPuePROYNnEJq0whhJCk6QuHbnK6CrDWcodVphBCSNL0hUP3c7q2EXZYxQohhCRNXzj08Z0bYVKq20bYYRUrhBCSNIkuQZcWYyNlzPz6HPY9ewatU8BhI+yxkTIdOCEks/RFhA40tOoP7drKCJsQklv6xqETQkje6YuUC9CuRa9Ua/jS954DAEbphJBc0DcRethWtoQQ0mvkKkL3K+9nYRAhJO/kJkJ3UyqVag0KF1MqbiUoC4MIIXknNxF6UEpl/t0LbZ9hYRAhJE/kxqGbUidupO519qWigz23XMsJUUJIbshNysVvtR/duqSrL1lBZ04IyRW5ceimXium9Tgr1ZrVIhWEENIr5Mahm3qtlH0mPb0Tp4QQ0svkJocOmHut6HLoLu7EKdMvhJBeJ1cOXYfrqKcOn9auOARQi04IyQe5c+im4qKxkbJxGTlq0QkheSBXDl3Xr2X8qTnsOXgKb9TquKzowCkI6gsXJ0qpRSeE5IXcTIoC+uKi+oJCtVaHAlCt1QEFDA06bKFLCMkduYrQTTnyVuqLCoMrV2D2yzcmYBEhhCRHrhx6QcSoO28lqUlQv2ZhhBASN7ly6DbOHEhmEpT91wkhSZOrHLpfEZGLMyCJTIKy/zohJGly5dDHd26EUxDffS5dlUwPF/ZfJ4QkTa4cOgAgIOtSna8nYgb7rxNCkiZXDn3q8GnUF/09elIO1dQsjJp3Qki36NlJUZ2CJCidkaRDbW05QJULISQJRFkqQ+JgdHRUzczMdHwcr4IEaDjrS1YMNIqHNJQNDpXSQkJI1hGRE0qp0aD9ejJCNylIVjkDKDqFNkdvqgaltJAQkid6ModuSq1U5+vanugm50xpISEkT/RkhH5lqWjsmmjqia6D0kJCSJ7oyQg9LgUJpYWEkDzRkw7dtNxc2Lw3pYWEkDxhnXIRkQKAGQAVpdQnRWQDgMcBrAHwEwB/qZR6tztmthMmteJ3DIDSQkJIPgiTQ/8CgOcBvLf5/u8APKSUelxEvgHg8wC+HrN9XSeOB0MQlEYSQpLAKuUiIlcBuAnAf2++FwA7ADzV3OVRAGPdMLDXcaWRlWoNChelkdOzlbRNI4TkDNsc+sMA/guAxeb7PwBQVUpdaL7/DQBtyCki94jIjIjMnD17tiNjexFKIwkhSRHo0EXkkwBeV0qdaN2s2VVbcqqUekQpNaqUGh0eHo5oZu9CaSQhJClscujbAdwiIp8AsAqNHPrDAEoisqIZpV8F4JXumdm7+GnmCSEkTgIjdKXUl5RSVyml1gP4HIAjSqk7ARwFcFtzt7sBPN01K2NmeraC7ZNHsGHiELZPHulqPpvSSEJIUnSiQ/8igL8RkV+gkVP/ZjwmdZekJynj0swTQkgQPdltsRO2Tx7RpkDKpSKOTexIwSJCCPEn190WO8E0GVmp1rB98gi14oSQnqUnS/87wTQZKQC14oSQniZ3Dj1owlM3SSlo11xSK04I6TVy5dB1E5737T+J9S3OXTdJaZpFqFRrjNIJIT1DrnLouqpM11l7VyNqzY+bJkoBYPzJuaXPEEJIlslVhB5UfWlKo+jSMC71RYU9B0/FYh8hhHSTXDl0m+pLndN30zAmTAtPE0JIlsiNQ5+ereCtdy4E7mdy+kmnVJKsViWE9Ae5yKG7k6He/LmXoJL7o
UEH5+fbo3ERLE2o2tgS1Pvca683vx/lmIQQkosIXTcZCgClohOq5H73zdfCKbQ3klQKVrr0oLYCblR+7/6T1i112U+dEGJLLiJ002ToG7U6Tu6+0fo4rrO//4k5LHhaIrgO1++BENT7PGgUobsOv2MySieEtJILh95Ji1o3nVGp1lAQaXPkrQSpaPx6n5tGEUH2sp86IcSWXKRcoraobU1nAPB15oD5AeGmUkyfvrJUDHTAJntN52Q/dUKIl1w49Kgtam2iZheTw/U+FEyf83PAfvaynzohxJaeSbkEKT281Z82xzA5YR2rHP2zz++hUPbY6c2hF51C4IPH/R1VLoSQIHrCoYeV+dkeIwzn5+vac5pSKQIs66/eiWO2eVgRQkhPOPQ4lB5h0ismdOcsGbTrpUGnbRsdMyGkm/REDj0OpUdcqhDvcUzzqAkuBEUIIQB6JELvRJYYdAwTuh7punO+YejzYtoeFVaLEkKC6IkIPQ6lh+4YzoBoK0OHBh3cuW2d1TmTkBWyWpQQYkNPROg2E4o2KhjdMfyOO3rNmsCo+IZNw9j37Jll0XzcskJWixJCbBCVYLJ3dHRUzczMxH5cXXMukyRweraCPQdPLbXEHRp0sPvmawHYKVBaK0t1aRkBcOe2ddg7Zm7HG5YNE4e06R8B8OLkTbGdhxCSTUTkhFJqNGi/nojQg7CNYKdnKxh/cg71xYvu8fx8HffuP7nssyZZpPfBoXOyCsDRF8762hs2Hx7HHAIhJP/kwqHbqmCmDp9e5sz9aG2q5TrfgYBeL0H2ANE09eM7N2pHIKwWJYS00hOTokHYTkyGlS66ztadjLRx5n72AMEdGXVEbW1ACOkvchGhmyLYGzYNY/vkkaXUxmVFJ9RycgWR0MVIfpHz9GzFKJ0MetiwKIkQEkQuHLrr6FonOwcE2P/jl5dSLJVqDU5BMABg0eKYRacQ2pkPyPJoW5d/N8F8OCGkU3KRcnF558JFV/3Wuwtt+fL6gsJlgw5KxfayfKChGgEupjTKAU62II1PlIoOnILAPZ1OJ+7XeoD5cEJIHOTGodv2ajk/X8cbtTrKpSLu2rZuWV76oV1b8dLkTUtNtfwWnS6XivjaZ7egXCqiWqujvqBf4cjFL6XCfDghJA5ykXIBwk14utWWB05U8NXPNPTiU4dP4779JzF1+DRu2DSMAycqvhH1DZuGQy0pZ5IelktFOnNCSCzkJkKPkoOu1Rfwpe/9FPftP7msrP47z57x7XH+1c9sxtEXzoZaUo4LVRBCuk1uHLq2V0tB0N6pZTm1+qJx6Tgvbo/zsZFy6CXl0pIeusvjbZg4hO2TR9j/hZAck5uUi6lXy32eKtBOaI24/bo3elcqarUxyfRKHAuDdHJudockJFly49ABvcN0+650ijfiNmnfszTBmVZTrzQfJIT0M4EpFxFZJSI/FpE5ETklIg82t28QkeMi8nMR2S8iK7tvbnh0qZiwFETaHHXcKZRupEbiWBgkClGqYQkhnWMTob8DYIdS6k0RcQD8i4j8TwB/A+AhpdTjIvINAJ8H8PUu2hoJXSomTMTuF3XrRgS6VIP3/LrWv92IaNNq6pXWg4SQficwQlcN3my+dZr/FIAdAJ5qbn8UwFhXLIyBsZEyjk3swItNjXlQwZBL2Kh7eraC8afmlilm7n9yDuNPLt9mU3QUR0SrG51I04ZuTpAmsegHIaQdK5WLiBRE5CSA1wH8EMAvAVSVUm7lzW8AZC45akpjjO/cGKh+KZeKS4oWWx585lRbgdHComqrWLUtOuo0om1NCwHLl9WzXfUoSiqIEk1C0sHKoSulFpRSWwFcBeDDAP5Qt5vusyJyj4jMiMjM2bP+fcLjxG/ZtrGRsq9UUYClxl5hHNn5efvGX96iIx1xRLTu6KRcKrZdc9AoIOrSd+wOSUg6hFK5KKWqIvIjANsAlERkRTNKvwrAK4bPPALgEaCxYlFn5toTpPAYGnSMDlgByypFu6HSG
BDBholDuLJU1Famxh3RRhkFdKKSYXdIQpLHRuUyLCKl5usigD8D8DyAowBua+52N4Cnu2VkFPwc2PRsBW++be7TAiBSTtvU9EvHglLLWhDcen25qxFtlFEAJzcJ6S1sIvS1AB4VkQIaD4AnlFLfF5H/C+BxEdkLYBbAN7toZ2j8FB5hVi5qxZ1MNClX9txybdsSd14KmlWPavUFHH3h7FJTMBOdFOtEWfWIS98R0lsEOnSl1E8BjGi2/wqNfHomuWHTMPY9e2ZZ3th1YJ1Uj1aqNYw/OQcIliZAK9Ua7tt/EgqNKN20iIYAWDSsetQa9Zqkj51IG02VtH6f5dJ3hPQWuaoUdZmereDAicoyZy4Abr2+kdfttHpUF4G7W6q1+jI1iXcfXYQOXIx6dZp092HhJWzVZ9i8dpSHACEkPXLp0HWTeQrA0RcaKhtd5BknCjA6dZ0zb416Tbab6CSfbZPC4eQmIb1DLh160GRea+QZR58XHQqNyU3T8QsiWFSqzZGGddBR89nst0JI/shN+9xWbBQdrfrssDgDAqfgX5pUEPF1zotKLVWutjrQsA56/t0LkSo+2W+lAdsLkzyRS4ceplIxbERcLhUxdfsWTN22ZVkFphdXlmjC5LjDNhM7P1+3KvbxQkli9MIpQrJKLlMuYSbzwjTrKogsO477szUXPWCY9GzFTykSJR0UpSVuv0kSdfMFabUXJqRbiApwPnEyOjqqZmZmEjufDd5cMtBY6QhKr2YJ6nm+YeJQYFuBO7etw96xzVb2rZ84ZLWfAHhx8iarfQH9dWetn3tcmK7VNCke9l4S0m1E5IRSajRov1xG6GFwndeDz5xaagWweuUKfHLLWjx2/GVtEdC9zcWkdQVGJZ+WAkBjsvSx4y9j9Jo1Vo7Tb2K1lbCRdT9JEk2ReJCElJBeo+8dusvb9cWl19VaHQdOVHxTJ5VqDeNPzS2L5CvV2tKEqbfrYisLSrUpSkwSQhuJZdRin04lib2yzJxpXmBBqbZInYVTpJfJ5aRoWPwiOD/qC+2tceuLCisGJFA906ooCeoM+dXPbDbaoltNKQl6aULRFHG7PXPYFZLkBTp0BEdwYanVFzG+cyMe3rXV9/PueYMkhGMjZWPLgEWlUnFAvSR79FM9eRc/oTMnvQwdOuwiuLC4Sgm/6No9r42EMGurAPWS7JH92Um/wBw6/JtQuXlmkxrGlCv3VqX6NbmykRBmrVFWr8ke2cKA9AOM0GEfwV2y4uLtGhp0MHXbFgwN6nuge6tS/Y5vUwiVtSiTy8wRkj36XoduolXBURp08ObbF0L3UC97lB9+qhC/3z0w/dyyVsBu4y/v8ZOmV1QuhPQ6tjr03Dv0KE5Hl16JilMQrF65QttW16aQ54Hp5/CdZ88Yf58V504I6R4sLEK0joLTsxXc/8RcYPm+LfUFtbTghWmRZj8nvM/HmbceM6luiYzKCckuuXboYXt1uA+AuJy5DSZViOs4w1jSem2mVY86cca2D0g6fULSIdcOPay07sFnTnVt0QsTOlVIJymfV6q1tpy7adm8sBG9zQOSfdYJSY9cO/Qw0rrp2YpvD5ZuIMBS5By2Y6OJy4pO21qqgL7RWK2+gAefOWUdTds8INnBkJD0yLVsMYy0Lo0KRwUs07i7ZfRRnXnRKUDEf8k6L+fn69bl+zbFTb1UcETigwuFZINcO3Rb7fb0bKVrS9H5IWioWO5/Yi5SemXQGWi7tmqHowy/8n2bB2TWKlpJ9+mlvj55J/eyxSDilChGwbSYdJjPX1Z0IAJU5+sdpWtaj9naD9xNB1WqtWX2Dg062H3ztcsmYXUPRqcgmLptS2wpF9tJV07OJsP2ySPa//dyqYhjEztSsCh/2MoWcx2h26DL+SZJp49ThUa73/Pzdat0TUFkKaIvFfVVrgMiS9FVa/TltddtOezdR2tkTNhGg4wak4NptuzQ9w49jVRLmrhrnb71zgV8cstabTdIt1+7G+GaHnhue
ibooVhfVLHNUdh2eeylbpC9DtNs2aGvHfr0bEW7wHMWWb2yACfG/61qrY79P34Zt15f1naDdFdmCnrgvVKtWUVifnr7MJNppuNUqrVlx2DUmBzs65Md+tqhhy3cSQMBcNe2dfj0dWW0LKoUC/VFhaMvnDX2WrfhylLRKhLz09uHSYv4nav1GCWLpmkkHrLWOK6fybUOPYheiNYUgO/PvbrUPiBu3AnDKKmn1ihs/Kk5YythP6loWM26zZJ8tfoCLlkxwOXlEoTtibNBXzv0qI4sabrlzIHGA2P+3QtwBsSqm6S7sHK5VMQNm4aNyhYXv6ZhUdIi3sWtTRa/UavjoV1bqXIhfUVfO3SbaK8fOD9fh1MQKwnlv7ps1VKUG3TvTLK1oD41prSIV4b40K6t2HPwlPaBd2Wp2NWokZJIkkWoQzdorOMg7uN1m6FBB9Wm/NEPZ0Bw6aoVVq0SBFjm8IJ0/6aWwtoVowYEiwAWPCMLZ0AwdXt8uncvOltsWiETEhW2z7WkNYrzK46Jwp3b1uHQT19NvEdMVM7P11F0BlALmH2tLyrra2qdqAT8df9+6Rnd50wpoktXreiqY2W/GpJV+lrl4sVdAd5vUWhbmePQoIO9Y5sx++Ub8fCurZEWmu6Uh3dtRTGk1jHImUfFdXim/LgAODaxI3RjMB2dtj8IgpJIklXo0DXodLUAUCo6uHPbumXyrO0fWNPm5AXATR9au/Te5kHRDaYOn16q5swCbr5ZR5CcMIzcsNvSRBbSkKxCh67B1dV6F4Cu1uo4cKKC8Z0b8eLkTTg2sQP7/upPcee2dcucugJw4ERlmZ56eraC+XcvJHMBTSo+DrSbaOqUAAClQUd7D2zkhLqHrDMgKAwsP5lTkK5LE1lIQ7JKoEMXkatF5KiIPC8ip0TkC83ta0TkhyLy8+bPoe6bmxxjI2UMrmyfYtCVjx994axxeTng4iRa0rn0gjScW9LVsEqh3fkWBG++faHtHohcvFd+BUW64pVdH766/Q84gVloFtKQrBKochGRtQDWKqV+IiLvAXACwBiAfwfgnFJqUkQmAAwppb7od6wsqlz82DBxSOsfvN0Ig/YzdaNLgpcmb8L6iUOJn7dV4VN0BnDJikKgnj7sgte91uWPUkcSldhULkqpVwG82nz9exF5HkAZwKcAfKy526MAfgTA16H3GrYrHgXtl9ZkWblUXOpXk7R8svV8tfqi1WRr2AWvk56c7MQhc2k+kgShcugish7ACIDjAN7XdPau078ibuPSxjZXGrRfGnls9/y90K9GR62+gPufmIvU1+XK5oMszhV0Om3Hy+6PJAmsHbqIXArgAIB7lVK/C/G5e0RkRkRmzp49G8XG1LDNlQbtZ1LNdGtGuiCydP4sSel098CP1ja+OkwP0hs2DcfeC71Th2wzmuAybqRTrAqLRMRBw5nvU0p9r7n5NRFZq5R6tZlnf133WaXUIwAeARo59BhsThTb8nG//cZGypj59bm2xZsLBYFaULFH0Hd85OolW7LUr+a6dZfhf//yXKjrbXWapnSHWwxWEEGtvoDHjr/cttBHp4U/fm17p2crgccNSssxJUPiwEblIgC+CeB5pdQ/tPzqIIC7m6/vBvB0/OblB50Spr6gUBp02qLMTlUp3597dem1aXSQBmGduUulWsN9zd7squX9+olDmDp8GjdsGkbRKSw5cdOqTZ2MVvzSZjbRf1BajikZEgc2o/7tAP4SwA4ROdn89wkAkwD+XER+DuDPm++JAZMzqc7X29I1Xl17WKq1+tKQvTUdlBQfvGK1dnsnIxHvZ1snUPc9e8aqwVoncxl+D0YbxxuUlmP1KYkDG5XLv8AcNP7beM3JL5cVHa1s77KiY0zXfOfZM5HP5x2yj42UE5FP3rVtHY6+kOxcic2DotPCH/f/5979J7W/t3G8fmk5W0UVIX6wUjQhTNWTv3u7rh2u7x1rr1QNS62+gAefObU00XburXc6Op4NB05UM
pOzdxEAt17feSvdsZGycaTTqeNl9SmJAzr0hDA1jFpU5hxsHE2mzs/Xl3LPcTfeMq1FqtvebfzOqIDYRg3dcrysPu0u/aIg6vv2uUnhpzYxKTCSVKi4KxDZpHkEwEO7tuI+Q/phQalEi5mcAcGuD1/t26o4rly0d8WkOCs+s7aMW14qW/tJQcQIPSGC1CY6h6P7jBuJxjnJ6ZbKj16zJjC6FjT6vI+NlAMXbA5L5Mi++TG/zpKmRaOj4HbPdBu05c0pAJ0XUmWJflIQMUJPCPdLf/8Tc1pZnc45BkWDQZOczgAQlGVx0wXuF9gk+QMuOvO9Y5sBxLuEn7viDwDct/9kqAdCfUFpteetxLUwV5ai1iBbOrE1T4t49JOCiA49Qdwvgm75MlMO1m8YHuRQL13l4O36YtuybZeuWoHqfH3Zl3zrgz8IdMzeXLS3sCcq3mZcJiWJH37OHGgsGu0S1dFlaegeZEuntubJCfaTgogpl4SJc/LLPZYJncZ96vYtmP3yjcvSBdOzlcBOiC7eL7SbfohC0Sng4V1b29IWUdJJQemay4qNlEsnqYQsDd2DbOnU1jwt4tFPCiJG6Cmgi7qjRo1jI2VjhHxlyW7l+zAOSfeFjtLRsbXfjJfxnRuNaZdS0cE7FxbbRji3Xl/GgRMV4yijWqvjgenncPSFs5FTCVmKWoNs6dRW3ejP1glmKS0FdHciO2vQoWeATofHnXz5APsvuemYUTo6Lirl+1AbXFnAW+8ud7xFp4A9t1y7dE7vl3P0mjW+6R9vL51WKtUatk8e8f3CZ2noHmRLp7ZGdYJZSku1kjUFUbegQ88AnU5A2Xz5dFGT+xkbZ+y36ESUCNXrWLyO4K13F+AUBKtXrsAbtTpKgw7eri8s5deHBh08tGvrMnvcL61pwRGFxshAl28XYMkBmpxQpw/OOAmyJQ5bozjBPE2m9iJ06BkgjqG835dPFzWNPzkHSEMh4oczIJi6fYvvlzGsXl6ANseicwT1BYXVl6zAnluuxfiTc6gvXrT1/Hwd40/NAWiP/PzsWVAKRaew7Fy6dJHOCWVp6B5kS1q2Zikt1Y/QoWeAbg/ltc5y0S5JUl9UgdHV+M6NGH9qLvDh4OLq2FsxOeBKtYY9B09p7a0v6G3zy8G7I41WR2c6t84JZWno3okt3cpzZykt1Y/QoWeAbg/lO42Ogj4/NlLGnoOnrJQyzoBg9Jo1y7YFTar6HVfnPEz95wXADZuG2xyhSc/v54SyNvHnxS+XDaBree4spaX6ETr0DNDt4XGnLQRsoqs3LGWPuoi/02Xytj74A4hgmbZ+79hmvHj2TRz75bml/RQazcNGr1nTUW48qxN/rQTJFruV585SWqofERVXCZ0Fo6OjamZmJrHz5ZkwEaLXAQHAgDQag3nxbm+t4PQ7X5jWvALgxcmblt6bJjGj4soYTX1p3FYHrbTez9KgA6UaD6kw16o7blqY7qmr1jf9rvX/hWQHETmhlBoN2o+FRT1I2OIYXTHTe1fpe5u8d5XTVvQEIPB8fn1nvHgjfrfoJy5q9QXsO25uMmbKjR+b2IGHdm3F2/VFVGt147X2wsSfX2FQnoqGyHKYculBokjDvHnjDROHtPu9Uavj5O4bAVyMWnXRqPd8uqH2DZuG24p9dKmMbnTb9Rt4tjou70jnrXcuBN7bXpj4C0ojxZnnzvp8Qj9Bh96DxBEhhl202OZ8OtWFW+zj1b+3FvGYWt52C9cGXS7cROu19sLEn00uOw4nHHY+IS3n3y8PHTr0HiSOCDHIKelGAVHO53XyYZzo0KCDwZUrYu0Jv3plYcmePQdPWXeKbL3Wbk78xel4/GQFZJoEAAAN6UlEQVSNcckvw4wW05pM7oVJ7LigQ+9B4qoCBMxOKSjaD3O+Vic1YKjU1FGdr+OmD6317dESBqcg+MqnNy/ZZNuQTHet3dCAp+14ojxMwowW06oiNZ333v0nMXX4dFej9aRHBnToPUhcEaKfU
/KTOvq1AfDidVK2zhy4KDO89fqy72pENnht9mtI5o4M/NJE3UhRpFk2H/VhEma0mNZkst/xu/nQTOMBTYfeo3S7YtE0Cgjb6tcmdeNHrb6Ax46/jEWlMNTs5xJ2bVSdnDDIiXidufeLee/+k7h3/0mUig723HKt9T3xc9rddnh+0WLUh0mY0WJak8lBdRjdemim8YCmbJFo0Ukdo/Rtt3FGAwEqlwWloNDo3xLWmTsFMToXE+fnl0sW/XLt1Vod40/OWS/N5tfiwEQcss4gqWuY9gethPk7SasvedDyj0B3RglpjEgYoRMjcYwCTNFRQQSLSqE06ODNty9gsVsFbp7Dtkoxve0GTE26gkYYNv1uIph60a4YZJ1BlaOm1gtRJr799nNtSVJt0npe04OrG6OENEYkdOikqwSlbrZPHumqbLG+qLDn4CmtE1e46MjKHbZH0EVdrSmOy4pOZMdcjeH++EWLptYLuq6YnZJWczP3vDo5brdGCWnIW5lyIV0laEgex/AzyE9Wa/UlZ+11XK4zPzaxw7j03dCgEzhkN/V3d1Mc1Vo98oMrjojOrzrU9H+gkD9ZX1ypxKydy4UROuk6UdU0Nrh9W46+cDbycdzPmSKq3Tc3VkkydZR0Btrz9J1OBi8duyB4650L2DBxqKMUhV+0aEpFRFnbtRdIcpSQ9IiEETpJFZsJKy9uRO5GPHvHNuPYxA48vGtr6GMBFxeYHhsp49bry0vvCyK49fry0pfy5O4b8fCurRgavDhJWSo62gVAoo48nAHB0KADQWNkAIVlfWXu238S6ycOYfvkEeuJWPfaTNFiPy2inHcYoSdMv5Qgt+J3ze7P+5+YM2rUh5rdD93ouDToYPfN7VJB3aTb/LsXAlMd7nmnZys4cKKy9H5BqbZ2u7YRl83Iw10Oz/1ZaubZ3TbAb71zoW1hD/ddFE2zyfaok5X9+Lecddg+N0FMEzLdzqulie01T89W2paZAxoph11/crW2yZfNfbPpSePm0E1tcXWFRnGctxVnQKyWBDTZnjT9+LecJmyfm0GCpGN5xPaax0bKmLp9C0otmuuhQQdTt23B0RfORr5vrakGoH0CtTW1YEqTnJ+vW7cq1p1X0EjNuKmUgkbuUl9UoZ25n83dph//lnsBplwSpBf6aMdNmGs2pQTu238y1LH9juuXJrCdoLWt9jNdj6l1cRTSatnbj3/LvQAdeoL0Qh/tTtA5yziuOc775pcD1ylBTFSqNYz87Q+WLXtnm2oIo+xp7TjpLf5Jc+Iy73/LvQpTLgmSZzWBqbT8hk3DHV9zUvdNpwQp+ZTde1sE2KpObJU9rmTy2MQOvDR5Ex7atTVRTbMfef5b7mU4KZoweVUG+K2z6WqdO7nmB6afw2PHX15Shdzxkauxd2xzXOYbCTO5WSo6WH2J3eTp9GzFV9lTEMHXPtsuh4xC1L+5oM/l9W85i9hOigY6dBH5FoBPAnhdKfXHzW1rAOwHsB7ASwA+q5Q6H3QyOvT84rcosXfh4bCOIG1FxfRsBfca8vh+BNnod9y4FmyOeu+6cc/5AIhOnCqXbwP4uGfbBIB/Ukp9EMA/Nd+TPsZ24eGwC1wD6SsqxkbKkaomg2wcGykvK1JqJa5cdNR7F/c9j/L/TsIT6NCVUv8M4Jxn86cAPNp8/SiAsZjtIj2GbU41iqPIgqIiSkUrEGzj7puv7TgXPT1bwfbJI9igqSCNeu/ivudpP5T7haiTou9TSr0KAM2fV8RnEulFbBsRRXEUQdG/n0OLC52u3IagSNt73KFBB5esGMB9+09aXUtQ5Gs7crL9fdSRQxYeyv1A12WLInIPgHsAYN26dd0+HUkRm7L4KHI3v8ZSSS7z5b0+00Sw10bb40a5lqBVcaK2cI279StljskQNUJ/TUTWAkDz5+umHZVSjyilRpVSo8PDwxFPR/JCFLmbX/Sf5lBedy3exmGdLtfXaToqagvXuFu/UuaYDFEj9IMA7gYw2fz5d
GwWkVwTtRGUKfrv5lA+SJUR9wo8UdNRQZFv1BaucbZ+HRspY+bX55ZJT91OliQ+Ah26iDwG4GMALheR3wDYjYYjf0JEPg/gDIDbu2kkyRdxOoo4hvI6xw20LwytS3+kfS1BqRG/h1KSMkKbTpakc1hYRHqaTvXSD0w/h33PnmkrqV/lDGjb7obtbhjGaZqKmEpFB3tuaW8XHHQOv3sDIFFtv1/hWbe6ReZJ926rQ2cvF9LTdJL2mJ6ttDlzwH9h6DCpnLCTnO62B585texhUq3VAz+n2x6Uk/ebTI2CnwNNWuWS5GR5lqBDJz1P1LSHaXFkP8KkcoIUKDrciV7v6CCKs43iRKM62CAHmrTKJcq9zwNszkX6Fj/nVSq2LwwdVpWRdlGPn5Y8bp150GggaZVLv+reGaGTvsUUNQqAPbc0FobuJAcbNSqN8jlduiNowjROnbmNfBKwv5+d5r/7VffOCJ30LaZy/qLTqNScOnwa4zs34sXJm3BsYkfooXrUqDTs50zVogCMWvJOdOa6ylybiN8tdLqyVMQr1RqmDp/WVsLG0felX3XvVLmQvqY1EiwNOnjz7eULMyfZYdBri1LAG7XgBTSSVJCYlDO3Xl8OXPfVVpEU1/UkoXJJSkkTW/vcOKFDJ1kmDWmdSyfyyzCtizulk773tvc3yevphCTbOlO2SEhI0pxI60SVkWS+2O8eBamNbO9vr+S/s6ikYQ6dkCZxKz/C0MnDJMl8sc09MnW/tL2/vZL/zqKShg6dkCZpOpJOHiZxN9LyI+ge+U1o2t7fJK+nE9IMAEww5UJIk7ibbYWh03a1cfaUCToPYL5HfmkIN09uc3+Tup5OiLvFcBxwUpSQjNBNxURSaoxemdCMi6ypXBihE5IRuhWVJtnXpFcmNOMiayMJ5tAJyTlJLgLSKxOaeYUROiEJkGYr1yTVGKYcO9DQoeehlW2WoUMnpMuk3co16TSINw2R9vX3E0y5ENJl0lz3FEg/DZL29fcTjNAJ6TJpF6CkKccE0r/+foIOnZAukwXlR5pqjCxcf7/AlAshXSbtlEfa9Pv1JwkjdEK6TNopj7Tp9+tPElaKEkJIxrGtFGXKhRBCcgIdOiGE5AQ6dEIIyQl06IQQkhPo0AkhJCckqnIRkbMA3gLw28ROGp3LQTvjpBfs7AUbAdoZN71g5zVKqeGgnRJ16AAgIjM28pu0oZ3x0gt29oKNAO2Mm16x0wamXAghJCfQoRNCSE5Iw6E/ksI5o0A746UX7OwFGwHaGTe9YmcgiefQCSGEdAemXAghJCck5tBF5OMiclpEfiEiE0mdNywi8pKIPCciJ0UkM53ERORbIvK6iPysZdsaEfmhiPy8+XMoTRubNuns3CMileY9PSkin0jTxqZNV4vIURF5XkROicgXmtszdU997MzUPRWRVSLyYxGZa9r5YHP7BhE53ryf+0VkZUbt/LaIvNhyP7emaWdklFJd/wegAOCXAN4PYCWAOQB/lMS5I9j6EoDL07ZDY9dHAVwH4Gct2/4ewETz9QSAv8uonXsA/Oe0bfPYuRbAdc3X7wHw/wD8UdbuqY+dmbqnAATApc3XDoDjALYBeALA55rbvwHgP2bUzm8DuC3t+9jpv6Qi9A8D+IVS6ldKqXcBPA7gUwmdOxcopf4ZwDnP5k8BeLT5+lEAY4kapcFgZ+ZQSr2qlPpJ8/XvATwPoIyM3VMfOzOFavBm863T/KcA7ADwVHN7Fu6nyc5ckJRDLwN4ueX9b5DBP8omCsAPROSEiNyTtjEBvE8p9SrQ+OIDuCJle/z4TyLy02ZKJvXUUCsish7ACBrRWmbvqcdOIGP3VEQKInISwOsAfojGqLyqlLrQ3CUT33uvnUop935+pXk/HxKRS1I0MTJJOXTRbMvqU3G7Uuo6AH8B4K9F5KNpG5QDvg7gAwC2AngVwNfSNeciInIpgAMA7lVK/S5te0xo7MzcPVVKLSiltgK4Co1R+R/qdkvWKo0BH
jtF5I8BfAnAJgB/AmANgC+maGJkknLovwFwdcv7qwC8ktC5Q6GUeqX583UA/4jGH2ZWeU1E1gJA8+frKdujRSn1WvNLtAjgvyEj91REHDSc5D6l1PeamzN3T3V2ZvWeAoBSqgrgR2jkpksi4i51manvfYudH2+mtpRS6h0A/wMZup9hSMqh/x8AH2zOeK8E8DkABxM6tzUislpE3uO+BnAjgJ/5fypVDgK4u/n6bgBPp2iLEddBNvk0MnBPRUQAfBPA80qpf2j5VabuqcnOrN1TERkWkVLzdRHAn6GR7z8K4Lbmblm4nzo7X2h5iAsaef7U/0ajkFhhUVNW9TAaipdvKaW+ksiJQyAi70cjKgcaC2h/Nyt2ishjAD6GRme41wDsBjCNhopgHYAzAG5XSqU6IWmw82NopAYUGiqif+/mqdNCRP4NgP8F4DkAi83N/xWN/HRm7qmPnXcgQ/dURD6ExqRnAY1A8Qml1N82v1OPo5HGmAVwVzMKzpqdRwAMo5EePgngP7RMnvYMrBQlhJCcwEpRQgjJCXTohBCSE+jQCSEkJ9ChE0JITqBDJ4SQnECHTgghOYEOnRBCcgIdOiGE5IT/D0kxI7djPluEAAAAAElFTkSuQmCC\n", + "text/plain": [ + "" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "from sklearn.datasets import load_boston\n", + "boston = load_boston()\n", + "plt.scatter(boston.data[:, -1], boston.target)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "from torch.autograd import Variable\n", + "w = torch.zeros(1, requires_grad=True)\n", + "b = torch.zeros(1, requires_grad=True)\n", + "\n", + "x = torch.tensor(boston.data[:, -1] / 10, dtype=torch.float32)\n", + "y = torch.tensor(boston.target, dtype=torch.float32)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "y_pred = w * x + b\n", + "loss = torch.mean((y_pred - y)**2)\n", + "\n", + "# propagete gradients\n", + "loss.backward()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The gradients are now stored in `.grad` of those variables that require them." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "dL/dw = \n", + " tensor([-47.3514])\n", + "dL/db = \n", + " tensor([-45.0656])\n" + ] + } + ], + "source": [ + "print(\"dL/dw = \\n\", w.grad)\n", + "print(\"dL/db = \\n\", b.grad)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you compute gradients from multiple losses, the gradients will add up at variables; therefore it's useful to __zero the gradients__ between iterations." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "loss = 44.59417\n" + ] + } + ], + "source": [ + "from IPython.display import clear_output\n", + "\n", + "for i in range(100):\n", + "\n", + " y_pred = w * x + b\n", + " loss = torch.mean((y_pred - y)**2)\n", + " loss.backward()\n", + "\n", + " w.data -= 0.05 * w.grad.data\n", + " b.data -= 0.05 * b.grad.data\n", + "\n", + " # zero gradients\n", + " w.grad.data.zero_()\n", + " b.grad.data.zero_()\n", + "\n", + " # the rest of code is just bells and whistles\n", + " if (i+1) % 5 == 0:\n", + " clear_output(True)\n", + " plt.scatter(x.data.numpy(), y.data.numpy())\n", + " plt.scatter(x.data.numpy(), y_pred.data.numpy(),\n", + " color='orange', linewidth=5)\n", + " plt.show()\n", + "\n", + " print(\"loss = \", loss.data.numpy())\n", + " if loss.data.numpy() < 0.5:\n", + " print(\"Done!\")\n", + " break" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "__Bonus quest__: try implementing and writing some nonlinear regression. You can try quadratic features or some trigonometry, or a simple neural network. 
The only difference is that now you have more variables and a more complicated `y_pred`. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# High-level pytorch\n", + "\n", + "So far we've been dealing with the low-level torch API. While it's absolutely vital for any custom losses or layers, building large neural nets in it is a bit clumsy.\n", + "\n", + "Luckily, there's also a high-level torch interface with pre-defined layers, activations and training algorithms. \n", + "\n", + "We'll cover them as we go through a simple image recognition problem: classifying letters into __\"A\"__ vs __\"B\"__.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Parsing...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/jheuristic/anaconda3/lib/python3.6/site-packages/scipy/misc/pilutil.py:482: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.\n", + " if issubdtype(ts, int):\n", + "/home/jheuristic/anaconda3/lib/python3.6/site-packages/scipy/misc/pilutil.py:485: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. 
In future, it will be treated as `np.float64 == np.dtype(float).type`.\n", + " elif issubdtype(type(size), float):\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "found broken img: ./notMNIST_small/A/RGVtb2NyYXRpY2FCb2xkT2xkc3R5bGUgQm9sZC50dGY=.png [it's ok if <10 images are broken]\n", + "Done\n", + "Train size = 2808, test_size = 937\n" + ] + } + ], + "source": [ + "from notmnist import load_notmnist\n", + "X_train, y_train, X_test, y_test = load_notmnist(letters='AB')\n", + "X_train, X_test = X_train.reshape([-1, 784]), X_test.reshape([-1, 784])\n", + "\n", + "print(\"Train size = %i, test_size = %i\" % (len(X_train), len(X_test)))" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXQAAADHCAYAAAAJSqg8AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAHWFJREFUeJzt3XmU1NWVB/Dv7eqNpWnpNCCCCGERo0chtuiIxAX3GDHG\nLCQmxsHgEvdkMoTxTBI1GScmLolKBpUBM4boiNsYd8xEExXFFRVRBIRmR0E26aXqzh9dTLr73Wf/\nfl378/s5Jwf6+upXr6pfvfyod999oqogIqLSV1boDhARUXZwQiciCgQndCKiQHBCJyIKBCd0IqJA\ncEInIgoEJ3QiokBwQi9CIlInIveJyA4ReV9EvlnoPhFlSkQuFJGFItIkIrML3Z8QlRe6A2S6GUAz\ngAEAxgD4k4i8pqpvFrZbRBlZA+BqACcA6FHgvgRJuFO0uIhILwCbARygqu+kY78HsFpVpxW0c0RZ\nICJXAxisqt8tdF9Cw69cis8oAMndk3naawD2L1B/iKhEcEIvPr0BfNQp9hGAmgL0hYhKCCf04rMd\nQJ9OsT4AthWgL0RUQjihF593AJSLyMh2sYMAcEGUiD4RJ/Qio6o7ANwL4EoR6SUi4wFMAvD7wvaM\nKDMiUi4i1QASABIiUi0izLTLIk7oxekCtKV1bQAwF8D5TFmkAFwB4GMA0wCcmf77FQXtUWCYtkhE\nFAjeoRMRBYITOhFRIDihExEFghM6EVEgMprQReREEVkiIktFhHVGKBgc21SKup3lIiIJtG2COQ5A\nI4AXAUxW1bd8j6mUKq1Gr249X7boqEonVp1oMdtu3VWd6+78P5HMHh/n1+h7Lusa1WtTdttdTdGf\nME92YQeatSnDd7J0xzasV+4ZF80D3b4e0G+j2TZlXKTMfDKbGo/PRm6ddd2k54NgRZs0YbbdnnI/\n9x8128UhkzvcNPqqD+z5RJub3aDvbezU4ahjO5Ok/nEAlqrqMgAQkT+ibQOMd9BXoxcOlYkZPGXm\nWmcMcWIj+tgDef7Sfd2g2u9prAm1zPiAlNkTZ1SppD041bhsoiJptk22utfY9yq74kByyVI3WGb3\nASn7+bJtgc7P1qVKcmxLuftx1tZWs+3KqYc7sRfOu8Vs26TuBFUlFZH7lTQGYSsyHxO71H1t2zxj\nrcn4fK5o
rTXbPr19tBN7eJVdG2/Hc/VObNidjWbb1hUrnZj1OwPc31vUsZ3JVy6DAKxq93NjOtaB\niExNF7Vf2ILiu6sjMnBsU0nKZEKP9A88VZ2pqg2q2lCBqgyejihvOLapJGXylUsjgL3b/TwYbSeS\n5E6Mf9KXHbSf2fS/Rt3qxAaW9zbbJgf9zYklpPQTg6x/QgP2P6OH42yz7Ygz3ZiUeb6OyuzbpELI\n/9jOAk1G/xqj3/i1OexJ1+J8ZRPnGrUxPp7DPV89TuzhVtn41/pFZtvE590nfOkc47tyAN/6/SVO\nbJ+fvvBJXYwtk9npRQAjRWSYiFQC+AaAB7PTLaKC4timktTtO3RVbRWRCwE8hrbqabNYQIpCwLFN\npSqj0pWq+jCAh7PUF6KiwbFNpaj0vxAmIiIAnNCJiIJRUqeFxMmiWDehr9n2slWnOrGXVw0227Y2\nGxs1mj3/H2iEEx/ab+/wse7Gg8f2e8iJbU7uNB/fN9HTiX1j+TFm2wWvjXBiT33xOrPtsAo3a+Cd\no2832x75lfOdWK95C8y2cTa8UAQxtvom+tqfg6tH3hf56crhZpdZm4UAOwvs7RY3R/9njaeYj+9V\n7maIJD2b+eoq3c/HIb2Xm21P6Olu6qlP2Dt7fVlglpTxnh9YaWfjvX3ODCc2bM/vmW1HTX0xch/a\n4x06EVEgOKETEQWCEzoRUSA4oRMRBSKvh0T3kTrNSUU6Y5FIEp7qg5kuxsUpP3CAW7UNAE6+6zkn\ndlHf9yN3YeJb7sJu1RkfmW2TW9z4e3eONdsuPfo/I/fhn9a511g0LkfveUQLdD626ocZl8/tjpyN\nbUuMMdh08iFm0ydu/Z0TqxD7unGqJVrb8Yc9co4TGzVlofn4TBfQpcItjw0AGONWTt3ys4/Nps+P\nuceJxVkEblH7vbFKDm9N7TLbnnrxZR1+fu2pG7F986ouxzbv0ImIAsEJnYgoEJzQiYgCwQmdiCgQ\nnNCJiAJRUlv/vYxMHe/KuJEh4CspIFXuKTSpHTvstg0HOLFz595vtj2t13a7b50c8frpZrzm9HVu\nv6wDaD1GXGe3fXO8u+q/f6V9OO61e77ixMaed4HZtv9NzzqxqGcpksubwWVkuaw+yn6frYyWOAef\nWFveffb+nwzvG2OcoK4tns/Bi+4BFXucar83Fzx3mBO7ZdDzZlsro8WXLbQz5fbNKuMBAKs7JUy1\n2ElBDt6hExEFghM6EVEgOKETEQWCEzoRUSAyWhQVkRUAtgFIAmhV1YZsdCqnjIUjNeo9A4AaC6Dl\n++xttAQmznYX/qIufgLAeGMBtGbSarOtuQBqbEEGPFupX7KPxzzlyYuc2PKTbzPbWo482z7B/O1b\n3cVlbXJrZAOwF8DyWJ7i790okrFtvB/aGr1e98hDopeU8Imz8Hf/jt5OrOZFt/6/b+lbk8a2+Ti/\nf88CqlS6JQF8Y3D+40a5hLPtRdEUrJIA9nuTiLG4m6jr1LeEXXqgs2xkuRytqpuycB2iYsOxTSWF\nX7kQEQUi0wldATwuIi+JyNRsdIioSHBsU8nJ9CuX8aq6RkT6A3hCRN5W1afbN0h/GKYCQDXsJHqi\nIsSxTSUnozt0VV2T/nMDgPsAjDPazFTVBlVtqIC7OEZUjDi2qRR1+w5dRHoBKFPVbem/Hw/gyqz1\nLFeslWYj8wUAynq6d1297nRPGgeAy+uWRe7ChasPdWJ9vrrR7ZYvE8Q64MDzGnwZPJZRM93n23SC\nXerAOjH9hoH2/uSDzzrfffxM95APwN7Wnu9yAEU1tq3sJc/vOjFquBP77WfneC7sZqOUe8ZKk7rv\nvy/L5YpFk5zYoNVuVlXOSj94MmK0Jfp1k9W5yapKWn3zJL6k1ld3DLRGu/fO5CuXAQDuk7YJshzA\nH1T10QyuR1QsOLapJHV7QlfVZQAOymJfiIoCxzaVKqYtEhEFghM6EVEgwq
iHHkOcRbclM0Y7sWWf\nnRX5ue7YWm/Gl53plg9IbVvqNoxxuruX0da7IPWCWzP6S298x2z73EHzIndhwGR3+3lypt3W3Prt\n2zJdgJIA+WbV6vccQI8NE/o7seEV7uInEG87f1WMaSLxTG3ktsXqwEPey+jxSc8vqGeZW37go5R7\nBgEA7Plsx7G9MWIVEd6hExEFghM6EVEgOKETEQWCEzoRUSA4oRMRBSLYLJc4W4sbf3y42XbZcbc4\nMSs7AABWtrqr1XMumGK2LV/ykhMzD6LI85Z3S9ksO1MHN7oh3+r+3SPvdWJfGfc9+7pGpo0328fz\nuwiK5+ASS/OXtkRuax3MkFQ7myhh9GFlq512Meihde51jXZmNlMWxPncpyaMNdvePPS3RtTOFrLK\nJbSarxhIGPfPP1k/wWzbe17HMhplSbsER2e8QyciCgQndCKiQHBCJyIKBCd0IqJAhLEoaiya+RYU\nU0eMcWJPnv9Ls21S3Xrovu3Rp938Iye211PPmm2lwjiBvKXZbJsLcRakah9bbMZ/sWlfJza9fonZ\ntrdUO7Gl33TrqQPAiBfcmLX9HfBvgS9JnvIG1rhIDHC3+APAJfv+OfLTlRn3cnEW8y5//zSzbfLd\niOcC+Mo2+Mo8WE3LK9zLxhjbO//lIzM+sNxdAG3SlsjX9dWV35lyf5cLf3Gw2bZXakHk52uPd+hE\nRIHghE5EFAhO6EREgeCETkQUiC4ndBGZJSIbROSNdrE6EXlCRN5N/9k3t90kyj6ObQpNlCyX2QBu\nAnBHu9g0APNV9RoRmZb++Z+z372IYhziMPBadxXeWtX2OeiFyWZ8r1+6GS3ebch5zGixO2BnGFjZ\nN8mtW822tz19lBObfrqd5WI5+5j/NeN/q97DiaV27bIv0jkjIv55F7NRJGPbOngFsLO1tk4YZrad\nUvu4E/OVqrCytXb6Dk4xEk9e/+tIs+kwbHJiZT3dbDFttj8DZpaKZ7zG+Ry9c+shTmz5gbeaba1s\nlIQn+6ZK3Ewb33t+6I2XOrG95nky4TrPHRGrgHR5h66qTwP4sFN4EoA56b/PAWDnMBEVMY5tCk13\nv0MfoKprASD9p50YS1R6OLapZOV8Y5GITAUwFQCq4f7Ti6hUcWxTsenuHfp6ERkIAOk/N/gaqupM\nVW1Q1YYKVHXz6YjyhmObSlZ379AfBHAWgGvSfz6QtR59gji1jt/7hbsIAgCP7jMj8vPN297HiQ2+\n1K5LbK1ZaKq0TqWPs216+H+7C0ctX46+AHdF/dtm26OPPMeJVT620GhpLCRm5+0uyNiOM1ZWHxu9\nrVX3HLBrn1un0gNAo1H7fJ8/2afVm33Y1WQEo4+1xGfqzPiWY0c5sSEXv2O2XT7MXQD11e+vEnee\nsWrCA8AdW93zAn73kzPMtnvdZSyAemr9O3NaxF95lLTFuQCeA7CviDSKyBS0DfbjRORdAMelfyYq\nKRzbFJou79BV1c7TAyZmuS9EecWxTaHhTlEiokBwQiciCgQndCKiQBTvARcxDq2Qg/d3Yo9+/Vqz\nrXVohW8F++fXfcuJ9VvxnN0HIwPH19+iFePEiMTzbzmxn210Dw8BgKv7L4p83cZj3K3Un30s8sNL\nV4ysj0smuFv8fayDLLJh23Q38wUAmlpGO7GKcve1VZfbn43htW7pgMn1z5htj+/51Cd1sYPtKbd8\nxMak3YfZWw51Ynffe6TZdtjNbrmLmk3Pm23zMUfwDp2IKBCc0ImIAsEJnYgoEJzQiYgCUbSLotZJ\n7741u1XT3djwiug1zs9471gz3u8/3MUNq7YzYG+b99W4LiW+cgupHW4JhDsXuotJAHD1ydEXRU+a\n6G7z91VZd7bLl0ilhTiLY00nuSUszq79jefKPZxImVXMHHYigG8rfL+EW6fm+TH3ePqQP03a4sSs\n+uQAcNe2oU7sujtON9vuM+NNJzZki1
23PGX8Lq1zBYD8nIPAO3QiokBwQiciCgQndCKiQHBCJyIK\nROEXRaPWAwaw65RxZttXD7NqnNvXXdy804ltv3iA3Td1d62ZtZ2BWDv9SkqMGumDHrHf85aT3GtY\nNdIB4Nz6p53YD8dMMdumXnV3q5YEz85ky5ovuB/R2jJ38ROwDzf21TiPw7fQWGjlxmfcd0DzlNp1\nTmzy928w2879zhAn9us77QXUIde+5MS0yTNHWAdNew7A7i7eoRMRBYITOhFRIDihExEFghM6EVEg\nopwpOktENojIG+1iPxWR1SLyavp/J+e2m0TZx7FNoYmS5TIbwE0A7ugUv15Vf5X1Hn2C6svWmHFf\nxoTlke0HOLHlX+ljtk1+8x+cmEQvGR4Ge+e4+T601thvzqake0J8/4RdQmH/SjeDY9PYWrNt3at2\n32KYjQKM7ThbwMcf/UbXjdKsz4Ev68NqO3vrXmbb2dMnObGyFjs7Q40ZRY0yHk197HvJ7Xu7bWsP\nX2+2vWv/OU5sSLld8sMqa+DLAPpuH3eemXLBLWbbgw/9mhOrn/Se2dbMhLMyX4BuZ790eYeuqk8D\n+LBbVycqYhzbFJpMvkO/UEReT/+ztW/WekRUeBzbVJK6O6HPADAcwBgAawH82tdQRKaKyEIRWdgC\nT8I9UfHg2KaS1a0JXVXXq2pSVVMAbgVgb+FsaztTVRtUtaECbhlOomLCsU2lrFtb/0VkoKquTf/4\nZQDRVm7ErQXtqwPdfEKDE3ty9EyzbTLG+sHldcvc2D9apQMoe9yFKmubOgD0FHeh6oMx9i+4LrNO\nmbo9tu2L2XFjwavsQPdwZQC4dtDtRrSX2daqfd4KX+kGd1H031450Ww5/L4FTizTmt/2kjjQ1yoF\n4imrMeXIi5zYtFmd17fbTDSqJfgWjFNwF1A/9ozXlw6+24mN+OV5ZtvhP3TPV/CdmdDdw6O7nNBF\nZC6AowDUi0gjgJ8AOEpExqDtSIEVAM7t1rMTFRDHNoWmywldVScbYeu2gaikcGxTaLhTlIgoEJzQ\niYgCwQmdiCgQhT/gwmP12e6qsnVSuc8Nm4ea8duWjHdileX2irKqJ0uBIOJmajS12MNp8kj3EIAr\n6t+O/FxHjLMPsrA3hBcPKbcPhrAyQd4/1c7ZqU+4GS1xtvOnYmwh7/9AdeS2kvB8Fq29/3EYn3Gp\n9Byw8ZdXnNClM+w17EWX21v3LdaBHmUx7n1//MX7zfi8aw90Ysn1G+yLdM6Qivhr5B06EVEgOKET\nEQWCEzoRUSA4oRMRBSK/i6LqbmktH+qesA0ANzXMzeip7r7S3sY86C5j+225/TZ0d/vtp4G19du3\n7fuei45xYlf8OPqi6Jn9njPjN9R/oWOfNkevi58PmvRtu3fVjo++xOtbFLW2/vvOCnh8p7vw1/fZ\nRrOt9SlI+U62z/Ip9gCgyei/14F/22H/h8vdkPV++cQ5c8Gqpw4A9wx0PwfwLop2716bd+hERIHg\nhE5EFAhO6EREgeCETkQUCE7oRESBKPjW/zUnDzbjx/dsiXyN320Z5MT63O9uCwYAWBktnhVlXxF/\nAqTCfR99WS41jW5WhnUKO2CXdxhdudlsu2vM0A4/p54v8KlBnQ9n8BzMkBgxzInNGP0Hz0Xd11Ql\n9se2Sd18FOvAEAD44RtnOLGBqxabba0ssGLNANs1IPoYSHn202eaK+U9VCRlj/ls4h06EVEgOKET\nEQWCEzoRUSA4oRMRBSLKIdF7A7gDwJ4AUgBmquqNIlIH4C4AQ9F2mO7XVNVevfoEqeNjP8Tx78+f\n5MRGNS20G5unikc7qZz+Ls629l6r3O3Y77V+bLYdVeHW/x5S3ttsu3l0xwWw5Gvx6tdndWwLIGUd\nn9+z7ouNE/Z0YmOq7MU8a5u/bxt6VYwcB/lL38htC00S9utVY9G58ZjoYyAF3yKl+3xxatDP3eYm\naQ
CArP8wct+8g6cLUe7QWwH8QFX3A3AYgO+LyOcATAMwX1VHApif/pmolHBsU1C6nNBVda2qvpz+\n+zYAiwEMAjAJwJx0szkATstVJ4lygWObQhPrO3QRGQpgLIAFAAao6lqg7YMBoL/nMVNFZKGILGyB\np0IbUYFlPLaVY5sKL/KELiK9AcwDcKmqbo36OFWdqaoNqtpQYWySICq0rIxt4dimwos0oYtIBdoG\n/J2qem86vF5EBqb/+0AAnsK+RMWLY5tCEiXLRQDcDmCxql7X7j89COAsANek/3ygy2slEkj0qe0Q\nu2r/Lh/WpbrnPaeCe/rQmbVaTl2IsQqfWL/Fia1prTHbjqqIft2twzu2Tca8Sc7m2E6nuUR63o+/\nFPkfAWYmRlLtTA6rbMLylu1m28H3r3Zivs38cTKaMmZkoflKSiTqP+PEfnTsQ9GfKsY3znGyXK55\n/QSz7T7rFzmxbB+uEyXPaTyAbwNYJCKvpmPT0TbY7xaRKQBWAvhqt3pAVDgc2xSULid0Vf0r4D2r\naWJ2u0OUPxzbFBruFCUiCgQndCKiQOS1Hnqyphrbjh7dITah+hFP655OpLHVXuCpe8veRm7J6wIP\nAQBSH7hbnte11hotASB6KYjKwR1LCkiMBdWsU3UW7xL9+plNL9tvfuTLWgt3vnrbCaPt+e993Wyr\ny993g+L59sla7I1RZaFzSYS4fOvvi68a7sQe3sN+b5vUPV+hSuxkCqtWv78GvXvdvW6LvjqvKbsm\ne3fxDp2IKBCc0ImIAsEJnYgoEJzQiYgCwQmdiCgQec1yae0JbBzb8f9D+ibcbBafN5rdrb4AULl0\nrftcvot0s3A8daLRV+dTO3c6sR2pzItZ7du/Y4mV9RVuxkEhbTnGzcIAgCm1TzixOFvLd/pKVRjJ\nJI2P72M2HQR367+U21kfvq33UalVqsAzfsp6uvPBkt983my7fNLMyH2wMlqsbBYA+Fjd19u7rNps\nO+KB85zYqMdfsDthHq6T3aw73qETEQWCEzoRUSA4oRMRBYITOhFRIPK6KIrqFFIj3QWyqHzbxVs3\nbIp+kRiLeRSTb+u48Z7v0ug17H2Oql/S4edXyndlfM1sWn9K9GPpfCfQW7XPe5ZVmm3XGqUx6t62\n0wOsWuJS09tsqwnjvq/CnjqStT2c2I5BbmzdP9hj5UdffNCJTa191mxrLSSXeWoSWOUSkp65wFoA\nPeTlr5lt9734VSemvs9BHs5d4B06EVEgOKETEQWCEzoRUSA4oRMRBaLLCV1E9haRP4vIYhF5U0Qu\nScd/KiKrReTV9P9Ozn13ibKHY5tCEyXLpRXAD1T1ZRGpAfCSiOzeu3y9qv4q6pPVVO3C0cPf6U4/\nAQDN6uluHlaPKQJP1oB1snlNWfRDSXw+32NFh597lsXeop61sS1lZSjrXdMhduU4N2PDx3cCfcI6\nXMKjLuGWU/jV9TdHfnwC9u+vQtwMnApPVs6exu72OOU9LNtTdvaStZ3f935Zh38kPa9h2EPfc2L7\nXvS62dYsi+DLcsmDKIdErwWwNv33bSKyGMCgXHeMKNc4tik0sb5DF5GhAMYCWJAOXSgir4vILBHp\n63nMVBFZKCILd22OnpdLlE+Zju1mLa4cePp0ijyhi0hvAPMAXKqqWwHMADAcwBi03eX82nqcqs5U\n1QZVbajum3mFPaJsy8bYrhS7Gh9RPkWa0EWkAm0D/k5VvRcAVHW9qiZVNQXgVgDjctdNotzg2KaQ\ndPkduogIgNsBLFbV69rFB6a/gwSALwN4o6tr1SR2YeIeb3W3rxhasdGMlw880Im1rl1nX8RasGA5\ngPhivI+JAf2d2KCKVzLuwp6JHR1+9i3U+WRzbLfU9cDG0w7oEPtWzTOR++Lbsh6HtUg4LoB/FPcQ\nu9SBtQC60ih/AABnLv62+/jr6822ox590YmpVcscKLr5JEqWy3gA
3wawSER2Fy6YDmCyiIwBoABW\nADg3Jz0kyh2ObQpKlCyXv8I8CwUPZ787RPnDsU2h4U5RIqJAcEInIgoEJ3QiokDk9YCL3tKMw6s7\nnzZuF9W3HN/TPtX9qsPck8173mdnuUjCXa3WVvsQAPKL8z5++IUhTmxiD7tcg3USu287916d+lBR\nuB3XSNaksGVitHIGTeqOYytDJRusQyCy0Xan8RoAYE3SHRfrWmuc2IqWfubjn/xgPyf28sq9zbZV\ni9ySAkMe3my27fHaYjcoK8y2VqkKTXremyLLkOMdOhFRIDihExEFghM6EVEgOKETEQVCNI9f6ovI\nRgDvp3+sB7Apb0+eP3xdhbOPqtqrbTnWbmyXwvvUXaG+tlJ4XZHGdl4n9A5PLLJQVRsK8uQ5xNf1\n6Rby+xTqawvpdfErFyKiQHBCJyIKRCEn9JkFfO5c4uv6dAv5fQr1tQXzugr2HToREWUXv3IhIgpE\n3id0ETlRRJaIyFIRmZbv58+m9AHCG0TkjXaxOhF5QkTeTf9pHjBczERkbxH5s4gsFpE3ReSSdLzk\nX1suhTK2Oa5L77XtltcJXUQSAG4GcBKAz6HtZJjP5bMPWTYbwImdYtMAzFfVkQDmp38uNa0AfqCq\n+wE4DMD307+nEF5bTgQ2tmeD47ok5fsOfRyApaq6TFWbAfwRwKQ89yFrVPVpAB92Ck8CMCf99zkA\nTstrp7JAVdeq6svpv28DsBjAIATw2nIomLHNcV16r223fE/ogwCsavdzYzoWkgG7DxhO/+mekFxC\nRGQogLEAFiCw15ZloY/toH73oY7rfE/oVsVqptkUKRHpDWAegEtVdWuh+1PkOLZLRMjjOt8TeiOA\n9tXqBwNYk+c+5Np6ERkIAOk/NxS4P90iIhVoG/R3quq96XAQry1HQh/bQfzuQx/X+Z7QXwQwUkSG\niUglgG8AeDDPfci1BwGclf77WQAeKGBfukVEBMDtABar6nXt/lPJv7YcCn1sl/zv/tMwrvO+sUhE\nTgZwA4AEgFmq+vO8diCLRGQugKPQVq1tPYCfALgfwN0AhgBYCeCrqtp5gamoicgRAJ4BsAjA7jPh\npqPt+8aSfm25FMrY5rguvde2G3eKEhEFgjtFiYgCwQmdiCgQnNCJiALBCZ2IKBCc0ImIAsEJnYgo\nEJzQiYgCwQmdiCgQ/wfQEz56iywmUgAAAABJRU5ErkJggg==\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "for i in [0, 1]:\n", + " plt.subplot(1, 2, i + 1)\n", + " plt.imshow(X_train[i].reshape([28, 28]))\n", + " plt.title(str(y_train[i]))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's start with layers. 
The main abstraction here is __`torch.nn.Module`__" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Base class for all neural network modules.\n", + "\n", + " Your models should also subclass this class.\n", + "\n", + " Modules can also contain other Modules, allowing to nest them in\n", + " a tree structure. You can assign the submodules as regular attributes::\n", + "\n", + " import torch.nn as nn\n", + " import torch.nn.functional as F\n", + "\n", + " class Model(nn.Module):\n", + " def __init__(self):\n", + " super(Model, self).__init__()\n", + " self.conv1 = nn.Conv2d(1, 20, 5)\n", + " self.conv2 = nn.Conv2d(20, 20, 5)\n", + "\n", + " def forward(self, x):\n", + " x = F.relu(self.conv1(x))\n", + " return F.relu(self.conv2(x))\n", + "\n", + " Submodules assigned in this way will be registered, and will have their\n", + " parameters converted too when you call `.cuda()`, etc.\n", + " \n" + ] + } + ], + "source": [ + "from torch import nn\n", + "import torch.nn.functional as F\n", + "\n", + "print(nn.Module.__doc__)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "There's a vast library of popular layers and architectures already built for ya'.\n", + "\n", + "This is a binary classification problem, so we'll train a __Logistic Regression with sigmoid__.\n", + "$$P(y_i | X_i) = \\sigma(W \\cdot X_i + b) ={ 1 \\over {1+e^{- [W \\cdot X_i + b]}} }$$\n" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# create a network that stacks layers on top of each other\n", + "model = nn.Sequential()\n", + "\n", + "# add first \"dense\" layer with 784 input units and 1 output unit.\n", + "model.add_module('l1', nn.Linear(784, 1))\n", + "\n", + "# add softmax activation for probabilities. 
Normalize over axis 1\n", + "# note: layer names must be unique\n", + "model.add_module('l2', nn.Sigmoid())" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Weight shapes: [torch.Size([1, 784]), torch.Size([1])]\n" + ] + } + ], + "source": [ + "print(\"Weight shapes:\", [w.shape for w in model.parameters()])" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([ 0.4526, 0.4411, 0.5917])" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# create dummy data with 3 samples and 784 features\n", + "x = torch.tensor(X_train[:3], dtype=torch.float32)\n", + "y = torch.tensor(y_train[:3], dtype=torch.float32)\n", + "\n", + "# compute outputs given inputs, both are variables\n", + "y_predicted = model(x)[:, 0]\n", + "\n", + "y_predicted # display what we've got" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's now define a loss function for our model.\n", + "\n", + "The natural choice is to use binary crossentropy (aka logloss, negative llh):\n", + "$$ L = {1 \\over N} \\underset{X_i,y_i} \\sum - [ y_i \\cdot log P(y_i | X_i) + (1-y_i) \\cdot log (1-P(y_i | X_i)) ]$$\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "crossentropy = # YOUR CODE\n", + "\n", + "loss = # YOUR CODE\n", + "\n", + "assert tuple(crossentropy.size()) == (\n", + " 3,), \"Crossentropy must be a vector with element per sample\"\n", + "assert tuple(loss.size()) == tuple(\n", + "), \"Loss must be scalar. 
Did you forget the mean/sum?\"\n", + "assert loss.data.numpy() > 0, \"Crossentropy must non-negative, zero only for perfect prediction\"\n", + "assert loss.data.numpy() <= np.log(\n", + " 3), \"Loss is too large even for untrained model. Please double-check it.\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "__Note:__ you can also find many such functions in `torch.nn.functional`, just type __`F.`__." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "__Torch optimizers__\n", + "\n", + "When we trained Linear Regression above, we had to manually .zero_() gradients on both our variables. Imagine that code for a 50-layer network.\n", + "\n", + "Again, to keep it from getting dirty, there's `torch.optim` module with pre-implemented algorithms:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "opt = torch.optim.RMSprop(model.parameters(), lr=0.01)\n", + "\n", + "# here's how it's used:\n", + "loss.backward() # add new gradients\n", + "opt.step() # change weights\n", + "opt.zero_grad() # clear gradients" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# dispose of old variables to avoid bugs later\n", + "del x, y, y_predicted, loss, y_pred" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Putting it all together" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# create network again just in case\n", + "model = nn.Sequential()\n", + "model.add_module('first', nn.Linear(784, 1))\n", + "model.add_module('second', nn.Sigmoid())\n", + "\n", + "opt = torch.optim.Adam(model.parameters(), lr=1e-3)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": 
"stream", + "text": [ + "step #0 | mean loss = 0.573\n", + "step #10 | mean loss = 0.371\n", + "step #20 | mean loss = 0.218\n", + "step #30 | mean loss = 0.159\n", + "step #40 | mean loss = 0.141\n", + "step #50 | mean loss = 0.127\n", + "step #60 | mean loss = 0.131\n", + "step #70 | mean loss = 0.107\n", + "step #80 | mean loss = 0.116\n", + "step #90 | mean loss = 0.101\n" + ] + } + ], + "source": [ + "history = []\n", + "\n", + "for i in range(100):\n", + "\n", + " # sample 256 random images\n", + " ix = np.random.randint(0, len(X_train), 256)\n", + " x_batch = torch.tensor(X_train[ix], dtype=torch.float32)\n", + " y_batch = torch.tensor(y_train[ix], dtype=torch.float32)\n", + "\n", + " # predict probabilities\n", + " y_predicted = # YOUR CODE\n", + "\n", + " assert y_predicted.dim(\n", + " ) == 1, \"did you forget to select first column with [:, 0]\"\n", + "\n", + " # compute loss, just like before\n", + " loss = # YOUR CODE\n", + "\n", + " # compute gradients\n", + " \n", + "\n", + " # Adam step\n", + " \n", + "\n", + " # clear gradients\n", + " \n", + "\n", + " history.append(loss.data.numpy())\n", + "\n", + " if i % 10 == 0:\n", + " print(\"step #%i | mean loss = %.3f\" % (i, np.mean(history[-10:])))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "__Debugging tips:__\n", + "* make sure your model predicts probabilities correctly. Just print them and see what's inside.\n", + "* don't forget _minus_ sign in the loss function! It's a mistake 99% ppl do at some point.\n", + "* make sure you zero-out gradients after each step. Srsly:)\n", + "* In general, pytorch's error messages are quite helpful, read 'em before you google 'em.\n", + "* if you see nan/inf, print what happens at each iteration to find out where exactly it occurs.\n", + " * If loss goes down and then turns nan midway through, try smaller learning rate. 
(Our current loss formula is unstable).\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Evaluation\n", + "\n", + "Let's see how our model performs on test data" + ] + }, + { + "cell_type": "code", + "execution_count": 254, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Test accuracy: 0.96585\n" + ] + } + ], + "source": [ + "# use your model to predict classes (0 or 1) for all test samples\n", + "predicted_y_test = # YOUR CODE\n", + "\n", + "assert isinstance(predicted_y_test, np.ndarray), \"please return np array, not %s\" % type(\n", + " predicted_y_test)\n", + "assert predicted_y_test.shape == y_test.shape, \"please predict one class for each test sample\"\n", + "assert np.in1d(predicted_y_test, y_test).all(), \"please predict class indexes\"\n", + "\n", + "accuracy = np.mean(predicted_y_test == y_test)\n", + "\n", + "print(\"Test accuracy: %.5f\" % accuracy)\n", + "assert accuracy > 0.95, \"try training longer\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## More about pytorch:\n", + "* Using torch on GPU and multi-GPU - [link](http://pytorch.org/docs/master/notes/cuda.html)\n", + "* More tutorials on pytorch - [link](http://pytorch.org/tutorials/beginner/deep_learning_60min_blitz.html)\n", + "* Pytorch examples - a repo that implements many cool DL models in pytorch - [link](https://github.com/pytorch/examples)\n", + "* Practical pytorch - a repo that implements some... other cool DL models... 
yes, in pytorch - [link](https://github.com/spro/practical-pytorch)\n", + "* And some more - [link](https://www.reddit.com/r/pytorch/comments/6z0yeo/pytorch_and_pytorch_tricks_for_kaggle/)\n", + "\n", + "```\n", + "\n", + "```\n", + "\n", + "```\n", + "\n", + "```\n", + "\n", + "```\n", + "\n", + "```\n", + "\n", + "```\n", + "\n", + "```\n", + "\n", + "```\n", + "\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Homework tasks\n", + "\n", + "There will be three tasks worth 2, 3 and 5 points respectively. \n", + "If you get stuck with no progress, try switching to the next task and returning later." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Task I (2 points) - tensormancy\n", + "\n", + "![img](https://media.giphy.com/media/3o751UMCYtSrRAFRFC/giphy.gif)\n", + "\n", + "When dealing with more complex stuff like neural network, it's best if you use tensors the way samurai uses his sword. \n", + "\n", + "\n", + "__1.1 the cannabola__ \n", + "[_disclaimer_](https://gist.githubusercontent.com/justheuristic/e2c1fa28ca02670cabc42cacf3902796/raw/fd3d935cef63a01b85ed2790b5c11c370245cbd7/stddisclaimer.h)\n", + "\n", + "Let's write another function, this time in polar coordinates:\n", + "$$\\rho(\\theta) = (1 + 0.9 \\cdot cos (8 \\cdot \\theta) ) \\cdot (1 + 0.1 \\cdot cos(24 \\cdot \\theta)) \\cdot (0.9 + 0.05 \\cdot cos(200 \\cdot \\theta)) \\cdot (1 + sin(\\theta))$$\n", + "\n", + "\n", + "Then convert it into cartesian coordinates ([howto](http://www.mathsisfun.com/polar-cartesian-coordinates.html)) and plot the results.\n", + "\n", + "Use torch tensors only: no lists, loops, numpy arrays, etc." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "theta = torch.linspace(- np.pi, np.pi, steps=1000)\n", + "\n", + "# compute rho(theta) as per formula above\n", + "rho = # YOUR CODE\n", + "\n", + "# Now convert polar (rho, theta) pairs into cartesian (x,y) to plot them.\n", + "x = # YOUR CODE\n", + "y = # YOUR CODE\n", + "\n", + "\n", + "plt.figure(figsize=[6, 6])\n", + "plt.fill(x.numpy(), y.numpy(), color='green')\n", + "plt.grid()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Task II: the game of life (3 points)\n", + "\n", + "Now it's time for you to make something more challenging. We'll implement Conway's [Game of Life](http://web.stanford.edu/~cdebs/GameOfLife/) in _pure pytorch_. \n", + "\n", + "While this is still a toy task, implementing game of life this way has one cool benefit: __you'll be able to run it on GPU!__ Indeed, what could be a better use of your gpu than simulating game of life on 1M/1M grids?\n", + "\n", + "![img](https://cdn.tutsplus.com/gamedev/authors/legacy/Stephane%20Beniak/2012/09/11/Preview_Image.png)\n", + "If you've skipped the url above out of sloth, here's the game of life:\n", + "* You have a 2D grid of cells, where each cell is \"alive\"(1) or \"dead\"(0)\n", + "* Any living cell that has 2 or 3 neighbors survives, else it dies [0,1 or 4+ neighbors]\n", + "* Any cell with exactly 3 neighbors becomes alive (if it was dead)\n", + "\n", + "For this task, you are given a reference numpy implementation that you must convert to pytorch.\n", + "_[numpy code inspired by: https://github.com/rougier/numpy-100]_\n", + "\n", + "\n", + "__Note:__ You can find convolution in `torch.nn.functional.conv2d(Z,filters)`. Note that it has a different input format.\n", + "\n", + "__Note 2:__ From the mathematical standpoint, pytorch convolution is actually cross-correlation. Those two are very similar operations. 
More info: [video tutorial](https://www.youtube.com/watch?v=C3EEy8adxvc), [scipy functions review](http://programmerz.ru/questions/26903/2d-convolution-in-python-similar-to-matlabs-conv2-question), [stack overflow source](https://stackoverflow.com/questions/31139977/comparing-matlabs-conv2-with-scipys-convolve2d)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from scipy.signal import correlate2d\n", + "\n", + "def np_update(Z):\n", + " # Count neighbours with convolution\n", + " filters = np.array([[1, 1, 1],\n", + " [1, 0, 1],\n", + " [1, 1, 1]])\n", + "\n", + " N = correlate2d(Z, filters, mode='same')\n", + "\n", + " # Apply rules\n", + " birth = (N == 3) & (Z == 0)\n", + " survive = ((N == 2) | (N == 3)) & (Z == 1)\n", + "\n", + " Z[:] = birth | survive\n", + " return Z" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def torch_update(Z):\n", + " \"\"\"\n", + " Implement an update function that does to Z exactly the same as np_update.\n", + " :param Z: torch.FloatTensor of shape [height,width] containing 0s(dead) an 1s(alive)\n", + " :returns: torch.FloatTensor Z after updates.\n", + "\n", + " You can opt to create new tensor or change Z inplace.\n", + " \"\"\"\n", + "\n", + " # \n", + "\n", + " return Z" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# initial frame\n", + "Z_numpy = np.random.choice([0, 1], p=(0.5, 0.5), size=(100, 100))\n", + "Z = torch.from_numpy(Z_numpy).type(torch.FloatTensor)\n", + "\n", + "# your debug polygon :)\n", + "Z_new = torch_update(Z.clone())\n", + "\n", + "# tests\n", + "Z_reference = np_update(Z_numpy.copy())\n", + "assert np.all(Z_new.numpy(\n", + ") == Z_reference), \"your pytorch implementation doesn't match np_update. 
Look into Z and np_update(ZZ) to investigate.\"\n", + "print(\"Well done!\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "%matplotlib notebook\n", + "plt.ion()\n", + "\n", + "# initialize game field\n", + "Z = np.random.choice([0, 1], size=(100, 100))\n", + "Z = torch.from_numpy(Z).type(torch.FloatTensor)\n", + "\n", + "fig = plt.figure()\n", + "ax = fig.add_subplot(111)\n", + "fig.show()\n", + "\n", + "for _ in range(100):\n", + "\n", + " # update\n", + " Z = torch_update(Z)\n", + "\n", + " # re-draw image\n", + " ax.clear()\n", + " ax.imshow(Z.numpy(), cmap='gray')\n", + " fig.canvas.draw()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Some fun setups for your amusement\n", + "\n", + "# parallel stripes\n", + "Z = np.arange(100) % 2 + np.zeros([100, 100])\n", + "# with a small imperfection\n", + "Z[48:52, 50] = 1\n", + "\n", + "Z = torch.from_numpy(Z).type(torch.FloatTensor)\n", + "\n", + "fig = plt.figure()\n", + "ax = fig.add_subplot(111)\n", + "fig.show()\n", + "\n", + "for _ in range(100):\n", + " Z = torch_update(Z)\n", + " ax.clear()\n", + " ax.imshow(Z.numpy(), cmap='gray')\n", + " fig.canvas.draw()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "More fun with Game of Life: [video](https://www.youtube.com/watch?v=C2vgICfQawE)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```\n", + "\n", + "```\n", + "\n", + "```\n", + "\n", + "```\n", + "\n", + "```\n", + "\n", + "```\n", + "\n", + "```\n", + "\n", + "```\n", + "\n", + "```\n", + "\n", + "```\n", + "\n", + "\n", + "\n", + "### Task III: Going deeper (5 points)\n", + "\n", + "Your ultimate task for this week is to build your first neural network [almost] from scratch and pure torch.\n", + "\n", + "This time you will solve the same digit recognition problem, but at 
a greater scale\n", + "* 10 different letters\n", + "* 20k samples\n", + "\n", + "We want you to build a network that reaches at least 80% accuracy and has at least 2 linear layers in it. Naturally, it should be nonlinear to beat logistic regression. You can implement it with either \n", + "\n", + "\n", + "With 10 classes you will need to use __Softmax__ at the top instead of sigmoid and train for __categorical crossentropy__ (see [here](http://wiki.fast.ai/index.php/Log_Loss)). Write your own loss or use `torch.nn.functional.nll_loss`. Just make sure you understand what it accepts as an input.\n", + "\n", + "Note that you are not required to build 152-layer monsters here. A 2-layer (one hidden, one output) neural network should already give you an edge over logistic regression.\n", + "\n", + "\n", + "__[bonus kudos]__\n", + "If you've already beaten logistic regression with a two-layer net, but enthusiasm still ain't gone, you can try improving the test accuracy even further! It should be possible to reach 90% without convnets.\n", + "\n", + "__SPOILERS!__\n", + "At the end of the notebook you will find a few tips and frequent errors. \n", + "If you feel confident enough, just start coding right away and get there ~~if~~ once you need to untangle yourself. 
\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Parsing...\n", + "found broken img: ./notMNIST_small/F/Q3Jvc3NvdmVyIEJvbGRPYmxpcXVlLnR0Zg==.png [it's ok if <10 images are broken]\n", + "found broken img: ./notMNIST_small/A/RGVtb2NyYXRpY2FCb2xkT2xkc3R5bGUgQm9sZC50dGY=.png [it's ok if <10 images are broken]\n" + ] + } + ], + "source": [ + "from notmnist import load_notmnist\n", + "X_train, y_train, X_test, y_test = load_notmnist(letters='ABCDEFGHIJ')\n", + "X_train, X_test = X_train.reshape([-1, 784]), X_test.reshape([-1, 784])" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAsMAAADeCAYAAADYWw0uAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzsnXeYZEXV/z/V3RN3dmZ3Nue8pCUsOQkrINEAikgGBYmK\nCBh/vqKI4PuqqGRQQZIoKIISVQSJy7Kkhd1ll41szml2YnfX749zq6dvzfRM94Tt2enzeZ55evr2\nDXXPrapb9a1Tp4y1FkVRFEVRFEUpRCL5ToCiKIqiKIqi5AttDCuKoiiKoigFizaGFUVRFEVRlIJF\nG8OKoiiKoihKwaKNYUVRFEVRFKVg0cawoiiKoiiKUrBoY1hRFEVRFEUpWHplY9gY86Ixpt4YUxP8\nzct3mvKJMeZBY8wqY8xWY8x8Y8yF+U5TvjHGnG6MmWuM2W6MWWiM+US+05RPNI+0RPOIkFaPur+E\nMeaWfKcrnxhjdjPG/McYs8UYs8AYc0q+05RP1B5hjDHVxpi/BXXHUmPMmflOU77p6fVpr2wMB3zN\nWlsR/O2S78TkmRuBsdbaSuCzwPXGmP3ynKa8YYz5FPC/wJeBvsARwKK8Jir/aB5JQ/NIM2n1aAUw\nBKgDHs1zsvKGMSYGPAE8CVQDFwEPGmMm5zVheULt0Sq3AY1IeTkLuMMYs0d+k5Q/dob6tDc3hpUA\na+1sa22D+xr8TchjkvLNj4HrrLXTrbVJa+0Ka+2KfCcqn2geaYHmkdY5FVgLvJzvhOSRXYHhwK+s\ntQlr7X+AV4Fz8pusvKH2SMMY0wf4AvA/1toaa+0rwN8pUHsE9Pj6tDc3hm80xqw3xrxqjJmW78Tk\nG2PM7caYWuBDYBXwdJ6TlBeMMVFgf2BQMJy33BhzqzGmLN9pyzeaRwTNI21yHnC/tdbmOyF5xGTY\nNmVHJ6SHoPYIMxlIWGvnp217DyhIZXhnqU97a2P4O8B4YARwN/APY0whq1xYay9Dhic+ATwGNLR9\nRK9lCFCEKFyfAPYBpgI/yGeiegKaR1JoHmkFY8xo4EjgvnynJc98iKjj3zLGFBljjkXsUp7fZOUN\ntUeYCmCLt20LUrcWIjtFfdorG8PW2jestdustQ3W2vuQIZsT852ufBMMYb0CjAQuzXd68kRd8HmL\ntXaVtXY9cBOaPwDNIwGaR1rnX
OAVa+3ifCckn1hrm4CTgZOA1cDVwCPA8nymK1+oPVpQA1R62yqB\nbXlIS09gp6hPe2VjuBUsrQ/lFCoxCtQf1Fq7CamkC3mYNxs0j2ge8TkXVYUBsNbOstYeaa0dYK09\nDhmJnJHvdOULtUeI+UDMGDMpbdvewOw8pSev7Cz1aa9rDBtj+hljjjPGlBpjYsaYs5CZi8/lO235\nwBgzOAhpUmGMiRpjjgPOAP6T77TlkXuBrwe26Q9cicyELkg0j7SK5pE0jDGHIm5nBRtFIh1jzF7B\nO6bcGHMNMAz4Q56TlTfUHs1Ya7cjbmbXGWP6GGMOAz4HPJDflOWVHl+fxvKdgG6gCLgemeGaQPyZ\nTrbWFmqsYYsMd9+JdH6WAldaa5/Ia6ryy0+AgUgPvh4Z0vtpXlOUXzSPtETzSJjzgMestYU61Otz\nDnAh8r55GfhUWjSWQkTtEeYy4B7El3oDcKm1tiCV4YAeX5+awp4UrCiKoiiKohQyvc5NQlEURVEU\nRVGyRRvDiqIoiqIoSsHSqcawMeZ4Y8y8IJDyd7sqUTsrao8wao8wao+WqE3CqD3CqD3CqD3CqD3C\nqD06Tod9hoNVReYDn0LCZrwJnGGtndN1ydt5UHuEUXuEUXu0RG0SRu0RRu0RRu0RRu0RRu3ROTrT\nGD4E+FEQUxBjzPcArLU3ZjomWt7HFvWrZsqgdQAkg7BzkV4QArhme5JVa+JsrbHrrbWDsrFHcVWZ\nLRtayaRSWawmH/awwTXdtZuC7FBvJdDI9mQJAJvqZTEhUyuDCcWb43J8fesThhPEaaCOBPHs7WFK\nbCl92k+0M4+fdfvI6o4N1ZLGfhXbAaiOyWeZZ9ZE2gkarfy4MS7X31wr91uyMdhhW204CdEoAPHx\nYqfYx3Iu29DYapK71B4muJGg7MYHyj67D+9cubJp9qgPzl1vi4C0fFAnNo5sD/LBliAfNOQ+cTxB\nnFq2NVlri6H9OiTr/JGJDPnGxOQZxvvJPcb7JgGoLK1P7dM/Ks+/xMj9Fhm5f2frOivfN8cl32yu\nFzsVbZHtkU1B/nH1bStpydke0TJbFq2kfrg8owF9agDoG5F0l5qEXDvILya4qHu2H9dXy+8LmsJp\n6wJMRO67fqTYoV8fKYN9o5K2MiPXdHZ0NFmxfY0tob42wepF9Tsuf+wATJE8q3hlsXyWuh/ko3ir\nPAOztdY/dMeXly6kcZiko7RS6omKqHwOjYXrDVcHubzqytf6hJTNmuBze20JyYYG4mvWZ20P2AE2\n8epmd99Ah9s9vk02JeXds36OlC2blDKTIE4dNSRt0khSdpw9TLHk66YgXyf6SJrLS+X5Vsak3FdE\nmp93UXD7zgoN1n3KuRqSUi9vbpL7bKqT7UVSzRHZ4tWprVDPdhptQ1aG7kxotRHAsrTvy4GD/J2M\nMRcBFwHEqvoz9qtXMePS2wFosFIhlpiiTiSjZ/CXJ2t47oVa7vnj1qXBpnbtUTqkL4fcfTpP7/I0\nkB97NNlE6NorE/J9TuMQAGbUjAfgsY/2BqBopqwoOeofUrATcz+SE0WiofOuSSxlA6tZyZLs7UE5\nB5mjMyfWvdSDhqiNx8O/77UXAB+dKYXns4e9BcA51a8BsJtn1prgngGWxeXHBzYeCsATM6cCMOFP\nYo/oC2+Hjo1W9Qdg480DABjwNUlLfNGSUFpdQV1jl3eZPUyRVDi2SRreGz5/CAAzfnwH0PF85PIC\nwPzg3B+6fLA9yAdz9wGg/A1p9I14Zg0AifkL3Q0En0EjJ2jctFZhrbHLeZ/p6cuWtrBJTvkjE36+\nCfK4S1N04GAA1p0ka4xsOkYWTDp6YnM0xi9UzwRgUtEmAIbH5KW8LSl2mtMkL5QnNu0rnx9KeRn8\nD9mv8m/vyCWDToNrgKenJWd7RPty6NAzmPPDYQCce8DrAHyyYi4AuxRtBaBvRK5VZOT+5zXJNS/
9\n8Ez5/TOyUJjLT11BpELqiXnf3R2Azx4sZfGoShGp9ixeC8CQqOTlaPCMlsfFPtPrxzDj2Q3cesWC\n7s8f3YlXD8SGDAdg3TFjANi0e/BzTH4f+W95NiXPvCk/uHo1mdhx5aUr8O572YVSr0781CIADquW\n+uI7Az4KHZYI6otoUH/UBuXr91tk/YpXN0sZfeP9idS+NYv1dz/Ypj0kKTvOJn7d/PFFh6Z+m3FJ\nx9o9rl525fevNbK43e+m7glAcrt0NNewgg/tW+mHdp09vPc7yUToa2z4KEnDp0YCsOEQuccDdpHF\nKo8ZIHXSYWULU8cMicqzLg3ua1HwOv+oUerjBQ3y7nlq5RQAVr4v34e9Jnmqz1PvAmlCTCtpfMM+\n3/69uXvIes+WtNbabvHGs9beDdwNUDp8VK+N45ahc9KmPap2GZJ3e7gC5j4nB22YyUH36+Q+swD4\nyWDJeNHDZYeFl8vvxzx/JQC7fUfaeIl10kgmEgFrfAu0aY9KU926PbyK1TWCG044QLZ/U6754pT7\n5bNO0njRm2cD8N+XDwSgapEU0FidFOR4WXPhqRkuRcEV4u8d8ZSc4zMrAbhipVzr1d/uD8DWYG22\nj/aRBugnJ38VgOKgMZyxwe7dfosN2djDNTADNu0R3i3hq49Z4vIAwB7FZcGnNKi+UCHP/4bB0imI\nThMbv3ulVESnTb8IgLG3Bg3PV2X/5goqc6PYI7RDVvbIhFc5umcRmbIrAB9+XRpsdx39BwCOLf9n\nFietCH0ricoL7YjgUkcME/v8MvhMTJP7/p/vSCfixf+Vl2PfP00Pp9GGXy7pyQ59SbNHnwGj7IZp\no1h84p1Ay5emn1bHXoGK8589/wzAycXT5NydaQx7ZTTSrwqAB06Q8nFYqeSX5sZO62mbECinfSMf\nU19c19ouXZc/upO0RixAzRelTXLtjfcAcGy51DN+46/pdNl/8j+lPO16uTQikrUtleKAnmkPr5yP\n+ql01Bqul+0v9pEGz8i3ZejtrL4bQvuvTUgD76RrrwGg+t6gvBhp+05OzmCNXc76Vq7cMik7wCZB\n/ndlKDpARl2uOeOxFrtGcpyq5SvIB5TKO+me/p8AmhvDGRZ465w9UvVTuP5uOlbeg2suljJ691RZ\nT6RlOQ/fa8KWtLiE22cv6UewV/DOoUI+Ux2mPYNznBHUqf8jdeo//ng4ACNvfw9Is4dp0f5ok85M\noFsOjEr7PhJY2Ynz7dSMHBZl2Yqm0CYK2B4llFFvQy8ztQdqj3RKKAMoTttU0DZRe4QZNiwKao8U\nmj/CqD3ClFBOkpBQUtD2yJXOKMNvApOMMeOAFcDpwJldkqqdkAP2KWXB4iaAYmNMMR2wR649xu7E\n9eySKZ/i4HvQMxwdE+Vw8fG/A+C2g6Vf9NQXDgagcs486tgGnbCHrzaZEulVzrtduoiLT/htaPfJ\n918KwISfiJo9bvusNk+frhu6/uqA38vnXxHl4tarTwbgratuAaDo2jdD53Bq3PYhUpRSNbPXI66k\nP3XUQGfsEWCT4e7u3vsuDH13Q86ZSHjKcjb4+cD1uPcMVMb5R94HwJZPSIP/gAeuAmDc90QRanaf\naH6mlfQHKO3yOsRT5SLl4tIx/3pxo3n9i78EYHBUXBucPRIpn7U21fwQztaxIDf5dnLbUyMrN0me\nHH/oxQBMuuKNVJor7UBIZm+PSNxStr5ZUXZ50VeSfHXG4YZpzZgRsmHO/LZvNheics1RsdogbWXh\n3zMoR463G6qxu1pgQ9fnjzzQ//KPgWZFuCYpPpTuGSSDZ+fyzeLjpCLa+49nADD05LndV152ACYm\n9+mU08aDZHTmuGA0JhHkD5cfDv6b1B+T7nk9OD5wK3IjbR0oL92Jf3+LvrELABdUNa9o31VukAMj\n8pZpHDsIgMjyFQBUmgEkbZKusIdv7+jEcQCsvkmu/fb+v/O
OCCvCcYLR1yBfu9HK8kgxPn/YKu/a\n2xZMA2DDhmDUKJjHM2mUuOLdOfFPAIwrkt9/PFhcz274ptSpV3xJRm9nf3s/AGL/CbmMtEuHW1/W\n2jjwNeA5YC7wSCEvNxiLGW6+YRDAZNQeREyEXdgH1B6A2qM1IvLi+xitQwC1h080ZkDtkULzRxi1\nR5iIiVBKOag9OkRnlGGstU8DT3dRWrqc9AlB3vBBt3D0UcUAH1hr9+/2i3UzroferJ6G/S9dD3BL\nUpTAy/vJXMq375WJISuOLGEQY6H+1dzt4SnC0UHSAy5/TL4vniC90vWBX9lnvn01AOMeFhXBPWk3\nmcH3sXXKqom0oqAG923j0osf9kuZfHfc+6Lk/fa3vwZgYOATXBURRWPLRDm8fxu3NdAMA9uJ/OHs\nEiie0UqZSHH60Bmh3dobYcikxrWGm8DiK6DuHC4fuP2Kgt/nnye+olN3Px2AwSc3T0YL3YtlS1eV\nF1/NiOwjs5Om3vsBAM8Mcb61MoXfKTXOXq6+KDFynlzs1OT5/Prqj7tWQ1LStujUuwDYdbOMZoz5\n4etu16ztEWmIU7aw2dfS+Qrnkm6ALVPEv7GiCwMw2ZikZXRMVJyW/sxts6RxYCp5O1V96pXRSKnk\ntT2qVoV2c/nDt0fChuukf+0ndd2Z074uG174y85lj4DURNGAjy+U7wOj4UgGMxqknOz2q9UAuDEa\nfzQsbQJXfu3h+QrHhg0F4GdnPNBi1xit531XN7jJuL5N/PLs1NWaUZK3Kl1SIoZYsgibtJNzvQ2H\nX4fWnSzzba6/6W4Ajgiin7jynKnOjPp1ajBad+Wq5kc180ZRcPs+Kb6+1fUyMlWdSky4HXDJgVJX\nHnfPqwBcVS2TMd275+bhMmp77k/lnbzm0B3nM6woiqIoiqIoOzWdUoZ7OuFed3aKxI7EAolkc3+k\nWb3unrSm+4o6v8iOqGDp+1eZILZv0Lv9/ehXAJhy5WWy488eyi2RxqTU2Ui5dEOrn5BzPzj2xdCu\nhz4kM42dIuxUmGQQaqW92fFtus66cFyBn3LRPyW01sm3fhuA978pYXKcTSv2Cc+GbvvkHSQVrizo\nlU+VkBan9Hkh2EHyTab4lU7Fn9lQEewt3ebSIOZrVVoMyDGBQuD7eLmeftJT+spNEF/SxYkNfCLf\nOSDw87rnAgAmfznw4/LD4HQCX81oOFF8x355220A7FciaXMKgsvzvhbg7tXd4x+2DE399uByiQaw\ndqvYrm+Z2OqcseLz60ZGmv2Pwz6xTglsIqyQPXLurwD4ziPny4b3s7plIZnEbNvO7EZ5ri4CSKaZ\n3JnYPEH2az2+Qwfx/NZzrdsW1g/uytTsOLwyGhkueegL/R4NdpB8kHMM8NZGsXYGPP99s7+EyXrp\n8FsBSFjx53d59UvPXA7A5MUy2tXCV7iH4fsKz/+G+Nae3OdZoPm9CM11gF8+X6iTkvf/5n4OgLf2\neyR0bKru8Orc7cPleKcMS97rWD7x7bzlLJn788zPbgKgf1Sek1+HtuYD3Fpad33lHADGnjk3tU+f\nuNSdSVdXBHnFRWLysTOkcnz24iMAOOWP4iu8LCH2O+/FCwHY/cdrggO2Zr7hVlBlWFEURVEURSlY\nepUy7PdGTlvUHEz6zXnSY4uVBStm2e7qaf+/rPc0Sw3RS4q58k/iR/PrYaI+5upfly3pSpFT8rqL\ngUcHEV1+lsNBRnqFrnc6/y5xfVo49l6guaf8Sr0owBN/Ln6oiaBH6RThLllJy8U0doskBP7Hw38u\nPsSnfvYYAP4y4d8AfH3SiwD8ua/MIk5u2xbcU9jvqTM4H2cnOm/YXezg8omvJPjf939JfK7GnymR\nDSJ9Jc6uCWb+m/79UtdqGCOLiaybKtcYc7L4Zz068R+hc/p51eWxChP2y3Wz4w86V9LQ7/6Uj2yH\n8dWM+NHig3bH7b8BYLf
isJrhK78uzU7F3uu18wEYfofsV/Rys0xb3CRxtEd6z/MfMQkEf9/Znwbg\nuh9IHNnjyyXf+CqQ/6z2KhY7zbtY4vLytRwMkLTYhkb+ECwU8/Oh7+RwcDO145rCG7owz2aLb6el\ntdVt7d5j8cvoxoNFGT6wpPXy4uOikbhf790ssVTT8+LOhG+PeZdKfh8WC49DvBvUs7veJfWmG0do\n4SvcU/B9hceOBuDOL94d2q01P2EXacH51V722lmy74ogppFUYy3ixft5p85fpqCjoweRaIs69KEb\nfgFA/6g8J1dfZVKCHf47x7XBxpwm+demjQi2XHhInrqNtz6q6vYniGF/+afOk++bRAGevE7aT/H0\nyEXqM6woiqIoiqIo7dOrlGHXK3E9p4/+uEvqt8m3i6LnL5fY1SzOYV/b0Ejio0U8/+dgycYrpWfT\nUd9hX115tV6+/+jcrwAQ79P8uFcfLHZ48HyJjuD8KnP1N3T4PeCrxkn8yJdzOYkVla/uczKDdeFR\n0svelJBYpc5v6cIXvgzA5A1ir2aFsPtUBH829NbvSGzWVX+SlfjO6hv4mR4ufl9uOdUsV6LLDu+Z\nbDqgKcOOgq8slL9ZHvrd1osak3RlYXPzyqbRxaKEDv2vHNzwKznXpz4nPn1n/+xJAC6qkhGAbEcz\nDrpCntn8PwZ5se1baB2nygQ2darMWbc9ATQrwr6a4afxkRpRY2+7Slb7Gv3kjPD505VRz5/N5Qen\nXPX/gyjdN82T6BnDH5ZoEU75zRQD2HHmJ6R+urHdm0/DWmxDA88vDyaPB8qwry62x5CRm3K5anZE\nOqezrAyezc5O4xkbQ9/bq9v9PPrb50RZm9A0vXsS2F14ZTQ6WeY3PHn0LcEO4bjTnw9GrSa9Jys3\n9nhfYa9e//BKWWb76DJ5fm3FFPbflePuC5Yi/2R4v/YiYCWGNngbEuQkhaYulEi1i/b/hczpmFAU\nVoSzjY3s39uqmyTMUjkypyY9glOuz9bfPzE/HF+/syNaqgwriqIoiqIoBUuvUoZb0FoY2cA/0ia6\nKbpEov1dfPp+3DWRB3xFaFsyWNXnDQkgGklTN0c/J/+fE78SgDmXS3QE358pW3wl+YCStTkdn8IY\n+l31cWhTxJuZPvilotS+8rkD+nRuNrTzW3pNYiMe/heJaLHwdIlhu/SzkqbJz9DlafPV6cN2XdDq\nfk7d9/27qudkGA1pZcW61Ize1Ox4OWfZE6KePvGe+LlveVJiun+rWnrpvrLlz6B2sSAPP1niNvPo\nn1pPU1t4M/ZX3yLq67mV64H2fYRv2xyslniKRIgonRfMXHejRik7p5XLpFOCvQIe2M6tcpd8XfLF\nFx75JgAfnX1HcKZkkAaxh1++LqoWZTkXZdgmkyTrG9i0Mqyi5jqytO+g5QA4nSWlemXjr+mcQb1n\n0ll/403by9rfqScRCSuFLmrCE3vfGewgSlumWLN+mZ3bKKNhk+9eJ793fYq7FT/KwoeXS6x4P+LJ\nqmDUb+Kd4Tvs8b7CnuL98Oec4i333dpz9uuh69fLKnyx50WNjZ90cJuX9lcWHTkkPKJjE4kOCcMA\nS78n9fmzQ6Qd4Neh7eHf29+3S33Y92WZa+Kerv8OA5rr0JKSlr+1hT/6lGzZjjL12ftRqzKsKIqi\nKIqiFCy9WxlupZdkE0HvwVd48ojp5sXxnBqejlO4hrwVdtpsb/WybBkY7YCyU1FGcure3DfexfAV\nxc+t8uZWnBvwRqCWuIgPrfU2uwl3LacQT/6BxDr89XFjAXjyBIlkcJURP/CUb3rzimu5461qFRsh\n/mkXDnk2tJtTI/wRghfr5JmWzfR66cEqe62peJn8uVzc5fgSUe+f/cY0AC69TxTiiog8M9/33B9x\nWHFckOld+NVsMGJ3l7YNFx4CwFv7ifrq+wi7NDi1YlajRI34x3mSZjtPZjh3ah5BYLtkf
dh/b+zf\nJfbv7NPks8m6chXeL2Lk+M3JDkb5TSYoXuevYhb2FW+PaVUS+3MhEnGneXZ3LpnVq8SKOvdqSSZ3\nLp3Gj5qw6Gp5JiNj2fleurjvLsrPif++AoDJ88PzIjrkY7+jSIsUkIqyMH4sALefdG9oV1cvHPem\njBCNDEZUdjZf4blXSdQdFy2kLUXVH7F56K9HATAamS9gY22XN//dfMBAmdfxQSd8ZU00QrSikq+e\nFn6X5BrJyr+3+1fL+y+xbl1woVZGcQM12anr3376sZyumQ2XfnZ71vvuXDWOoiiKoiiKonQhvVsZ\nVoBmNTylBKZhMsT06ywdiZHc0C/Cks+WtVjtxqklr9WL3xlr1oUP7I7V3jLh1OjAp83Wis/bc2eK\nSvnbk04EYCSvt3pch/D8MRsmS+zSaWXh+06psKlVj+QZ3Ll6GgCJDcHM9k4oCX7cZefvduhMifAx\n68CHJQ2eEuyrGp+ZKrEib8vp4oEiEyhQx17+auhn/xq+Qn7WrVcBMPzNILJMoHK7e+oU3kiT89P/\n9idPz+74uDv+1zlfOlbbuZjpny6X8vTIy7JyXywSzlexYOiqKJJosa0kIgpZn5jYsDwiZXZYcTiO\nTCZf2UxEozuwTHeUdCU0UAoT0/YF4LXDnQ9pHyB7X+HFTRKdZtdbRNHq8fF20whFCggS/tFFw4Dm\nuNtOIXf2GHpbaegcPfY+PV/hyF7i7/v08a68yjureXVLIX3VVzcqML1eytH4ByQSj9PATbLtcuxH\nojmqSuqYORVSblOx7XOgqbqMNV/Yg6uqXwqlN9f3tz8atXybxKyvQuZxtDkPISa/+e+zrqBvDnKv\nKsOKoiiKoihKwaLKsNJjKC1vZNIBS1Pf/TiLL2+TuNGJLcGa461EQdhhOCUwSEPyPfG7HPle11+q\nuVcdRE84MKym+P6Ifi/9jfcl1uNkgqgJ3kzvriD6z/7yj4SIbqHS+qrGkZWyemBOynDAhi/LRW4Y\nIr7C/kxm//u3Vk8FYPhNb8gJnMqTiyKcY15zozDxxUvb2bPzxOo7d7xTJd1qit1BrnHLI5GdQBlu\nhfrvbgZgYFQU4Vx9hY96XqL7TH7Pj6HeM31ogRaqKUBsmIxeXXdKOFqMs8Phsz4PQJ//yMhSSmXv\nQXN50mkZV1h8wTOtculwzxean/GZL0lc80mL3gpfJMcsv2+xqK6RalFhO6IMJyqSbDmiuQLpaDQp\nn621wXwftyFV/jM/30Q3jPDaHCbpqDKsKIqiKIqiFCy9ShkuM+Fe2X3fvin1/7KrpPdU5PVMkkF/\noNRID/7jpmoA7vnmKUDaSmI7Qw99J6dfUS0nD3034++Lt8vMXaz0iHuEmuBUWLdCWaSlStLV1O1Z\nF/ruK8G+v1e/WV1fzP0IHoPeE9/plip16ysaHlC6ssPX7nt6+Fh/JrP//V9/EH/uIcnAV7gjZbmT\ncXPbpRPRRqL1nUtbTVKUof1eFcWqqT6wT12Qb1wo4USaOu75NxqXBZuC7cPlnAs+KVEEsl2h0BE1\nPdR3lNbzz5azJUbs9L0krrC730yKsO8r/HFcfIV3uUXKtrv7HutDm0Zrq2wuvnA8AKf3lSgFvnJq\n7grmf7BoB6Wyg2SIH/3fY5yvsCjEmXyF05ViZ4Px97X+TCON4TLVXmQnF7GpcfRA2X/psjb3b40B\nZds5a8qMrK+ZLU2Nub9zch09ygaTbUgdVBlWFEVRFEVRCphepQz7PYu9ikvT/m/dsc5XrtYWi4/f\nb8u0n7Cj6RtpYFr5R7gZ2H4vtaYpvEKNH9szr6RWKOvi8xrTwrf3hF3mhL67lYn8mcBOpR0yQ3ys\nU2pTV8Rl9m40tmYLANsC9aMk2vZa9lWR3KONmLJSIrvuzl2TfxtsCc/Ud/fv1LhX6+X7iL8uAZpn\nbXfk/l3kCbdKku0ipdg4n/MgKkmufoMAsbr292mLl
YE9xn5plkuUfOZyj94xLnYon5SPXFfFi/XE\naBJ+RIHS5vfLkVdPz+lUvq/wkf8OfIXf2Xl9hSN9+qR+uvLMx0O7OoX00/NPkO9/eyN8rp7qK+y9\nYxZcJc8e+XpKAAAgAElEQVRldBA/OpOvcGu+t6fMEz/p6Atvy7m9aDaRdmJIuzaKX8/XjJLzVGZ9\nV830jdTzyYq5qe/+3I6OYq13HhdppGc+ZkCVYUVRFEVRFKWA6VXKsE82sxP9WKT1gbJhEj3fV6u3\nUWwMI9tQFGsapQfcJ+MevRs3Q/uiQU51EWUq08pzf6mR/SPzZbW4VKe8K+RrL/axLZVnU5TB78sf\ngXmj3ukYK7K+ZGNllOXH9GNykeQA3w/VKeFOjbn6w9MAqFqxQE7Qno+5p266WMoA41+R364e/LSk\nOiHKUHGOUkciUF76RUQNeq5mDwCeP0k+WZLT6QCI1YXrKj8KS3tEPUdlF20kJyLhCB3JvqVt7d0u\nsWg3SEh+RJAc1X3fN3bpNfumfntmiKya2V70CN+X9K0GURZ3+7mMrKRWh9yBq2p2FD8qzYqL9079\ndlGVxAD3ldN194wFoB+rgnP0UAXc8xW2h+0DwPRPuPg3Ugf5vsKO1uJKb7p/FAD9WQ6k5Sd3yaZw\n/oxmiGDjq87bh8lnR5ThskiSKcXbcPfTHX67OwuFe+eKoiiKoihKwdOrlWFl58LQ9mzzFjpOd/Zi\n24sr66tKue6fVRpEPXDqRM3+o4GwL3w6vlJ615IjACjbulh26MboG/UjRCmtipS1+ruvWr9TNzb4\nb25ru7dORQIO25x2zrAfqh9VY8v0wZImRBn24zW3wFO7zR4TUz/9dOjvAOgflfscG3NKd26+z83K\nodjptjpJY7wDM8EdsfrOKf0Jz08wtVJlR3yGAyLbOhf8uEuiSbQT4SWlSrqIDZnyhacSxsZKOfz1\n+b9tuWs7+pLvK3z6o98AYPzc18Np6mlKaSs4RdiNoJxy3n9b7OMU4YuXS0SXfveHV+bcGe4TYPk3\nJZ0ufnQmX2G/Dv7+mr1Sv1U/JJGpMs3daM9n2OHXc3VDOl5WDFDagdVi2z/xzjeyrsqwoiiKoiiK\nUrBoY1hRFEVRFEUpWHq1m0RWzuDeZKJSFy4mmselfguUJptkTaKOkUHYGp+KIm/54O6MqZarW0O3\nLMpgQq4g6/YKF1d/qC7qDXevmDsEgImIm0S7bgK5pMw714Y9/GVIwxOJkjZ8zVc3Tsj5mlUl9Zw4\ntjmsnD8k7U9mGTDHu8928ot/T2sPTC0mSv+oLLu6KVEbvpb32N3waKYQRf5iDM/MlGFUt1R2R4ht\n7/mTrXIl1pnlmFMTIeUcNi4PyYX+MkXy7BKbt4T3zxBSzs8Xc/6flKtjy5vHtdubOOeeuyur92+V\nhRIm/XKhpCXYb+eYOBd25Vjz1f0B+PGg21P7+HXTzN/JBLSB9HB3EM+VLPFJmSQ58+A7gx3kfjJN\nnPNdt/7+58NTv42IB4v+lJT4h8mls3ST8EkMzWFZeY96a5jTFOXAIEmZFknKlYjvJrETLB6jyrCi\nKIqiKIpSsPQqZdjv1cxqbJ7EsSye7XLMY+QcdT0w6Hsvp8YW81r9CE6r2NLq72UxeUZujYHuXKrU\nTQrxF7zI+TydUkBsSM1M7r0t9KsfesdXSgd7ce27Ukn3FazKY1fndPzcN8blfM1+0e18tuptXB/e\nV199NaNiqai4WS9t69ln02EtFRenCPkTZ7KlyAu5NGBm5yevxOp6vpq4w4hEU6peZJ/dAVj6A8kX\n350iSwMPjkk5umbWqQCMuWY7APFFS+QcKaVYjnN1QGKaqIRvHO+W4m0O8thaKK22uOl2Cfs3ZE0n\nlgjf0XiLbLg6cv/z3muxqysfN6zfBYBB974lxwa/7wwKOMDGb0recPfT3sQ5NzLw1xoJdDb6ruYJ\nwin1vzGYeOiFM
Ix5c04zTcb06/2RQza1ex+Z2Bjvw583HsSBw2QhEH+ic0eJFe0czzcdVYYVRVEU\nRVGUgqVXKcN1VnpcFUZCT533f1elfht8e9ADz1LxK+HN0Pce3WPvJWyOl/P4un05reIFoOXiASPL\nJazWR55PYIeWjyW8qILLDwtuOhiAmV+8CYDXGqoB2JaQUFjv1MrIweamcAixiqioiJPK1gAwqngD\nAPsWrwfgi3POlR2PyyGBNqygnL1rOE865cBfnnNVvAaAfrPFXs6KXaKkez519hAJtP/4Hs5fMLxE\nssOlzSkoo58Vey/K4dKlxrJLUR1+gHh/RCi12E48NyXcV6vG39v8/6FPXwJAJB4Oa5QM5haUbJFj\nl35a0rDo83cBzffrVGynHK1PiOI06A1RdTqj2SeLVNNIkUwQP2o/AH75e8mT+wQ+mi5fOPXrg4Mf\nAuBbf5kKwOxpouYltsry5SYaXoq39EeyUMTgILyW8xOGzL7Cvi/xifNOBGDIzfI+8tXWnoy/6Mim\nM8TOz466A4CaZLO0WRGRd/DD9x8NwPCmHq6Ae/Va4/EHAPDmfne5HYDsfYWveUmU/11q30/t40Ly\nEQ/qmViwWNLqtbK59aiULZPqaZj7DJBFPOZFojkvd7y5vozH5+zNL1PKcG7LpmeipMh7vt05v6eL\n0FpUURRFURRFKVh6lTKs7Nxsry1l+ruTYZwow34P+IiqDwFYWCEz8JPbAh/a9ha8yEBrfmv9PpRz\nnTrv9ND20RWi4N07+uU2z/mnbf0B+PXST4W2r5s5pENpxFqik8YDcErlQ8FGkRCc2uj7ed27WRSb\n5Acfeefq/Az91AIGwakafyz+3QM9tcwpYX4Q+hvW7ynfX/0g5yREMJS3osBlUojrh0sEiJJ3aDXt\nLfBGFqIvvp36v2+Wadz3/w0MfW9WWiRN7hk9uFWWX07O8Z5RB0iUdkPQ/M4SzYPOYkR53O1/JW85\nRXhLUmYZuIUuHE7J/PlQySAHfv5SAPr/QSIeOPVy+fcOBWD25OyWXIbmvOj2cVFImn4k9UCElUGa\nwwu99GT8+nLEhQtC350aDPDrTWMBGHWv1Nk7U7QMgKYrZWTP1SXtPXN/+2vHil95Iq14ZyqlziJ9\nzH+C/6TeyrQAlT9X4piq2QAsqNgfU5NbuYttiTD46RISR4Xza674fsxVZWEH6O6c39NVqDKsKIqi\nKIqiFCztKsPGmFHA/cBQxLXtbmvtb4wx1cCfgbHAEuA0a23HpzXuJNTbWmbzJg3UYzCMYByjzSSa\nbCPvMx1gijHmX/REe6TPtndKRKRz8ZSXrWji/CvWsnptnEjE8NWzK7niq/1YvqGO8y/dADnYo3iz\nZfSTSRpOdr3wcPacViZqyh+GHSMbUspwB9WVVuLtDrw7WC707vD2QMfhO+9IvMzrBov/br23vOqd\nX/wEHyx6jIZtvwvljxF2YYfzx7YpgwDYo1gUYV9tjad8F+X7X5eKH+/A5Hygk356GZa0XfAr8a1e\nOOXOUJp8ZeHjFY1c+I31rF2bJBIxbBwxhQH7HcHA+hdztke9tcxvsuyTISZmPNBYokEff/2ekpYR\nTwUnyDV2ZqRZmTFeOTFl8izc6MSyH4h6OGeCqIe+Pdws9NUrk5x/xVpeW7oZjGFsYjijzSTiMcus\n+KvQgfwRL+ucprE60af9nXLEFmWvVi9b0cR5V6xh9doE0aAO4RBo2loHMMkY8xFZvGMS1X3Y+NkD\n+M3w2+R7IEZlWiLcV9hKN4XrA3uolKO/XfLzYEvr/vCt4S+7vO9TVwIw+b8ST7qtMtnWO4Yc7NFl\neP60zif70QmyHHVDK2X/9/eKb/TwDZ33Fe5We3j3Vv+ZAwF4Za/wCyBXxXRYhlj5XcGyFXG+8o21\nrFmbIBIxnHJGCedd0IfGAeVs37qNXOwR3bSdqkdncvU1ct+/HjYTaPmOyZXhfWT
EcIPb4N61kR44\nihWQTS0aB6621u4GHAxcbozZHfgu8Ly1dhLwfPC912MwTGIvDjXHcQCfZDkLqbFbWcKHVDMY4AMK\nyB6xmOHn1w5g9stjeO2pkdz+hy3MmdfITbdt5cjDS6DA7GFMhMmjj9P8ERCNwY0/7J/KHxvffZX6\n9asL1h6uvIz9zdcZfeNXU/ljcWIO1ZEhUGD2ALHJ//1wAO+/NCpVh2xfup7lf54BsE3fMc11CGqP\ngrdHLAb/98Pmd+5D99eyYH6cxZveIBYpptDs0VW0qwxba1cBq4L/txlj5gIjgM8B04Ld7gNeBL7T\nLansQZSYMkoCn82YKaLc9qWBOtaxkv04kgV8ADnaw3a3s4pTtVoRTreO6piPkGPYkBjDhkg26lsR\nYddJxaxYHecfz9Xyp0eq+dGNQJb2MNvqKP33e3xrlahsvxomgXKdb5+bxb3+kMEA9P9IYhF05cpq\nKSU06j7l4SQbRX3dFhd7OaXA+TW7HnSpLaU0Voot2kYRxfSJ96MplmBd4yr2jx3Fgnju+WP93uHe\ntD/jN2HDEQ5q33R+q/OzOX2zP3C0Za/dKTnOz3bhLwJF+EthRdhXEJyP3dihZYwdCt9fI37eVVtm\nUPXQTD7sQHnZEK/ggY2HsE+GmJi+j/kxXxQVbu7/Bqp24K+YtUoVci4OrhKoy04RbjhJZp0/9dX/\nC/arCNLix0CW7668RG/uS5S+lBPUH8nl7F90DAsS70GO+SNR0rnRnSWNA9vfKUdyUYaHDYkxeLDY\n1dUhi9fXsOG1BdAsLrVrk3g5bNjHtvDzjHr5wvf/vOBjWSWs7AnJL7ERwwHY/w5RySYXSb2TjVrm\nrzT3g7XiI7/b98R5NBvf2bbeMeRgj67C97VffH5Q7ozzg5fPb62emjpm5O3vAmmRbDrhK9yt9vAm\nEBRfuSr0PRv/cEiLYOOR9JeobANXZ7S3+pv/zt1tUhF16yxrGhYTLUsp0tnZw0o9+Modh8j362YG\n6c4tqoRf906rngfA45UTgbQILWkjbD0twEROzTBjzFhgKvAGMCRoKLsG8+AMx1xkjJlpjJmZqN3e\nudT2MOrsdraxmSqqaaSBEiMFNlt7NNHxZRR7IkuWNfHu+w0ctG8p69cnGTIkaKRmaw9b39ouOy11\ntoZtdiNVkQE0Ul/w+WPLiu1sS26kKjKwQ+WlblPvskfj5o1p9YfmD2iuQ/ruOozGTbUATZDZJqH3\nS03ver9Ay3cMOdijN+aPXO0BvdsmS5Y1MeuDJvabWkxjYw2RSI7v3F5mj86QdTQJY0wF8FfgSmvt\nVpPlDH5r7d0EHpilw0f1/CmFWRK3cWbxOruwDzFTRLYdwHR7VJpqC7BtdDdLw24mZ1pXLLr7ZAD2\nu/jdLrnElpo4p16wipuuG0hl3whNNsJ/ascCba9M5tvDNjUy/RbppUZvFL/cpmS4C7n+aCnA/e9r\neV+dxsXPdZ+JsE9ZpL0iE5Fn2dRYy3u8xOTIVCJN2ccSbS1/lE7d2OYxvlJV/WHYHiaYVd/sWy2/\np9SaQFkOqTfBtsiUXeXQW6Rnv2AXUYSdYuL7TzqFxG13+73w432Z/cIdTGZvoo2yj4nFgldZZtLt\nUTJ2pH3snX0zxsT0Yxk7/7fxv7kYgElXyEiDTam8niLu2yfteTXbKoixep7k0d/9+FcAjCuqCF3b\nfya+sjTo5c288+49ofojG/WstfwR95ThTCtXZWJ544Cc9s8G24FoEjXbk3zxgtXcdN1Abu5Tkt11\n0vPHuJE2Wdn8zPwRk0zEImL3zefKM/3St58D4KpqGXnKJXqEe+4ugsWr35WRlOJNQYxwz0e1LTry\njmktf3QYLwZybPxYAO489IHQbi7PT//Jgalt5bXB8pc53G97dOU71x8d2n7qQQC8sttdoWOz9RXO\npOZ2h4esu5YrL7++bhCjqvpAZBPJovbbZS3
sYQwDfidzZfb69BkAzDrwYblGMCqbHimkNfyRsLP7\nLgHgb2OPkA2ztmZxZ/klqxrLGFOENIQfstY+FmxeY4wZFvw+DFjbPUnseSRtklm8zlBGM9iMAKCY\nEhqsVICFZo+mJsuXLlzDGZ+v4PMnSaOgakARm9cGy04WmD2SNpGWP0YCmj/mvXY/A8dMTSsvpQVt\njw9m/5EhQ/ZWewS4OuTMtDqkuH85QBEUnk0yvWNQe6g9kPJy6gWrQuUlVto3JeIUmj26gmyiSRjg\n98Bca+1NaT/9HTgP+Fnw+US3pLCHYa1lDjPpQ1/GmMmp7YMYziqWuq9Z2cOUFBMdM55jvzQ9tD1X\nZcfhq1GHlMoKZC/NCHxui5t7Z5/vew8Ao71Zr7nOHrXWcuFVa5k8KcbXLk6LxrrfXvzyntQs7uzy\nRxAntP990ks98LQvAjBj6qNAs0Lz6CdEnfzhiJMBiK+SVd+6UoXoKDaZZPbHf++S/IExmKJiThoz\nO7TZ5Q9fhXwp8DKp+q8oWs4KydraIHGtSynRgaIM1hw2IbVt3dnSMPvPQbK6lJsd7SvCTqVwaXFq\nrVNU9vr15ax68mEq12xm0trSlCIzqGkYK+ML3eWyskfxRhj/sGXTcXI//aPlre7n7OHUukWnitqz\n79gvybWvD+LNzpwDpCnA3kqGkdJmNaTmBIksYC6R98uMPe8I7jscLcL5ijr8Z/Tw1n7c+e3FVNQO\nY4IZSQKZ7D2IYayyi91hOdWnTRWd8xle2dAv+K/rhkxtLPs6zFrLRVevY9dJRXzzkv6p7dWHTGDF\nozOdbN2+TZJAY/N1/dinDl/tu2tkEEHmZ/Lp8k1TkA+yUQf96BH7Pyirn45/Vs6ZSzSFtt4xS5mf\nvT1A6pBY6+lvbxVWd5zbb8mXxJf62PLwcM5uD18OwIS/pb3HurAu7lJ7uHN6ozDVX18a+p6tr7Bj\nebDq58akPOdoDr7CiUBVHRqVNLm5MZlw79xdJhXxtYsrU9vL99iDrR+kYqN3qE028gJ5l172rIxo\n3D5CnqlTiMuC/O0r4b6fvqsHPz5JVnAdOUv2c1F4AKyLBtVDyMZN4jDgHOB9Y4wbU/8+0gh+xBhz\nAfAx8MXuSWLPYgsbWM3HVFDFdPsvACYyhTHskgoVBWyhQOzx6ox6HvzLNqbsVsRBx6wkYgzXf28A\nw047iQU3PA4FZo/N25exatP7zfkjYZgY2bNg80fd8sVsmT2ThLNHk2FidK+Ctcf8t2p45YkNVEQt\nr235K9YmpP4wu/K+LTx7gNQhD/2lhim7FbPvMR8DED9jEaNOP4gVj86sDEJF6TuGXVjKfLUHhW2P\n9HfugceswBi4/nsDGHDo0Wx+63UKzR5dRTbRJF4hs9fV0V2bnJ5PPzOQYzi11d/240j+bf/ygbU2\nK7vYMZbEnY0pH8hce6Pt4eJr3jBkVovfElZUtc7GEzzkwBIaV45PzZp15/nxd0azy95XMPO9q7O2\nBzZYqSZQFQZ+RWIV/uA5mZF9/WBZ532fYlFu5vxIhssmf1WiADvf2FTY3S5QJVL+pBG32pt8umfl\n/BLdffevGM2x+/wPyffmBsc5f9REzvnDlJZgJo7nourfBVtaj1TgqDSi7H34w3EAWDM2vEOZ2KNP\nlfTyJw8UlfPqkeIjeXDJv1K7Nq/qFs4nft50dnBqtft94sOXADDpj9OZZE4NK2NJwOReXsy2WmLP\nv8XhM74KwOxDZEW+TH5tvmr99v5/lv0fk/1v3ihq71MrZTW4TTVyr5Xl8vtl419MnevcyvDojVOC\nnfrolBA/9rHv1/zSkBOZ9vyJxIJGnylqtst+5oic7OGoG9I519CPa50a27Z/f5t4ow7x8uwXNz38\noDIaV8oqi85u0z4Y736eb63dP5vzRGsNA96Kwmfle7YjbO6ZuTjVfoSYTPvX2WZl1eW93V49B4Dx\n3w3UZje
hKYf4um29Y7DZ2yN1SDsKcAtSvsJB2e4jSuW5Z/4rtNuE578MwMRrpGy4Mi7HdiCmeQa6\n1B7B6KNL39YzRQF9fZKMNvorB/r45XtWo9QV3zn5ItnhI1GYTVlQF2UYjQMwRXKN+GpRYz+6WfyW\n3UhWptGmww8qI7FqYovfL99WQtHQgTQsXT4p40Vbwza/cxMbZH7K0pNlFODsR6cB8ODYF4Hm+j6e\n4X3gl7lvnfcXAB79k/jjxxeHFXgAG+3cyFZXoSvQKYqiKIqiKAVL9t13pcsxQDTSPPO/o77C7eGr\nd/K/9MY6qgg7tc2d2ykj4//9FQAmPfB6xxKbTDT3UteIcvnO58YC8LW/iHJ36wiZqbzgROlBT736\nawAM++VroVOlKxXptLZOur/CmNvHV1VigWqS6hEHhzXYsB9dVxAvj7Jx3/4pv25flfDZJ1DGF51y\nV6u/Z8apwM150akObhXAiKeI+0rw/CYJa3X6jdcAMOHO3H0ls2Xs1eJr9tBz4i54Vl8JNZrJr83l\ncZd29/v3B84LfbaFy+81ySCSSeCv7KIG7PoP8Zu845j7ADi+vCF0nLPT8jdkNGMsogznvCqeTyRK\nw+DwCEgmX9lMLNssPsODO6MMezRV7vhXS9GGWgY98A7fv0xiWrsRsWz9Hf14xA5fOXakj0RMmX4W\nAGPOkBEhm/I/z18wVROJEKmoYMlVMrI29PAVACx/W1S/CT94C2iu40xRWIF02z/8hYycPDPgVQDG\n/V1U0MmXSFzmlPrdiVjCOwwbrovGXR4u+/4qlj6+b/gpr1wKwMR33wGa67vk9vbD/Pn2jtXkVhf4\n5bzP0O1EijqY37wV4uIrZLR1w1GSx8fdeiEAH54gcyVcfebqN3+E2X0/v1Le4Usflzjbz//w8NQl\nXVxvk+jcyJbDL6cxotic4jwriqIoiqIoSoGiynAB0BEfZNfLcr7AzvfRRRFwSqFTR3Z5+VwAJn9F\n1JhmZaQDCfZ7qUuXAbDwKIlWMeF28VVbePS9AMy6+nYAJu4r2ydfK1E0EgtSs/PbJZOAYw4QVWXD\ntaL03Tz8EaD5fvuUyXbnj9qVxPvAuoOaE9aeauFwSphPppGH1pREf1sm9ezUhccAsO3qYQAMmtF9\nijAAkSjxJaKqPnDWCQDUP/BvAC6oEmXTqRJxT8X2YyL7qobL4wnn/562v1PInSL8z1opUz/5tsQw\n3v118YUbf4KLCS1+lv7Iy6B3vIzWGeXQGExRjNIBdaHNmZ6zX6ZTMXGXVwEZIvR3kKbycP5xvvUJ\nwqMbfpqcteLJ3HUaay3J+npmXCUupDPuEeXzwBKpo1LP2XvePs5+vm++q0dd+ZrywBWp38Z/T/xm\nW1R3bfiMdjf1I8uY953dWfR5qR/d/bO7fBz9uqiaZY8HCq8XX3v+7RI3ePZnbgVgz5uuBGDyL2QE\nLlXGvXjlOwMupvRz41xUmNZ9YB0unzr/3NQKm/dkyKfZjM54o5HRhtxGdPxyPmXIKlYXdXKE0r17\ng/S7lVcnXygx2088WuaCjP6prG567+iXg7S0vnqes9O1gyRyz/dvfz/12903jgXg5U3rgJYjn/6q\nfn5bxOFHNnLvqE2J2lRdng2qDCuKoiiKoigFiyrDPYhc1wPvTpp7Waktod/v3Cy+j3ffLFO3xwY+\nojbShWn3FOJkEJdw4jninzXtJIkqEP+6+Iwu+KQoxVuOFKXsM7PPBGDTv0W1HPCB9FKLN4svnC1u\n7gvWV0uPf+1+sm3PI8XH6aYxohzcvVGUhAN+IGrK2HvkfiN9A7X6Oukx77J6UfgeOqH8lZY1svse\nH6e+Z+tT3t5qQR3B+QSf/cH5AJTfKn6mJc8EK2sZeQbdpgg70nzK7cwPAHjskxIV4vobPg3Af47+\nNdC8KlwmX2uXU331trX9FzdJHNGjnpL4sbv9eAkA5WvEfz0xVfwqx8XCt
ndK07sNMoJQ9bqMcjjr\ndMbP0kQMkZISDhjZcoZ2a7Qs08KoZzz1pAsUvoZ+YVs7G2fy1/XLycCyDi6tHIkSfUGi81z7GYns\nsPUXUt6f3ONBID0+ddvRIlza3mqQ4895S+ZDjPo/UcHGz2ieF9ETVVLTBKWrmu/RqZ9O3Vz2abnP\n3acPAWD9sRLBo+Jc8S3eJSKfX/j0+QAMfzdQhANfVxdtoifca65Muez99ndKwx+VO3HuFwCI/UdG\nH/zV+rLCm7sS8YJ+tOf7749cfKp6DjOirY8K5kzqmaatFgrEnpf7XfWS5IEjThL/8VWnSeJ/e9D9\nAEwrC48kO9Lr2sv7LQt9+tpspvo6U7l9sU72v3D6eQCM/Z1h0eLbWt23NVQZVhRFURRFUQoWVYYL\nHKcW1AYxMzcGysYb9aMAeGWrrPjz1PviOzv8WckyVc+KD9CgrZ6PaHcoI54fk5uBX/LUm8GnbD5+\n/7MBWHC6KILHHilrxJx/mfj57l4k56kP7nlNorkvuDkpkRge2yT+hn/7YB8Azr3xm3KNp+Va1YTv\nN6VWf1N8BlvofJ2ww9DiLXx79DOp75niC+eKi4DwQaPc87v1YwCYu314ap9nPhTHwuoXROkc9IrM\nCu7/0QLZwd2XNxLQbYpwOr5P+dr1AEz+SrB60hSZ+bzwbImfu8/h4t923lCZDX9QiajYfQOFbF1C\nVNs36uX+H14jvpLvvdocrnPig7JS3OQPxL8y4d23WSAK/lFfv0w2BOaxQRYr3hbMtl75VvheOlNO\ngpnxM1eIordppKzMVxmMDDjf1m1JeSYrE3K/79aPBuBfG+UZl/9b/Py7Mu5BtE7u6/m6QBEOdJfx\nRbIK5pCo5D1n+w8aJTLI05slEsSH/x1Ph0gbOUjOlkgBFcdLWr50iIzqLD9K/LkbdpNy0K9KVOja\neklT08fye7+5Ut6G/Gs5AKOWykiEq4dCcXV7kCLsKF61nVE/fY09GyVPXnS+VJRnVErd/e7xtwBw\n3dTDAHh8ruT/4ttlJC32mIx6WBPEcXd1fK5xi3sQ8YF92HDKIfxt5G8AWBUPxwvPhPN5HxiVfLz1\njzIyWh1EhXHx6DtT/8Vqw99d2Sg2bdu7KHgv7Fm6jDJfXu4swX27+/Lf887ffPzjsvvPR8o8ju8f\nLu2HdftK2obtLfM5jh82J3XqMSVSb08qlt+mFIV9gpfF5XNNQt7nr2yXtsg/V+0mvy8eBMDQ/8oz\nqX5xCQATVr2Tln7PqG2gyrCiKIqiKIpSsBi7A3uyxph1wHZg/Q67aPcykJb3MsZaOyibg9UeYdQe\nYXpOSVwAACAASURBVNQeYdQeYdQeYdQeYQJ7LM1wnp2RTtkDel0eUXuE6Vx52ZGNYQBjzMxcl5Ps\nqXTFvag9uv4cPQW1Rxi1Rxi1Rxi1R5iuupfeYhO1Rxi1R5jO3oe6SSiKoiiKoigFizaGFUVRFEVR\nlIIlH43hu/Nwze6iK+5F7dH15+gpqD3CqD3CqD3CqD3CdNW99BabqD3CqD3CdOo+drjPsKIoiqIo\niqL0FNRNQlEURVEURSlYtDGsKIqiKIqiFCw7rDFsjDneGDPPGLPAGPPdHXXdrsAYM8oY84IxZq4x\nZrYx5hvB9h8ZY1YYY94N/k7M4Zxqj5bn3SltovYIo/YIo/YIo/YIo/Zoib5zw6g9wnRLmbHWdvsf\nEAUWAuOBYuA9YPcdce0uSv8wYN/g/77AfGB34EfANWqPztljZ7eJ2kPtofZQe6g9eq5N1B5qj/b+\ndpQyfCCwwFq7yFrbCPwJ+NwOunansdausta+Hfy/DZgLjOjEKdUeLdlpbaL2CKP2CKP2CKP2CKP2\naIm+c8OoPcJ0R5nZUY3hEcCytO/L6XxhzwvGmLHAVOCNYNPXjDGzjDH3GGP6Z3katUdLeoVN1B5h\n1B5h1B5h1B5h1B4t0XduGLVHmK4qM
zuqMWxa2bbTxXQzxlQAfwWutNZuBe4AJgD7AKuAX2Z7qla2\nFbI9oBfYRO0RRu0RRu0RRu0RRu3REn3nhlF7hOnKMrOjGsPLgVFp30cCK3fQtbsEY0wRYvSHrLWP\nAVhr11hrE9baJPBbZOghG9QeLdmpbaL2CKP2CKP2CKP2CKP2aIm+c8OoPcJ0dZnZUY3hN4FJxphx\nxphi4HTg7zvo2p3GGGOA3wNzrbU3pW0flrbbKcAHWZ5S7dGSndYmao8wao8wao8wao8wao+W6Ds3\njNojTHeUmVjXJS8z1tq4MeZrwHPILMZ7rLWzd8S1u4jDgHOA940x7wbbvg+cYYzZBxleWAJcnM3J\n1B4t2cltovYIo/YIo/YIo/YIo/Zoib5zw6g9wnR5mdHlmBVFURRFUZSCRVegUxRFURRFUQoWbQwr\niqIoiqIoBYs2hhVFURRFUZSCRRvDiqIoiqIoSsGijWFFURRFURSlYNHGsKIoiqIoilKwaGNYURRF\nURRFKVi0MawoiqIoiqIULNoYVhRFURRFUQoWbQwriqIoiqIoBYs2hhVFURRFUZSCRRvDiqIoiqIo\nSsGijWFFURRFURSlYNHGsKIoiqIoilKwaGNYURRFURRFKVi0MawoiqIoiqIULNoYVhRFURRFUQoW\nbQwriqIoiqIoBYs2hhVFURRFUZSCRRvDiqIoiqIoSsGijWFFURRFURSlYNHGsKIoiqIoilKwaGNY\nURRFURRFKVi0MawoiqIoiqIULNoYVhRFURRFUQoWbQwriqIoiqIoBYs2hhVFURRFUZSCRRvDiqIo\niqIoSsGijWFFURRFURSlYNHGsKIoiqIoilKwaGNYURRFURRFKVh6bWPYGHO6MWauMWa7MWahMeYT\n+U5TvjDGjDXGPG2M2WSMWW2MudUYE8t3uvKJ5o9mjDE13l/CGHNLvtOVbzSPNGOMedEYU5+WR+bl\nO035xBhTbYz5W5A3lhpjzsx3mvKJMeZBY8wqY8xWY8x8Y8yF+U5TvtH6QzDGlBhjfh+Uk23GmHeM\nMSfkO10+vbJBZIz5FPC/wJeAGcCw/KYo79wOrEXs0A/4F3AZcHM+E5UvNH+EsdZWuP+NMX2ANcCj\n+UtR/tE80ipfs9b+Lt+J6CHcBjQCQ4B9gKeMMe9Za2fnN1l540bgAmttgzFmV+BFY8w71tq38p2w\nfKD1R4gYsAw4EvgYOBF4xBizp7V2ST4Tlk6vbAwDPwaus9ZOD76vyGdiegDjgFuttfXAamPMs8Ae\neU5TPtH8kZlTkY7Ty/lOSJ7RPKK0StBh/AIwxVpbA7xijPk7cA7w3bwmLk94nQAb/E0ACrIxjNYf\nKay124EfpW160hizGNgPWJKPNLVGr3OTMMZEgf2BQcaYBcaY5YFbQFm+05ZHfgOcbowpN8aMAE4A\nns1zmvKC5o92OQ+431pr852QfKF5JCM3GmPWG2NeNcZMy3di8shkIGGtnZ+27T0KW2DAGHO7MaYW\n+BBYBTyd5yTlBa0/2sYYMwQpQz1qFKXXNYaRYasiROH6BDKENRX4QT4TlWf+i1TUW4HlwEzg8bym\nKH9o/siAMWY0MpR1X77Tkmc0j7TkO8B4YARwN/APY8yE/CYpb1QAW7xtW4C+eUhLj8Faexlig08A\njwEN+U1R3tD6IwPGmCLgIeA+a+2H+U5POr2xMVwXfN5irV1lrV0P3IT4qRQcxpgI8BxSOfUBBgL9\nEX+mQkTzR2bOBV6x1i7Od0LyjOYRD2vtG9babdbaBmvtfcCrFK49aoBKb1slsC0PaelRWGsT1tpX\ngJHApflOT57Q+qMVgrbIA4iv/dfynJwW9LrGsLV2E6J+Fuwwr0c1MArxGW6w1m4A7qVAC6bmjzY5\nF1WFNY9khwVMvhORJ+YDMWPMpLRte9PDhn3zTAzxGS44tP5oiTHGAL9HVPMvWGub8pykFvS6xnDA\nv
cDXjTGDjTH9gSuBJ/OcprwQ9EoXA5caY2LGmH6IX+h7+U1ZXtH84WGMORQZAi/oKBJpaB4JMMb0\nM8YcZ4wpDeqQs4AjkBGngiOYEPQYcJ0xpo8x5jDgc4jqVXAEZeR0Y0yFMSZqjDkOOAP4T77Tlke0\n/ghzB7Ab8BlrbV17O+eD3toY/gnwJtKDnwu8A/w0rynKL58HjgfWAQuAOPDNvKYov2j+aMl5wGPW\n2oIf6g3QPNJMEXA9Un+sB74OnGytLeRYw5cBZUjklYeBSws4rJpFXCKWA5uAXwBXWmufyGuq8ovW\nHwHGmDHAxYjv9Oq0WOVn5TlpIUwBTxpXFEVRFEVRCpzeqgwriqIoiqIoSrtoY1hRFEVRFEUpWDrV\nGDbGHG+MmRcEli7IlXfSUXuEUXuEUXu0RG0SRu0RRu0RRu0RRu0RRu3RcTrsMxyssjIf+BTiOP8m\ncIa1dk7XJW/nQe0RRu0RRu3RErVJGLVHGLVHGLVHGLVHGLVH54h14tgDgQXW2kUAxpg/IeFlMhq+\n2JTYUvq0e+KGUbLP8MpNAPSNyEI2rtm+Oi7xzutWlwMQ2bS9I+nvUsrpSwN1JIi/Ya0d1JX2yJb4\nIDnXiCHrAYiaJAClRiwX6wKvGBs8BROEGF0ZL5Vr26hcKyLhA8v6DKSxYRuJeMMOs4eJyP0lKmXV\ny6YqSWtluURyqYhKPqow8hkzzWFS66z8vzFeAcCWWjlH8ebg3FtqvYsFn1n2JTuSP6J9+9jYoH7s\n2XcDAEnvYpEgEQsbZOGrxLxEkLYgcTticqx3regukg8mlEhQikxpfn/bAGLDBhJftb4p2zok2/xh\nYpKGREUJAE0Vcs1oaRyAiqJGACqjki+KjNitGCkvMRPko7S0NwX/xq38VpOUfL8tLtdoaCqSa2yX\naxVtk3Pauvp20+sopy+1bOt6e7hnFIu5DcFnsENQbmzEhD9j8pkMDguKeGgbMTFMJCq2K44GtoyK\nrUsi8llqmoI0y/dYYNtokBZXn8zdPkD2WxqnPNqP2sTmLrdHd+Dqnoaxkh8ikaT7xdtT7rtoRWDj\nbswfsapyWzK4qsPRoE2WlVtzdmq5v0ntY73tNnRsJPjuylfjBrFjbH3md3uu9oBuyCM5vgfCxwZ5\nPxoNbbaJoB7Psf4upy911JC0yR1vj3bsYIqkfmyqKk5tS1QEZaEoqBOCMpNIBnkgLnaJbJfvxZul\n3raNXohi79qmrDT4bqlr2kJjvDarEtCZxvAIYFna9+XAQf5OxpiLgIsASinnIHN0+o/y6T30Bdcc\nDMC1x/8FgKPKlwDNL6Sfrz0GgHd/sQ8Aff88XX6IBJkqmejA7XSONXY5G1jNSpYsDTblbo9c8ey3\n7ouHAHDD1fcA0C8iDbiJRVLhDox2PtMnrGTYaNBguHbdHgBsapKOyaSytQDcec4wNq2bx5qP3+x6\ne7jnHKTF3X+kTNJQc9SecsET5PcTpr4PwGGVHwFwaKkkaVC0Oft/0CiF9Y8bJe/94729ARj9uNxn\n6T9mhNMdNCyyrbg6kj9iA6sYecNlzJj2BwAavDjlJUbSfNoisdmWw6XRbIqkwrFNjW2mqSvwr1V1\njzRmHhn/PJA5zZNePJ+aNz5g7a//lL6sbQubtJo/Um/foHPnlfdof0lDzWES83/lEbJf5STpXB86\nbAkAx/STSFijYhsBGB6VexgYlY5QTbJ5NdnVwSXWJKSz9Np2WW/hpfUTAZi/fIhc402piIf9V86Z\nnOWtOJoh74LkkfeZnrs9fLx6MFIqaYoMFLsQdBZsUZCHS+QZJiqlAZIok+0N/eSzbqDYr35A8zul\nYUDw4qqW59u3v9Q1I6sk+WMr5P7Hla0DYFLJGtkekzxaHZXjqoK0lhq51r7TzwdgzCVrWV2/kPe2\n/rPz9uhqWnlvRSqkQ7rwhvEAVPSpD3Yx3qFyzJD/EZsm35sbnKD
9d1eu+aN4cCW73fxlIqbtuslv\nqKbuqZ3josHv0aARE0t1AJqFGHeO4kiQF01431jwvSzIDxsa5B217CGx48DfBfVusmVHPxt7yCHd\nl0dS74GkDaczm2NLgo5Tv6rgWDlHcutWOWdDhtWsM9R/a+xyPuTt9D13mD3aex/Gho6QNJ4wOrVt\nwyHyzIcPl7piQJnUIVsbpL5avq4/AH1mSH088nFpbsaXLnM3Ih9BZ8LGpVEdmbirbLeW1xf8Put7\n6ExjuLXWdovSY629G1nLnkpTHe4OBkSDzDDkWSkYJQ1LALj9+lMBePh9ye8NQ6SgrPqqZJK5v7oT\ngHFHXgTA5Muk4LR4MJKQbO+rK8nOHh0+e/jwwfdIQbj50cNlwwDJTPEBYrfGtF5Z3SCx0QXfk1CQ\nF1WtBKDJis2KTLi36ho1rjFz9JzPAlD8aWn8Juul8p9nquX45AKirGuR4pa3kL09Us81yPSOdZdK\nJ+Azl7wEwI8H3dXWaYCKFlsODjqTBw9/E4Cbg8/a46SBdOY1nwFg08/HAFD6ZLhx3MGOWNv2qBxp\nx9wdYfahomDuUewaaWJr9yx6Mi4/VUTEwLMaJe1j7o6wdq1hbctDQjZpLX+0qPz23g2ADy+X5/q/\n0x4B4LSK57NMZbH3KfSPlqf9L5+7BerxtDIJsfv9gUGo3V2DHaWfzqpv1QBw8WKpw1b8cRwAgx+c\nBUBye6B4ta/it2sPH2efNZfIe/DICySvnlj1z9B+fYIRt+qIPJOhUXev5XQfJd6n4OqX2Yc8BMD4\nay9m+9vbZM2qMDnbo6tJ5b+098vGk6cAMP/IO7I6x6G7XQJA32DpI+PU+GSmIzKS0R77711qZ0zd\nudbQcWJL/Fqx7ZfO+jQAG34zFoDyx2fKjjZjPdu979yIp+J676LIXlIRrDxa3oPb9m5u0E4YJbXd\nblWrAdi7TyDKxKTxmwxGbrcmpK6cvk062v/8SM5Z/ZzU/wP+/I7sH7xzU2lq3STdYw+vQe7s4Br4\nWz4/VX4/V9oAT0+R9Wg+ije/s8598ysAbP33UAAaN0pSGvoFZWGk5IU9TpMO45VXSv11/ltfBmDM\n9fJ78l0RvjefI+2AZ2/4JQAHPXg1Dbdk38TtzLj5cmSZX8dIYGUnzrdTU0IZ9YQWVlF7qD1SqD1a\nUlJSBeEWaEHbpIQyUHukiPXT/JGO5o8wao8wJZSRJNSbKmh75EpnlOE3gUnGmHHACuB04Mx2jzKm\nhbJT+ZR8f/G9yQBMvlhUuSqkB+Uer+tTjA4Ejl2vuxSAxRdKb3xc9Kuh452SmH6t7qKS/tRRA1Bs\njPn/7Z13eBzV1cZ/s6tuSW5ykatcZDAYMAYMoROaAwFCDTVAAJuEZtpHQhIg+RLyEUoIvQQInYCp\nSYhpodoYA46NDQZj3Issd3Vpy3x/nHtnNVdaaVe7WsvWfZ9Hz2pnp9w5c+fO3Pec854cErVHGqHd\nKhHtXtkgLsmAcmXnNnObN54h4QA/LvpOLZFZZ8Ag/PVMPWDMm+r/OgiArIYVQEvWttjpQ72bHnuY\n+9az774PSP94oywxNmZDRNi49REVC9nsXPsoN15plp81LgiI7V4pl8qzoQdlvXEfyex01M9XARDZ\nuEk1tnWmr0P9o7qO4Htz+M0KYeFfGv2WtKED9NG2gtnWm1YIwx58bw695Le8pMcQNX4su3EfAKb/\n5FYARmTLtYt4xwyoNsg10w+KLEyvh/Qr7Q3Rn5Fmbdfr5Dr+ITNs0DF637ofvTz6dWnyjdKWC84X\nr82aydKHvTCKQJBitwSiHbCH4ZX45j4J8Vl67H2+8ze9PTEupHUmWJ+/Gfcty1SYhOrnxkO4ZRMT\n5V307ZMbJXvMYOhI/+hkeC7xZvd4358Ky6dtrUNsCgLy1Gpwdey09J/qYWKPIr0Dp337FNNbdpGg\nPerdKF821TNGjf/tXaNMINF
+oO+jv4+WCsa594gdxw/+OQAD7p6ZtD1SQaveZqDqTHmOll4sz9GH\nRogrI7WQRPEq/aRYcn8ig2YCEDxUbHfn/5QB8NqV4obKflPY8uJACdFolE61h8GM6zGn5jSxw/6/\n+ASAWwc+4FttxBuXATD2ysXesuFbJHwxrndMLd+oll/zo0sAeOCOhwDo94qEVZw5TxjmtybcBkBv\nZfv+n0WpMFJ92jy1xFf1w3XdMHApUp9+IfB8Ny5HScAJsBPjAcZg7SH2cPYEaw/A9o/WEJAXgBXY\nMQSw9jChSBNrDwXbP/yw9vAj4ATIk4mttUcHkAozjOu6rwOvJ7yBIwOcZviW3CIxHnmVEhPsMbpq\nJqsDqPRszMu6VJ/Db/gYgMP3F+Zs6XEPAzDxY2GMe//t49ih48SbphMlTim4LHBdd+9OO0giMAPL\nFSMcKPJ4CIovFEazZ0AY4bqorKOZUA3NiOnlx3xzjGz/osQn67mcOWPGdSlhINBxe5jXrP6EiQDc\neIfMvg/Pl2OaSX1v1gmLMOWDcwEo+VC+910gM+5ghSRSEYold0UHSnLR1p1EqaTiYDmzqw+T7n1J\nLwna1+zaNwc9Ib+/OQGAeVfLZ/D9/6rGt5ztdpn+0bWwNVl7LLpVEmeXnCIegRolbaDjTjX7pDmM\nGCNqMqOCAien1eXBZmxd3HXi8Ale7KNijhuj0ocfGfYRALc+I8l9754q/Say8Fu9aXL2CAQ9diZ6\nkMTpzZn0F91qWe6xuH6vT72rxgVtL9VnTfZOe4uCPvZS2bKDSgUa5r179rJDARh7m9yjKzrQPzoL\n5ngU/v5e3m8vl0tfDKhHarGKkdfnpRl0HedfM7LDz6GE7RHBoTqa027/N2F6UtIJvU/dx1p6K/yo\niqj8iKDYrXZICy9Fp/YP85oHdxHvdfQe8TJ+vLMwoLF+LKykHov+d/0Eb1/PfCGerPyvpW/kbdDy\nB/JRJTmDnHrUDABuHiD5Bd59qpQ2pvZeBsCRD90NwFWnT5ENZ31BFtm4rjum42fcOlrYYUB/AL67\nS+J9vznIzwTr94n9/zQVgDF3CbsdacYsN/fct35QNd4o2+a/IrkPf1wmZPcvX3oGgP/u85zaQGy/\nsEno4OJ5lQTrDOWJNmAr0FlYWFhYWFhYWHRbpMQMJw1XGMSskWUAHHuEMMGLjhI2LmrG5RiZ+V7G\nol6gmIysX/cCoGaazCKv/5VkLj70j4netpFNm33bbCN1icxAnZsX26aw9Zhdvf9njpWZnJ7Bmoyw\nnunq5ZotWPtiGQD9QxKX3xkyXuYstOE4uY4v3fNnoGU81rKwzASPeepaAEbfL3HMY1Z95ltPW6NV\nTqZCpJ+K5srXor/L57+KRQXgvksk1vWuC0WpQrPSt5cKQz76WMncH/We2+o5WKSOaO8e1B6+L0tO\nkWug+26+Ym2DCcRdQssY2v3miuKD81SJ7K9S9tvYOzY8Vp4oMaDfKpk7k9E0oZd7zLEadrZGJYny\n2j4SY3jfFSJrNOZnelxK6BTUPiX/wlXj5Hc/FjtoNYh43h6NQicviYPF9gewKiI2WhMWb1NFWBSB\nVoYki35lg3wuqRGbrqmSMX7rVmmbWyVtyt4s9imdoWKyp2u1gC1JtS0TMMfTwK9iWijaxtpGZmx5\n0FBQGlPuz2tq4VlLAyJugI3RHkDiWsbQviclk9Ax1xrhImWnthUUUoY5fteeLOP7bbdKHP5+eXJ8\nreyj1XJOX/p9ACr+IJ6fvDf/6+2zPOyTPWuBEvX5+XVyT4x47AIAlh4tnlA9bukxRKsLLb1C+taI\nWUmcYIJo8XyfKLKlRz32IQBTe0sCl2mHCY8IIzxcMcJaZcINxZ6HyT4btUSkVo+4+BGJH//qkvt8\n671cJR6y6Mo1LTWJ29p/Uq2xsLCwsLCwsLCw2IGQWWYYwHX55melACz+t3yWbZTYXi9WOGqwj
Aab\n6zHEevY2S2Jrvj/vHAC0tuL1F+/s7WLoH2b6jpGJggTbHFH/LHqnK1vG0puZ9d6mukKU+n79OgnL\n6n/PTN8+027HQLCFasRv7/wrAD0Dfibr5g07AfDuZfsDUPa+9CM93zRjkmJZ4K3EwikmT+t9akSU\nAPqQP8p53/aPUwGY/oToy077TOwy5tqPvfaDZYQ7A4F+TeT/PMaoxWJdE5vTm0zpDxf9AIC+P5Vq\neeEKlemsrmF2M89U0SuyzejHzwNgsWKITUakPRQ6fn3d+494HIC7hxwpC1YktBuB6+KGmggUCNt6\nzL5zfT9rNtJksdeGJXZ+/9evAiBvrdwn2WIGcjfLfZK/WVWT26oqRNXEWJZAlbBTTo14ZaJVquJg\nTY1qm153LQD9vM924OmXdoAp7ySYLGHdScISfjg2pmeuWbt4LLw5zl4+/G0A7imSONJotTJ+Gj2X\nITfI+nAx7THDZv94sUZY/Fk1o3zrRVXMeUiVIoyqGFatyhOKBlusq3/T8epNKr5/Ut8FAJxXXNlq\nG+Ihq5fcw17+UJqZYfP9QKtFvHqLaNf2V15J874f/azoRo+6WujZXERdyG3mEfDebzTM55DO8VFq\nUGOvkDyC5z6VegGnF4l3O894bbtmvKgMvZQ9BEIpBvHrtuo+r+zQeKz00z/dIyzsxFxh7DdH5P7X\n3qhDF/wIgOE3zvTvJ14BkSQQNfYx4tElALx5vrTlqAIZc15YKsxw/8avk7qPLDNsYWFhYWFhYWHR\nbZFRZthxHAJ5eQzZQ1iCHpNVxrWaPbnhkF5RfRrlVs3lsR0DkPeQzKBQsrOXn/Wqt8qrt0t9EDci\nx9yhYzoN3dEtZ0nM7etD722+EhB/Jm5m+b79qCh/9GdmGhsaB6r9w/4q2p2H5usZtCy/ZaOUwp1x\njFToCaySuCwvLqlJ1TBP5toqZqcFaayVObJk9hldILqw8/eVWf6Y0KfGfra9lueOikG5W7hpxKuQ\nYCa6iZBBI321QEqDlleINqZWW4nWCtsR6BGLTdcV43q+J3F6HCofkSSpy5i6gPSTSQXCdtyV2/Fq\ngu4ukoZ+z+CnfMu1ckHE6JPzm2ScHHOxUUWxveM0+79dQs7QIzU9Ls1+UDv3Kwd1pZwOz6OkxoI9\nr48f+6kZwzfqJNv+5MKqVtc7Kl/6050T1BjmqdBoe6ROeUYIsilcCK3VeWwGTzFIxd7f8NjZQMwb\nFoO+JqmPcbdfI3H65111n9qj3xOpYSqb9O4pdnNUKXHS5Zg0PJ2ho8Tj9+jNdwAxRljH62oVpjF/\nE+WqUdcb3m3dn5s9g9r1oup1VVu0V/KWr48G4HSlmlCt49KVwsbg7E3q2GU44dSY4RaKKYeLYsq9\n994FxOKUtZdNM8La25R7Q7Fvf2acfUrQY4KyT3it1Bm4doH0paMmPgtAzTcyvvVPcveWGbawsLCw\nsLCwsOi2yCgzHC3Op+7g3Vm9QWYVo5ZJjFtcllbNjoN9JTPZq/KlWTxjHlnwL5mx/36DxJr+uuRr\n77dnjj4WgPxXDTbEYFF3BJh17sdfJnZuzgLHq0plxm7dsUlYpwEPSpa3N8/rLHtFI6z/mbDQbwy5\n39fWjxulre+fJjqzkVUSU+XFeaUhLqkFdJy6ntW3FyvdhRitHQ09HJiYmz77OmZ8ndadVn27tUzk\nvM1yf3gxgyoGOGQweWaFNhOajdPMCoGO8xLhIn8sYvyKc354MfWmd8hgkttkd0xPiD5fUwloO3SY\nmM+lDVPUuDRI61vHYnG1osmJ30hOwcoZQwA4WVVH1dc7qGJoNWtfOUGYtoHvq2PquNE0jK9hN8CG\nUGH7KxqI5ra/jg9OEmyk6h/BDjK6w4olbrY2mJxXKC5025W9g1ICnKPv+A8AY3Pk/jQZ4V1mCns+\nQjPCXl9RY0YqzwHjZhnay6+sUuD4vUjfNorWb7SuDrejN
5qR6xIYJ+9QVzwgbKtmhLWCj6mOctBH\nlwIwalY773RpgHmPNM1VEQFKPKx4cWtbtQ/LDFtYWFhYWFhYWHRbZJQZbip0WHNgkPy5ig3Rsws9\ny1MMhI4HWvK4qAVctfs7AHy0RWJF118lsX4oFQlPf65BZurPPitaf7++LMYMb/yJivnLk+zQ/PUy\nw8n6z+eyQjyGeDvSJTYzYSt/LioLrw3WFaliM8pAnNJRZvzYkw9MAqB/SMWPdaI9nGCAYGExZ1wi\n2oWapdYM16X3iq5g6UJDu7AzGOF4MOPXt4N+saPAwUk6TrhNtHfpWmFZcrfK9Z/TJGPOwUpEItqC\njW27QlvQqAaXFLtmIJzXMZvskPkS6YDOYVHPIx1LPmXqq77Vmsezak9awz2DACgJ+PuO7heeB0Fd\n7poJ9f5jp5FCD7sBNoZice+JKjaE8+LcGPHGvA6Mgdk1/m1i1e78fdl8To0q3ADA/KzeSR+zNehc\nEP3M/Pr3woi+3keoepMRvnvzcADKpoiqjX5biCkVpeF5YOzj649F637UyvMBiDaIjbIK5P4tZRSb\n9AAAIABJREFUnCnvUynl8+h+p65x34ckHvf4HpI/Yepn6z60ISLvVSPv9rc5rbHCZlPDfo/diGcl\nJn7XJnk/KHtTqusmO7pZZtjCwsLCwsLCwqLbIrNqErlRgiNrKHlXBSWZsyjFun17i2jaFeZtBeC1\nUw4AYNlJUqPl/578GwAP/ECyLKPLVqoDyKym7CkR65xxUWyWPXe/JwC4tXwXAEbnycznF2+eDkD5\npZJR7jHEeqa0PTB/mslQs1vNlA/9sejw6fi05hWkTD1MsxKdjrse9A+xbSY4pEhRPtXfH8u1fT5Q\nS2SupvWEB/1F4r1dHd/UtA21ortQvwg4/rZ4mdYZCNTUx/L6ntN17JIqWmNOc2csBOD3Z50LQP1A\nuddCPeQebCyWPhsSMpFo67KzHisdVc6akZULO9zOSF6KnEagdWbZU4Boi0mM08fiMUPxVCW6Ekvt\nxSSqNi27UqpuTe4pVbda05bWY1TBy/IciR4oeQ1m/HbUUGI4aVeJsfxCfU9nJbqIG2BrKD/p7aKa\nGTaZ4DSOedm1HdvX2HxhZOcH+6bWACP3w9lHrvFbx92uVpBYa+0h1fjbX44BoMSojdCZdQtG/OLj\nTts3jsT36r6+9GYVF18mse6mNrsXM6yezcd8cR4AvT+eJ/vLRA6W0Q8j30iQ8JCb5bOjI4llhi0s\nLCwsLCwsLLotMsoMZwUj9C2upeBryY7Ub/Ca4dOZnEN3EdY27xiJ/YioWcvQrxYBcMXQswDo9xfZ\nT+9jVUU6NUsLr5TtXtg80Tv2AaWihvDWtQcBMGOeMJ5nTpc4mzfPPVD29bh/FuZpj+oKQV0QJpOx\n8XSpwDK7XGcy+1nf1uBlvSsy4JkXJO566PLMVe4L9XJZdVzLGeWjbx0GwKjwLKMtO44CSCqIGvGn\nuv57Rtg2gzU027KjQesNO4oJKUjTflPpyZGcFG0eh8XpDMdC3H12hRh87WFT7GxWqWTp33/uA0As\n5tbUvgV44aHDgVjcZs538gz7okn2tVdusNVtz+sj6//PCPFQhpeKtno6GLZwNMCWxg4ww7k6fjR9\nmscmcmo6ts/dctWzPWvPdDaHtb+SsXJUtjDCJvs/ZZUwpv0eFV153UvN+NXOgFlJ1YTnhelIX3Hl\nOaGrvb56pmbGZWTTMcIaZv+NvFai/lPKToaSVUagawHo96AOapVbZtjCwsLCwsLCwqLbIqPMsItD\n1HWIrFln/CBv8JGdJVOzYot8H64171QMrGa6xv56GQCXfCwxXHfvebL8/t8vAWiaJDHHi6pWe4eo\nGSAzvbUXiPLA8NOkDbOulHULfilV8XhcPoK7SgzYT14SZYObnjtDtjNrbpsxXpkkNrzKfUrDsliq\nv+w85Uvfaq0xGRoma
3zrJqlHP+IJf6xwOmPZ4qF3fh2njP+8RduGvGPaOIPTzhSy/FtFGvpHwGjT\n+GJhS6b/6BAAGnrJDNnJAHHuKgIrb4sc7JDi932/m23tUjCbprR+vSpSrSHgZyFa7rNj5xutk6zt\njvSPcH5yx8xRHSOrTFR53Fw530hPYRFDxfK9qaeMcQ29Aup77DhNvaShTb3VvdhT7tXCYsm+71Mg\nn3lZsjwUFXtVN0q+yJYqYZ76vSzHLHxevD7bkiE2PWwLfyXPI10Bs0bFT2pN4Qe2DPa2HfiYeAqi\nqv3hdesBeG7zvgDsNfC/vmPpWGKt31o1XljoAsUMp4Nhi7oONU3tiwabmrHkSdvSqXlsIrsmMa+V\nWbFxnPaCtMOWtolA0GNRo4cIw/zOXveoH0V9w1St+eRJWW9A2Hj+x/G+tTmGpIpWqtuliq+nCiOu\ndZXNWGFT2enNOkl2GPCieOs9VY0MvCe0gK4FkKI9LDNsYWFhYWFhYWHRbZFRZhgXwpFgi2peXvWX\n74SdHXLfUFmuZqxRpSOrZ6qR9TLrvmv5EQB8e77MasrV5HvFUbJe6b1DvUOfcPFpALw48SEAri0W\nJQrelap1VdeIhnG+ihFe8mOpevebVySWK3c3UbbwquFt3qrOqetkz288YVcAXh/ur9zWljZrlqHr\n+MCbRwIwermKz82glm9BsJG9eizzvq8JyzELlkuN9haajplAF7q+GnlGHNf1Jd/I533fbIvmtAmz\nrV0K5qWNKsYlgbj4rtQrGvq0zQyburIH5QmDMvKDZwDIU+NsgRonmqskdBY8zdtDpG2jJ14MwKhr\nFEOsnw2ZIJoC/ljD4C5jAHhHKQuEXGFvtYdN2/Oex0/wdjG4VhhDU/P+nVXiYUQxw2alQj02r9tH\n9jniZfVDO1rAiSDqBqhtTJ6hzCkQNt8JShvcdITFGox/sDa5nUbVHaeVkdy8DjKvjrDumnFffIbs\nryQojLAZK/x8jeQxDXpSvK3tMqCGslNXh5OTQ9aQ4Tx16ENqiVxzM1bYrD9w7YJTABi4Qang7ACV\nfC0zbGFhYWFhYWFh0W2R4ZhhiETjsxia8Q2+t771FfRsWc2+ls0U5veHPxClCM2LuX1lVla4NMZm\n1t0jMVm73iez/A0nCYva+wnRrt2wXuJt++ysdEMLZSY6+jnJHl80RbZzhwyQHW7cBMRijpYfrdiU\nX06Le35ph5Hte+hVhhJGvPJXtKxG9E69zOx2ukW0ib0ZcAa1fAudJg7MX4nWeFwZkU8qjP6QiZhh\n1ceCfVSlozharEmjMj27sUgDTCpg55EALD63uOWqTVpz17/cE89Q+squ3qe+NQNu69818uVOG/sL\npZVekWDbm6GxT3I8tb7nh2UVtrmeHiPCajSINPOSmHq5cfehGL2o4WGpVfseotpQMHqr73dPjzgD\nRJMXnxuWNn53o4zlI5SygFmF7LVaiasc9tdYhdOojhFV3ksdU7plqRo/9m67DX3G+8e4dMReRqMO\nDU3Z7a9oYEAvUU5yctQ5KZY7rUjR4xYpUULe3yZ7XIktDQ7oD8D9Rzze5upeHYItoh/dXqxwIF/6\nyLqf7AFAuCA2YDiJPrbMx7YWi1Bva7p634AnVJy6zjdwnKRdVk19sll2+mAOUFrl8bzJZlx5+BN/\nBcDOjC/PFCwzbGFhYWFhYWFh0W2R4YA+B7ctHVI9+9CMpzHLMPXj+s+RqdaZZ0mc2Y3sJT83yiyl\nqXcs9i3/FWGAn75FKtfs+XOp/LPsb7JPt07VjleZ1D1WSRsWXaD2EVLxy/MkRiZYLizSXneK+kH1\nbfsD8F38s0sbzNlp5aVy7N/1/4taI3k2YMorFwEwat22y+p2MCvTK2QiQ9WMaesp7OAPPpArekQP\nue4hNX/MbocZCxqV2CoiEpP2xojUm9qg4rd0/Nx166Ta1dwpUkWpsUT6rBPp/GvnBsV
uuRuEPRr/\n4HwAbhkwt9W2dimY5vlO2NlHjp8BxFQEoKUnJVVopRRtl2P+T2LwOsIMN/X13x8x1rZtb4ZmgfSn\n3k6zQjqfwLt2aRAG0Xbsrew4vU5yEgb+yR8Dmom8AFMRKHqQ0mc/4D4AQq60Kduw4/V/PQ+AwRtn\nttinGSva73NltFPlQzNspsLPxSOl6uZzuTJAeDkaKYzDrgtNje0/4s22lPcUlnqtqYiQxmdCqmNT\nQ7/U4torThLVpEkFohal1RN0rPDmiLCto17wX88W/dKIla09ahwAc264P6X2NYc5Vly0Uiryrnig\nGSMMHbougaIwRQfF3JXm2KHvV7OK7aAP63z72SYqEmmGZYYtLCwsLCwsLCy6LezLsIWFhYWFhYWF\nRbdFhsMkXAKBNtzLmuZPsPxj4XeSdDEoSxIcdDnn3Ao5rfV7xvx6g96Wz3tuEn/Vx7dLic1J+5wD\nQM5GFVqhhOVL7xAX2O++XQzAf+tEgP3ZPx4MwO9PFlmim++W0tADnm7pMks7jCIb2kWz9zkSSK9d\nGaZbRSPSLPFMu3u/aBL39k73iKxdBgr4xkW1m8OHDYM5rVCu68CgJC/SX5V83KKSbDqxTKgHdYz9\nCyRDQ4uRdxRLQu2L3ycKMxlpaa2E/jBbQhTyMlA6W8Ms0+21RcFsa5eC0TRdcn3Ks1MA+OanMVen\nvl90SIG+x9oqaNMWtFRRblDdoynYKbtXcrKHuu06DKKtMu0QGzfq3Vh/qo5K+xtUs7dG5TxWhCWx\nZnlTPwAW1pUCsGCTfK5cLf2j8Cs55tBXJS7E+VbGMM/lm8lEHG37mzYAsUQ5s/DAwiZxDRcvF3s0\nHd0yK85VyXiBkKxjyt6ZUpYapxSuAOC5cSJtyeeqcFIqY13UIdqYfOLvbkVSxKcie6fkj5kooqkl\nQdf3SS2hue6wGt93MyH0xnVSwCjwvr9YitkvzcSxVYfL9a6MyLNrbmMvb928QGJycnvnSL/TITVm\nqNlnT0pyni79bRaLSQYDcquYOuod73u8/qmxQIWLZn8pxWE8a2S0/nLnwDLDFhYWFhYWFhYW3RYZ\nZYYdB7KD0XblSeLCmH0EKjcD0KCS8sJjywAY/k+Z9a34QVGLXRQ/Kwli5cedB8C9zz4NwFVfSFGO\nomdkVlZ1ipTR/LBK5gtvr5RZcr89pYzzPf/zYwAGvOIvz0g6BMrjQEvd6OSKyouljW8M1Qkfqsxq\nnNldc2an0JFEgRNfmgrA6KXbvhxqXTSXz2tHcFqhJF8Nz5Lzrd1J2KS8b0X2LR2lStuF2vnbNSLB\nF+ErAEKq/nB2nFrHUaWtNVbN7gsdYYTfqdpFrfFla5ulhICRrOdkq76Ygdm6PpZmhs22dGkYCWGa\n5R79oCTS3X3icO+3oqB4n8pzhMk8QBVX6GhiXdBMJE6hbPXgki0JrWcmwywNyTh57tfiHaucJext\n8RJV4KBK1s/ZKuN0dnVs/AhUiT2cWvl069VnkwyAboOMUW5IPE/5LAVgjPr02uTtMHOi/ebzp/Zk\nGUc/GvsgEBtHTcZ8ZLbY7cPbZbztSDKluY0+lk7cWr+3JO6WqKr0KUlWuUBT+200JTjH5wlL/Vb+\nHr7lntRavOTGBMYb75mf5NBktrGxnUIz8eAEAwQLi/nD+Fd9yyOGm+iNN4T1L0PkSuO9s5jet51/\nI3J75/32eLVCG+Oh2mekUhIWN5+7HwCzb/YXzdKeivfq5VqWvlGh2qwOkUKyaXEgxBEFq9BlqM3+\nqWUVg4o3fbjyUDn25s0dPmZXhWWGLSwsLCwsLCwsui0yywzjkhWIEuwnMaDhtUpHqINsZLRWYrgi\nimWpL5XZdcFLIpA9fF4sTlPvWc/wRp4p7OPFTwgrsuSIRwH428sixr0lIjGi026Uss0Dp31iHF1Y\nSq+UZwfidRKGjhVWjHCwRJjSXc/+yrdaTBrJHyt
ssg8AMxpk3VHT6v2HSiH+KFVsrivghXl7cctR\ncm00g7X6YGnTqH+oq5gmeSsfjL4XUfHJ7+0vxVred0pb307ZS7Miod1Ecu+xp+8GoGeWtHXavAlq\ngxfS1WIPUYNldENy7TJyDY1rYbalS8MYbnT52fBKiZn89yGjvd8iGzYCsOYakUCbf5Wwg1WqfGvv\nYGox5algQp+Vbf5uCunv+akUEij9hSzPXyhx8cOdZbJBnHG4+dJkeUrPc+bFwKrS15rVymCMsD6m\nbtOEX87x/W7KSzUa9Yg1W9YYTfz+0uVt4zHD+tps2lO+l8Qam/AxWsB1CDQkPlZqz8Heua0X2fAK\nMHXEa2h4Ppym1FyozUJxk0K4Zz6bjtmF43u8q5aI3TX7qjHg0471R/3cSAjaG6LsGTrRz7bWKU9u\nT0fadvl88UaXfisynx32sDdvAg5FbeQMeIV21OX7ZO0waQPbvgyz9uR5Y0mKzzvLDFtYWFhYWFhY\nWHRbZJQZjkQDbK3PI3+UxEUFPGa4YxmzmsnxvmuWQc9CmwtBmwU9FMqeku+7F58BQJ8HVXnQGVLc\nuUeVKsNozEI0Msq+KfusP34MAK+XSWyRmfVsorUSi2e/L0U2xsyU4DRTFWBbIHurw6B/ZsFR/uWX\nHvNvAN74jbC0Uc0qZCC+WSsMxIXRhpVHCvs+xCh1O+h1udVWpLd5Fp0BdU2j1TUtfgomJ9zQqXCC\nQYI9ezMyf7FvualwYTKd+c8IrRZZKHkCTq7yoJmxhyZ72xoSZC63hafJRItiRZO/B8D0QTKOmio8\nZox1yx2m3iazzO2h44VxW6O+pxIP6kQh2Nh+I02lFO1BXHi1jLdZVYMAiOSrkuNZ/k/UZyAvdo2z\nc5UCQo58FuXJjdMjW54vZcWrfW1oT8XARFOvjjHm4UKXdQdFvWdhjfLs6HN+p16WF30q3iF9RgkX\nlUgg9t/0voa/L8XCZu/9kFpDqbw4/ud5j+d7JtaGJFDvwrwmmKiGgPZyIGqq/Ax6RvJ3TGhvfJrf\nVdp9GXYcZyjwBDAQCXt/yHXdvziO0wf4O1AGLANOc113x4uqNtDg1rIg8glNrtxEgxnBMKeckNvE\nfGYBjHMc5y26jT3q+JJPaaQBB6fb2yO0dTPrXnyWXR9fQSDgcNHZxVx+US/CjXV8O+sp6Gb2WL0m\nws+v2MzG9S6BgENg0heMOnX3bts/Vq4Ocd7llaytDBEIOEw+uyeXX9SLpkg9X6x+FbqZPaDtMQQo\ndxznW7rRM6Z2XQ2fu++LPSIOg51RDGNUt7XHytUhzr18HRWVEYJqTKUXRCRMstvZQ98vTSGZYDS/\nX+qoprvZI11IhBkOA1e7rjvHcZwi4HM1WJ8HvOO67v85jvML4BfAdW3tKNoUpHZlERv2kDf7/jNk\nVtHRjFmnUNi3qJqiZ1f7yzW3xkaYs4khv10k/9wobGv2m58CLbOcve3cKOXsRrHTm7AbYjbv0Mcd\nwFqW0Yf+bKJyAfAOCdgjYSi7aPb2mCs+8P2s49FMmFnRsxpi9h39qMFwd7CcooNDObunxR7BqnqK\n31zInZvLALisl8RlT+29DIBHf34MENOAzgibHWemH+uzqnxzSR8AfnPSi2w+0OHyiUOpromy05GV\nrBk/mg3Tp9E/0JOtkP7+0YURDMLvbijm0PG9qK6JMviwBfTbewjL+Lrz7pdU0Ymehqwsh1tv7Muu\nuwWproly4KQKjji4gKUbZ9Gnx3A21i1L3B452bhDB7Bz7get/qxZHpN1C4SN81P3/rZib9saQ4Bq\n13XLE33GtH4AzV4pBrOHZM5ffIUoCmg7aUbdZMd+sly05T9aJDHkTkA9X5KIj3/uIFGqmJjrZ51N\nFv/C/u+zgTBrSg6jZ05/GtasZLb7Dn2c/snbw4VAAsywRnPPIcCSkx5MeNtU0ZYyR1aWw59u6Mue\nu+dSVwv7HL2
Spskrqf1wDiTZP3LzQuw0JsZKm/ZfGZJ8nPDqNb7lCY8JHRg7qq4S72M8tvqRrcLQ\n937rOwDq9P0S6CP3i/u2d78EySbshhK2R0VTMbetnsTzI0VrOKoyA3RPMPWXqTa8JJ2RvxMPhgb5\n6uv2B2LeuoF3plbrod0zcV13reu6c9T/1cBCYDBwAvC4Wu1x4EcptWQ7Qa6TT7EjwvJZTjYFFNFI\nPetZQymeFJO1Rze1R5/+2YwaJ8lURYUBBowsYGtlI5WhFQzOKderdRt7DBwQZI/dZNJSVBigqKw3\nDRtqu23/KB2QxYTd5SFXVBhg5/IcVleEqaz5lsE9d9OrdRt7QNtjCLBRrdZtbFLSP4ueOZLI7bfH\nauiG9igdkMWeu4sfX98zkU1V1M1ZCN3QHm3dL9l4oRXdxh7pQlIxw47jlAF7Ap8AA1zXXQvywuw4\nTv92D1YH/T4NUHmIMHn9vYpzCQacGLGzkYHSIbIdFdu1SjI5NccZGLezt2mgWirCuDkys6m/T9aa\nsVgYvdFvfiaHUPFzXuZsG2x1vVtLNVvoSR+aaCTXyQc3cXu0CyNTc91k0T78bb/7fKvFm1Wb2owX\nzTvH+3/Qh1JZJ53saqr2cCNRIlVVPH6fMMBTfyXnqWP5HrpUFBp+N13iuyNfLUr7ObRslCk5YMSe\nq3jtRXcOAeD4HnVqxQDLVoZY9GkYd6dTaYreSFaV0mBNV//YzrBsZYitizbSe5cBHesfuITcSAsG\nq8NojzRrLV4zTWIZAdU/5s5vZN8JeTSFN5IbFMYyUXtEswPUDy6iPFtnsBeqffsbaY4P2bVdt1qU\nOYaglNtTuWfMGM1lV4t+7uSeMwBoVM8TzRJqduxLVZ1z4yli1/LVftWJZHDxa2cDMGfvvwMxRQoz\nPvmAPGlD3R5DZfma76hmC72C/WgKJ2cPJwrB1oUhEoLORekIzFjoeIgbjx0H+p7pd34JG+6thiT7\nR14wzJjiyri/N0RN5jN9eSneu4VShaq4QpjNeRP8dQJMtvr3M34IwJj16h2l2fPOvF/y1RiQqD3q\nq/OY9+4Y8Jhhf36BCTeQeR158/le/WPRY15whdhtxL8vBCSOt7X1E0XCHLfjOIXAi8BU13Wrkthu\nsuM4nzmO81m4oTapxnVlhN0wX/AxOzGerCRu6Ob2CNGFsnFShLWHHzW1UU69oILSQ35EMDev/Q0U\ndnR7jLv8ALJ7tF3+tzma22P9xszL93QWamqjnHFhJXf8roTiosRdjb7+0bTjjKfQsTFkR71fACLh\nxpTsEa7bsfqHHkPu+F0JwYLEy9s3t0n95hRmB10M6XjmRmp3rD6SChJihh3HyUZehJ92XfcltXid\n4zilagZSCrQ63XJd9yHgIYCe2f3dvtMXk3+OqnZSLKoSkaoqfSC9UevtMDIXt46RWdDKsOzHXb7K\nt/6K4/p4///u/NeBWAWxX39+AgCjz1G1x3VscAJailE3yhd8zECG0d8ZDEAOuTS6qipTgvYodvrE\nOVFth6jv+/GT3/et1ppKRHNoRkjHIJX+qeUN44ZTL5mXVnsEgvS/XxQ8pv5UmPA7S2VGvJ96pxz7\npGTPLzxGJr6RdbLrFqx+GmM/vUx0HV+pZp3L/iAZ6YsPu99bNxRy2eOcWsqO2om+dzfBW5/xHXk0\nOk3gpqF/bGcIhVzOuGAtZ55UyIeHiA5zR/pH+W757jv1BUwqSM9LjxM22CtVYQylVtJcrUZLzYZ6\n+DcJdKByXCjkcsaFlZxxUiGH/yDI1mg9OcECGhuE4U3UHj36DXVrS7MYppRLIu142DZHxGsRbOx6\nk4p4Y0iYUDbEt0mb94sRK5xVKtzRg+cJo2TG7WpWTLOVJ750CQCjV4vqho41dhPQyHWUB1KvW7NA\nPYtkSGuh32pW+qoYH2DV359msLaHEyCHvKTskT9wqJuK+kk8daJtgVDI5ccXr
uPMkwo56dhC/vbt\nFr7rk09tbWOb9gC/TfqM7eduaCxsbTUA8gKGUlEHla70+4RXDZQYI+weMB6Ax6f+Wf3if7E37V42\nzdCRj0Ti3i+6Dyf8TpZf6o56bhOzzpLz2y+v7Zj27F5Gh+pMGQkjX0vfv+fd9JpvtdI3jdfYDrap\nXUrCcRwHeARY6LruHc1+eg04V/1/LvCque2OCNd1+YrP6EERw50x3vJ+DGIty/VXa49ubI8Lr6qk\nz4gi9j6nmT2cQax1l+mv3coeV1yzhbHlOVx5cW9veXfvHzuXZzN1SqxyQP+CUayp9kp1dxt7QNtj\nCNBXfe02NnFdl4p//p3ckv5+ewSGQDe1x+Sr17NzebZvDBlwwAjopvb4yp3d6v0SwgsN6Db2SBcS\nYYYPAM4B5juOM1ctux74P+B5x3EuQORTT21vR244TGT9elaskSomvc6Uql79HtD1v9VsOl6sh56l\nqVnbuv1kdv1wxSEARBs2qf3IaQ35Yyy78OHpxwEQaJB9j1o4Fx8SVLLYykYqWEEhPZnlvgXAaMYx\nnJ08qShgKwnYIx50DXg9k1xzrcQW6Vjh2KytdVbKZIz3/+x8AEo/ntdy5RTZ006xh7oWX1+4KwCz\nXhSmeC81gb69VGL3rp++OwBzz9wJgIiqpKXhVb1S8DQ7W5s5qr6lvQ/6u2bOdbyh3uc39+8JwNJj\nhRHW1+RvM2p4alo1hfkFLHrlI2iqF3sExjI/OhPS0D+2J3zyaRPPv1jPbmMjTDhiBUsbn2eXyft2\nqH+squ/N1fNOYdL3ngZa6sImi0gvv3qC2+BnPaINMZdqsF8/APp/36+Pmp2kPuqHn9Tz1LRqdhub\nwz5HrqIynMvRU8sZmbcHczdPhyTsEcmGmiGxMaDRlfPRzJIeB3QLvw2LnbKVq1jfBalo2aYDbY0h\ny1lUrKSiEnrGNIcZK7zw15KwebDyMJmxwlp14716+b7zn5YCENa603UqHyCRMdOoitV3vn8b05vX\nnCmeMbuBqvmfkVfWn1nuFwCMjuxBWWBnlkcXJmwP0RmOfTeVArYXzJjdwNPTahg3NocJR4hSe9mU\neRw5eRgPPvffpPpHbWMOs5cPhxGt/35OkdQ+eORHJwJQ8LKqM2BqcZvPEP280KpM+vo3xi5A/QkT\nAbju9icAGK/2qWOzzT5x3TphkPPemy+HVsu3uuupcJe3er+sYgnJ2MNtaCTy5TecM+1SAL49W55n\n9ar6nTm2nrCT9Mf5aah+FxdxvG0r75eJ0OSeovShq+j2fl/dp2q9jipjtfsy7LruR8RPGzm8Q0fd\njtHLKeEITmn1t704hLfdaQtc1+02drH28GP8PnnMXjaMX512AQDupzKQ4QTZK3gYb0f+3q3ssd/E\nXDasGuSVKz5p8ZEA1Drru2X/OGDfPJrWjPRCmG7dNAqAikAeE/v+iOlr7+1W9oC2xxBcFrmuu3dm\nW7RtceC++ezxj18B0O94Kf7kBLxHdbe0R9MaCa/S983Va0v1z93OHr2cEo5wTm11UlbgFlHlbipv\nZTOLdpDRCnQaIx+Xi1j8W4n9rH0gse1isy7Z/vSDhFF++eUDARiGYoJbUVdw/ysuSD1nSEdd77RD\nx8io2WSwRDxAE0/+wreayQC1h5J7CuIea1vUFG8T0Ujs2qhrdu3VPwfgn3ffCcRqtd88QOzy5j+k\natOVj0hVvWF3CQMeTSY5QLFE8cKNtp4lGayHXCPxg9MHPOz7XV+Ty6+7HIDCT1V1r65RyMSeAAAO\nY0lEQVTYz7ZTZFc4DLklQOWLcl37BqQfJMoQmxWdXj7iHgBO/+2VAJS9IjG7gah0gtqyIm/dLeeL\nFugXuz4LtNTwThQ1rtzbug8/+G8ptziq4uOk9gPg5kVpKq/3vreXwT+nvgyAQIWoUXldPaPlozof\n5j3n7D0OgBnH3Q5AyBXbayZYx+tmq/4z5
dkpAJRVfNzq/hKByU71/nw9AJUR6bv9lXKI9iiZrOCt\nu04D4PZe8myLbNlK0nAh2LhDpB20UETZqaCiQ/vJ3hqg/2t5NB4iY4a2uzmGnPWHfwLw4ga5PwNK\nfSluXpOxPLiLhC98fV0sPnnB4TLe6DFjzPsSZXrBOHlvua6v37P50puSjzKyQfXDTlRNGn2DnN9P\nDhJN7SeGi3b51qiML7FnruTvTDp4MgBZ/0lfBdsWnlx1v317974ALJno172+YYkox2VVqLquKSp/\nZFAx2cLCwsLCwsLCwqJrIbPMsCNv/8F3JeZz6VTRAW76pbD6OsY3kCdBXVHFkJoxtNEDJZbm5F5S\ny3vew1I73YsZaU0hIeCfeXdFps5Uy1h7msTCvj5MYoV1bFE8Nsr8fY/Zosc78D8qProDme/bAmZ8\nro7bOjYwFYCbb5UZoo79O6pArveXl4md7j1HNDpv++AHAAx8X+Z8PRcLKxOskAqVvhjREklmqh4r\nWd/r9pZtTpgkDO+tA1t3X8xulGNP/aUwwkXPW0a401BbD7Pn88NfXwPAS/97KwBDlJqCZnc0NPOn\nWSWTfdMxe19fpHQ+LxQ2z1QVaA2Jah1rBlm3radis+/YJG7fMX8VxrAj/pn8nBB7DI8p6JiZ36Zm\n6Be1cl+EK9bJ4u1kPEgKrZxTr79IjGFpi34i9tLX+fNGGT9HP7gSiMUKdzQGsTmiS2WfL1TLM++S\nXvJdx/Hq/qT7y+HSTbjhSFm/cNpsWZAE6eVEIat+x2CGQ4aaw6Qeizq0n8CmWoqem8Wuxwmzufiw\nxwBoVA9drbx0cS/JDTjhadG2P+Nr0YmumCXhGQWqQJ0blD5SM1TsPHIfua4PlT8KxMYmgFVh6V/7\n3SHPsfJXZCfj3lzpa6P2HoyaViPHUMvT0Q9bRSDo5UdsPK0EgOtfk3wc7X3VSjQ63I3rZdziP6pt\n6p1LM8TJeJtcowKmfv9b8qTkDC05RJ73G5RdSpRXZdUsUdEoQ5jhdnPO2oFlhi0sLCwsLCwsLLot\nMssMuypzWbG0g66U2ciZ/5aqPA9+dTIA+a/O9m/W6M/yPvBe+f3U1y8DoHytyvhsi43rarGxGs2Y\nDN3uYF9hJ4+d/KFv1XhslI4704zw2rDMKIueEv1lfe7erI1OqtaWZpgMcQ+lKnHzkrMAePx+keZ6\nZNhHvu0063LJ8eI54Hj50DPLTWrS2uDG7NkrIMcalhVfg7I5pq6VnI2FPxsLQNHszmOEswI7Tkxn\nyucSCNL7cYmhO2+FsPGjbpGY8QeHyHJ9P5hxhia7FGNOBTEmONhifb1ulpGLr5m9ePvSywsDwnbc\nubkMgDd+eoCs+M1877yk8SSM0pyt/Grov0CVYDXVZRpUHLtuy7IarbsujFSqTEqXg+PgZGV757P0\njxJz+cYIyZA340JjDLHY/vJvTgegcOUSWZzGvArdpjlVqgS5xwz72ftYFTK5lnVnS6xw4QvJM7yO\nC1nNYob1+Zp91fQodBTx1I2Sgb5nTc1s89n3Vu1o9V/HYofLJwuzPOpBUVr67vvCEJveIO1NeG/c\nK7JgXOv7M8ectWr432Xm2d46w38v65TOFQ94zQ9FXeLYAn8hkIuXSiysl3yt0VnvMNGI19fDK8XT\nNHeSaPru9oCc8Px9nwFi/fOdXUTrd9cX5Vk8fPJaACIbRdErmfhdrdSx4ZwJAPz6uicBKM9+F4Bx\ns6TC3OyJj/m2G/yef9xKlTm3zLCFhYWFhYWFhUW3RebVJJrPQpYsA+Dhq04C4Pq/PA7A5fv+FIBR\nz0hsZ9MAiRHpfZMwgQtrZIZVfplkNnoqDJ0VU9OZcN0WcW5rzpI4sdf7S0BOe5XmzOpF924SRiR3\nk0xP3e/tIb9vqfO2cTZL1b9otbDISWloZhgmQ6xVJlZ9T+w2/jJRmzjjItFcNLNyNXSsUUkHhDbv\n3iyMz
sOPHgvAoLtU3wvN97ctjYywW1xA0wH7cNswXalIWIrsVtRSuirMtt42THTgz50kCg78e1py\nO2w2fujcgxWHyXX93gkXA9B4uowb940TNmO/vKBqi3nh2+4I/vXjVHls5/enqiRO95aXRbe0/G7R\nxGStltxT934HWJ9N4R48uWl/9lIVGk0dWVM947uZ0ofL0AGPO47HAQDXxQ01sfECGf8WnSuMsB4/\n21MbqZw7AIBCR66Rp1OcBkZOe+UmFC/3LTc9DWY+yD/2/CsAF0z8mSz4JIn7xXUJhGLjuT7/ROPd\ntyXMWOpHtgpTeceT8q4w/D5doOajFtsmtP96YWNHny0qCkd9T5QdFp8tHpz99xRJu+NL5Pc8R1j1\nHEf6wuqQaN7OrhbB4g+WC1Md/EwUaIa9JjG1QxcuiB3TyFsqWCXP3pEvi3qJG5RzHiZCFuSh4nIz\nofyk963Go8h6UZwZdKIUsTtskig11VwqnooHdn0KgC+V5vuiOeJ1PX2evLs1fCpKWPnr/JUWG3vH\n3nXqdhaP/2E7i613zv4UgGtfOgeAMX8WD83gCrnWB74my+8cJ5EEOe/LGOr18BTHs+3nqWphYWFh\nYWFhYWGRZmwTnWEvhlWxabn/khnB7VVnAnDEHTIbu/qstwHYGJWYknNmSyGDET+RmUSLmUAXZDXb\nQtPIfJbevAcX7yaxwRtCwvxd1OdPag3NBLY9kzcZj9/3lxlT5EnR261UmaArI7Ea6NVRmQHfvuJo\nAIKnyG8difnJFDzW1ZgpD7hLYrDee0zYg38c/X0AVh8r65+wu9hhv8LvADgwX+L1mlt1ZoMoksys\nlhn+y1+KYsnAfwlT0+sNiUst3SLH8qyivRKdoBoRKnFZc36jF8esY/62B2ZHw9Tx1Oey5nyVB/Dv\nDuxUsxjK9prlKX5G4rYRQpj/HS6xd5v3k6zjdUIY0me09PF9BkgW8lG9hL0Zmb1Bflfx4wOC+d4h\n61RFplXqMldE5Dw+qhHFl+lrJHZ83WLJxh6ssqyLPhAvxYiNEs8c1n03DWxPw7I8vvlpOaffJ7kB\nfx4qcXx5io2/f7NUSfz7Y1LDo+zPM33b72hKJ9FeBdQfOpHHf3MHAJUReT7o6m7ZhgeuTi0foC5F\nwdrOG/OCA6SC4fg86aMbIqLfqp9gJivVoNrQRzHFS69Ua5yexDEbo/RYXsNhX54AQCSq1DOy5Lpn\nB6Tv5QUV6xmU7zmq/+er5bnqu/4sCOrqZPI9LyDrFQRiuT16mWZUW3x3/NsE1YjaT+17TqOM5f/z\ngrCBo//0FQBD1PgbSVUJRb07eNq2qjpruZL7VpwsjxWIooFTVOhbP7JBmFO3Ua7jMPzxvWY9Azmk\nv19F58o5lV/STlszme/k9X3VM9U4lTNd3tH6TJfFvxn9YwDWHyTXacPBct32GCkxxzlHy1i6sUG8\ndptqRYWidk2xd6jiufLOsfJheeY6M0T1aiRqrDSucenkLQD8caC8J7qhr+K0vWOwzLCFhYWFhYWF\nhUW3heNmkPlzHGc9UAtsyNhBOxcltDyX4a7r9ktkY2sPP6w9/LD28MPaww9rDz+sPfxQ9lgeZz/b\nI1KyB+xwfcTaw4/U7pdMvgwDOI7z2Y5SSzwd52Ltkf59dBVYe/hh7eGHtYcf1h5+pOtcdhSbWHv4\nYe3hR6rnYcMkLCwsLCwsLCwsui3sy7CFhYWFhYWFhUW3xbZ4GX5oGxyzs5COc7H2SP8+ugqsPfyw\n9vDD2sMPaw8/0nUuO4pNrD38sPbwI6XzyHjMsIWFhYWFhYWFhUVXgQ2TsLCwsLCwsLCw6LawL8MW\nFhYWFhYWFhbdFhl7GXYcZ5LjON84jrPYcZxfZOq46YDjOEMdx3nXcZyFjuN86TjOFWr5TY7jrHYc\nZ676OyaJfVp7tNzvdmkTaw8/rD38sPbww9rDD2uPlrDPXD+sPfzolHv
Gdd1O/0Mq334HjARygHnA\nLpk4dpraXwpMUP8XAYuAXYCbgGusPVKzx/ZuE2sPaw9rD2sPa4+uaxNrD2uP9v4yxQxPBBa7rrvE\ndd0m4DnghAwdO2W4rrvWdd056v9qYCEwOIVdWnu0xHZrE2sPP6w9/LD28MPaww9rj5awz1w/rD38\n6Ix7JlMvw4OBlc2+ryL1m32bwHGcMmBP4BO16FLHcb5wHOdRx3F6J7gba4+W2CFsYu3hh7WHH9Ye\nflh7+GHt0RL2meuHtYcf6bpnMvUy7LSybLvTdHMcpxB4EZjqum4VcD8wChgPrAVuT3RXrSzrzvaA\nHcAm1h5+WHv4Ye3hh7WHH9YeLWGfuX5Ye/iRznsmUy/Dq4Chzb4PAdZk6NhpgeM42YjRn3Zd9yUA\n13XXua4bcV03CjyMuB4SgbVHS2zXNrH28MPaww9rDz+sPfyw9mgJ+8z1w9rDj3TfM5l6Gf4UKHcc\nZ4TjODnA6cBrGTp2ynAcxwEeARa6rntHs+WlzVY7EViQ4C6tPVpiu7WJtYcf1h5+WHv4Ye3hh7VH\nS9hnrh/WHn50xj2Tlb7mxYfrumHHcS4F3kCyGB91XffLTBw7TTgAOAeY7zjOXLXseuAMx3HGI+6F\nZcCURHZm7dES27lNrD38sPbww9rDD2sPP6w9WsI+c/2w9vAj7feMLcdsYWFhYWFhYWHRbWEr0FlY\nWFhYWFhYWHRb2JdhCwsLCwsLCwuLbgv7MmxhYWFhYWFhYdFtYV+GLSwsLCwsLCwsui3sy7CFhYWF\nhYWFhUW3hX0ZtrCwsLCwsLCw6LawL8MWFhYWFhYWFhbdFv8PSCysRqLbFWMAAAAASUVORK5CYII=\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%matplotlib inline\n", + "plt.figure(figsize=[12, 4])\n", + "for i in range(20):\n", + " plt.subplot(2, 10, i+1)\n", + " plt.imshow(X_train[i].reshape([28, 28]))\n", + " plt.title(str(y_train[i]))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# < a whole lot of your code >" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```\n", + "\n", + "```\n", + "\n", + "```\n", + "\n", + "```\n", + "\n", + "```\n", + "\n", + "```\n", + "\n", + "```\n", + "\n", + "```\n", + "\n", + "```\n", + "\n", + "```\n", + "\n", + "```\n", + "\n", + "```\n", + "\n", + "```\n", + "\n", + "```\n", + "\n", + "```\n", + "\n", + "```\n", + "\n", + "```\n", + "\n", + "```\n", + "\n", + "```\n", + "\n", + "```\n", + "\n", + "```\n", + "\n", + "```\n", + "\n", + "```\n", + 
"\n", + "```\n", + "\n", + "```\n", + "\n", + "```\n", + "\n", + "```\n", + "\n", + "```\n", + "\n", + "```\n", + "\n", + "```\n", + "\n", + "\n", + "# SPOILERS!\n", + "\n", + "Recommended pipeline\n", + "\n", + "* Adapt logistic regression from previous assignment to classify one letter against others (e.g. A vs the rest)\n", + "* Generalize it to multiclass logistic regression.\n", + " - Either try to remember lecture 0 or google it.\n", + " - Instead of weight vector you'll have to use matrix (feature_id x class_id)\n", + " - softmax (exp over sum of exps) can implemented manually or as nn.Softmax (layer) F.softmax (function)\n", + " - probably better to use STOCHASTIC gradient descent (minibatch) for greater speed\n", + " - you can also try momentum/rmsprop/adawhatever\n", + " - in which case sample should probably be shuffled (or use random subsamples on each iteration)\n", + "* Add a hidden layer. Now your logistic regression uses hidden neurons instead of inputs.\n", + " - Hidden layer uses the same math as output layer (ex-logistic regression), but uses some nonlinearity (e.g. sigmoid) instead of softmax\n", + " - You need to train both layers, not just output layer :)\n", + " - 50 hidden neurons and a sigmoid nonlinearity will do for a start. Many ways to improve. \n", + " - In ideal case this totals to 2 torch.matmul's, 1 softmax and 1 relu/sigmoid\n", + " - __make sure this neural network works better than logistic regression__\n", + " \n", + "* Now's the time to try improving the network. 
Consider layers (size, neuron count), nonlinearities, optimization methods, initialization - whatever you want, but please avoid convolutions for now.\n", + " \n", + "* If anything seems wrong, try going through one step of training and printing everything you compute.\n", + "* If you see NaNs midway through optimization, you can estimate log P(y|x) as via F.log_softmax(layer_before_softmax)\n", + "\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/week4_[recap]_deep_learning/seminar_tensorflow.ipynb b/week04_[recap]_deep_learning/seminar_tensorflow.ipynb similarity index 100% rename from week4_[recap]_deep_learning/seminar_tensorflow.ipynb rename to week04_[recap]_deep_learning/seminar_tensorflow.ipynb diff --git a/week4_approx_rl/README.md b/week04_approx_rl/README.md similarity index 69% rename from week4_approx_rl/README.md rename to week04_approx_rl/README.md index 94c043c4e..d75903fc9 100644 --- a/week4_approx_rl/README.md +++ b/week04_approx_rl/README.md @@ -1,6 +1,6 @@ ## Materials -* [__lecture slides I__](https://yadi.sk/i/kGPiXpse3NR3n8), [__slides II__](https://yadi.sk/i/H07O_XEh3NR3oV) -* Our [lecture](https://yadi.sk/i/AHDU2p_j3FT3nr), [second lecture](https://yadi.sk/i/yBO0q4mI3GAxYd), [seminar](https://yadi.sk/i/EeUeheri3FT3ra) (russian) +* [__slides__](https://docs.google.com/presentation/d/1HEfIyKT0rIuUQCGAsR1PIVGirccDXu5LQvxhVUjuIqM) +* Our [lecture](https://yadi.sk/i/Gd9yWV1dpuB7BQ), [seminar](https://yadi.sk/i/mvtKAIRN2yKU2g) (russian) * David Silver lecture - [video](https://www.youtube.com/watch?v=UoPei5o4fps) @@ -18,6 +18,7 @@ * Article on dueling DQN - 
[arxiv](https://arxiv.org/pdf/1511.06581.pdf) * Article on double DQN - [arxiv](https://arxiv.org/abs/1509.06461) * Article on prioritized experience replay - [arxiv](https://arxiv.org/abs/1511.05952) +* Article on Rainbow: Combining Improvements in Deep Reinforcement Learning - [arxiv](https://arxiv.org/abs/1710.02298) * Article on bootstrap DQN - [pdf](https://papers.nips.cc/paper/6501-deep-exploration-via-bootstrapped-dqn.pdf), [summary](http://pemami4911.github.io/paper-summaries/2016/08/16/Deep-exploration.html) * Article on asynchronuous methods in deep RL - [arxiv](https://arxiv.org/abs/1602.01783) * Successor representations for reinforcement learning - [article](https://arxiv.org/abs/1606.02396), [video](https://www.youtube.com/watch?v=kNqXCn7K-BM&feature=youtu.be) @@ -33,10 +34,17 @@ ## Practice -From now on, we have two tracks, theano and tensorflow. We'll also add pytorch support soon. +* Seminar: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/yandexdataschool/Practical_RL/blob/spring19/week04_approx_rl/seminar_pytorch.ipynb) +* Homework (main): [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/yandexdataschool/Practical_RL/blob/spring19/week04_approx_rl/homework_pytorch_main.ipynb#scrollTo=KVvvo7k_ap8w) +* Homework (debug): [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/yandexdataschool/Practical_RL/blob/spring19/week04_approx_rl/homework_pytorch_debug.ipynb#scrollTo=KVvvo7k_ap8w) -You can choose whichever track you want, but unless you're expertly familiar with your framework, we recommend you to start by completing the task in lasagne and only then reproduce your solution in your chosen framework. + + +From now on, we have two tracks, for pytorch and tensorflow. However, pytorch track is somewhat better supported by the course team. 
You can choose whichever track you want, but unless you're expertly familiar with your framework, we recommend you to start by completing the task in pytorch and only then reproduce your solution in your chosen framework. Begin with `seminar_.ipynb` and then proceed with `homework_.ipynb`. + __Note: you're not required to submit assignments in all three frameworks. Pick one and go with it. Maybe switch it occasionally if you want more challenge. __ + + diff --git a/week04_approx_rl/atari_wrappers.py b/week04_approx_rl/atari_wrappers.py new file mode 100644 index 000000000..e89e34ee5 --- /dev/null +++ b/week04_approx_rl/atari_wrappers.py @@ -0,0 +1,115 @@ +# taken from OpenAI baselines. + +import numpy as np +import gym + +class MaxAndSkipEnv(gym.Wrapper): + def __init__(self, env, skip=4): + """Return only every `skip`-th frame""" + gym.Wrapper.__init__(self, env) + # most recent raw observations (for max pooling across time steps) + self._obs_buffer = np.zeros((2,)+env.observation_space.shape, dtype=np.uint8) + self._skip = skip + + def step(self, action): + """Repeat action, sum reward, and max over last observations.""" + total_reward = 0.0 + done = None + for i in range(self._skip): + obs, reward, done, info = self.env.step(action) + if i == self._skip - 2: self._obs_buffer[0] = obs + if i == self._skip - 1: self._obs_buffer[1] = obs + total_reward += reward + if done: + break + # Note that the observation on the done=True frame + # doesn't matter + max_frame = self._obs_buffer.max(axis=0) + + return max_frame, total_reward, done, info + + def reset(self, **kwargs): + return self.env.reset(**kwargs) + + +class ClipRewardEnv(gym.RewardWrapper): + def __init__(self, env): + gym.RewardWrapper.__init__(self, env) + + def reward(self, reward): + """Bin reward to {+1, 0, -1} by its sign.""" + return np.sign(reward) + + +class FireResetEnv(gym.Wrapper): + def __init__(self, env): + """Take action on reset for environments that are fixed until firing.""" + 
gym.Wrapper.__init__(self, env) + assert env.unwrapped.get_action_meanings()[1] == 'FIRE' + assert len(env.unwrapped.get_action_meanings()) >= 3 + + def reset(self, **kwargs): + self.env.reset(**kwargs) + obs, _, done, _ = self.env.step(1) + if done: + self.env.reset(**kwargs) + obs, _, done, _ = self.env.step(2) + if done: + self.env.reset(**kwargs) + return obs + + def step(self, ac): + return self.env.step(ac) + + +class EpisodicLifeEnv(gym.Wrapper): + def __init__(self, env): + """Make end-of-life == end-of-episode, but only reset on true game over. + Done by DeepMind for the DQN and co. since it helps value estimation. + """ + gym.Wrapper.__init__(self, env) + self.lives = 0 + self.was_real_done = True + + def step(self, action): + obs, reward, done, info = self.env.step(action) + self.was_real_done = done + # check current lives, make loss of life terminal, + # then update lives to handle bonus lives + lives = self.env.unwrapped.ale.lives() + if lives < self.lives and lives > 0: + # for Qbert sometimes we stay in lives == 0 condition for a few frames + # so it's important to keep lives > 0, so that we only reset once + # the environment advertises done. + done = True + self.lives = lives + return obs, reward, done, info + + def reset(self, **kwargs): + """Reset only when lives are exhausted. + This way all states are still reachable even though lives are episodic, + and the learner need not know about any of this behind-the-scenes. 
+ """ + if self.was_real_done: + obs = self.env.reset(**kwargs) + else: + # no-op step to advance from terminal/lost life state + obs, _, _, _ = self.env.step(0) + self.lives = self.env.unwrapped.ale.lives() + return obs + + +# in torch imgs have shape [c, h, w] instead of common [h, w, c] +class AntiTorchWrapper(gym.ObservationWrapper): + def __init__(self, env): + gym.ObservationWrapper.__init__(self, env) + + self.img_size = [env.observation_space.shape[i] + for i in [1, 2, 0] + ] + self.observation_space = gym.spaces.Box(0.0, 1.0, self.img_size) + + def _observation(self, img): + """what happens to each observation""" + img = img.transpose(1, 2, 0) + return img \ No newline at end of file diff --git a/week4_approx_rl/framebuffer.py b/week04_approx_rl/framebuffer.py similarity index 100% rename from week4_approx_rl/framebuffer.py rename to week04_approx_rl/framebuffer.py diff --git a/week4_approx_rl/homework_lasagne.ipynb b/week04_approx_rl/homework_lasagne.ipynb similarity index 100% rename from week4_approx_rl/homework_lasagne.ipynb rename to week04_approx_rl/homework_lasagne.ipynb diff --git a/week04_approx_rl/homework_pytorch_debug.ipynb b/week04_approx_rl/homework_pytorch_debug.ipynb new file mode 100644 index 000000000..5c456b432 --- /dev/null +++ b/week04_approx_rl/homework_pytorch_debug.ipynb @@ -0,0 +1,738 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Deep Q-Network implementation.\n", + "\n", + "This homework shamelessly demands you to implement a DQN - an approximate q-learning algorithm with experience replay and target networks - and see if it works any better this way.\n", + "\n", + "Original paper:\n", + "https://arxiv.org/pdf/1312.5602.pdf" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**This notebook is given for debug.** The main task is in the other notebook (**homework_pytorch_main**). The tasks are similar and share most of the code. The main difference is in environments. 
In main notebook it can take some 2 hours for the agent to start improving so it seems reasonable to launch the algorithm on a simpler env first. Here it is CartPole and it will train in several minutes.\n", + "\n", + "**We suggest the following pipeline:** First implement debug notebook then implement the main one.\n", + "\n", + "**About evaluation:** All points are given for the main notebook with one exception: if agent fails to beat the threshold in main notebook you can get 1 pt (instead of 3 pts) for beating the threshold in debug notebook." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# # in google colab uncomment this\n", + "\n", + "# import os\n", + "\n", + "# os.system('apt-get install -y xvfb')\n", + "# os.system('wget https://raw.githubusercontent.com/yandexdataschool/Practical_DL/fall18/xvfb -O ../xvfb')\n", + "# os.system('apt-get install -y python-opengl ffmpeg')\n", + "# os.system('pip install pyglet==1.2.4')\n", + "\n", + "# os.system('python -m pip install -U pygame --user')\n", + "\n", + "# prefix = 'https://raw.githubusercontent.com/yandexdataschool/Practical_RL/spring19/week04_approx_rl/'\n", + "\n", + "# os.system('wget ' + prefix + 'atari_wrappers.py')\n", + "# os.system('wget ' + prefix + 'utils.py')\n", + "# os.system('wget ' + prefix + 'replay_buffer.py')\n", + "# os.system('wget ' + prefix + 'framebuffer.py')\n", + "\n", + "# print('setup complete')\n", + "\n", + "# XVFB will be launched if you run on a server\n", + "import os\n", + "if type(os.environ.get(\"DISPLAY\")) is not str or len(os.environ.get(\"DISPLAY\")) == 0:\n", + " !bash ../xvfb start\n", + " %env DISPLAY = : 1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "__Frameworks__ - we'll accept this homework in any deep learning framework. This particular notebook was designed for pytoch, but you find it easy to adapt it to almost any python-based deep learning framework." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import random\n", + "import numpy as np\n", + "import torch\n", + "import utils" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import gym\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### CartPole again" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ENV_NAME = 'CartPole-v1'\n", + "\n", + "def make_env(seed=None):\n", + " # CartPole is wrapped with a time limit wrapper by default\n", + " env = gym.make(ENV_NAME).unwrapped\n", + " if seed is not None:\n", + " env.seed(seed)\n", + " return env" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "env = make_env()\n", + "env.reset()\n", + "state_shape, n_actions = env.observation_space.shape, env.action_space.n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Building a network" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We now need to build a neural network that can map observations to state q-values.\n", + "The model does not have to be huge yet. 1-2 hidden layers with < 200 neurons and ReLU activation will probably be enough. Batch normalization and dropout can spoil everything here." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "import torch.nn as nn\n", + "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n", + "# those who have a GPU but feel unfair to use it can uncomment:\n", + "# device = torch.device('cpu')\n", + "device" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class DQNAgent(nn.Module):\n", + " def __init__(self, state_shape, n_actions, epsilon=0):\n", + "\n", + " super().__init__()\n", + " self.epsilon = epsilon\n", + " self.n_actions = n_actions\n", + " self.state_shape = state_shape\n", + " # Define your network body here. Please make sure agent is fully contained here\n", + " assert len(state_shape) == 1\n", + " state_dim = state_shape[0]\n", + " \n", + "\n", + " \n", + " def forward(self, state_t):\n", + " \"\"\"\n", + " takes agent's observation (tensor), returns qvalues (tensor)\n", + " :param state_t: a batch states, shape = [batch_size, *state_dim=4]\n", + " \"\"\"\n", + " # Use your network to compute qvalues for given state\n", + " qvalues = \n", + "\n", + " assert qvalues.requires_grad, \"qvalues must be a torch tensor with grad\"\n", + " assert len(\n", + " qvalues.shape) == 2 and qvalues.shape[0] == state_t.shape[0] and qvalues.shape[1] == n_actions\n", + "\n", + " return qvalues\n", + "\n", + " def get_qvalues(self, states):\n", + " \"\"\"\n", + " like forward, but works on numpy arrays, not tensors\n", + " \"\"\"\n", + " model_device = next(self.parameters()).device\n", + " states = torch.tensor(states, device=model_device, dtype=torch.float32)\n", + " qvalues = self.forward(states)\n", + " return qvalues.data.cpu().numpy()\n", + "\n", + " def sample_actions(self, qvalues):\n", + " \"\"\"pick actions given qvalues. Uses epsilon-greedy exploration strategy. 
\"\"\"\n", + " epsilon = self.epsilon\n", + " batch_size, n_actions = qvalues.shape\n", + "\n", + " random_actions = np.random.choice(n_actions, size=batch_size)\n", + " best_actions = qvalues.argmax(axis=-1)\n", + "\n", + " should_explore = np.random.choice(\n", + " [0, 1], batch_size, p=[1-epsilon, epsilon])\n", + " return np.where(should_explore, random_actions, best_actions)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "agent = DQNAgent(state_shape, n_actions, epsilon=0.5).to(device)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's try out our agent to see if it raises any errors." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def evaluate(env, agent, n_games=1, greedy=False, t_max=10000):\n", + " \"\"\" Plays n_games full games. If greedy, picks actions as argmax(qvalues). Returns mean reward. \"\"\"\n", + " rewards = []\n", + " for _ in range(n_games):\n", + " s = env.reset()\n", + " reward = 0\n", + " for _ in range(t_max):\n", + " qvalues = agent.get_qvalues([s])\n", + " action = qvalues.argmax(axis=-1)[0] if greedy else agent.sample_actions(qvalues)[0]\n", + " s, r, done, _ = env.step(action)\n", + " reward += r\n", + " if done:\n", + " break\n", + "\n", + " rewards.append(reward)\n", + " return np.mean(rewards)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "evaluate(env, agent, n_games=1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Experience replay\n", + "For this assignment, we provide you with experience replay buffer. 
If you implemented experience replay buffer in last week's assignment, you can copy-paste it here in main notebook **to get 2 bonus points**.\n", + "\n", + "![img](https://github.com/yandexdataschool/Practical_RL/raw/master/yet_another_week/_resource/exp_replay.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### The interface is fairly simple:\n", + "* `exp_replay.add(obs, act, rw, next_obs, done)` - saves (s,a,r,s',done) tuple into the buffer\n", + "* `exp_replay.sample(batch_size)` - returns observations, actions, rewards, next_observations and is_done for `batch_size` random samples.\n", + "* `len(exp_replay)` - returns number of elements stored in replay buffer." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from replay_buffer import ReplayBuffer\n", + "exp_replay = ReplayBuffer(10)\n", + "\n", + "for _ in range(30):\n", + " exp_replay.add(env.reset(), env.action_space.sample(),\n", + " 1.0, env.reset(), done=False)\n", + "\n", + "obs_batch, act_batch, reward_batch, next_obs_batch, is_done_batch = exp_replay.sample(\n", + " 5)\n", + "\n", + "assert len(exp_replay) == 10, \"experience replay size should be 10 because that's what maximum capacity is\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def play_and_record(initial_state, agent, env, exp_replay, n_steps=1):\n", + " \"\"\"\n", + " Play the game for exactly n steps, record every (s,a,r,s', done) to replay buffer. 
\n", + " Whenever game ends, add record with done=True and reset the game.\n", + " It is guaranteed that env has done=False when passed to this function.\n", + "\n", + " PLEASE DO NOT RESET ENV UNLESS IT IS \"DONE\"\n", + "\n", + " :returns: return sum of rewards over time and the state in which the env stays\n", + " \"\"\"\n", + " s = initial_state\n", + " sum_rewards = 0\n", + "\n", + " # Play the game for n_steps as per instructions above\n", + " \n", + "\n", + " return sum_rewards, s" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# testing your code.\n", + "exp_replay = ReplayBuffer(2000)\n", + "\n", + "state = env.reset()\n", + "play_and_record(state, agent, env, exp_replay, n_steps=1000)\n", + "\n", + "# if you're using your own experience replay buffer, some of those tests may need correction.\n", + "# just make sure you know what your code does\n", + "assert len(exp_replay) == 1000, \"play_and_record should have added exactly 1000 steps, \"\\\n", + " \"but instead added %i\" % len(exp_replay)\n", + "is_dones = list(zip(*exp_replay._storage))[-1]\n", + "\n", + "assert 0 < np.mean(is_dones) < 0.1, \"Please make sure you restart the game whenever it is 'done' and record the is_done correctly into the buffer.\"\\\n", + " \"Got %f is_done rate over %i steps. 
[If you think it's your tough luck, just re-run the test]\" % (\n", + " np.mean(is_dones), len(exp_replay))\n", + "\n", + "for _ in range(100):\n", + " obs_batch, act_batch, reward_batch, next_obs_batch, is_done_batch = exp_replay.sample(\n", + " 10)\n", + " assert obs_batch.shape == next_obs_batch.shape == (10,) + state_shape\n", + " assert act_batch.shape == (\n", + " 10,), \"actions batch should have shape (10,) but is instead %s\" % str(act_batch.shape)\n", + " assert reward_batch.shape == (\n", + " 10,), \"rewards batch should have shape (10,) but is instead %s\" % str(reward_batch.shape)\n", + " assert is_done_batch.shape == (\n", + " 10,), \"is_done batch should have shape (10,) but is instead %s\" % str(is_done_batch.shape)\n", + " assert [int(i) in (0, 1)\n", + " for i in is_dones], \"is_done should be strictly True or False\"\n", + " assert [\n", + " 0 <= a < n_actions for a in act_batch], \"actions should be within [0, n_actions]\"\n", + "\n", + "print(\"Well done!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Target networks\n", + "\n", + "We also employ the so called \"target network\" - a copy of neural network weights to be used for reference Q-values:\n", + "\n", + "The network itself is an exact copy of agent network, but it's parameters are not trained. 
Instead, they are moved here from agent's actual network every so often.\n", + "\n", + "$$ Q_{reference}(s,a) = r + \\gamma \\cdot \\max _{a'} Q_{target}(s',a') $$\n", + "\n", + "![img](https://github.com/yandexdataschool/Practical_RL/raw/master/yet_another_week/_resource/target_net.png)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "target_network = DQNAgent(agent.state_shape, agent.n_actions, epsilon=0.5).to(device)\n", + "# This is how you can load weights from agent into target network\n", + "target_network.load_state_dict(agent.state_dict())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Learning with... Q-learning\n", + "Here we write a function similar to `agent.update` from tabular q-learning." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Compute Q-learning TD error:\n", + "\n", + "$$ L = { 1 \\over N} \\sum_i [ Q_{\\theta}(s,a) - Q_{reference}(s,a) ] ^2 $$\n", + "\n", + "With Q-reference defined as\n", + "\n", + "$$ Q_{reference}(s,a) = r(s,a) + \\gamma \\cdot max_{a'} Q_{target}(s', a') $$\n", + "\n", + "Where\n", + "* $Q_{target}(s',a')$ denotes q-value of next state and next action predicted by __target_network__\n", + "* $s, a, r, s'$ are current state, action, reward and next state respectively\n", + "* $\\gamma$ is a discount factor defined two cells above.\n", + "\n", + "\n", + "__Note 1:__ there's an example input below. Feel free to experiment with it before you write the function.\n", + "\n", + "__Note 2:__ compute_td_loss is a source of 99% of bugs in this homework. If reward doesn't improve, it often helps to go through it line by line [with a rubber duck](https://rubberduckdebugging.com/)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def compute_td_loss(states, actions, rewards, next_states, is_done,\n", + " agent, target_network,\n", + " gamma=0.99,\n", + " check_shapes=False,\n", + " device=device):\n", + " \"\"\" Compute td loss using torch operations only. Use the formulae above. \"\"\"\n", + " states = torch.tensor(states, device=device, dtype=torch.float) # shape: [batch_size, *state_shape]\n", + "\n", + " # for some torch reason should not make actions a tensor\n", + " actions = torch.tensor(actions, device=device, dtype=torch.long) # shape: [batch_size]\n", + " rewards = torch.tensor(rewards, device=device, dtype=torch.float) # shape: [batch_size]\n", + " # shape: [batch_size, *state_shape]\n", + " next_states = torch.tensor(next_states, device=device, dtype=torch.float)\n", + " is_done = torch.tensor(\n", + " is_done.astype('float32'),\n", + " device=device,\n", + " dtype=torch.float\n", + " ) # shape: [batch_size]\n", + " is_not_done = 1 - is_done\n", + "\n", + " # get q-values for all actions in current states\n", + " predicted_qvalues = agent(states)\n", + "\n", + " # compute q-values for all actions in next states\n", + " predicted_next_qvalues = target_network(next_states)\n", + " \n", + " # select q-values for chosen actions\n", + " predicted_qvalues_for_actions = predicted_qvalues[range(\n", + " len(actions)), actions]\n", + "\n", + " # compute V*(next_states) using predicted next q-values\n", + " next_state_values = \n", + "\n", + " assert next_state_values.dim(\n", + " ) == 1 and next_state_values.shape[0] == states.shape[0], \"must predict one value per state\"\n", + "\n", + " # compute \"target q-values\" for loss - it's what's inside square parentheses in the above formula.\n", + " # at the last state use the simplified formula: Q(s,a) = r(s,a) since s' doesn't exist\n", + " # you can multiply next state values by is_not_done to achieve this.\n", + " 
target_qvalues_for_actions = \n", + "\n", + " # mean squared error loss to minimize\n", + " loss = torch.mean((predicted_qvalues_for_actions -\n", + " target_qvalues_for_actions.detach()) ** 2)\n", + "\n", + " if check_shapes:\n", + " assert predicted_next_qvalues.data.dim(\n", + " ) == 2, \"make sure you predicted q-values for all actions in next state\"\n", + " assert next_state_values.data.dim(\n", + " ) == 1, \"make sure you computed V(s') as maximum over just the actions axis and not all axes\"\n", + " assert target_qvalues_for_actions.data.dim(\n", + " ) == 1, \"there's something wrong with target q-values, they must be a vector\"\n", + "\n", + " return loss" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Sanity checks" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "obs_batch, act_batch, reward_batch, next_obs_batch, is_done_batch = exp_replay.sample(\n", + " 10)\n", + "\n", + "loss = compute_td_loss(obs_batch, act_batch, reward_batch, next_obs_batch, is_done_batch,\n", + " agent, target_network,\n", + " gamma=0.99, check_shapes=True)\n", + "loss.backward()\n", + "\n", + "assert loss.requires_grad and tuple(loss.data.size()) == (\n", + " ), \"you must return scalar loss - mean over batch\"\n", + "assert np.any(next(agent.parameters()).grad.data.cpu().numpy() !=\n", + " 0), \"loss must be differentiable w.r.t. network weights\"\n", + "assert np.all(next(target_network.parameters()).grad is None), \"target network should not have grads\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Main loop\n", + "\n", + "It's time to put everything together and see if it learns anything." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from tqdm import trange\n", + "from IPython.display import clear_output\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "seed = \n", + "random.seed(seed)\n", + "np.random.seed(seed)\n", + "torch.manual_seed(seed)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "env = make_env(seed)\n", + "state_dim = env.observation_space.shape\n", + "n_actions = env.action_space.n\n", + "state = env.reset()\n", + "\n", + "agent = DQNAgent(state_dim, n_actions, epsilon=1).to(device)\n", + "target_network = DQNAgent(state_dim, n_actions, epsilon=1).to(device)\n", + "target_network.load_state_dict(agent.state_dict())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "exp_replay = ReplayBuffer(10**4)\n", + "for i in range(100):\n", + " if not utils.is_enough_ram(min_available_gb=0.1):\n", + " print(\"\"\"\n", + " Less than 100 Mb RAM available. 
\n", + " Make sure the buffer size in not too huge.\n", + " Also check, maybe other processes consume RAM heavily.\n", + " \"\"\"\n", + " )\n", + " break\n", + " play_and_record(state, agent, env, exp_replay, n_steps=10**2)\n", + " if len(exp_replay) == 10**4:\n", + " break\n", + "print(len(exp_replay))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "timesteps_per_epoch = 1\n", + "batch_size = 32\n", + "total_steps = 4 * 10**4\n", + "decay_steps = 1 * 10**4\n", + "\n", + "opt = torch.optim.Adam(agent.parameters(), lr=1e-4)\n", + "\n", + "init_epsilon = 1\n", + "final_epsilon = 0.1\n", + "\n", + "loss_freq = 20\n", + "refresh_target_network_freq = 100\n", + "eval_freq = 1000\n", + "\n", + "max_grad_norm = 5000" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "mean_rw_history = []\n", + "td_loss_history = []\n", + "grad_norm_history = []\n", + "initial_state_v_history = []" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "state = env.reset()\n", + "for step in trange(total_steps + 1):\n", + " if not utils.is_enough_ram():\n", + " print('less that 100 Mb RAM available, freezing')\n", + " print('make sure everything is ok and make KeyboardInterrupt to continue')\n", + " try:\n", + " while True:\n", + " pass\n", + " except KeyboardInterrupt:\n", + " pass\n", + "\n", + " agent.epsilon = utils.linear_decay(init_epsilon, final_epsilon, step, decay_steps)\n", + "\n", + " # play\n", + " _, state = play_and_record(state, agent, env, exp_replay, timesteps_per_epoch)\n", + "\n", + " # train\n", + " < sample batch_size of data from experience replay >\n", + "\n", + " loss = < compute TD loss >\n", + "\n", + " loss.backward()\n", + " grad_norm = nn.utils.clip_grad_norm_(agent.parameters(), max_grad_norm)\n", + " opt.step()\n", + " opt.zero_grad()\n", + "\n", + 
" if step % loss_freq == 0:\n", + " td_loss_history.append(loss.data.cpu().item())\n", + " grad_norm_history.append(grad_norm)\n", + "\n", + " if step % refresh_target_network_freq == 0:\n", + " # Load agent weights into target_network\n", + " \n", + "\n", + " if step % eval_freq == 0:\n", + " # eval the agent\n", + " mean_rw_history.append(evaluate(\n", + " make_env(seed=step), agent, n_games=3, greedy=True, t_max=1000)\n", + " )\n", + " initial_state_q_values = agent.get_qvalues(\n", + " [make_env(seed=step).reset()]\n", + " )\n", + " initial_state_v_history.append(np.max(initial_state_q_values))\n", + "\n", + " clear_output(True)\n", + " print(\"buffer size = %i, epsilon = %.5f\" %\n", + " (len(exp_replay), agent.epsilon))\n", + "\n", + " plt.figure(figsize=[16, 9])\n", + " plt.subplot(2, 2, 1)\n", + " plt.title(\"Mean reward per episode\")\n", + " plt.plot(mean_rw_history)\n", + " plt.grid()\n", + "\n", + " assert not np.isnan(td_loss_history[-1])\n", + " plt.subplot(2, 2, 2)\n", + " plt.title(\"TD loss history (smoothened)\")\n", + " plt.plot(utils.smoothen(td_loss_history))\n", + " plt.grid()\n", + "\n", + " plt.subplot(2, 2, 3)\n", + " plt.title(\"Initial state V\")\n", + " plt.plot(initial_state_v_history)\n", + " plt.grid()\n", + "\n", + " plt.subplot(2, 2, 4)\n", + " plt.title(\"Grad norm history (smoothened)\")\n", + " plt.plot(utils.smoothen(grad_norm_history))\n", + " plt.grid()\n", + "\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "final_score = evaluate(\n", + " make_env(),\n", + " agent, n_games=30, greedy=True, t_max=1000\n", + ")\n", + "print('final score:', final_score)\n", + "assert final_score > 300, 'not good enough for DQN'\n", + "print('Well done')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "Python 3", 
+ "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/week04_approx_rl/homework_pytorch_main.ipynb b/week04_approx_rl/homework_pytorch_main.ipynb new file mode 100644 index 000000000..4c0d16637 --- /dev/null +++ b/week04_approx_rl/homework_pytorch_main.ipynb @@ -0,0 +1,1302 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Deep Q-Network implementation.\n", + "\n", + "This homework shamelessly demands you to implement a DQN - an approximate q-learning algorithm with experience replay and target networks - and see if it works any better this way.\n", + "\n", + "Original paper:\n", + "https://arxiv.org/pdf/1312.5602.pdf" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**This notebook is the main notebook.** Another notebook is given for debugging (**homework_pytorch_debug**). The tasks are similar and share most of the code. The main difference is in environments. In the main notebook it can take some 2 hours for the agent to start improving, so it seems reasonable to launch the algorithm on a simpler env first. In the debug notebook it is CartPole, and it will train in several minutes.\n", + "\n", + "**We suggest the following pipeline:** First implement the debug notebook, then implement the main one.\n", + "\n", + "**About evaluation:** All points are given for the main notebook with one exception: if the agent fails to beat the threshold in the main notebook you can get 1 pt (instead of 3 pts) for beating the threshold in the debug notebook." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# # in google colab uncomment this\n", + "\n", + "# import os\n", + "\n", + "# os.system('apt-get install -y xvfb')\n", + "# os.system('wget https://raw.githubusercontent.com/yandexdataschool/Practical_DL/fall18/xvfb -O ../xvfb')\n", + "# os.system('apt-get install -y python-opengl ffmpeg')\n", + "# os.system('pip install pyglet==1.2.4')\n", + "\n", + "# os.system('python -m pip install -U pygame --user')\n", + "\n", + "# prefix = 'https://raw.githubusercontent.com/yandexdataschool/Practical_RL/spring19/week04_approx_rl/'\n", + "\n", + "# os.system('wget ' + prefix + 'atari_wrappers.py')\n", + "# os.system('wget ' + prefix + 'utils.py')\n", + "# os.system('wget ' + prefix + 'replay_buffer.py')\n", + "# os.system('wget ' + prefix + 'framebuffer.py')\n", + "\n", + "# print('setup complete')\n", + "\n", + "# XVFB will be launched if you run on a server\n", + "import os\n", + "if type(os.environ.get(\"DISPLAY\")) is not str or len(os.environ.get(\"DISPLAY\")) == 0:\n", + " !bash ../xvfb start\n", + " %env DISPLAY = : 1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "__Frameworks__ - we'll accept this homework in any deep learning framework. This particular notebook was designed for PyTorch, but you will find it easy to adapt it to almost any python-based deep learning framework." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import random\n", + "import numpy as np\n", + "import torch\n", + "import utils" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import gym\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Let's play some old videogames\n", + "![img](https://github.com/yandexdataschool/Practical_RL/raw/master/yet_another_week/_resource/nerd.png)\n", + "\n", + "This time we're gonna apply approximate q-learning to an atari game called Breakout. It's not the hardest thing out there, but it's definitely way more complex than anything we tried before.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ENV_NAME = \"BreakoutNoFrameskip-v4\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Preprocessing (3 pts)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's see what observations look like." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "env = gym.make(ENV_NAME)\n", + "env.reset()\n", + "\n", + "n_cols = 5\n", + "n_rows = 2\n", + "fig = plt.figure(figsize=(16, 9))\n", + "\n", + "for row in range(n_rows):\n", + " for col in range(n_cols):\n", + " ax = fig.add_subplot(n_rows, n_cols, row * n_cols + col + 1)\n", + " ax.imshow(env.render('rgb_array'))\n", + " env.step(env.action_space.sample())\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Let's play a little.**\n", + "\n", + "Pay attention to zoom and fps args of play function. Control: A, D, space." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# does not work in colab.\n", + "# make keyboard interrupt to continue\n", + "\n", + "from gym.utils.play import play\n", + "\n", + "play(env=gym.make(ENV_NAME), zoom=5, fps=30)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Processing game image \n", + "\n", + "Raw atari images are large, 210x160x3 by default. However, we don't need that level of detail in order to learn them.\n", + "\n", + "We can thus save a lot of time by preprocessing game image, including\n", + "* Resizing to a smaller shape, 64 x 64\n", + "* Converting to grayscale\n", + "* Cropping irrelevant image parts (top, bottom and edges)\n", + "\n", + "Also please keep one dimension for channel so that final shape would be 1 x 64 x 64.\n", + "\n", + "Tip: You can implement your own grayscale converter and assign a huge weight to the red channel. This dirty trick is not necessary but it will speed up learning." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from gym.core import ObservationWrapper\n", + "from gym.spaces import Box\n", + "\n", + "\n", + "class PreprocessAtariObs(ObservationWrapper):\n", + " def __init__(self, env):\n", + " \"\"\"A gym wrapper that crops, scales image into the desired shapes and grayscales it.\"\"\"\n", + " ObservationWrapper.__init__(self, env)\n", + "\n", + " self.img_size = (1, 64, 64)\n", + " self.observation_space = Box(0.0, 1.0, self.img_size)\n", + "\n", + "\n", + " def _to_gray_scale(self, rgb, channel_weights=[0.8, 0.1, 0.1]):\n", + " \n", + "\n", + "\n", + " def _observation(self, img):\n", + " \"\"\"what happens to each observation\"\"\"\n", + "\n", + " # Here's what you need to do:\n", + " # * crop image, remove irrelevant parts\n", + " # * resize image to self.img_size\n", + " # (use imresize from any library you want,\n", + " # e.g. 
opencv, skimage, PIL, keras)\n", + " # * cast image to grayscale\n", + " # * convert image pixels to (0,1) range, float32 type\n", + " \n", + " return < ... >" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import gym\n", + "# spawn game instance for tests\n", + "env = gym.make(ENV_NAME) # create raw env\n", + "env = PreprocessAtariObs(env)\n", + "observation_shape = env.observation_space.shape\n", + "n_actions = env.action_space.n\n", + "env.reset()\n", + "obs, _, _, _ = env.step(env.action_space.sample())\n", + "\n", + "# test observation\n", + "assert obs.ndim == 3, \"observation must be [channel, h, w] even if there's just one channel\"\n", + "assert obs.shape == observation_shape\n", + "assert obs.dtype == 'float32'\n", + "assert len(np.unique(obs)) > 2, \"your image must not be binary\"\n", + "assert 0 <= np.min(obs) and np.max(\n", + " obs) <= 1, \"convert image pixels to [0,1] range\"\n", + "\n", + "print(\"Formal tests seem fine. Here's an example of what you'll get.\")\n", + "\n", + "n_cols = 5\n", + "n_rows = 2\n", + "fig = plt.figure(figsize=(16, 9))\n", + "obs = env.reset()\n", + "for row in range(n_rows):\n", + " for col in range(n_cols):\n", + " ax = fig.add_subplot(n_rows, n_cols, row * n_cols + col + 1)\n", + " ax.imshow(obs[0, :, :], interpolation='none', cmap='gray')\n", + " obs, _, _, _ = env.step(env.action_space.sample())\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Wrapping." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**About the game:** You have 5 lives and get points for breaking the wall. Higher bricks cost more than the lower ones. There are 4 actions: start game (should be called at the beginning and after each life is lost), move left, move right and do nothing. There are some common wrappers used for Atari environments." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "import atari_wrappers\n", + "\n", + "def PrimaryAtariWrap(env, clip_rewards=True):\n", + " assert 'NoFrameskip' in env.spec.id\n", + "\n", + " # This wrapper holds the same action for `skip` frames and outputs\n", + " # the maximal pixel value of 2 last frames (to handle blinking\n", + " # in some envs)\n", + " env = atari_wrappers.MaxAndSkipEnv(env, skip=4)\n", + "\n", + " # This wrapper sends done=True when each life is lost\n", + " # (not all the 5 lives that are given by the game rules).\n", + " # It should make it easier for the agent to understand that losing is bad.\n", + " env = atari_wrappers.EpisodicLifeEnv(env)\n", + "\n", + " # This wrapper launches the ball when an episode starts.\n", + " # Without it the agent has to learn this action, too.\n", + " # Actually it can, but learning would take longer.\n", + " env = atari_wrappers.FireResetEnv(env)\n", + "\n", + " # This wrapper transforms rewards to {-1, 0, 1} according to their sign\n", + " if clip_rewards:\n", + " env = atari_wrappers.ClipRewardEnv(env)\n", + "\n", + " # This wrapper is yours :)\n", + " env = PreprocessAtariObs(env)\n", + " return env" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Let's see if the game is still playable after applying the wrappers.**\n", + "While playing, the EpisodicLifeEnv wrapper may seem not to work, but it actually does: after a life is lost a new ball is launched automatically, which means that the FireResetEnv wrapper understands that a new episode has begun." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# does not work in colab.\n", + "# make keyboard interrupt to continue\n", + "\n", + "from gym.utils.play import play\n", + "\n", + "def make_play_env():\n", + " env = gym.make(ENV_NAME)\n", + " env = PrimaryAtariWrap(env)\n", + "# in torch imgs have shape [c, h, w] instead of common [h, w, c]\n", + " env = atari_wrappers.AntiTorchWrapper(env)\n", + " return env\n", + "\n", + "play(make_play_env(), zoom=10, fps=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Frame buffer\n", + "\n", + "Our agent can only process one observation at a time, so we gotta make sure it contains enough information to find optimal actions. For instance, agent has to react to moving objects so he must be able to measure object's velocity.\n", + "\n", + "To do so, we introduce a buffer that stores 4 last images. This time everything is pre-implemented for you, not really by the staff of the course :)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "from framebuffer import FrameBuffer\n", + "\n", + "def make_env(clip_rewards=True, seed=None):\n", + " env = gym.make(ENV_NAME) # create raw env\n", + " if seed is not None:\n", + " env.seed(seed)\n", + " env = PrimaryAtariWrap(env, clip_rewards)\n", + " env = FrameBuffer(env, n_frames=4, dim_order='pytorch')\n", + " return env\n", + "\n", + "env = make_env()\n", + "env.reset()\n", + "n_actions = env.action_space.n\n", + "state_shape = env.observation_space.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for _ in range(12):\n", + " obs, _, _, _ = env.step(env.action_space.sample())\n", + "\n", + "plt.figure(figsize=[12,10])\n", + "plt.title(\"Game image\")\n", + "plt.imshow(env.render(\"rgb_array\"))\n", + "plt.show()\n", + "\n", + 
"plt.figure(figsize=[15,15])\n", + "plt.title(\"Agent observation (4 frames top to bottom)\")\n", + "plt.imshow(utils.img_by_obs(obs, state_shape), cmap='gray')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## DQN as it is (4 pts)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Building a network\n", + "\n", + "We now need to build a neural network that can map images to state q-values. This network will be called on every agent's step so it better not be resnet-152 unless you have an array of GPUs. Instead, you can use strided convolutions with a small number of features to save time and memory.\n", + "\n", + "You can build any architecture you want, but for reference, here's something that will more or less work:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![img](https://github.com/yandexdataschool/Practical_RL/raw/master/yet_another_week/_resource/dqn_arch.png)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "import torch.nn as nn\n", + "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n", + "# those who have a GPU but feel unfair to use it can uncomment:\n", + "# device = torch.device('cpu')\n", + "device" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def conv2d_size_out(size, kernel_size, stride):\n", + " \"\"\"\n", + " common use case:\n", + " cur_layer_img_w = conv2d_size_out(cur_layer_img_w, kernel_size, stride)\n", + " cur_layer_img_h = conv2d_size_out(cur_layer_img_h, kernel_size, stride)\n", + " to understand the shape for dense layer's input\n", + " \"\"\"\n", + " return (size - (kernel_size - 1) - 1) // stride + 1\n", + "\n", + "\n", + "class Flatten(nn.Module):\n", + " def __init__(self):\n", + " super().__init__()\n", + " \n", + " def forward(self, x):\n", + " return 
x.view(x.size(0), -1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class DQNAgent(nn.Module):\n", + " def __init__(self, state_shape, n_actions, epsilon=0):\n", + "\n", + " super().__init__()\n", + " self.epsilon = epsilon\n", + " self.n_actions = n_actions\n", + " self.state_shape = state_shape\n", + "\n", + " # Define your network body here. Please make sure agent is fully contained here\n", + " \n", + "\n", + " def forward(self, state_t):\n", + " \"\"\"\n", + " takes agent's observation (tensor), returns qvalues (tensor)\n", + " :param state_t: a batch of 4-frame buffers, shape = [batch_size, 4, h, w]\n", + " \"\"\"\n", + " # Use your network to compute qvalues for given state\n", + " qvalues = \n", + "\n", + " assert qvalues.requires_grad, \"qvalues must be a torch tensor with grad\"\n", + " assert len(\n", + " qvalues.shape) == 2 and qvalues.shape[0] == state_t.shape[0] and qvalues.shape[1] == n_actions\n", + "\n", + " return qvalues\n", + "\n", + " def get_qvalues(self, states):\n", + " \"\"\"\n", + " like forward, but works on numpy arrays, not tensors\n", + " \"\"\"\n", + " model_device = next(self.parameters()).device\n", + " states = torch.tensor(states, device=model_device, dtype=torch.float)\n", + " qvalues = self.forward(states)\n", + " return qvalues.data.cpu().numpy()\n", + "\n", + " def sample_actions(self, qvalues):\n", + " \"\"\"pick actions given qvalues. Uses epsilon-greedy exploration strategy. 
\"\"\"\n", + " epsilon = self.epsilon\n", + " batch_size, n_actions = qvalues.shape\n", + "\n", + " random_actions = np.random.choice(n_actions, size=batch_size)\n", + " best_actions = qvalues.argmax(axis=-1)\n", + "\n", + " should_explore = np.random.choice(\n", + " [0, 1], batch_size, p=[1-epsilon, epsilon])\n", + " return np.where(should_explore, random_actions, best_actions)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "agent = DQNAgent(state_shape, n_actions, epsilon=0.5).to(device)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's try out our agent to see if it raises any errors." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def evaluate(env, agent, n_games=1, greedy=False, t_max=10000):\n", + " \"\"\" Plays n_games full games. If greedy, picks actions as argmax(qvalues). Returns mean reward. \"\"\"\n", + " rewards = []\n", + " for _ in range(n_games):\n", + " s = env.reset()\n", + " reward = 0\n", + " for _ in range(t_max):\n", + " qvalues = agent.get_qvalues([s])\n", + " action = qvalues.argmax(axis=-1)[0] if greedy else agent.sample_actions(qvalues)[0]\n", + " s, r, done, _ = env.step(action)\n", + " reward += r\n", + " if done:\n", + " break\n", + "\n", + " rewards.append(reward)\n", + " return np.mean(rewards)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "evaluate(env, agent, n_games=1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Experience replay\n", + "For this assignment, we provide you with experience replay buffer. 
If you implemented experience replay buffer in last week's assignment, you can copy-paste it here **to get 2 bonus points**.\n", + "\n", + "![img](https://github.com/yandexdataschool/Practical_RL/raw/master/yet_another_week/_resource/exp_replay.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### The interface is fairly simple:\n", + "* `exp_replay.add(obs, act, rw, next_obs, done)` - saves (s,a,r,s',done) tuple into the buffer\n", + "* `exp_replay.sample(batch_size)` - returns observations, actions, rewards, next_observations and is_done for `batch_size` random samples.\n", + "* `len(exp_replay)` - returns number of elements stored in replay buffer." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from replay_buffer import ReplayBuffer\n", + "exp_replay = ReplayBuffer(10)\n", + "\n", + "for _ in range(30):\n", + " exp_replay.add(env.reset(), env.action_space.sample(),\n", + " 1.0, env.reset(), done=False)\n", + "\n", + "obs_batch, act_batch, reward_batch, next_obs_batch, is_done_batch = exp_replay.sample(\n", + " 5)\n", + "\n", + "assert len(exp_replay) == 10, \"experience replay size should be 10 because that's what maximum capacity is\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def play_and_record(initial_state, agent, env, exp_replay, n_steps=1):\n", + " \"\"\"\n", + " Play the game for exactly n steps, record every (s,a,r,s', done) to replay buffer. 
\n", + " Whenever game ends, add record with done=True and reset the game.\n", + " It is guaranteed that env has done=False when passed to this function.\n", + "\n", + " PLEASE DO NOT RESET ENV UNLESS IT IS \"DONE\"\n", + "\n", + " :returns: return sum of rewards over time and the state in which the env stays\n", + " \"\"\"\n", + " s = initial_state\n", + " sum_rewards = 0\n", + "\n", + " # Play the game for n_steps as per instructions above\n", + " \n", + "\n", + " return sum_rewards, s" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# testing your code.\n", + "exp_replay = ReplayBuffer(2000)\n", + "\n", + "state = env.reset()\n", + "play_and_record(state, agent, env, exp_replay, n_steps=1000)\n", + "\n", + "# if you're using your own experience replay buffer, some of those tests may need correction.\n", + "# just make sure you know what your code does\n", + "assert len(exp_replay) == 1000, \"play_and_record should have added exactly 1000 steps, \"\\\n", + " \"but instead added %i\" % len(exp_replay)\n", + "is_dones = list(zip(*exp_replay._storage))[-1]\n", + "\n", + "assert 0 < np.mean(is_dones) < 0.1, \"Please make sure you restart the game whenever it is 'done' and record the is_done correctly into the buffer.\"\\\n", + " \"Got %f is_done rate over %i steps. 
[If you think it's your tough luck, just re-run the test]\" % (\n", + " np.mean(is_dones), len(exp_replay))\n", + "\n", + "for _ in range(100):\n", + " obs_batch, act_batch, reward_batch, next_obs_batch, is_done_batch = exp_replay.sample(\n", + " 10)\n", + " assert obs_batch.shape == next_obs_batch.shape == (10,) + state_shape\n", + " assert act_batch.shape == (\n", + " 10,), \"actions batch should have shape (10,) but is instead %s\" % str(act_batch.shape)\n", + " assert reward_batch.shape == (\n", + " 10,), \"rewards batch should have shape (10,) but is instead %s\" % str(reward_batch.shape)\n", + " assert is_done_batch.shape == (\n", + " 10,), \"is_done batch should have shape (10,) but is instead %s\" % str(is_done_batch.shape)\n", + " assert [int(i) in (0, 1)\n", + " for i in is_dones], \"is_done should be strictly True or False\"\n", + " assert [\n", + " 0 <= a < n_actions for a in act_batch], \"actions should be within [0, n_actions)\"\n", + "\n", + "print(\"Well done!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Target networks\n", + "\n", + "We also employ the so-called \"target network\" - a copy of neural network weights to be used for reference Q-values:\n", + "\n", + "The network itself is an exact copy of agent network, but its parameters are not trained. 
Instead, they are moved here from agent's actual network every so often.\n", + "\n", + "$$ Q_{reference}(s,a) = r + \\gamma \\cdot \\max _{a'} Q_{target}(s',a') $$\n", + "\n", + "![img](https://github.com/yandexdataschool/Practical_RL/raw/master/yet_another_week/_resource/target_net.png)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "target_network = DQNAgent(agent.state_shape, agent.n_actions, epsilon=0.5).to(device)\n", + "# This is how you can load weights from agent into target network\n", + "target_network.load_state_dict(agent.state_dict())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Learning with... Q-learning\n", + "Here we write a function similar to `agent.update` from tabular q-learning." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Compute Q-learning TD error:\n", + "\n", + "$$ L = { 1 \\over N} \\sum_i [ Q_{\\theta}(s,a) - Q_{reference}(s,a) ] ^2 $$\n", + "\n", + "With Q-reference defined as\n", + "\n", + "$$ Q_{reference}(s,a) = r(s,a) + \\gamma \\cdot max_{a'} Q_{target}(s', a') $$\n", + "\n", + "Where\n", + "* $Q_{target}(s',a')$ denotes q-value of next state and next action predicted by __target_network__\n", + "* $s, a, r, s'$ are current state, action, reward and next state respectively\n", + "* $\\gamma$ is a discount factor defined two cells above.\n", + "\n", + "\n", + "__Note 1:__ there's an example input below. Feel free to experiment with it before you write the function.\n", + "\n", + "__Note 2:__ compute_td_loss is a source of 99% of bugs in this homework. If reward doesn't improve, it often helps to go through it line by line [with a rubber duck](https://rubberduckdebugging.com/)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def compute_td_loss(states, actions, rewards, next_states, is_done,\n", + " agent, target_network,\n", + " gamma=0.99,\n", + " check_shapes=False,\n", + " device=device):\n", + " \"\"\" Compute td loss using torch operations only. Use the formulae above. \"\"\"\n", + " states = torch.tensor(states, device=device, dtype=torch.float) # shape: [batch_size, *state_shape]\n", + "\n", + " # for some torch reason should not make actions a tensor\n", + " actions = torch.tensor(actions, device=device, dtype=torch.long) # shape: [batch_size]\n", + " rewards = torch.tensor(rewards, device=device, dtype=torch.float) # shape: [batch_size]\n", + " # shape: [batch_size, *state_shape]\n", + " next_states = torch.tensor(next_states, device=device, dtype=torch.float)\n", + " is_done = torch.tensor(\n", + " is_done.astype('float32'),\n", + " device=device,\n", + " dtype=torch.float\n", + " ) # shape: [batch_size]\n", + " is_not_done = 1 - is_done\n", + "\n", + " # get q-values for all actions in current states\n", + " predicted_qvalues = agent(states)\n", + "\n", + " # compute q-values for all actions in next states\n", + " predicted_next_qvalues = target_network(next_states)\n", + " \n", + " # select q-values for chosen actions\n", + " predicted_qvalues_for_actions = predicted_qvalues[range(\n", + " len(actions)), actions]\n", + "\n", + " # compute V*(next_states) using predicted next q-values\n", + " next_state_values = \n", + "\n", + " assert next_state_values.dim(\n", + " ) == 1 and next_state_values.shape[0] == states.shape[0], \"must predict one value per state\"\n", + "\n", + " # compute \"target q-values\" for loss - it's what's inside square parentheses in the above formula.\n", + " # at the last state use the simplified formula: Q(s,a) = r(s,a) since s' doesn't exist\n", + " # you can multiply next state values by is_not_done to achieve this.\n", + " 
target_qvalues_for_actions = \n", + "\n", + " # mean squared error loss to minimize\n", + " loss = torch.mean((predicted_qvalues_for_actions -\n", + " target_qvalues_for_actions.detach()) ** 2)\n", + "\n", + " if check_shapes:\n", + " assert predicted_next_qvalues.data.dim(\n", + " ) == 2, \"make sure you predicted q-values for all actions in next state\"\n", + " assert next_state_values.data.dim(\n", + " ) == 1, \"make sure you computed V(s') as maximum over just the actions axis and not all axes\"\n", + " assert target_qvalues_for_actions.data.dim(\n", + " ) == 1, \"there's something wrong with target q-values, they must be a vector\"\n", + "\n", + " return loss" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Sanity checks" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "obs_batch, act_batch, reward_batch, next_obs_batch, is_done_batch = exp_replay.sample(\n", + " 10)\n", + "\n", + "loss = compute_td_loss(obs_batch, act_batch, reward_batch, next_obs_batch, is_done_batch,\n", + " agent, target_network,\n", + " gamma=0.99, check_shapes=True)\n", + "loss.backward()\n", + "\n", + "assert loss.requires_grad and tuple(loss.data.size()) == (\n", + " ), \"you must return scalar loss - mean over batch\"\n", + "assert np.any(next(agent.parameters()).grad.data.cpu().numpy() !=\n", + " 0), \"loss must be differentiable w.r.t. network weights\"\n", + "assert np.all(next(target_network.parameters()).grad is None), \"target network should not have grads\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Main loop (3 pts)\n", + "\n", + "**If deadline is tonight and it has not converged:** It is ok. Send the notebook today and when it converges send it again.\n", + "If the code is exactly the same points will not be discounted.\n", + "\n", + "It's time to put everything together and see if it learns anything." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from tqdm import trange\n", + "from IPython.display import clear_output\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "seed = \n", + "random.seed(seed)\n", + "np.random.seed(seed)\n", + "torch.manual_seed(seed)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "env = make_env(seed)\n", + "state_shape = env.observation_space.shape\n", + "n_actions = env.action_space.n\n", + "state = env.reset()\n", + "\n", + "agent = DQNAgent(state_shape, n_actions, epsilon=1).to(device)\n", + "target_network = DQNAgent(state_shape, n_actions).to(device)\n", + "target_network.load_state_dict(agent.state_dict())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Buffer of size $10^4$ fits into 5 Gb RAM.\n", + "\n", + "Larger sizes ($10^5$ and $10^6$ are common) can be used. It can improve the learning, but $10^4$ is quite enough. $10^2$ will probably fail learning." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "exp_replay = ReplayBuffer(10**4)\n", + "for i in range(100):\n", + " if not utils.is_enough_ram(min_available_gb=0.1):\n", + " print(\"\"\"\n", + " Less than 100 Mb RAM available. 
\n", + " Make sure the buffer size in not too huge.\n", + " Also check, maybe other processes consume RAM heavily.\n", + " \"\"\"\n", + " )\n", + " break\n", + " play_and_record(state, agent, env, exp_replay, n_steps=10**2)\n", + " if len(exp_replay) == 10**4:\n", + " break\n", + "print(len(exp_replay))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "timesteps_per_epoch = 1\n", + "batch_size = 16\n", + "total_steps = 3 * 10**6\n", + "decay_steps = 10**6\n", + "\n", + "opt = torch.optim.Adam(agent.parameters(), lr=1e-4)\n", + "\n", + "init_epsilon = 1\n", + "final_epsilon = 0.1\n", + "\n", + "loss_freq = 50\n", + "refresh_target_network_freq = 5000\n", + "eval_freq = 5000\n", + "\n", + "max_grad_norm = 50\n", + "\n", + "n_lives = 5" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "mean_rw_history = []\n", + "td_loss_history = []\n", + "grad_norm_history = []\n", + "initial_state_v_history = []" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "state = env.reset()\n", + "for step in trange(total_steps + 1):\n", + " if not utils.is_enough_ram():\n", + " print('less that 100 Mb RAM available, freezing')\n", + " print('make sure everythin is ok and make KeyboardInterrupt to continue')\n", + " try:\n", + " while True:\n", + " pass\n", + " except KeyboardInterrupt:\n", + " pass\n", + "\n", + " agent.epsilon = utils.linear_decay(init_epsilon, final_epsilon, step, decay_steps)\n", + "\n", + " # play\n", + " _, state = play_and_record(state, agent, env, exp_replay, timesteps_per_epoch)\n", + "\n", + " # train\n", + " < sample batch_size of data from experience replay >\n", + "\n", + " loss = < compute TD loss >\n", + "\n", + " loss.backward()\n", + " grad_norm = nn.utils.clip_grad_norm_(agent.parameters(), max_grad_norm)\n", + " opt.step()\n", + " 
opt.zero_grad()\n", + "\n", + " if step % loss_freq == 0:\n", + " td_loss_history.append(loss.data.cpu().item())\n", + " grad_norm_history.append(grad_norm)\n", + "\n", + " if step % refresh_target_network_freq == 0:\n", + " # Load agent weights into target_network\n", + " \n", + "\n", + " if step % eval_freq == 0:\n", + " mean_rw_history.append(evaluate(\n", + " make_env(clip_rewards=True, seed=step), agent, n_games=3 * n_lives, greedy=True)\n", + " )\n", + " initial_state_q_values = agent.get_qvalues(\n", + " [make_env(seed=step).reset()]\n", + " )\n", + " initial_state_v_history.append(np.max(initial_state_q_values))\n", + "\n", + " clear_output(True)\n", + " print(\"buffer size = %i, epsilon = %.5f\" %\n", + " (len(exp_replay), agent.epsilon))\n", + "\n", + " plt.figure(figsize=[16, 9])\n", + "\n", + " plt.subplot(2, 2, 1)\n", + " plt.title(\"Mean reward per life\")\n", + " plt.plot(mean_rw_history)\n", + " plt.grid()\n", + "\n", + " assert not np.isnan(td_loss_history[-1])\n", + " plt.subplot(2, 2, 2)\n", + " plt.title(\"TD loss history (smoothened)\")\n", + " plt.plot(utils.smoothen(td_loss_history))\n", + " plt.grid()\n", + "\n", + " plt.subplot(2, 2, 3)\n", + " plt.title(\"Initial state V\")\n", + " plt.plot(initial_state_v_history)\n", + " plt.grid()\n", + "\n", + " plt.subplot(2, 2, 4)\n", + " plt.title(\"Grad norm history (smoothened)\")\n", + " plt.plot(utils.smoothen(grad_norm_history))\n", + " plt.grid()\n", + "\n", + " plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Agent is evaluated for 1 life, not for a whole episode of 5 lives. Rewards in evaluation are also truncated. Cuz this is what environment the agent is learning in and in this way mean rewards per life can be compared with initial state value\n", + "\n", + "The goal is to get 10 points in the real env. So 2 or better 3 points in the preprocessed one will probably be enough. You can interrupt learning then." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Final scoring is done on a whole episode with all 5 lives." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "final_score = evaluate(\n", + " make_env(clip_rewards=False, seed=9),\n", + " agent, n_games=30, greedy=True, t_max=10 * 1000\n", + ") * n_lives\n", + "print('final score:', final_score)\n", + "assert final_score > 10, 'not as cool as DQN can'\n", + "print('Cool!')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## How to interpret plots:\n", + "\n", + "This ain't no supervised learning so don't expect anything to improve monotonically. \n", + "* **TD loss** is the MSE between agent's current Q-values and target Q-values. It may slowly increase or decrease, it's ok. The \"not ok\" behavior includes going NaN or staying at exactly zero before agent has perfect performance.\n", + "* **grad norm** just shows the intensity of training. Not ok is growing to values of about 100 (or maybe even 50) though it depends on network architecture.\n", + "* **mean reward** is the expected sum of r(s,a) agent gets over the full game session. It will oscillate, but on average it should get higher over time (after a few thousand iterations...). \n", + " * In basic q-learning implementation it takes about 40k steps to \"warm up\" agent before it starts to get better.\n", + "* **Initial state V** is the expected discounted reward for episode in the opinion of the agent. It should behave more smoothly than **mean reward**. It should get higher over time but sometimes can experience drawdowns because of the agent's overestimates.\n", + "* **buffer size** - this one is simple. It should go up and cap at max size.\n", + "* **epsilon** - agent's willingness to explore. If you see that agent's already at 0.01 epsilon before its average reward is above 0 - it means you need to increase epsilon. 
Set it back to some 0.2 - 0.5 and decrease the pace at which it goes down.\n", + "* Smoothing of plots is done with a Gaussian kernel\n", + "\n", + "At first your agent will lose quickly. Then it will learn to suck less and at least hit the ball a few times before it loses. Finally it will learn to actually score points.\n", + "\n", + "**Training will take time.** A lot of it actually. Probably you will not see any improvement during the first **150k** time steps (note that by default in this notebook agent is evaluated every 5000 time steps).\n", + "\n", + "But hey, long training time isn't _that_ bad:\n", + "![img](https://github.com/yandexdataschool/Practical_RL/raw/master/yet_another_week/_resource/training.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## About hyperparameters:\n", + "\n", + "The task has something in common with supervised learning: loss is optimized through the buffer (instead of Train dataset). But the distribution of states and actions in the buffer **is not stationary** and depends on the policy that generated it. It can even happen that the mean TD error across the buffer is very low but the performance is extremely poor (imagine the agent collecting data to the buffer always manages to avoid the ball).\n", + "\n", + "* Total timesteps and training time: It seems to be so huge, but actually it is normal for RL.\n", + "\n", + "* $\\epsilon$ decay schedule was taken from the original paper and is fairly traditional for epsilon-greedy policies. At the beginning of the training the agent's greedy policy is poor so many random actions should be taken.\n", + "\n", + "* Optimizer: In the original paper RMSProp was used (they did not have Adam in 2013) and it can work no worse than Adam. For us Adam was default and it worked.\n", + "\n", + "* lr: $10^{-3}$ would probably be too huge\n", + "\n", + "* batch size: This one can be very important: if it is too small the agent can fail to learn. 
Huge batch takes more time to process. If a batch of size 8 cannot be processed on the hardware you use, take 2 batches of size 4 (or even 4 batches of size 2), divide the loss on each of them by 2 (or 4) and make the optimization step after all the backward() calls in torch.\n", + "\n", + "* target network update frequency: has something in common with learning rate. Too frequent updates can lead to divergence. Too rare updates can lead to slow learning. For millions of total timesteps thousands of inner steps seem ok. One iteration of target network updating is an iteration of the (this time approximate) $\\gamma$-compression that stands behind Q-learning. The more inner steps it makes the more accurate is the compression.\n", + "* max_grad_norm - just huge enough. In torch clip_grad_norm_ also evaluates the norm before clipping and it can be convenient for logging." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Video" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "# record sessions\n", + "import gym.wrappers\n", + "env_monitor = gym.wrappers.Monitor(make_env(), directory=\"videos\", force=True)\n", + "sessions = [evaluate(env_monitor, agent, n_games=n_lives, greedy=True) for _ in range(10)]\n", + "env_monitor.close()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# show video\n", + "from IPython.display import HTML\n", + "import os\n", + "\n", + "video_names = list(\n", + " filter(lambda s: s.endswith(\".mp4\"), os.listdir(\"./videos/\")))\n", + "\n", + "HTML(\"\"\"\n", + "\n", + "\"\"\".format(\"./videos/\"+video_names[-1])) # this may or may not be _last_ video. Try other indices" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Let's have a closer look at this.\n", + "\n", + "If average episode score is below 200 using all 5 lives, then probably DQN has not converged fully. 
But anyway let's make a more complete record of an episode." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "eval_env = make_env(clip_rewards=False)\n", + "record = utils.play_and_log_episode(eval_env, agent)\n", + "print('total reward for life:', np.sum(record['rewards']))\n", + "for key in record:\n", + " print(key)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fig = plt.figure(figsize=(5, 5))\n", + "ax = fig.add_subplot(1, 1, 1)\n", + "\n", + "ax.scatter(record['v_mc'], record['v_agent'])\n", + "ax.plot(sorted(record['v_mc']), sorted(record['v_mc']),\n", + " 'black', linestyle='--', label='x=y')\n", + "\n", + "ax.grid()\n", + "ax.legend()\n", + "ax.set_title('State Value Estimates')\n", + "ax.set_xlabel('Monte-Carlo')\n", + "ax.set_ylabel('Agent')\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "$\\hat V_{Monte-Carlo}(s_t) = \\sum_{\\tau=t}^{episode~end} \\gamma^{\\tau-t} r_\\tau$" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Is there a big bias? It's ok, anyway it works." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Bonus I (2 pts)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**1.** Plot several (say 3) states with high and low spreads of Q estimate by actions i.e.\n", + "$$\\max_a \\hat Q(s,a) - \\min_a \\hat Q(s,a)$$\n", + "Please take those states from different episodes to make sure that the states are really different.\n", + "\n", + "What should high and low spread mean at least in the world of perfect Q-functions?\n", + "\n", + "Comment on the states you like most.\n", + "\n", + "**2.** Plot several (say 3) states with high td-error and several states with high values of\n", + "$$| \\hat V_{Monte-Carlo}(s) - \\hat V_{agent}(s)|,$$ \n", + "$$\\hat V_{agent}(s)=\\max_a \\hat Q(s,a).$$ Please take those states from different episodes to make sure that the states are really different. From what part (i.e. beginning, middle, end) of an episode did these states come?\n", + "\n", + "Comment on the states you like most." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from utils import play_and_log_episode, img_by_obs\n", + "\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Bonus II (1-5 pts). Get High Score!\n", + "\n", + "1 point to you for each 50 points of your agent. Capped at 5 points.\n", + "\n", + "One way is to train for several days and use a big GPU (why not actually).\n", + "\n", + "Another way is to apply modifications (see **Bonus III**)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Bonus III (2+ pts). Apply modifications to DQN.\n", + "\n", + "For inspiration see [Rainbow](https://arxiv.org/abs/1710.02298) - a version of q-learning that combines lots of them.\n", + "\n", + "Points for Bonus II and Bonus III fully stack. So if modified agent gets score 250+ you get 5 pts for Bonus II + points for modifications. 
If the final score is 40 then you get the points for modifications.\n", + "\n", + "\n", + "Some modifications:\n", + "* [Prioritized experience replay](https://arxiv.org/abs/1511.05952) (5 pts for your own implementation, 2 pts for using a ready one)\n", + "* [double q-learning](https://arxiv.org/abs/1509.06461) (2 pts)\n", + "* [dueling q-learning](https://arxiv.org/abs/1511.06581) (2 pts)\n", + "* multi-step heuristics (see [Rainbow](https://arxiv.org/abs/1710.02298)) (2 pts)\n", + "* [distributional RL](https://arxiv.org/abs/1707.06887)(distributional and distributed stand for different things here) (5 pts)\n", + "* Other modifications (2+ pts depending on complexity)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Bonus IV (4+ pts). Distributed RL.\n", + "\n", + "Solve the task in a distributed way. It can strongly speed up learning. See [article](https://arxiv.org/pdf/1602.01783.pdf) or some guides." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**As usual bonus points for all the tasks fully stack.**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.1" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/week4_approx_rl/homework_tf.ipynb b/week04_approx_rl/homework_tf.ipynb similarity index 100% rename from week4_approx_rl/homework_tf.ipynb rename to week04_approx_rl/homework_tf.ipynb diff --git a/week4_approx_rl/replay_buffer.py b/week04_approx_rl/replay_buffer.py similarity index 100% rename from week4_approx_rl/replay_buffer.py rename to 
week04_approx_rl/replay_buffer.py diff --git a/week4_approx_rl/seminar_lasagne.ipynb b/week04_approx_rl/seminar_lasagne.ipynb similarity index 100% rename from week4_approx_rl/seminar_lasagne.ipynb rename to week04_approx_rl/seminar_lasagne.ipynb diff --git a/week04_approx_rl/seminar_pytorch.ipynb b/week04_approx_rl/seminar_pytorch.ipynb new file mode 100644 index 000000000..0223454dc --- /dev/null +++ b/week04_approx_rl/seminar_pytorch.ipynb @@ -0,0 +1,454 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Approximate q-learning\n", + "\n", + "In this notebook you will teach a __pytorch__ neural network to do Q-learning." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# # in google colab uncomment this\n", + "\n", + "# import os\n", + "\n", + "# os.system('apt-get install -y xvfb')\n", + "# os.system('wget https://raw.githubusercontent.com/yandexdataschool/Practical_DL/fall18/xvfb -O ../xvfb')\n", + "# os.system('apt-get install -y python-opengl ffmpeg')\n", + "# os.system('pip install pyglet==1.2.4')\n", + "\n", + "# XVFB will be launched if you run on a server\n", + "import os\n", + "if type(os.environ.get(\"DISPLAY\")) is not str or len(os.environ.get(\"DISPLAY\")) == 0:\n", + " !bash ../xvfb start\n", + " %env DISPLAY = : 1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import gym\n", + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "scrolled": false + }, + "outputs": [], + "source": [ + "env = gym.make(\"CartPole-v0\").env\n", + "env.reset()\n", + "n_actions = env.action_space.n\n", + "state_dim = env.observation_space.shape\n", + "\n", + 
"plt.imshow(env.render(\"rgb_array\"))\n", + "env.close()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Approximate Q-learning: building the network\n", + "\n", + "To train a neural network policy one must have a neural network policy. Let's build it.\n", + "\n", + "\n", + "Since we're working with a pre-extracted features (cart positions, angles and velocities), we don't need a complicated network yet. In fact, let's build something like this for starters:\n", + "\n", + "![img](https://raw.githubusercontent.com/yandexdataschool/Practical_RL/spring19/yet_another_week/_resource/qlearning_scheme.png)\n", + "\n", + "For your first run, please only use linear layers (nn.Linear) and activations. Stuff like batch normalization or dropout may ruin everything if used haphazardly. \n", + "\n", + "Also please avoid using nonlinearities like sigmoid & tanh: agent's observations are not normalized so sigmoids may become saturated from init.\n", + "\n", + "Ideally you should start small with maybe 1-2 hidden layers with < 200 neurons and then increase network size if agent doesn't beat the target score." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import torch\n", + "import torch.nn as nn\n", + "import torch.nn.functional as F" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "network = nn.Sequential()\n", + "\n", + "network.add_module('layer1', < ... 
>)\n", + "\n", + "\n", + "\n", + "# hint: use state_dim[0] as input size" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def get_action(state, epsilon=0):\n", + " \"\"\"\n", + " sample actions with epsilon-greedy policy\n", + " recap: with p = epsilon pick random action, else pick action with highest Q(s,a)\n", + " \"\"\"\n", + " state = torch.tensor(state[None], dtype=torch.float32)\n", + " q_values = network(state).detach().numpy()\n", + "\n", + " # YOUR CODE\n", + "\n", + " return int( < epsilon-greedily selected action > )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "s = env.reset()\n", + "assert tuple(network(torch.tensor([s]*3, dtype=torch.float32)).size()) == (\n", + " 3, n_actions), \"please make sure your model maps state s -> [Q(s,a0), ..., Q(s, a_last)]\"\n", + "assert isinstance(list(network.modules(\n", + "))[-1], nn.Linear), \"please make sure you predict q-values without nonlinearity (ignore if you know what you're doing)\"\n", + "assert isinstance(get_action(\n", + " s), int), \"get_action(s) must return int, not %s. 
try int(action)\" % (type(get_action(s)))\n", + "\n", + "# test epsilon-greedy exploration\n", + "for eps in [0., 0.1, 0.5, 1.0]:\n", + " state_frequencies = np.bincount(\n", + " [get_action(s, epsilon=eps) for i in range(10000)], minlength=n_actions)\n", + " best_action = state_frequencies.argmax()\n", + " assert abs(state_frequencies[best_action] -\n", + " 10000 * (1 - eps + eps / n_actions)) < 200\n", + " for other_action in range(n_actions):\n", + " if other_action != best_action:\n", + " assert abs(state_frequencies[other_action] -\n", + " 10000 * (eps / n_actions)) < 200\n", + " print('e=%.1f tests passed' % eps)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Q-learning via gradient descent\n", + "\n", + "We shall now train our agent's Q-function by minimizing the TD loss:\n", + "$$ L = { 1 \\over N} \\sum_i (Q_{\\theta}(s,a) - [r(s,a) + \\gamma \\cdot \\max_{a'} Q_{-}(s', a')]) ^2 $$\n", + "\n", + "\n", + "Where\n", + "* $s, a, r, s'$ are current state, action, reward and next state respectively\n", + "* $\\gamma$ is a discount factor (the `gamma` argument of `compute_td_loss` below).\n", + "\n", + "The tricky part is with $Q_{-}(s',a')$. From an engineering standpoint, it's the same as $Q_{\\theta}$ - the output of your neural network policy. However, when doing gradient descent, __we won't propagate gradients through it__ to make training more stable (see lectures).\n", + "\n", + "To do so, we shall use the `x.detach()` function which basically says \"consider this thing constant when doing backprop\"." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def to_one_hot(y_tensor, n_dims=None):\n", + " \"\"\" helper: take an integer vector and convert it to 1-hot matrix. 
\"\"\"\n", + " y_tensor = y_tensor.type(torch.LongTensor).view(-1, 1)\n", + " n_dims = n_dims if n_dims is not None else int(torch.max(y_tensor)) + 1\n", + " y_one_hot = torch.zeros(\n", + " y_tensor.size()[0], n_dims).scatter_(1, y_tensor, 1)\n", + " return y_one_hot\n", + "\n", + "\n", + "def where(cond, x_1, x_2):\n", + " \"\"\" helper: like np.where but in pytorch. \"\"\"\n", + " return (cond * x_1) + ((1-cond) * x_2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def compute_td_loss(states, actions, rewards, next_states, is_done, gamma=0.99, check_shapes=False):\n", + " \"\"\" Compute td loss using torch operations only. Use the formula above. \"\"\"\n", + " states = torch.tensor(\n", + " states, dtype=torch.float32) # shape: [batch_size, state_size]\n", + " actions = torch.tensor(actions, dtype=torch.int32) # shape: [batch_size]\n", + " rewards = torch.tensor(rewards, dtype=torch.float32) # shape: [batch_size]\n", + " # shape: [batch_size, state_size]\n", + " next_states = torch.tensor(next_states, dtype=torch.float32)\n", + " is_done = torch.tensor(is_done, dtype=torch.float32) # shape: [batch_size]\n", + "\n", + " # get q-values for all actions in current states\n", + " predicted_qvalues = network(states)\n", + "\n", + " # select q-values for chosen actions\n", + " predicted_qvalues_for_actions = torch.sum(\n", + " predicted_qvalues * to_one_hot(actions, n_actions), dim=1)\n", + "\n", + " # compute q-values for all actions in next states\n", + " predicted_next_qvalues = # YOUR CODE\n", + "\n", + " # compute V*(next_states) using predicted next q-values\n", + " next_state_values = # YOUR CODE\n", + " assert next_state_values.dtype == torch.float32\n", + "\n", + " # compute \"target q-values\" for loss - it's what's inside square parentheses in the above formula.\n", + " target_qvalues_for_actions = # YOUR CODE\n", + "\n", + " # at the last state we shall use 
simplified formula: Q(s,a) = r(s,a) since s' doesn't exist\n", + " target_qvalues_for_actions = where(\n", + " is_done, rewards, target_qvalues_for_actions)\n", + "\n", + " # mean squared error loss to minimize\n", + " loss = torch.mean((predicted_qvalues_for_actions -\n", + " target_qvalues_for_actions.detach()) ** 2)\n", + "\n", + " if check_shapes:\n", + " assert predicted_next_qvalues.data.dim(\n", + " ) == 2, \"make sure you predicted q-values for all actions in next state\"\n", + " assert next_state_values.data.dim(\n", + " ) == 1, \"make sure you computed V(s') as maximum over just the actions axis and not all axes\"\n", + " assert target_qvalues_for_actions.data.dim(\n", + " ) == 1, \"there's something wrong with target q-values, they must be a vector\"\n", + "\n", + " return loss" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# sanity checks\n", + "s = env.reset()\n", + "a = env.action_space.sample()\n", + "next_s, r, done, _ = env.step(a)\n", + "loss = compute_td_loss([s], [a], [r], [next_s], [done], check_shapes=True)\n", + "loss.backward()\n", + "\n", + "assert len(loss.size()) == 0, \"you must return scalar loss - mean over batch\"\n", + "assert np.any(next(network.parameters()).grad.detach().numpy() !=\n", + " 0), \"loss must be differentiable w.r.t. 
network weights\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Playing the game" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "opt = torch.optim.Adam(network.parameters(), lr=1e-4)\n", + "epsilon = 0.5" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def generate_session(t_max=1000, epsilon=0, train=False):\n", + " \"\"\"play env with approximate q-learning agent and train it at the same time\"\"\"\n", + " total_reward = 0\n", + " s = env.reset()\n", + "\n", + " for t in range(t_max):\n", + " a = get_action(s, epsilon=epsilon)\n", + " next_s, r, done, _ = env.step(a)\n", + "\n", + " if train:\n", + " opt.zero_grad()\n", + " compute_td_loss([s], [a], [r], [next_s], [done]).backward()\n", + " opt.step()\n", + "\n", + " total_reward += r\n", + " s = next_s\n", + " if done:\n", + " break\n", + "\n", + " return total_reward" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "scrolled": true + }, + "outputs": [], + "source": [ + "for i in range(1000):\n", + " session_rewards = [generate_session(\n", + " epsilon=epsilon, train=True) for _ in range(100)]\n", + " print(\"epoch #{}\\tmean reward = {:.3f}\\tepsilon = {:.3f}\".format(\n", + " i, np.mean(session_rewards), epsilon))\n", + "\n", + " epsilon *= 0.99\n", + " assert epsilon >= 1e-4, \"Make sure epsilon is always nonzero during training\"\n", + "\n", + " if np.mean(session_rewards) > 300:\n", + " print(\"You Win!\")\n", + " break" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### How to interpret results\n", + "\n", + "\n", + "Welcome to the f.. world of deep f...n reinforcement learning. Don't expect the agent's reward to go up smoothly. Hope for it to increase eventually. 
If it deems you worthy.\n", + "\n", + "Seriously though,\n", + "* __mean reward__ is the average reward per game. For a correct implementation it may stay low for some 10 epochs, then start growing while oscillating insanely, and converge by ~50-100 steps depending on the network architecture. \n", + "* If it never reaches target score by the end of the for loop, try increasing the number of hidden neurons or look at the epsilon.\n", + "* __epsilon__ - agent's willingness to explore. If you see that the agent is already at < 0.01 epsilon before the mean reward is at least 200, just reset epsilon back to 0.1 - 0.5." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Record videos\n", + "\n", + "As usual, we now use `gym.wrappers.Monitor` to record a video of our agent playing the game. Unlike our previous attempts with state binarization, this time we expect our agent to act ~~(or fail)~~ more smoothly since there's no more binarization error at play.\n", + "\n", + "As you already did with tabular q-learning, we set epsilon=0 for final evaluation to prevent agent from exploring himself to death." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "scrolled": true + }, + "outputs": [], + "source": [ + "# record sessions\n", + "import gym.wrappers\n", + "env = gym.wrappers.Monitor(gym.make(\"CartPole-v0\"),\n", + " directory=\"videos\", force=True)\n", + "sessions = [generate_session(epsilon=0, train=False) for _ in range(100)]\n", + "env.close()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# show video\n", + "from IPython.display import HTML\n", + "import os\n", + "\n", + "video_names = list(\n", + " filter(lambda s: s.endswith(\".mp4\"), os.listdir(\"./videos/\")))\n", + "\n", + "HTML(\"\"\"\n", + "\n", + "\"\"\".format(\"./videos/\"+video_names[-1])) # this may or may not be _last_ video. 
Try other indices" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/week4_approx_rl/seminar_tf.ipynb b/week04_approx_rl/seminar_tf.ipynb similarity index 96% rename from week4_approx_rl/seminar_tf.ipynb rename to week04_approx_rl/seminar_tf.ipynb index c7047146a..06e2fdd2c 100644 --- a/week4_approx_rl/seminar_tf.ipynb +++ b/week04_approx_rl/seminar_tf.ipynb @@ -74,7 +74,7 @@ "\n", "Since we're working with a pre-extracted features (cart positions, angles and velocities), we don't need a complicated network yet. In fact, let's build something like this for starters:\n", "\n", - "![img](https://s14.postimg.org/uzay2q5rl/qlearning_scheme.png)\n", + "![img](https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/yet_another_week/_resource/qlearning_scheme.png)\n", "\n", "For your first run, please only use linear layers (L.Dense) and activations. Stuff like batch normalization or dropout may ruin everything if used haphazardly. 
\n", "\n", @@ -186,11 +186,11 @@ "outputs": [], "source": [ "# Create placeholders for the tuple and a special indicator for game end (is_done = True)\n", - "states_ph = tf.placeholder('float32', shape=(None,) + state_dim)\n", - "actions_ph = tf.placeholder('int32', shape=[None])\n", - "rewards_ph = tf.placeholder('float32', shape=[None])\n", - "next_states_ph = tf.placeholder('float32', shape=(None,) + state_dim)\n", - "is_done_ph = tf.placeholder('bool', shape=[None])" + "states_ph = keras.backend.placeholder(dtype='float32', shape=(None,) + state_dim)\n", + "actions_ph = keras.backend.placeholder(dtype='int32', shape=[None])\n", + "rewards_ph = keras.backend.placeholder(dtype='float32', shape=[None])\n", + "next_states_ph = keras.backend.placeholder(dtype='float32', shape=(None,) + state_dim)\n", + "is_done_ph = keras.backend.placeholder(dtype='bool', shape=[None])" ] }, { diff --git a/week04_approx_rl/utils.py b/week04_approx_rl/utils.py new file mode 100644 index 000000000..cea654848 --- /dev/null +++ b/week04_approx_rl/utils.py @@ -0,0 +1,87 @@ +import numpy as np +import psutil +from scipy.signal import convolve, gaussian +import torch +from torch import nn +import os + +def get_cum_discounted_rewards(rewards, gamma): + """ + evaluates cumulative discounted rewards: + r_t + gamma * r_{t+1} + gamma^2 * r_{t_2} + ... 
+ """ + cum_rewards = [] + cum_rewards.append(rewards[-1]) + for r in reversed(rewards[:-1]): + cum_rewards.insert(0, r + gamma * cum_rewards[0]) + return cum_rewards + + +def play_and_log_episode(env, agent, gamma=0.99, t_max=10000): + """ + always greedy + """ + states = [] + v_mc = [] + v_agent = [] + q_spreads = [] + td_errors = [] + rewards = [] + + s = env.reset() + for step in range(t_max): + states.append(s) + qvalues = agent.get_qvalues([s]) + max_q_value, min_q_value = np.max(qvalues), np.min(qvalues) + v_agent.append(max_q_value) + q_spreads.append(max_q_value - min_q_value) + if step > 0: + td_errors.append(np.abs(rewards[-1] + gamma * v_agent[-1] - v_agent[-2])) + + action = qvalues.argmax(axis=-1)[0] + + s, r, done, _ = env.step(action) + rewards.append(r) + if done: + break + td_errors.append(np.abs(rewards[-1] + gamma * v_agent[-1] - v_agent[-2])) + + v_mc = get_cum_discounted_rewards(rewards, gamma) + + return_pack = { + 'states': np.array(states), + 'v_mc': np.array(v_mc), + 'v_agent': np.array(v_agent), + 'q_spreads': np.array(q_spreads), + 'td_errors': np.array(td_errors), + 'rewards': np.array(rewards), + 'episode_finished': np.array(done) + } + + return return_pack + + +def img_by_obs(obs, state_dim): + """ + Unwraps obs by channels. 
+ observation is of shape [c, h=w, w=h] + """ + return obs.reshape([-1, state_dim[2]]) + + +def is_enough_ram(min_available_gb = 0.1): + mem = psutil.virtual_memory() + return mem.available >= min_available_gb * (1024 ** 3) + + +def linear_decay(init_val, final_val, cur_step, total_steps): + if cur_step >= total_steps: + return final_val + return (init_val * (total_steps - cur_step) + final_val * cur_step) / total_steps + + +def smoothen(values): + kernel = gaussian(100, std=100) + # kernel = np.concatenate([np.arange(100), np.arange(99, -1, -1)]) + kernel = kernel / np.sum(kernel) + return convolve(values, kernel, 'valid') diff --git a/week5_explore/README.md b/week05_explore/README.md similarity index 85% rename from week5_explore/README.md rename to week05_explore/README.md index 8cc723409..f4fe9b595 100644 --- a/week5_explore/README.md +++ b/week05_explore/README.md @@ -1,3 +1,5 @@ +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/yandexdataschool/Practical_RL/blob/spring19/week05_explore/week5.ipynb) + ### Slides - [here](https://yadi.sk/i/H0zVBROe3TWWHz) ## Exploration and exploitation @@ -6,7 +8,8 @@ * Alternative lecture by N. 
de Freitas (with bayesian opt) - [video](https://www.youtube.com/watch?v=vz3D36VXefI) * Our lectures (russian) - "mathematical" lecture (by Alexander Vorobev) '17 - [slides](https://yadi.sk/i/JAeItALT3JmvCL), [video](https://yadi.sk/i/bVHmu9gt3Hi9Ym) - - "engineering" lecture '18 - [video](https://yadi.sk/i/_myWJ13O3TdzXo) + - "practical" lecture '18 - [video](https://yadi.sk/i/_myWJ13O3TdzXo) + - Seminar - [video](https://yadi.sk/i/du7FLXs13TdzZS) diff --git a/week5_explore/action_rewards.npy b/week05_explore/action_rewards.npy similarity index 99% rename from week5_explore/action_rewards.npy rename to week05_explore/action_rewards.npy index 848a26938..231bcb18b 100644 Binary files a/week5_explore/action_rewards.npy and b/week05_explore/action_rewards.npy differ diff --git a/week5_explore/all_states.npy b/week05_explore/all_states.npy similarity index 99% rename from week5_explore/all_states.npy rename to week05_explore/all_states.npy index 16539f012..43940d9ba 100644 Binary files a/week5_explore/all_states.npy and b/week05_explore/all_states.npy differ diff --git a/week5_explore/bayes.py b/week05_explore/bayes.py similarity index 100% rename from week5_explore/bayes.py rename to week05_explore/bayes.py diff --git a/week5_explore/bnn.png b/week05_explore/bnn.png similarity index 100% rename from week5_explore/bnn.png rename to week05_explore/bnn.png diff --git a/week5_explore/river_swim.png b/week05_explore/river_swim.png similarity index 100% rename from week5_explore/river_swim.png rename to week05_explore/river_swim.png diff --git a/week05_explore/week5.ipynb b/week05_explore/week5.ipynb new file mode 100644 index 000000000..bd2e1149a --- /dev/null +++ b/week05_explore/week5.ipynb @@ -0,0 +1,1483 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Run this if in COLAB\n", + "!pip install --upgrade https://github.com/Theano/Theano/archive/master.zip\n", + "!pip install --upgrade 
https://github.com/Lasagne/Lasagne/archive/master.zip\n", + " \n", + "!wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/week5_explore/bayes.py\n", + "!wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/week5_explore/action_rewards.npy\n", + "!wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/week5_explore/all_states.npy" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from abc import ABCMeta, abstractmethod, abstractproperty\n", + "import enum\n", + "\n", + "import numpy as np\n", + "np.set_printoptions(precision=3)\n", + "np.set_printoptions(suppress=True)\n", + "\n", + "import pandas\n", + "\n", + "from matplotlib import pyplot as plt\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Contents\n", + "* [1. Bernoulli Bandit](#Part-1.-Bernoulli-Bandit)\n", + " * [Bonus 1.1. Gittins index (5 points)](#Bonus-1.1.-Gittins-index-%285-points%29.)\n", + " * [HW 1.1. Nonstationary Bernoulli bandit](#HW-1.1.-Nonstationary-Bernoulli-bandit)\n", + "* [2. Contextual bandit](#Part-2.-Contextual-bandit)\n", + " * [2.1 Bulding a BNN agent](#2.1-Bulding-a-BNN-agent)\n", + " * [2.2 Training the agent](#2.2-Training-the-agent)\n", + " * [HW 2.1 Better exploration](#HW-2.1-Better-exploration)\n", + "* [3. Exploration in MDP](#Part-3.-Exploration-in-MDP)\n", + " * [Bonus 3.1 Posterior sampling RL (3 points)](#Bonus-3.1-Posterior-sampling-RL-%283-points%29)\n", + " * [Bonus 3.2 Bootstrapped DQN (10 points)](#Bonus-3.2-Bootstrapped-DQN-%2810-points%29)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Part 1. Bernoulli Bandit\n", + "\n", + "We are going to implement several exploration strategies for simplest problem - bernoulli bandit.\n", + "\n", + "The bandit has $K$ actions. 
Each action $k$ produces a reward $r$ of 1.0 with probability $0 \\le \\theta_k \\le 1$, which is unknown to the agent but fixed over time. The agent's objective is to minimize regret over a fixed number $T$ of action selections:\n", + "\n", + "$$\\rho = T\\theta^* - \\sum_{t=1}^T r_t$$\n", + "\n", + "Where $\\theta^* = \\max_k\\{\\theta_k\\}$\n", + "\n", + "**Real-world analogy:**\n", + "\n", + "Clinical trials - we have $K$ pills and $T$ ill patients. After taking a pill, a patient is cured with probability $\\theta_k$. The task is to find the most efficient pill.\n", + "\n", + "A research paper on clinical trials - https://arxiv.org/pdf/1507.08025.pdf" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "class BernoulliBandit:\n", + " def __init__(self, n_actions=5):\n", + " self._probs = np.random.random(n_actions)\n", + "\n", + " @property\n", + " def action_count(self):\n", + " return len(self._probs)\n", + "\n", + " def pull(self, action):\n", + " if np.any(np.random.random() > self._probs[action]):\n", + " return 0.0\n", + " return 1.0\n", + "\n", + " def optimal_reward(self):\n", + " \"\"\" Used for regret calculation\n", + " \"\"\"\n", + " return np.max(self._probs)\n", + "\n", + " def step(self):\n", + " \"\"\" Used in nonstationary version\n", + " \"\"\"\n", + " pass\n", + "\n", + " def reset(self):\n", + " \"\"\" Used in nonstationary version\n", + " \"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "class AbstractAgent(metaclass=ABCMeta):\n", + " def init_actions(self, n_actions):\n", + " self._successes = np.zeros(n_actions)\n", + " self._failures = np.zeros(n_actions)\n", + " self._total_pulls = 0\n", + "\n", + " @abstractmethod\n", + " def get_action(self):\n", + " \"\"\"\n", + " Get current best action\n", + " :rtype: int\n", + " \"\"\"\n", + " pass\n", + "\n", + " def update(self, action, reward):\n", + " \"\"\"\n", 
+ " Observe reward from action and update agent's internal parameters\n", + " :type action: int\n", + " :type reward: int\n", + " \"\"\"\n", + " self._total_pulls += 1\n", + " if reward == 1:\n", + " self._successes[action] += 1\n", + " else:\n", + " self._failures[action] += 1\n", + "\n", + " @property\n", + " def name(self):\n", + " return self.__class__.__name__\n", + "\n", + "\n", + "class RandomAgent(AbstractAgent):\n", + " def get_action(self):\n", + " return np.random.randint(0, len(self._successes))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Epsilon-greedy agent\n", + "\n", + "**for** $t = 1,2,...$ **do**\n", + "\n", + "   **for** $k = 1,...,K$ **do**\n", + "\n", + "       $\\hat\\theta_k \\leftarrow \\alpha_k / (\\alpha_k + \\beta_k)$\n", + "\n", + "   **end for** \n", + "\n", + "   $x_t \\leftarrow argmax_{k}\\hat\\theta$ with probability $1 - \\epsilon$ or random action with probability $\\epsilon$\n", + "\n", + "   Apply $x_t$ and observe $r_t$\n", + "\n", + "   $(\\alpha_{x_t}, \\beta_{x_t}) \\leftarrow (\\alpha_{x_t}, \\beta_{x_t}) + (r_t, 1-r_t)$\n", + "\n", + "**end for**\n", + "\n", + "Implement the algorithm above in the cell below:" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "class EpsilonGreedyAgent(AbstractAgent):\n", + " def __init__(self, epsilon=0.01):\n", + " self._epsilon = epsilon\n", + "\n", + " def get_action(self):\n", + " # YOUR CODE HERE\n", + "\n", + " @property\n", + " def name(self):\n", + " return self.__class__.__name__ + \"(epsilon={})\".format(self._epsilon)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### UCB Agent\n", + "Epsilon-greedy strategy heve no preference for actions. It would be better to select among actions that are uncertain or have potential to be optimal. 
One can come up with idea of index for each action that represents otimality and uncertainty at the same time. One efficient way to do it is to use UCB1 algorithm:\n", + "\n", + "**for** $t = 1,2,...$ **do**\n", + "\n", + "   **for** $k = 1,...,K$ **do**\n", + "\n", + "       $w_k \\leftarrow \\alpha_k / (\\alpha_k + \\beta_k) + \\sqrt{2log\\ t \\ / \\ (\\alpha_k + \\beta_k)}$\n", + "\n", + "   **end for** \n", + "\n", + "   **end for** \n", + " $x_t \\leftarrow argmax_{k}w$\n", + "\n", + "   Apply $x_t$ and observe $r_t$\n", + "\n", + "   $(\\alpha_{x_t}, \\beta_{x_t}) \\leftarrow (\\alpha_{x_t}, \\beta_{x_t}) + (r_t, 1-r_t)$\n", + "\n", + "**end for**\n", + "\n", + "More versions and optimality analysis - https://homes.di.unimi.it/~cesabian/Pubblicazioni/ml-02.pdf" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "class UCBAgent(AbstractAgent):\n", + " def get_action(self):\n", + " # YOUR CODE HERE" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Thompson sampling\n", + "\n", + "UCB1 algorithm does not take into account actual distribution of rewards. 
If we know the distribution - we can do much better by using Thompson sampling:\n", + "\n", + "**for** $t = 1,2,...$ **do**\n", + "\n", + "   **for** $k = 1,...,K$ **do**\n", + "\n", + "       Sample $\\hat\\theta_k \\sim beta(\\alpha_k, \\beta_k)$\n", + "\n", + "   **end for** \n", + "\n", + "   $x_t \\leftarrow argmax_{k}\\hat\\theta$\n", + "\n", + "   Apply $x_t$ and observe $r_t$\n", + "\n", + "   $(\\alpha_{x_t}, \\beta_{x_t}) \\leftarrow (\\alpha_{x_t}, \\beta_{x_t}) + (r_t, 1-r_t)$\n", + "\n", + "**end for**\n", + " \n", + "\n", + "More on Thompson Sampling:\n", + "https://web.stanford.edu/~bvr/pubs/TS_Tutorial.pdf" + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "class ThompsonSamplingAgent(AbstractAgent):\n", + " def get_action(self):\n", + " # YOUR CODE HERE" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def plot_regret(env, agents, n_steps=5000, n_trials=50):\n", + " scores = {\n", + " agent.name: [0.0 for step in range(n_steps)] for agent in agents\n", + " }\n", + "\n", + " for trial in range(n_trials):\n", + " env.reset()\n", + "\n", + " for a in agents:\n", + " a.init_actions(env.action_count)\n", + "\n", + " for i in range(n_steps):\n", + " optimal_reward = env.optimal_reward()\n", + "\n", + " for agent in agents:\n", + " action = agent.get_action()\n", + " reward = env.pull(action)\n", + " agent.update(action, reward)\n", + " scores[agent.name][i] += optimal_reward - reward\n", + "\n", + " env.step() # change bandit's state if it is unstationary\n", + "\n", + " plt.figure(figsize=(17, 8))\n", + " for agent in agents:\n", + " plt.plot(np.cumsum(scores[agent.name]) / n_trials)\n", + "\n", + " plt.legend([agent.name for agent in agents])\n", + "\n", + " plt.ylabel(\"regret\")\n", + " plt.xlabel(\"steps\")\n", + "\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + 
"execution_count": 96, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Uncomment agents\n", + "agents = [\n", + " # EpsilonGreedyAgent(),\n", + " # UCBAgent(),\n", + " # ThompsonSamplingAgent()\n", + "]\n", + "\n", + "plot_regret(BernoulliBandit(), agents, n_steps=10000, n_trials=10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Bonus 1.1. Gittins index (5 points).\n", + "\n", + "Bernoulli bandit problem has an optimal solution - Gittins index algorithm. Implement finite horizon version of the algorithm and demonstrate it's performance with experiments. some articles:\n", + "- Wikipedia article - https://en.wikipedia.org/wiki/Gittins_index\n", + "- Different algorithms for index computation - http://www.ece.mcgill.ca/~amahaj1/projects/bandits/book/2013-bandit-computations.pdf (see \"Bernoulli\" section)\n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# HW 1.1. Nonstationary Bernoulli bandit\n", + "\n", + "What if success probabilities change over time? 
Here is an example of such a bandit:" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "class DriftingBandit(BernoulliBandit):\n", + " def __init__(self, n_actions=5, gamma=0.01):\n", + " \"\"\"\n", + " Idea from https://github.com/iosband/ts_tutorial\n", + " \"\"\"\n", + " super().__init__(n_actions)\n", + "\n", + " self._gamma = gamma\n", + "\n", + " self._successes = None\n", + " self._failures = None\n", + " self._steps = 0\n", + "\n", + " self.reset()\n", + "\n", + " def reset(self):\n", + " self._successes = np.zeros(self.action_count) + 1.0\n", + " self._failures = np.zeros(self.action_count) + 1.0\n", + " self._steps = 0\n", + "\n", + " def step(self):\n", + " action = np.random.randint(self.action_count)\n", + " reward = self.pull(action)\n", + " self._step(action, reward)\n", + "\n", + " def _step(self, action, reward):\n", + " self._successes = self._successes * (1 - self._gamma) + self._gamma\n", + " self._failures = self._failures * (1 - self._gamma) + self._gamma\n", + " self._steps += 1\n", + "\n", + " self._successes[action] += reward\n", + " self._failures[action] += 1.0 - reward\n", + "\n", + " self._probs = np.random.beta(self._successes, self._failures)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And a picture of how its reward probabilities change over time" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "drifting_env = DriftingBandit(n_actions=5)\n", + "\n", + "drifting_probs = []\n", + "for i in range(20000):\n", + " drifting_env.step()\n", + " drifting_probs.append(drifting_env._probs)\n", + "\n", + "plt.figure(figsize=(17, 8))\n", + "plt.plot(pandas.DataFrame(drifting_probs).rolling(window=20).mean())\n", + "\n", + 
"plt.xlabel(\"steps\")\n", + "plt.ylabel(\"Success probability\")\n", + "plt.title(\"Reward probabilities over time\")\n", + "plt.legend([\"Action {}\".format(i) for i in range(drifting_env.action_count)])\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Your task is to invent an agent that will have better regret than stationary agents from above." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# YOUR AGENT HERE SECTION" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "drifting_agents = [\n", + " ThompsonSamplingAgent(),\n", + " EpsilonGreedyAgent(),\n", + " UCBAgent(),\n", + " YourAgent()\n", + "]\n", + "\n", + "plot_regret(DriftingBandit(), drifting_agents, n_steps=20000, n_trials=10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Part 2. Contextual bandit\n", + "\n", + "Now we will solve much more complex problem - reward will depend on bandit's state.\n", + "\n", + "**Real-word analogy:**\n", + "\n", + "> Contextual advertising. We have a lot of banners and a lot of different users. Users can have different features: age, gender, search requests. We want to show banner with highest click probability.\n", + "\n", + "If we want use strategies from above, we need some how store reward distributions conditioned both on actions and bandit's state. \n", + "One way to do this - use bayesian neural networks. 
Instead of giving pointwise estimates of target, they maintain probability distributions\n", + "\n", + "\n", + "Picture from https://arxiv.org/pdf/1505.05424.pdf\n", + "\n", + "\n", + "More material:\n", + " * A post on the matter - [url](http://twiecki.github.io/blog/2016/07/05/bayesian-deep-learning/)\n", + " * Theano+PyMC3 for more serious stuff - [url](http://pymc-devs.github.io/pymc3/notebooks/bayesian_neural_network_advi.html)\n", + " * Same stuff in tensorflow - [url](http://edwardlib.org/tutorials/bayesian-neural-network)\n", + " \n", + "Let's load our dataset:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "State size: 60, actions: 10\n" + ] + } + ], + "source": [ + "all_states = np.load(\"all_states.npy\")\n", + "action_rewards = np.load(\"action_rewards.npy\")\n", + "\n", + "state_size = all_states.shape[1]\n", + "n_actions = action_rewards.shape[1]\n", + "\n", + "print(\"State size: %i, actions: %i\" % (state_size, n_actions))" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import theano\n", + "import theano.tensor as T\n", + "import lasagne\n", + "from lasagne import init\n", + "from lasagne.layers import *\n", + "import bayes\n", + "\n", + "as_bayesian = bayes.bbpwrap(bayes.NormalApproximation(std=0.1))\n", + "BayesDenseLayer = as_bayesian(DenseLayer)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2.1 Bulding a BNN agent\n", + "\n", + "Let's implement epsilon-greedy BNN agent" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "class BNNAgent:\n", + " \"\"\"a bandit with bayesian neural net\"\"\"\n", + "\n", + " def __init__(self, state_size, n_actions):\n", + " input_states = T.matrix(\"states\")\n", + " target_actions = 
T.ivector(\"actions taken\")\n", + " target_rewards = T.vector(\"rewards\")\n", + "\n", + " self.total_samples_seen = theano.shared(\n", + " np.int32(0), \"number of training samples seen so far\")\n", + " batch_size = target_actions.shape[0] # por que?\n", + "\n", + " # Network\n", + " inp = InputLayer((None, state_size), name='input')\n", + " # YOUR NETWORK HERE\n", + " out = \n", + "\n", + " # Prediction\n", + " prediction_all_actions = get_output(out, inputs=input_states)\n", + " self.predict_sample_rewards = theano.function(\n", + " [input_states], prediction_all_actions)\n", + "\n", + " # Training\n", + "\n", + " # select prediction for target action\n", + " prediction_target_actions = prediction_all_actions[T.arange(\n", + " batch_size), target_actions]\n", + "\n", + " # loss = negative log-likelihood (mse) + KL\n", + " negative_llh = T.sum((prediction_target_actions - target_rewards)**2)\n", + "\n", + " kl = bayes.get_var_cost(out) / (self.total_samples_seen+batch_size)\n", + "\n", + " loss = (negative_llh + kl)/batch_size\n", + "\n", + " self.weights = get_all_params(out, trainable=True)\n", + " self.out = out\n", + "\n", + " # gradient descent\n", + " updates = lasagne.updates.adam(loss, self.weights)\n", + " # update counts\n", + " updates[self.total_samples_seen] = self.total_samples_seen + \\\n", + " batch_size.astype('int32')\n", + "\n", + " self.train_step = theano.function([input_states, target_actions, target_rewards],\n", + " [negative_llh, kl],\n", + " updates=updates,\n", + " allow_input_downcast=True)\n", + "\n", + " def sample_prediction(self, states, n_samples=1):\n", + " \"\"\"Samples n_samples predictions for rewards,\n", + "\n", + " :returns: tensor [n_samples, state_i, action_i]\n", + " \"\"\"\n", + " assert states.ndim == 2, \"states must be 2-dimensional\"\n", + "\n", + " return np.stack([self.predict_sample_rewards(states) for _ in range(n_samples)])\n", + "\n", + " epsilon = 0.25\n", + "\n", + " def get_action(self, states):\n", + " 
\"\"\"\n", + " Picks action by \n", + " - with p=1-epsilon, taking argmax of average rewards\n", + " - with p=epsilon, taking random action\n", + " This is exactly e-greedy policy.\n", + " \"\"\"\n", + "\n", + " reward_samples = self.sample_prediction(states, n_samples=100)\n", + " # ^-- samples for rewards, shape = [n_samples,n_states,n_actions]\n", + "\n", + " best_actions = reward_samples.mean(axis=0).argmax(axis=-1)\n", + " # ^-- we take mean over samples to compute expectation, then pick best action with argmax\n", + "\n", + " # YOUR CODE HERE\n", + " chosen_actions = <-- implement epsilon-greedy strategy - ->\n", + "\n", + " return chosen_actions\n", + "\n", + " def train(self, states, actions, rewards, n_iters=10):\n", + " \"\"\"\n", + " trains to predict rewards for chosen actions in given states\n", + " \"\"\"\n", + " loss_sum = kl_sum = 0\n", + " for _ in range(n_iters):\n", + " loss, kl = self.train_step(states, actions, rewards)\n", + " loss_sum += loss\n", + " kl_sum += kl\n", + "\n", + " return loss_sum / n_iters, kl_sum / n_iters\n", + "\n", + " @property\n", + " def name(self):\n", + " return self.__class__.__name__" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2.2 Training the agent" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "N_ITERS = 100" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def get_new_samples(states, action_rewards, batch_size=10):\n", + " \"\"\"samples random minibatch, emulating new users\"\"\"\n", + " batch_ix = np.random.randint(0, len(states), batch_size)\n", + " return states[batch_ix], action_rewards[batch_ix]" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from IPython.display import clear_output\n", + "\n", + "from pandas 
import DataFrame\n", + "moving_average = lambda x, **kw: DataFrame(\n", + " {'x': np.asarray(x)}).x.ewm(**kw).mean().values\n", + "\n", + "def train_contextual_agent(agent, batch_size=10, n_iters=100):\n", + " rewards_history = []\n", + "\n", + " for i in range(n_iters):\n", + " b_states, b_action_rewards = get_new_samples(\n", + " all_states, action_rewards, batch_size)\n", + " b_actions = agent.get_action(b_states)\n", + " b_rewards = b_action_rewards[\n", + " np.arange(batch_size), b_actions\n", + " ]\n", + "\n", + " mse, kl = agent.train(b_states, b_actions, b_rewards, n_iters=100)\n", + "\n", + " rewards_history.append(b_rewards.mean())\n", + "\n", + " if i % 10 == 0:\n", + " clear_output(True)\n", + " print(\"iteration #%i\\tmean reward=%.3f\\tmse=%.3f\\tkl=%.3f\" %\n", + " (i, np.mean(rewards_history[-10:]), mse, kl))\n", + " plt.plot(rewards_history)\n", + " plt.plot(moving_average(np.array(rewards_history), alpha=0.1))\n", + " plt.title(\"Reward per epesode\")\n", + " plt.xlabel(\"Episode\")\n", + " plt.ylabel(\"Reward\")\n", + " plt.show()\n", + "\n", + " samples = agent.sample_prediction(\n", + " b_states[:1], n_samples=100).T[:, 0, :]\n", + " for i in range(len(samples)):\n", + " plt.hist(samples[i], alpha=0.25, label=str(i))\n", + " plt.legend(loc='best')\n", + " print('Q(s,a) std:', ';'.join(\n", + " list(map('{:.3f}'.format, np.std(samples, axis=1)))))\n", + " print('correct', b_action_rewards[0].argmax())\n", + " plt.title(\"p(Q(s, a))\")\n", + " plt.show()\n", + "\n", + " return moving_average(np.array(rewards_history), alpha=0.1)" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "iteration #90\tmean reward=0.560\tmse=0.457\tkl=0.044\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Q(s,a) 
std: 0.178;0.011;0.000;0.000;0.195;0.000;0.000;0.124;0.023;0.000\n", + "correct 4\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.5/dist-packages/ipykernel_launcher.py:37: FutureWarning: pd.ewm_mean is deprecated for ndarrays and will be removed in a future version\n" + ] + } + ], + "source": [ + "bnn_agent = BNNAgent(state_size=state_size, n_actions=n_actions)\n", + "greedy_agent_rewards = train_contextual_agent(\n", + " bnn_agent, batch_size=10, n_iters=N_ITERS)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## HW 2.1 Better exploration\n", + "\n", + "Use strategies from first part to gain more reward in contextual setting" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "class ThompsonBNNAgent(BNNAgent):\n", + " def get_action(self, states):\n", + " \"\"\"\n", + " picks action based by taking _one_ sample from BNN and taking action with highest sampled reward (yes, that simple)\n", + " This is exactly thompson sampling.\n", + " \"\"\"\n", + "\n", + " # YOUR CODE HERE" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "iteration #90\tmean reward=0.360\tmse=0.590\tkl=0.038\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Q(s,a) std: 0.000;0.028;0.277;0.000;0.044;0.059;0.063;0.093;0.000;0.018\n", + "correct 2\n" + ] + }, + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAXoAAAD8CAYAAAB5Pm/hAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4xLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvAOZPmwAAFo9JREFUeJzt3XuQ1eWd5/H3NzRIRCIIzcVutBs1Cl4GEYKWLiE6GmRSeIEYjBlRIeymMjuw2dTq7FZtCmsqManNGk2y2aDxMnFKkmGthSWGDUEtjRnFBjQhYS1RVJogtC0gV/vis3/00bQE6KbP7/Tp/vX7VdV1frfzPN9zWj/8+umnnxMpJSRJ+fWxchcgSSotg16Scs6gl6ScM+glKecMeknKOYNeknLOoJeknDPoJSnnDHpJyrmKchcAMHz48FRTU1PuMiSpV1m3bt3bKaXKjq7rEUFfU1NDXV1ducuQpF4lIt7ozHUO3UhSzhn0kpRzBr0k5VyPGKOXpHJpbm6mvr6eQ4cOlbuUoxo4cCDV1dX079+/S8836CX1afX19QwePJiamhoiotzl/IWUEo2NjdTX11NbW9ulNjocuomIByJiZ0RsbHfslIhYHRGvFB6HFo5HRNwbEZsj4ncRMbFLVUlSNzl06BDDhg3rkSEPEBEMGzasqJ84OjNG/xAw/bBjdwBrUkpnAWsK+wBXA2cVvhYAP+pyZZLUTXpqyH+g2Po6DPqU0tPAO4cdvgZ4uLD9MHBtu+P/lNo8BwyJiNFFVShJKkpXx+hHppS2F7bfAkYWtquAre2uqy8c244k9QL/+mpjpu1dcsawDq9ZtWoVCxcupLW1lfnz53PHHXd0+JzjUfQvY1NKKSKO+xPGI2IBbcM7nHbaacWW0af964ofHPdzLpn5dyWoRNLxam1t5atf/SqrV6+murqayZMnM3PmTMaPH59ZH12dR7/jgyGZwuPOwvFtwJh211UXjv2FlNKSlNKklNKkysoOl2qQpFxau3YtZ555JmPHjmXAgAHMmTOH5cuXZ9pHV4N+BTC3sD0XWN7u+M2F2TcXA3vaDfFIkg6zbds2xoz58/1xdXU127Yd8f64yzocuomIR4FpwPCIqAe+AdwF/Dwi5gFvADcULn8cmAFsBg4At2ZarSTpuHUY9CmlG49y6oojXJuArxZblI7Pxn4nA3Be654yVyLpeFVVVbF165/nsNTX11NVVZVpH651I0llNHnyZF555RW2bNlCU1MTS5cuZebMmZn24RIIktROZ6ZDZqmiooIf/OAHfPazn6W1tZXbbruNc889N9s+Mm1NknTcZsyYwYwZM0rWvkM3kpRzBr0k5ZxBL0k5Z9BLUs4Z9JKUcwa9JOWc0ytzoOWdto8LaD74Nv1HjezgaknHtOWZbNur/TcdXnLbbbexcuVKRowYwcaNGzu8/nh5Ry9JZXbLLbewatWqkrVv0EtSmU2dOpVTTjmlZO0b9JKUcwa9JOWcQS9JOWfQ50BF6wAqWgcwoOUEKg74LZX0UU6vlKT2OjEdMms33ngjTz31FG+//TbV1dUsXryYefPmZda+QS9JZfboo4+WtH1/zpeknDPoJSnnDHpJyjmDXpJyzqCXpJwz6CUp55xemQP7328CYM/7h+j33j5aI33k/Cc+PqgcZUm90gtvvZBpe5NHTT7m+a1bt3LzzTezY8cOIoIFCxawcOHCTGsw6CWpjCoqKvjud7/LxIkT2bt3LxdddBFXXnkl48ePz6wPh25yoKW5iZbmJpreO1TuUiQdp9GjRzNx4kQABg8ezLhx49i2bVumfRj0ktRDvP7662zYsIEpU6Zk2q5BL0k9wL59+5g1axbf+973+MQnPpFp2wa9JJVZc3Mzs2bN4qabbuL666/PvH2DXpLKKKXEvHnzGDduHF/72tdK0oezbiSpnY6mQ2bt2Wef5ac//Snnn38+EyZMAOCb3/wmM2bMyKyPooI+Iv4DMB9
IwO+BW4HRwFJgGLAO+NuUUlORdUpSLl122WWklDq+sAhdHrqJiCrg74FJKaXzgH7AHODbwN0ppTOBXUB2q+dLko5bsWP0FcDHI6ICOBHYDlwOLCucfxi4tsg+JElF6HLQp5S2Af8NeJO2gN9D21DN7pRSS+GyeqCq2CIlSV1XzNDNUOAaoBY4FRgETD+O5y+IiLqIqGtoaOhqGZKkDhQzdPPXwJaUUkNKqRl4DLgUGFIYygGoBo74t7wppSUppUkppUmVlZVFlCFJOpZigv5N4OKIODEiArgC+CPwJDC7cM1cYHlxJUqSitHl6ZUppecjYhmwHmgBNgBLgF8ASyPiHwvHfpJFoZLUHfY/vzbT9gZN+dQxzx86dIipU6fy3nvv0dLSwuzZs1m8eHGmNRQ1jz6l9A3gG4cdfg049iuTJAFwwgkn8MQTT3DSSSfR3NzMZZddxtVXX83FF1+cWR8ugSBJZRQRnHTSSUDbmjfNzc20jYZnx6CXpDJrbW1lwoQJjBgxgiuvvNJliiUpb/r168eLL75IfX09a9euZePGjZm2b9BLUg8xZMgQPvOZz7Bq1apM2zXoJamMGhoa2L17NwAHDx5k9erVnHPOOZn24TLFktROR9Mhs7Z9+3bmzp1La2sr77//PjfccAOf+9znMu3DoJekMrrgggvYsGFDSftw6EaScs6gl6ScM+glKecMeknKOYNeknLOoJeknHN6pSS1s+3lXZm2V3X20E5d19rayqRJk6iqqmLlypWZ1uAdvST1APfccw/jxo0rSdsGvSSVWX19Pb/4xS+YP39+Sdo36CWpzBYtWsR3vvMdPvax0kSyQS9JZbRy5UpGjBjBRRddVLI+DHpJKqNnn32WFStWUFNTw5w5c3jiiSf40pe+lGkfBr0kldG3vvUt6uvref3111m6dCmXX345jzzySKZ9OL1Sktrp7HTI3sSgl6QeYtq0aUybNi3zdh26kaScM+glKecMeknKOYNeknLOoJeknDPoJSnnnF4pSe1s/cPvMm1vzLkXdHhNTU0NgwcPpl+/flRUVFBXV5dpDQa9JPUATz75JMOHDy9J2w7dSFLOGfSSVGYRwVVXXcVFF13EkiVLMm/foRtJKrPf/OY3VFVVsXPnTq688krOOeccpk6dmln7Rd3RR8SQiFgWEf8vIjZFxCURcUpErI6IVwqP+VshSJIyVFVVBcCIESO47rrrWLt2babtFzt0cw+wKqV0DvBXwCbgDmBNSuksYE1hX5J0BPv372fv3r0fbv/qV7/ivPPOy7SPLg/dRMTJwFTgFoCUUhPQFBHXANMKlz0MPAXcXkyRktRdOjMdMks7duzguuuuA6ClpYUvfvGLTJ8+PdM+ihmjrwUagAcj4q+AdcBCYGRKaXvhmreAkcWVKEn5NXbsWF566aWS9lHM0E0FMBH4UUrpQmA/hw3TpJQSkI705IhYEBF1EVHX0NBQRBmSpGMpJujrgfqU0vOF/WW0Bf+OiBgNUHjceaQnp5SWpJQmpZQmVVZWFlGGJOlYuhz0KaW3gK0RcXbh0BXAH4EVwNzCsbnA8qIqlCQVpdh59P8e+OeIGAC8BtxK2z8eP4+IecAbwA1F9iFJKkJRQZ9SehGYdIRTVxTTriQpOy6BIEk55xIIktTOoVd3Z9rewDOGdHjN7t27mT9/Phs3biQieOCBB7jkkksyq8Ggl6QyW7hwIdOnT2fZsmU0NTVx4MCBTNs36CWpjPbs2cPTTz/NQw89BMCAAQMYMGBApn04Ri9JZbRlyxYqKyu59dZbufDCC5k/fz779+/PtA+DXpLKqKWlhfXr1/OVr3yFDRs2MGjQIO66665M+zDoJamMqqurqa6uZsqUKQDMnj2b9evXZ9qHQS9JZTRq1CjGjBnDyy+/DMCaNWsYP358pn34y1hJaqcz0yGz9v3vf5+bbrqJpqYmxo4dy4MPPphp+wa9JJXZhAkTqKurK1n7Dt1IUs4Z9JKUcwa9JOWcQS9
JOWfQS1LOGfSSlHNOr5SkdrZs2ZJpe7W1tcc8//LLL/OFL3zhw/3XXnuNO++8k0WLFmVWg0EvSWV09tln8+KLLwLQ2tpKVVUV1113XaZ9OHQjST3EmjVrOOOMMzj99NMzbdegl6QeYunSpdx4442Zt2vQS1IP0NTUxIoVK/j85z+fedsGvST1AL/85S+ZOHEiI0eOzLxtg16SeoBHH320JMM24KwbSfqIjqZDlsL+/ftZvXo1P/7xj0vSvkEvSWU2aNAgGhsbS9a+QzeSlHMGvSTlnEEvSTln0EtSzhn0kpRzBr0k5ZzTKyWpnV27nsu0vaFDL+7wmrvvvpv777+fiOD888/nwQcfZODAgZnVUPQdfUT0i4gNEbGysF8bEc9HxOaI+FlEDCi+TEnKp23btnHvvfdSV1fHxo0baW1tZenSpZn2kcXQzUJgU7v9bwN3p5TOBHYB8zLoQ5Jyq6WlhYMHD9LS0sKBAwc49dRTM22/qKCPiGrgb4D7C/sBXA4sK1zyMHBtMX1IUp5VVVXx9a9/ndNOO43Ro0dz8sknc9VVV2XaR7F39N8D/hPwfmF/GLA7pdRS2K8HqorsQ5Jya9euXSxfvpwtW7bwpz/9if379/PII49k2keXgz4iPgfsTCmt6+LzF0REXUTUNTQ0dLUMSerVfv3rX1NbW0tlZSX9+/fn+uuv57e//W2mfRRzR38pMDMiXgeW0jZkcw8wJCI+mM1TDWw70pNTSktSSpNSSpMqKyuLKEOSeq/TTjuN5557jgMHDpBSYs2aNYwbNy7TPro8vTKl9A/APwBExDTg6ymlmyLiX4DZtIX/XGB5BnVKUrfozHTILE2ZMoXZs2czceJEKioquPDCC1mwYEGmfZRiHv3twNKI+EdgA/CTEvQhSbmxePFiFi9eXLL2Mwn6lNJTwFOF7deAT2XRriSpeC6BIEk5Z9BLUs4Z9JKUcwa9JOWcQS9JOecyxZLUzrO79mba3qVDB3d4zT333MN9991HSokvf/nLLFq0KNMavKOXpDLauHEj9913H2vXruWll15i5cqVbN68OdM+DHpJKqNNmzYxZcoUTjzxRCoqKvj0pz/NY489lmkfBr0kldF5553HM888Q2NjIwcOHODxxx9n69atmfbhGL0kldG4ceO4/fbbueqqqxg0aBATJkygX79+mfbhHb0kldm8efNYt24dTz/9NEOHDuWTn/xkpu17Ry9JZbZz505GjBjBm2++yWOPPcZzz2X7AeUGvSS105npkFmbNWsWjY2N9O/fnx/+8IcMGTIk0/YNekkqs2eeeaak7TtGL0k5Z9BLUs4Z9JL6vJRSuUs4pmLrM+gl9WkDBw6ksbGxx4Z9SonGxkYGDhzY5Tb8ZaykPq26upr6+noaGhrKXcpRDRw4kOrq6i4/36CX1Kf179+f2tracpdRUg7dSFLOGfSSlHMGvSTlnEEvSTln0EtSzhn0kpRzBr0k5ZxBL0k5Z9BLUs4Z9JKUcy6BIPVRL7z1QqbtTR41OdP2lB3v6CUp5wx6Scq5Lgd9RIyJiCcj4o8R8YeIWFg4fkpErI6IVwqPQ7MrV5J0vIoZo28B/mNKaX1EDAbWRcRq4BZgTUrproi4A7gDuL34UiVlPa6uvqHLd/Qppe0ppfWF7b3AJqAKuAZ4uHDZw8C1xRYpSeq6TMboI6IGuBB4HhiZUtpeOPUWMPIoz1kQEXURUdeTP9lFknq7ooM+Ik4C/hewKKX0bvtzqe1DGI/4QYwppSUppUkppUmVlZXFliFJOoqi5tFHRH/aQv6fU0qPFQ7viIjRKaXtETEa2FlskVJv5Zi6eoJiZt0E8BNgU0rpv7c7tQKYW9ieCyzvenmSpGIVc0d/KfC3wO8j4sXCsf8M3AX8PCLmAW8ANxRXoiSpGF0O+pTSb4A4yukrutquJClb/mWsJOWcQS9JOWfQS1LOGfSSlHMGvSTlnEEvSTln0EtSzhn0kpRzBr0k5ZxBL0k5Z9BLUs4
Z9JKUcwa9JOWcQS9JOWfQS1LOFfVRgpL0gZ78sYmTR00udwll5R29JOWcQS9JOWfQS1LOGfSSlHMGvSTlnLNu+oB3D+7/i2Nbtmz5cLu2trY7y+nxevLsEakrvKOXpJwz6CUp5wx6Sco5g16Scs6gl6ScM+glKeecXtnHHNy7G4CG11+jsmYs8NGplodz6qXyIMsps71xgTTv6CUp57yjz5n3Duyjubn5w/2PDx7SLf1u/cPvPtwec+4F3dKnpM7xjl6Scq4kd/QRMR24B+gH3J9SuqsU/ejI+jf3p1/zuwCkAwcBaOnX9sgJp/zF9W9v3ffh9vAxJx2z7UOv7ubtrXs/cmz4mMHwVsufD5zbcY3dtcxA47bGY54fVjWsW+pQfmT93253jPlnfkcfEf2AHwJXA+OBGyNifNb9SJI6pxR39J8CNqeUXgOIiKXANcAfS9CXDtOv+V02DRoFA06ktaUfZ+/b0XaieXDb+UOF8fvXt9LwTisA+3a/B8Dgs2oAWH+w7e78vF2HOPR+2yydV9as/nMng0/9cLPhwDv8qfGEj9TQctiCaYde3f2R8wPP6J7fG0hqU4ox+ipga7v9+sIxSVIZlG3WTUQsABYUdvdFxMtdbGo48HY2VfVaff096OuvH3wP+urrP70zF5Ui6LcBY9rtVxeOfURKaQmwpNjOIqIupTSp2HZ6s77+HvT11w++B3399XekFEM3LwBnRURtRAwA5gArStCPJKkTMr+jTym1RMTfAf+XtumVD6SU/pB1P5KkzinJGH1K6XHg8VK0fQRFD//kQF9/D/r66wffg77++o8pUkrlrkGSVEIugSBJOdfrgj4iTomI1RHxSuFx6FGua42IFwtfvf6XwRExPSJejojNEXHHEc6fEBE/K5x/PiJqur/K0urEe3BLRDS0+77PL0edpRIRD0TEzojYeJTzERH3Ft6f30XExO6usZQ68fqnRcSedt///9rdNfZUvS7ogTuANSmls4A1hf0jOZhSmlD4mtl95WWvk8tKzAN2pZTOBO4Gvt29VZbWcSyt8bN23/f7u7XI0nsImH6M81cDZxW+FgA/6oaautNDHPv1AzzT7vt/ZzfU1Cv0xqC/Bni4sP0wcG0Za+kuHy4rkVJqAj5YVqK99u/LMuCKiIhurLHUOvMe5FpK6WngnWNccg3wT6nNc8CQiBjdPdWVXidev46iNwb9yJTS9sL2W8DIo1w3MCLqIuK5iOjt/xh0ZlmJD69JKbUAe4A8Lc3Y2aU1ZhWGLZZFxJgjnM8zlx+BSyLipYj4ZUR0Yh3VvqFHfvBIRPwaGHWEU/+l/U5KKUXE0aYNnZ5S2hYRY4EnIuL3KaVXs65VPcr/AR5NKb0XEf+Wtp9wLi9zTeo+62n7/35fRMwA/jdtw1h9Xo8M+pTSXx/tXETsiIjRKaXthR9Ldx6ljW2Fx9ci4ingQqC3Bn1nlpX44Jr6iKgATgaOvRh779Lhe5BSav967we+0w119SSdWn4kr1JK77bbfjwi/kdEDE8p9cU1cD6iNw7drADmFrbnAssPvyAihkbECYXt4cCl9O5lkjuzrET792U28ETK1x9JdPgeHDYePRPY1I319QQrgJsLs28uBva0G+bMvYgY9cHvpSLiU7TlW55udrqsR97Rd+Au4OcRMQ94A7gBICImAf8upTQfGAf8OCLep+2bfVdKqdcG/dGWlYiIO4G6lNIK4CfATyNiM22/sJpTvoqz18n34O8jYibQQtt7cEvZCi6BiHgUmAYMj4h64BtAf4CU0v+k7a/RZwCbgQPAreWptDQ68fpnA1+JiBbgIDAnZzc7XeZfxkpSzvXGoRtJ0nEw6CUp5wx6Sco5g16Scs6gl6ScM+glKecMeknKOYNeknLu/wNbUSqUIPBwvQAAAABJRU5ErkJggg==\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": 
"display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.5/dist-packages/ipykernel_launcher.py:32: FutureWarning: pd.ewm_mean is deprecated for ndarrays and will be removed in a future version\n" + ] + } + ], + "source": [ + "thompson_agent_rewards = train_contextual_agent(ThompsonBNNAgent(state_size=state_size, n_actions=n_actions),\n", + " batch_size=10, n_iters=N_ITERS)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "class BayesUCBBNNAgent(BNNAgent):\n", + " q = 90\n", + "\n", + " def get_action(self, states):\n", + " \"\"\"\n", + " Compute q-th percentile of rewards P(r|s,a) for all actions\n", + " Take actions that have highest percentiles.\n", + "\n", + " This implements bayesian UCB strategy\n", + " \"\"\"\n", + "\n", + " # YOUR CODE HERE" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "iteration #90\tmean reward=0.630\tmse=0.354\tkl=0.047\n" + ] + }, + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAXcAAAD8CAYAAACMwORRAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4xLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvAOZPmwAAIABJREFUeJzsnXecJGW1/r9v566ePLsLm2Z2gSUsSXQBAUEUkaRgQsGLiopcA+aA13tFxXgxXfGi/lAUFQVRFFCy4SpBJAnCEpZlYWZz6NlJnbv6/f1R/VZXV1d1V8/0xK3n89nPznRXd73T4bxPPeec5wgpJT58+PDhY34hMNML8OHDhw8frYcf3H348OFjHsIP7j58+PAxD+EHdx8+fPiYh/CDuw8fPnzMQ/jB3YcPHz7mIfzg7sOHDx/zEH5w9+HDh495CD+4+/Dhw8c8RGimTrxgwQK5YsWKmTq9Dx8+fMxJPPzww7uklAsbHTdjwX3FihU89NBDM3V6Hz58+JiTEEIMeDnOl2V8+PDhYx7CD+4+fPjwMQ/hB3cfPnz4mIfwg7sPHz58zEP4wd2HDx8+5iE8BXchxKlCiGeEEOuFEJ92uL9PCPEXIcQ/hRD/EkKc3vql+vDhw4cPr2gY3IUQQeAK4DRgNXCuEGK17bD/Aq6XUh4BnAN8r9UL9eHDhw8f3uGFuR8FrJdSbpBS5oHrgLNsx0igo/xzJ7CldUv04cPHnoCBZIq/rts57eeVUnLDw5tI54vTfu6phJfgvhTYaPl9U/k2Kz4PnCeE2ATcCnzQ6YmEEBcKIR4SQjy0c+f0v4k+fPiYvfjh3Rt43zUPM91znTfsSvHxXz/GXU9un9bzTjValVA9F7haSrkMOB34uRCi5rmllFdKKddIKdcsXNiwe9aHDx97EMayRdJ5nZ1juWk/L8Bods9j7puB5Zbfl5Vvs+LdwPUAUsq/AzFgQSsW6MOHjz0D6bwOwMBQeprPawT1zB4oyzwIrBJCrBRCRDASpjfbjhkETgIQQhyEEdx93cWHDx+eoYLsQHKag3vO2FRS5f/nCxoGdyllEbgIuAN4CqMqZq0Q4lIhxJnlwz4OvEcI8RhwLXC+nG7hzIcPH3MaKrgOJlPTe97ypjLfEqqeXCGllLdiJEqtt11i+flJ4LjWLs2HDx97EjIzJMuo8ypZaL7A71D14cPHrEBqhmSZlB/cffjw4WPqoILr4HQnVHPGppLKzS9Zxg/uPnz4mBVI5YqEAoKhVJ6xbGHazpsuGJtKpuAzdx8+fPhoKfSSJFcssd+iNmB6pRmfufvw4cPHFEFVqhy02HAxmU5pxtfcffjw4WOKoALrgXu3A9PL3P1qGR8+fPiYIqjAuldHjJ5EhMGh6at1n6917n5w9+HDx4xD6d1aJEhfjzbNmvse2qHqw4cPH1MNxdy1SIj+3mkO7oWyt0xBp1SaP431fnD34cPHjENJIlo0SH+PxtaRDPliaXrObWHs86kc0g/uPnz4mHEo5p6IhOjrTVCSsGn39LD3lEVrT80j3d0P7j58+JhxWDX3/l4NmD6PmXROp0sLmz/PF/jB3YcPHzMOJYdoEUOWARicBt1dSkm6oLOgLQrMr3JIP7j78OFjxqEqVRLREAvbo8TDwWlJquaKJfSSZEFbBJhf5ZB+cPfhw8eMI50vIgREQwGEEPT1aNNS666Y+sL2GFDpVp0P8BTchRCnCiGeEUKsF0J82uH+bwshHi3/WyeEGG79Un348DFfkc7rJCIhhBAA9E1TOaRi6oq5z6dRew2HdQghgsAVwMnAJuBBIcTN5QEdAEgpP2o5/oPAEVOwVh8+fMxTpPNFtEjQ/L2/R+Nv63ZSKkkCATGF5zWYutLc51MjkxfmfhSwXkq5QUqZB64Dzqpz/LkYo/Z8+PDhwxNSOb06uPdq5Ioldozlpvi8BlNf2K4SqnsQcweWAhstv28CjnY6UAjRD6wE/jz5pfmYKuw
Yy/K5m9by9bMPpy3qadIiAD+97wXCwQBvPbpvClfnoxlsHclw6e+f5BtnH06iznuZyhV57zUPM5yu+KS/ec0y3nbMipas44Hnh/jtI5v46hsONaUVhWvuH6Cgl3jncStdH28w98r6+3sTAAwkU+zdGWvJGr9621Mc2d/Dq1bvZd6mTMMWulTLPL1tlM/89nEKunPn6jH79vKZ0w+qe95/bEjylduerup+ff+J+3LaoYsn9Hd4RasTqucAv5FSOl7bCCEuFEI8JIR4aOfOnS0+tQ+veGRgN7c9sY1128eaetxv/7mZGx7ZNEWr8jERPPD8ELc9sY2nt43WPW79jnHufnYXgYBgYXuUjbvT3PbEtpat4w//2sJ1D25k13i+5r5r7h/g1w/V/9yk8zqJaIW5q4C+c7w1zL2gl/jR3c/X/M0qgdqTiFT9rnD/c0keGRymSwuzsD1a9W8oled3/9zc8Ny3r93Gk1tGqh4bCwcbPm6y8ELbNgPLLb8vK9/mhHOAD7g9kZTySuBKgDVr1swfE4c5BqUrFpps784XS4wX548mOR+gmLiVkTselzHu/+wZB7FmRQ/v+PED5m2tgEp+Dg6lTIkDjDrywaE03Vqk7uNTeZ3OeNj8Xf080qI1bhnOoJckI5nqzUfJMG2xEFokaA7uUFCv0U/OP5JQsJoLf+kPT3LtA4MNzz2YTLPvwjZ+fP6Rk/kTmoYX5v4gsEoIsVIIEcEI4DfbDxJCHAh0A39v7RJ9tBrqA+12qemGfFFnJDN/NMn5ABX8GgVBdb/qxHQKZJOBGq5hr3DZNZ4nndcbri+dK5KwaO4quDfatLxCrcu+jophWdB4TWzeMiOZAu3RUE1gV49JF3SkrP89GhhK01duzJpONAzuUsoicBFwB/AUcL2Ucq0Q4lIhxJmWQ88BrpON/lIfMw516VkoNcnc9RIjmXzDD7OP6YNX5j6SNhhrR1wF91DLujH1kjR9YOzBXdWqj+eKFHT3z1s6r1dp7rFwkGgowGiLmLuyMrC/ThXbg5Dxmtg2vJF0wXzN7NCiIaSEbMH97yqVjCsXZakwnfCUTZNS3grcarvtEtvvn2/dsnxMJdSXeiKyTEGXZArVX0QfM4dmmbtixIlosGWVIVuGM+ZVoH08njXYj2YK9LZFcYK9FBKMq4xWMffBpLHJNGLuds19JFMwr3bsUFcaqXyReMRZQ98+liVfLNFXThBPJ/wO1T0Qip00L8sYm0GrvnA+Jg+lITcK7sPpAvFwkGjICEJxh0A2UaiAHg0FGEhWd5Vag3s9jT+V19Gi1QGyMx5umeau1mFfQzqvEwkFCAcDJKIhs3rGuuZOF+YeLxOcemZj6rz9s1GW8TH/YMoydS6TnaCCe6u+cD4mj2aYu5WBJiIh8sUSxSY/A05QAezofXprmLv1d7c1FvUS+WKJhO1qsCseYThTW30zEah15IslshZd3XrFYDB3myzjkbm7nlcF9xmQZfzgvgdCtVjnmw3uus/cZxsqmnv9IGhnoCqg2ROIE8HAUIpIMMDRK3vYNZ5n3KJbDyRT5rlGXD43aYsjpBUd8XBLEviqYkc9v/Xzm8rp5qZiJJltzD3tzty1cl9BvdzFwFCKYECwpCs+qb9hIvCD+x6IiTD3UkmaMo7P3GcPmmHu1cG9saTgFYPJNMt64qwo68pWq97BoTSHLO2su0a1Bnsep0sLm4ngyWDneI50XndcR6ZQYe6JSMgcuQfGpjCaKdAZdy7jNDfIOsx9IJlmaVecsEO1zVTDD+57INSHsdiE5m5l+fZaYR8zAymlqSE3qlkfsTFQ1TDUislDA8k0/T2aKT1YK2R2jec5rBxU3a4u1OcxMUWau9psnNZhtT2I25h7pqCT10vuzF3JMnU2yJmqlAE/uO+RSE+AuVcHd5+5zwZkCyUzD9KoZHA4k6/SjhVLticQm4WSPPp7E/SpCUqqoan8/6HLFGN23kjU5zFu69rsiodJ5fWmc0N2qPUctry
rvI7Ka2W1PUhEQ1Wbnb03wA4l52QK9Zn7TNS4gx/c90godtKM5m4dVuxr7rMDKvh0l0sG6/UfGInBirxQYZ2TY+5DKUNj7+vR6IiF6dbCZk25YvD7LGijPRpyTY6qNdi9cTq11nSpDgylEQIOXtIBVF/lWG0PtEiQbMEY3gGVz3nXBJn7SLrASKbgM3cf0wfFTgrFJmSZos/cZxtUsOzrTVAsSdfEXragky2UnBOqk2TuKpCrANbXmzAZu2LMfb1aOTnaXEK1VV2qg8kUSzrjpi2CNbGbzutmSaM6vxr5Z+8NsKOSUHXeIAfKm1tfz/TXuIMf3PdIZCYiy1iZux/cZwVUkFI11G7vy6hDkEp4qPTwAnupX3+PZga1gaE0XVqYzni4nBxtLqFa8ZeZXI5Htf+3R0MEA6Jqk0lZbA8qSWYjWKtNpdNFllEykttrODCDZZDgB/c9EiZzn6jm7ssyswIqmKvg4fa+DDsEdxWYJptQHUgakseybs1cy5bhLAW9xEAyZW489ZKjag21HaqGjDTZK8XBpJHUFELQEauWhzIW2wMlz6hgrTYVN+YeDAhi4YBrcFe19b7m7mNaoJek6YXRTIeqL8vMPqj3oc9k7s4M1ykxaDL3SWruA0Mp9u6ImRa2fT0aekmyeXfGSCaWyyO7tLDrlUXaTXNvgSwzniuSTOXNZG+XFjETu1JKUpYmpnjYOL/abCqvm7ujZSIScs1bDCRTLGiL1vXZn0r4wX0PQ8bStNIMc8+Vg3tPonVdgz4mB1OWKQdQt4oZU16YgiYmezWIWstzO8fZMpzxxNzdNPeuFtj+KjuE/rLu3REPm6WQuWKJksS0PbAz9+F0gWBAVLlV2qFFg64VRwPJmSuDBD+473GwMrWJaO6L2qO+LDNLMJIplLsfjcEWbgzXZKCWZpxoKEBATL6JyR7A1M/3PZekJCu/d8YjjLhU9KRzOgFhrMmKjhYwd3tOoCseNjdBFcQrHarVeYiRTIGueLhmspQVWjjkKm0NDqVnxFNGwQ/uexisZlFNlUKWj13YHmU0WzTLxXzMHIYzeTrjYXMQhhvDVUzVytyFEIakMAnNPZUrsms8Z7J1MDb/WDjA3c8ak9bUfZ3xMHm95GiPm8oXSURCNUE0GBC0x0KTY+62ap7OeEUeUnJK3OItA5aEah3TMAUtGnTU3LMFnW2jWVMOmgn4wX0Pg7VsayKauyonG8v67H2mMZIp0hkPo0WChAKibrWMENAeq9Z+60kKXuCUMBRC0NejsW77OGBhzGW930nSy+R1V8vcyXapDiTT9CQitMfC5jpGXJi7+l8RoNFMwbVSRiHh4ou/aXcaKTEtGWYCfnDfw2D9IDbj524P7n4j08xjOG0wdyFEVdCqOS5ToCMWJhCoZsZaJDQp21+3Uj9V1x0LB1hU/rzUG5uXyuuuScd6f5cXDA6lqjafrvJmUSpJk+gozV39r4z16pmGKcQjQceEqrXGf6bgKbgLIU4VQjwjhFgvhPi0yzFvFkI8KYRYK4T4ZWuX6aNVSE1Uc9eNILCo3dB3/YqZmceoRTboiLvXkbvZ1k521J7qQO23NemYDU09mim1dNXRz9O52kEdCl3xSEPHy3p4YVd1TqAjHkZKGMsVK4M6wtWyTMqmuddDIuIsy7j6uJdK8MAPIT00sT+oCTQM7kKIIHAFcBqwGjhXCLHadswq4D+A46SUBwMfmYK1+mgBMhYfj0ITunkNc/eD+4xj2BK0u+rIF24MdLKa+0AyTWc8XCNdVIJ7Jeh31GHuxoi91ssy+WKJrSOZqgBr1s6nCzW2B7FQECGsTUx5d+aeG4cbP8B7N3+GXK528xkcStMWDdGTsJRRju+EX54Nt34C/nnNhP6mZuClAPMoYL2UcgOAEOI64CzgScsx7wGukFLuBpBS7mj1Qn20BoqVdMbDE5Jl1GX2bGXuUsq61Q1z9VxOsNr4dsbD7BzPNTzOingkOClW7OZ4qGQQ631qE3K6ukj
ni6615J1NyjKlkiSZMv6mjbvTlCRVI+6s8lDGVoIZCAi0sMHE9ZJkLFek02ld256AX58PyfUciOScUh9wStUhA8lU1ZULG/4Kv70QMrvhjG/Cmnd7/psmCi+yzFJgo+X3TeXbrNgf2F8Ica8Q4n4hxKmtWqAP77j9iW2s+dJd5IruOqrSGbu08ITq3Cv+HLOv1n3LcIbVl9zBYxuHp/xcQ6k8h37+Tu5bv2vKz+WEUklWyQZGc467LOPI3KOTG7W3cSjNcodSv30XtgGwz0LnoGpHymLeZYdi7l6Hsn/2pic48st/5Mgv/5E3fO8+AFYuqKzDmthNOdgexMt5iLFsASkdulMf+Rn86CTIjcH5f2Bd70m8X/yG0vanqg7buDtjbHJSwt++Dj87C2Id8J4/w5EXwDSQgla1ToWAVcCJwDLgb0KIQ6WUVd8yIcSFwIUAfX19LTq1D4Xnd6XYNZ5nLFsk2ub8ZVEf6M54uGrcWCNYSyFhdjL3JzaPkCnoPL8rxeFle9epwtPbRhnPFVm3fYxj91swpedywli2iJQVuaMz7j5M2l1zD01Kcx/P6XTEap93eY/Gz951FEet7DFvayv7urhWy4RdEqrxMAVdlt0bG4erxzYNc+De7fzbS/sBaI+GOMLyWbBuMvaEKlQGh1d6A8p/n16EO/4DHrgS9jkR3vBDaFvE3w+MsuCef9B50wfggrsgYDzXUCrP4n4dfvNOWPs7OPTN8Nr/gcj0Vc94Ye6bgeWW35eVb7NiE3CzlLIgpXweWIcR7KsgpbxSSrlGSrlm4cKFE12zDxco6SRfR27J5ItmWVx+AqWQbZEQWiQ4K6tlVGleKwZQNDyXy8Dl6YK9Nb4zHmbMof9ASunK3LVIcFIdqrmCTizsHEJO2H+haUkARomkm36eyhfrMnfwRiaklAwk0xy5ooe3vbSft720n9cdsbSqSsia2LUnVKG84eX16q7ezLChlT9wJRxzEZz3W2hbBECgbSGfL5xPcMvDcP/3jHXoRXozL/De5z4Aa2+Ek78Ib7hyWgM7eGPuDwKrhBArMYL6OcBbbcfcCJwL/EQIsQBDptnQyoX6aAxV0VIvuKfyOlo4SCQUaLpDNRQQBALuX9KZhqpQmOwACk/nKm8kM/U6DNtMrdT/o5kC3ZYk3njOCPhdDqPiDOY+8dcqW9SJhtxb8+3ocrm6SOf0GkdI8zFaJRg3mkM6nC4wli3Wbfm3JnZT+SKRUICQZQSeFqlm7ov0rfCjC2D3C3Dmd+HFb696Pi0S4ubSMXxt5bNof/wC3PddSO3krkiJfK4N3no97P/quuueKjQM7lLKohDiIuAOIAj8WEq5VghxKfCQlPLm8n2vFkI8CejAJ6WUyalcuI9amMy9TtBO54to0RDhYIBik8E9Um4Pt3b5zSaogFtv7FmroJj7TFkx2M3AuiyDLazBvZ5tbSISJK+XKOilpmd86uWZum7M3QlOnu4FvUReL7lWy9SrsrFjwIMLYywcJBYOGAnVvG74xpRK8MwtsPylaJEgY9kiw5kCB4hBVt/2EZAFePuNsOJlNc9nXHEINr3sK+yf+CpENMZCvVx27zDHvOItnLH/MQ3XPVXwpLlLKW8FbrXddonlZwl8rPzPxwzBiyyjys7CwUBzHap6dXCfjcx9sGwSVW9gcaugPMtnjLnbzMBMB0XbeuoNnNAsnu6d8eaCu0rax8JNMHctzFCqWnM3pZE6de7gzdPdNAlr0BXaWe4JKJakccXw+K/hdxdCtJMz2t/GT/OvIrLlQa6PXIoIdMDbb4NFBzk+l7riGAv1wJuuAmBw8wjX/O0ejl+wsuGapxIz40XpY0qgGHuunixTvgQOBwNNj9mLlNldlxbmhV3pBo+YXhT1Ept2Z4DJD6BoBKXtwizQ3OO1zL3ecVZUpjEVG3Zi2qE8YmIh75tCZzzM87tSVbdVhmM7h6JmRu2pq6lG/uldccPZNCAEPZEi/OkLsNehkOjlnA3/y1H
iJpY/uJNNspPY+bcTrBOknUbtOQ1HmQn49gPzCDlPzN2YPBMJiqY199nM3LeOZCmWk4lTnVBV2i7MHHNX5+2wM3dbiWo9WabRDNB6UJVW0WaYu4PmXilHdGPu3p0hB4bSLGqPuvrUKKjPbyqv85bizTC6GU6/DN52I79c8WViMksy1sfb5aVEG7Bvu5MkVDZ8t8Ha0wWfuc8jeNPcddpjIULBQFNNTDmLLNOlzT5Pd8WkYfI2tg3PVdZ2F7ZHZ6xqaCRTIBYOmLJIZ1m+sHu6O9n9KiijrIkkoFVwb0Zz74yHGc0avi6qgiWTr601t0KZonll7l780zu1MBuH0iwLJXlT5tdw0JnQfywAA4tO4qT1K3ntisUUn9vd8LkqHvAVQuHknz8T8Jn7PII3zd2wV21ac7fIMkaNfKmpOvmphtLAF7VHJz2AouG5ytru4cs6GcnkPTfYtBL21ni3qUX2qhorKl4qzV/pVGQZ78y9U4sYvi7ZyvnUuWsGYuTTcN93EdvX1p3iZMXAUMp9GHVqF/zgeLj6NRxdfIiRdJ6zR39KSBbh5C+Yh2mREJkiJNO6p+Act/nRQP0NdTrhM/d5BMXY65ZC5nS0aFmWKZU8t9DniyVzmIK17K6ZhNpUYjCZJhIKsM/CxKRHx3k5F8AhSzv541M7yBTcS/mmCkZ3aiV4REIBo//AgblHQgFHhl1JqDb/ek0koVpJ+uZNmUidu0pK0Yvwm3fButsA+EHwcP6x860gD3Ht7MwWdLaP5pyZe24cfnE27FoHWi8XjF7MiXIZ+4jN/KX7bE7q2cc8VG14W0eynoJ75erHwtwzeSJB59d8OuEz93mEiizjzlyNQGRUy0iJ56Ebds0dZpd52EAyzfLuOG3RydnYejrXUJq9OqLs1VF/AtJUwskMzCkXMlI+zmkDT5gJ1YnIMsZnLdpEAHMam2d6qquEqpSGsda624zmn5M+x0q5iQ9suRiuPdcI/A4YHHK2H0YvGD4wWx+FN/0EPvwYd+7/BYpSsIsu7ll8ftXhqlt160jGG3MPOydUO7X6E5ymA35wn0fwIsukcmVZphyovUoz+SrNffKzLVuNF5Ip+nsTxmX1FCdUB5Np+nsSLZnxOVGMOAyScLIgqGdbq9jyRHIU2TJzb6aJqVOrlY7S9oTq3d+Ah38CL/soHPchOP5jXLzs5/xEO98I+Ld8zNgAbKh4y1tkmZION38I1t8Fr/k2HHg6BMNsX/k6Ts1/jWOz3yGgdVc9j2Liu9POlg12BAKCeDhYo7nPtN4OfnCfV2gky+glSa5YMkshrY9p+Nw2zR1mz8AOKSWDQ8ag5smaYXnBwFCKvl5tRl8HJ0uBTst8UIV6gaYyeWgCsswEEqpOm6E6txYJwRM3wJ+/BIe9BU76nHlMeyLBjzkLjv8EPPJT+Ns3ap67MghbM7zS7/0OXP4ieOyXcOJn4CXnm8caFUaCIqGaKh2rPOQ1QCdso/a8+MBPB3zNfR5BBXW3OnfTKCkSJBw0Lhm9lkNaZZlKY8nsCO67xvOk8zr9vRobhzJTqrmb2m6P1lQNdqvhFEC6tNo68pFMwRygbYcWnbgsoz5jE9Pca2UZLajDXZ+DJUfAmf9bpa2bVySv/C+jbPEvX4L2veGI88zjBpJpVsd20fWnT8K/roNiFvpfBq/+Mhz02qp1WO2F7bmShOV3NxtiOzTbqD3DKsH5NZ9O+MF9HqFRKaT5RYoGCYgmg7teIhJSZXfONdUzBXMiUK/GUCpPuqBPmde6OTfUwty9dE+2ErmiXu4q9aC5ZwocuLjd8XkiwQDBgJhQQrVSCuk9uHdYEvEK6XyRYEAQffLXMLLRkE9C1UHVNEWTEHzt5TC2FW6+yLDSXfVq6Hspr1n3Mz7P3YjHwnD4OXD0v8NeBzuuw/q62Q3LrA6RHR7Zt2YbtVfvNZ9O+MF9HqFRE5M5eSYSMht+CsUmEqplWaY9FkKI2prqmYI5r7InwTPbxpHSSPg1ama
ZzLlW9CYqU32m+XWw+8oodGkRF83dmYEKIcqBaRIJ1SY6VJWvi5UUpHI67WEQ93wbFh8O+72q5nE1pmjnXGuw83V3wqO/gAd/yMFo/LH7HE551+ehfa+667Be8cRtm5NVpvEqrWiRoDn4A+q/5tMJP7jPIzTS3BVzj0eCJvMqlLwx95xFlgkEBB2x2WMeNpBMIwQs74mbTCyVL05RcK9cJSTKDTbTrbmrTdXOLDvjYXJFo/8gFg5S0EuM5+pbCyQioWlj7mBIetbNMJPXeW3oHzC0Ad5yjWOpY40pWkSDNe8y/hWyFDf/k+Ov3MQ5BxzKKQ0CO9iZu7ss411zD5nEyctrPl3wE6rzCI2qZcyys0jIZOHeNXe9iqVNdip9KzE4lGZJZ5xoKGgysamy/R1IpumIhejSInU9yqcSajOxa8J27/NRF4ZvheYy4LkRTM29Ceau1lhdLZPn3fIGWHgQHHCG62PApfQ2HGNrx+HsLmm1w6hdYN0U7QnVKubu0T4gHq68hl5e8+mCz9znERpp7inL5BmTuXuVZSylkODuzT0TUPMqocLEpspfZmAoXVVu1+mxe7KVcHN6tDLcvTpi5rrqsUgtOrHgni3ohAKiygvdFfdeDgP3woJVvI4QW0aWQ2oFaL0cMPw3VpQ2wglXQcD5uRqV3pqynAfrAYBgQNARCzGaLdYkVLUJMnf1GtZz4Zxu+MF9HqGRLJOx2KuOhSZeCgnO3twzhcGhNK86yLgcn4wZlqdzJVMcvLTT/N2p/HCqYTJ3B1nGer8ZaOoy91BVMtArsoWSN719w1/hrs9C+xJ47i+8Ty8P8f76xyDayTvzRbYEl7Lk4Ne7PoXyzXFL4A8MebP6rXpOLVwO7tXMPRYOIIRRSu9VN1cDPqBydVHvNZ8u+MF9nkAvSbPb1EtCNRwwviheZJmiXqIkqWbuWsS02J1JjOeK7BrPm6xNm4QZViMoW+HTD11s3tYVD7NrfHqrZVyZu61EdcRlEwDgqd8DoEX2qvFY94JsUW+st2eG4cb3Q+8q+Pe/QSjK1355O7teeIJvnNQOQ8/x1KOP8tfu1/PxgPtzdTpU2VgxmEwTCQbYu8N7+WFXPMJGMjWauxCCRCTB4iYTAAAgAElEQVREKl+kPeYtPFqT0iOzxDQMPGruQohThRDPCCHWCyE+7XD/+UKInUKIR8v/Lmj9Un3UgzWg5xqVQkaClg7VxsFdsXtrcO+Mh2YFc1c+L/1lw6jJmGE1grIVtra4z4jmXiehChWG62oatu0JoyX/V+dx3siPyGRzTa8hVyg1Du63XWyULb7h/xlJ0ECQYtdKbskdDse8H874JhfHPstznfWnFTVqFhtIplnWEycY8F76qp7TyWo4HgnSEQtXzV6tBy0SIlPQKZVkXf/86UbDrUkIEQSuAE7GGIT9oBDiZinlk7ZDfyWlvGgK1ujDA6zB3ZW5W7oBVYdq0YP9gHo+qyyjqh6mqp7cK6w17lDR3KdiGpO15FLBKD+cXuY+minQHgvVBDN7U9WIU+JVL8JNH4B4NxxwGq965GfEA89C9kiIdXheQ7ao1/eVefImo1zx5Z+GpS8xb+7SwmQKOrny/FVjMlj9MKRM0Vw196G052SqgnqtnIJ7IhJ09Zd3gqrQyhR087MwG5i7l+uOo4D1UsoNAEKI64CzAHtw99Eknt+V4sEXhszfu7UIJ69uXMrlhJzFLKye5i6EoSuqDlUvmruqjKhm7mH0kmQ8V6Q95v2DvH7HGO2xsGm6NVnYk2naBMywto1kGc8V2G9RbePJYxuHeWb7GAAPld8rK3PviIcZKw+hVsE2W9C5/Ylt5msbEIKTDlxUNdsUDNuEe9cnOXbf3hqWmMoVeXrbGC/pr/Y+AYOZO1VjtEeN/oP7NwzREQ/z9w3GGOMOq7xw3+WGidbZP4WDX8cfdizi1E3fgiuOgo6lEAgyVgwSfMWn0fZ/uetrlivo7na/mx+G33/Y6DY94RNVd6mg98t
/DJKIGld/NXa/DuhymdsrpWQwmeLolT0Nn8NpHU4bixYJEQp6Jyxxi43DSKZY9fwzCS/BfSmw0fL7JuBoh+PeKIQ4AVgHfFRKudF+gBDiQuBCgL6+vuZXO8/wuZvX8rd1O6tuu+ujJ7Bqr+a72zwx95xOIhJCCNFUKWTeKbhbTKCaCe7vu+YRXrS8i6+ffbjnx9TD5uEMHbEQHbFqJtaMGdYXb3mS9dvHueOjJzis92G2jGTN3xe0Raq03a54uOxRXjAZ8s2PbeFTv/lX9fOcuC8Xn3pg1W0PD+zmvKv+wU/eeSSvOGBR1X0/+/sA37jzGR757Mk1gSKZytPt0BofCAhW9Cb441Pb+eNT2wFY1h2vVLTsfAb+72uw+iw4+HUArF12Nte+EOcXSx6GYha9WCC15Qk6fvNO+MhDoDkHzWyh5MzcH/k53PJxo5HojVdBsHrtKxYYVz1f+H2FGy7tjjuew4ruRITkeK18lEzlSeX1hqP17Fi1qI2+Hs1Rylm5INFUc1bC8pkbzuRpi4a8VRFNMVqVUP09cK2UMieE+Hfgp8Ar7QdJKa8ErgRYs2bN9E84mGXYNZbjZfst4GtvPJR128d419UPsWFXavLB3VVzr1QHhJsI7oq5R23MHQwJYHkT69ydzrdUox5OF+hJ1HqFNKO5P7djnJ0OgUNKya7xPOe9tI/3vnxfwJA4rCzb+jqo4L5hZ4pwUPDnj5+IEPC2qx7g+Z2pmud/bue4eX57cH9u5zh6STKQTHHYsq6q+waH0hxiqdix4pYPvawqQWpuAiUdbroIIgk4vWK8lYgEubd4EPmzP0YkFOCZLaN84rs/5/eBzxrWu2/6seN5ckUbcy/m4fZPw0NXwT4nGva6DhvD8asW8sB/nmR+XgNCsLiz8VXc8m6NZ3eM1dxecYNsLri/45gVvO2l/Y73XX7uEU09l3XUnpOh20zBy/ayGaq+v8vKt5mQUiallOrb8SPgJfhoiJFMgUUdUZZ1a7y4z7j8HkxObPC0NaDXa2JSwV1ddnqpc3fW3CdmmpXK6WSbGO/XCPYvUzAgiIYCnqtllKPkcDpPyeZtnyno5PUSS7s0lnUb/9ps1RVdDja2g0MplnVrLO8xHrNyQcIczWeFCkyDDvepz8GA7fNQ1Ets3p1x1Zi1SMhc67JurVIN8shPYdMDcNp/Q1tlI4nbqosGh1I8KVdw54LzDZfGtb9zPE+2UKp2hLzzv4zAftyH4d9ucGX8AIvaY+b6lnTFPeVs+ns1Nu7O1LxH9pyLVwTq1OgHA6Kp5Kx10PjILLH7BW/B/UFglRBipRAiApwD3Gw9QAix2PLrmcBTrVvi/IXVg6JLi9ARC5k1u81CBeBwUNQJ7pWmjUgTlr+O1TITcEQslaSRTGvhGLzhTIFOm0SRiIY8M/ed4znSeZ2ShHHbY9w8XKywd4WCEZCtMkFfj8ZgMlUzjk8FfHsAN+4zPgf2wL9luLZipyHSQ/CnS2HF8XDo2VV3JWzVRWotv0u8BZa8GP7wMRjfUfOUyuIAMKpvHvwhHHkBnHwpBFtfYd3Xq5Evltg2mq26XVlPLOtuLri3EhXLC718BTdHgruUsghcBNyBEbSvl1KuFUJcKoQ4s3zYh4QQa4UQjwEfAs6fqgXPFzh5UKxYkHD8onuBCuht0ZB7h2pONz+IlWqZiWnuXWZjiffgrsyVWsncRx0ug7VI0LPmbr1SGrHPH/VQs2wy93JwNxJ81YOa+3s1UnmdpK2efNCFuStbYah42SiooO86K9QJf/4SZEfhtMtqvFvso/bUhrM7q8PrfwD5lDEgw4assqOQEm77FMS64BX/6X1NTUKVutq/H4PJNHt3xGZ03GOlt6LI8FwK7gBSylullPtLKfeVUn65fNslUsqbyz//h5TyYCnl4VLKV0gpn57KRc8HOLHCvh7N8RLdC8zgHgu5M/eCbl6GNzOJyUmWsc7D9ArFDlvK3NP5mpriZvxSrMHCyS4X6tcsd9i
Y++50gbFcsYq5q0BvD0wqcG/ana4ad2j9DNQ+pkmNecuj8NCP4agLYa/VNXdr4erqIrXhDKcLsPAAo9rlqd/Dln9WPS6r6tyfuMGwFjjpkrpSzGSh/t7BIftml246mdpqWLui55rm7mMK4NRl2N+rsXl3xhObtkM1LrVFw+7DOnJF8zK8mVJINZPVytxj4QCRUKApWUax6WyLgrtqGqll7t5lGasWbv9bTObuRZYp1zdXXCMrzFqxbGtgGk7nGc0W2W9RGwVdsmW40u2rAvh+i9pqNvvBIWMQuKduTMWqtV44sab3EKj4l6sOS3VlYL4WR78XYp01049yBZ2OQA7u/Kxh1fvitzdezySwuDNGKCB4wWGza1ZvbzUqCVWluc+83S/4wX3G4BQ4+nsSFEuSLcNZt4e5QrHr9miIfNE5eFobRsKByZVCmo6ITcgyih26bT7NYjxfpCRrNXH72LN6GExaA2713zLqsAHboZwoVTB0GtS8vCeOENUsXP18/KoFVY8z7kuZ920dyVZthgPJFMu74/W7J1O7jCai370XNv4DTv4CxLscD01YAlNBL5mfPbOmPNYBR78Pnv4DbF9rPi5bLHHizp/D2BY47etQxz6gFQgFAyzrjlfJaKlckV3juaY8ZaYCSupMpvLk9ZLP3Pd0jDpc8qtGnIkkVatkGQ+lkIFyRcBESyHV2pti7mU23SrmrjYWext+PBzyLssMpdlnoREcaph7WXJqNG6tS6s4ZFa6WCvBPRoKsqQzXh3cy8H8hFULqx6nfm6PhTi8XAK50SbT1ASzYs4w6Lrzs/D94+Dr+8L1bzfklBedB4e/1XXt1qavzbsz6CXJPgsT5Mu+8IAx1SjSZrL3Ukny6tK9HLPlZ/Cif4M+p7aX1qOvN1H13TCnYs2wLKNKQtXV15zS3H20Hk6+H4rt2S89vaAqoepqP6BXjRELB0WTmns1O7N7czeCGlytpvhMFm6auMHcPcoyybQZRO35g+F0gWBANOyg7LR0Tw64JPj6erSq5Ki6YjhyZQ+RYKAqaBm2wlplsy9/HqyDwE1segi+vgp+dibc/31D937lZ+Hdd8GnB+B1V7ha6UJ1QvWF8prM10O9t1oPHPUeoyxy17MUnrmDb4e/x5bOI+CMb9Z9bVqJ/h6NgWTarDqaaI17qxEIGBOttpab3XzmvofDyfdjr/YYkVCgSirwCsXW22IhSrK2Cqaol8gXS1WTZsLBgOtG4PTcETtzb3JgR0Yx96JeUxY4EbhVsxg2to2Z+1i2wFAqz/57tTvmD9QQ6kZ12FbzsIFkytFXvL9Xq0mULmyP0hYNsaynWm4YTKbo70mYteyK5VsHgQOGpn7nZyEch3Ovg4tfgHf83kiCLj+qpjvUCQlLMlCt79Byg1TV6/HSD0AoBjd/kMgN5/OMXM6fj/iOce5pQn+vxli2aL7vZo17M5VDUwQtEqowdz+479kwnf0svh+BgCgzvIkz9/YyE7NLM+lCxRFSIRIMUPQwZs9Jc4fmPd1VwJXSW5VOI1Qqjmx17pGguZHUQ2UeqmZITPZSSI+VD9bcg5uJVV+vxq7xPOO5Ys1x/Zb3XNkK9/Vq9CQitEVD5mZf07Dz3J9h8D4jmB9wGkTbGq7VjrilAWcgmSYWDrBqL+N5qgzR2hYaY+0G/47evoR35C8m6KLjTxX6bJvdQDJNZzw8K7zTtUjQzFd4Haw91fCD+wxhJFOg3cGDon+C5ZBWWcb6u4KqVNFszL2pDtUazT0yIc0dDPY+WbhZ2mqRIOmyBWs9mJptr+Zo3TuaKXgKHOoKJp0vsnMs5ygTKHZp1rYn0ybD7+9NMDhkyA2mrXCPhhDlzd7W7NTXkzB2yD9/CTr7JlWpEgkGCAUE6bxu6Pk9CdOyoOa9Pf7jcPT72HrmtSTpbMp/pRVQuYaBZKXBa8UMSzIK1iHZvua+h2MkXXDc4fvKl+/NyhZWWQZqg7sqDUxYNfeQt4SqU507GEF1PFf0PIc
1ZUlytiKp6tZBqkVDSNl4A6lotomqpKjCsMdWckNzz1s2i1qZwFqnnS3obBvNmgG/r0djPFdkKJWvcbns79WqrAjUIHCeuQ22PAIv/xSEog3X6AYhhNkXMDiUMjc6cJhZmuiF075GKm40pE9345Bi7tbXw+m1nglYh374mvseDrc25f4ejXRedzSyqgdV0aI0dXu5ofIOiYetCdVAU/YDYZsNqlq/1zFz1gqWXAuSqiPpApFQoCbIJDza/g4OpegtSx9OzF1p7o3QpUXIFko8u90wAnOTZcAISBtt5ZJmk9NQumZkXF+vxsZyk5M5CDwg4C9fhp594PBzG66vEbRIiPFckcGyVNTZ4H1V712snp/7FCAeCbKoPcrAUJqCXmLzsLvHznRDyZ3BgKjxH5op+MF9huCm56ovdbMGYmrGqbJhtQdtc8Se5YMXDgQ8M/dIKFCTWKw7ld4BacuszlYxd6fgq7pwG1kQDFikkU4HiWk4nffEwtQV2OObRwDn6o2OWJhuLWwE8KRzcB9MpmtGxq3oTVDQJVtHMpVB4I//GrY/ASd+piU+Llo0yEAyRbZQor9Xoz1qDAJxq4RS791MtPyrK5ktw0bZpteh2FMNFdw7PSTgpwt+cJ8huDF3e/mbV6gAbBqC2TX3fG1C1ZBlGss/uWKJqIODXrPmYVZZphWNTMNp59fQboblBkNjVsE9XJVA1EuSsVyxxpTMCWqDeWzjMB2xkGtdfF9vgsFk2tTQ1Ua+rFszm5zMkXHpXfDM7ayKGhvGYDKNTD7HxanL4HcXwt6HwiFvaLg2L0hEQjy9dcxcoxCCjpj7GEXlDTTdzB2MfMPAUKqyQc4S5q6umGdLpQz4A7JnDG567rLuOAGBo0VsPeR13QjuoUbB3ZZQ9SjL2JOpYG299xbcrRUsrWLuTq9hpXbb/Rz5YomtIxn6epcBhsSUyusU9BLhYICxbAEpvemnaoN5YvMI+yx0r1jp79H458bdDCZTtEdDdGthyI0Ri7Sxd0eMgaEUW5LDvC94C1x+HuTHWAPcE10At67i18V/IMeicPwn4NgPtqwrNB4JMla+qlLBskuLuF6Rqfcu6jaJaQrR36txwyM51pWnY810d6qCqjqaLZUy4Af3GYGUsuxmWMvwoqEgizvjTde6K1nGDO52Wcacn1qtuTcjy9jRrKd7dUK1Bcw9U2BpV22dtdVf2w2bdqcpSaqYOxh/y4K2aFODjtVjU3m9rkzQ36txy+NbeW5nikO784ibPgCP/gKiHVwl+xl4fgUfT9/PUrETDjgdjnoPpR1P89htN/Hi0U38XD+ZZWd8lpOPOrThmppBwqIXq6lI9cpcczPI3JWEdc/6XURDARa1TzyZ3EoouXO2VMqAH9xnBGoIhBsr7O/VmmfujWQZB809EgyY5VtentuOylR6b86Q6XyRSDmJ2wrmPpopsHpx7VBnq0ufGwZsSc0urTq4e7H7VeiybNI1MkE+bQyx0PMck+vmLplj1eBdfDJ0PfwrC0e+B2SJxNr7eXn6TtbLJTz2kq9w+plvBiCw7yv55r2r2TKSIVss8YelrR9Pqa50lnTFTCvoLptMZcVMMndVMXP/hiR9PVp9j51phPrM+bLMHo7htHMJn0J/r8ada7c39ZxKOnGTZVJOmntQMJr1yNydNPdmE6p5ne5EmO2juRZp7s6DopX+mSm4M/dBW8lhh7lRGX+Ll0EdCk4WEgC8cC/cfBEMbQDgWOD2MtF8oe1IVrztCsNWF/hDYj1fv+NpQHDVqjVVz9/Xq7FhV6pqva2Esv21dnp2xsOmHYEduRlNqBprVMnf2QJrQnW2wNN1lRDiVCHEM0KI9UIIZ+9Q47g3CiGkEGKN2zE+nO1+rejrSZBMVboZvcAuyziVQgZEtflXqAn7ASfmHgoGaIu6J97sSOd0s0Fmssy9oJdI5XVnzd0Lc0+m0SJBFrYZ0VYxLlX+N9zgPbKiPRYyZ2D09SQgN24Mib76dJAlwxL
gPzaRPO+PXJT/IO/IX8zfj7vKDOzG4zTAeBJ70FJXA91a2BwE3kqoqznrxlHPWkJJao4DsqcY3VrY7MJuamDJFEPlsrwk4KcLDd8dIUQQuAI4DVgNnCuEqHH9F0K0Ax8G/tHqRc43mMy9jiwDtVN46iFXlk6idTT3RCRUVaYVmaTmDtWt942QyhfNYdaT7VCtx6zt04WcMDhklBWq18M+fMTcgD0w90BAmEF3P309/L8T4MGrDD+W990HK0+AaDs9+67hj8Hj+Gvp8JpEoHrPnUbG9Zk171MTzNRmaJWUVN2/U5dvrvzexWZAlhFCVDV4zRao5sC5xtyPAtZLKTdIKfPAdcBZDsd9EfhvoHkz8j0MKnC4ZdbtnXheUNHcg+bvVqRzupnRVwgHBcUGLfrqudxazZ2af9yQyeuV4D7JhGq9qx/VqNWIuVvdFVX5otqo1PANr1/WrniIC8O3seC6M6CQMdj6qV+BSCUgKzsBqJVXlCTi5Chp9aCZCijmbg2WnfEwUmJW0ViRLZQIiNqmtumCWudsqXGHCnOfTZq7l+C+FNho+X1T+TYTQogXA8ullLe0cG2O+Oed1/DoZadRKHj3NJltGDF9whswd4ekaq6o8+VbnqwJqHndCMCupZAFvSqZCspbxoOfu14i4sLSurSwZ809lS/SWw7uuUky93oJz2BAEAu7J4tL5W5PazBTBm7qbxnJFIiHg+5Jw5HNhnf6ZfvA1/fj95l38JngzxGrTob33Qsrj3d8WF9PgnBQsLizusqnUwvTGQ87epPbm51aDbUZ9tk0d3Auc1XDsWeqWUetc7bUuMPs1NwnnVAVQgSAb+FhKLYQ4kLgQoC+voll/bM7X+CY9H2Mje8m3L1oQs8x02ikubfHDF1x20jtRdC/No3ww7uf5yX93Zx6yGLz9nyxRESzBvfqwJbOFausB8CYo5r36OfulFBVf8OzO8YbPodekmQLJZMhT5a5N5qSlIiEzK5cO3aMGQldq8wRCgZot+QP3BqkTNx+MSTXG8MqkGzZOsyjHYdywps/WjOE2orXHbGE/l6NoEOVx9uP6Wd5d23AWrEgwWmH7M3Jq/dyX88kcMy+vZx2yN7su6jyenS5mYdhGY49Qzjl4L3YOAtmp1pxyJJOXnHAQg5fPr1OmfXgJbhvBpZbfl9Wvk2hHTgE+L/yTr43cLMQ4kwp5UPWJ5JSXglcCbBmzZoJeb6WYkbpW358COZocFdDIOp5UHRqYUdvD8VY7Q06pixTR3O3n8+75u7+ZXYy3HKCYtFt0RCRUGDSQ7IbTUnS6ozaM+ec2oJDhyV/UHfQ8bo7jClHJ11iOCUCB5b/NcJrDlvCaw5b4njfx199gOPt4WCA75/3Eg/PPjEctLij5vnrDUA3h2PPEI7o6+aKf+uesfM7oTsR4SfvPGqml1EFL9vvg8AqIcRKIUQEOAe4Wd0ppRyRUi6QUq6QUq4A7gdqAnurIKPGzlhI7Z6Kp58WqMBR77LWTctWt9kDV65BnXsmX6u5hzyO2XOrljHWGWE0U2joYqnq7OORILFQYNLVMo3q0LVwyDWhaq9xV7BWiLh6uefTcOsnYMEBcMwHJ7r8WY+uOtYSueLMBncf3tAwuEspi8BFwB3AU8D1Usq1QohLhRBnTvUCa1AeEKDP4eA+7MFt0E3LVo0l9sClpBOV5HKqc7fa/YIhy3iulqkjy+T1UsNmKFVnn4gGiYWDk65zN5PSMeern3rMfTCZJhgQLLF1t1rH5Y26Bfe7vwHDg/Cab0Fo9pS9tRqVBjVnzX0mZRkf3uBJc5dS3grcarvtEpdjT5z8stwRiBsjwPT03A3uoxlnL3crOuNhto/WatlKqrFXgih2LYQwZA/7JKZcscpXBpT9gERKWfcqol4ppJXh2Z/fCqV/a5EQsXCwJczdadiJQj3NfWAozdKuuNmNqdClhXlm25j5/Icts71HO56Gey83bHZXvGxS65/tsNox2JEt6ER95j7rMee
234BmaG0yMzzDK5k4GibrMOQOJ9akmKWdKVsDcNShOSld0Ku6UwEiZZbfqByyUZ07ODM8KzKWMX/RUKAlCdV6NehqAIUTBpMpx8oTQwozNoQazX33C/CLsyHaDid/cVJrnwuIhY33yVGWKZSI+cx91mPOvUMhrQeY28G9brKujM542FHLHjGZu4MsU/7CRUIOwT2nOzJ3oKE0U09z92oeVsPcJ1sK2eA1rBfcX0g6V1oYnu55ckWdTEGvJGuHnoerXwP5MXj7jcY80T0AXZpzg1quqPua+xzAnAvu4ViCggxCdu4G9+F03pPm7qRlO1XLSCmNOvegc3Av6CXyesl0/1Mwg3udOaqlkqSgS1fN3e7J4oZ0leYemPQkJjc/fAUt6pxQHUkXGMkUXJl7QZdmCWpHPGz4wlx9BuTH4e03w+LDJ7XuuQQ1PtAOo1pmzoWOPQ5z7h2KRUKMkEBkR2d6KROC1yEQbppnpVqmErhU2WMVc7ewcRVYnTpUrY93gv257fA6as/0kw+3iLk3mJKUcGHuaoydky+J+lvUIIi95S64+rWVjtPFh01qzXMNbgPQsz5znxOYe8E9HGBEJgjkRmZ6KROC1yEQXS6M2KkU0hxgrYK7TXNP52vtfsGbLKOCez37AXCuh7ZCrUGLtkZzH8kUHf3wFeKREOm8XuONYh9xZ4X6WwaG0nQyzrH3Xwi5UXj7Tcbkoz0MHXHnHga/WmZuYM69Q7FwkDE0Qvm5GdwbmYYpNGLuVs3dDO4usoyqrLEnVD0F92J95t7WYN6mfQ2JSIhoODipJiYpJSOZxswdahPPg+Ua9yrN/ZGfw80fYllqLSDZsiPJjyNfJz42COf8co9j7ApuzpB+nfvcwJzzc4+GDOa+ND83ZZlG1gMKqhLEGjSNoObA3E3pJFj+v1qWyTiM2AOjzh2oO0fVvnHYIYTwZB6WyRcRwrjyioUmV+eeKegUdNlQcwfjdbJesQwkUyxoi1Zue/oWuNloRjqMn/KHyAoiT7azn1hP8pQrWejiEbMnwO19Vd4yPmY35iRzH0UjXJibwV2VMjYuhazVssdzRfSyzNBIlrEGTzViz55QVaWQE2LuL9wLP30t3Pdd9omONTQPS+V1tLLZVCw8uQ5VL1c/agCFPak6kLQYhm19DG64AJYcAZ9Yx+5XfI0gOvtnH+eS4vlEDn39hNc4H9AVD5PO61VXgVIaHkF+KeTsx5xl7pE5Gty9MndVhmfVslVQi4QC1QlVe3APBRjLVu6v6N0T19yrgvvoFrj+7Uai8fm/8SsCrB08ArZ9x1WbTueL5vkn28Tk5TVU3bj2Zq/BoTTH7NMLo1vhl+dAvAfOvQ7aFhF+6Xs47bblLA2n2FJq41KX7tc9BZ2WBrWF5VmlijT4TUyzH3Nu+xVCMB5oI6qPQQM/k9kI0ye8AXNPRIIEA6Lqslj9vLgzVsXcczbpJBqyJ1RboLkrWUYvwK/PNwL7hX+Bix7ilo5zWJ57Dn52Fuxc5/g86bxuXjlEQwGyk5BlTF8Z+2s4tAG+tRp+dhb7Pf8LlokdVaP2sgWdbaNZjopsMBqSsiPw1uug3XBbNF7zAJsLbXTEwrNmPudMoZL3qRAMM7j7zH3WY05Sk0ygjaDUIZ+CaNtML6cpeGXuQojykOLa4L6kM85AMk2pJAkERE1FS00ppEtCNWT60Lhvkjm7LHPXJbDxH/CmH5tj4v605EKufeF4rhWXwM9fD++6HbqWVz1PKqcTj4Rg17N0y1H0kqSol1ztA+rB9TW8538gtQsiCfbb8EXuiUL6hu/Cga+CfV9BMh3gp6GvcsJjjxuM/c0/rbrSUK95MuU8m3VPg1NSfybnp/poDnNy+00H240f5mAj03C6wRAIC+wJLRXoF3fFgEolSKNSyIrmXmv5C01o7mt/B/d/D45+HxzyRvOYrniYtdleOO+3Rungz19vBFkL0vkiqwJb4f+dwOvXXkSA0oTZ+4iT3e/YNnjsWjji3+C
iB3n2LX/ji4XzSMWXwCM/g2vPYelNb2Z1YIBNaz4NH3kcVp1c89wqoM2moQ3jpl0AACAASURBVAszBVMatBAMVcLqB/fZjzkZ3LPBjvIPc68c0ov1gEKnrRTNytyhErSdNHcnWaa2ick4vljyUOceFPDHL8DiF8HJl1avMx5mLFdE3+tQeOuvYGQjXPMGSA9VnieX5WOj/w2lIgvGn+GNwb9NWHd3ZO73fw9KRTjWqHyJLNqPq/TTufvoH8CnB+Adv+evB3+J43P/Q+zEj7le8Smpxw/uzsxdNZ/5sszsx5x8h/Lh8hdzDvrLDDdom7eihrmXGavJ3PM25h50kWXyRYIBUfOFDJve741LITtGnoXdz8NLzq+xuu3UIsa8zWwB+o+Ft1xjOChe/RoY3wnAW0avZkVhPZx9Nbu6X8SnQr8iNz6x9284XSAUEJXqn8wwPPhjOPj10LMPUNnIUnkdQlFYeQJ/ib6SQCRhjvpzgs/cK3BqpMv5zH3OYI4Gd8Xc515wH/Fg96vgpLlHQgEzOKlKkBr7gWCwpolJi1hmXm59DL51MG3DTwHeZJnujXcCAg48o+aYGoa36mSDwQ9tgKtPh0d/ydn533FP9+vgwDNYe9h/sFCMoD1wuafXwY6aYScPXWWYeh33EfMYJUGlLc1eA8kUfb2JuvbGKqD5mnvFN8iJufveMrMfc/IdyofmsCyTbjyoQ6FLq/b2GEkbQU01I6U9yzLFar39X9fD6CYW3XMJIBuUQhpf5vYXboflR0Nb7WhDR6uEfV8B591glE3e+D6eYxl/Wv4hADILX8QN+vF0PnqlYaXbJIatdr+FDNz/fdj3pKpO0rhZ516RfgaG0g2HKvvMvYJgQNAeC1UHdz+hOmfgKbgLIU4VQjwjhFgvhPi0w/3vFUI8LoR4VAhxjxBideuXWoEeKQf3OSjLNKO5d8TDjGYLpj/KSHmCk6rhTjeQZZRdcDpv83JfdztE2oltuZ/TA/9oyNyXiR1Ed62Fg17jeEyn20i2FcfB226ElS/nw8UPE4kZgTUWDnBZ4S3IQBDu+M+mS1qrpiT98xpI7YSXfbTqmEBAlG1/jQ1QL0k2DWUcPWWq/xbjqqirjm/NngS7NKgSqr7mPvvR8B0SQgSBK4DTgNXAuQ7B+5dSykOllC8CLgO+1fKVWmAG9znI3Icz3svsuuLhspZtBKjhMnOPh6uZe85WCqn+V3JNOq+jqRF7u9ZDcj288j8pLDyYz4R/iZ5Pu64hXyxxSqA8DtdBklHrNP42hy7V5UdSPO9GniguNa84oqEg2+lh0yHvh6f/AL+90GDgHjGsrn7GtsOfvwj9L3OcjKRFguZ4v22jWfJ6ib5Gwd1n7lUwBqBX6tx95j534GX7PQpYL6XcIKXMA9cBZ1kPkFJa20UTwJR2F0UjYcZJzDnNPVvQyRZK3qtlbI6LysPc3n3pVAppvT1lHbG37nbj/wNOJ/uqr7BM7OLA537iuoZcscSrgw+hL1xtJivd1jmSdnaGTBcqXu5Q0Ws3HPDv8Mr/gsevh5+cbnSNesCwMg277VNQyMJr/wccdHQtEjKTzgNJw+q338Hq1wq1UTVqMttTYJ0rC5W+h5iHUl4fMwsvTUxLgY2W3zcBR9sPEkJ8APgYEAFe2ZLVuSAaMpwh26ZYlnng+SE+f/Nafvv+Y1vCVLw2MCl0WczD+nuNxx+4uL2iubvVuYeqg3umoNOjKkTW3Q6LVkN3PwFtKX/QX8qpL1wNL7wWQnHufmYzv11X4FvvfQNCCIKZXawRzyAP+ITrOp1MzqzI2Eox1WuZLZbghE/CwoMM9n7liUa1jdRZt22Ugej+nHzBVyBY/TEdThc4Knc/PH8jvPKzsGCV43kT0RA3PbqZ257YanryNJJluhPG39LdwG9/T0FXPMLW4Qp3qzB3X5aZ7WhZh6qU8grgCiHEW4H/At5hP0YIcSFwIUBfX9+EzxULBxglweIplmXueXYnT24dZedYjuU
NEnFesGm3IT0sLtepN4K9CsWY4BQx9XNVCeKkuUNFlknliizv1owcxeDfzVrwcDDAVwvnckr4n8a0IeB44FgpyP7pWeKv/BTLdvyVoJCUVr/WdZ3RUJCeRITNw87SirInVkndSnAvJzsPeg1ccBf84aOw7V8ggsSHxzm59Be4+lF441Vmx+totgDZUc7a/E3Y6xA47sOu67r41AP4+3NJ8/e9OmIs667/2r9sv4V88ayDeUl/d93j9hTs3Rnjj09tN7uhVXD3vWVmP7wE982AtZd8Wfk2N1wHfN/pDinllcCVAGvWrJmwdBMLBxmRUy/LvFAe7JByGNc2EQyWpwA1Yo8KJnPPFCjoJVJ5vay5W2q4MSpaAgKzld8uy5gJ1ef+ZDT67H8aYExi2sxCfnH4NZy/KguhKP/7t0GWD/yWs+75Kmy5nwN3DrNRLmR5A0/zvh7NHIRhh93bRrG+qoEdex0M774TMJwHT/ncHZxUvJvLt1+N+MFxcOp/Q2IBo5sG+Wb4F2j5JJz5Kwi6XwWdeMAiTjygtrqnHiKhAG87ZkVTj5nP6O/VyBVL7BjLsXdnzPeWmUPwEtwfBFYJIVZiBPVzgLdaDxBCrJJSPlv+9QzgWaYQsXCA3VKb8mqZgfJgB7uz4ISfrxz8vF4FWOuMFXvv0sJmJUjGUgppdW20yzJmcH/mdtB6YdkawPBSCQUEO6J9sPpAAB645wG+UVhM/0tO4UWPf4Xleo6rOYPz69SGgxEEHnpht+N9aZufvLJecBvYsXM8Rzqv83uO5WNvOoeV//dBuPG9gMEslgVhx4s/zKKlL6m7Jh+ThxpqMpBMGcG9oCOEH9znAhq+Q1LKInARcAfwFHC9lHKtEOJSIcSZ5cMuEkKsFUI8iqG710gyrUQ0FGS4lEBOsSwzWE7CZRxmcU7s+dLs3RHzrN9bE5V2vd5aCZIvlqqGaajgnjODe5FEBFh/F6x6NQQq5w8HA1WlkEZSVHB3+xnwnj/zRNcr+G3glIZr7e/R2DKSIecwGzVlGbEHFubu4i0zaLkCWF9cBO+60yipfPddXHPUjRyS/RHaKZc0XJOPyaO/10hAK6KTLZaIhgJ1G8F8zA540tyllLcCt9puu8Tys7vwOQUwNHdtSmWZ0WyB3eUEYatkmYGhdMNSPCuioSDxcJCRTKHG5laLhCqau14ypjBtfQwevIrYfp80b88XSxR0yb7ZJyGzG/avDtThoKiaxKQ2kYGhNOx9ONcsv5RtqR0N19rXm0BKI6+w78Jq35a0ZcQeVCot3LxlrPLOQDIFob2MpijgiQf+RTRRpC06Jw1N5xyWdsUJiMqG609hmjuYk9dWSnMXhTQU6w9mniis7NE+zWeiGEg27pC0w6gzLpgTmazMXckduTKb4t7L4ZGfsupf3wQMRq+uOg4Y+gsEQkYnpwWRUDVzV2Vv6u+3Sz5uUHmEQQfd3RwWUtbcAwFBJOg+JHtgKI0Qhr+6mnlq3pdsboP0MTlEQgGWdMUrzN0fjj1nMCffpVgoyAjleuUpkmYGqoL75GWZVK7IrvEcKxbUr7O2Q3UIqlr3Lofgni+W6AgW4JnbINLOkqd/wrGBJ8gXS6TyRU4JPMjBG681jLViHVXPb5VlSiVpbiID5eRvTm8uuKt6ciuchoVEQwFHCQcMOWxJZ5wVCxI1SdpBDxYCPlqLFb0JU6L0h2PPHczJ4B4NBxiV5S/4FEkzKrhBRVaYDBQD7WsyMKkmkpF0NXNPRENVlr/HyUegkII3XUW2c1++Gf4BpfRuSgP3cXn4f9ndfSi89js1z28Ed0OWGcsVKUnjamH7aI5sQa/R892wsC2KFgmaDM8K00/eIqVEw8G6zL2vR6O/V6ti7rmizpaRDH29zW2QPiaHvl6tirn7DUxzA3MzuFuZ+xRVzAwm03SX9e1WaO6KgXotg1TojIcZzRRMucQqy5iWv3qJVxTvhsQi2O9VbDvpchYwwsH3fYS9b30nm+QCHj/hSoj
UBsVwsDLJSbH2w5Z1AcaGlFeSTwMIIejr0RxlmUy+tsIiFg64VssMJtOsWKDR15Ng0+602YC0aXcGKfGZ+zSjv0djOG1UbGULJb+BaY5gTr5LsXCAUTn1sszKBQmioUBLqmXMGvcG7e92KM19JFOgPRoyatmzo2iRCnMP5MdZk38QDn4dBILIJUfwneIbWLjjXkqBCO8oXEy4Y6Hj84eDAQrlqhWVtD1saSdgvAZeNXco17o7MfecTiISqqqwiIWDlSYmC8ZzRZKpPH09Cfp7NQq6ZEu5OWpwghukj8nBmk8xNHefuc8FzNHgbtXcp4i5D6Xp701UyR+TwUAyTWc83LRnidLcR9JlH/gnb4L/XsErR2805aLDU/cRJQ8HvwEwkmDf18/k8QM+xCMn/IRNclHNiD0Fq+Y+YjJ3FdxT5Uocbx8TJaMoF0uFdL5YM781GgqYgx+sMD1gejWToStpRt3nJ1SnF31lQjI4lDaS9z5znxOYk+9SNGRh7hnnxpnJwNR2ezQjcdkizX0ijLNLi5Ap6Owcz3FgNAk3XQRCcPqWyzkgvxaAl2b/SjK40PBbx+hQ1Qny6MoL2KntC9QOx1YIBwXFcjBWSdv+3gTtsZApy3jR3MEoh8wXS2wfy1bdXmM5jDtzV+y8r0czg7iStAaG0miRIAvbop7W46M1MN+HoZRfCjmHMCeDeywcNOrcYUpkGVPb7dWqqlImg4FkuulkKlS6VDfvGuEz6csAAe/5C6PRJXw78G30Hc/w4vwjPNz2cgjUGoeZZYgudeFhyzBtaxdsf6/WtCzT31MdjBUM5l59/ljYuRRSyTr9vRqLO+OEg8JMbg+WX0O/gWZ60RYNsaAtwmAy7VfLzCHM2eCeI4IeiE6JLGPVdq3a9kRR0EtsHm48KMJEMQ+/Ph9u+gD7jj+CoMRbx69m38I6OOu7sPgw/nTY1+kgTeDHpxCmyGMdFSPOqCW4p8wGIucvpLXOfdhSkdPfkzCYu2qQ8gC3WvdUTjftfhVioaBjE9NAMk1PIkJ7LEwwIFjeXUnSqioaH9MP5R3k17nPHczJd0ll6/PhjimpljG13Z4EiejkmfuW4Qx6SXpPpt77P7D2d/DEbzn2nvP5e/SDXBC8hb/3vh5WG1b62Z7V/EfhAkR2N5vFXmxtq8xPsRqHZQrVdrt2WEshRzMFoqEAsXCQvl6NTbvTZPK6Z1lmSVecYEBUlZGCwdzjNcw9aNojWDE4lKoK4H3lK4hSSU5Y2vIxefT3JkzN3a+WmRuYk++SaTwVap8SWUZpuwvaIsTDoUkHdyVTeEoE7ngK/noZHPJG+ORzDLziuzxZ6uf/9MO5Z5/KKDktEuTG0svYdeJlXBa4oMqCNRAwDMHyuk4qVyRU7gh1Qiggqpi7cqLs7zEqVbaPZT3LMuFggKVdcQdZRq+5coiGAq7M3RrA+3uMJO32sSz5YsmvcZ8h9JW9g8azRb/OfY5gTgZ303gq2D5lsozSdg3mPjlZxqoj10VJNxKmsQ447TKIaHDIG3lX4VOcX7iYtkTFs0Vp2Nv3P4f/K72oJnirIdkqmemmU4fL81aher6r2oikbM4B0N54BCqhWs3cnZqY8sUSW4YzVXXsfb0JxnNFHh003me/xn1m0N+rIaXRU+Fr7nMDczO4///2zj1Mrrq84593ztx2dje7uRGSzYVAAgElgG5FBAsEEBAKtVWEoo8VKo9Pi8V7EVtqaZXitX0e0UeKWqQqUOCxqUYQAQl9BCSAcjUSEjYkJOS6l+x1Lr/+cc5v5syZM7fNZHdn9/08Dw+Zc86ePXv27Hfe+f7eixc5DDvth8aW8X38T8WjB93yd+veQeLRCAvak5UPfPzbsH2DK+yt84DiQc3+2avWwx4ey4YuehbEPVNUGRok7kuF7B0ey3+/Zb4IudbIHcL7ug+GpEKGFTFt7x0mZyiKzq2Yr395t3ddKu6Tgf++qy3THDTlb8k2nhpy2hpuyxS8XVdg/H3Tx4v
NlIlEKmR57NsCD/2LO0jjzX+e39yejObHg/rH81mxHBzLhuaixx03Ih8cy5b128FLhfQ8977hTD475/BZycJkpxo9d3BFoM/XLgECA7o9wjx3f467/3wA6/+wByciLOqsbYqV0liW+taLtIipOWhKcQfXKhiUtobbMnlv14sYW+MOQ+lsSWFOPdTU7GrD9yCXhgu+VjTsORIRZiVdwe0sEnc3Gh8YSZPNGeJO8R9cPBphNJNjaDRTtoAJAkVMQ2P5TwdORFg8pyV/rlqxImAXVdNe2+HgNSQ8Oyjru6/WzvHfKzvYZHvvMF2dLcTqeKNRGse8tnjJJC1latO0v6VEzGFA2mCkH3LhDajGw6t7iv3xVCKKMYQW3NSCMe4ngYqLqbksPHc3rDgHOrpKdtuIfZZP3K1Y2vTFcrbMYEgBkZ9gnrv/04EV2XrEfVmg8CisIyQU5qj6O0P27B2iJeYwvz1RdNzhs5JF51YmHts7CHR+arNQ01+tiJwnIhtFZJOIXBuy/5Mi8qKIPCsiD4rIssZfajHJWIQDkgIMjPZXPb5Wgj1g8sOox5kxY0fGVYzcX30UBl6H1ZeE7rbRtN9zt1ZL75BbVRpqy3j93CuLuzusw85o9X86sNZUPbbM0kDLgOHAiD1LMlo6R7Vn72BokZJ9Y9Qc98nFvrnqgmpzUPWvVkQc4GbgfOA44DIROS5w2DNAtzFmNXA38OVGX2iQZMyh33jZIw20Znr2DhGNCIs63WjRitJ4WxAUCqIqpPA9exckZsEx54futtG0P6q2C6r7y0Tu1vYYHMuUrU6Fgi2TH+PnewNZOo7IvTURZV5bIu+fF9r91ha5h33CsW+MGrlPLvYZ1iKm5qCW39LbgE3GmM3GmDHgDuBi/wHGmIeNMTZF4nHcOcaHlEQ0Ql++p3vpoqoxJrT9bDV69g3RNbvF7b5IobJzvFWqVXPcx4bgxbVw3EUQC18s7GhxqzX9o+WSUQeRgi2TKJcKOVqaY+4n5kTI5Ez+E0CRLTO3fnG3X/fijn4ee2Uvv9myD4CWQLRnm0/ZyD2/kB0SnS/LR+6a4z6Z2Dd7jdybg1r+aruA13yvt3nbynEl8POwHSJylYhsEJENu3fvrv0qQ0jGHHqtuIekQz72yl5O/+rDbNlTOhmoEtv2D7NkdkFgbNQ7XlvGjoxbPLtMlsfGdTA2AKvfX/Yci2enWDy7pciuiESElphT3pbxpUIGLZHgcQB7DpSK+9EL2okIdTfqOnpBO89v7+ey/3icz937HADz2ovPEZyjumtglNFMLjQ6P3bhLETgmMPb67oOpbEcu9Cd4qWN25qDhk4ZFpEPAN3A6WH7jTG3ALcAdHd3jz/9BNdz358u3/bXNv/a0TvM8jpG2/UOjRVFjwXPfXyRux0ZVzZ97Nm7YFYXLDut7DmuOWslV5x2RMn2VDzK/gqe+8BIJrQjo5+Y475h7B4YBdwulJYlc1I88pkz6aoz/fDzFxzLRScsyr9uTTgc7/WIt9joz4p7oZ1v6e9qzarDeOTTZ2qr30nmrctms/4z+ntoFmoR9+3AEt/rxd62IkTkbODzwOnGmNHGXF55klGHfTlPdEJsGdu+1k4wqpW+4XTRwmU+n3ycnntPpX4og3tg0y/hHR/Ld3QMoyXuhOaqtyac/M8XVqF6YDRDJmcqFjHZ1MI9BzxxbynuN79kHIuYbYkopxw1t+Ix1paxue49IWmQFhFRQZki6O+heajFlnkSWCkiy0UkDlwKrPUfICInAd8BLjLG7Gr8ZZaSiEXYmZ0F4sD+V0v22wXCvjrEPZczJemANuVwOD3eyL2CuD9/D5hsRUumEql4tEIqpJPfF/S7/Vhxt5F7R0t9w0TGSzBy37p3CCcidJWzrxRFqYuq4m6MyQBXA/cDLwF3GWNeEJEbROQi77CvAG3Af4vIb0VkbZnTNYxk1GEg48D8VbDj2ZL9Vth6h2oX94HRDMaUqQQdR+TuHxkXyvP3wILjYUE
w+ag2UvEKnrsTye8LZqr4sbaMjdxnTZC4JwKpkD37hljUmdQiJUVpEDV57saYdcC6wLbrff8+u8HXVZWELV9fdCK8/Au3w5VvwXE8kbstmff7zoUF1foj97By+jwHdsFrv4Ezr6v7vPlrizvYAs+wBVW7r9KCasGWGaM9GcWp1CKhgQRTIbfuHeQI7fioKA2jacMkd5JPFhaeAIO7YWBH0f6CuI/VfM58rrcverWWxniyZfwj40rY+HPAwDHvrvu8Fn9Jf9Bz9+ciV6tQBTdy76xzvuvBkBd3X+SuRUqK0jiaVtwTUccVhoUnuBt2/K5o/3gid7sI6xc5JyIkY5FxiXvFVr+//xl0LoUFb6r7vBa/aAcLS+JF4l49ct89MDphfjv4KlQzWfqG0/QOpbVISVEaSNOKezLmNZ467E2AlIj7eDz3sMgd3Ah5cHQ8tkxhZFwRowdg86/gmAuKrKR68XdaDPPcLZU893i04Ln72wsfahK+BdXCJxy1ZRSlUTSxuLviMBZJwbyVjYncreceEPdUwsn3SKmH4Mi4PK88BNlRWDV+SwYCtkzFyL26LZPOmsmJ3NO5fAdJjdwVpXE0r7jnxcHz3X3ins0Z+kfGH7kHM0ZSsfENyQ6OjMuzcR0kO2HpO+o+px9/7ntYnrulki0T9eXXd0yg5x51IkQjwmgmW2jRoJ67ojSM5hV3+7E+44l7/3Y44LY0GBhJY4zrl/fXky0znCYZi5T0zkiNY0h22Mg4ALIZ+MN9cPR54BxcgXDFyN1vy1RsP1CwhSYycgf3dziSzrF17xDz2hIVi60URamPphX3osZTdlF1pxu92wh88ewWBkYzZLK19XvvG0qHClxrvP4h2dv2D5WMjANg62MwvP+gLRmo4rn7XleexFQ4LmhHHWrskOyefYNqyShKg2lacbeNp0YzWTh8tbvRs2asFXNsh2ul9I/UZqn4Z4j6aYk7dS+ols2U2bgOnAQcdVZd5wsjVYMtE3OkYldHv7hPZuSug68VpbE0r7jnsy1y0NIJs5fnxb1vOM17Io/yrdcv4U2yJV+pWY1g6wFLa7x+Wybfx90vWsa4KZBHng6JtrrOF4b10uNOpGTAhU2NrOS3QyByn0DPHdxPX/0jaXb0j2jPEkVpME0r7gVbxhNd36Lqgf59XBf7ERFyXO48WHPGTO9QOnRRMZWo35YJGxnHyw9Abw8c+yd1nasc1ksPi8zjgX705fBH/BPVesCSjDq8susAxmimjKI0muYV90A/cBae4DYQG97Psue/yVz66Z1zIhc5v2agv7ZJTf0VI/f6bBmbBpmPqDOjcN/fwdwVsPrSus5VDuulh4q7t62S3w4QdQoR/0TmuYNbq/CqbfWrOe6K0lCaVtyTgZax+UXV5+9hVc8PuTN7Br3vvJ42GaH9lf+t6Zy9w+nQRcUWb0E1l6u9BX3JyLjHvgn7NsP5X4ZoY0TUFieFzTi14l4tA6XIc59oWyZa6I2jkbuiNJYmFveQyB3gvutIR1r4plxG24pTeTnXRdfmu6uez51alC2O3Ht+Dd+/gPnGHRU3kqnNmikZGde3HdZ/FVZdCCsOfiHVUostU6mAyX8cTHy2jH2Dbo07zG2d2E8NijLdaXpxt42naJ0HsxZDdpT753+YXGoeHak4d2TP4LC+38Gulyqez/ryRYuKj3wZev6PszZ+ASFXc9vfkpFxv/h7MDk490v1/ZBVqMWWqbqg6uW5RyNS9Y2g0djf4dK5rSULwoqiHBxNK+4JX+OpPMvfCQuO576WC+loiRFzItzvnEFWovD0Dyqer6Q6dd9m2PwwHL6aRXsf4wrnvpp996KRcS8/AC/cC6d9AmYvq/OnrEyrL1smSEHcKwu2tWU6U7EJF1gr7poGqSiNp2nFvcSWAbj4ZvjIg+wdKfRJMal5PNf+Tvjdj91FzTL05TtCevbAU7e5U57+4k7eWLiGz0bvILvj+Zquzea4H9v7CNxxOcw/Fk69pt4fsSr
JWASR8MjdvvlVqk4FN2KHic+UgcI1qt+uKI2nJnEXkfNEZKOIbBKRa0P2/7GIPC0iGRF5b+MvsxTbWyZvywBEHIgmirJeZrXEeCh1nlsV+lL5hVVb+NTREoPMGPz2h26LgFmL2HzKjfTTysJfXg0DOyFbObVy694hLos+zPyffwQOPx4+vA5ijR8fJyKkYk4Zz91986uWLSMixJ3IhPvt4LdlVNwVpdFUbeYhIg5wM3AOsA14UkTWGmNe9B22FfhL4NOH4iLDsI2nwhY5e4fSrF7silVnS4zHs2+GOUfBIzfBsReFZqvkPfeWGPz+p+4AkO4rAIjNOoxPpz/Kbb03wdeOcb8g3g7JDkjOgkS7+1+8FWKtnLl5J2+NPgRHnQOX3OZuP0SkEtGSXu7gz5ap7qNHHZnw6lQo1Cos0zRIRWk4tXRqehuwyRizGUBE7gAuBvLibox51dtXWxOXBuH2Jin9lv5K046WGK/sHoULb4QfXQK/+Q6842MlX1MUuT/1fXeQxlFrAHdR8pHcCTxx+n9xcutOGN7P7t072bFzB/H0EImRQRLZ14nlholnh1kxdoCHW8/nzMtuB+fQimZr3KniuVf/FcecSNFowYnCtpBQW0ZRGk8t4t4FvOZ7vQ04eTzfTESuAq4CWLp06XhOUYTbm6Q4ch/NZBlOZ/Ni1ZmKuVH50ee6Nsuv/hWOfx+0H170dfkF1aGtsGU9rPkHiBRHv9s7ToK3LAbguh9s4Jfb38gLVJBPnLySMw+xsAOcumIei2eXimN7MspblnZywuLOGs4xl5OXzzkUl1eRE5Z00L1sNgs7khP+vRVlujOhPVaNMbcAtwB0d3fXXhFUhqQdku0jmPXS0RKj17YfOO9GuPlkeOB6+LNbSr6uPRnFeeY/IRKFkz6Y32d960FfC4KevYOctWoBt36o+2B/jIPii+85PnR7zIlw71+fWtM5vnX5Wxt5STWzZtUC1qxaMCnfW1GmO7UsqG4HHEhpAgAACT5JREFUlvheL/a2TToJOyTbR19gmlJHKsZYJuceN+dIN2vl2TvdAiX/1w2nWZnsgye/5/ry7QXRsRknQ15nSGO8IiW1ExRFmaLUIu5PAitFZLmIxIFLgbWH9rJqIxF1Sjz34BxU+//8RKbTPgkdS+Bnn4KxofzX9Q6NcU3udjBZOPsfi87Z4mV12OZhuwZGGUnnVNwVRZmyVBV3Y0wGuBq4H3gJuMsY84KI3CAiFwGIyB+JyDbgfcB3ROSFQ3nRlmQs4vZz95Gfg5qy2TKu957vDBlPwYXfcCtWf/JRyLlvDl19T3P62Hqv2OiIonNGIkJLrNA8TMfCKYoy1anJczfGrAPWBbZd7/v3k7h2zYSSjDrFee6Q99dLI3dfT/eV58C7/tltC/CrL8Hp13JF/7fYEz2ceWWKjVoTTt5ztxWoy4JTlhRFUaYITT20MhmLsHeweBBHIV+9kC3j357nlKth90ZY/xXY+RxH5nq4femX+GCZYqNUPMqwJ+5b9w0REejqbHxhkqIoSiNoanF3PffgguoYIm4qIPgi96C4i8AFX4d9W+AP97E+t5rXF6wp+71SvlF7PXuHWNTZUnF8naIoymTS1OqUjJUWMfUNp5mVjBHxeqbYHuX9YdOYonF4/+2ku6/iuvSVFQt5Ur5Rez2aKaMoyhSnycXdKV1QDUxTaotHiYgvWyZIag67TruBbWZ+xRL81kQ0v6C6de+gTg5SFGVK0/TiHha5+3uyRyJu35RKc1T7hkJ6uQewkXv/SJr9Q2mN3BVFmdI0tbi7vWVKUyGDEXhRlWoIvV6730ptb1PxKINjGbZ6aZDag1xRlKlMc4u7137AmEIng7Ah1x2peMXIvT+QYRNGKu4wPJYt5Lhr5K4oyhSmqcW9ZEg2pZ47uJF731BxyqSffEfICrZMayLK4GiWnn2a464oytSnucU9WjxH1RhT4rmD22emouc+XNyPJoyWmMNwOsuW3YPMbY3TlmjqLFJ
FUaY5TS3udtiDHdhxYDRDNmfG4bmnqw6Itm1/N74xoIupiqJMeZpa3G3kbhdVg9WpFtvTPZcL7zJso/1KA6Lt0IuNOwfUklEUZcrT3OLudWu0nrv1zoNZLx0tMYyBAa/CNEjfULrqgGgb1Y9mctowTFGUKU+Ti7tny3iRez7rJVUq7lDIZw/SN5yuOiDaP65ObRlFUaY6TS3uibwt40XugY6Qlry4l/Hde4fHqg6I9g+aVnFXFGWq09TiHozc+8pE7rZnjC1WCuJ67pUHRPsXW7X1gKIoU52axF1EzhORjSKySUSuDdmfEJE7vf1PiMgRjb7QMKznbsU9P6gjZEEVKkTuIVWtQawtk4o7zGur/EagKIoy2VQVdxFxgJuB84HjgMtE5LjAYVcC+40xK4BvADc1+kLDCBYx9Q6PEXci+e2WklF7PrI5w8BIprot44n70jmpilk1iqIoU4FaIve3AZuMMZuNMWPAHcDFgWMuBm7z/n03cJZMgAImAqmQ/cNpOkJSGit57uUWYYO0eLaM+u2KojQDtZRZdgGv+V5vA04ud4wxJiMifcBcYE8jLrIctojpK/dv5Jb1m9nZN8KCjmTJccmYQyIa4dZHN/OTZ7YX7ct4ue+1LqhqjruiKM3AhNbQi8hVwFUAS5cuPejzzW9LcOVpy9nRNwzAygVtnHH0YaHHfvzso3lue2/ovhOXdHLqinkVv1cqHuUz5x7Du45bcHAXrSiKMgGIv6Ni6AEipwBfMMac673+HIAx5kbfMfd7xzwmIlFgJzDfVDh5d3e32bBhQwN+BEVRlJmDiDxljOmudlwtnvuTwEoRWS4iceBSYG3gmLXAh7x/vxd4qJKwK4qiKIeWqraM56FfDdwPOMD3jDEviMgNwAZjzFrgu8DtIrIJ2If7BqAoiqJMEjV57saYdcC6wLbrff8eAd7X2EtTFEVRxktTV6gqiqIo4ai4K4qiTENU3BVFUaYhKu6KoijTEBV3RVGUaUjVIqZD9o1FdgM94/zyeRzi1gZNht6PYvR+FNB7Ucx0uB/LjDHzqx00aeJ+MIjIhloqtGYKej+K0ftRQO9FMTPpfqgtoyiKMg1RcVcURZmGNKu43zLZFzDF0PtRjN6PAnovipkx96MpPXdFURSlMs0auSuKoigVaDpxrzasezojIktE5GEReVFEXhCRa7ztc0TkARF52fv/7Mm+1olERBwReUZEfuq9Xu4Nat/kDW6fMRPNRaRTRO4Wkd+LyEsicspMfT5E5BPe38nzIvJjEUnOpGejqcS9xmHd05kM8CljzHHA24G/8X7+a4EHjTErgQe91zOJa4CXfK9vAr7hDWzfjzvAfabw78B9xphVwAm492XGPR8i0gX8LdBtjHkzbrvyS5lBz0ZTiTu1Deuethhjdhhjnvb+PYD7h9tF8YDy24A/nZwrnHhEZDFwAXCr91qANbiD2mEG3Q8R6QD+GHe+AsaYMWNMLzP3+YgCLd50uBSwgxn0bDSbuIcN6+6apGuZVETkCOAk4AlggTFmh7drJzCTBr3+G/BZIOe9ngv0GmMy3uuZ9IwsB3YD3/dsqltFpJUZ+HwYY7YDXwW24op6H/AUM+jZaDZxVwARaQPuAT5ujOn37/PGG86IFCgRuRDYZYx5arKvZYoQBd4CfNsYcxIwSMCCmSnPh7eucDHuG94ioBU4b1IvaoJpNnHfDizxvV7sbZsxiEgMV9h/aIy519v8hogs9PYvBHZN1vVNMKcCF4nIq7gW3Rpcz7nT+ygOM+sZ2QZsM8Y84b2+G1fsZ+LzcTawxRiz2xiTBu7FfV5mzLPRbOJey7DuaYvnJ38XeMkY83XfLv+A8g8B/zPR1zYZGGM+Z4xZbIw5AvdZeMgYcznwMO6gdphZ92Mn8JqIHONtOgt4kZn5fGwF3i4iKe/vxt6LGfNsNF0Rk4i8G9dntcO6vzjJlzRhiMhpwKPAcxQ85utwffe7gKW4nTYvMcbsm5SLnCRE5Azg08a
YC0XkSNxIfg7wDPABY8zoZF7fRCEiJ+IuLseBzcCHcYO4Gfd8iMg/Ae/HzTJ7BvgrXI99RjwbTSfuiqIoSnWazZZRFEVRakDFXVEUZRqi4q4oijINUXFXFEWZhqi4K4qiTENU3BVFUaYhKu6KoijTEBV3RVGUacj/A2AtXxKwv0L0AAAAAElFTkSuQmCC\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Q(s,a) std: 0.067;0.027;0.093;0.069;0.014;0.148;0.173;0.026;0.043;0.101\n", + "correct 5\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.5/dist-packages/ipykernel_launcher.py:32: FutureWarning: pd.ewm_mean is deprecated for ndarrays and will be removed in a future version\n" + ] + } + ], + "source": [ + "ucb_agent_rewards = train_contextual_agent(BayesUCBBNNAgent(state_size=state_size, n_actions=n_actions),\n", + " batch_size=10, n_iters=N_ITERS)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plt.figure(figsize=(17, 8))\n", + "\n", + "plt.plot(greedy_agent_rewards)\n", + "plt.plot(thompson_agent_rewards)\n", + "plt.plot(ucb_agent_rewards)\n", + "\n", + "plt.legend([\n", + " \"Greedy BNN\",\n", + " \"Thompson sampling BNN\",\n", + " \"UCB BNN\"\n", + "])\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Part 3. Exploration in MDP\n", + "\n", + "The following problem, called \"river swim\", illustrates importance of exploration in context of mdp's." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "Picture from https://arxiv.org/abs/1306.0940" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Rewards and transition probabilities are unknown to the agent. The optimal policy is to swim against the current, while the easiest way to gain reward is to go left." + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "class RiverSwimEnv:\n", + " LEFT_REWARD = 5.0 / 1000\n", + " RIGHT_REWARD = 1.0\n", + "\n", + " def __init__(self, intermediate_states_count=4, max_steps=16):\n", + " self._max_steps = max_steps\n", + " self._current_state = None\n", + " self._steps = None\n", + " self._interm_states = intermediate_states_count\n", + " self.reset()\n", + "\n", + " def reset(self):\n", + " self._steps = 0\n", + " self._current_state = 1\n", + " return self._current_state, 0.0, False\n", + "\n", + " @property\n", + " def n_actions(self):\n", + " return 2\n", + "\n", + " @property\n", + " def n_states(self):\n", + " return 2 + self._interm_states\n", + "\n", + " def _get_transition_probs(self, action):\n", + " if action == 0:\n", + " if self._current_state == 0:\n", + " return [0, 1.0, 0]\n", + " else:\n", + " return [1.0, 0, 0]\n", + "\n", + " elif action == 1:\n", + " if self._current_state == 0:\n", + " return [0, .4, .6]\n", + " if self._current_state == self.n_states - 1:\n", + " return [.4, .6, 0]\n", + " else:\n", + " return [.05, .6, .35]\n", + " else:\n", + " raise RuntimeError(\n", + " \"Unknown action {}. 
Max action is {}\".format(action, self.n_actions))\n", + "\n", + " def step(self, action):\n", + " \"\"\"\n", + " :param action:\n", + " :type action: int\n", + " :return: observation, reward, is_done\n", + " :rtype: (int, float, bool)\n", + " \"\"\"\n", + " reward = 0.0\n", + "\n", + " if self._steps >= self._max_steps:\n", + " return self._current_state, reward, True\n", + "\n", + " transition = np.random.choice(\n", + " range(3), p=self._get_transition_probs(action))\n", + " if transition == 0:\n", + " self._current_state -= 1\n", + " elif transition == 1:\n", + " pass\n", + " else:\n", + " self._current_state += 1\n", + "\n", + " if self._current_state == 0:\n", + " reward = self.LEFT_REWARD\n", + " elif self._current_state == self.n_states - 1:\n", + " reward = self.RIGHT_REWARD\n", + "\n", + " self._steps += 1\n", + " return self._current_state, reward, False" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's implement q-learning agent with epsilon-greedy exploration strategy and see how it performs." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "class QLearningAgent:\n", + " def __init__(self, n_states, n_actions, lr=0.2, gamma=0.95, epsilon=0.1):\n", + " self._gamma = gamma\n", + " self._epsilon = epsilon\n", + " self._q_matrix = np.zeros((n_states, n_actions))\n", + " self._lr = lr\n", + "\n", + " def get_action(self, state):\n", + " if np.random.random() < self._epsilon:\n", + " return np.random.randint(0, self._q_matrix.shape[1])\n", + " else:\n", + " return np.argmax(self._q_matrix[state])\n", + "\n", + " def get_q_matrix(self):\n", + " \"\"\" Used for policy visualization\n", + " \"\"\"\n", + "\n", + " return self._q_matrix\n", + "\n", + " def start_episode(self):\n", + " \"\"\" Used in PSRL agent\n", + " \"\"\"\n", + " pass\n", + "\n", + " def update(self, state, action, reward, next_state):\n", + " # YOUR CODE HERE\n", + " # Finish implementation of q-learnig agent" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def train_mdp_agent(agent, env, n_episodes):\n", + " episode_rewards = []\n", + "\n", + " for ep in range(n_episodes):\n", + " state, ep_reward, is_done = env.reset()\n", + " agent.start_episode()\n", + " while not is_done:\n", + " action = agent.get_action(state)\n", + "\n", + " next_state, reward, is_done = env.step(action)\n", + " agent.update(state, action, reward, next_state)\n", + "\n", + " state = next_state\n", + " ep_reward += reward\n", + "\n", + " episode_rewards.append(ep_reward)\n", + " return episode_rewards" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.5/dist-packages/ipykernel_launcher.py:6: FutureWarning: pd.ewm_mean is deprecated for ndarrays and will be removed in a future version\n", + " \n" 
+ ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "env = RiverSwimEnv()\n", + "agent = QLearningAgent(env.n_states, env.n_actions)\n", + "rews = train_mdp_agent(agent, env, 1000)\n", + "plt.figure(figsize=(15, 8))\n", + "\n", + "plt.plot(moving_average(np.array(rews), alpha=.1))\n", + "plt.xlabel(\"Episode count\")\n", + "plt.ylabel(\"Reward\")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's visualize our policy:" + ] + }, + { + "cell_type": "code", + "execution_count": 103, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def plot_policy(agent):\n", + " fig = plt.figure(figsize=(15, 8))\n", + " ax = fig.add_subplot(111)\n", + " ax.matshow(agent.get_q_matrix().T)\n", + " ax.set_yticklabels(['', 'left', 'right'])\n", + " plt.xlabel(\"State\")\n", + " plt.ylabel(\"Action\")\n", + " plt.title(\"Values of state-action pairs\")\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plot_policy(agent)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As you can see, the agent uses a suboptimal policy of going left and does not explore the right state." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Bonus 3.1 Posterior sampling RL (3 points)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we will implement Thompson Sampling for MDP!\n", + "\n", + "General algorithm:\n", + "\n", + ">**for** episode $k = 1,2,...$ **do**\n", + ">> sample $M_k \\sim f(\\bullet\\ |\\ H_k)$\n", + "\n", + ">> compute policy $\\mu_k$ for $M_k$\n", + "\n", + ">> **for** time $t = 1, 2,...$ **do**\n", + "\n", + ">>> take action $a_t$ from $\\mu_k$ \n", + "\n", + ">>> observe $r_t$ and $s_{t+1}$\n", + ">>> update $H_k$\n", + "\n", + ">> **end for**\n", + "\n", + ">**end for**\n", + "\n", + "In our case we will model $M_k$ with two matrices: transition and reward. The transition matrix is sampled from a Dirichlet distribution. The reward matrix is sampled from a normal-gamma distribution.\n", + "\n", + "Distributions are updated with Bayes' rule - see the continuous distributions section at https://en.wikipedia.org/wiki/Conjugate_prior\n", + "\n", + "Article on PSRL - https://arxiv.org/abs/1306.0940" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def sample_normal_gamma(mu, lmbd, alpha, beta):\n", + " \"\"\" https://en.wikipedia.org/wiki/Normal-gamma_distribution\n", + " \"\"\"\n", + " tau = np.random.gamma(alpha, beta)\n", + " mu = np.random.normal(mu, 1.0 / np.sqrt(lmbd * tau))\n", + " return mu, tau\n", + "\n", + "\n", + "class PsrlAgent:\n", + " def __init__(self, n_states, n_actions, horizon=10):\n", + " self._n_states = n_states\n", + " self._n_actions = n_actions\n", + " self._horizon = horizon\n", + "\n", + " # params for transition sampling - Dirichlet distribution\n", + " self._transition_counts = np.zeros(\n", + " (n_states, n_states, n_actions)) + 1.0\n", + "\n", + " # params for reward sampling - Normal-gamma distribution\n", + " self._mu_matrix = np.zeros((n_states, n_actions)) + 1.0\n", + " 
self._state_action_counts = np.zeros(\n", + " (n_states, n_actions)) + 1.0 # lambda\n", + "\n", + " self._alpha_matrix = np.zeros((n_states, n_actions)) + 1.0\n", + " self._beta_matrix = np.zeros((n_states, n_actions)) + 1.0\n", + "\n", + " def _value_iteration(self, transitions, rewards):\n", + " # YOU CODE HERE\n", + " state_values = < Find action values with value iteration >\n", + " return state_values\n", + "\n", + " def start_episode(self):\n", + " # sample new mdp\n", + " self._sampled_transitions = np.apply_along_axis(\n", + " np.random.dirichlet, 1, self._transition_counts)\n", + "\n", + " sampled_reward_mus, sampled_reward_stds = sample_normal_gamma(\n", + " self._mu_matrix,\n", + " self._state_action_counts,\n", + " self._alpha_matrix,\n", + " self._beta_matrix\n", + " )\n", + "\n", + " self._sampled_rewards = sampled_reward_mus\n", + " self._current_value_function = self._value_iteration(\n", + " self._sampled_transitions, self._sampled_rewards)\n", + "\n", + " def get_action(self, state):\n", + " return np.argmax(self._sampled_rewards[state] +\n", + " self._current_value_function.dot(self._sampled_transitions[state]))\n", + "\n", + " def update(self, state, action, reward, next_state):\n", + " # YOUR CODE HERE\n", + " # update rules - https://en.wikipedia.org/wiki/Conjugate_prior\n", + "\n", + " def get_q_matrix(self):\n", + " return self._sampled_rewards + self._current_value_function.dot(self._sampled_transitions)" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.5/dist-packages/ipykernel_launcher.py:7: FutureWarning: pd.ewm_mean is deprecated for ndarrays and will be removed in a future version\n", + " import sys\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from pandas import 
DataFrame\n", + "moving_average = lambda x, **kw: DataFrame(\n", + " {'x': np.asarray(x)}).x.ewm(**kw).mean().values\n", + "\n", + "horizon = 20\n", + "env = RiverSwimEnv(max_steps=horizon)\n", + "agent = PsrlAgent(env.n_states, env.n_actions, horizon=horizon)\n", + "rews = train_mdp_agent(agent, env, 1000)\n", + "\n", + "plt.figure(figsize=(15, 8))\n", + "plt.plot(moving_average(np.array(rews), alpha=0.1))\n", + "\n", + "plt.xlabel(\"Episode count\")\n", + "plt.ylabel(\"Reward\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plot_policy(agent)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Bonus 3.2 Bootstrapped DQN (10 points)\n", + "\n", + "Implement the Bootstrapped DQN algorithm and compare its performance with ordinary DQN on the BeamRider Atari game. 
Links:\n", + "- https://arxiv.org/abs/1602.04621" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/week6_policy_based/README.md b/week06_policy_based/README.md similarity index 53% rename from week6_policy_based/README.md rename to week06_policy_based/README.md index af35de0df..b48b49c9e 100644 --- a/week6_policy_based/README.md +++ b/week06_policy_based/README.md @@ -1,37 +1,24 @@ ## Materials -* [Slides](https://docviewer.yandex.ru/?url=ya-disk-public%3A%2F%2FG3IXcG62RwNUGSSos%2BuGhtgXNfsBjP9RxUtUfgCffIk%3D%3A%2Flecture6.pdf&name=lecture6.pdf&c=58c876c4863a) +* [Slides](https://yadi.sk/i/keSzKSgA2oYuwQ) * Video lecture by D. Silver - [video](https://www.youtube.com/watch?v=KHZVXao4qXs) * Our [lecture](https://yadi.sk/i/yPIPkO_f3TPsNK), [seminar(pytorch)](https://yadi.sk/i/flW8ezGk3TPsQ5), [seminar(theano)](https://yadi.sk/i/8f9NX_E73GKBkT) * Alternative lecture by J. Schulman part 1 - [video](https://www.youtube.com/watch?v=BB-BhTn6DCM) * Alternative lecture by J. 
Schulman part 2 - [video](https://www.youtube.com/watch?v=Wnl-Qh2UHGg) +* Andrej Karpathy's [post](http://karpathy.github.io/2016/05/31/rl/) on policy gradients ## More materials * Actually proving the policy gradient for discounted rewards - [article](https://papers.nips.cc/paper/1713-policy-gradient-methods-for-reinforcement-learning-with-function-approximation.pdf) * On variance of policy gradient and optimal baselines: [article](https://papers.nips.cc/paper/4264-analysis-and-improvement-of-policy-gradient-estimation.pdf), another [article](https://arxiv.org/pdf/1301.2315.pdf) -* Generalized Advantage Estimation - a way you can speed up training for homework_*.ipynb - [article](https://arxiv.org/abs/1506.02438) +* Learn Advantage Actor Critic with a [comic](https://hackernoon.com/intuitive-rl-intro-to-advantage-actor-critic-a2c-4ff545978752) * Generalizing log-derivative trick - [url](http://blog.shakirm.com/2015/11/machine-learning-trick-of-the-day-5-log-derivative-trick/) * Combining policy gradient and q-learning - [arxiv](https://arxiv.org/abs/1611.01626) -* Bayesian perspective on why reparameterization & logderivative tricks matter (Vetrov's take) - [pdf](https://www.sdsj.ru/slides/Vetrov.pdf) +* Variational perspective on reinforcement learning (from DeepBayes) - [pdf](http://incompleteideas.net/book/the-book-2nd.html) * Adversarial review of policy gradient - [blog](http://www.argmin.net/2018/02/20/reinforce/) -## Homework +Run seminar notebook in colab: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/yandexdataschool/Practical_RL/blob/spring19/week06_policy_based/reinforce_pytorch.ipynb) -As usual, pick reinfoce_.ipynb for starters and then proceed with homework_.ipynb. 
- -To run seminar notebook in colab -* run it [here](https://colab.research.google.com/github/yandexdataschool/Practical_DL/blob/fall18/week10_rl/reinforce_pytorch.ipynb#scrollTo=y9nfDwJY3sGI) -* paste this to install libraries -``` -!pip install gym -!apt-get install -y xvfb -!wget https://raw.githubusercontent.com/yandexdataschool/Practical_DL/fall18/xvfb -!apt-get install -y python-opengl ffmpeg -!pip install pyglet==1.2.4 - -!bash ./xvfb start -%env DISPLAY=:1 -``` +Run optional homework notebook in colab: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/yandexdataschool/Practical_RL/blob/spring19/week06_policy_based/a2c-optional.ipynb) diff --git a/week06_policy_based/a2c-optional.ipynb b/week06_policy_based/a2c-optional.ipynb new file mode 100644 index 000000000..5139ae867 --- /dev/null +++ b/week06_policy_based/a2c-optional.ipynb @@ -0,0 +1,328 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# # in google colab uncomment this\n", + "\n", + "# import os\n", + "\n", + "# os.system('apt-get install -y xvfb')\n", + "# os.system('wget https://raw.githubusercontent.com/yandexdataschool/Practical_DL/fall18/xvfb -O ../xvfb')\n", + "# os.system('apt-get install -y python-opengl ffmpeg')\n", + "# os.system('pip install pyglet==1.2.4')\n", + "\n", + "# os.system('python -m pip install -U pygame --user')\n", + "\n", + "# print('setup complete')\n", + "\n", + "# XVFB will be launched if you run on a server\n", + "import os\n", + "if type(os.environ.get(\"DISPLAY\")) is not str or len(os.environ.get(\"DISPLAY\")) == 0:\n", + " !bash ../xvfb start\n", + " %env DISPLAY = : 1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Implementing Advantage-Actor Critic (A2C)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this notebook you will implement 
Advantage Actor Critic algorithm that trains on a batch of Atari 2600 environments running in parallel. \n", + "\n", + "Firstly, we will use environment wrappers implemented in file `atari_wrappers.py`. These wrappers preprocess observations (resize, grayscale, take max between frames, skip frames and stack them together) and rewards. Some of the wrappers help to reset the environment and pass `done` flag equal to `True` when the agent dies.\n", + "File `env_batch.py` includes implementation of `ParallelEnvBatch` class that allows running multiple environments in parallel. To create an environment we can use `nature_dqn_env` function. Note that if you are using \n", + "PyTorch and not using `tensorboardX` you will need to implement a wrapper that will log **raw** total rewards that the *unwrapped* environment returns and redefine the implementation of `nature_dqn_env` function here. \n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "from atari_wrappers import nature_dqn_env\n", + "\n", + "\n", + "env = nature_dqn_env(\"SpaceInvadersNoFrameskip-v4\", nenvs=8)\n", + "obs = env.reset()\n", + "assert obs.shape == (8, 84, 84, 4)\n", + "assert obs.dtype == np.uint8" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, we will need to implement a model that predicts logits and values. It is suggested that you use the same model as in [Nature DQN paper](https://web.stanford.edu/class/psych209/Readings/MnihEtAlHassibis15NatureControlDeepRL.pdf) with a modification that instead of having a single output layer, it will have two output layers taking as input the output of the last hidden layer. **Note** that this model is different from the model you used in homework where you implemented DQN. You can use your favorite deep learning framework here.
We suggest that you use orthogonal initialization with parameter $\\sqrt{2}$ for kernels and initialize biases with zeros. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# import tensorflow as torch\n", + "# import torch as tf\n", + "\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You will also need to define and use a policy that wraps the model. While the model computes logits for all actions, the policy will sample actions and also compute their log probabilities. `policy.act` should return a dictionary of all the arrays that are needed to interact with an environment and train the model.\n", + " Note that actions must be an `np.ndarray` while the other\n", + "tensors need to have the type determined by your deep learning framework. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "class Policy:\n", + " def __init__(self, model):\n", + " self.model = model\n", + " \n", + " def act(self, inputs):\n", + " \n", + " # Should return a dict containing keys ['actions', 'logits', 'log_probs', 'values']." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next we will pass the environment and policy to a runner that collects partial trajectories from the environment. \n", + "The class that does this is already implemented for you."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from runners import EnvRunner" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This runner interacts with the environment for a given number of steps and returns a dictionary containing\n", + "keys \n", + "\n", + "* 'observations' \n", + "* 'rewards' \n", + "* 'resets'\n", + "* 'actions'\n", + "* all other keys that you defined in `Policy`\n", + "\n", + "under each of these keys there is a python `list` of interactions with the environment of specified length $T$ — the size of partial trajectory. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To train the part of the model that predicts state values you will need to compute the value targets. \n", + "Any callable could be passed to `EnvRunner` to be applied to each partial trajectory after it is collected. \n", + "Thus, we can implement and use `ComputeValueTargets` callable. \n", + "The formula for the value targets is simple:\n", + "\n", + "$$\n", + "\\hat v(s_t) = \\sum_{t'=0}^{T - 1}\\gamma^{t'}r_{t+t'} + \\gamma^T \\hat{v}(s_{t+T}),\n", + "$$\n", + "\n", + "In implementation, however, do not forget to use \n", + "`trajectory['resets']` flags to check if you need to add the value targets at the next step when \n", + "computing value targets for the current step. You can access `trajectory['state']['latest_observation']`\n", + "to get last observations in partial trajectory — $s_{t+T}$." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "class ComputeValueTargets:\n", + " def __init__(self, policy, gamma=0.99):\n", + " self.policy = policy\n", + " \n", + " def __call__(self, trajectory):\n", + " # This method should modify trajectory inplace by adding \n", + " # an item with key 'value_targets' to it. 
\n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After computing value targets we will transform lists of interactions into tensors\n", + "with the first dimension `batch_size` which is equal to `T * nenvs`, i.e. you essentially need\n", + "to flatten the first two dimensions. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "class MergeTimeBatch:\n", + " \"\"\" Merges first two axes typically representing time and env batch. \"\"\"\n", + " def __call__(self, trajectory):\n", + " # Modify trajectory inplace. \n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "model = \n", + "policy = Policy(model)\n", + "runner = EnvRunner(env, policy, nsteps=5,\n", + " transforms=[ComputeValueTargets(),\n", + " MergeTimeBatch()])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now is the time to implement the advantage actor critic algorithm itself. You can look into your lecture,\n", + "[Mnih et al. 2016](https://arxiv.org/abs/1602.01783) paper, and [lecture](https://www.youtube.com/watch?v=Tol_jw5hWnI&list=PLkFD6_40KJIxJMR-j5A1mkxK26gh_qg37&index=20) by Sergey Levine." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "class A2C:\n", + " def __init__(self,\n", + " policy,\n", + " optimizer,\n", + " value_loss_coef=0.25,\n", + " entropy_coef=0.01,\n", + " max_grad_norm=0.5):\n", + " self.policy = policy\n", + " self.optimizer = optimizer\n", + " self.value_loss_coef = value_loss_coef\n", + " self.entropy_coef = entropy_coef\n", + " self.max_grad_norm = max_grad_norm\n", + " \n", + " def policy_loss(self, trajectory):\n", + " # You will need to compute advantages here. 
\n", + " \n", + " \n", + " def value_loss(self, trajectory):\n", + " \n", + " \n", + " def loss(self, trajectory):\n", + " \n", + " \n", + " def step(self, trajectory):\n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now you can train your model. With reasonable hyperparameters training on a single GTX1080 for 10 million steps across all batched environments (which translates to about 5 hours of wall clock time)\n", + "it should be possible to achieve *average raw reward over last 100 episodes* (the average is taken over 100 last \n", + "episodes in each environment in the batch) of about 600. You should plot this quantity with respect to \n", + "`runner.step_var` — the number of interactions with all environments. It is highly \n", + "encouraged to also provide plots of the following quantities (these are useful for debugging as well):\n", + "\n", + "* [Coefficient of Determination](https://en.wikipedia.org/wiki/Coefficient_of_determination) between \n", + "value targets and value predictions\n", + "* Entropy of the policy $\\pi$\n", + "* Value loss\n", + "* Policy loss\n", + "* Value targets\n", + "* Value predictions\n", + "* Gradient norm\n", + "* Advantages\n", + "* A2C loss\n", + "\n", + "For optimization we suggest you use RMSProp with learning rate starting from 7e-4 and linearly decayed to 0, smoothing constant (alpha in PyTorch and decay in TensorFlow) equal to 0.99 and epsilon equal to 1e-5." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "a2c = \n", + "\n", + "" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "rl", + "language": "python", + "name": "rl" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/week06_policy_based/atari_wrappers.py b/week06_policy_based/atari_wrappers.py new file mode 100644 index 000000000..778ac99c9 --- /dev/null +++ b/week06_policy_based/atari_wrappers.py @@ -0,0 +1,307 @@ +""" Environment wrappers. """ +from collections import deque + +import cv2 +import gym +import gym.spaces as spaces +from gym.envs import atari +import numpy as np +import tensorflow as tf + +from env_batch import ParallelEnvBatch +cv2.ocl.setUseOpenCL(False) + + +class EpisodicLife(gym.Wrapper): + """ Sets done flag to true when agent dies. """ + def __init__(self, env): + super(EpisodicLife, self).__init__(env) + self.lives = 0 + self.real_done = True + + def step(self, action): + obs, rew, done, info = self.env.step(action) + self.real_done = done + info["real_done"] = done + lives = self.env.unwrapped.ale.lives() + if 0 < lives < self.lives: + done = True + self.lives = lives + return obs, rew, done, info + + def reset(self, **kwargs): + if self.real_done: + obs = self.env.reset(**kwargs) + else: + obs, _, _, _ = self.env.step(0) + self.lives = self.env.unwrapped.ale.lives() + return obs + + +class FireReset(gym.Wrapper): + """ Makes fire action when reseting environment. + + Some environments are fixed until the agent makes the fire action, + this wrapper makes this action so that the epsiode starts automatically. 
+ """ + def __init__(self, env): + super(FireReset, self).__init__(env) + action_meanings = env.unwrapped.get_action_meanings() + if len(action_meanings) < 3: + raise ValueError( + "env.unwrapped.get_action_meanings() must be of length >= 3" + f"but is of length {len(action_meanings)}") + if env.unwrapped.get_action_meanings()[1] != "FIRE": + raise ValueError( + "env.unwrapped.get_action_meanings() must have 'FIRE' " + f"under index 1, but is {action_meanings}") + + def step(self, action): + return self.env.step(action) + + def reset(self, **kwargs): + self.env.reset(**kwargs) + obs, _, done, _ = self.env.step(1) + if done: + self.env.reset(**kwargs) + obs, _, done, _ = self.env.step(2) + if done: + self.env.reset(**kwargs) + return obs + + +class StartWithRandomActions(gym.Wrapper): + """ Makes random number of random actions at the beginning of each + episode. """ + def __init__(self, env, max_random_actions=30): + super(StartWithRandomActions, self).__init__(env) + self.max_random_actions = max_random_actions + self.real_done = True + + def step(self, action): + obs, rew, done, info = self.env.step(action) + self.real_done = info.get("real_done", True) + return obs, rew, done, info + + def reset(self, **kwargs): + obs = self.env.reset() + if self.real_done: + num_random_actions = np.random.randint(self.max_random_actions + 1) + for _ in range(num_random_actions): + obs, _, _, _ = self.env.step(self.env.action_space.sample()) + self.real_done = False + return obs + + +class ImagePreprocessing(gym.ObservationWrapper): + """ Preprocesses image-observations by possibly grayscaling and resizing. 
""" + def __init__(self, env, width=84, height=84, grayscale=True): + super(ImagePreprocessing, self).__init__(env) + self.width = width + self.height = height + self.grayscale = grayscale + ospace = self.env.observation_space + low, high, dtype = ospace.low.min(), ospace.high.max(), ospace.dtype + if self.grayscale: + self.observation_space = spaces.Box(low=low, high=high, + shape=(width, height), dtype=dtype) + else: + obs_shape = (width, height) + self.observation_space.shape[2:] + self.observation_space = spaces.Box(low=low, high=high, + shape=obs_shape, dtype=dtype) + + def observation(self, observation): + """ Performs image preprocessing. """ + if self.grayscale: + observation = cv2.cvtColor(observation, cv2.COLOR_RGB2GRAY) + observation = cv2.resize(observation, (self.width, self.height), + cv2.INTER_AREA) + return observation + + +class MaxBetweenFrames(gym.ObservationWrapper): + """ Takes maximum between two subsequent frames. """ + def __init__(self, env): + if (isinstance(env.unwrapped, atari.AtariEnv) and + "NoFrameskip" not in env.spec.id): + raise ValueError("MaxBetweenFrames requires NoFrameskip in atari env id") + super(MaxBetweenFrames, self).__init__(env) + self.last_obs = None + + def observation(self, observation): + obs = np.maximum(observation, self.last_obs) + self.last_obs = observation + return obs + + def reset(self, **kwargs): + self.last_obs = self.env.reset() + return self.last_obs + + +class QueueFrames(gym.ObservationWrapper): + """ Queues specified number of frames together along new dimension. 
""" + def __init__(self, env, nframes, concat=False): + super(QueueFrames, self).__init__(env) + self.obs_queue = deque([], maxlen=nframes) + self.concat = concat + ospace = self.observation_space + if self.concat: + oshape = ospace.shape[:-1] + (ospace.shape[-1] * nframes,) + else: + oshape = ospace.shape + (nframes,) + self.observation_space = spaces.Box(ospace.low.min(), ospace.high.max(), + oshape, ospace.dtype) + + def observation(self, observation): + self.obs_queue.append(observation) + return (np.concatenate(self.obs_queue, -1) if self.concat + else np.dstack(self.obs_queue)) + + def reset(self, **kwargs): + obs = self.env.reset() + for _ in range(self.obs_queue.maxlen - 1): + self.obs_queue.append(obs) + return self.observation(obs) + + +class SkipFrames(gym.Wrapper): + """ Performs the same action for several steps and returns the final result. + """ + def __init__(self, env, nskip=4): + super(SkipFrames, self).__init__(env) + if (isinstance(env.unwrapped, atari.AtariEnv) and + "NoFrameskip" not in env.spec.id): + raise ValueError("SkipFrames requires NoFrameskip in atari env id") + self.nskip = nskip + + def step(self, action): + total_reward = 0.0 + for _ in range(self.nskip): + obs, rew, done, info = self.env.step(action) + total_reward += rew + if done: + break + return obs, total_reward, done, info + + def reset(self, **kwargs): + return self.env.reset(**kwargs) + + +class ClipReward(gym.RewardWrapper): + """ Modifes reward to be in {-1, 0, 1} by taking sign of it. 
""" + def reward(self, reward): + return np.sign(reward) + + +class TFSummaries(gym.Wrapper): + """ Writes env summaries.""" + def __init__(self, env, prefix=None, running_mean_size=100, step_var=None): + super(TFSummaries, self).__init__(env) + self.episode_counter = 0 + self.prefix = prefix or self.env.spec.id + self.step_var = (step_var if step_var is not None + else tf.train.get_global_step()) + + nenvs = getattr(self.env.unwrapped, "nenvs", 1) + self.rewards = np.zeros(nenvs) + self.had_ended_episodes = np.zeros(nenvs, dtype=np.bool) + self.episode_lengths = np.zeros(nenvs) + self.reward_queues = [deque([], maxlen=running_mean_size) + for _ in range(nenvs)] + + def should_write_summaries(self): + """ Returns true if it's time to write summaries. """ + return np.all(self.had_ended_episodes) + + def add_summaries(self): + """ Writes summaries. """ + tf.contrib.summary.scalar( + f"{self.prefix}/total_reward", + tf.reduce_mean([q[-1] for q in self.reward_queues]), + step=self.step_var) + tf.contrib.summary.scalar( + f"{self.prefix}/reward_mean_{self.reward_queues[0].maxlen}", + tf.reduce_mean([np.mean(q) for q in self.reward_queues]), + step=self.step_var) + tf.contrib.summary.scalar( + f"{self.prefix}/episode_length", + tf.reduce_mean(self.episode_lengths), + step=self.step_var) + if self.had_ended_episodes.size > 1: + tf.contrib.summary.scalar( + f"{self.prefix}/min_reward", + min(q[-1] for q in self.reward_queues), + step=self.step_var) + tf.contrib.summary.scalar( + f"{self.prefix}/max_reward", + max(q[-1] for q in self.reward_queues), + step=self.step_var) + self.episode_lengths.fill(0) + self.had_ended_episodes.fill(False) + + def step(self, action): + obs, rew, done, info = self.env.step(action) + self.rewards += rew + self.episode_lengths[~self.had_ended_episodes] += 1 + + info_collection = [info] if isinstance(info, dict) else info + done_collection = [done] if isinstance(done, bool) else done + done_indices = [i for i, info in enumerate(info_collection) 
+ if info.get("real_done", done_collection[i])] + for i in done_indices: + if not self.had_ended_episodes[i]: + self.had_ended_episodes[i] = True + self.reward_queues[i].append(self.rewards[i]) + self.rewards[i] = 0 + + if self.should_write_summaries(): + self.add_summaries() + return obs, rew, done, info + + def reset(self, **kwargs): + self.rewards.fill(0) + self.episode_lengths.fill(0) + self.had_ended_episodes.fill(False) + return self.env.reset(**kwargs) + + +def nature_dqn_env(env_id, nenvs=None, seed=None, + summaries=True, clip_reward=True): + """ Wraps env as in Nature DQN paper. """ + if "NoFrameskip" not in env_id: + raise ValueError(f"env_id must have 'NoFrameskip' but is {env_id}") + if nenvs is not None: + if seed is None: + seed = list(range(nenvs)) + if isinstance(seed, int): + seed = [seed] * nenvs + if len(seed) != nenvs: + raise ValueError(f"seed has length {len(seed)} but must have " + f"length equal to nenvs which is {nenvs}") + + env = ParallelEnvBatch([ + lambda i=i, env_seed=env_seed: nature_dqn_env( + env_id, seed=env_seed, summaries=False, clip_reward=False) + for i, env_seed in enumerate(seed) + ]) + if summaries: + env = TFSummaries(env, prefix=env_id) + if clip_reward: + env = ClipReward(env) + return env + + env = gym.make(env_id) + env.seed(seed) + if summaries: + env = TFSummaries(env) + env = EpisodicLife(env) + if "FIRE" in env.unwrapped.get_action_meanings(): + env = FireReset(env) + env = StartWithRandomActions(env, max_random_actions=30) + env = MaxBetweenFrames(env) + env = SkipFrames(env, 4) + env = ImagePreprocessing(env, width=84, height=84, grayscale=True) + env = QueueFrames(env, 4) + if clip_reward: + env = ClipReward(env) + return env diff --git a/week06_policy_based/env_batch.py b/week06_policy_based/env_batch.py new file mode 100644 index 000000000..208d413e8 --- /dev/null +++ b/week06_policy_based/env_batch.py @@ -0,0 +1,201 @@ +# pylint: skip-file +from multiprocessing import Process, Pipe + +from gym import Env, 
Wrapper, Space +import numpy as np + + +class SpaceBatch(Space): + def __init__(self, spaces): + first_type = type(spaces[0]) + first_shape = spaces[0].shape + first_dtype = spaces[0].dtype + for space in spaces: + if not isinstance(space, first_type): + raise TypeError("spaces have different types: {}, {}" + .format(first_type, type(space))) + if first_shape != space.shape: + raise ValueError("spaces have different shapes: {}, {}" + .format(first_shape, space.shape)) + if first_dtype != space.dtype: + raise ValueError("spaces have different data types: {}, {}" + .format(first_dtype, space.dtype)) + + self.spaces = spaces + super(SpaceBatch, self).__init__(shape=self.spaces[0].shape, + dtype=self.spaces[0].dtype) + + def sample(self): + return np.stack([space.sample() for space in self.spaces]) + + def __getattr__(self, attr): + return getattr(self.spaces[0], attr) + + +class EnvBatch(Env): + def __init__(self, make_env, nenvs=None): + make_env_functions = self._get_make_env_functions(make_env, nenvs) + self._envs = [make_env() for make_env in make_env_functions] + self._nenvs = len(self.envs) + # self.observation_space = SpaceBatch([env.observation_space + # for env in self._envs]) + self.action_space = SpaceBatch([env.action_space + for env in self._envs]) + + def _get_make_env_functions(self, make_env, nenvs): + if nenvs is None and not isinstance(make_env, list): + raise ValueError("When nenvs is None make_env" + " must be a list of callables") + if nenvs is not None and not callable(make_env): + raise ValueError("When nenvs is not None make_env must be callable") + + if nenvs is not None: + make_env = [make_env for _ in range(nenvs)] + return make_env + + @property + def nenvs(self): + return self._nenvs + + @property + def envs(self): + return self._envs + + def _check_actions(self, actions): + if not len(actions) == self.nenvs: + raise ValueError( + "number of actions is not equal to number of envs: " + "len(actions) = {}, nenvs = {}" + .format(len(actions), 
self.nenvs)) + + def step(self, actions): + self._check_actions(actions) + obs, rews, resets, infos = [], [], [], [] + for env, action in zip(self._envs, actions): + ob, rew, done, info = env.step(action) + if done: + ob = env.reset() + obs.append(ob) + rews.append(rew) + resets.append(done) + infos.append(info) + return np.stack(obs), np.stack(rews), np.stack(resets), infos + + def reset(self): + return np.stack([env.reset() for env in self.envs]) + + +class SingleEnvBatch(Wrapper, EnvBatch): + def __init__(self, env): + super(SingleEnvBatch, self).__init__(env) + self.observation_space = SpaceBatch([self.env.observation_space]) + self.action_space = SpaceBatch([self.env.action_space]) + + @property + def nenvs(self): + return 1 + + @property + def envs(self): + return [self.env] + + def step(self, actions): + self._check_actions(actions) + ob, rew, done, info = self.env.step(actions[0]) + if done: + ob = self.env.reset() + return ob[None], np.expand_dims(rew, 0), np.expand_dims(done, 0), [info] + + def reset(self): + return self.env.reset()[None] + + +def worker(parent_connection, worker_connection, make_env_function, + send_spaces=True): + # Adapted from SubprocVecEnv github.com/openai/baselines + parent_connection.close() + env = make_env_function() + if send_spaces: + worker_connection.send((env.observation_space, env.action_space)) + while True: + cmd, action = worker_connection.recv() + if cmd == "step": + ob, rew, done, info = env.step(action) + if done: + ob = env.reset() + worker_connection.send((ob, rew, done, info)) + elif cmd == "reset": + ob = env.reset() + worker_connection.send(ob) + elif cmd == "close": + env.close() + worker_connection.close() + break + else: + raise NotImplementedError("Unknown command %s" % cmd) + + +class ParallelEnvBatch(EnvBatch): + """ + An abstract batch of environments. 
+ """ + def __init__(self, make_env, nenvs=None): + make_env_functions = self._get_make_env_functions(make_env, nenvs) + self._nenvs = len(make_env_functions) + self._parent_connections, self._worker_connections = zip(*[ + Pipe() for _ in range(self._nenvs) + ]) + self._processes = [ + Process( + target=worker, + args=(parent_connection, worker_connection, make_env), + daemon=True + ) + for i, (parent_connection, worker_connection, make_env) + in enumerate(zip(self._parent_connections, + self._worker_connections, + make_env_functions)) + ] + for p in self._processes: + p.start() + self._closed = False + + for conn in self._worker_connections: + conn.close() + + observation_spaces, action_spaces = [], [] + for conn in self._parent_connections: + ob_space, ac_space = conn.recv() + observation_spaces.append(ob_space) + action_spaces.append(ac_space) + self.observation_space = SpaceBatch(observation_spaces) + self.action_space = SpaceBatch(action_spaces) + + @property + def nenvs(self): + return self._nenvs + + def step(self, actions): + self._check_actions(actions) + for conn, a in zip(self._parent_connections, actions): + conn.send(("step", a)) + results = [conn.recv() for conn in self._parent_connections] + obs, rews, dones, infos = zip(*results) + return np.stack(obs), np.stack(rews), np.stack(dones), infos + + def reset(self): + for conn in self._parent_connections: + conn.send(("reset", None)) + return np.stack([conn.recv() for conn in self._parent_connections]) + + def close(self): + if self._closed: + return + for conn in self._parent_connections: + conn.send(("close", None)) + for p in self._processes: + p.join() + self._closed = True + + def render(self): + raise ValueError("render not defined for %s" % self) diff --git a/week6_policy_based/reinforce_lasagne.ipynb b/week06_policy_based/reinforce_lasagne.ipynb similarity index 100% rename from week6_policy_based/reinforce_lasagne.ipynb rename to week06_policy_based/reinforce_lasagne.ipynb diff --git 
a/week06_policy_based/reinforce_pytorch.ipynb b/week06_policy_based/reinforce_pytorch.ipynb new file mode 100644 index 000000000..90a409d67 --- /dev/null +++ b/week06_policy_based/reinforce_pytorch.ipynb @@ -0,0 +1,426 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# REINFORCE in pytorch\n", + "\n", + "Just like we did before for q-learning, this time we'll design a pytorch network to learn `CartPole-v0` via policy gradient (REINFORCE).\n", + "\n", + "Most of the code in this notebook is taken from approximate qlearning, so you'll find it more or less familiar and even simpler." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# # in google colab uncomment this\n", + "\n", + "# import os\n", + "\n", + "# os.system('apt-get install -y xvfb')\n", + "# os.system('wget https://raw.githubusercontent.com/yandexdataschool/Practical_DL/fall18/xvfb -O ../xvfb')\n", + "# os.system('apt-get install -y python-opengl ffmpeg')\n", + "# os.system('pip install pyglet==1.2.4')\n", + "\n", + "# os.system('python -m pip install -U pygame --user')\n", + "\n", + "# print('setup complete')\n", + "\n", + "# XVFB will be launched if you run on a server\n", + "import os\n", + "if type(os.environ.get(\"DISPLAY\")) is not str or len(os.environ.get(\"DISPLAY\")) == 0:\n", + " !bash ../xvfb start\n", + " %env DISPLAY = : 1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import gym\n", + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline\n", + "\n", + "env = gym.make(\"CartPole-v0\").env\n", + "env.reset()\n", + "\n", + "plt.imshow(env.render(\"rgb_array\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Building the network for REINFORCE" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For REINFORCE 
algorithm, we'll need a model that predicts action probabilities given states. Let's define such a model below." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import torch\n", + "import torch.nn as nn" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Build a simple neural network that predicts policy logits. \n", + "# Keep it simple: CartPole isn't worth deep architectures.\n", + "model = nn.Sequential(\n", + " < YOUR CODE HERE: define a neural network that predicts policy logits >\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Predict function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def predict_probs(states):\n", + " \"\"\" \n", + " Predict action probabilities given states.\n", + " :param states: numpy array of shape [batch, state_shape]\n", + " :returns: numpy array of shape [batch, n_actions]\n", + " \"\"\"\n", + " # convert states, compute logits, use softmax to get probability\n", + " \n", + " return < your code >" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "test_states = np.array([env.reset() for _ in range(5)])\n", + "test_probas = predict_probs(test_states)\n", + "assert isinstance(\n", + " test_probas, np.ndarray), \"you must return np array and not %s\" % type(test_probas)\n", + "assert tuple(test_probas.shape) == (\n", + " test_states.shape[0], env.action_space.n), \"wrong output shape: %s\" % np.shape(test_probas)\n", + "assert np.allclose(np.sum(test_probas, axis=1),\n", + " 1), \"probabilities do not sum to 1\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Play the game\n", + "\n", + "We can now use our newly built agent to play 
the game." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def generate_session(t_max=1000):\n", + " \"\"\" \n", + " play a full session with REINFORCE agent and train at the session end.\n", + " returns sequences of states, actions andrewards\n", + " \"\"\"\n", + " # arrays to record session\n", + " states, actions, rewards = [], [], []\n", + " s = env.reset()\n", + "\n", + " for t in range(t_max):\n", + " # action probabilities array aka pi(a|s)\n", + " action_probs = predict_probs(np.array([s]))[0]\n", + "\n", + " # Sample action with given probabilities.\n", + " a = < your code >\n", + " new_s, r, done, info = env.step(a)\n", + "\n", + " # record session history to train later\n", + " states.append(s)\n", + " actions.append(a)\n", + " rewards.append(r)\n", + "\n", + " s = new_s\n", + " if done:\n", + " break\n", + "\n", + " return states, actions, rewards" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# test it\n", + "states, actions, rewards = generate_session()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Computing cumulative rewards" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def get_cumulative_rewards(rewards, # rewards at each step\n", + " gamma=0.99 # discount for reward\n", + " ):\n", + " \"\"\"\n", + " take a list of immediate rewards r(s,a) for the whole session \n", + " compute cumulative returns (a.k.a. 
G(s,a) in Sutton '16)\n", + " G_t = r_t + gamma*r_{t+1} + gamma^2*r_{t+2} + ...\n", + "\n", + " The simple way to compute cumulative rewards is to iterate from last to first time tick\n", + " and compute G_t = r_t + gamma*G_{t+1} recurrently\n", + "\n", + " You must return an array/list of cumulative rewards with as many elements as in the initial rewards.\n", + " \"\"\"\n", + " \n", + " return < array of cumulative rewards >" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "get_cumulative_rewards(rewards)\n", + "assert len(get_cumulative_rewards(list(range(100)))) == 100\n", + "assert np.allclose(get_cumulative_rewards([0, 0, 1, 0, 0, 1, 0], gamma=0.9), [\n", + " 1.40049, 1.5561, 1.729, 0.81, 0.9, 1.0, 0.0])\n", + "assert np.allclose(get_cumulative_rewards(\n", + " [0, 0, 1, -2, 3, -4, 0], gamma=0.5), [0.0625, 0.125, 0.25, -1.5, 1.0, -4.0, 0.0])\n", + "assert np.allclose(get_cumulative_rewards(\n", + " [0, 0, 1, 2, 3, 4, 0], gamma=0), [0, 0, 1, 2, 3, 4, 0])\n", + "print(\"looks good!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Loss function and updates\n", + "\n", + "We now need to define objective and update over policy gradient.\n", + "\n", + "Our objective function is\n", + "\n", + "$$ J \\approx { 1 \\over N } \\sum _{s_i,a_i} \\pi_\\theta (a_i | s_i) \\cdot G(s_i,a_i) $$\n", + "\n", + "\n", + "Following the REINFORCE algorithm, we can define our objective as follows: \n", + "\n", + "$$ \\hat J \\approx { 1 \\over N } \\sum _{s_i,a_i} log \\pi_\\theta (a_i | s_i) \\cdot G(s_i,a_i) $$\n", + "\n", + "When you compute gradient of that function over network weights $ \\theta $, it will become exactly the policy gradient.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def to_one_hot(y_tensor, ndims):\n", + " \"\"\" helper: take an integer vector and convert it to 1-hot 
matrix. \"\"\"\n", + " y_tensor = y_tensor.type(torch.LongTensor).view(-1, 1)\n", + " y_one_hot = torch.zeros(\n", + " y_tensor.size()[0], ndims).scatter_(1, y_tensor, 1)\n", + " return y_one_hot" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Your code: define optimizers\n", + "optimizer = torch.optim.Adam(model.parameters(), 1e-3)\n", + "\n", + "\n", + "def train_on_session(states, actions, rewards, gamma=0.99, entropy_coef=1e-2):\n", + " \"\"\"\n", + " Takes a sequence of states, actions and rewards produced by generate_session.\n", + " Updates agent's weights by following the policy gradient above.\n", + " Please use Adam optimizer with default parameters.\n", + " \"\"\"\n", + "\n", + " # cast everything into torch tensors\n", + " states = torch.tensor(states, dtype=torch.float32)\n", + " actions = torch.tensor(actions, dtype=torch.int32)\n", + " cumulative_returns = np.array(get_cumulative_rewards(rewards, gamma))\n", + " cumulative_returns = torch.tensor(cumulative_returns, dtype=torch.float32)\n", + "\n", + " # predict logits, probas and log-probas using an agent.\n", + " logits = model(states)\n", + " probs = nn.functional.softmax(logits, -1)\n", + " log_probs = nn.functional.log_softmax(logits, -1)\n", + "\n", + " assert all(isinstance(v, torch.Tensor) for v in [logits, probs, log_probs]), \\\n", + " \"please use compute using torch tensors and don't use predict_probs function\"\n", + "\n", + " # select log-probabilities for chosen actions, log pi(a_i|s_i)\n", + " log_probs_for_actions = torch.sum(\n", + " log_probs * to_one_hot(actions, env.action_space.n), dim=1)\n", + " \n", + " # Compute loss here. 
Don't forget entropy regularization with `entropy_coef` \n", + " entropy = < your code >\n", + " loss = < your code >\n", + "\n", + " # Gradient descent step\n", + " < your code >\n", + "\n", + " # technical: return session rewards to print them later\n", + " return np.sum(rewards)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### The actual training" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "for i in range(100):\n", + " rewards = [train_on_session(*generate_session())\n", + " for _ in range(100)] # generate new sessions\n", + " print(\"mean reward:%.3f\" % (np.mean(rewards)))\n", + " if np.mean(rewards) > 500:\n", + " print(\"You Win!\") # but you can train even further\n", + " break" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Video" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# record sessions\n", + "import gym.wrappers\n", + "env = gym.wrappers.Monitor(gym.make(\"CartPole-v0\"),\n", + " directory=\"videos\", force=True)\n", + "sessions = [generate_session() for _ in range(100)]\n", + "env.close()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# show video\n", + "from IPython.display import HTML\n", + "import os\n", + "\n", + "video_names = list(\n", + " filter(lambda s: s.endswith(\".mp4\"), os.listdir(\"./videos/\")))\n", + "\n", + "HTML(\"\"\"\n", + "\n", + "\"\"\".format(\"./videos/\"+video_names[-1])) # this may or may not be the _last_ video. 
Try other indices" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.0" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/week06_policy_based/reinforce_tensorflow.ipynb b/week06_policy_based/reinforce_tensorflow.ipynb new file mode 100644 index 000000000..39c21b74f --- /dev/null +++ b/week06_policy_based/reinforce_tensorflow.ipynb @@ -0,0 +1,387 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# REINFORCE in TensorFlow (3 pts)¶\n", + "\n", + "This notebook implements a basic reinforce algorithm a.k.a. 
policy gradient for CartPole env.\n", + "\n", + "It has been deliberately written to be as simple and human-readable as possible.\n", + "\n", + "Authors: [Practical_RL](https://github.com/yandexdataschool/Practical_RL) course team" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%env THEANO_FLAGS = 'floatX=float32'\n", + "import os\n", + "if type(os.environ.get(\"DISPLAY\")) is not str or len(os.environ.get(\"DISPLAY\")) == 0:\n", + " !bash ../xvfb start\n", + " %env DISPLAY = : 1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The notebook assumes that you have [openai gym](https://github.com/openai/gym) installed.\n", + "\n", + "In case you're running on a server, [use xvfb](https://github.com/openai/gym#rendering-on-a-server)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import gym\n", + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline\n", + "\n", + "env = gym.make(\"CartPole-v0\")\n", + "\n", + "# gym compatibility: unwrap TimeLimit\n", + "if hasattr(env, 'env'):\n", + " env = env.env\n", + "\n", + "env.reset()\n", + "n_actions = env.action_space.n\n", + "state_dim = env.observation_space.shape\n", + "\n", + "plt.imshow(env.render(\"rgb_array\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Building the network for REINFORCE" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For REINFORCE algorithm, we'll need a model that predicts action probabilities given states.\n", + "\n", + "For numerical stability, please __do not include the softmax layer into your network architecture__.\n", + "We'll use softmax or log-softmax where appropriate." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import tensorflow as tf\n", + "\n", + "# create input variables. We only need states, actions and cumulative rewards for REINFORCE\n", + "states = tf.placeholder('float32', (None,)+state_dim, name=\"states\")\n", + "actions = tf.placeholder('int32', name=\"action_ids\")\n", + "cumulative_rewards = tf.placeholder('float32', name=\"cumulative_returns\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "\n", + "\n", + "\n", + "logits = \n", + "\n", + "policy = tf.nn.softmax(logits)\n", + "log_policy = tf.nn.log_softmax(logits)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# utility function to get action probabilities in one given state\n", + "def get_action_proba(s): return policy.eval({states: [s]})[0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Loss function and updates\n", + "\n", + "We now need to define objective and update over policy gradient.\n", + "\n", + "Our objective function is\n", + "\n", + "$$ J \\approx { 1 \\over N } \\sum _{s_i,a_i} \\pi_\\theta (a_i | s_i) \\cdot G(s_i,a_i) $$\n", + "\n", + "\n", + "Following the REINFORCE algorithm, we can define our objective as follows: \n", + "\n", + "$$ \\hat J \\approx { 1 \\over N } \\sum _{s_i,a_i} log \\pi_\\theta (a_i | s_i) \\cdot G(s_i,a_i) $$\n", + "\n", + "When you compute gradient of that function over network weights $ \\theta $, it will become exactly the policy gradient.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# get log-probabilities for the actions that were taken\n", + "indices = tf.stack([tf.range(tf.shape(log_policy)[0]), actions], axis=-1)\n", + "log_policy_for_actions = tf.gather_nd(log_policy, indices)" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], 
+ "source": [ + "# policy objective as in the last formula. please use mean, not sum.\n", + "# note: you need to use log_policy_for_actions to get log probabilities for actions taken.\n", + "\n", + "J = " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# regularize with entropy\n", + "entropy = " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# all network weights\n", + "all_weights = \n", + "\n", + "# weight updates. maximizing J is same as minimizing -J. Adding negative entropy.\n", + "loss = -J - 0.1*entropy\n", + "\n", + "update = tf.train.AdamOptimizer().minimize(loss, var_list=all_weights)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Computing cumulative rewards" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "\n", + "def get_cumulative_rewards(rewards, # rewards at each step\n", + " gamma=0.99 # discount for reward\n", + " ):\n", + " \"\"\"\n", + " take a list of immediate rewards r(s,a) for the whole session \n", + " compute cumulative rewards R(s,a) (a.k.a. 
G(s,a) in Sutton '16)\n", + " R_t = r_t + gamma*r_{t+1} + gamma^2*r_{t+2} + ...\n", + "\n", + " The simple way to compute cumulative rewards is to iterate from last to first time tick\n", + " and compute R_t = r_t + gamma*R_{t+1} recurrently\n", + "\n", + " You must return an array/list of cumulative rewards with as many elements as in the initial rewards.\n", + " \"\"\"\n", + "\n", + " \n", + "\n", + " return < array of cumulative rewards >" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "assert len(get_cumulative_rewards(range(100))) == 100\n", + "assert np.allclose(get_cumulative_rewards([0, 0, 1, 0, 0, 1, 0], gamma=0.9), [\n", + " 1.40049, 1.5561, 1.729, 0.81, 0.9, 1.0, 0.0])\n", + "assert np.allclose(get_cumulative_rewards(\n", + " [0, 0, 1, -2, 3, -4, 0], gamma=0.5), [0.0625, 0.125, 0.25, -1.5, 1.0, -4.0, 0.0])\n", + "assert np.allclose(get_cumulative_rewards(\n", + " [0, 0, 1, 2, 3, 4, 0], gamma=0), [0, 0, 1, 2, 3, 4, 0])\n", + "print(\"looks good!\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def train_step(_states, _actions, _rewards):\n", + " \"\"\"given full session, trains agent with policy gradient\"\"\"\n", + " _cumulative_rewards = get_cumulative_rewards(_rewards)\n", + " update.run({states: _states, actions: _actions,\n", + " cumulative_rewards: _cumulative_rewards})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Playing the game" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def generate_session(t_max=1000):\n", + " \"\"\"play env with REINFORCE agent and train at the session end\"\"\"\n", + "\n", + " # arrays to record session\n", + " states, actions, rewards = [], [], []\n", + "\n", + " s = env.reset()\n", + "\n", + " for t in range(t_max):\n", + "\n", + " # action probabilities array aka pi(a|s)\n", + " 
action_probas = get_action_proba(s)\n", + "\n", + " a = \n", + "\n", + " new_s, r, done, info = env.step(a)\n", + "\n", + " # record session history to train later\n", + " states.append(s)\n", + " actions.append(a)\n", + " rewards.append(r)\n", + "\n", + " s = new_s\n", + " if done:\n", + " break\n", + "\n", + " train_step(states, actions, rewards)\n", + "\n", + " return sum(rewards)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "s = tf.InteractiveSession()\n", + "s.run(tf.global_variables_initializer())\n", + "\n", + "for i in range(100):\n", + "\n", + " rewards = [generate_session() for _ in range(100)] # generate new sessions\n", + "\n", + " print(\"mean reward:%.3f\" % (np.mean(rewards)))\n", + "\n", + " if np.mean(rewards) > 300:\n", + " print(\"You Win!\")\n", + " break" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Results & video" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# record sessions\n", + "import gym.wrappers\n", + "env = gym.wrappers.Monitor(gym.make(\"CartPole-v0\"),\n", + " directory=\"videos\", force=True)\n", + "sessions = [generate_session() for _ in range(100)]\n", + "env.close()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# show video\n", + "from IPython.display import HTML\n", + "import os\n", + "\n", + "video_names = list(\n", + " filter(lambda s: s.endswith(\".mp4\"), os.listdir(\"./videos/\")))\n", + "\n", + "HTML(\"\"\"\n", + "\n", + "\"\"\".format(\"./videos/\"+video_names[-1])) # this may or may not be _last_ video. Try other indices" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# That's all, thank you for your attention!" 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/week06_policy_based/runners.py b/week06_policy_based/runners.py new file mode 100644 index 000000000..83394afb2 --- /dev/null +++ b/week06_policy_based/runners.py @@ -0,0 +1,60 @@ +""" RL env runner """ +from collections import defaultdict +import numpy as np + + +class EnvRunner: + """ Reinforcement learning runner in an environment with given policy """ + def __init__(self, env, policy, nsteps, + transforms=None, step_var=None): + self.env = env + self.policy = policy + self.nsteps = nsteps + self.transforms = transforms or [] + self.step_var = step_var if step_var is not None else 0 + self.state = {"latest_observation": self.env.reset()} + + @property + def nenvs(self): + """ Returns number of batched envs or `None` if env is not batched """ + return getattr(self.env.unwrapped, "nenvs", None) + + def reset(self): + """ Resets env and runner states. """ + self.state["latest_observation"] = self.env.reset() + self.policy.reset() + + def get_next(self): + """ Runs the agent in the environment. 
""" + trajectory = defaultdict(list, {"actions": []}) + observations = [] + rewards = [] + resets = [] + self.state["env_steps"] = self.nsteps + + for i in range(self.nsteps): + observations.append(self.state["latest_observation"]) + act = self.policy.act(self.state["latest_observation"]) + if "actions" not in act: + raise ValueError("result of policy.act must contain 'actions' " + f"but has keys {list(act.keys())}") + for key, val in act.items(): + trajectory[key].append(val) + + obs, rew, done, _ = self.env.step(trajectory["actions"][-1]) + self.state["latest_observation"] = obs + rewards.append(rew) + resets.append(done) + self.step_var += self.nenvs or 1 + + # Only reset if the env is not batched. Batched envs should auto-reset. + if not self.nenvs and np.all(done): + self.state["env_steps"] = i + 1 + self.state["latest_observation"] = self.env.reset() + + trajectory.update(observations=observations, rewards=rewards, resets=resets) + trajectory["state"] = self.state + + for transform in self.transforms: + transform(trajectory) + return trajectory diff --git a/week7_[recap]_rnn/README.md b/week07_[recap]_rnn/README.md similarity index 88% rename from week7_[recap]_rnn/README.md rename to week07_[recap]_rnn/README.md index 99f24b360..61f8428cf 100644 --- a/week7_[recap]_rnn/README.md +++ b/week07_[recap]_rnn/README.md @@ -13,6 +13,7 @@ * OpenAI research on sentiment analysis that sheds some light on what's inside LSTM language model. # Homework description +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/yandexdataschool/Practical_RL/blob/spring19/week07_%5Brecap%5D_rnn/seminar_pytorch.ipynb) This week's practice gets you acquainted with basics of recurrent neural networks. For simplicity, we'll train them on character language modelling task. Pick any one of `seminar_lasagne`, `seminar_lasagne_ingraph` or `seminar_tf`. 
diff --git a/week7_[recap]_rnn/arxiv_data.csv b/week07_[recap]_rnn/arxiv_data.csv similarity index 100% rename from week7_[recap]_rnn/arxiv_data.csv rename to week07_[recap]_rnn/arxiv_data.csv diff --git a/week7_[recap]_rnn/mtg_card_names.txt b/week07_[recap]_rnn/mtg_card_names.txt similarity index 100% rename from week7_[recap]_rnn/mtg_card_names.txt rename to week07_[recap]_rnn/mtg_card_names.txt diff --git a/week7_[recap]_rnn/names b/week07_[recap]_rnn/names similarity index 100% rename from week7_[recap]_rnn/names rename to week07_[recap]_rnn/names diff --git a/week7_[recap]_rnn/rnn.png b/week07_[recap]_rnn/rnn.png similarity index 100% rename from week7_[recap]_rnn/rnn.png rename to week07_[recap]_rnn/rnn.png diff --git a/week7_[recap]_rnn/seminar_lasagne.ipynb b/week07_[recap]_rnn/seminar_lasagne.ipynb similarity index 100% rename from week7_[recap]_rnn/seminar_lasagne.ipynb rename to week07_[recap]_rnn/seminar_lasagne.ipynb diff --git a/week7_[recap]_rnn/seminar_lasagne_ingraph.ipynb b/week07_[recap]_rnn/seminar_lasagne_ingraph.ipynb similarity index 100% rename from week7_[recap]_rnn/seminar_lasagne_ingraph.ipynb rename to week07_[recap]_rnn/seminar_lasagne_ingraph.ipynb diff --git a/week07_[recap]_rnn/seminar_pytorch.ipynb b/week07_[recap]_rnn/seminar_pytorch.ipynb new file mode 100644 index 000000000..93a6b86c9 --- /dev/null +++ b/week07_[recap]_rnn/seminar_pytorch.ipynb @@ -0,0 +1,765 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "seminar_pytorch.ipynb", + "version": "0.3.2", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + } + }, + "cells": [ + { + "metadata": { + "id": "BBnWIbH4xNou", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "### Generating names with recurrent neural networks\n", + "\n", + "This time you'll find yourself delving into the heart (and other intestines) of recurrent neural networks on a class of toy 
problems.\n", + "\n", + "Struggle to find a name for the variable? Let's see how you'll come up with a name for your son/daughter. Surely no human has expertise over what is a good child name, so let us train an RNN instead.\n", + "\n", + "It's dangerous to go alone, take these:" + ] + }, + { + "metadata": { + "id": "LtOprx8exNo4", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline\n", + "\n", + "# if you're in colab, uncomment\n", + "# !wget https://raw.githubusercontent.com/yandexdataschool/Practical_RL/99ae2a3dae648428edbfc41fd10ed688e5365161/week07_%5Brecap%5D_rnn/names -O names" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "dO7UJFP-xNpM", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "# Our data\n", + "The dataset contains ~8k earthling names from different cultures, all in latin transcript.\n", + "\n", + "This notebook has been designed so as to allow you to quickly swap names for something similar: deep learning article titles, IKEA furniture, pokemon names, etc."
+ ] + }, + { + "metadata": { + "id": "OkxTfJKBxNpT", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "import os\n", + "start_token = \" \"\n", + "\n", + "with open(\"names\") as f:\n", + " lines = f.read()[:-1].split('\\n')\n", + " lines = [start_token + line for line in lines]" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "CSUSTDKWxNph", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "print ('n samples = ',len(lines))\n", + "for x in lines[::1000]:\n", + " print (x)\n", + " \n" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "n-ztLh0wxNp2", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "MAX_LENGTH = max(map(len, lines))\n", + "print(\"max length =\", MAX_LENGTH)\n", + "\n", + "plt.title('Sequence length distribution')\n", + "plt.hist(list(map(len, lines)),bins=25);" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "-Ab7zEqJxNp-", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "# Text processing\n", + "\n", + "First we need to collect a \"vocabulary\" of all unique tokens, i.e. unique characters. We can then encode inputs as a sequence of character ids." + ] + }, + { + "metadata": { + "id": "RpppLCJ8xNqC", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "#all unique characters go here\n", + "tokens = \n", + "\n", + "tokens = list(tokens)\n", + "\n", + "num_tokens = len(tokens)\n", + "print ('num_tokens = ', num_tokens)\n", + "\n", + "assert 50 < num_tokens < 60, \"Names should contain within 50 and 60 unique tokens depending on encoding\"" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "3rObMYUnxNqL", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "### Convert characters to integers\n", + "\n", + "Torch is built for crunching numbers, not strings. 
\n", + "To train our neural network, we'll need to replace characters with their indices in tokens list.\n", + "\n", + "Let's compose a dictionary that does this mapping." + ] + }, + { + "metadata": { + "id": "HFUPS7SZxNqP", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "token_to_id = its identifier (index in tokens list)>" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "wBD3JmwLxNqd", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "assert len(tokens) == len(token_to_id), \"dictionaries must have same size\"\n", + "\n", + "for i in range(num_tokens):\n", + " assert token_to_id[tokens[i]] == i, \"token identifier must be it's position in tokens list\"\n", + "\n", + "print(\"Seems alright!\")" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "AATQIEw-xNqw", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "def to_matrix(lines, max_len=None, pad=token_to_id[' '], dtype='int32', batch_first = True):\n", + " \"\"\"Casts a list of names into rnn-digestable matrix\"\"\"\n", + " \n", + " max_len = max_len or max(map(len, lines))\n", + " lines_ix = np.zeros([len(lines), max_len], dtype) + pad\n", + "\n", + " for i in range(len(lines)):\n", + " line_ix = [token_to_id[c] for c in lines[i]]\n", + " lines_ix[i, :len(line_ix)] = line_ix\n", + " \n", + " if not batch_first: # convert [batch, time] into [time, batch]\n", + " lines_ix = np.transpose(lines_ix)\n", + "\n", + " return lines_ix" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "e6LjFvFYxNrA", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "#Example: cast 4 random names to matrices, pad with zeros\n", + "print('\\n'.join(lines[::2000]))\n", + "print(to_matrix(lines[::2000]))" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "eRsgx9TXxNrJ", + "colab_type": "text" + }, 
+ "cell_type": "markdown", + "source": [ + "# Recurrent neural network\n", + "\n", + "We can rewrite recurrent neural network as a consecutive application of dense layer to input $x_t$ and previous rnn state $h_t$. This is exactly what we're gonna do now.\n", + "\n", + "\n", + "Since we're training a language model, there should also be:\n", + "* An embedding layer that converts character id x_t to a vector.\n", + "* An output layer that predicts probabilities of next phoneme" + ] + }, + { + "metadata": { + "id": "LyPI4I1exNrO", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "import torch, torch.nn as nn\n", + "import torch.nn.functional as F\n", + "\n", + "class CharRNNCell(nn.Module):\n", + " \"\"\"\n", + " Implement the scheme above as torch module\n", + " \"\"\"\n", + " def __init__(self, num_tokens=len(tokens), embedding_size=16, rnn_num_units=64):\n", + " super(self.__class__,self).__init__()\n", + " self.num_units = rnn_num_units\n", + " \n", + " self.embedding = nn.Embedding(num_tokens, embedding_size)\n", + " self.rnn_update = nn.Linear(embedding_size + rnn_num_units, rnn_num_units)\n", + " self.rnn_to_logits = nn.Linear(rnn_num_units, num_tokens)\n", + " \n", + " def forward(self, x, h_prev):\n", + " \"\"\"\n", + " This method computes h_next(x, h_prev) and log P(x_next | h_next)\n", + " We'll call it repeatedly to produce the whole sequence.\n", + " \n", + " :param x: batch of character ids, int64[batch_size]\n", + " :param h_prev: previous rnn hidden states, float32 matrix [batch, rnn_num_units]\n", + " \"\"\"\n", + " # get vector embedding of x\n", + " x_emb = self.embedding(x)\n", + " \n", + " # compute next hidden state using self.rnn_update\n", + " # hint: use torch.cat(..., dim=...) 
for concatenation\n", + " h_next = ###YOUR CODE HERE\n", + " \n", + " h_next = torch.tanh(h_next)\n", + " \n", + " assert h_next.size() == h_prev.size()\n", + " \n", + " #compute logits for next character probs\n", + " logits = ###YOUR CODE\n", + " \n", + " return h_next, F.log_softmax(logits, -1)\n", + " \n", + " def initial_state(self, batch_size):\n", + " \"\"\" return rnn state before it processes first input (aka h0) \"\"\"\n", + " return torch.zeros(batch_size, self.num_units)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "TFhNjoG5xNrV", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "char_rnn = CharRNNCell()" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "DRG9DwU4xNrk", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "### RNN loop\n", + "\n", + "Once we've defined a single RNN step, we can apply it in a loop to get predictions on each step." + ] + }, + { + "metadata": { + "id": "1wZV18EMxNro", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "def rnn_loop(char_rnn, batch_ix):\n", + " \"\"\"\n", + " Computes log P(next_character) for all time-steps in lines_ix\n", + " :param lines_ix: an int32 matrix of shape [batch, time], output of to_matrix(lines)\n", + " \"\"\"\n", + " batch_size, max_length = batch_ix.size()\n", + " hid_state = char_rnn.initial_state(batch_size)\n", + " logprobs = []\n", + "\n", + " for x_t in batch_ix.transpose(0,1):\n", + " hid_state, logp_next = char_rnn(x_t, hid_state) # <-- here we call your one-step code\n", + " logprobs.append(logp_next)\n", + " \n", + " return torch.stack(logprobs, dim=1)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "cyeW8FGTxNry", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "batch_ix = to_matrix(lines[:5])\n", + "batch_ix = torch.tensor(batch_ix, dtype=torch.int64)\n", + "\n", + "logp_seq 
= rnn_loop(char_rnn, batch_ix)\n", + "\n", + "assert torch.max(logp_seq).data.numpy() <= 0\n", + "assert tuple(logp_seq.size()) == batch_ix.shape + (num_tokens,)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "s4-dWabbxNr6", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "### Likelihood and gradients\n", + "\n", + "We can now train our neural network to minimize crossentropy (maximize log-likelihood) with the actual next tokens.\n", + "\n", + "To do so in a vectorized manner, we take `batch_ix[:, 1:]` - a matrix of token ids shifted 1 step to the left, so the i-th element is actually the \"next token\" for the i-th prediction" + ] + }, + { + "metadata": { + "id": "GLHf-Aq8xNr9", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "predictions_logp = logp_seq[:, :-1]\n", + "actual_next_tokens = batch_ix[:, 1:]\n", + "\n", + "logp_next = torch.gather(predictions_logp, dim=2, index=actual_next_tokens[:,:,None])\n", + "\n", + "loss = -logp_next.mean()" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "UaCMB9JPxNsK", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "loss.backward()" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "4vQT0t-qxNsa", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "for w in char_rnn.parameters():\n", + " assert w.grad is not None and torch.max(torch.abs(w.grad)).data.numpy() != 0, \\\n", + " \"Loss is not differentiable w.r.t. a weight with shape %s. 
Check forward method.\" % (w.size(),)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "_Gxi9BhJxNsn", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "### The training loop\n", + "\n", + "We train our char-rnn exactly the same way we train any deep learning model: by minibatch sgd.\n", + "\n", + "The only difference is that this time we sample strings, not images or sound." + ] + }, + { + "metadata": { + "id": "kjcJR_1cxNsq", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "from IPython.display import clear_output\n", + "from random import sample\n", + "\n", + "char_rnn = CharRNNCell()\n", + "opt = torch.optim.Adam(char_rnn.parameters())\n", + "history = []" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "6STFsVgsxNsy", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "\n", + "for i in range(1000):\n", + " batch_ix = to_matrix(sample(lines, 32), max_len=MAX_LENGTH)\n", + " batch_ix = torch.tensor(batch_ix, dtype=torch.int64)\n", + " \n", + " logp_seq = rnn_loop(char_rnn, batch_ix)\n", + " \n", + " # compute loss\n", + " \n", + " \n", + " loss = ###YOUR CODE\n", + " \n", + " # train with backprop\n", + " \n", + " \n", + " history.append(loss.data.numpy())\n", + " if (i+1)%100==0:\n", + " clear_output(True)\n", + " plt.plot(history,label='loss')\n", + " plt.legend()\n", + " plt.show()\n", + "\n", + "assert np.mean(history[:10]) > np.mean(history[-10:]), \"RNN didn't converge.\"" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "Kae9COy8xNtB", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "### RNN: sampling\n", + "Once we've trained our network a bit, let's get to actually generating stuff. \n", + "All we need is the single rnn step function you have defined in `char_rnn.forward`." 
+ ] + }, + { + "metadata": { + "id": "piuEtqYexNtG", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "def generate_sample(char_rnn, seed_phrase=' ', max_length=MAX_LENGTH, temperature=1.0):\n", + " '''\n", + " The function generates text given a phrase of length at least SEQ_LENGTH.\n", + " :param seed_phrase: prefix characters. The RNN is asked to continue the phrase\n", + " :param max_length: maximum output length, including seed_phrase\n", + " :param temperature: coefficient for sampling. higher temperature produces more chaotic outputs,\n", + " smaller temperature converges to the single most likely output\n", + " '''\n", + " \n", + " x_sequence = [token_to_id[token] for token in seed_phrase]\n", + " x_sequence = torch.tensor([x_sequence], dtype=torch.int64)\n", + " hid_state = char_rnn.initial_state(batch_size=1)\n", + " \n", + " #feed the seed phrase, if any\n", + " for i in range(len(seed_phrase) - 1):\n", + " hid_state, _ = char_rnn(x_sequence[:, i], hid_state)\n", + " \n", + " #start generating\n", + " for _ in range(max_length - len(seed_phrase)):\n", + " hid_state, logp_next = char_rnn(x_sequence[:, -1], hid_state)\n", + " p_next = F.softmax(logp_next / temperature, dim=-1).data.numpy()[0]\n", + " \n", + " # sample next token and push it back into x_sequence\n", + " next_ix = np.random.choice(num_tokens,p=p_next)\n", + " next_ix = torch.tensor([[next_ix]], dtype=torch.int64)\n", + " x_sequence = torch.cat([x_sequence, next_ix], dim=1)\n", + " \n", + " return ''.join([tokens[ix] for ix in x_sequence.data.numpy()[0]])" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "wLx2EBLJxNtY", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "for _ in range(10):\n", + " print(generate_sample(char_rnn))" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "QzbIA1WPxNtg", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": 
[ + "for _ in range(50):\n", + " print(generate_sample(char_rnn, seed_phrase=' Trump'))" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "QwqUmy_pxNtp", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "### Try it out!\n", + "You've just implemented a recurrent language model that can be tasked with generating any kind of sequence, so there's plenty of data you can try it on:\n", + "\n", + "* Novels/poems/songs of your favorite author\n", + "* News titles/clickbait titles\n", + "* Source code of Linux or Tensorflow\n", + "* Molecules in [smiles](https://en.wikipedia.org/wiki/Simplified_molecular-input_line-entry_system) format\n", + "* Melody in notes/chords format\n", + "* Ikea catalog titles\n", + "* Pokemon names\n", + "* Cards from Magic, the Gathering / Hearthstone\n", + "\n", + "If you're willing to give it a try, here's what you wanna look at:\n", + "* Current data format is a sequence of lines, so a novel can be formatted as a list of sentences. Alternatively, you can change data preprocessing altogether.\n", + "* While some datasets are readily available, others can only be scraped from the web. Try `Selenium` or `Scrapy` for that.\n", + "* Make sure MAX_LENGTH is adjusted for longer datasets. There's also a bonus section about dynamic RNNs at the bottom.\n", + "* More complex tasks require larger RNN architecture, try more neurons or several layers. It would also require more training iterations.\n", + "* Long-term dependencies in music, novels or molecules are better handled with LSTM or GRU\n", + "\n", + "__Good hunting!__" + ] + }, + { + "metadata": { + "collapsed": true, + "id": "BgbahGE0xNtw", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "### More seriously\n", + "\n", + "What we just did is a manual low-level implementation of RNN. While it's cool, i guess you won't like the idea of re-writing it from scratch on every occasion. 
\n", + "\n", + "As you might have guessed, torch has a solution for this. To be more specific, there are two options:\n", + "* `nn.RNNCell(emb_size, rnn_num_units)` - implements a single step of RNN just like you did. Basically concat-linear-tanh\n", + "* `nn.RNN(emb_size, rnn_num_units)` - implements the whole rnn_loop for you.\n", + "\n", + "There's also `nn.LSTMCell` vs `nn.LSTM`, `nn.GRUCell` vs `nn.GRU`, etc. etc.\n", + "\n", + "In this example we'll rewrite the char_rnn and rnn_loop using high-level rnn API." + ] + }, + { + "metadata": { + "id": "7uewn3r3xNtz", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "class CharRNNLoop(nn.Module):\n", + " def __init__(self, num_tokens=num_tokens, emb_size=16, rnn_num_units=64):\n", + " super(self.__class__, self).__init__()\n", + " self.emb = nn.Embedding(num_tokens, emb_size)\n", + " self.rnn = nn.RNN(emb_size, rnn_num_units, batch_first=True)\n", + " self.hid_to_logits = nn.Linear(rnn_num_units, num_tokens)\n", + " \n", + " def forward(self, x):\n", + " h_seq, _ = self.rnn(self.emb(x))\n", + " next_logits = self.hid_to_logits(h_seq)\n", + " next_logp = F.log_softmax(next_logits, dim=-1)\n", + " return next_logp\n", + " \n", + "model = CharRNNLoop()" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "ls-H-9u6xNt9", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "# the model applies over the whole sequence\n", + "batch_ix = to_matrix(sample(lines, 32), max_len=MAX_LENGTH)\n", + "batch_ix = torch.tensor(batch_ix, dtype=torch.int64)\n", + "\n", + "logp_seq = model(batch_ix)\n", + "\n", + "# compute loss. 
This time we use nll_loss with some duct tape\n", + "loss = F.nll_loss(logp_seq[:, :-1].contiguous().view(-1, num_tokens), \n", + " batch_ix[:, 1:].contiguous().view(-1))\n", + "\n", + "loss.backward()" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "collapsed": true, + "id": "oG-50QGhxNuG", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "Here's another example" + ] + }, + { + "metadata": { + "id": "5LTWutHKxNuL", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "import torch, torch.nn as nn\n", + "import torch.nn.functional as F\n", + "\n", + "class CharLSTMCell(nn.Module):\n", + " \"\"\"\n", + " Implements something like CharRNNCell, but with LSTM\n", + " \"\"\"\n", + " def __init__(self, num_tokens=len(tokens), embedding_size=16, rnn_num_units=64):\n", + " super(self.__class__,self).__init__()\n", + " self.num_units = rnn_num_units\n", + " self.emb = nn.Embedding(num_tokens, embedding_size)\n", + " self.lstm = nn.LSTMCell(embedding_size, rnn_num_units)\n", + " self.rnn_to_logits = nn.Linear(rnn_num_units, num_tokens)\n", + " \n", + " def forward(self, x, prev_state):\n", + " (prev_h, prev_c) = prev_state\n", + " (next_h, next_c) = self.lstm(self.emb(x), (prev_h, prev_c))\n", + " logits = self.rnn_to_logits(next_h)\n", + " \n", + " return (next_h, next_c), F.log_softmax(logits, -1)\n", + " \n", + " def initial_state(self, batch_size):\n", + " \"\"\" LSTM has two state variables, cell and hid \"\"\"\n", + " return torch.zeros(batch_size, self.num_units), torch.zeros(batch_size, self.num_units)\n", + " \n", + "char_lstm = CharLSTMCell()" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "KR02vVMDxNuS", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "# the model applies over the whole sequence\n", + "batch_ix = to_matrix(sample(lines, 32), max_len=MAX_LENGTH)\n", + "batch_ix = torch.tensor(batch_ix, dtype=torch.int64)\n", + 
"\n", + "logp_seq = rnn_loop(char_lstm, batch_ix)\n", + "\n", + "# compute loss. This time we use nll_loss with some duct tape\n", + "loss = F.nll_loss(logp_seq[:, :-1].contiguous().view(-1, num_tokens), \n", + " batch_ix[:, 1:].contiguous().view(-1))\n", + "\n", + "loss.backward()" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "5eVtel6_xNuX", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "__Bonus quest:__ implement a model that uses 2 LSTM layers (the second lstm uses the first as input) and train it on your data." + ] + } + ] +} diff --git a/week7_[recap]_rnn/seminar_tf.ipynb b/week07_[recap]_rnn/seminar_tf.ipynb similarity index 100% rename from week7_[recap]_rnn/seminar_tf.ipynb rename to week07_[recap]_rnn/seminar_tf.ipynb diff --git a/week8_scst/README.md b/week07_seq2seq/README.md similarity index 73% rename from week8_scst/README.md rename to week07_seq2seq/README.md index 8c2da06d1..c2fcd1340 100644 --- a/week8_scst/README.md +++ b/week07_seq2seq/README.md @@ -9,16 +9,18 @@ * Self-critical sequence traning [original article](https://arxiv.org/abs/1612.00563) ## Practice -As usual, go to practice_{your framework}.ipynb above and follow instructions from there. [pytorch](https://github.com/yandexdataschool/Practical_RL/blob/master/week8_scst/practice_torch.ipynb), [tensorflow](https://github.com/yandexdataschool/Practical_RL/blob/master/week8_scst/practice_tf.ipynb), [theano](https://github.com/yandexdataschool/Practical_RL/blob/master/week8_scst/practice_theano.ipynb) -Binder quickstart (lasts 1 hour): [![Binder](https://mybinder.org/badge.svg)](https://mybinder.org/v2/gh/yandexdataschool/Practical_RL/master) +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/yandexdataschool/Practical_RL/blob/spring19/week07_seq2seq/practice_torch.ipynb) + + +As usual, go to practice_{your framework}.ipynb above and follow instructions from there. 
[pytorch](./practice_torch.ipynb), [tensorflow](./practice_tf.ipynb), [theano](./practice_theano.ipynb) ## More materials * An [awesome post](http://distill.pub/2016/augmented-rnns/) explaining attention and long-term memory models. * [BLEU](http://www.aclweb.org/anthology/P02-1040.pdf) and [CIDEr](https://arxiv.org/pdf/1411.5726.pdf) articles. * Image captioning * MSCOCO captioning [challenge](http://mscoco.org/dataset/#captions-challenge2015) - * Captioning baseline [notebook](https://github.com/yandexdataschool/HSE_deeplearning/blob/master/week7/captioning_solution_ars.ipynb) + * Captioning tutorial [notebook](https://github.com/yandexdataschool/Practical_DL/tree/980121c7b3147ed28a7c1360df5038d3432b8cc3/week07_seq2seq) * Other articles on reinforcement learning for natural language: * [task-oriented conversation system](https://arxiv.org/abs/1703.07055) * [generating dialogues](https://arxiv.org/abs/1606.01541) diff --git a/week8_scst/basic_model_tf.py b/week07_seq2seq/basic_model_tf.py similarity index 100% rename from week8_scst/basic_model_tf.py rename to week07_seq2seq/basic_model_tf.py diff --git a/week8_scst/basic_model_theano.py b/week07_seq2seq/basic_model_theano.py similarity index 100% rename from week8_scst/basic_model_theano.py rename to week07_seq2seq/basic_model_theano.py diff --git a/week8_scst/basic_model_torch.py b/week07_seq2seq/basic_model_torch.py similarity index 89% rename from week8_scst/basic_model_torch.py rename to week07_seq2seq/basic_model_torch.py index fc68ae971..329432bd2 100644 --- a/week8_scst/basic_model_torch.py +++ b/week07_seq2seq/basic_model_torch.py @@ -1,7 +1,6 @@ import torch import torch.nn as nn import torch.nn.functional as F -from torch.autograd import Variable # Note: unlike official pytorch tutorial, this model doesn't process one sample at a time # because it's slow on GPU. instead it uses masks just like ye olde theano/tensorflow. 
@@ -65,8 +64,9 @@ def forward(self, inp, out, eps=1e-30, **flags): In other words, logp are probabilities of __current__ output at each tick, not the next one therefore you can get likelihood as logprobas * tf.one_hot(out,n_tokens) """ + device = next(self.parameters()).device batch_size = inp.shape[0] - bos = Variable(torch.LongTensor([self.out_voc.bos_ix] * batch_size)) + bos = torch.tensor([self.out_voc.bos_ix] * batch_size, dtype=torch.long, device=device) logits_seq = [torch.log(to_one_hot(bos, len(self.out_voc)) + eps)] hid_state = self.encode(inp, **flags) @@ -87,9 +87,10 @@ def translate(self, inp, greedy=False, max_len = None, eps = 1e-30, **flags): :return: output tokens int32[batch,time] and log-probabilities of all tokens at each tick, [batch,time,n_tokens] """ + device = next(self.parameters()).device batch_size = inp.shape[0] - bos = Variable(torch.LongTensor([self.out_voc.bos_ix] * batch_size)) - mask = Variable(torch.ones(batch_size).type(torch.ByteTensor)) + bos = torch.tensor([self.out_voc.bos_ix] * batch_size, dtype=torch.long, device=device) + mask = torch.ones(batch_size, dtype=torch.uint8, device=device) logits_seq = [torch.log(to_one_hot(bos, len(self.out_voc)) + eps)] out_seq = [bos] @@ -115,7 +116,7 @@ def translate(self, inp, greedy=False, max_len = None, eps = 1e-30, **flags): ### Utility functions ### -def infer_mask(seq, eos_ix, batch_first=True, include_eos=True, type=torch.FloatTensor): +def infer_mask(seq, eos_ix, batch_first=True, include_eos=True, dtype=torch.float): """ compute length given output indices and eos code :param seq: tf matrix [time,batch] if batch_first else [batch,time] @@ -124,7 +125,7 @@ def infer_mask(seq, eos_ix, batch_first=True, include_eos=True, type=torch.Float :returns: lengths, int32 vector of shape [batch] """ assert seq.dim() == 2 - is_eos = (seq == eos_ix).type(torch.FloatTensor) + is_eos = (seq == eos_ix).to(dtype=torch.float) if include_eos: if batch_first: is_eos = torch.cat((is_eos[:,:1]*0, 
is_eos[:, :-1]), dim=1) @@ -132,9 +133,9 @@ def infer_mask(seq, eos_ix, batch_first=True, include_eos=True, type=torch.Float is_eos = torch.cat((is_eos[:1,:]*0, is_eos[:-1, :]), dim=0) count_eos = torch.cumsum(is_eos, dim=1 if batch_first else 0) mask = count_eos == 0 - return mask.type(type) + return mask.to(dtype=dtype) -def infer_length(seq, eos_ix, batch_first=True, include_eos=True, type=torch.LongTensor): +def infer_length(seq, eos_ix, batch_first=True, include_eos=True, dtype=torch.long): """ compute mask given output indices and eos code :param seq: tf matrix [time,batch] if time_major else [batch,time] @@ -142,16 +143,15 @@ def infer_length(seq, eos_ix, batch_first=True, include_eos=True, type=torch.Lon :param include_eos: if True, the time-step where eos first occurs is has mask = 1 :returns: mask, float32 matrix with '0's and '1's of same shape as seq """ - mask = infer_mask(seq, eos_ix, batch_first, include_eos, type) + mask = infer_mask(seq, eos_ix, batch_first, include_eos, dtype) return torch.sum(mask, dim=1 if batch_first else 0) def to_one_hot(y, n_dims=None): """ Take integer y (tensor or variable) with n dims and convert it to 1-hot representation with n+1 dims. 
""" - y_tensor = y.data if isinstance(y, Variable) else y - y_tensor = y_tensor.type(torch.LongTensor).view(-1, 1) + y_tensor = y.data + y_tensor = y_tensor.to(dtype=torch.long).view(-1, 1) n_dims = n_dims if n_dims is not None else int(torch.max(y_tensor)) + 1 - y_one_hot = torch.zeros(y_tensor.size()[0], n_dims).scatter_(1, y_tensor, 1) + y_one_hot = torch.zeros(y_tensor.size()[0], n_dims, device=y.device).scatter_(1, y_tensor, 1) y_one_hot = y_one_hot.view(*y.shape, -1) - return Variable(y_one_hot) if isinstance(y, Variable) else y_one_hot - + return y_one_hot diff --git a/week8_scst/bonus.ipynb b/week07_seq2seq/bonus.ipynb similarity index 99% rename from week8_scst/bonus.ipynb rename to week07_seq2seq/bonus.ipynb index 2c7778acb..11ecd3c58 100644 --- a/week8_scst/bonus.ipynb +++ b/week07_seq2seq/bonus.ipynb @@ -48,7 +48,7 @@ "\n", "Some seq2seq tasks can benefit from the attention mechanism. In addition to taking the _last_ time-step of encoder hidden state, we can allow decoder to peek on any time-step of his choice.\n", "\n", - "![img](https://s30.postimg.org/f8um3kt5d/google_seq2seq_attention.gif)\n", + "![img](https://xiandong79.github.io/downloads/nmt-model-fast.gif)\n", "\n", "\n", "#### Recommended steps:\n", @@ -344,4 +344,4 @@ }, "nbformat": 4, "nbformat_minor": 2 -} \ No newline at end of file +} diff --git a/week8_scst/he-pron-wiktionary.txt b/week07_seq2seq/he-pron-wiktionary.txt similarity index 100% rename from week8_scst/he-pron-wiktionary.txt rename to week07_seq2seq/he-pron-wiktionary.txt diff --git a/week8_scst/main_dataset.txt b/week07_seq2seq/main_dataset.txt similarity index 100% rename from week8_scst/main_dataset.txt rename to week07_seq2seq/main_dataset.txt diff --git a/week07_seq2seq/practice_tf.ipynb b/week07_seq2seq/practice_tf.ipynb new file mode 100644 index 000000000..da535d868 --- /dev/null +++ b/week07_seq2seq/practice_tf.ipynb @@ -0,0 +1,1040 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + 
"## Reinforcement Learning for seq2seq\n", + "\n", + "This time we'll solve a problem of transcribing hebrew words in english, also known as g2p (grapheme2phoneme)\n", + "\n", + " * word (sequence of letters in source language) -> translation (sequence of letters in target language)\n", + "\n", + "Unlike what most deep learning practitioners do, we won't only train it to maximize likelihood of correct translation, but also employ reinforcement learning to actually teach it to translate with as few errors as possible.\n", + "\n", + "\n", + "### About the task\n", + "\n", + "One notable property of Hebrew is that it's a consonant language. That is, there are no vowels in the written language. One could represent vowels with diacritics above consonants, but you don't expect people to do that in everyday life.\n", + "\n", + "Therefore, some hebrew characters will correspond to several english letters and others - to none, so we should use encoder-decoder architecture to figure that out.\n", + "\n", + "![img](https://esciencegroup.files.wordpress.com/2016/03/seq2seq.jpg)\n", + "_(img: esciencegroup.files.wordpress.com)_\n", + "\n", + "Encoder-decoder architectures are about converting anything to anything, including\n", + " * Machine translation and spoken dialogue systems\n", + " * [Image captioning](http://mscoco.org/dataset/#captions-challenge2015) and [image2latex](https://openai.com/requests-for-research/#im2latex) (convolutional encoder, recurrent decoder)\n", + " * Generating [images by captions](https://arxiv.org/abs/1511.02793) (recurrent encoder, convolutional decoder)\n", + " * Grapheme2phoneme - convert words to transcripts\n", + " \n", + "We chose simplified __Hebrew->English__ machine translation for words and short phrases (character-level), as it is relatively quick to train even without a gpu cluster." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# If True, only translates phrases shorter than 20 characters (way easier).\n", + "EASY_MODE = True\n", + "# Useful for initial coding.\n", + "# If false, works with all phrases (please switch to this mode for homework assignment)\n", + "\n", + "MODE = \"he-to-en\" # way we translate. Either \"he-to-en\" or \"en-to-he\"\n", + "# maximal length of _generated_ output, does not affect training\n", + "MAX_OUTPUT_LENGTH = 50 if not EASY_MODE else 20\n", + "REPORT_FREQ = 100 # how often to evaluate validation score" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 1: preprocessing\n", + "\n", + "We shall store dataset as a dictionary\n", + "`{ word1:[translation1,translation2,...], word2:[...],...}`.\n", + "\n", + "This is mostly due to the fact that many words have several correct translations.\n", + "\n", + "We have implemented this thing for you so that you can focus on more interesting parts.\n", + "\n", + "\n", + "__Attention python2 users!__ You may want to cast everything to unicode later during homework phase, just make sure you do it _everywhere_." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "from collections import defaultdict\n", + "word_to_translation = defaultdict(list) # our dictionary\n", + "\n", + "bos = '_'\n", + "eos = ';'\n", + "\n", + "with open(\"main_dataset.txt\") as fin:\n", + " for line in fin:\n", + "\n", + " en, he = line[:-1].lower().replace(bos, ' ').replace(eos,\n", + " ' ').split('\\t')\n", + " word, trans = (he, en) if MODE == 'he-to-en' else (en, he)\n", + "\n", + " if len(word) < 3:\n", + " continue\n", + " if EASY_MODE:\n", + " if max(len(word), len(trans)) > 20:\n", + " continue\n", + "\n", + " word_to_translation[word].append(trans)\n", + "\n", + "print(\"size = \", len(word_to_translation))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# get all unique lines in source language\n", + "all_words = np.array(list(word_to_translation.keys()))\n", + "# get all unique lines in translation language\n", + "all_translations = np.array(\n", + " [ts for all_ts in word_to_translation.values() for ts in all_ts])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### split the dataset\n", + "\n", + "We hold out 10% of all words to be used for validation.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "scrolled": true + }, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "train_words, test_words = train_test_split(\n", + " all_words, test_size=0.1, random_state=42)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Building vocabularies\n", + "\n", + "We now need to build vocabularies that map strings to token ids and vice versa. 
We're gonna need these fellas when we feed training data into model or convert output matrices into english words." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from voc import Vocab\n", + "inp_voc = Vocab.from_lines(''.join(all_words), bos=bos, eos=eos, sep='')\n", + "out_voc = Vocab.from_lines(''.join(all_translations), bos=bos, eos=eos, sep='')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Here's how you cast lines into ids and backwards.\n", + "batch_lines = all_words[:5]\n", + "batch_ids = inp_voc.to_matrix(batch_lines)\n", + "batch_lines_restored = inp_voc.to_lines(batch_ids)\n", + "\n", + "print(\"lines\")\n", + "print(batch_lines)\n", + "print(\"\\nwords to ids (0 = bos, 1 = eos):\")\n", + "print(batch_ids)\n", + "print(\"\\nback to words\")\n", + "print(batch_lines_restored)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Draw word/translation length distributions to estimate the scope of the task." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "%matplotlib inline\n", + "plt.figure(figsize=[8, 4])\n", + "plt.subplot(1, 2, 1)\n", + "plt.title(\"words\")\n", + "plt.hist(list(map(len, all_words)), bins=20)\n", + "\n", + "plt.subplot(1, 2, 2)\n", + "plt.title('translations')\n", + "plt.hist(list(map(len, all_translations)), bins=20)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 3: deploy encoder-decoder (1 point)\n", + "\n", + "__assignment starts here__\n", + "\n", + "Our architecture consists of two main blocks:\n", + "* Encoder reads words character by character and outputs code vector (usually a function of last RNN state)\n", + "* Decoder takes that code vector and produces translations character by character\n", + "\n", + "Then it gets fed into a model that follows this simple interface:\n", + "* __`model.symbolic_translate(inp, **flags) -> out, logp`__ - takes symbolic int32 matrix of hebrew words, produces output tokens sampled from the model and output log-probabilities for all possible tokens at each tick.\n", + " * if given flag __`greedy=True`__, takes most likely next token at each iteration. Otherwise samples with next token probabilities predicted by model.\n", + "* __`model.symbolic_score(inp, out, **flags) -> logp`__ - takes symbolic int32 matrices of hebrew words and their english translations. Computes the log-probabilities of all possible english characters given english prefixes and hebrew word.\n", + "* __`model.weights`__ - weights from all model layers [a list of variables]\n", + "\n", + "That's all! It's as hard as it gets. With those two methods alone you can implement all kinds of prediction and training." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import tensorflow as tf\n", + "tf.reset_default_graph()\n", + "s = tf.InteractiveSession()\n", + "\n", + "# ^^^ if you get \"variable *** already exists\": re-run this cell again" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from basic_model_tf import BasicTranslationModel\n", + "model = BasicTranslationModel('model', inp_voc, out_voc,\n", + " emb_size=64, hid_size=128)\n", + "\n", + "s.run(tf.global_variables_initializer())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "scrolled": true + }, + "outputs": [], + "source": [ + "# Play around with symbolic_translate and symbolic_score\n", + "inp = tf.placeholder_with_default(np.random.randint(\n", + " 0, 10, [3, 5], dtype='int32'), [None, None])\n", + "out = tf.placeholder_with_default(np.random.randint(\n", + " 0, 10, [3, 5], dtype='int32'), [None, None])\n", + "\n", + "# translate inp (with untrained model)\n", + "sampled_out, logp = model.symbolic_translate(inp, greedy=False)\n", + "print(\"\\nSymbolic_translate output:\\n\", sampled_out, logp)\n", + "print(\"\\nSample translations:\\n\", s.run(sampled_out))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# score logp(out | inp) with untrained input\n", + "logp = model.symbolic_score(inp, out)\n", + "print(\"\\nSymbolic_score output:\\n\", logp)\n", + "print(\"\\nLog-probabilities (clipped):\\n\", s.run(logp)[:, :2, :5])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Prepare any operations you want here\n", + "input_sequence = tf.placeholder('int32', [None, None])\n", + "greedy_translations, logp = 
\n", + "\n", + "\n", + "def translate(lines):\n", + " \"\"\"\n", + " You are given a list of input lines. \n", + " Make your neural network translate them.\n", + " :return: a list of output lines\n", + " \"\"\"\n", + " # Convert lines to a matrix of indices\n", + " lines_ix = \n", + "\n", + " # Compute translations in form of indices\n", + " trans_ix = s.run(greedy_translations, { < YOUR CODE - feed dict > })\n", + "\n", + " # Convert translations back into strings\n", + " return out_voc.to_lines(trans_ix)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "print(\"Sample inputs:\", all_words[:3])\n", + "print(\"Dummy translations:\", translate(all_words[:3]))\n", + "\n", + "assert isinstance(greedy_translations,\n", + " tf.Tensor) and greedy_translations.dtype.is_integer, \"trans must be a tensor of integers (token ids)\"\n", + "assert translate(all_words[:3]) == translate(\n", + " all_words[:3]), \"make sure translation is deterministic (use greedy=True and disable any noise layers)\"\n", + "assert type(translate(all_words[:3])) is list and (type(translate(all_words[:1])[0]) is str or type(\n", + " translate(all_words[:1])[0]) is unicode), \"translate(lines) must return a sequence of strings!\"\n", + "print(\"Tests passed!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Scoring function\n", + "\n", + "LogLikelihood is a poor estimator of model performance.\n", + "* If we predict zero probability once, it shouldn't ruin entire model.\n", + "* It is enough to learn just one translation if there are several correct ones.\n", + "* What matters is how many mistakes model's gonna make when it translates!\n", + "\n", + "Therefore, we will use minimal Levenshtein distance. It measures how many characters do we need to add/remove/replace from model translation to make it perfect. 
Alternatively, one could use character-level BLEU/RougeL or other similar metrics.\n", + "\n", + "The catch here is that Levenshtein distance is not differentiable: it isn't even continuous. We can't train our neural network to maximize it by gradient descent." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import editdistance # !pip install editdistance\n", + "\n", + "\n", + "def get_distance(word, trans):\n", + " \"\"\"\n", + " A function that takes word and predicted translation\n", + " and evaluates (Levenshtein's) edit distance to closest correct translation\n", + " \"\"\"\n", + " references = word_to_translation[word]\n", + " assert len(references) != 0, \"wrong/unknown word\"\n", + " return min(editdistance.eval(trans, ref) for ref in references)\n", + "\n", + "\n", + "def score(words, bsize=100):\n", + " \"\"\"a function that computes levenshtein distance for bsize random samples\"\"\"\n", + " assert isinstance(words, np.ndarray)\n", + "\n", + " batch_words = np.random.choice(words, size=bsize, replace=False)\n", + " batch_trans = translate(batch_words)\n", + "\n", + " distances = list(map(get_distance, batch_words, batch_trans))\n", + "\n", + " return np.array(distances, dtype='float32')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# should be around 5-50 and decrease rapidly after training :)\n", + "[score(test_words, 10).mean() for _ in range(5)]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 2: Supervised pre-training\n", + "\n", + "Here we define a function that trains our model through maximizing log-likelihood a.k.a. minimizing crossentropy." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# import utility functions\n", + "from basic_model_tf import initialize_uninitialized, infer_length, infer_mask, select_values_over_last_axis\n", + "\n", + "\n", + "class supervised_training:\n", + "\n", + " # variable for inputs and correct answers\n", + " input_sequence = tf.placeholder('int32', [None, None])\n", + " reference_answers = tf.placeholder('int32', [None, None])\n", + "\n", + " # Compute log-probabilities of all possible tokens at each step. Use model interface.\n", + " logprobs_seq = \n", + "\n", + " # compute mean crossentropy\n", + " crossentropy = - select_values_over_last_axis(logprobs_seq, reference_answers)\n", + "\n", + " mask = infer_mask(reference_answers, out_voc.eos_ix)\n", + "\n", + " loss = tf.reduce_sum(crossentropy * mask)/tf.reduce_sum(mask)\n", + "\n", + " # Build weights optimizer. Use model.weights to get all trainable params.\n", + " train_step = \n", + "\n", + "\n", + "# intialize optimizer params while keeping model intact\n", + "initialize_uninitialized(s)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Actually run training on minibatches" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import random\n", + "\n", + "\n", + "def sample_batch(words, word_to_translation, batch_size):\n", + " \"\"\"\n", + " sample random batch of words and random correct translation for each word\n", + " example usage:\n", + " batch_x,batch_y = sample_batch(train_words, word_to_translations,10)\n", + " \"\"\"\n", + " # choose words\n", + " batch_words = np.random.choice(words, size=batch_size)\n", + "\n", + " # choose translations\n", + " batch_trans_candidates = list(map(word_to_translation.get, batch_words))\n", + " batch_trans = list(map(random.choice, batch_trans_candidates))\n", + "\n", + " 
return inp_voc.to_matrix(batch_words), out_voc.to_matrix(batch_trans)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "bx, by = sample_batch(train_words, word_to_translation, batch_size=3)\n", + "print(\"Source:\")\n", + "print(bx)\n", + "print(\"Target:\")\n", + "print(by)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "scrolled": false + }, + "outputs": [], + "source": [ + "from IPython.display import clear_output\n", + "from tqdm import tqdm, trange # or use tqdm_notebook,tnrange\n", + "\n", + "loss_history = []\n", + "editdist_history = []\n", + "\n", + "for i in trange(25000):\n", + " bx, by = sample_batch(train_words, word_to_translation, 32)\n", + "\n", + " feed_dict = {\n", + " supervised_training.input_sequence: bx,\n", + " supervised_training.reference_answers: by\n", + " }\n", + "\n", + " loss, _ = s.run([supervised_training.loss,\n", + " supervised_training.train_step], feed_dict)\n", + " loss_history.append(loss)\n", + "\n", + " if (i+1) % REPORT_FREQ == 0:\n", + " clear_output(True)\n", + " current_scores = score(test_words)\n", + " editdist_history.append(current_scores.mean())\n", + " plt.figure(figsize=(12, 4))\n", + " plt.subplot(131)\n", + " plt.title('train loss / traning time')\n", + " plt.plot(loss_history)\n", + " plt.grid()\n", + " plt.subplot(132)\n", + " plt.title('val score distribution')\n", + " plt.hist(current_scores, bins=20)\n", + " plt.subplot(133)\n", + " plt.title('val score / traning time')\n", + " plt.plot(editdist_history)\n", + " plt.grid()\n", + " plt.show()\n", + " print(\"llh=%.3f, mean score=%.3f\" %\n", + " (np.mean(loss_history[-10:]), np.mean(editdist_history[-10:])))\n", + "\n", + "# Note: it's okay if loss oscillates up and down as long as it gets better on average over long term (e.g. 
5k batches)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "for word in train_words[:10]:\n", + " print(\"%s -> %s\" % (word, translate([word])[0]))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "test_scores = []\n", + "for start_i in trange(0, len(test_words), 32):\n", + " batch_words = test_words[start_i:start_i+32]\n", + " batch_trans = translate(batch_words)\n", + " distances = list(map(get_distance, batch_words, batch_trans))\n", + " test_scores.extend(distances)\n", + "\n", + "print(\"Supervised test score:\", np.mean(test_scores))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Preparing for reinforcement learning (2 points)\n", + "\n", + "First we need to define loss function as a custom tf operation.\n", + "\n", + "The simple way to do so is through `tensorflow.py_func` wrapper.\n", + "```\n", + "def my_func(x):\n", + " # x will be a numpy array with the contents of the placeholder below\n", + " return np.sinh(x)\n", + "inp = tf.placeholder(tf.float32)\n", + "y = tf.py_func(my_func, [inp], tf.float32)\n", + "```\n", + "\n", + "\n", + "__Your task__ is to implement `_compute_levenshtein` function that takes matrices of words and translations, along with input masks, then converts those to actual words and phonemes and computes min-levenshtein via __get_distance__ function above.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def _compute_levenshtein(words_ix, trans_ix):\n", + " \"\"\"\n", + " A custom tensorflow operation that computes levenshtein loss for predicted trans.\n", + "\n", + " Params:\n", + " - words_ix - a matrix of letter indices, shape=[batch_size,word_length]\n", + " - words_mask - a matrix of zeros/ones, \n", + " 1 means \"word is still 
not finished\"\n", + " 0 means \"word has already finished and this is padding\"\n", + "\n", + " - trans_mask - a matrix of output letter indices, shape=[batch_size,translation_length]\n", + " - trans_mask - a matrix of zeros/ones, similar to words_mask but for trans_ix\n", + "\n", + "\n", + " Please implement the function and make sure it passes tests from the next cell.\n", + "\n", + " \"\"\"\n", + "\n", + " # convert words to strings\n", + " words = \n", + "\n", + " assert type(words) is list and type(\n", + " words[0]) is str and len(words) == len(words_ix)\n", + "\n", + " # convert translations to lists\n", + " translations = \n", + "\n", + " assert type(distances) in (list, tuple, np.ndarray) and len(\n", + " distances) == len(words_ix)\n", + "\n", + " distances = np.array(list(distances), dtype='float32')\n", + " return distances\n", + "\n", + "\n", + "def compute_levenshtein(words_ix, trans_ix):\n", + " out = tf.py_func(_compute_levenshtein, [words_ix, trans_ix, ], tf.float32)\n", + " out.set_shape([None])\n", + "\n", + " return tf.stop_gradient(out)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Simple test suite to make sure your implementation is correct. Hint: if you run into any bugs, feel free to use print from inside _compute_levenshtein." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# test suite\n", + "# sample random batch of (words, correct trans, wrong trans)\n", + "batch_words = np.random.choice(train_words, size=100)\n", + "batch_trans = list(map(random.choice, map(\n", + " word_to_translation.get, batch_words)))\n", + "batch_trans_wrong = np.random.choice(all_translations, size=100)\n", + "\n", + "batch_words_ix = tf.constant(inp_voc.to_matrix(batch_words))\n", + "batch_trans_ix = tf.constant(out_voc.to_matrix(batch_trans))\n", + "batch_trans_wrong_ix = tf.constant(out_voc.to_matrix(batch_trans_wrong))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# assert compute_levenshtein is zero for ideal translations\n", + "correct_answers_score = compute_levenshtein(\n", + " batch_words_ix, batch_trans_ix).eval()\n", + "\n", + "assert np.all(correct_answers_score ==\n", + " 0), \"a perfect translation got nonzero levenshtein score!\"\n", + "\n", + "print(\"Everything seems alright!\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# assert compute_levenshtein matches actual scoring function\n", + "wrong_answers_score = compute_levenshtein(\n", + " batch_words_ix, batch_trans_wrong_ix).eval()\n", + "\n", + "true_wrong_answers_score = np.array(\n", + " list(map(get_distance, batch_words, batch_trans_wrong)))\n", + "\n", + "assert np.all(wrong_answers_score ==\n", + " true_wrong_answers_score), \"for some word symbolic levenshtein is different from actual levenshtein distance\"\n", + "\n", + "print(\"Everything seems alright!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once you got it working...\n", + "\n", + "\n", + "* You may now want to __remove/comment asserts__ from function code for a slight 
speed-up.\n", + "\n", + "* There's a more detailed tutorial on custom tensorflow ops: [`py_func`](https://www.tensorflow.org/api_docs/python/tf/py_func), [`low-level`](https://www.tensorflow.org/api_docs/python/tf/py_func)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Self-critical policy gradient (2 points)\n", + "\n", + "In this section you'll implement algorithm called self-critical sequence training (here's an [article](https://arxiv.org/abs/1612.00563)).\n", + "\n", + "The algorithm is a vanilla policy gradient with a special baseline. \n", + "\n", + "$$ \\nabla J = E_{x \\sim p(s)} E_{y \\sim \\pi(y|x)} \\nabla log \\pi(y|x) \\cdot (R(x,y) - b(x)) $$\n", + "\n", + "Here reward R(x,y) is a __negative levenshtein distance__ (since we minimize it). The baseline __b(x)__ represents how well model fares on word __x__.\n", + "\n", + "In practice, this means that we compute baseline as a score of greedy translation, $b(x) = R(x,y_{greedy}(x)) $.\n", + "\n", + "![img](https://github.com/yandexdataschool/Practical_RL/raw/master/yet_another_week/_resource/scheme.png)\n", + "Luckily, we already obtained the required outputs: `model.greedy_translations, model.greedy_mask` and we only need to compute levenshtein using `compute_levenshtein` function.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "class trainer:\n", + "\n", + " input_sequence = tf.placeholder('int32', [None, None])\n", + "\n", + " # use model to __sample__ symbolic translations given input_sequence\n", + " sample_translations, sample_logp = \n", + " # use model to __greedy__ symbolic translations given input_sequence\n", + " greedy_translations, greedy_logp = \n", + "\n", + " rewards = - compute_levenshtein(input_sequence, sample_translations)\n", + "\n", + " # compute __negative__ levenshtein for greedy mode\n", + " baseline = \n", + "\n", + " # compute advantage using rewards 
and baseline\n", + " advantage = \n", + " assert advantage.shape.ndims == 1, \"advantage must be of shape [batch_size]\"\n", + "\n", + " # compute log_pi(a_t|s_t), shape = [batch, seq_length]\n", + " logprobs_phoneme = # YOUR CODE\n", + " # ^-- hint: look at how crossentropy is implemented in supervised learning loss above\n", + " # mind the sign - this one should not be multiplied by -1 :)\n", + "\n", + "\n", + " # Compute policy gradient\n", + " # or rather surrogate function who's gradient is policy gradient\n", + " J = logprobs_phoneme*advantage[:, None]\n", + "\n", + " mask = infer_mask(sample_translations, out_voc.eos_ix)\n", + " loss = - tf.reduce_sum(J*mask) / tf.reduce_sum(mask)\n", + "\n", + " # regularize with negative entropy. Don't forget the sign!\n", + " # note: for entropy you need probabilities for all tokens (sample_logp), not just phoneme_logprobs\n", + " entropy = \n", + " # hint: you can get sample probabilities from sample_logp using math :)\n", + "\n", + "\n", + " assert entropy.shape.ndims == 2, \"please make sure elementwise entropy is of shape [batch,time]\"\n", + "\n", + " loss -= 0.01*tf.reduce_sum(entropy*mask) / tf.reduce_sum(mask)\n", + "\n", + " # compute weight updates, clip by norm\n", + " grads = tf.gradients(loss, model.weights)\n", + " grads = tf.clip_by_global_norm(grads, 50)[0]\n", + "\n", + " train_step = tf.train.AdamOptimizer(\n", + " learning_rate=1e-5).apply_gradients(zip(grads, model.weights,))\n", + "\n", + "\n", + "initialize_uninitialized()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Policy gradient training\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "scrolled": false + }, + "outputs": [], + "source": [ + "for i in trange(100000):\n", + " bx = sample_batch(train_words, word_to_translation, 32)[0]\n", + " pseudo_loss, _ = s.run([trainer.loss, trainer.train_step], {\n", + " trainer.input_sequence: bx})\n", + "\n", + " 
loss_history.append(\n", + " pseudo_loss\n", + " )\n", + "\n", + " if (i+1) % REPORT_FREQ == 0:\n", + " clear_output(True)\n", + " current_scores = score(test_words)\n", + " editdist_history.append(current_scores.mean())\n", + " plt.figure(figsize=(8, 4))\n", + " plt.subplot(121)\n", + " plt.title('val score distribution')\n", + " plt.hist(current_scores, bins=20)\n", + " plt.subplot(122)\n", + " plt.title('val score / traning time')\n", + " plt.plot(editdist_history)\n", + " plt.grid()\n", + " plt.show()\n", + " print(\"J=%.3f, mean score=%.3f\" %\n", + " (np.mean(loss_history[-10:]), np.mean(editdist_history[-10:])))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "model.translate(\"EXAMPLE;\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Results" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "for word in train_words[:10]:\n", + " print(\"%s -> %s\" % (word, translate([word])[0]))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "scrolled": true + }, + "outputs": [], + "source": [ + "test_scores = []\n", + "for start_i in trange(0, len(test_words), 32):\n", + " batch_words = test_words[start_i:start_i+32]\n", + " batch_trans = translate(batch_words)\n", + " distances = list(map(get_distance, batch_words, batch_trans))\n", + " test_scores.extend(distances)\n", + "print(\"Supervised test score:\", np.mean(test_scores))\n", + "\n", + "# ^^ If you get Out Of Memory, please replace this with batched computation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 6: Make it actually work (5++ pts)\n", + "\n", + "\n", + "In this section we want you to finally __restart with EASY_MODE=False__ and experiment to find a good model/curriculum for that task.\n", + "\n", + "We 
recommend you to start with the following architecture\n", + "\n", + "```\n", + "encoder---decoder\n", + "\n", + " P(y|h)\n", + " ^\n", + " LSTM -> LSTM\n", + " ^ ^\n", + " biLSTM -> LSTM\n", + " ^ ^\n", + "input y_prev\n", + "```\n", + "\n", + "__Note:__ you can fit all 4 state tensors of both LSTMs into a single state - just assume that it contains, for example, [h0, c0, h1, c1] - pack it in encode and update in decode.\n", + "\n", + "\n", + "Here are some cool ideas on what you can do then.\n", + "\n", + "__General tips & tricks:__\n", + "* In some tensorflow versions and for some layers, it is required that each rnn/gru/lstm cell gets its own `tf.variable_scope(unique_name, reuse=False)`.\n", + " * Otherwise it will complain about wrong tensor sizes because it tries to reuse weights from one rnn to the other.\n", + "* You will likely need to adjust pre-training time for such a network.\n", + "* Supervised pre-training may benefit from clipping gradients somehow.\n", + "* SCST may indulge a higher learning rate in some cases and changing entropy regularizer over time.\n", + "* It's often useful to save pre-trained model parameters to not re-train it every time you want new policy gradient parameters. \n", + "* When leaving training for nighttime, try setting REPORT_FREQ to a larger value (e.g. 500) not to waste time on it.\n", + "\n", + "__Formal criteria:__\n", + "To get 5 points we want you to build an architecture that:\n", + "* _doesn't consist of single GRU_\n", + "* _works better_ than single GRU baseline. \n", + "* We also want you to provide either learning curve or trained model, preferably both\n", + "* ...
and write a brief report or experiment log describing what you did and how it fared.\n", + "\n", + "### Attention\n", + "There's more than one way to connect decoder to encoder\n", + " * __Vanilla:__ layer_i of encoder last state goes to layer_i of decoder initial state\n", + " * __Every tick:__ feed encoder last state _on every iteration_ of decoder.\n", + " * __Attention:__ allow decoder to \"peek\" at one (or several) positions of encoded sequence on every tick.\n", + " \n", + "The most effective (and cool) of those is, of course, attention.\n", + "You can read more about attention [in this nice blog post](https://distill.pub/2016/augmented-rnns/). The easiest way to begin is to use \"soft\" attention with \"additive\" or \"dot-product\" intermediate layers.\n", + "\n", + "__Tips__\n", + "* Model usually generalizes better if you no longer allow decoder to see final encoder state\n", + "* Once your model made it through several epochs, it is a good idea to visualize attention maps to understand what your model has actually learned\n", + "\n", + "* There's more stuff [here](https://github.com/yandexdataschool/Practical_RL/blob/master/week8_scst/bonus.ipynb)\n", + "* If you opted for hard attention, we recommend [gumbel-softmax](https://blog.evjang.com/2016/11/tutorial-categorical-variational.html) instead of sampling. Also please make sure soft attention works fine before you switch to hard.\n", + "\n", + "### UREX\n", + "* This is a way to improve exploration in policy-based settings. 
The main idea is that you find and upweight under-appreciated actions.\n", + "* Here's [video](https://www.youtube.com/watch?v=fZNyHoXgV7M&feature=youtu.be&t=3444)\n", + " and an [article](https://arxiv.org/abs/1611.09321).\n", + "* You may want to reduce batch size 'cuz UREX requires you to sample multiple times per source sentence.\n", + "* Once you got it working, try using experience replay with importance sampling instead of (in addition to) basic UREX.\n", + "\n", + "### Some additional ideas:\n", + "* (advanced deep learning) It may be a good idea to first train on small phrases and then adapt to larger ones (a.k.a. training curriculum).\n", + "* (advanced nlp) You may want to switch from raw utf8 to something like unicode or even syllables to make task easier.\n", + "* (advanced nlp) Since hebrew words are written __with vowels omitted__, you may want to use a small Hebrew vowel markup dataset at `he-pron-wiktionary.txt`.\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Bonus hints: [here](https://github.com/yandexdataschool/Practical_RL/blob/master/week8_scst/bonus.ipynb)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "assert not EASY_MODE, \"make sure you set EASY_MODE = False at the top of the notebook.\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "`[your report/log here or anywhere you please]`" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "__Contributions:__ This notebook is brought to you by\n", + "* Yandex [MT team](https://tech.yandex.com/translate/)\n", + "* Denis Mazur ([DeniskaMazur](https://github.com/DeniskaMazur)), Oleg Vasilev ([Omrigan](https://github.com/Omrigan/)), Dmitry Emelyanenko ([TixFeniks](https://github.com/tixfeniks)) and Fedor Ratnikov ([justheuristic](https://github.com/justheuristic/))\n", + "* Dataset is parsed 
from [Wiktionary](https://en.wiktionary.org), which is under CC-BY-SA and GFDL licenses.\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.2" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/week8_scst/practice_theano.ipynb b/week07_seq2seq/practice_theano.ipynb similarity index 99% rename from week8_scst/practice_theano.ipynb rename to week07_seq2seq/practice_theano.ipynb index 4efce5992..dc8a372b1 100644 --- a/week8_scst/practice_theano.ipynb +++ b/week07_seq2seq/practice_theano.ipynb @@ -4,7 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Week 8: Reinforcement Learning for seq2seq\n", + "## Reinforcement Learning for seq2seq\n", "\n", "This time we'll solve a problem of transribing hebrew words in english, also known as g2p (grapheme2phoneme)\n", "\n", @@ -1293,4 +1293,4 @@ }, "nbformat": 4, "nbformat_minor": 1 -} \ No newline at end of file +} diff --git a/week07_seq2seq/practice_torch.ipynb b/week07_seq2seq/practice_torch.ipynb new file mode 100644 index 000000000..cd6919cba --- /dev/null +++ b/week07_seq2seq/practice_torch.ipynb @@ -0,0 +1,877 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Reinforcement Learning for seq2seq\n", + "\n", + "This time we'll solve a problem of transribing hebrew words in english, also known as g2p (grapheme2phoneme)\n", + "\n", + " * word (sequence of letters in source language) -> translation (sequence of letters in target language)\n", + "\n", + "Unlike what most deep learning practicioners do, we won't only train it to maximize likelihood of correct translation, but also employ reinforcement learning to actually teach it to translate 
with as few errors as possible.\n", + "\n", + "\n", + "### About the task\n", + "\n", + "One notable property of Hebrew is that it's a consonant language. That is, there are no vowels in the written language. One could represent vowels with diacritics above consonants, but you don't expect people to do that in everyday life.\n", + "\n", + "Therefore, some hebrew characters will correspond to several english letters and others - to none, so we should use encoder-decoder architecture to figure that out.\n", + "\n", + "![img](https://esciencegroup.files.wordpress.com/2016/03/seq2seq.jpg)\n", + "_(img: esciencegroup.files.wordpress.com)_\n", + "\n", + "Encoder-decoder architectures are about converting anything to anything, including\n", + " * Machine translation and spoken dialogue systems\n", + " * [Image captioning](http://mscoco.org/dataset/#captions-challenge2015) and [image2latex](https://htmlpreview.github.io/?https://github.com/openai/requests-for-research/blob/master/_requests_for_research/im2latex.html) (convolutional encoder, recurrent decoder)\n", + " * Generating [images by captions](https://arxiv.org/abs/1511.02793) (recurrent encoder, convolutional decoder)\n", + " * Grapheme2phoneme - convert words to transcripts\n", + " \n", + "We chose simplified __Hebrew->English__ machine translation for words and short phrases (character-level), as it is relatively quick to train even without a gpu cluster."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# if running in colab, uncomment\n", + "# !wget https://github.com/yandexdataschool/Practical_RL/raw/c54b56049df85315490d1cec42b2fda0c96ad9b2/week07_scst/basic_model_torch.py -O basic_model_torch.py\n", + "# !wget https://github.com/yandexdataschool/Practical_RL/raw/c54b56049df85315490d1cec42b2fda0c96ad9b2/week07_scst/main_dataset.txt -O main_dataset.txt\n", + "# !wget https://github.com/yandexdataschool/Practical_RL/raw/c54b56049df85315490d1cec42b2fda0c96ad9b2/week07_scst/voc.py -O voc.py\n", + "# !pip3 install torch==1.0.0 nltk editdistance" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# If True, only translates phrases shorter than 20 characters (way easier).\n", + "EASY_MODE = True\n", + "# Useful for initial coding.\n", + "# If false, works with all phrases (please switch to this mode for homework assignment)\n", + "\n", + "# way we translate. Either \"he-to-en\" or \"en-to-he\"\n", + "MODE = \"he-to-en\"\n", + "# maximal length of _generated_ output, does not affect training\n", + "MAX_OUTPUT_LENGTH = 50 if not EASY_MODE else 20\n", + "REPORT_FREQ = 100 # how often to evaluate validation score" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 1: preprocessing\n", + "\n", + "We shall store dataset as a dictionary\n", + "`{ word1:[translation1,translation2,...], word2:[...],...}`.\n", + "\n", + "This is mostly due to the fact that many words have several correct translations.\n", + "\n", + "We have implemented this thing for you so that you can focus on more interesting parts.\n", + "\n", + "\n", + "__Attention python2 users!__ You may want to cast everything to unicode later during homework phase, just make sure you do it _everywhere_." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "from collections import defaultdict\n", + "word_to_translation = defaultdict(list) # our dictionary\n", + "\n", + "bos = '_'\n", + "eos = ';'\n", + "\n", + "with open(\"main_dataset.txt\", encoding=\"utf-8\") as fin:\n", + " for line in fin:\n", + "\n", + " en, he = line[:-1].lower().replace(bos, ' ').replace(eos,\n", + " ' ').split('\\t')\n", + " word, trans = (he, en) if MODE == 'he-to-en' else (en, he)\n", + "\n", + " if len(word) < 3:\n", + " continue\n", + " if EASY_MODE:\n", + " if max(len(word), len(trans)) > 20:\n", + " continue\n", + "\n", + " word_to_translation[word].append(trans)\n", + "\n", + "print(\"size = \", len(word_to_translation))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# get all unique lines in source language\n", + "all_words = np.array(list(word_to_translation.keys()))\n", + "# get all unique lines in translation language\n", + "all_translations = np.array(\n", + " [ts for all_ts in word_to_translation.values() for ts in all_ts])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### split the dataset\n", + "\n", + "We hold out 10% of all words to be used for validation.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "train_words, test_words = train_test_split(\n", + " all_words, test_size=0.1, random_state=42)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Building vocabularies\n", + "\n", + "We now need to build vocabularies that map strings to token ids and vice versa. We're gonna need these fellas when we feed training data into model or convert output matrices into english words." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from voc import Vocab\n", + "inp_voc = Vocab.from_lines(''.join(all_words), bos=bos, eos=eos, sep='')\n", + "out_voc = Vocab.from_lines(''.join(all_translations), bos=bos, eos=eos, sep='')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Here's how you cast lines into ids and backwards.\n", + "batch_lines = all_words[:5]\n", + "batch_ids = inp_voc.to_matrix(batch_lines)\n", + "batch_lines_restored = inp_voc.to_lines(batch_ids)\n", + "\n", + "print(\"lines\")\n", + "print(batch_lines)\n", + "print(\"\\nwords to ids (0 = bos, 1 = eos):\")\n", + "print(batch_ids)\n", + "print(\"\\nback to words\")\n", + "print(batch_lines_restored)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Draw word/translation length distributions to estimate the scope of the task." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "%matplotlib inline\n", + "plt.figure(figsize=[8, 4])\n", + "plt.subplot(1, 2, 1)\n", + "plt.title(\"words\")\n", + "plt.hist(list(map(len, all_words)), bins=20)\n", + "\n", + "plt.subplot(1, 2, 2)\n", + "plt.title('translations')\n", + "plt.hist(list(map(len, all_translations)), bins=20)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 3: deploy encoder-decoder (1 point)\n", + "\n", + "__assignment starts here__\n", + "\n", + "Our architecture consists of two main blocks:\n", + "* Encoder reads words character by character and outputs code vector (usually a function of last RNN state)\n", + "* Decoder takes that code vector and produces translations character by character\n", + "\n", + "Then it gets fed into a model that follows this simple interface:\n", + "* __`model(inp, out, **flags) -> logp`__ - takes symbolic int32
matrices of hebrew words and their english translations. Computes the log-probabilities of all possible english characters given english prefixes and hebrew word.\n", + "* __`model.translate(inp, **flags) -> out, logp`__ - takes symbolic int32 matrix of hebrew words, produces output tokens sampled from the model and output log-probabilities for all possible tokens at each tick.\n", + " * if given flag __`greedy=True`__, takes most likely next token at each iteration. Otherwise samples with next token probabilities predicted by model.\n", + "\n", + "That's all! It's as hard as it gets. With those two methods alone you can implement all kinds of prediction and training." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "import torch.nn as nn\n", + "import torch.nn.functional as F" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from basic_model_torch import BasicTranslationModel\n", + "model = BasicTranslationModel(inp_voc, out_voc,\n", + " emb_size=64, hid_size=256)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Play around with symbolic_translate and symbolic_score\n", + "inp = torch.tensor(np.random.randint(0, 10, [3, 5]), dtype=torch.int64)\n", + "out = torch.tensor(np.random.randint(0, 10, [3, 5]), dtype=torch.int64)\n", + "\n", + "# translate inp (with untrained model)\n", + "sampled_out, logp = model.translate(inp, greedy=False)\n", + "\n", + "print(\"Sample translations:\\n\", sampled_out)\n", + "print(\"Log-probabilities at each step:\\n\", logp)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# score logp(out | inp) with untrained input\n", + "logp = model(inp, out)\n", + "print(\"Symbolic_score output:\\n\", logp)\n", + "\n", + "print(\"Log-probabilities of output
tokens:\\n\",\n", + " torch.gather(logp, dim=2, index=out[:, :, None]))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def translate(lines, max_len=MAX_OUTPUT_LENGTH):\n", + " \"\"\"\n", + " You are given a list of input lines. \n", + " Make your neural network translate them.\n", + " :return: a list of output lines\n", + " \"\"\"\n", + " # Convert lines to a matrix of indices\n", + " lines_ix = inp_voc.to_matrix(lines)\n", + " lines_ix = torch.tensor(lines_ix, dtype=torch.int64)\n", + "\n", + " # Compute translations in form of indices\n", + " trans_ix = \n", + "\n", + " # Convert translations back into strings\n", + " return out_voc.to_lines(trans_ix.data.numpy())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Sample inputs:\", all_words[:3])\n", + "print(\"Dummy translations:\", translate(all_words[:3]))\n", + "trans = translate(all_words[:3])\n", + "\n", + "assert translate(all_words[:3]) == translate(\n", + " all_words[:3]), \"make sure translation is deterministic (use greedy=True and disable any noise layers)\"\n", + "assert type(translate(all_words[:3])) is list and (type(translate(all_words[:1])[0]) is str or type(\n", + " translate(all_words[:1])[0]) is unicode), \"translate(lines) must return a sequence of strings!\"\n", + "# note: if translation freezes, make sure you used max_len parameter\n", + "print(\"Tests passed!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Scoring function\n", + "\n", + "LogLikelihood is a poor estimator of model performance.\n", + "* If we predict zero probability once, it shouldn't ruin entire model.\n", + "* It is enough to learn just one translation if there are several correct ones.\n", + "* What matters is how many mistakes model's gonna make when it translates!\n", + "\n", + "Therefore, we will use minimal Levenshtein distance. 
It measures how many characters do we need to add/remove/replace from model translation to make it perfect. Alternatively, one could use character-level BLEU/RougeL or other similar metrics.\n", + "\n", + "The catch here is that Levenshtein distance is not differentiable: it isn't even continuous. We can't train our neural network to maximize it by gradient descent." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import editdistance # !pip install editdistance\n", + "\n", + "\n", + "def get_distance(word, trans):\n", + " \"\"\"\n", + " A function that takes word and predicted translation\n", + " and evaluates (Levenshtein's) edit distance to closest correct translation\n", + " \"\"\"\n", + " references = word_to_translation[word]\n", + " assert len(references) != 0, \"wrong/unknown word\"\n", + " return min(editdistance.eval(trans, ref) for ref in references)\n", + "\n", + "\n", + "def score(words, bsize=100):\n", + " \"\"\"a function that computes levenshtein distance for bsize random samples\"\"\"\n", + " assert isinstance(words, np.ndarray)\n", + "\n", + " batch_words = np.random.choice(words, size=bsize, replace=False)\n", + " batch_trans = translate(batch_words)\n", + "\n", + " distances = list(map(get_distance, batch_words, batch_trans))\n", + "\n", + " return np.array(distances, dtype='float32')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# should be around 5-50 and decrease rapidly after training :)\n", + "[score(test_words, 10).mean() for _ in range(5)]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 2: Supervised pre-training (2 points)\n", + "\n", + "Here we define a function that trains our model through maximizing log-likelihood a.k.a. minimizing crossentropy." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import random\n", + "\n", + "\n", + "def sample_batch(words, word_to_translation, batch_size):\n", + " \"\"\"\n", + " sample random batch of words and random correct translation for each word\n", + " example usage:\n", + " batch_x,batch_y = sample_batch(train_words, word_to_translations,10)\n", + " \"\"\"\n", + " # choose words\n", + " batch_words = np.random.choice(words, size=batch_size)\n", + "\n", + " # choose translations\n", + " batch_trans_candidates = list(map(word_to_translation.get, batch_words))\n", + " batch_trans = list(map(random.choice, batch_trans_candidates))\n", + " return batch_words, batch_trans" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "bx, by = sample_batch(train_words, word_to_translation, batch_size=3)\n", + "print(\"Source:\")\n", + "print(bx)\n", + "print(\"Target:\")\n", + "print(by)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from basic_model_torch import infer_length, infer_mask, to_one_hot\n", + "\n", + "\n", + "def compute_loss_on_batch(input_sequence, reference_answers):\n", + " \"\"\" Compute crossentropy loss given a batch of sources and translations \"\"\"\n", + " input_sequence = torch.tensor(inp_voc.to_matrix(input_sequence), dtype=torch.int64)\n", + " reference_answers = torch.tensor(out_voc.to_matrix(reference_answers), dtype=torch.int64)\n", + "\n", + " # Compute log-probabilities of all possible tokens at each step. 
Use model interface.\n", + " logprobs_seq = # YOUR CODE\n", + "\n", + " # compute elementwise crossentropy as negative log-probabilities of reference_answers.\n", + " crossentropy = - \\\n", + " torch.sum(logprobs_seq *\n", + " to_one_hot(reference_answers, len(out_voc)), dim=-1)\n", + " assert crossentropy.dim(\n", + " ) == 2, \"please return elementwise crossentropy, don't compute mean just yet\"\n", + "\n", + " # average with mask\n", + " mask = infer_mask(reference_answers, out_voc.eos_ix)\n", + " loss = torch.sum(crossentropy * mask) / torch.sum(mask)\n", + "\n", + " return loss" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# test it\n", + "loss = compute_loss_on_batch(*sample_batch(train_words, word_to_translation, 3))\n", + "print('loss = ', loss)\n", + "\n", + "assert loss.item() > 0.0\n", + "loss.backward()\n", + "for w in model.parameters():\n", + " assert w.grad is not None and torch.max(torch.abs(w.grad)).item() != 0, \\\n", + " \"Loss is not differentiable w.r.t. a weight with shape %s. Check comput_loss_on_batch.\" % (\n", + " w.size(),)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Actually train the model\n", + "\n", + "Minibatches and stuff..." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from IPython.display import clear_output\n", + "from tqdm import tqdm, trange # or use tqdm_notebook,tnrange\n", + "\n", + "loss_history = []\n", + "editdist_history = []\n", + "entropy_history = []\n", + "opt = torch.optim.Adam(model.parameters())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "\n", + "for i in trange(25000):\n", + " loss = compute_loss_on_batch(*sample_batch(train_words, word_to_translation, 32))\n", + "\n", + " # train with backprop\n", + " loss.backward()\n", + " opt.step()\n", + " opt.zero_grad()\n", + "\n", + " loss_history.append(loss.item())\n", + "\n", + " if (i+1) % REPORT_FREQ == 0:\n", + " clear_output(True)\n", + " current_scores = score(test_words)\n", + " editdist_history.append(current_scores.mean())\n", + " print(\"llh=%.3f, mean score=%.3f\" %\n", + " (np.mean(loss_history[-10:]), np.mean(editdist_history[-10:])))\n", + " plt.figure(figsize=(12, 4))\n", + " plt.subplot(131)\n", + " plt.title('train loss / traning time')\n", + " plt.plot(loss_history)\n", + " plt.grid()\n", + " plt.subplot(132)\n", + " plt.title('val score distribution')\n", + " plt.hist(current_scores, bins=20)\n", + " plt.subplot(133)\n", + " plt.title('val score / traning time (lower is better)')\n", + " plt.plot(editdist_history)\n", + " plt.grid()\n", + " plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "__How to interpret the plots:__\n", + "\n", + "* __Train loss__ - that's your model's crossentropy over minibatches. It should go down steadily. Most importantly, it shouldn't be NaN :)\n", + "* __Val score distribution__ - distribution of translation edit distance (score) within batch. It should move to the left over time.\n", + "* __Val score / training time__ - it's your current mean edit distance. 
This plot is much noisier than the loss, but make sure it goes below 8 by 2500 steps. \n", + "\n", + "If it doesn't, first try to re-create both model and opt. You may have changed its weights too much while debugging. If that doesn't help, it's debugging time." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for word in train_words[:10]:\n", + " print(\"%s -> %s\" % (word, translate([word])[0]))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "test_scores = []\n", + "for start_i in trange(0, len(test_words), 32):\n", + " batch_words = test_words[start_i:start_i+32]\n", + " batch_trans = translate(batch_words)\n", + " distances = list(map(get_distance, batch_words, batch_trans))\n", + " test_scores.extend(distances)\n", + "\n", + "print(\"Supervised test score:\", np.mean(test_scores))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Self-critical policy gradient (2 points)\n", + "\n", + "In this section you'll implement an algorithm called self-critical sequence training (here's an [article](https://arxiv.org/abs/1612.00563)).\n", + "\n", + "The algorithm is a vanilla policy gradient with a special baseline. \n", + "\n", + "$$ \\nabla J = E_{x \\sim p(s)} E_{y \\sim \\pi(y|x)} \\nabla log \\pi(y|x) \\cdot (R(x,y) - b(x)) $$\n", + "\n", + "Here reward R(x,y) is a __negative levenshtein distance__ (since we minimize it). 
The baseline __b(x)__ represents how well the model fares on word __x__.\n", + "\n", + "In practice, this means that we compute the baseline as the score of the greedy translation, $b(x) = R(x,y_{greedy}(x)) $.\n", + "\n", + "![img](https://github.com/yandexdataschool/Practical_RL/raw/master/yet_another_week/_resource/scheme.png)\n", + "\n", + "\n", + "Luckily, we already obtained the required outputs: `model.greedy_translations, model.greedy_mask` and we only need to compute the Levenshtein-based reward using the `compute_reward` function defined below.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def compute_reward(input_sequence, translations):\n", + " \"\"\" computes sample-wise reward given token ids for inputs and translations \"\"\"\n", + " distances = list(map(get_distance,\n", + " inp_voc.to_lines(input_sequence.data.numpy()),\n", + " out_voc.to_lines(translations.data.numpy())))\n", + " # use negative levenshtein distance so that larger reward means better policy\n", + " return - torch.tensor(distances, dtype=torch.int64)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def scst_objective_on_batch(input_sequence, max_len=MAX_OUTPUT_LENGTH):\n", + " \"\"\" Compute pseudo-loss for policy gradient given a batch of sources \"\"\"\n", + " input_sequence = torch.tensor(inp_voc.to_matrix(input_sequence), dtype=torch.int64)\n", + "\n", + " # use model to __sample__ symbolic translations given input_sequence\n", + " sample_translations, sample_logp = # YOUR CODE\n", + " # use model to __greedy__ symbolic translations given input_sequence\n", + " greedy_translations, greedy_logp = # YOUR CODE\n", + "\n", + " # compute rewards and advantage\n", + " rewards = compute_reward(input_sequence, sample_translations)\n", + " baseline = \n", + "\n", + " # compute advantage using rewards and baseline\n", + " advantage = # YOUR CODE\n", + "\n", + " # compute log_pi(a_t|s_t), shape = 
[batch, seq_length]\n", + " logp_sample = # YOUR CODE\n", + " \n", + " # ^-- hint: look at how crossentropy is implemented in supervised learning loss above\n", + " # mind the sign - this one should not be multiplied by -1 :)\n", + "\n", + " # policy gradient pseudo-loss. Gradient of J is exactly policy gradient.\n", + " J = logp_sample * advantage[:, None]\n", + "\n", + " assert J.dim() == 2, \"please return elementwise objective, don't compute mean just yet\"\n", + "\n", + " # average with mask\n", + " mask = infer_mask(sample_translations, out_voc.eos_ix)\n", + " loss = - torch.sum(J * mask) / torch.sum(mask)\n", + "\n", + " # regularize with negative entropy. Don't forget the sign!\n", + " # note: for entropy you need probabilities for all tokens (sample_logp), not just logp_sample\n", + " entropy = \n", + " # hint: you can get sample probabilities from sample_logp using math :)\n", + "\n", + " assert entropy.dim(\n", + " ) == 2, \"please make sure elementwise entropy is of shape [batch,time]\"\n", + "\n", + " reg = - 0.01 * torch.sum(entropy * mask) / torch.sum(mask)\n", + "\n", + " return loss + reg, torch.sum(entropy * mask) / torch.sum(mask)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Policy gradient training\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "entropy_history = [np.nan] * len(loss_history)\n", + "opt = torch.optim.Adam(model.parameters(), lr=1e-5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for i in trange(100000):\n", + " loss, ent = scst_objective_on_batch(\n", + " sample_batch(train_words, word_to_translation, 32)[0]) # [0] = only source sentence\n", + "\n", + " # train with backprop\n", + " loss.backward()\n", + " opt.step()\n", + " opt.zero_grad()\n", + "\n", + " loss_history.append(loss.item())\n", + " entropy_history.append(ent.item())\n", + "\n", + " if (i+1) % 
REPORT_FREQ == 0:\n", + " clear_output(True)\n", + " current_scores = score(test_words)\n", + " editdist_history.append(current_scores.mean())\n", + " plt.figure(figsize=(12, 4))\n", + " plt.subplot(131)\n", + " plt.title('val score distribution')\n", + " plt.hist(current_scores, bins=20)\n", + " plt.subplot(132)\n", + " plt.title('val score / traning time')\n", + " plt.plot(editdist_history)\n", + " plt.grid()\n", + " plt.subplot(133)\n", + " plt.title('policy entropy / traning time')\n", + " plt.plot(entropy_history)\n", + " plt.grid()\n", + " plt.show()\n", + " print(\"J=%.3f, mean score=%.3f\" %\n", + " (np.mean(loss_history[-10:]), np.mean(editdist_history[-10:])))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "__Debugging tips:__\n", + "\n", + "\n", + " * As usual, don't expect improvements right away, but in general the model should be able to show some positive changes by 5k steps.\n", + " * Entropy is a good indicator of many problems. \n", + " * If it reaches zero, you may need a stronger entropy regularizer.\n", + " * If it changes rapidly from time to time, you may need gradient clipping.\n", + " * If it oscillates up and down in an erratic manner... it's perfectly okay for entropy to do so. But it should decrease at the end.\n", + " \n", + " * We don't show loss_history because it's uninformative for pseudo-losses in policy gradient. However, if something goes wrong you can check it to see if everything isn't a constant zero." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Results" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for word in train_words[:10]:\n", + " print(\"%s -> %s\" % (word, translate([word])[0]))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "test_scores = []\n", + "for start_i in trange(0, len(test_words), 32):\n", + " batch_words = test_words[start_i:start_i+32]\n", + " batch_trans = translate(batch_words)\n", + " distances = list(map(get_distance, batch_words, batch_trans))\n", + " test_scores.extend(distances)\n", + "print(\"Supervised test score:\", np.mean(test_scores))\n", + "\n", + "# ^^ If you get Out Of MemoryError, please replace this with batched computation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 6: Make it actually work (5++ pts)\n", + "\n", + "In this section we want you to finally __restart with EASY_MODE=False__ and experiment to find a good model/curriculum for that task.\n", + "\n", + "We recommend that you start with the following architecture\n", + "\n", + "```\n", + "encoder---decoder\n", + "\n", + " P(y|h)\n", + " ^\n", + " LSTM -> LSTM\n", + " ^ ^\n", + " biLSTM -> LSTM\n", + " ^ ^\n", + "input y_prev\n", + "```\n", + "\n", + "__Note:__ you can fit all 4 state tensors of both LSTMs into a single state - just assume that it contains, for example, [h0, c0, h1, c1] - pack it in encode and update in decode.\n", + "\n", + "\n", + "Here are some cool ideas on what you can do then.\n", + "\n", + "__General tips & tricks:__\n", + "* You will likely need to adjust pre-training time for such a network.\n", + "* Supervised pre-training may benefit from clipping gradients somehow.\n", + "* SCST may indulge a higher learning rate in some cases and changing entropy regularizer over time.\n", + "* It's often useful to save pre-trained model 
parameters to not re-train it every time you want new policy gradient parameters. \n", + "* When leaving training for nighttime, try setting REPORT_FREQ to a larger value (e.g. 500) not to waste time on it.\n", + "\n", + "__Formal criteria:__\n", + "To get 5 points we want you to build an architecture that:\n", + "* _doesn't consist of single GRU_\n", + "* _works better_ than single GRU baseline. \n", + "* We also want you to provide either learning curve or trained model, preferably both\n", + "* ... and write a brief report or experiment log describing what you did and how it fared.\n", + "\n", + "### Attention\n", + "There's more than one way to connect decoder to encoder\n", + " * __Vanilla:__ layer_i of encoder last state goes to layer_i of decoder initial state\n", + " * __Every tick:__ feed encoder last state _on every iteration_ of decoder.\n", + " * __Attention:__ allow decoder to \"peek\" at one (or several) positions of encoded sequence on every tick.\n", + " \n", + "The most effective (and cool) of those is, of course, attention.\n", + "You can read more about attention [in this nice blog post](https://distill.pub/2016/augmented-rnns/). The easiest way to begin is to use \"soft\" attention with \"additive\" or \"dot-product\" intermediate layers.\n", + "\n", + "__Tips__\n", + "* Model usually generalizes better if you no longer allow decoder to see final encoder state\n", + "* Once your model made it through several epochs, it is a good idea to visualize attention maps to understand what your model has actually learned\n", + "\n", + "* There's more stuff [here](https://github.com/yandexdataschool/Practical_RL/blob/master/week8_scst/bonus.ipynb)\n", + "* If you opted for hard attention, we recommend [gumbel-softmax](https://blog.evjang.com/2016/11/tutorial-categorical-variational.html) instead of sampling. 
Also please make sure soft attention works fine before you switch to hard.\n", + "\n", + "### UREX\n", + "* This is a way to improve exploration in policy-based settings. The main idea is that you find and upweight under-appreciated actions.\n", + "* Here's [video](https://www.youtube.com/watch?v=fZNyHoXgV7M&feature=youtu.be&t=3444)\n", + " and an [article](https://arxiv.org/abs/1611.09321).\n", + "* You may want to reduce batch size 'cuz UREX requires you to sample multiple times per source sentence.\n", + "* Once you got it working, try using experience replay with importance sampling instead of (in addition to) basic UREX.\n", + "\n", + "### Some additional ideas:\n", + "* (advanced deep learning) It may be a good idea to first train on small phrases and then adapt to larger ones (a.k.a. training curriculum).\n", + "* (advanced nlp) You may want to switch from raw utf8 to something like unicode or even syllables to make task easier.\n", + "* (advanced nlp) Since hebrew words are written __with vowels omitted__, you may want to use a small Hebrew vowel markup dataset at `he-pron-wiktionary.txt`.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "assert not EASY_MODE, \"make sure you set EASY_MODE = False at the top of the notebook.\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`[your report/log here or anywhere you please]`" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "__Contributions:__ This notebook is brought to you by\n", + "* Yandex [MT team](https://tech.yandex.com/translate/)\n", + "* Denis Mazur ([DeniskaMazur](https://github.com/DeniskaMazur)), Oleg Vasilev ([Omrigan](https://github.com/Omrigan/)), Dmitry Emelyanenko ([TixFeniks](https://github.com/tixfeniks)) and Fedor Ratnikov ([justheuristic](https://github.com/justheuristic/))\n", + "* Dataset is parsed from [Wiktionary](https://en.wiktionary.org), which is under 
CC-BY-SA and GFDL licenses.\n" + ] + } + ], + "metadata": { + "language_info": { + "name": "python", + "pygments_lexer": "ipython3" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/week8_scst/scheme.svg b/week07_seq2seq/scheme.svg similarity index 100% rename from week8_scst/scheme.svg rename to week07_seq2seq/scheme.svg diff --git a/week8_scst/voc.py b/week07_seq2seq/voc.py similarity index 100% rename from week8_scst/voc.py rename to week07_seq2seq/voc.py diff --git a/week7_pomdp/README.md b/week08_pomdp/README.md similarity index 85% rename from week7_pomdp/README.md rename to week08_pomdp/README.md index 0b045c0c0..e0013ca20 100644 --- a/week7_pomdp/README.md +++ b/week08_pomdp/README.md @@ -1,5 +1,5 @@ # Materials -[lecture slides](https://yadi.sk/d/RGx8BUCr3Gq6DC) +[lecture slides](https://yadi.sk/i/3-4q71hhwtFxuw) _Links on all articles mentioned during the lecture could be found in "References" at the very end of the presentation slides. All other interesing links which contribute to the topic of POMDP are presented below_ @@ -23,5 +23,6 @@ _Links on all articles mentioned during the lecture could be found in "Reference # Practice +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/yandexdataschool/Practical_RL/blob/spring19/week08_pomdp/practice_pytorch.ipynb) The assignment is platform and framewerk independent, so choose the framework that suits you best, but pay attention on how many you will need to implement youself in case of nonstandart ones. 
diff --git a/week7_pomdp/atari_util.py b/week08_pomdp/atari_util.py similarity index 100% rename from week7_pomdp/atari_util.py rename to week08_pomdp/atari_util.py diff --git a/week7_pomdp/env_pool.py b/week08_pomdp/env_pool.py similarity index 100% rename from week7_pomdp/env_pool.py rename to week08_pomdp/env_pool.py diff --git a/week7_pomdp/homework_common_part2.ipynb b/week08_pomdp/homework_common_part2.ipynb similarity index 100% rename from week7_pomdp/homework_common_part2.ipynb rename to week08_pomdp/homework_common_part2.ipynb diff --git a/week7_pomdp/img1.jpg b/week08_pomdp/img1.jpg similarity index 100% rename from week7_pomdp/img1.jpg rename to week08_pomdp/img1.jpg diff --git a/week7_pomdp/img2.jpg b/week08_pomdp/img2.jpg similarity index 100% rename from week7_pomdp/img2.jpg rename to week08_pomdp/img2.jpg diff --git a/week7_pomdp/img3.jpg b/week08_pomdp/img3.jpg similarity index 100% rename from week7_pomdp/img3.jpg rename to week08_pomdp/img3.jpg diff --git a/week08_pomdp/practice_pytorch.ipynb b/week08_pomdp/practice_pytorch.ipynb new file mode 100644 index 000000000..bb74cf8da --- /dev/null +++ b/week08_pomdp/practice_pytorch.ipynb @@ -0,0 +1,699 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from __future__ import print_function, division\n", + "from IPython.core import display\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline\n", + "import numpy as np\n", + "\n", + "# if you're running in colab\n", + "# !wget https://raw.githubusercontent.com/yandexdataschool/Practical_RL/0ccb0673965dd650d9b284e1ec90c2bfd82c8a94/week08_pomdp/atari_util.py\n", + "# !wget https://raw.githubusercontent.com/yandexdataschool/Practical_RL/0ccb0673965dd650d9b284e1ec90c2bfd82c8a94/week08_pomdp/env_pool.py\n", + "\n", + "# If you are running on a server, launch xvfb to record game videos\n", + "# Please make sure you have xvfb installed\n", + "import os\n", + 
"if type(os.environ.get(\"DISPLAY\")) is not str or len(os.environ.get(\"DISPLAY\")) == 0:\n", + " !bash ../xvfb start\n", + " %env DISPLAY = : 1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Kung-Fu, recurrent style\n", + "\n", + "In this notebook we'll once again train RL agent for for atari [KungFuMaster](https://gym.openai.com/envs/KungFuMaster-v0/), this time using recurrent neural networks.\n", + "\n", + "![http://www.retroland.com/wp-content/uploads/2011/07/King-Fu-Master.jpg](http://www.retroland.com/wp-content/uploads/2011/07/King-Fu-Master.jpg)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33mWARN: gym.spaces.Box autodetected dtype as . Please provide explicit dtype.\u001b[0m\n", + "Observation shape: (1, 42, 42)\n", + "Num actions: 14\n", + "Action names: ['NOOP', 'UP', 'RIGHT', 'LEFT', 'DOWN', 'DOWNRIGHT', 'DOWNLEFT', 'RIGHTFIRE', 'LEFTFIRE', 'DOWNFIRE', 'UPRIGHTFIRE', 'UPLEFTFIRE', 'DOWNRIGHTFIRE', 'DOWNLEFTFIRE']\n" + ] + } + ], + "source": [ + "import gym\n", + "from atari_util import PreprocessAtari\n", + "\n", + "\n", + "def make_env():\n", + " env = gym.make(\"KungFuMasterDeterministic-v0\")\n", + " env = PreprocessAtari(env, height=42, width=42,\n", + " crop=lambda img: img[60:-30, 15:],\n", + " color=False, n_frames=1)\n", + " return env\n", + "\n", + "\n", + "env = make_env()\n", + "\n", + "obs_shape = env.observation_space.shape\n", + "n_actions = env.action_space.n\n", + "\n", + "print(\"Observation shape:\", obs_shape)\n", + "print(\"Num actions:\", n_actions)\n", + "print(\"Action names:\", env.env.env.get_action_meanings())" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/jheuristic/anaconda3/lib/python3.6/site-packages/scipy/misc/pilutil.py:482: FutureWarning: Conversion 
of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.\n", + " if issubdtype(ts, int):\n", + "/home/jheuristic/anaconda3/lib/python3.6/site-packages/scipy/misc/pilutil.py:485: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n", + " elif issubdtype(type(size), float):\n" + ] + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAANEAAAEICAYAAADBfBG8AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAFmVJREFUeJzt3XvUHHV9x/H3hyBoASHcEgi3wAGO4CVGxFTKRbyFVAXa\nqsFWUWkJlVA80FMIKFLUAirQKBUImnIRQSqi1BNQCnhpEeRiCJcIJIAQckMIBAVpE7/9Y2Zhstl9\nnnl2dp+Z2f28ztmzszOzu99J5ru/3/xmnu8oIjCzzm1QdgBmdeckMivISWRWkJPIrCAnkVlBTiKz\ngpxEfUjSTpJ+J2lM2bEMAidRAZKmS7pd0u8lrUynPyVJZcYVEY9HxKYRsbbMOAaFk6hDkk4EZgNf\nBsYD44BjgP2AjUoMzUZbRPgxwgewOfB74C+HWe/PgV8Bq4EngNMzy3YBAvhEumwVSRK+FVgAPAuc\n3/R5nwQWpuv+CNi5zfc2PnvD9PVPgC8AtwK/A/4T2Aq4Io3tDmCXzPtnpzGtBu4C9s8sew1waRrD\nQuCfgCWZ5dsD1wBPAY8C/1D2/1fP94eyA6jjA5gKrGnspEOsdxDwBpIW/43ACuCwdFljR78QeDXw\nHuAPwPeBbYEJwErgwHT9w4BFwOuADYHPALe2+d5WSbQI2C39AXgAeAh4V/pZlwH/nnn/36RJtiFw\nIrAceHW67Czgp8BYYIc04ZekyzZIk+40ktZ4V+AR4L1l/5/1dH8oO4A6PtKdbHnTvFvT1uNF4IA2\n7/tX4Lx0urGjT8gsfxr4cOb1NcCn0+nrgaMyyzYAXqBFa9QmiU7NLD8HuD7z+v3A/CG2dxXwpnR6\nnaQA/jaTRG8DHm9676xsgvbjw8dEnXka2FrSho0ZEfH2iNgiXbYBgKS3SbpF0lOSniPprm3d9Fkr\nMtMvtni9aTq9MzBb0rOSngWeAUTSYuWR93uQdKKkhZKeS79r80zc25N09Rqy0zsD2zdiTN97Csnx\nYt9yEnXmF8BLwKHDrPdt4Dpgx4jYnKTr1unI3RPAjIjYIvN4TUTc2uHntSRpf+Ak4EPA2PSH4Tle\niXsZSTeuYcemGB9tinGziJjWzRirxknUgYh4Fvhn4OuS/krSppI2kDQJ2CSz6mbAMxHxB0n7Ah8p\n8LUXArMk7Q0gaXNJHyzwee1sRnK89xSwoaTTgNdmll+dxjFW0gRgZmbZL4HVkk6S9BpJYyS9XtJb\nexBnZTiJOhQRXwJOIBmdWknSPbqI5Fe80Tp8CjhD0vMkB9tXF/i+a4GzgaskrQbuAw7peAPa+xHJ\n8ddDwG9IBjuyXbYzgCUkI2//BXyXpFUmkvNS7wcmpct/C3yDpDvYt5Qe/Jl1RNLfA9Mj4sCyYymL\nWyIbEUnbSdov7b7uSTIEfm3ZcZVpw+FXMVvHRiTd1okkQ/pXAV8vNaKS
9aw7J2kqyZnvMcA3IuKs\nnnyRWcl6kkTp1cMPAe8mOQi9AzgiIh7o+peZlaxX3bl9gUUR8QiApKtIzqm0TCJJHt2wKvptRGwz\n3Eq9GliYwLrDoktoOrMu6WhJd0q6s0cxmBX1mzwr9aolanVWfp3WJiLmAHPALZHVW69aoiWseznI\nDsDSHn2XWal6lUR3ALtLmihpI2A6yTVkZn2nJ925iFgjaSbJJSRjgLkRcX8vvsusbJW47MfHRFZR\nd0XEPsOt5Mt+zAqqxWU/xx9/fNkh2ACaPXt2rvXcEpkVVIuWaLTMmDEDgIsuuqjtsqzm9ZrXGely\nqye3RKlWSdJq2UUXXfTyzp+dn03ATpZbfTmJUm4VrFNOohyyCTZjxowhu3btllv/chKZFeSBhZyG\nGyRoXset0eBwS5RDnoRw0gyuWlz2MxonW0c6PJ1nHQ9x19vs2bNzXfbjJDJrI28SuTtnVpCTyKwg\nj85VyNhZY9ebt+rMVSVEYiPhlqgiGgm06sxVLz+y8626nERmBXWcRJJ2TG9gtVDS/ZKOT+efLulJ\nSfPTR1/fm8asyDHRGuDEiLhb0mbAXZJuTJedFxFfKR6eWfV1nEQRsYzkrmlExPOSFpL/1odmfaMr\nx0SSdgHeDNyezpopaYGkuZJaHhm7Auq6sgMJjUd2vlVX4SFuSZvyyl2uV0u6APg8ScXTz5PcqfqT\nze9zBdT1OWHqqVBLJOlVJAl0RUR8DyAiVkTE2oj4I3AxSXF7s75VZHROwDeBhRFxbmb+dpnVDie5\nt6hZ3yrSndsP+Chwr6T56bxTgCPSu2gH8BjgvxGwvlZkdO6/aX33h3mdh2NV5D/hGNrAXjt374NH\nrPP6DXteOaLl3fiMPN9RthkzZrSsMeFEeoUv+7EhOVmG5ySy3IYqbjnInESWm4tOtuYksiE5YYbn\nGgs2rEEdnctbY2FgR+csv0FJmk65O2dWkJPIrCAnkVlBA3NM1HyPoVZn4lstzz5nNc9rfNasWQ/3\nahO64swzdy87hL4zUC3RcAfIeQ6gszfpyvse628DlUTDnfNoXt5q/Tzr2GAZqCRqbkVaLW+ebl6/\n1fvdGg22gUqiZp3c1a75Pa2Ol2yw+IoFszZG7YoFSY8BzwNrgTURsY+kLYHvALuQ/HXrhyLCVTis\nL3WrO/eOiJiUydqTgZsiYnfgpvS1WV/q1XmiQ4GD0ulLgZ8AJ/Xou0ZkJOeDWs1v9Z6sQ37+89HZ\nkA5dv//+ZYfQd7qRRAH8OD2uuSitJzcurZBKRCyTtG0Xvqdrit4m0iyrG925/SJiMnAIcKykA/K8\nqcwKqCM9X9TpOjYYCidRRCxNn1cC15IUa1zRqD+XPq9s8b45EbFPntGPbhvplQvtXvv8kEHxCqib\npHeEQNImwHtIijVeBxyZrnYk8IMi39Ntrc71DLXcbCiFzhNJ2pWk9YHk+OrbEfFFSVsBVwM7AY8D\nH4yIZ4b4HJ8nssoZlfNEEfEI8KYW858G3lnks83qohZXLJiVpH9qLEz+wuSyQ7ABdPdn7s61Xi2S\naNsdKnWayWwdtUiiDa4e6IvNreJqkUTzd5g//EpmJalFEo3faXzZIdgAWsrSXOu5n2RWUC1aIg8s\nWJX5PJFZe7nOE7k7Z1aQk8isoFocE90w2Vcs2Oibene+KxbcEpkV5CQyK8hJZFZQLY6JJs3zFQtW\ngpy7nVsis4I6bokk7UlS5bRhV+A0YAvg74Cn0vmnRMS8jiMEPvLx04ZdZ9aJxwFw5jlfK/JVhTiG\nfosh327bcRJFxIPAJABJY4AnSeotfAI4LyK+0ulnd2LtSWuTiRKvEHIMgxlDt46J3gksjojfSOrS\nR47MmLPHJBPnlPL1jmGAY+hWEk0Hrsy8ninpY8CdwImjUcx+0H79HEN1Yig8sCBpI+ADwH+ksy4A\ndiPp6i2jzW9Btyugjjl7zCu/PiVx
DIMZQzdaokOAuyNiBUDjGUDSxcAPW70prdk9J12v8FXcg/br\n5xiqE0M3kugIMl05Sds1itkDh5NURO25QeuHO4bqxFAoiST9CfBuIFtz90uSJpHcLeKxpmU9M2i/\nfo6hOjEUrYD6ArBV07yPFoqoQ4P26+cYqhNDLS77yWPQfv0cQ3Vi6JskGrRfP8dQnRj6JokG7dfP\nMVQnhr5JokH79XMM1Ymhb5Jo0H79HEN1YuibJBq0Xz/HUJ0Y+iaJBu3XzzFUJ4ZaFG9cvnzaaIVi\n9rLx4+e5eKPZaKhFd+6Wyb61ilWXWyKzgpxEZgU5icwKqsUx0TvunlR2CDaIxvtOeWajohYtUZ66\nc2bdl6/unFsis4JyJZGkuZJWSrovM29LSTdKejh9HpvOl6SvSlokaYEk31zI+lrelugSYGrTvJOB\nmyJid+Cm9DUk1X92Tx9Hk5TQMutbuZIoIn4GPNM0+1Dg0nT6UuCwzPzLInEbsIWk7boRrFkVFTkm\nGtcojZU+N66XnQA8kVlvSTpvHd0u3mhWll6MzrUqxr3eVdrdLt5oVpYiLdGKRjctfV6Zzl8C7JhZ\nbwcg31krsxoqkkTXAUem00cCP8jM/1g6SjcFeC5TEdWs7+Tqzkm6EjgI2FrSEuBzwFnA1ZKOAh4H\nPpiuPg+YBiwCXiC5X5FZ38qVRBFxRJtF72yxbgDHFgnKrE58xYJZQU4is4KcRGYFOYnMCnISmRXk\nJDIryElkVpCTyKwgJ5FZQU4is4KcRGYFOYnMCnISmRXkJDIryElkVpCTyKwgJ5FZQcMmUZvqp1+W\n9Ou0wum1krZI5+8i6UVJ89PHhb0M3qwK8rREl7B+9dMbgddHxBuBh4BZmWWLI2JS+jimO2GaVdew\nSdSq+mlE/Dgi1qQvbyMpi2U2kLpxTPRJ4PrM64mSfiXpp5L2b/cmV0C1flGoAqqkU4E1wBXprGXA\nThHxtKS3AN+XtHdErG5+bzcroN58w5SXpw+eeluRj6p1DEOpenx11nFLJOlI4H3AX6dlsoiIlyLi\n6XT6LmAxsEc3Am0nu3OUpQoxjETd4q26jpJI0lTgJOADEfFCZv42ksak07uS3F7lkW4EmlcVdpAq\nxJBVtXj6zbDduTbVT2cBGwM3SgK4LR2JOwA4Q9IaYC1wTEQ035KlJxpdlDJ3mCrE0E6VY6u7YZOo\nTfXTb7ZZ9xrgmqJBdaKxc5TZ369CDK0cPPU2J08P1eLGx0M5eOptfO3tZ7z8+rhbBzOG4Sz41rSX\npz/9Ld9Iupt82Y9ZQX2RRMfdeto6z4Maw1AarY9boe6rfXcOYI97FnAc5e4cZcVw/rmvBWDmCeud\nimux3lc4P72X+3DrW361b4n2uGfBOs+DFEMjgZqnh1ovz/o2MrVPoqwyE6lKMTScf+5rnSyjoLbd\nuarsrGXG0eiSNRJluIRpXt+6oy9aoofe9MayQyg1huzxzcwTVrd83ZxAPibqntq2RNZacyvjVqf3\n+qIlstYtS3OrNNS61rnaJ9Ggd+WympOjMbCQTSYnUPfVPomyB/Zl7cxViGEo2WSy7qt9Etm6nCij\nr/YDC1X45a9CDFl77bXXeleS33zDlMpdXd4v3BKZFVTbJFo790DWzj1wnddlxVF2DMNxK9Rbte/O\nAex2/NiyQ6hEDA0HT71t3fND5z7gY6Ue6rQC6umSnsxUOp2WWTZL0iJJD0p6b68Cb6UKO3IVYmjm\nBOqtTiugApyXqXQ6D0DSXsB0YO/0PV9vFC7ptsWzV7F49ip2O34si2ev6sVX5I6j7BisXHlqLPxM\n0i45P+9Q4KqIeAl4VNIiYF/gFx1HmEMVduIqxGDlKDKwMDMtaD9XUqMPMwF4IrPOknTeerpVAbWx\n45bZjapCDFaeTpPoAmA3YBJJ1dNz0vlqsW7L6qYRMSci9omIfTqMYT1V2ImrEIMvOh1dHSVRRKyI\n
iLUR8UfgYpIuGyQtz46ZVXcAlhYL0YrwoELvdVoBdbvMy8OBxsjddcB0SRtLmkhSAfWXxUIcWhV+\n+asQg5Wn0wqoB0maRNJVewyYARAR90u6GniApND9sRGxtjehWyvuyo2+rlZATdf/IvDFIkHlUZVf\n/6rEYeWp7WU/rVRhiLkKMdjoUnpXlHKDGOb+RENd97Xf8icB+J/xLUfSR0UVYsiqak3wurn5hil3\n5Rk9rsW1cydMbn/r19vnfRZIduS3Tfv8aIVUuRiybr4heR7q382G1/h3HE7tu3NV2GmrEEMr7/uX\n+WWHMBBq0Z0zK0n/dOd+eMqkskOwAZS3Ja99d86sbE4is4KcRGYFeWDBrD0PLJgV4YEFs1FSi+7c\n8uXThlps1hPjx8/rn+7cLZN95t2qy905s4KcRGYFOYnMCuq0Aup3MtVPH5M0P52/i6QXM8su7GXw\nZlWQZ2DhEuB84LLGjIj4cGNa0jnAc5n1F0dEV0/svONunyeyEozPV6iqUAVUSQI+BBw8gtBGbPz4\neb38eLNCig5x7w+siIiHM/MmSvoVsBr4TET8vNUbJR0NHJ3nS67cfvuCYZqN3BFLu9QSDfc9wJWZ\n18uAnSLiaUlvAb4vae+IWK+CYETMAeaAr52zeus4iSRtCPwF8JbGvLSQ/Uvp9F2SFgN7AIXqbeeV\nPXZqnKBtNc8xlB/DaMTR7vu6/W9RZIj7XcCvI2JJY4akbRq3UpG0K0kF1EeKhTgyrf5RRvuKB8dQ\nrRh6HUeeIe4rSW6NsqekJZKOShdNZ92uHMABwAJJ9wDfBY6JiGe6Fq1ZBXVaAZWI+HiLedcA1xQP\ny6w+fMWCWUF9mUTZ/m5ZV4A7hurE0Os4avGnECNRhasbHMNgxVCLP8rzyVYrwxFLl+b6o7xaJJFZ\nSfrnL1uT619H5vI//WcAPvqLz3U7GMdQwxg6i2NmrrX6cmDBbDQ5icwKchKZFVSLY6Lx229Vynu7\nxTFUJwbIH8fyfH8J4ZbIrKhatETbjB/ZHbrPPfuznHDS5QBcfulnOeGk0b+TnWOoTgydxjGwLdEV\nl5zFuHGbvPx63LhNuOKSsxzDAMfQ6zjq0RJtu8WI39P8j9TJZxTlGKoTQy/jqMUVCyO9lfy3Lzlj\nndcf+fhpIw+qIMdQnRg6jePmG6b0z2U/I00is27Im0R9d0xkNtry/Hn4jpJukbRQ0v2Sjk/nbynp\nRkkPp89j0/mS9FVJiyQtkDS51xthVqY8LdEa4MSIeB0wBThW0l7AycBNEbE7cFP6GuAQkgIlu5PU\nlbug61GbVciwSRQRyyLi7nT6eWAhMAE4FLg0Xe1S4LB0+lDgskjcBmwhabuuR25WESMa4k7LCb8Z\nuB0YFxHLIEk0Sdumq00Ansi8bUk6b1nTZ+WugHrzDVNGEqbZqMqdRJI2Jank8+mIWJ2U4W69aot5\n642+uQKq9Ytco3OSXkWSQFdExPfS2Ssa3bT0eWU6fwmwY+btOwA5L6Awq588o3MCvgksjIhzM4uu\nA45Mp48EfpCZ/7F0lG4K8Fyj22fWlyJiyAfwZyTdsQXA/PQxDdiKZFTu4fR5y3R9Af8GLAbuBfbJ\n8R3hhx8VfNw53L4bEfW4YsGsJL5iwWw0OInMCnISmRXkJDIrqCp/lPdb4Pfpc7/Ymv7Znn7aFsi/\nPTvn+bBKjM4BSLozz0hIXfTT9vTTtkD3t8fdObOCnERmBVUpieaUHUCX9dP29NO2QJe3pzLHRGZ1\nVaWWyKyWnERmBZWeRJKmSnowLWxy8vDvqB5Jj0m6V9J8SXem81oWcqkiSXMlrZR0X2ZebQvRtNme\n0yU9mf4fzZc0LbNsVro9D0p674i/MM+l3r16AGNI/mRiV2Aj4B5grzJj6nA7HgO2bpr3JeDkdPpk\n4Oyy4xwi/gOAycB9w8VP8mcw15P8ycsU4Pay48+5PacD/9hi3b
3S/W5jYGK6P44ZyfeV3RLtCyyK\niEci4n+Bq0gKnfSDdoVcKicifgY80zS7toVo2mxPO4cCV0XESxHxKLCIZL/MrewkalfUpG4C+LGk\nu9ICLNBUyAXYtu27q6ld/HX+P5uZdkHnZrrXhben7CTKVdSkBvaLiMkkNfeOlXRA2QH1UF3/zy4A\ndgMmkVSeOiedX3h7yk6ivihqEhFL0+eVwLUk3YF2hVzqoq8K0UTEiohYGxF/BC7mlS5b4e0pO4nu\nAHaXNFHSRsB0kkIntSFpE0mbNaaB9wD30b6QS130VSGapuO2w0n+jyDZnumSNpY0kaRy7y9H9OEV\nGEmZBjxEMipyatnxdBD/riSjO/cA9ze2gTaFXKr4AK4k6eL8H8kv81Ht4qeDQjQV2Z7L03gXpImz\nXWb9U9PteRA4ZKTf58t+zAoquztnVntOIrOCnERmBTmJzApyEpkV5CQyK8hJZFbQ/wPTMFRqoBLr\nRQAAAABJRU5ErkJggg==\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAP8AAAEICAYAAACQ6CLfAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAFz5JREFUeJzt3X20HHV9x/H35z5xQxIeEkIMJAUfooItpi1G6sORolhE\nFGzViihpy7HtsfRYH9qqfcJWrZ6K2HP06EFFUquAjzVVasmJIIVaHsSIQagBBBMTEhACuXm6T9/+\nMXPL3jtzc/fe3Z3dze/zOmfP3f3N7M539u53Z+a3M7+vIgIzS09PuwMws/Zw8pslyslvlignv1mi\nnPxmiXLymyXKyZ8wSSdKCkl97Y5lNiRdIOm6dsfR7Zz8TSTpBkmPSTqswmWGpGdUtbyqlX1BRcQX\nIuLl7YzrUODkbxJJJwIvBgJ4dVuD6SDK+HPWgfxPaZ4Lgf8BrgTW1E6QtFjSv0t6QtJtkt4v6aaa\n6c+WtF7So5L+V9Lra6ZdKekTkr4labekWyQ9PZ92Yz7bDyUNSfrdqUFJ6pH015IelLRT0r9IOnLK\nbH8gaZuk7ZLeWfPc1ZJuz+PeIemjNdNOk/TfknZJ+qGk02um3SDpA5JuBvYC75V0+5S43i5pXX7/\nlZJ+kC9ni6RLamadWMdd+Tr+hqTfm/L+vSB/Xx/P/75gSiz/IOnm/P27TtIxU9+nJEWEb024AfcC\nbwV+HRgBltZMuzq/HQ6cDGwBbsqnzc8f/z7QB/wa8AjwnHz6lcCjwOp8+heAq2teO4BnHCSuP8hj\nexqwAPga8Pl82on586/K4/gV4GHgZfn07wFvzu8vAE7L7x8P/AI4m2wDcmb+eEk+/QbgZ8Bz8piP\nBHYDK2viug14Q37/9HzZPcApwA7gvCkx9tU89/dq3r9FwGPAm/NlnZ8/XlwTy33AM4F5+eMPtfvz\n0gk3b/mbQNKLgBOAL0XE98k+bG/Mp/UCvwP8XUTsjYgfA2trnn4O8EBEfC4iRiPiDuCrwGtr5vla\nRNwaEaNkyb9qFuFdAHw0Iu6PiCHgPcAbpnTyvS8i9kTEj4DPkSUQZF9iz5B0TEQMRcT/5O1vAq6N\niGsjYjwi1gO3k30ZTLgyIu7K1+lx4BsTrytpJfBsYB1ARNwQET/KX+tOsi+jl9S5fq8ENkfE5/Nl\nXQXcA7yqZp7PRcRPImIf8CVm9/4dspz8zbEGuC4iHskff5End/2XkG2RttTMX3v/BOD5+e7zLkm7\nyBL2KTXzPFRzfy/ZVrhexwEP1jx+MI9n6TTxPJg/B+Aisi3mPfnu9Dk1Mb9uSswvApZN85qQvScT\nXypvBP4tIvYCSHq+pOslPSzpceCPgXp3za
eu38Q6HF/zuJH375DVVT/xdCJJ84DXA72SJj5khwFH\nSXousAkYBZYDP8mnr6h5iS3AdyPizBaFuI0sWSf8Uh7PjjymiXjuqZm+DSAiNgPn5x12vw18RdLi\nPObPR8RbDrLcqZeLXgccI2kV2ZfA22umfRH4OPCKiNgv6WM8mfwzXXY6df0m1uHbMzwved7yN+48\nYIzsWH5VfjsJ+C/gwogYIzvOvkTS4ZKeTdY5OOGbwDMlvVlSf357nqST6lz+DrLj+elcBbxd0lMl\nLQA+CFyTH0JM+Js8tueQ9T1cAyDpTZKWRMQ4sCufdwz4V+BVkn5LUq+kQUmnS1rONPLlfQX4J7Lj\n9PU1kxcCj+aJv5r8kCn3MDB+kHW8luz9e6OkvrzT82Sy99UOwsnfuDVkx5Q/i4iHJm5kW7IL8mPr\ni8k6vR4CPk+WkAcAImI38HLgDWRbsYeAD5PtPdTjEmBtvvv9+pLpV+TLvBH4KbAf+NMp83yXrFNw\nA/CRiJg4geYs4C5JQ8A/k3XQ7Y+ILcC5wHvJknML8OfM/Hn6IvAy4MtTvnzeCvy9pN3A35IdlwOQ\nHxp8ALg5X8fTal8wIn5B1m/yTrJOx78Azqk5BLNpKO8RtQpJ+jDwlIhYM+PMZi3iLX8F8t/xT8nO\nd9Fqso60r7c7LkubO/yqsZBsV/84YCdwKdlPX2Zt491+s0R5t98sUQ3t9ks6i6wXuBf4TER86GDz\n9w/Mj8HBoxtZpJkdxP79jzEyvEf1zDvn5M9PW/0E2XndW4HbJK3LT18tNTh4NKeuvniuizSzGdx+\n68frnreR3f7VwL35OePDZBeunNvA65lZhRpJ/uOZfP72ViafTw2ApD/MLwu9fWRkTwOLM7NmaiT5\ny44rCj8dRMTlEXFqRJza3z+/gcWZWTM10uG3lckXqCwnvyBkOhraR//NmxpYpJkdjA7sq3veRrb8\ntwEr8wtGBsjOTV/XwOuZWYXmvOWPiFFJFwP/SfZT3xURcVfTIjOzlmrod/6IuJbskkoz6zI+w88s\nUZVe2BNHzGP/i0+pcpFmSYn/uqHueb3lN0uUk98sUU5+s0Q5+c0S5eQ3S1Slvf3Di4It549Oaovx\n4iUCkkcXMoio/7Mxm3k7SbPjHr6r/ud6y2+WKCe/WaKc/GaJcvKbJaracftDxNjkDo4YKX7/lHUC\ntpsGxgttU9cFgLK2duor7wBST7E9hjtsW1ASI/3F/wNADPcW2zqsv6/0MzRa/Lw09PmfxXM77L9t\nZlVx8pslyslvlignv1miGq3Y8wCwGxgDRiPi1IPOPyL6fj657LzGGomgQmVfk2UdSh3WyTStsvUp\n70vrLNNtrro19ibHrZH6O/ya0dv/mxHxSBNex8wq5N1+s0Q1mvwBXCfp+5L+sGyG2oo9Y3tcsces\nUzS62//CiNgm6VhgvaR7IuLG2hki4nLgcoDB5Su65YjY7JDX6NDd2/K/OyV9nax4543Tzg/ElBOx\nVNbh0YFfEVPjhmk6Kzss9rK4gdJ9vtL/RTuV9F2NT7M+PZ0We4my2MtOYqzqMzTn3X5J8yUtnLgP\nvBxwLS6zLtHIln8p8HVJE6/zxYj4dlOiMrOWa6Rc1/3Ac5sYi5lVyD/1mSWq0kt6RUknWYd1kE2n\nGzr3ykx7BmUXdJCVvb893XJGaInS2Nv4GfKW3yxRTn6zRDn5zRLl5DdLlJPfLFGV9vYHEFO+bnx6\nb2tNe3pvyamzGi22tdUhdnpv6WeoG0/vNbPu5uQ3S5ST3yxRTn6zRFVbsacvGF1cR69Sh3WaAaWd\nTx0Z51SzKf7SaevTaPGjblifJscY/S7RbWYzcPKbJcrJb5YoJ79Zombs8JN0BXAOsDMifjlvWwRc\nA5wIPAC8PiIem3Fp40J7p47g2Wm9MmZdrMkluq8EzprS9m5gQ0SsBDbkj82si8yY/Pk4/I9OaT4X\nWJvfXw
uc1+S4zKzF5nrMvzQitgPkf4+dbsZJFXuGXLHHrFO0vMMvIi6PiFMj4tTeBfNbvTgzq9Nc\nz/DbIWlZRGyXtAzYWdezAnpGpjY2ehqXmf2/WfSfz3XLvw5Yk99fA3xjjq9jZm0yY/JLugr4HvAs\nSVslXQR8CDhT0mbgzPyxmXWRGXf7I+L8aSa9tMmxmFmFfIafWaIqv6R3bHGhx8/MmqXPl/Sa2Qyc\n/GaJcvKbJcrJb5aoakt0D4uBrQNVLtIsKRpu7iW9ZnYIcvKbJcrJb5YoJ79Zopz8Zoly8pslyslv\nlignv1minPxmiapnJJ8rJO2UtKmm7RJJP5e0Mb+d3dowzazZ5lq0A+CyiFiV365tblhm1mpzLdph\nZl2ukWP+iyXdmR8WHN20iMysEnNN/k8CTwdWAduBS6ebcVLFnj2u2GPWKeaU/BGxIyLGImIc+DSw\n+iDzPlmxZ74r9ph1ijklf16lZ8JrgE3TzWtmnWnGwTzyoh2nA8dI2gr8HXC6pFVkxYEeAP6ohTGa\nWQvMtWjHZ1sQi5lVyGf4mSXKyW+WKCe/WaKc/GaJcvKbJcrJb5YoJ79Zopz8Zoly8pslyslvlign\nv1minPxmiXLymyXKyW+WKCe/WaKc/GaJcvKbJaqeij0rJF0v6W5Jd0l6W96+SNJ6SZvzvx6+26yL\n1LPlHwXeGREnAacBfyLpZODdwIaIWAlsyB+bWZeop2LP9oi4I7+/G7gbOB44F1ibz7YWOK9VQZpZ\n883qmF/SicCvArcASyNiO2RfEMCx0zzHRTvMOlDdyS9pAfBV4M8i4ol6n+eiHWadqa7kl9RPlvhf\niIiv5c07Jop35H93tiZEM2uFenr7RTZO/90R8dGaSeuANfn9NcA3mh+embXKjEU7gBcCbwZ+JGlj\n3vZe4EPAlyRdBPwMeF1rQjSzVqinYs9NgKaZ/NLmhmNmVfEZfmaJcvKbJaqeY/6m0Tj07Z18BDE2\nLwrzxXQHGW3Ut68Y1PhAcb7x3uL6dIve4fre+LGB7l1He5K3/GaJcvKbJcrJb5YoJ79Zoirt8Ot/\naA/L//G/J7Vte9cLCvMNH9neDqWBJ4odX8ddekuhbdebVhfbVrYkpKZTyVu8Yv1Qoa1vZ/Eyjvsv\nPK7Q1s0dnanylt8sUU5+s0Q5+c0S5eQ3S5ST3yxR1Z7ee9hh9J749Mlt41VGUJ+ekWJb39Ilhbbx\n3gqCaRGNF3/RGD7qsEJb7xNl5zCXvGAXvxep8pbfLFFOfrNEOfnNEtVIxZ5LJP1c0sb8dnbrwzWz\nZqmnw2+iYs8dkhYC35e0Pp92WUR8pN6F7V/Sx0/eMnl4/94DJaeFtvlM0QOLigFsfttTC20aK3ly\nGzswy8ZGmHZwhP3FeR84r2TeviMKTQMPdd7/zGavnjH8tgMTxTl2S5qo2GNmXayRij0AF0u6U9IV\n0xXqdMUes87USMWeTwJPB1aR7RlcWvY8V+wx60xzrtgTETsiYiwixoFPA8XrW82sY814zD9dxR5J\nyyYKdQKvATbN9Fo9IzBv5+ROpZGFnTeAp8aKAQw+XJxvZGGxrZ3XtZ+y+r5C29ED+0rn/e59xYEH\nLlv95ULbkt7i9fwXrntroa1vdweOumoH1UjFnvMlrSLr530A+KOWRGhmLdFIxZ5rmx+OmVXFZ/iZ\nJcrJb5aoSi/pBegZnfy47NLSaPNgkBqts63sDL82Xtq6cePTCm1/dWZ55fS3/MZ3C233DC8rtG3a\nt7zQ1lNnZR/rbN7ymyXKyW+WKCe/WaKc/GaJqrbDTzDeP7mp3Wfz1Wtq3NB5sfc/Xvwu/8Tml5TO\ne8ep1xTaHhrbW2j74I/PKrT1HJhDcNZxvOU3S5ST3yxRTn6zRDn5zRLl5DdLVKW9/dELw0dOOXW3\nAyv2jA0WTy8em1cyYxcMWrnnzkWl7c/ce2GhbXS4+HHof7BYxccODd7y
myXKyW+WKCe/WaLqqdgz\nKOlWST/MK/a8L29/qqRbJG2WdI2kknKuZtap6unwOwCcERFD+Si+N0n6D+AdZBV7rpb0KeAisuG8\np6XBMfpPmjwg5MjdxYow7e4EHDmi2JO38ITHC2177j2y0Na7r7PO+e0ZLY/nWct2FNru2XFsoS1w\nh9+hasYtf2SG8of9+S2AM4Cv5O1rgfNaEqGZtUS94/b35iP37gTWA/cBuyJiYnybrUxTwqu2Ys/o\nE8ULR8ysPepK/rw4xypgOVlxjpPKZpvmuf9fsafviMPnHqmZNdWsevsjYhdwA3AacJSkiT6D5cC2\n5oZmZq1UT8WeJcBIROySNA94GfBh4HrgtcDVwBqgfKTIGhFibKzzf10s67QbGy+Ju9Mu6C8x3l9+\nGuJLj7mn0HbvL44ptI00PSLrFPX09i8D1krqJdtT+FJEfFPSj4GrJb0f+AFZSS8z6xL1VOy5k6ws\n99T2+3FxTrOu1fn74GbWEk5+s0RVeknvQN8oJyx+dFLb/b0LCvOpAy/zXXrE7kLbTw8rxt67v7O+\nT0fnl3f4/crglkLbnkeKP8X6nO1DV2d9Us2sMk5+s0Q5+c0S5eQ3S1SlHX6j4z3sHJrSSdaBnXtj\nC4tB9ajYcdYz0gVn+C0oqS0O3HWgeB2W9rWxvrhVzlt+s0Q5+c0S5eQ3S5ST3yxR1RbtGOpj+ObF\nk9oGy/uj2mrgiWLH17atKwpthw9XEU1j5u0sP0fvW5eeVmg74szitqCsNLl1Lo3VP6+3/GaJcvKb\nJcrJb5YoJ79Zohqp2HOlpJ9K2pjfVrU+XDNrlkYq9gD8eUR85SDPnaRnBOZv74K61ocQjZe/30PP\nOrrQNvho8bTm6On8U5jtST2z+PWsnjH8Aiir2GNmXWxOFXsi4pZ80gck3SnpMkmlRd0mVezZv6dJ\nYZtZo+ZUsUfSLwPvAZ4NPA9YBPzlNM99smLP4PwmhW1mjZprxZ6zImJ7XsTzAPA5PIy3WVeZc8Ue\nScsiYrskkVXo3TTTa4VgzCNCVqy8w2500NfuH4pmU0SqkYo938m/GARsBP54DrGaWZs0UrHnjJZE\nZGaV8Bl+Zoly8pslqtLr+cf7Ye9SnzFm1iqzGX/BW36zRDn5zRLl5DdLlJPfLFGVdvj1jMDhO3xB\noFmr9IzMYt7WhWFmnczJb5YoJ79Zopz8ZomqtMNPMbuKImY2OyWV5KflLb9Zopz8Zoly8pslyslv\nlqi6kz8fvvsHkr6ZP36qpFskbZZ0jSSPzmfWRWbT2/824G7giPzxh4HLIuJqSZ8CLgI+ebAX6BkO\nFm6ZUtS+pCLMeG/5Nf8Du4YLbRorVplphfGB4oCXIwuLF0+X9bb2DRXPuewZruZnj+gt/34fPqr4\nXd0zVgxeo8W2vqHi/6Eqw0eXlodAJbFHX3HdB3YdKD55mqpGzTY2v/h5GRsoxjhdlaR6Pv+9++vP\nh3qLdiwHXgl8Jn8s4AxgolTXWrIRfM2sS9S72/8x4C+Aia+VxcCuiJioDLYVOL7sibUVe0ZGXLHH\nrFPUU6X3HGBnRHy/trlk1tJ9p9qKPf39rthj1inqOeZ/IfBqSWcDg2TH/B8DjpLUl2/9lwPbWhem\nmTVbPeP2v4esLh+STgfeFREXSPoy8FrgamAN8I2ZXmt0vtixenKHzciC4g7D+EB5B8xxNxU7e/qf\nqKbjbOj4YgfZL1aVxFnStPjO4tu8YGs1nWYjC8sr82x7cXGnr/dAcYdu4PFi27G3Nx7XXP38JeUj\nVPbtK8ZZ9tlasaG+Ts1WePTk4ud3aEVJR2VveTxP+d5goe2wxybX5J5NSfVGfuf/S+Adku4l6wP4\nbAOvZWYVm9WFPRFxA1mhTiLiflyc06xr+Qw/s0Q5+c0SpYjqBtSU9DDwYP7wGOCRyhbeWofSuoDX\np9MdbH1OiIgl9bxIpck/acHS7RFx
alsW3mSH0rqA16fTNWt9vNtvlignv1mi2pn8l7dx2c12KK0L\neH06XVPWp23H/GbWXt7tN0uUk98sUZUnv6SzJP2vpHslvbvq5TdK0hWSdkraVNO2SNL6fEiz9ZKO\nbmeMsyFphaTrJd0t6S5Jb8vbu26dJA1KulXSD/N1eV/e3tVDzrVqCL1Kk19SL/AJ4BXAycD5kk6u\nMoYmuBI4a0rbu4ENEbES2JA/7hajwDsj4iTgNOBP8v9JN67TAeCMiHgusAo4S9JpPDnk3ErgMbIh\n57rJxBB6E5qyPlVv+VcD90bE/RExTHY58LkVx9CQiLgReHRK87lkQ5lBlw1pFhHbI+KO/P5usg/Z\n8XThOkVmKH/Yn9+CLh5yrpVD6FWd/McDW2oeTzv8V5dZGhHbIUsm4Ng2xzMnkk4EfhW4hS5dp3wX\neSOwE1gP3EedQ851qDkPoTeTqpO/7uG/rFqSFgBfBf4sIp5odzxzFRFjEbGKbHSp1cBJZbNVG9Xc\nNDqE3kwqLdRJ9i21oubxoTL81w5JyyJiu6RlZFudriGpnyzxvxARX8ubu3qdImKXpBvI+jG6dci5\nlg6hV/WW/zZgZd5bOQC8AVhXcQytsI5sKDOoc0izTpEfQ34WuDsiPlozqevWSdISSUfl9+cBLyPr\nw7iebMg56JJ1gWwIvYhYHhEnkuXKdyLiApq1PhFR6Q04G/gJ2bHYX1W9/CbEfxWwHRgh25O5iOw4\nbAOwOf+7qN1xzmJ9XkS223gnsDG/nd2N6wScAvwgX5dNwN/m7U8DbgXuBb4MHNbuWOewbqcD32zm\n+vj0XrNE+Qw/s0Q5+c0S5eQ3S5ST3yxRTn6zRDn5zRLl5DdL1P8B8FPBd33wU/8AAAAASUVORK5C\nYII=\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "s = env.reset()\n", + "for _ in range(100):\n", + " s, _, _, _ = env.step(env.action_space.sample())\n", + "\n", + "plt.title('Game image')\n", + "plt.imshow(env.render('rgb_array'))\n", + "plt.show()\n", + "\n", + "plt.title('Agent observation')\n", + "plt.imshow(s.reshape([42, 42]))\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### POMDP setting\n", + "\n", + "The atari game we're working with is actually a POMDP: your agent needs to know timing at which enemies spawn and move, but cannot do so unless it has some memory. \n", + "\n", + "Let's design another agent that has a recurrent neural net memory to solve this. 
Here's a sketch.\n", + "\n", + "![img](img1.jpg)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import torch\n", + "import torch.nn as nn\n", + "import torch.nn.functional as F\n", + "\n", + "# a special module that converts [batch, channel, w, h] to [batch, units]\n", + "\n", + "\n", + "class Flatten(nn.Module):\n", + " def forward(self, input):\n", + " return input.view(input.size(0), -1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "class SimpleRecurrentAgent(nn.Module):\n", + " def __init__(self, obs_shape, n_actions, reuse=False):\n", + " \"\"\"A simple actor-critic agent\"\"\"\n", + " super(self.__class__, self).__init__()\n", + "\n", + " self.conv0 = nn.Conv2d(1, 32, kernel_size=(3, 3), stride=(2, 2))\n", + " self.conv1 = nn.Conv2d(32, 32, kernel_size=(3, 3), stride=(2, 2))\n", + " self.conv2 = nn.Conv2d(32, 32, kernel_size=(3, 3), stride=(2, 2))\n", + " self.flatten = Flatten()\n", + "\n", + " self.hid = nn.Linear(512, 128)\n", + " self.rnn = nn.LSTMCell(128, 128)\n", + "\n", + " self.logits = nn.Linear(128, n_actions)\n", + " self.state_value = nn.Linear(128, 1)\n", + "\n", + " def forward(self, prev_state, obs_t):\n", + " \"\"\"\n", + " Takes agent's previous step and observation, \n", + " returns next state and whatever it needs to learn (tf tensors)\n", + " \"\"\"\n", + "\n", + " # YOUR CODE: apply the whole neural net for one step here.\n", + " # See docs on self.rnn(...)\n", + " # the recurrent cell should take the last feedforward dense layer as input\n", + " \n", + "\n", + " new_state = \n", + " logits = \n", + " state_value = \n", + "\n", + " return new_state, (logits, state_value)\n", + "\n", + " def get_initial_state(self, batch_size):\n", + " \"\"\"Return a list of agent memory states at game start. 
Each state is a np array of shape [batch_size, ...]\"\"\"\n", + " return torch.zeros((batch_size, 128)), torch.zeros((batch_size, 128))\n", + "\n", + " def sample_actions(self, agent_outputs):\n", + " \"\"\"pick actions given numeric agent outputs (np arrays)\"\"\"\n", + " logits, state_values = agent_outputs\n", + " probs = F.softmax(logits)\n", + " return torch.multinomial(probs, 1)[:, 0].data.numpy()\n", + "\n", + " def step(self, prev_state, obs_t):\n", + " \"\"\" like forward, but obs_t is a numpy array \"\"\"\n", + " obs_t = torch.tensor(np.asarray(obs_t), dtype=torch.float32)\n", + " (h, c), (l, s) = self.forward(prev_state, obs_t)\n", + " return (h.detach(), c.detach()), (l.detach(), s.detach())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "n_parallel_games = 5\n", + "gamma = 0.99\n", + "\n", + "agent = SimpleRecurrentAgent(obs_shape, n_actions)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "state = [env.reset()]\n", + "_, (logits, value) = agent.step(agent.get_initial_state(1), state)\n", + "print(\"action logits:\\n\", logits)\n", + "print(\"state values:\\n\", value)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Let's play!\n", + "Let's build a function that measures agent's average reward." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def evaluate(agent, env, n_games=1):\n", + " \"\"\"Plays an entire game start to end, returns session rewards.\"\"\"\n", + "\n", + " game_rewards = []\n", + " for _ in range(n_games):\n", + " # initial observation and memory\n", + " observation = env.reset()\n", + " prev_memories = agent.get_initial_state(1)\n", + "\n", + " total_reward = 0\n", + " while True:\n", + " new_memories, readouts = agent.step(\n", + " prev_memories, observation[None, ...])\n", + " action = agent.sample_actions(readouts)\n", + "\n", + " observation, reward, done, info = env.step(action[0])\n", + "\n", + " total_reward += reward\n", + " prev_memories = new_memories\n", + " if done:\n", + " break\n", + "\n", + " game_rewards.append(total_reward)\n", + " return game_rewards" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "env_monitor = gym.wrappers.Monitor(env, directory=\"kungfu_videos\", force=True)\n", + "rw = evaluate(agent, env_monitor, n_games=3,)\n", + "env_monitor.close()\n", + "print(rw)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# show video\n", + "from IPython.display import HTML\n", + "import os\n", + "\n", + "video_names = list(filter(lambda s: s.endswith(\n", + " \".mp4\"), os.listdir(\"./kungfu_videos/\")))\n", + "\n", + "HTML(\"\"\"\n", + "\n", + "\"\"\".format(\"./kungfu_videos/\"+video_names[-1])) # this may or may not be _last_ video. Try other indices" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Training on parallel games\n", + "\n", + "We introduce a class called EnvPool - it's a tool that handles multiple environments for you. 
Here's how it works:\n", + "![img](img2.jpg)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from env_pool import EnvPool\n", + "pool = EnvPool(agent, make_env, n_parallel_games)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We are going to train our agent on a thing called __rollouts:__\n", + "![img](img3.jpg)\n", + "\n", + "A rollout is just a sequence of T observations, actions and rewards that the agent took consecutively.\n", + "* First __s0__ is not necessarily the initial state of the environment\n", + "* Final state is not necessarily terminal\n", + "* We sample several parallel rollouts for efficiency" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# for each of n_parallel_games, take 10 steps\n", + "rollout_obs, rollout_actions, rollout_rewards, rollout_mask = pool.interact(10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "print(\"Actions shape:\", rollout_actions.shape)\n", + "print(\"Rewards shape:\", rollout_rewards.shape)\n", + "print(\"Mask shape:\", rollout_mask.shape)\n", + "print(\"Observations shape: \", rollout_obs.shape)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Actor-critic objective\n", + "\n", + "Here we define a loss function that uses the rollout above to train an advantage actor-critic agent.\n", + "\n", + "\n", + "Our loss consists of three components:\n", + "\n", + "* __The policy \"loss\"__\n", + " $$ \\hat J = {1 \\over T} \\cdot \\sum_t { \\log \\pi(a_t | s_t) } \\cdot A_{const}(s,a) $$\n", + " * This function has no meaning in and of itself, but it was built such that\n", + " * $ \\nabla \\hat J = {1 \\over T} \\cdot \\sum_t { \\nabla \\log \\pi(a_t | s_t) } \\cdot A(s,a) \\approx \\nabla E_{s, a \\sim \\pi} R(s,a) 
$\n", + " * Therefore if we __maximize__ J_hat with gradient ascent (equivalently, minimize $-\\hat J$ with gradient descent) we will maximize expected reward\n", + " \n", + " \n", + "* __The value \"loss\"__\n", + " $$ L_{td} = {1 \\over T} \\cdot \\sum_t { [r + \\gamma \\cdot V_{const}(s_{t+1}) - V(s_t)] ^ 2 }$$\n", + " * Ye Olde TD_loss from q-learning and alike\n", + " * If we minimize this loss, V(s) will converge to $V_\\pi(s) = E_{a \\sim \\pi(a | s)} R(s,a) $\n", + "\n", + "\n", + "* __Entropy Regularizer__\n", + " $$ H = - {1 \\over T} \\sum_t \\sum_a {\\pi(a|s_t) \\cdot \\log \\pi (a|s_t)}$$\n", + " * If we __maximize__ entropy we discourage the agent from assigning zero probability to actions\n", + " prematurely (a.k.a. exploration)\n", + " \n", + " \n", + "So we minimize a linear combination of $L_{td}$, $- \\hat J$, $-H$\n", + " \n", + "```\n", + "\n", + "```\n", + "\n", + "```\n", + "\n", + "```\n", + "\n", + "```\n", + "\n", + "```\n", + "\n", + "\n", + "__One more thing:__ since we train on T-step rollouts, we can use N-step formula for advantage for free:\n", + " * At the last step, $A(s_t,a_t) = r(s_t, a_t) + \\gamma \\cdot V(s_{t+1}) - V(s_t) $\n", + " * One step earlier, $A(s_t,a_t) = r(s_t, a_t) + \\gamma \\cdot r(s_{t+1}, a_{t+1}) + \\gamma ^ 2 \\cdot V(s_{t+2}) - V(s_t) $\n", + " * Et cetera, et cetera. This way the agent starts training much faster since its estimate of A(s,a) depends less on its (imperfect) value function and more on actual rewards. There's also a [nice generalization](https://arxiv.org/abs/1506.02438) of this.\n", + "\n", + "\n", + "__Note:__ it's also a good idea to scale rollout_len up to learn longer sequences. You may wish to set it to >=20 or to start at 10 and then scale up as time passes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def to_one_hot(y, n_dims=None):\n", + " \"\"\" Take an integer tensor and convert it to 1-hot matrix. 
\"\"\"\n", + " y_tensor = y.to(dtype=torch.int64).view(-1, 1)\n", + " n_dims = n_dims if n_dims is not None else int(torch.max(y_tensor)) + 1\n", + " y_one_hot = torch.zeros(y_tensor.size()[0], n_dims).scatter_(1, y_tensor, 1)\n", + " return y_one_hot" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "opt = torch.optim.Adam(agent.parameters(), lr=1e-5)\n", + "\n", + "\n", + "def train_on_rollout(states, actions, rewards, is_not_done, prev_memory_states, gamma=0.99):\n", + " \"\"\"\n", + " Takes a sequence of states, actions and rewards produced by generate_session.\n", + " Updates agent's weights by following the policy gradient above.\n", + " Please use Adam optimizer with default parameters.\n", + " \"\"\"\n", + "\n", + " # shape: [batch_size, time, c, h, w]\n", + " states = torch.tensor(np.asarray(states), dtype=torch.float32)\n", + " actions = torch.tensor(np.array(actions), dtype=torch.int64) # shape: [batch_size, time]\n", + " rewards = torch.tensor(np.array(rewards), dtype=torch.float32) # shape: [batch_size, time]\n", + " is_not_done = torch.tensor(np.array(is_not_done), dtype=torch.float32) # shape: [batch_size, time]\n", + " rollout_length = rewards.shape[1] - 1\n", + "\n", + " # predict logits, probas and log-probas using an agent.\n", + " memory = [m.detach() for m in prev_memory_states]\n", + "\n", + " logits = [] # append logit sequence here\n", + " state_values = [] # append state values here\n", + " for t in range(rewards.shape[1]):\n", + " obs_t = states[:, t]\n", + "\n", + " # use agent to comute logits_t and state values_t.\n", + " # append them to logits and state_values array\n", + "\n", + " memory, (logits_t, values_t) = \n", + "\n", + " logits.append(logits_t)\n", + " state_values.append(values_t)\n", + "\n", + " logits = torch.stack(logits, dim=1)\n", + " state_values = torch.stack(state_values, dim=1)\n", + " probas = F.softmax(logits, dim=2)\n", + " 
logprobas = F.log_softmax(logits, dim=2)\n", + "\n", + " # select log-probabilities for chosen actions, log pi(a_i|s_i)\n", + " actions_one_hot = to_one_hot(actions, n_actions).view(\n", + " actions.shape[0], actions.shape[1], n_actions)\n", + " logprobas_for_actions = torch.sum(logprobas * actions_one_hot, dim=-1)\n", + "\n", + " # Now let's compute two loss components:\n", + " # 1) Policy gradient objective.\n", + " # Notes: Please don't forget to call .detach() on advantage term. Also please use mean, not sum.\n", + " # it's okay to use loops if you want\n", + " J_hat = 0 # policy objective as in the formula for J_hat\n", + "\n", + " # 2) Temporal difference MSE for state values\n", + " # Notes: Please don't forget to call on V(s') term. Also please use mean, not sum.\n", + " # it's okay to use loops if you want\n", + " value_loss = 0\n", + "\n", + " cumulative_returns = state_values[:, -1].detach()\n", + "\n", + " for t in reversed(range(rollout_length)):\n", + " r_t = rewards[:, t] # current rewards\n", + " # current state values\n", + " V_t = state_values[:, t]\n", + " V_next = state_values[:, t + 1].detach() # next state values\n", + " # log-probability of a_t in s_t\n", + " logpi_a_s_t = logprobas_for_actions[:, t]\n", + "\n", + " # update G_t = r_t + gamma * G_{t+1} as we did in week6 reinforce\n", + " cumulative_returns = G_t = r_t + gamma * cumulative_returns\n", + "\n", + " # Compute temporal difference error (MSE for V(s))\n", + " value_loss += \n", + "\n", + " # compute advantage A(s_t, a_t) using cumulative returns and V(s_t) as baseline\n", + " advantage = \n", + " advantage = advantage.detach()\n", + "\n", + " # compute policy pseudo-loss aka -J_hat.\n", + " J_hat += \n", + "\n", + " # regularize with entropy\n", + " entropy_reg = \n", + "\n", + " # add-up three loss components and average over time\n", + " loss = -J_hat / rollout_length +\\\n", + " value_loss / rollout_length +\\\n", + " -0.01 * entropy_reg\n", + "\n", + " # Gradient descent 
step\n", + " < your code >\n", + "\n", + " return loss.data.numpy()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# let's test it\n", + "memory = list(pool.prev_memory_states)\n", + "rollout_obs, rollout_actions, rollout_rewards, rollout_mask = pool.interact(10)\n", + "\n", + "train_on_rollout(rollout_obs, rollout_actions,\n", + " rollout_rewards, rollout_mask, memory)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Train \n", + "\n", + "just run train step and see if agent learns any better" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from IPython.display import clear_output\n", + "from tqdm import trange\n", + "from pandas import DataFrame\n", + "moving_average = lambda x, **kw: DataFrame(\n", + " {'x': np.asarray(x)}).x.ewm(**kw).mean().values\n", + "\n", + "rewards_history = []" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "for i in trange(15000):\n", + "\n", + " memory = list(pool.prev_memory_states)\n", + " rollout_obs, rollout_actions, rollout_rewards, rollout_mask = pool.interact(\n", + " 10)\n", + " train_on_rollout(rollout_obs, rollout_actions,\n", + " rollout_rewards, rollout_mask, memory)\n", + "\n", + " if i % 100 == 0:\n", + " rewards_history.append(np.mean(evaluate(agent, env, n_games=1)))\n", + " clear_output(True)\n", + " plt.plot(rewards_history, label='rewards')\n", + " plt.plot(moving_average(np.array(rewards_history),\n", + " span=10), label='rewards ewma@10')\n", + " plt.legend()\n", + " plt.show()\n", + " if rewards_history[-1] >= 10000:\n", + " print(\"Your agent has just passed the minimum homework threshold\")\n", + " break" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Relax and grab some refreshments while 
your agent is locked in an infinite loop of violence and death.\n", + "\n", + "__How to interpret plots:__\n", + "\n", + "The session reward is the easy thing: it should in general go up over time, but it's okay if it fluctuates ~~like crazy~~. It's also OK if the reward doesn't increase substantially before some 10k initial steps. However, if the reward reaches zero and doesn't seem to recover over 2-3 evaluations, there's something wrong happening.\n", + "\n", + "\n", + "Since we use a policy-based method, we also keep track of __policy entropy__ - the same one you used as a regularizer. The only important thing about it is that your entropy shouldn't drop too low (`< 0.1`) before your agent gets the yellow belt. Or at least it can drop there, but _it shouldn't stay there for long_.\n", + "\n", + "If it does, the culprit is likely:\n", + "* Some bug in entropy computation. Remember that it is $ - \\sum p(a_i) \\cdot \\log p(a_i) $\n", + "* Your agent architecture converges too fast. Increase entropy coefficient in actor loss. \n", + "* Gradient explosion - just [clip gradients](https://stackoverflow.com/a/43486487) and maybe use a smaller network\n", + "* Us. Or TF developers. Or aliens. Or lizardfolk. Contact us on forums before it's too late!\n", + "\n", + "If you're debugging, just run `_, (logits, values) = agent.step(prev_memories, batch_states)` and manually look into logits and values. This will reveal the problem 9 times out of 10: you'll likely see some NaNs or insanely large numbers or zeros. Try to catch the moment when this happens for the first time and investigate from there." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### \"Final\" evaluation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "env_monitor = gym.wrappers.Monitor(env, directory=\"kungfu_videos\", force=True)\n", + "final_rewards = evaluate(agent, env_monitor, n_games=20,)\n", + "env_monitor.close()\n", + "print(\"Final mean reward\", np.mean(final_rewards))\n", + "\n", + "video_names = list(filter(lambda s: s.endswith(\n", + " \".mp4\"), os.listdir(\"./kungfu_videos/\")))\n", + "HTML(\"\"\"\n", + "\n", + "\"\"\".format(\"./kungfu_videos/\"+video_names[-1])) # this may or may not be _last_ video. Try other indices" + ] + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/week7_pomdp/practice_tensorflow.ipynb b/week08_pomdp/practice_tensorflow.ipynb similarity index 100% rename from week7_pomdp/practice_tensorflow.ipynb rename to week08_pomdp/practice_tensorflow.ipynb diff --git a/week7_pomdp/practice_theano.ipynb b/week08_pomdp/practice_theano.ipynb similarity index 100% rename from week7_pomdp/practice_theano.ipynb rename to week08_pomdp/practice_theano.ipynb diff --git a/week7_pomdp/theano_optional_recurrence_tutorial.ipynb b/week08_pomdp/theano_optional_recurrence_tutorial.ipynb similarity index 100% rename from week7_pomdp/theano_optional_recurrence_tutorial.ipynb rename to week08_pomdp/theano_optional_recurrence_tutorial.ipynb diff --git a/week9_policy_II/README.md b/week09_policy_II/README.md similarity index 75% rename from week9_policy_II/README.md 
rename to week09_policy_II/README.md index 1ed36866e..0350aac9a 100644 --- a/week9_policy_II/README.md +++ b/week09_policy_II/README.md @@ -6,12 +6,13 @@ This section covers some steroids for policy gradient methods, along with a cool * Lecture on NPG and TRPO by J. Schulman - [video](https://www.youtube.com/watch?v=_t5fpZuuf-4) * Alternative lecture on TRPO and open problems by... J. Schulman - [video](https://www.youtube.com/watch?v=gb5Q2XL5c8A) -* Our videos: [lecture](https://yadi.sk/i/OP0B1BEj3UcmW9), [seminar(pytorch)](https://yadi.sk/i/D8mHrKM63UcmWh) [seminar(theano)](https://yadi.sk/i/b0ol2gUV3HiKKJ) (russian) +* Our videos: [lecture](https://yadi.sk/i/c7GR1kAAJc00Og), [seminar(pytorch)](https://yadi.sk/i/OGZJJjkQH_7h5g) [seminar(theano)](https://yadi.sk/i/b0ol2gUV3HiKKJ) (russian) * Original articles - [TRPO](https://arxiv.org/abs/1502.05477), [NPG](https://papers.nips.cc/paper/2073-a-natural-policy-gradient.pdf) ## Practice -Go to `seminar_TRPO_.ipynb` and follow instructions in the notebook. +* Seminar: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/yandexdataschool/Practical_RL/blob/spring19/week09_policy_II/seminar_TRPO_pytorch.ipynb) +* Homework: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/yandexdataschool/Practical_RL/blob/spring19/week09_policy_II/ppo.ipynb) ## More: Reinforcement learning in large/continuous action spaces While you already know algorithms that will work with continuously many actions, it can't hurt to learn something more specialized. diff --git a/week09_policy_II/mujoco_wrappers.py b/week09_policy_II/mujoco_wrappers.py new file mode 100644 index 000000000..ea4c61c03 --- /dev/null +++ b/week09_policy_II/mujoco_wrappers.py @@ -0,0 +1,97 @@ +""" MuJoCo env wrappers. 
""" +# Adapted from https://github.com/openai/baselines +import gym +import numpy as np + + +class RunningMeanVar: + """ Computes running mean and variance. + + Args: + eps (float): a small constant used to initialize mean to zero and + variance to 1. + shape tuple(int): shape of the statistics. + """ + # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm + def __init__(self, eps=1e-4, shape=()): + self.mean = np.zeros(shape) + self.var = np.ones(shape) + self.count = eps + + def update(self, batch): + """ Updates the running statistics given a batch of samples. """ + if not batch.shape[1:] == self.mean.shape: + raise ValueError(f"batch has invalid shape: {batch.shape}, " + f"expected shape {(None,) + self.mean.shape}") + batch_mean = np.mean(batch, axis=0) + batch_var = np.var(batch, axis=0) + batch_count = batch.shape[0] + self.update_from_moments(batch_mean, batch_var, batch_count) + + def update_from_moments(self, batch_mean, batch_var, batch_count): + """ Updates the running statistics given their new values on new data. """ + self.mean, self.var, self.count = update_mean_var_count_from_moments( + self.mean, self.var, self.count, batch_mean, batch_var, batch_count) + + +def update_mean_var_count_from_moments(mean, var, count, + batch_mean, batch_var, batch_count): + """ Updates running mean statistics given a new batch. """ + delta = batch_mean - mean + tot_count = count + batch_count + + new_mean = mean + delta * batch_count / tot_count + new_var = ( + var * (count / tot_count) + + batch_var * (batch_count / tot_count) + + np.square(delta) * (count * batch_count / tot_count ** 2)) + new_count = tot_count + + return new_mean, new_var, new_count + + +class Normalize(gym.Wrapper): + """ + A vectorized wrapper that normalizes the observations + and returns from an environment. 
+ """ + # pylint: disable=too-many-arguments + def __init__(self, env, obs=True, ret=True, + clipobs=10., cliprew=10., gamma=0.99, eps=1e-8): + super().__init__(env) + self.obs_rmv = (RunningMeanVar(shape=self.observation_space.shape) + if obs else None) + self.ret_rmv = RunningMeanVar(shape=()) if ret else None + self.clipob = clipobs + self.cliprew = cliprew + self.ret = np.zeros(getattr(self.env.unwrapped, "nenvs", 1)) + self.gamma = gamma + self.eps = eps + + def observation(self, obs): + """ Preprocesses a given observation. """ + if not self.obs_rmv: + return obs + rmv_batch = (np.expand_dims(obs, 0) + if not hasattr(self.env.unwrapped, "nenvs") + else obs) + self.obs_rmv.update(rmv_batch) + obs = (obs - self.obs_rmv.mean) / np.sqrt(self.obs_rmv.var + self.eps) + obs = np.clip(obs, -self.clipob, self.clipob) + return obs + + def step(self, action): + obs, rews, resets, info = self.env.step(action) + self.ret = self.ret * self.gamma + rews + obs = self.observation(obs) + if self.ret_rmv: + self.ret_rmv.update(self.ret) + rews = np.clip(rews / np.sqrt(self.ret_rmv.var + self.eps), + -self.cliprew, self.cliprew) + self.ret[resets] = 0. 
+ return obs, rews, resets, info + + def reset(self, **kwargs): + self.ret = np.zeros(getattr(self.env.unwrapped, "nenvs", 1)) + obs = self.env.reset(**kwargs) + return self.observation(obs) diff --git a/week09_policy_II/ppo.ipynb b/week09_policy_II/ppo.ipynb new file mode 100644 index 000000000..3eb0312d9 --- /dev/null +++ b/week09_policy_II/ppo.ipynb @@ -0,0 +1,516 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# # in google colab uncomment this\n", + "\n", + "# import os\n", + "\n", + "# os.system('apt-get install -y xvfb')\n", + "# os.system('wget https://raw.githubusercontent.com/yandexdataschool/Practical_DL/fall18/xvfb -O ../xvfb')\n", + "# os.system('apt-get install -y python-opengl ffmpeg')\n", + "# os.system('pip install pyglet==1.2.4')\n", + "# os.system('pip install gym')\n", + "\n", + "# prefix = 'https://raw.githubusercontent.com/yandexdataschool/Practical_RL/spring19/week09_policy_II/'\n", + "\n", + "# os.system('wget ' + prefix + 'runners.py')\n", + "# os.system('wget ' + prefix + 'mujoco_wrappers.py')\n", + "\n", + "# print('setup complete')\n", + "\n", + "# XVFB will be launched if you run on a server\n", + "import os\n", + "if type(os.environ.get(\"DISPLAY\")) is not str or len(os.environ.get(\"DISPLAY\")) == 0:\n", + " !bash ../xvfb start\n", + " %env DISPLAY = : 1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Implementing Proximal Policy Optimization \n", + "\n", + "\n", + "In this notebook you will be implementing the Proximal Policy Optimization algorithm, \n", + "a scaled-up version of which was used to train [OpenAI Five](https://openai.com/blog/openai-five/) \n", + "to [win](https://openai.com/blog/how-to-train-your-openai-five/) against the\n", + "world champions in Dota 2.\n", + "You will be solving a continuous control environment on which it may be easier and faster \n", + "to train an agent, however note that PPO here may not be the 
best algorithm as, for example,\n", + "Deep Deterministic Policy Gradient and Soft Actor Critic may be more suited \n", + "for continuous control environments. To run the environment you will need to install \n", + "[pybullet-gym](https://github.com/benelot/pybullet-gym) which unlike MuJoCo \n", + "does not require you to have a license.\n", + "\n", + "To install the library:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!git clone https://github.com/benelot/pybullet-gym lib/pybullet-gym\n", + "!pip install -e lib/pybullet-gym" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The overall structure of the code is similar to the one in the A2C optional homework, but don't worry if you haven't done it, it should be relatively easy to figure it out. \n", + "First, we will create an instance of the environment. \n", + "We will normalize the observations and rewards, but before that you will need a wrapper that will \n", + "write summaries, mainly, the total reward during an episode. You can either use one for `TensorFlow` \n", + "implemented in `atari_wrappers.py` file from the optional A2C homework, or implement your own. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import gym \n", + "import pybulletgym\n", + "\n", + "env = gym.make(\"HalfCheetahMuJoCoEnv-v0\")\n", + "print(\"observation space: \", env.observation_space,\n", + " \"\\nobservations:\", env.reset())\n", + "print(\"action space: \", env.action_space, \n", + " \"\\naction_sample: \", env.action_space.sample())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class Summaries(gym.Wrapper):\n", + " \"\"\" Wrapper to write summaries. 
\"\"\"\n", + " def step(self, action):\n", + " # TODO: implement writing summaries\n", + " return self.env.step(action)\n", + " \n", + " def reset(self, **kwargs):\n", + " # TODO: implement writing summaries\n", + " return self.env.reset(**kwargs)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The normalization wrapper will subtract running mean from observations and rewards and divide \n", + "the resulting quantities by the running variances." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from mujoco_wrappers import Normalize\n", + "\n", + "env = Normalize(Summaries(gym.make(\"HalfCheetahMuJoCoEnv-v0\")));\n", + "env.unwrapped.seed(0);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, you will need to define a model for training. We suggest that you use two separate networks: one for policy\n", + "and another for value function. Each network should be a 3-layer MLP with 64 hidden units, $\\mathrm{tanh}$ \n", + "activation function, kernel matrices initialized with orthogonal initializer with parameter $\\sqrt{2}$\n", + "and biases initialized with zeros. \n", + "\n", + "Our policy distribution is going to be multivariate normal with diagonal covariance. \n", + "The network from above will predict the mean, and the covariance should be represented by a single \n", + "(learned) vector of size 6 (corresponding to the dimensionality of the action space from above). \n", + "You should initialize this vector to zero and take the exponent of it to always\n", + "have a non-negative quantity. \n", + "\n", + "Overall the model should return three things: predicted mean of the distribution, variance vector, \n", + "value function. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# import tensorflow as tf\n", + "# import torch\n", + "\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This model will be wrapped by a `Policy`. The policy can work in two modes, but in either case \n", + "it is going to return dictionary with string-type keys. The first mode is when the policy is \n", + "used to sample actions for a trajectory which will later be used for training. In this case \n", + "the flag `training` passed to `act` method is `False` and the method should return \n", + "a `dict` with the following keys: \n", + "\n", + "* `\"actions\"`: actions to pass to the environment\n", + "* `\"log_probs\"`: log-probabilities of sampled actions\n", + "* `\"values\"`: value function $V^\\pi(s)$ predictions.\n", + "\n", + "We don't need to use the values under these keys for training, so all of them should be of type `np.ndarray`.\n", + "\n", + "When `training` is `True`, the model is training on a given batch of observations. In this\n", + "case it should return a `dict` with the following keys\n", + "\n", + "* `\"distribution\"`: an instance of multivariate normal distribution (`torch.distributions.MultivariateNormal` or `tf.distributions.MultivariateNormalDiag`)\n", + "* `\"values\"`: value function $V^\\pi(s)$ prediction.\n", + "\n", + "The distinction about the modes comes into play depending on where the policy is used: if it is called from `EnvRunner`, \n", + "the `training` flag is `False`, if it is called from `PPO`, the `training` flag is `True`. These classed \n", + "will be described below. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class Policy:\n", + " def __init__(self, model):\n", + " self.model = model\n", + " \n", + " def act(self, inputs, training=False):\n", + " \n", + " # Should return a dict." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will use `EnvRunner` to perform interactions with an environment with a policy for a fixed number of timesteps. Calling `.get_next()` on a runner will return a trajectory — dictionary \n", + "containing keys\n", + "\n", + "* `\"observations\"`\n", + "* `\"rewards\"` \n", + "* `\"resets\"`\n", + "* `\"actions\"`\n", + "* all other keys that you defined in `Policy`,\n", + "\n", + "under each of these keys there is a `np.ndarray` of specified length $T$ — the size of partial trajectory. \n", + "\n", + "Additionally, before returning a trajectory this runner can apply a list of transformations. \n", + "Each transformation is simply a callable that should modify passed trajectory in-place." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class AsArray:\n", + " \"\"\" \n", + " Converts lists of interactions to ndarray.\n", + " \"\"\"\n", + " def __call__(self, trajectory):\n", + " # Modify trajectory inplace. \n", + " for k, v in filter(lambda kv: kv[0] != \"state\",\n", + " trajectory.items()):\n", + " trajectory[k] = np.asarray(v)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "from runners import EnvRunner\n", + "\n", + "class DummyPolicy:\n", + " def act(self, inputs, training=False):\n", + " assert not training\n", + " return {\"actions\": np.random.randn(6), \"values\": np.nan}\n", + " \n", + "runner = EnvRunner(env, DummyPolicy(), 3,\n", + " transforms=[AsArray()])\n", + "trajectory = runner.get_next()\n", + "\n", + "{k: v.shape for k, v in trajectory.items() if k != \"state\"}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You will need to implement the following two transformations. 
\n", + "\n", + "The first is `GAE` that implements [Generalized Advantage Estimator](https://arxiv.org/abs/1506.02438).\n", + "In it you should add two keys to the trajectory: `\"advantages\"` and `\"value_targets\"`. In GAE the advantages\n", + "$A_t^{\\mathrm{GAE}(\\gamma,\\lambda)}$ are essentially defined as the exponential \n", + "moving average with parameter $\\lambda$ of the regular $n$-step advantages \n", + "$\\hat{A}^{(n)}(s_t) = \\sum_{l=0}^{n-1} \\gamma^l r_{t+l} + \\gamma^{n} V^\\pi(s_{t+n}) - V^\\pi(s_t)$. \n", + "The exact formula for the computation is the following\n", + "\n", + "$$\n", + "A_t^{\\mathrm{GAE}(\\gamma,\\lambda)} = \\sum_{l=0}^{T-1} (\\gamma\\lambda)^l\\delta_{t + l}^V,\n", + "$$\n", + "where $\\delta_{t+l}^V = r_{t+l} + \\gamma V^\\pi(s_{t+l+1}) - V^\\pi(s_{t+l})$. You can look at the \n", + "derivation (formulas 11-16) in the paper. Don't forget to reset the summation on terminal\n", + "states as determined by the flags `trajectory[\"resets\"]`. You can use `trajectory[\"values\"]`\n", + "to get values of all observations except the most recent which is stored under \n", + " `trajectory[\"state\"][\"latest_observation\"]`. For this observation you will need to call the policy \n", + " to get the value prediction.\n", + "\n", + "Once you have computed the advantages, you can get the targets for training the value function by adding \n", + "back values:\n", + "$$\n", + "\\hat{V}(s_{t+l}) = A_{t+l}^{\\mathrm{GAE}(\\gamma,\\lambda)} + V(s_{t + l}),\n", + "$$\n", + "where $\\hat{V}$ is a tensor of value targets that are used to train the value function. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class GAE:\n", + " \"\"\" Generalized Advantage Estimator. 
\"\"\"\n", + " def __init__(self, policy, gamma=0.99, lambda_=0.95):\n", + " self.policy = policy\n", + " self.gamma = gamma\n", + " self.lambda_ = lambda_\n", + " \n", + " def __call__(self, trajectory):\n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The main advantage of PPO over simpler policy based methods like A2C is that it is possible\n", + "to train on the same trajectory for multiple gradient steps. The following class wraps \n", + "an `EnvRunner`. It should call the runner to get a trajectory, then return minibatches \n", + "from it for a number of epochs, shuffling the data before each epoch." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class TrajectorySampler:\n", + " \"\"\" Samples minibatches from trajectory for a number of epochs. \"\"\"\n", + " def __init__(self, runner, num_epochs, num_minibatches, transforms=None):\n", + " self.runner = runner\n", + " self.num_epochs = num_epochs\n", + " self.num_minibatches = num_minibatches\n", + " self.transforms = transforms or []\n", + " self.minibatch_count = 0\n", + " self.epoch_count = 0\n", + " self.trajectory = None\n", + " \n", + " def shuffle_trajectory(self):\n", + " \"\"\" Shuffles all elements in trajectory.\n", + " \n", + " Should be called at the beginning of each epoch.\n", + " \"\"\"\n", + " \n", + " \n", + " def get_next(self):\n", + " \"\"\" Returns next minibatch. \"\"\"\n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A common trick to use with GAE is to normalize advantages, the following transformation does that. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class NormalizeAdvantages:\n", + " \"\"\" Normalizes advantages to have zero mean and variance 1. 
\"\"\"\n", + " def __call__(self, trajectory):\n", + " adv = trajectory[\"advantages\"]\n", + " adv = (adv - adv.mean()) / (adv.std() + 1e-8)\n", + " trajectory[\"advantages\"] = adv" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, we can create our PPO runner. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def make_ppo_runner(env, policy, num_runner_steps=2048,\n", + " gamma=0.99, lambda_=0.95, \n", + " num_epochs=10, num_minibatches=32):\n", + " \"\"\" Creates runner for PPO algorithm. \"\"\"\n", + " runner_transforms = [AsArray(),\n", + " GAE(policy, gamma=gamma, lambda_=lambda_)]\n", + " runner = EnvRunner(env, policy, num_runner_steps, \n", + " transforms=runner_transforms)\n", + " \n", + " sampler_transforms = [NormalizeAdvantages()]\n", + " sampler = TrajectorySampler(runner, num_epochs=num_epochs, \n", + " num_minibatches=num_minibatches,\n", + " transforms=sampler_transforms)\n", + " return sampler" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the next cell you will need to implement Proximal Policy Optimization algorithm itself. 
The algorithm\n", + "modifies the typical policy gradient loss in the following way:\n", + "\n", + "$$\n", + "L_{\\pi}(t) = -\\frac{\\pi_\\theta(a_t|s_t)}{\\pi_{\\theta^{\\text{old}}}(a_t|s_t)}\n", + "\\cdot A^{\\mathrm{GAE}(\\gamma,\\lambda)}_t\\\\\n", + "L_{\\pi}^{\\text{clipped}}(t) = -\\mathrm{clip}\\left(\n", + "\\frac{\\pi_\\theta(a_t|s_t)}{\\pi_{\\theta^{\\text{old}}}(a_t|s_t)},\n", + "1 - \\text{cliprange}, 1 + \\text{cliprange}\\right)\n", + "\\cdot A^{\\mathrm{GAE}(\\gamma,\\lambda)}_t\\\\\n", + "L_{\\text{policy}} = \\frac{1}{T}\\sum_{l=0}^{T-1}\\max\\left(L_\\pi(t+l), L_{\\pi}^{\\text{clipped}}(t+l)\\right).\n", + "$$\n", + "\n", + "Additionally, the value loss is modified in the following way:\n", + "\n", + "$$\n", + "L_V(t) = \\left(V_\\theta(s_t) - \\hat{V}(s_t)\\right)^2\\\\\n", + "L_{V}^{\\text{clipped}}(t) = \\left(\n", + "V_{\\theta^{\\text{old}}}(s_t) +\n", + "\\text{clip}\\left(\n", + "V_\\theta(s_t) - V_{\\theta^{\\text{old}}}(s_t),\n", + "-\\text{cliprange}, \\text{cliprange}\n", + "\\right) - \\hat{V}(s_t)\\right)^2\\\\\n", + "L_{\\text{value}} = \\frac{1}{T}\\sum_{l=0}^{T-1}\\max\\left(L_V(t+l), L_V^{\\text{clipped}}(t+l)\\right).\n", + "$$" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class PPO:\n", + " def __init__(self, policy, optimizer,\n", + " cliprange=0.2,\n", + " value_loss_coef=0.25,\n", + " max_grad_norm=0.5):\n", + " self.policy = policy\n", + " self.optimizer = optimizer\n", + " self.cliprange = cliprange\n", + " self.value_loss_coef = value_loss_coef\n", + " # Note that we don't need entropy regularization for this env.\n", + " self.max_grad_norm = max_grad_norm\n", + " \n", + " def policy_loss(self, trajectory, act):\n", + " \"\"\" Computes and returns policy loss on a given trajectory. 
\"\"\"\n", + " \n", + " \n", + " def value_loss(self, trajectory, act):\n", + " \"\"\" Computes and returns value loss on a given trajectory. \"\"\"\n", + " \n", + " \n", + " def loss(self, trajectory):\n", + " act = self.policy.act(trajectory[\"observations\"], training=True)\n", + " policy_loss = self.policy_loss(trajectory, act)\n", + " value_loss = self.value_loss(trajectory, act)\n", + " return policy_loss + self.value_loss_coef * value_loss\n", + " \n", + " def step(self, trajectory):\n", + " \"\"\" Computes the loss function and performs a single gradient step. \"\"\"\n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now everything is ready to do training. In one million of interactions it should be possible to \n", + "achieve the total raw reward of about 1500. You should plot this quantity with respect to \n", + "`runner.step_var` — the number of interactions with the environment. It is highly \n", + "encouraged to also provide plots of the following quantities (these are useful for debugging as well):\n", + "\n", + "* [Coefficient of Determination](https://en.wikipedia.org/wiki/Coefficient_of_determination) between \n", + "value targets and value predictions\n", + "* Entropy of the policy $\\pi$\n", + "* Value loss\n", + "* Policy loss\n", + "* Value targets\n", + "* Value predictions\n", + "* Gradient norm\n", + "* Advantages\n", + "\n", + "For optimization it is suggested to use Adam optimizer with linearly annealing learning rate \n", + "from 3e-4 to 0 and epsilon 1e-5." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "rl", + "language": "python", + "name": "rl" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/week09_policy_II/runners.py b/week09_policy_II/runners.py new file mode 120000 index 000000000..4c11d0d4f --- /dev/null +++ b/week09_policy_II/runners.py @@ -0,0 +1 @@ +../week06_policy_based/runners.py \ No newline at end of file diff --git a/week09_policy_II/seminar_TRPO_pytorch.ipynb b/week09_policy_II/seminar_TRPO_pytorch.ipynb new file mode 100644 index 000000000..4fa31fe2a --- /dev/null +++ b/week09_policy_II/seminar_TRPO_pytorch.ipynb @@ -0,0 +1,839 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# # in google colab uncomment this\n", + "\n", + "# import os\n", + "\n", + "# os.system('apt-get install -y xvfb')\n", + "# os.system('wget https://raw.githubusercontent.com/yandexdataschool/Practical_DL/fall18/xvfb -O ../xvfb')\n", + "# os.system('apt-get install -y python-opengl ffmpeg')\n", + "# os.system('pip install pyglet==1.2.4')\n", + "\n", + "# launch XVFB if you run on a server\n", + "import os\n", + "if type(os.environ.get(\"DISPLAY\")) is not str or len(os.environ.get(\"DISPLAY\")) == 0:\n", + " !bash ../xvfb start\n", + " %env DISPLAY = : 1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Let's make a TRPO!\n", + "\n", + "In this notebook we will write the code for Trust Region Policy Optimization.\n", + "As usual, it contains a few different parts which we are going to reproduce.\n", +
"\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import torch\n", + "import torch.nn as nn\n", + "import torch.nn.functional as F\n", + "from torch.autograd import Variable" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33mWARN: gym.spaces.Box autodetected dtype as . Please provide explicit dtype.\u001b[0m\n", + "Observation Space Box(6,)\n", + "Action Space Discrete(3)\n" + ] + } + ], + "source": [ + "import gym\n", + "\n", + "env = gym.make(\"Acrobot-v1\")\n", + "env.reset()\n", + "observation_shape = env.observation_space.shape\n", + "n_actions = env.action_space.n\n", + "print(\"Observation Space\", env.observation_space)\n", + "print(\"Action Space\", env.action_space)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAQsAAAD8CAYAAABgtYFHAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAADjJJREFUeJzt3X3I3Wd9x/H3Z+mDborpw70Qkkgqhkn/2GpzUyvKcC2O2onpH1VaZAYJBDYHFQcu3WBD2B+6P6wKQw2rLA617XygoXRzXVoZ+8PaO/bBPqz2rrQ0oZqobd0Q3arf/XGu6DGmua8793lM3i84nOt3/a7fOd9TTj69fr9znXOnqpCklfzGtAuQNB8MC0ldDAtJXQwLSV0MC0ldDAtJXcYSFkmuSvJ4kuUke8bxHJImK6NeZ5FkHfBt4K3AIeA+4PqqenSkTyRposYxs7gMWK6q71TV/wK3ADvG8DySJuisMTzmJuCZoe1DwBtOdsCFF15YW7duHUMpko45ePDg96tq4VSPH0dYdEmyG9gN8OpXv5qlpaVplSKdEZI8vZbjx3EachjYMrS9ufX9iqraW1WLVbW4sHDKYSdpQsYRFvcB25JclOQc4Dpg/xieR9IEjfw0pKpeTPJnwFeBdcBnquqRUT+PpMkayzWLqroTuHMcjy1pOlzBKamLYSGpi2EhqYthIamLYSGpi2EhqYthIamLYSGpi2EhqYthIamLYSGpi2EhqYthIamLYSGpi2EhqYthIamLYSGpi2EhqYthIamLYSGpi2EhqYthIamLYSGpi2EhqYthIamLYSGpi2EhqYthIamLYSGpi2EhqYthIamLYSGpi2EhqYthIamLYSGpy4phkeQzSY4keXio7/wkdyV5ot2f1/qT5BNJlpM8lOTScRYvaXJ6Zhb/CFx1XN8e4EBVbQMOtG2AtwHb2m038MnRlClp2lYMi6r6D+CHx3XvAPa19j7gmqH+z9bA14H1STaOqlhJ03Oq1yw2VNWzrf1dYENrbwKeGRp3qPX9miS7kywlWTp69OgpliFpUtZ8gbOqCqhTOG5vVS1W1eLCwsJay5A0ZqcaFt87dnrR7o+0/sPAlqFxm1ufpDl3qmGxH9jZ2juB24f639M+FbkceGHodEXSHDtrpQFJvgC8BbgwySHgb4APA7cl2QU8DbyrDb8TuBpYBn4MvHcMNUuaghXDoqquf4ldV55gbAHvW2tRkmaPKzgldTEsJHUxLCR1MSwkdTEsJHUxLCR1MSwkdTEsJHUxLCR1yWDR5ZSLSKZfhHT6O1hVi6d68IrLvSdh+/btLC0tTbsM6bSWZE3HexoiqYthIamLYSGpi2EhqYthIamLYSGpi2EhqYthIamLYSGpi2EhqYthIamLYSGpi2EhqYthIamLYSGpi2EhqYthIamLYSGpi2EhqYthIamLYSGpi2EhqYthIanLimGRZEuSe5I8muSRJDe0/vOT3JXkiXZ/XutPkk8kWU7yUJJLx/0iJI1fz8ziReDPq+pi4HLgfUkuBvYAB6pqG3CgbQO8DdjWbruBT468akkTt2JYVNWzVfXN1v5v4DFgE7AD2NeG7QOuae0dwGdr4OvA+iQbR165pIla1TWLJFuB1wP3Ahuq6tm267vAhtbeBDwzdNih1idpjnWHRZJXAF8C3l9VPxreV4O/rryqP26cZHeSpSRLR48eXc2hkqagKyySnM0gKD5XVV9u3d87dnrR7o+0/sPAlqHDN7e+X1FVe6tqsaoWFxYWTrV+SRPS82lIgJuBx6rqo0O79gM7W3sncPtQ/3vapyKXAy8Mna5ImlNndYx5E/DHwLeSPND6/hL4MHBbkl3A08C72r47gauBZeDHwHtHWrGkqVgxLKrqP4G8xO4rTzC+gPetsS5JM8YVnJK6GBaSuhgWkroYFpK6GBaSuhgWkroYFpK6GBaSuhgWkroYFpK6GBaSuvR8kUz6hYMHf/VrQtu
3r+pnTDTHnFmo2/FB8VJ9Oj0ZFupyslAwMM4MhoVW1BMGBsbpz7CQ1MWwkNTFsJDUxbDQihZZGskYzTfDQl1OFgYGxZnBsFC3E4WCQXHmcAWnVsVwOHM5s5DUxbCQ1MWwkNTFsJDUxbCQ1MWwkNTFsJDUxbCQ1MWwkNTFsJDUxbCQ1MWwkNTFsJDUZcWwSPKyJN9I8mCSR5J8qPVflOTeJMtJbk1yTus/t20vt/1bx/sSJE1Cz8zip8AVVfV7wCXAVUkuBz4C3FRVrwWeA3a18buA51r/TW2cpDm3YljUwP+0zbPbrYArgC+2/n3ANa29o23T9l+ZxN+Jl+Zc1zWLJOuSPAAcAe4CngSer6oX25BDwKbW3gQ8A9D2vwBccILH3J1kKcnS0aNH1/YqNHW1ffu0S9CYdYVFVf2sqi4BNgOXAa9b6xNX1d6qWqyqxYWFhbU+nKQxW9WnIVX1PHAP8EZgfZJjP8u3GTjc2oeBLQBt/6uAH4ykWklT0/NpyEKS9a39cuCtwGMMQuPaNmwncHtr72/btP13V5V/aluacz0/2LsR2JdkHYNwua2q7kjyKHBLkr8F7gdubuNvBv4pyTLwQ+C6MdQtacJWDIuqegh4/Qn6v8Pg+sXx/T8B3jmS6iTNDFdwSupiWOikcvDgtEvQjDAsJHUxLCR1MSwkdTEsJHUxLCR1MSwkdTEsJHUxLCR1MSwkdTEsJHUxLCR1MSwkdTEsJHUxLCR1MSy0KkssssTitMvQFPT8rJ70awFxbHuRpWmUoylwZqEVnWwm4SzjzGFYSOpiWOikemYOzi7ODIaFpC6GhaQuhoVOqufTDj8ROTMYFpK6GBZa0clmDs4qzhwuytJJ1fbt5OBBQ0HOLDQa/uWy059hIamLYSGpi2EhqYthIamLYSGpi2EhqUt3WCRZl+T+JHe07YuS3JtkOcmtSc5p/ee27eW2f+t4Spc0SauZWdwAPDa0/RHgpqp6LfAcsKv17wKea/03tXGS5lxXWCTZDPwR8A9tO8AVwBfbkH3ANa29o23T9l/ZxkuaY70zi48BHwR+3rYvAJ6vqhfb9iFgU2tvAp4BaPtfaOMlzbEVwyLJ24EjVTXS9bxJdidZSrJ09OjRUT60Rqy2b592CZoBPTOLNwHvSPIUcAuD04+PA+uTHPsi2mbgcGsfBrYAtP2vAn5w/INW1d6qWqyqxYWFhTW9CM0Gvx9yelsxLKrqxqraXFVbgeuAu6vq3cA9wLVt2E7g9tbe37Zp+++uqhpp1ZImbi3rLP4C+ECSZQbXJG5u/TcDF7T+DwB71laipFmwqt+zqKqvAV9r7e8Al51gzE+Ad46gNkkzxBWckroYFpK6GBaSuhgWkroYFpK6GBaSuhgW6uKSbxkWGimXfJ++DAtJXQwLSV0MC0ldDAtJXQwLSV0MC0ldDAtJXQwLSV0MC0ldDAtJXQwLSV0MC0ldDAtJXQwLSV0MC0ldDAtJXQwLdfPXss5shoVGzl/LOj0ZFpK6GBaSuqzqDyNLXrc4czmzkNTFsJDUxbCQ1MWwkNTFsJDUxbCQ1KUrLJI8leRbSR5IstT6zk9yV5In2v15rT9JPpFkOclDSS4d5wuQNBmrmVn8QVVdUlWLbXsPcKCqtgEH2jbA24Bt7bYb+OSoipU0PWs5DdkB7GvtfcA1Q/2frYGvA+uTbFzD80iaAb0rOAv4tyQFfLqq9gIbqurZtv+7wIbW3gQ8M3Tsodb37FAfSXYzmHkA/DTJw6dQ/7RcCHx/2kV0mqdaYb7qnadaAX5nLQf3hsWbq+pwkt8G7kryX8M7q6pakHRrgbMXIMnS0OnNzJuneuepVpiveuepVhjUu5bju05Dqupwuz8CfAW4DPjesdOLdn+kDT8MbBk6fHPrkzTHVgyLJL+V5JXH2sAfAg8D+4GdbdhO4PbW3g+8p30qcjnwwtDpiqQ51XMasgH4SpJj4z9fVf+a5D7gtiS7gKeBd7XxdwJXA8vAj4H3djz
H3tUWPmXzVO881QrzVe881QprrDdVq7rUIOkM5QpOSV2mHhZJrkryeFvxuWflI8Zez2eSHBn+KHeWV6sm2ZLkniSPJnkkyQ2zWnOSlyX5RpIHW60fav0XJbm31XRrknNa/7lte7nt3zqpWodqXpfk/iR3zEGt411pXVVTuwHrgCeB1wDnAA8CF0+5pt8HLgUeHur7O2BPa+8BPtLaVwP/AgS4HLh3CvVuBC5t7VcC3wYunsWa23O+orXPBu5tNdwGXNf6PwX8SWv/KfCp1r4OuHUK/30/AHweuKNtz3KtTwEXHtc3svfBRF/MCV7cG4GvDm3fCNw4zZpaHVuPC4vHgY2tvRF4vLU/DVx/onFTrP124K2zXjPwm8A3gTcwWNh01vHvCeCrwBtb+6w2LhOscTODrzJcAdzR/mHNZK3teU8UFiN7H0z7NOSlVnvOmtWuVp2KNvV9PYP/Y89kzW1a/wCDdTl3MZhZPl9VL56gnl/U2va/AFwwqVqBjwEfBH7eti9gdmuFX660PthWSMMI3wf+YO8qVa1+teokJHkF8CXg/VX1o/ZRNzBbNVfVz4BLkqxnsMDvdVMu6YSSvB04UlUHk7xl2vV0GvlK62HTnlnMy2rPmV6tmuRsBkHxuar6cuue6Zqr6nngHgZT+fVJjv2Pa7ieX9Ta9r8K+MGESnwT8I4kTwG3MDgV+fiM1gqMf6X1tMPiPmBbu8J8DoMLQ/unXNOJzOxq1QymEDcDj1XVR4d2zVzNSRbajIIkL2dwbeUxBqFx7UvUeuw1XAvcXe0Ee9yq6saq2lxVWxm8L++uqnfPYq0woZXWk7wA8xIXZa5mcAX/SeCvZqCeLzD4huz/MTiP28Xg3PMA8ATw78D5bWyAv2+1fwtYnEK9b2ZwrvoQ8EC7XT2LNQO/C9zfan0Y+OvW/xrgGwxW/f4zcG7rf1nbXm77XzOl98Rb+OWnITNZa6vrwXZ75Ni/pVG+D1zBKanLtE9DJM0Jw0JSF8NCUhfDQlIXw0JSF8NCUhfDQlIXw0JSl/8Huhr8fpmXAZ4AAAAASUVORK5CYII=\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "%matplotlib inline\n", + "plt.imshow(env.render('rgb_array'))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 1: Defining a network\n", + "\n", + "With all it's complexity, at it's core TRPO is yet another policy gradient method. \n", + "\n", + "This essentially means we're actually training a stochastic policy $ \\pi_\\theta(a|s) $. \n", + "\n", + "And yes, it's gonna be a neural network. So let's start by defining one." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "class TRPOAgent(nn.Module):\n", + " def __init__(self, state_shape, n_actions, hidden_size=32):\n", + " '''\n", + " Here you should define your model\n", + " You should have LOG-PROBABILITIES as output because you will need it to compute loss\n", + " We recommend that you start simple: \n", + " use 1-2 hidden layers with 100-500 units and relu for the first try\n", + " '''\n", + " nn.Module.__init__(self)\n", + "\n", + " \n", + " self.model = None\n", + "\n", + " def forward(self, states):\n", + " \"\"\"\n", + " takes agent's observation (Variable), returns log-probabilities (Variable)\n", + " :param state_t: a batch of states, shape = [batch_size, state_shape]\n", + " \"\"\"\n", + "\n", + " # Use your network to compute log_probs for given state\n", + " log_probs = self.model(states)\n", + " return log_probs\n", + "\n", + " def get_log_probs(self, states):\n", + " '''\n", + " Log-probs for training\n", + " '''\n", + "\n", + " return self.forward(states)\n", + "\n", + " def get_probs(self, states):\n", + " '''\n", + " Probs for interaction\n", + " '''\n", + "\n", + " return torch.exp(self.forward(states))\n", + "\n", + " def act(self, obs, sample=True):\n", + " '''\n", + " Samples action from policy distribution (sample = True) or takes most likely action (sample = False)\n", + " :param: obs - single observation vector\n", + " :param sample: if True, samples from \\pi, otherwise takes most likely action\n", + " :returns: action (single integer) and probabilities for all actions\n", + " '''\n", + "\n", + " probs = self.get_probs(Variable(torch.FloatTensor([obs]))).data.numpy()\n", + "\n", + " if sample:\n", + " action = int(np.random.choice(n_actions, p=probs[0]))\n", + " else:\n", + " action = int(np.argmax(probs))\n", + "\n", + " return action, probs[0]\n", + "\n", + "\n", + "agent = TRPOAgent(observation_shape, n_actions)" + ] + 
}, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "sampled: [(2, array([0.35253003, 0.37892205, 0.26854792], dtype=float32)), (2, array([0.35269254, 0.37673423, 0.27057323], dtype=float32)), (0, array([0.35406563, 0.37682924, 0.26910514], dtype=float32)), (0, array([0.3560282 , 0.37561142, 0.2683604 ], dtype=float32)), (1, array([0.35539204, 0.37685862, 0.26774937], dtype=float32))]\n", + "greedy: [(1, array([0.3518883 , 0.37830737, 0.2698043 ], dtype=float32)), (1, array([0.3544095 , 0.37609497, 0.26949552], dtype=float32)), (1, array([0.35528135, 0.37493262, 0.269786 ], dtype=float32)), (1, array([0.3589018 , 0.37457928, 0.26651892], dtype=float32)), (1, array([0.35414994, 0.3769723 , 0.26887777], dtype=float32))]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.5/dist-packages/torch/nn/modules/container.py:67: UserWarning: Implicit dimension choice for log_softmax has been deprecated. 
Change the call to include dim=X as an argument.\n", + " input = module(input)\n" + ] + } + ], + "source": [ + "# Check if log-probabilities satisfies all the requirements\n", + "log_probs = agent.get_log_probs(Variable(torch.FloatTensor([env.reset()])))\n", + "assert isinstance(\n", + " log_probs, Variable) and log_probs.requires_grad, \"qvalues must be a torch variable with grad\"\n", + "assert len(\n", + " log_probs.shape) == 2 and log_probs.shape[0] == 1 and log_probs.shape[1] == n_actions\n", + "sums = torch.sum(torch.exp(log_probs), dim=1)\n", + "assert (0.999 < sums).all() and (1.001 > sums).all()\n", + "\n", + "# Demo use\n", + "print(\"sampled:\", [agent.act(env.reset()) for _ in range(5)])\n", + "print(\"greedy:\", [agent.act(env.reset(), sample=False) for _ in range(5)])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Flat parameters operations\n", + "\n", + "We are going to use these helpers to read and write the network's parameters as a single flat vector." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def get_flat_params_from(model):\n", + " params = []\n", + " for param in model.parameters():\n", + " params.append(param.data.view(-1))\n", + "\n", + " flat_params = torch.cat(params)\n", + " return flat_params\n", + "\n", + "\n", + "def set_flat_params_to(model, flat_params):\n", + " prev_ind = 0\n", + " for param in model.parameters():\n", + " flat_size = int(np.prod(list(param.size())))\n", + " param.data.copy_(\n", + " flat_params[prev_ind:prev_ind + flat_size].view(param.size()))\n", + " prev_ind += flat_size" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Compute cumulative reward just like you did in vanilla REINFORCE" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import scipy.signal\n", + "\n", + "\n", + "def get_cummulative_returns(r, gamma=1):\n", + " \"\"\"\n", + " Computes
cummulative discounted rewards given immediate rewards\n", + " G_i = r_i + gamma*r_{i+1} + gamma^2*r_{i+2} + ...\n", + " Also known as R(s,a).\n", + " \"\"\"\n", + " r = np.array(r)\n", + " assert r.ndim >= 1\n", + " return scipy.signal.lfilter([1], [1, -gamma], r[::-1], axis=0)[::-1]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# simple demo on rewards [0,0,1,0,0,1]\n", + "get_cummulative_returns([0, 0, 1, 0, 0, 1], gamma=0.9)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Rollout**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def rollout(env, agent, max_pathlength=2500, n_timesteps=50000):\n", + " \"\"\"\n", + " Generate rollouts for training.\n", + " :param: env - environment in which we will make actions to generate rollouts.\n", + " :param: act - the function that can return policy and action given observation.\n", + " :param: max_pathlength - maximum size of one path that we generate.\n", + " :param: n_timesteps - total sum of sizes of all pathes we generate.\n", + " \"\"\"\n", + " paths = []\n", + "\n", + " total_timesteps = 0\n", + " while total_timesteps < n_timesteps:\n", + " obervations, actions, rewards, action_probs = [], [], [], []\n", + " obervation = env.reset()\n", + " for _ in range(max_pathlength):\n", + " action, policy = agent.act(obervation)\n", + " obervations.append(obervation)\n", + " actions.append(action)\n", + " action_probs.append(policy)\n", + " obervation, reward, done, _ = env.step(action)\n", + " rewards.append(reward)\n", + " total_timesteps += 1\n", + " if done or total_timesteps == n_timesteps:\n", + " path = {\"observations\": np.array(obervations),\n", + " \"policy\": np.array(action_probs),\n", + " \"actions\": np.array(actions),\n", + " \"rewards\": np.array(rewards),\n", + " \"cumulative_returns\": 
get_cummulative_returns(rewards),\n", + " }\n", + " paths.append(path)\n", + " break\n", + " return paths" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "paths = rollout(env, agent, max_pathlength=5, n_timesteps=100)\n", + "print(paths[-1])\n", + "assert (paths[0]['policy'].shape == (5, n_actions))\n", + "assert (paths[0]['cumulative_returns'].shape == (5,))\n", + "assert (paths[0]['rewards'].shape == (5,))\n", + "assert (paths[0]['observations'].shape == (5,)+observation_shape)\n", + "assert (paths[0]['actions'].shape == (5,))\n", + "print('It\\'s ok')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 3: Auxiliary functions\n", + "\n", + "Now let's define the loss functions and something else for actual TRPO training." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The surrogate reward should be\n", + "$$J_{surr}= {1 \\over N} \\sum\\limits_{i=0}^{N-1} \\frac{\\pi_{\\theta}(s_i, a_i)}{\\pi_{\\theta_{old}}(s_i, a_i)}A_{\\theta_{old}}(s_i, a_i)$$\n", + "\n", + "For simplicity, let's use cumulative returns instead of advantage for now:\n", + "$$J'_{surr}= {1 \\over N} \\sum\\limits_{i=0}^{N-1} \\frac{\\pi_{\\theta}(s_i, a_i)}{\\pi_{\\theta_{old}}(s_i, a_i)}G_{\\theta_{old}}(s_i, a_i)$$\n", + "\n", + "Or alternatively, minimize the surrogate loss:\n", + "$$ L_{surr} = - J'_{surr} $$ \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def get_loss(agent, observations, actions, cummulative_returns, old_probs):\n", + " \"\"\"\n", + " Computes TRPO objective\n", + " :param: observations - batch of observations\n", + " :param: actions - batch of actions\n", + " :param: cummulative_returns - batch of cummulative returns\n", + " :param: old_probs - batch of probabilities computed by old network\n", + " :returns: scalar value of the objective
function\n", + " \"\"\"\n", + " batch_size = observations.shape[0]\n", + " log_probs_all = agent.get_log_probs(observations)\n", + " probs_all = torch.exp(log_probs_all)\n", + "\n", + " probs_for_actions = probs_all[torch.arange(\n", + " 0, batch_size, out=torch.LongTensor()), actions]\n", + " old_probs_for_actions = old_probs[torch.arange(\n", + " 0, batch_size, out=torch.LongTensor()), actions]\n", + "\n", + " # Compute surrogate loss, aka importance-sampled policy gradient\n", + " Loss = \n", + "\n", + " assert Loss.shape == torch.Size([])\n", + " return Loss" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can ascend these gradients as long as our $\\pi_\\theta(a|s)$ satisfies the constraint\n", + "$$E_{s,\\pi_{\\Theta_{t}}}\\Big[KL(\\pi(\\Theta_{t}, s) \\:||\\:\\pi(\\Theta_{t+1}, s))\\Big]< \\alpha$$\n", + "\n", + "\n", + "where\n", + "\n", + "$$KL(p||q) = E_p \\log({p \\over q})$$" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def get_kl(agent, observations, actions, cummulative_returns, old_probs):\n", + " \"\"\"\n", + " Computes KL-divergence between network policy and old policy\n", + " :param: observations - batch of observations\n", + " :param: actions - batch of actions\n", + " :param: cummulative_returns - batch of cummulative returns (we don't need it actually)\n", + " :param: old_probs - batch of probabilities computed by old network\n", + " :returns: scalar value of the KL-divergence\n", + " \"\"\"\n", + " batch_size = observations.shape[0]\n", + " log_probs_all = agent.get_log_probs(observations)\n", + " probs_all = torch.exp(log_probs_all)\n", + "\n", + " # Compute Kullback-Leibler divergence (see formula above)\n", + " # Note: you need to sum KL and entropy over all actions, not just the ones agent took\n", + " old_log_probs = torch.log(old_probs+1e-10)\n", + "\n", + " kl = \n", + "\n", + " assert kl.shape == torch.Size([])\n", + 
" assert (kl > -0.0001).all() and (kl < 10000).all()\n", + " return kl" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def get_entropy(agent, observations):\n", + " \"\"\"\n", + " Computes entropy of the network policy \n", + " :param: observations - batch of observations\n", + " :returns: scalar value of the entropy\n", + " \"\"\"\n", + "\n", + " observations = Variable(torch.FloatTensor(observations))\n", + "\n", + " batch_size = observations.shape[0]\n", + " log_probs_all = agent.get_log_probs(observations)\n", + " probs_all = torch.exp(log_probs_all)\n", + "\n", + " entropy = torch.sum(-probs_all * log_probs_all) / batch_size\n", + "\n", + " assert entropy.shape == torch.Size([])\n", + " return entropy" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Line search**\n", + "\n", + "TRPO at its core involves ascending the surrogate policy gradient constrained by KL divergence. \n", + "\n", + "In order to enforce this constraint, we're gonna use linesearch. 
You can find out more about it [here](https://en.wikipedia.org/wiki/Line_search)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def linesearch(f, x, fullstep, max_kl):\n", + " \"\"\"\n", + " Linesearch finds the best parameters of neural networks in the direction of fullstep constrained by KL divergence.\n", + " :param: f - function that returns loss and kl.\n", + " :param: x - old parameters of neural network.\n", + " :param: fullstep - direction in which we make search.\n", + " :param: max_kl - constraint of KL divergence.\n", + " :returns: the parameters found by the search (the old parameters if no step was accepted).\n", + " \"\"\"\n", + " max_backtracks = 10\n", + " loss, _, = f(x)\n", + " for stepfrac in .5**np.arange(max_backtracks):\n", + " xnew = x + stepfrac * fullstep\n", + " new_loss, kl = f(xnew)\n", + " actual_improve = new_loss - loss\n", + " if kl.data.numpy() <= max_kl and actual_improve.data.numpy() < 0:\n", + " x = xnew\n", + " loss = new_loss\n", + " return x" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Conjugate gradients**\n", + "\n", + "Since TRPO includes constrained optimization, we will need to solve Ax=b using conjugate gradients.\n", + "\n", + "In general, CG is an algorithm that solves Ax=b where A is positive-definite. Here A is the Hessian of the KL divergence (the Fisher information matrix), which is positive semi-definite; the small damping term added in the training step makes it positive-definite. 
You can find out more about them [here](https://en.wikipedia.org/wiki/Conjugate_gradient_method)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from numpy.linalg import inv\n", + "\n", + "\n", + "def conjugate_gradient(f_Ax, b, cg_iters=10, residual_tol=1e-10):\n", + " \"\"\"\n", + " This method solves system of equation Ax=b using iterative method called conjugate gradients\n", + " :f_Ax: function that returns Ax\n", + " :b: targets for Ax\n", + " :cg_iters: how many iterations this method should do\n", + " :residual_tol: epsilon for stability\n", + " \"\"\"\n", + " p = b.clone()\n", + " r = b.clone()\n", + " x = torch.zeros(b.size())\n", + " rdotr = torch.sum(r*r)\n", + " for i in range(cg_iters):\n", + " z = f_Ax(p)\n", + " v = rdotr / (torch.sum(p*z) + 1e-8)\n", + " x += v * p\n", + " r -= v * z\n", + " newrdotr = torch.sum(r*r)\n", + " mu = newrdotr / (rdotr + 1e-8)\n", + " p = r + mu * p\n", + " rdotr = newrdotr\n", + " if rdotr < residual_tol:\n", + " break\n", + " return x" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# This code validates conjugate gradients\n", + "A = np.random.rand(8, 8)\n", + "A = np.matmul(np.transpose(A), A)\n", + "\n", + "\n", + "def f_Ax(x):\n", + " return torch.matmul(torch.FloatTensor(A), x.view((-1, 1))).view(-1)\n", + "\n", + "\n", + "b = np.random.rand(8)\n", + "\n", + "w = np.matmul(np.matmul(inv(np.matmul(np.transpose(A), A)),\n", + " np.transpose(A)), b.reshape((-1, 1))).reshape(-1)\n", + "print(w)\n", + "print(conjugate_gradient(f_Ax, torch.FloatTensor(b)).numpy())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 4: training\n", + "In this section we construct the whole update step function." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def update_step(agent, observations, actions, cummulative_returns, old_probs, max_kl):\n", + " \"\"\"\n", + " This function does the TRPO update step\n", + " :param: observations - batch of observations\n", + " :param: actions - batch of actions\n", + " :param: cummulative_returns - batch of cummulative returns\n", + " :param: old_probs - batch of probabilities computed by old network\n", + " :param: max_kl - controls how big KL divergence may be between old and new policy every step.\n", + " :returns: KL between new and old policies and the value of the loss function.\n", + " \"\"\"\n", + "\n", + " # Here we prepare the information\n", + " observations = Variable(torch.FloatTensor(observations))\n", + " actions = torch.LongTensor(actions)\n", + " cummulative_returns = Variable(torch.FloatTensor(cummulative_returns))\n", + " old_probs = Variable(torch.FloatTensor(old_probs))\n", + "\n", + " # Here we compute gradient of the loss function\n", + " loss = get_loss(agent, observations, actions,\n", + " cummulative_returns, old_probs)\n", + " grads = torch.autograd.grad(loss, agent.parameters())\n", + " loss_grad = torch.cat([grad.view(-1) for grad in grads]).data\n", + "\n", + " def Fvp(v):\n", + " # Here we compute Fx to do solve Fx = g using conjugate gradients\n", + " # We actually do here a couple of tricks to compute it efficiently\n", + "\n", + " kl = get_kl(agent, observations, actions,\n", + " cummulative_returns, old_probs)\n", + "\n", + " grads = torch.autograd.grad(kl, agent.parameters(), create_graph=True)\n", + " flat_grad_kl = torch.cat([grad.view(-1) for grad in grads])\n", + "\n", + " kl_v = (flat_grad_kl * Variable(v)).sum()\n", + " grads = torch.autograd.grad(kl_v, agent.parameters())\n", + " flat_grad_grad_kl = torch.cat(\n", + " [grad.contiguous().view(-1) for grad in grads]).data\n", + "\n", + " return 
flat_grad_grad_kl + v * 0.1\n", + "\n", + " # Here we solve the Fx = g system using conjugate gradients\n", + " stepdir = conjugate_gradient(Fvp, -loss_grad, 10)\n", + "\n", + " # Here we compute the initial vector to do line search\n", + " shs = 0.5 * (stepdir * Fvp(stepdir)).sum(0, keepdim=True)\n", + "\n", + " lm = torch.sqrt(shs / max_kl)\n", + " fullstep = stepdir / lm[0]\n", + "\n", + " neggdotstepdir = (-loss_grad * stepdir).sum(0, keepdim=True)\n", + "\n", + " # Here we get the start point\n", + " prev_params = get_flat_params_from(agent)\n", + "\n", + " def get_loss_kl(params):\n", + " # Helper for line search\n", + " set_flat_params_to(agent, params)\n", + " return [get_loss(agent, observations, actions, cummulative_returns, old_probs),\n", + " get_kl(agent, observations, actions, cummulative_returns, old_probs)]\n", + "\n", + " # Here we find our new parameters\n", + " new_params = linesearch(get_loss_kl, prev_params, fullstep, max_kl)\n", + "\n", + " # And we set it to our network\n", + " set_flat_params_to(agent, new_params)\n", + "\n", + " return get_loss_kl(new_params)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 5: Main TRPO loop\n", + "\n", + "Here we will train our network!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import time\n", + "from itertools import count\n", + "from collections import OrderedDict\n", + "\n", + "# this is hyperparameter of TRPO. 
It controls how big KL divergence may be between old and new policy every step.\n", + "max_kl = 0.01\n", + "numeptotal = 0 # this is number of episodes that we played.\n", + "\n", + "start_time = time.time()\n", + "\n", + "for i in count(1):\n", + "\n", + " print(\"\\n********** Iteration %i ************\" % i)\n", + "\n", + " # Generating paths.\n", + " print(\"Rollout\")\n", + " paths = rollout(env, agent)\n", + " print(\"Made rollout\")\n", + "\n", + " # Updating policy.\n", + " observations = np.concatenate([path[\"observations\"] for path in paths])\n", + " actions = np.concatenate([path[\"actions\"] for path in paths])\n", + " returns = np.concatenate([path[\"cumulative_returns\"] for path in paths])\n", + " old_probs = np.concatenate([path[\"policy\"] for path in paths])\n", + "\n", + " loss, kl = update_step(agent, observations, actions,\n", + " returns, old_probs, max_kl)\n", + "\n", + " # Report current progress\n", + " episode_rewards = np.array([path[\"rewards\"].sum() for path in paths])\n", + "\n", + " stats = OrderedDict()\n", + " numeptotal += len(episode_rewards)\n", + " stats[\"Total number of episodes\"] = numeptotal\n", + " stats[\"Average sum of rewards per episode\"] = episode_rewards.mean()\n", + " stats[\"Std of rewards per episode\"] = episode_rewards.std()\n", + " stats[\"Time elapsed\"] = \"%.2f mins\" % ((time.time() - start_time)/60.)\n", + " stats[\"KL between old and new distribution\"] = kl.data.numpy()\n", + " stats[\"Entropy\"] = get_entropy(agent, observations).data.numpy()\n", + " stats[\"Surrogate loss\"] = loss.data.numpy()\n", + " for k, v in stats.items():\n", + " print(k + \": \" + \" \" * (40 - len(k)) + str(v))\n", + " i += 1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Homework option I: better sampling (10+pts)\n", + "\n", + "In this section, you're invited to implement a better rollout strategy called _vine_.\n", + "\n", + "![img](https://s17.postimg.org/i90chxgvj/vine.png)\n", + "\n", + 
"In most gym environments, you can actually backtrack by using states. You can find a wrapper that saves/loads states in [the mcts seminar](https://github.com/yandexdataschool/Practical_RL/blob/spring19/week10_planning/seminar_MCTS.ipynb).\n", + "\n", + "You can read more about it in the [TRPO article](https://arxiv.org/abs/1502.05477) in section 5.2.\n", + "\n", + "The goal here is to implement such a rollout policy (we recommend using a tree data structure like in the seminar above).\n", + "Then you can assign cumulative returns similar to `get_cummulative_returns`, but for a tree.\n", + "\n", + "__bonus task__ - parallelize samples using multiple cores" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "# Homework option II (10+pts)\n", + "\n", + "Let's use TRPO to train evil robots! (pick either of the two)\n", + "* [MuJoCo robots](https://gym.openai.com/envs#mujoco)\n", + "* [Box2d robot](https://gym.openai.com/envs/BipedalWalker-v2)\n", + "\n", + "The catch here is that those environments have continuous action spaces. \n", + "\n", + "Luckily, TRPO is a policy gradient method, so it's gonna work for any parametric $\\pi_\\theta(a|s)$. 
We recommend starting with gaussian policy:\n", + "\n", + "$$\\pi_\\theta(a|s) = N(\\mu_\\theta(s),\\sigma^2_\\theta(s)) = {1 \\over \\sqrt { 2 \\pi {\\sigma^2}_\\theta(s) } } e^{ - (a - \n", + "\\mu_\\theta(s))^2 \\over 2 {\\sigma^2}_\\theta(s) } $$\n", + "\n", + "In the $\\sqrt { 2 \\pi {\\sigma^2}_\\theta(s) }$ clause, $\\pi$ means ~3.1415926, not agent's policy.\n", + "\n", + "This essentially means that you will need two output layers:\n", + "* $\\mu_\\theta(s)$, a dense layer with linear activation\n", + "* ${\\sigma^2}_\\theta(s)$, a dense layer with activation torch.exp (to make it positive; like rho from bandits)\n", + "\n", + "For multidimensional actions, you can use fully factorized gaussian (basically a vector of gaussians).\n", + "\n", + "__bonus task__: compare performance of continuous action space method to action space discretization" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/week09_policy_II/seminar_TRPO_tensorflow.ipynb b/week09_policy_II/seminar_TRPO_tensorflow.ipynb new file mode 100644 index 000000000..f4ac1569b --- /dev/null +++ b/week09_policy_II/seminar_TRPO_tensorflow.ipynb @@ -0,0 +1,792 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# launch XVFB if you run on a server\n", + "import os\n", + "if type(os.environ.get(\"DISPLAY\")) is not str or len(os.environ.get(\"DISPLAY\")) == 0:\n", + " !bash ../xvfb start\n", + " %env 
DISPLAY = : 1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Let's make a TRPO!\n", + "\n", + "In this notebook we will write the code of the one Trust Region Policy Optimization.\n", + "As usually, it contains a few different parts which we are going to reproduce.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import tensorflow as tf" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import gym\n", + "\n", + "env = gym.make(\"Acrobot-v1\")\n", + "env.reset()\n", + "observation_shape = env.observation_space.shape\n", + "n_actions = env.action_space.n\n", + "print(\"Observation Space\", env.observation_space)\n", + "print(\"Action Space\", env.action_space)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "%matplotlib inline\n", + "plt.imshow(env.render('rgb_array'))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 1: Defining a network\n", + "\n", + "With all it's complexity, at it's core TRPO is yet another policy gradient method. \n", + "\n", + "This essentially means we're actually training a stochastic policy $ \\pi_\\theta(a|s) $. \n", + "\n", + "And yes, it's gonna be a neural network. So let's start by defining one." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# input tensors\n", + "observations_ph = tf.placeholder(\n", + " shape=(None, observation_shape[0]), dtype=tf.float32)\n", + "# Actions that we made\n", + "actions_ph = tf.placeholder(shape=(None,), dtype=tf.int32)\n", + "# \"G = r + gamma*r' + gamma^2*r'' + ...\"\n", + "cummulative_returns_ph = tf.placeholder(shape=(None,), dtype=tf.float32)\n", + "# Action probabilities from previous iteration\n", + "old_probs_ph = tf.placeholder(shape=(None, n_actions), dtype=tf.float32)\n", + "\n", + "all_inputs = [observations_ph, actions_ph,\n", + " cummulative_returns_ph, old_probs_ph]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def denselayer(name, x, out_dim, nonlinearity=None):\n", + " with tf.variable_scope(name):\n", + " if nonlinearity is None:\n", + " nonlinearity = tf.identity\n", + "\n", + " x_shape = x.get_shape().as_list()\n", + "\n", + " w = tf.get_variable('w', shape=[x_shape[1], out_dim])\n", + " b = tf.get_variable(\n", + " 'b', shape=[out_dim], initializer=tf.constant_initializer(0))\n", + " o = nonlinearity(tf.matmul(x, w) + b)\n", + "\n", + " return o\n", + "\n", + "\n", + "sess = tf.InteractiveSession()\n", + "\n", + "nn = observations_ph\n", + "\n", + "\n", + "\n", + "policy_out = \n", + "\n", + "probs_out = tf.exp(policy_out)\n", + "\n", + "weights = tf.trainable_variables()\n", + "sess.run(tf.global_variables_initializer())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 2: Actions and rollouts\n", + "\n", + "In this section, we'll define functions that take actions $ a \\sim \\pi_\\theta(a|s) $ and rollouts $ $." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# compile function\n", + "\n", + "\n", + "def act(obs, sample=True):\n", + " \"\"\"\n", + " Samples action from policy distribution (sample = True) or takes most likely action (sample = False)\n", + " :param: obs - single observation vector\n", + " :param sample: if True, samples from \\pi, otherwise takes most likely action\n", + " :returns: action (single integer) and probabilities for all actions\n", + " \"\"\"\n", + "\n", + " probs = sess.run(probs_out, feed_dict={\n", + " observations_ph: obs.reshape((1, -1))})[0]\n", + "\n", + " if sample:\n", + " action = int(np.random.choice(n_actions, p=probs))\n", + " else:\n", + " action = int(np.argmax(probs))\n", + "\n", + " return action, probs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# demo\n", + "print(\"sampled:\", [act(env.reset()) for _ in range(5)])\n", + "print(\"greedy:\", [act(env.reset(), sample=False) for _ in range(5)])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Compute cummulative reward just like you did in vanilla REINFORCE" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import scipy.signal\n", + "\n", + "\n", + "def get_cummulative_returns(r, gamma=1):\n", + " \"\"\"\n", + " Computes cummulative discounted rewards given immediate rewards\n", + " G_i = r_i + gamma*r_{i+1} + gamma^2*r_{i+2} + ...\n", + " Also known as R(s,a).\n", + " \"\"\"\n", + " r = np.array(r)\n", + " assert r.ndim >= 1\n", + " return scipy.signal.lfilter([1], [1, -gamma], r[::-1], axis=0)[::-1]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# simple demo on rewards [0,0,1,0,0,1]\n", + 
"get_cummulative_returns([0, 0, 1, 0, 0, 1], gamma=0.9)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Rollout**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def rollout(env, act, max_pathlength=2500, n_timesteps=50000):\n", + " \"\"\"\n", + " Generate rollouts for training.\n", + " :param: env - environment in which we will make actions to generate rollouts.\n", + " :param: act - the function that can return policy and action given observation.\n", + " :param: max_pathlength - maximum size of one path that we generate.\n", + " :param: n_timesteps - total sum of sizes of all pathes we generate.\n", + " \"\"\"\n", + " paths = []\n", + "\n", + " total_timesteps = 0\n", + " while total_timesteps < n_timesteps:\n", + " obervations, actions, rewards, action_probs = [], [], [], []\n", + " obervation = env.reset()\n", + " for _ in range(max_pathlength):\n", + " action, policy = act(obervation)\n", + " obervations.append(obervation)\n", + " actions.append(action)\n", + " action_probs.append(policy)\n", + " obervation, reward, done, _ = env.step(action)\n", + " rewards.append(reward)\n", + " total_timesteps += 1\n", + " if done or total_timesteps == n_timesteps:\n", + " path = {\"observations\": np.array(obervations),\n", + " \"policy\": np.array(action_probs),\n", + " \"actions\": np.array(actions),\n", + " \"rewards\": np.array(rewards),\n", + " \"cumulative_returns\": get_cummulative_returns(rewards),\n", + " }\n", + " paths.append(path)\n", + " break\n", + " return paths" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "paths = rollout(env, act, max_pathlength=5, n_timesteps=100)\n", + "print(paths[-1])\n", + "assert (paths[0]['policy'].shape == (5, n_actions))\n", + "assert (paths[0]['cumulative_returns'].shape == (5,))\n", + "assert 
(paths[0]['rewards'].shape == (5,))\n", + "assert (paths[0]['observations'].shape == (5,)+observation_shape)\n", + "assert (paths[0]['actions'].shape == (5,))\n", + "print('It\\'s ok')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 3: loss functions\n", + "\n", + "Now let's define the loss functions and constraints for actual TRPO training." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The surrogate reward should be\n", + "$$J_{surr}= {1 \\over N} \\sum\\limits_{i=1}^N \\frac{\\pi_{\\theta}(s_i, a_i)}{\\pi_{\\theta_{old}}(s_i, a_i)}A_{\\theta_{old}}(s_i, a_i)$$\n", + "\n", + "For simplicity, let's use cumulative returns instead of advantage for now:\n", + "$$J'_{surr}= {1 \\over N} \\sum\\limits_{i=1}^N \\frac{\\pi_{\\theta}(s_i, a_i)}{\\pi_{\\theta_{old}}(s_i, a_i)}G_{\\theta_{old}}(s_i, a_i)$$\n", + "\n", + "Or alternatively, minimize the surrogate loss:\n", + "$$ L_{surr} = - J'_{surr} $$" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# select probabilities of chosen actions\n", + "batch_size = tf.shape(observations_ph)[0]\n", + "probs_all = tf.reshape(probs_out, [-1])\n", + "probs_for_actions = tf.gather(probs_all, tf.range(\n", + " 0, batch_size) * n_actions + actions_ph)\n", + "old_probs_all = tf.reshape(old_probs_ph, [-1])\n", + "old_probs_for_actions = tf.gather(\n", + " old_probs_all, tf.range(0, batch_size) * n_actions + actions_ph)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Compute surrogate loss: negative importance-sampled policy gradient\n", + "\n", + "L_surr = " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# compute and return surrogate policy gradient\n", + "def var_shape(x):\n", + " return [k.value for 
k in x.get_shape()]\n", + "\n", + "\n", + "def numel(x):\n", + " return np.prod(var_shape(x))\n", + "\n", + "\n", + "def flatgrad(loss, var_list):\n", + " grads = tf.gradients(loss, var_list)\n", + " return tf.concat([tf.reshape(grad, [numel(v)])\n", + " for (v, grad) in zip(var_list, grads)], 0)\n", + "\n", + "\n", + "flat_grad_surr = flatgrad(L_surr, weights)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can ascend these gradients as long as our $\\pi_\\theta(a|s)$ satisfies the constraint\n", + "$$E_{s,\\pi_{\\Theta_{t}}}\\Big[KL(\\pi(\\Theta_{t}, s) \\:||\\:\\pi(\\Theta_{t+1}, s))\\Big]< \\alpha$$\n", + "\n", + "\n", + "where\n", + "\n", + "$$KL(p||q) = E_p \\log({p \\over q})$$" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Compute Kullback-Leibler divergence (see formula above)\n", + "# Note: you need to sum KL and entropy over all actions, not just the ones agent took\n", + "old_log_probs = tf.log(old_probs_ph+1e-10)\n", + "\n", + "kl = \n", + "\n", + "# Compute policy entropy\n", + "entropy = \n", + "\n", + "losses = [L_surr, kl, entropy]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Line search**\n", + "\n", + "TRPO at its core involves ascending the surrogate policy gradient constrained by KL divergence. \n", + "\n", + "In order to enforce this constraint, we're gonna use linesearch. 
You can find out more about it [here](https://en.wikipedia.org/wiki/Line_search)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def linesearch(f, x, fullstep, max_kl):\n", + " \"\"\"\n", + " Linesearch finds the best parameters of neural networks in the direction of fullstep constrained by KL divergence.\n", + " :param: f - function that returns loss, kl and arbitrary third component.\n", + " :param: x - old parameters of neural network.\n", + " :param: fullstep - direction in which we make search.\n", + " :param: max_kl - constraint of KL divergence.\n", + " :returns: the parameters found by the search (the old parameters if no step was accepted).\n", + " \"\"\"\n", + " max_backtracks = 10\n", + " loss, _, _ = f(x)\n", + " for stepfrac in .5**np.arange(max_backtracks):\n", + " xnew = x + stepfrac * fullstep\n", + " new_loss, kl, _ = f(xnew)\n", + " actual_improve = new_loss - loss\n", + " if kl <= max_kl and actual_improve < 0:\n", + " x = xnew\n", + " loss = new_loss\n", + " return x" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 4: training\n", + "In this section we construct the remaining parts of our computational graph" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def slice_vector(vector, shapes):\n", + " \"\"\"\n", + " Slices symbolic vector into several symbolic tensors of given shapes.\n", + " Auxiliary function used to un-flatten gradients, tangents etc.\n", + " :param vector: 1-dimensional symbolic vector\n", + " :param shapes: list or tuple of shapes (list, tuple or symbolic)\n", + " :returns: list of symbolic tensors of given shapes\n", + " \"\"\"\n", + " assert len(vector.get_shape()) == 1, \"vector must be 1-dimensional\"\n", + " start = 0\n", + " tensors = []\n", + " for shape in shapes:\n", + " size = np.prod(shape)\n", + " tensor = tf.reshape(vector[start:(start + size)], shape)\n", + " tensors.append(tensor)\n", + 
" start += size\n", + " return tensors" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# intermediate grad in conjugate_gradient\n", + "conjugate_grad_intermediate_vector = tf.placeholder(\n", + " dtype=tf.float32, shape=(None,))\n", + "\n", + "# slice flat_tangent into chunks for each weight\n", + "weight_shapes = [sess.run(var).shape for var in weights]\n", + "tangents = slice_vector(conjugate_grad_intermediate_vector, weight_shapes)\n", + "\n", + "# KL divergence where first arg is fixed\n", + "kl_firstfixed = tf.reduce_sum((tf.stop_gradient(probs_out) * (tf.stop_gradient(\n", + " tf.log(probs_out)) - tf.log(probs_out)))) / tf.cast(batch_size, tf.float32)\n", + "\n", + "# compute fisher information matrix (used for conjugate gradients and to estimate KL)\n", + "gradients = tf.gradients(kl_firstfixed, weights)\n", + "gradient_vector_product = [tf.reduce_sum(\n", + " g[0] * t) for (g, t) in zip(gradients, tangents)]\n", + "\n", + "fisher_vec_prod = flatgrad(gradient_vector_product, weights)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### TRPO helpers\n", + "\n", + "Here we define a few helper functions used in the main TRPO loop" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Conjugate gradients**\n", + "\n", + "Since TRPO includes constrained optimization, we will need to solve Ax=b using conjugate gradients.\n", + "\n", + "In general, CG is an algorithm that solves Ax=b where A is positive-definite. Here A is the Hessian of the KL divergence (the Fisher information matrix), which is positive semi-definite; the small damping term added in the training step makes it positive-definite. 
You can find out more about them [here](https://en.wikipedia.org/wiki/Conjugate_gradient_method)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from numpy.linalg import inv\n", + "\n", + "\n", + "def conjugate_gradient(f_Ax, b, cg_iters=10, residual_tol=1e-10):\n", + " \"\"\"\n", + " This method solves system of equation Ax=b using iterative method called conjugate gradients\n", + " :f_Ax: function that returns Ax\n", + " :b: targets for Ax\n", + " :cg_iters: how many iterations this method should do\n", + " :residual_tol: epsilon for stability\n", + " \"\"\"\n", + " p = b.copy()\n", + " r = b.copy()\n", + " x = np.zeros_like(b)\n", + " rdotr = r.dot(r)\n", + " for i in range(cg_iters):\n", + " z = f_Ax(p)\n", + " v = rdotr / (p.dot(z) + 1e-8)\n", + " x += v * p\n", + " r -= v * z\n", + " newrdotr = r.dot(r)\n", + " mu = newrdotr / (rdotr + 1e-8)\n", + " p = r + mu * p\n", + " rdotr = newrdotr\n", + " if rdotr < residual_tol:\n", + " break\n", + " return x" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# This code validates conjugate gradients\n", + "A = np.random.rand(8, 8)\n", + "A = np.matmul(np.transpose(A), A)\n", + "\n", + "\n", + "def f_Ax(x):\n", + " return np.matmul(A, x.reshape(-1, 1)).reshape(-1)\n", + "\n", + "\n", + "b = np.random.rand(8)\n", + "\n", + "w = np.matmul(np.matmul(inv(np.matmul(np.transpose(A), A)),\n", + " np.transpose(A)), b.reshape((-1, 1))).reshape(-1)\n", + "print(w)\n", + "print(conjugate_gradient(f_Ax, b))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Compile a function that exports network weights as a vector\n", + "flat_weights = tf.concat([tf.reshape(var, [-1]) for var in weights], axis=0)\n", + "\n", + "# ... 
and another function that imports vector back into network weights\n", + "flat_weights_placeholder = tf.placeholder(tf.float32, shape=(None,))\n", + "assigns = slice_vector(flat_weights_placeholder, weight_shapes)\n", + "\n", + "load_flat_weights = [w.assign(ph) for w, ph in zip(weights, assigns)]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Step 5: Main TRPO loop\n", + "\n", + "Here we will train our network!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import time\n", + "from itertools import count\n", + "from collections import OrderedDict\n", + "\n", + "# this is hyperparameter of TRPO. It controls how big KL divergence may be between old and new policy every step.\n", + "max_kl = 0.01\n", + "cg_damping = 0.1 # This parameters regularize addition to\n", + "numeptotal = 0 # this is number of episodes that we played.\n", + "\n", + "start_time = time.time()\n", + "\n", + "for i in count(1):\n", + "\n", + " print(\"\\n********** Iteration %i ************\" % i)\n", + "\n", + " # Generating paths.\n", + " print(\"Rollout\")\n", + " paths = rollout(env, act)\n", + " print(\"Made rollout\")\n", + "\n", + " # Updating policy.\n", + " observations = np.concatenate([path[\"observations\"] for path in paths])\n", + " actions = np.concatenate([path[\"actions\"] for path in paths])\n", + " returns = np.concatenate([path[\"cumulative_returns\"] for path in paths])\n", + " old_probs = np.concatenate([path[\"policy\"] for path in paths])\n", + " inputs_batch = [observations, actions, returns, old_probs]\n", + " feed_dict = {observations_ph: observations,\n", + " actions_ph: actions,\n", + " old_probs_ph: old_probs,\n", + " cummulative_returns_ph: returns,\n", + " }\n", + " old_weights = sess.run(flat_weights)\n", + "\n", + " def fisher_vector_product(p):\n", + " \"\"\"gets intermediate grads (p) and computes fisher*vector \"\"\"\n", + " 
feed_dict[conjugate_grad_intermediate_vector] = p\n", + " return sess.run(fisher_vec_prod, feed_dict) + cg_damping * p\n", + "\n", + " flat_grad = sess.run(flat_grad_surr, feed_dict)\n", + "\n", + " stepdir = conjugate_gradient(fisher_vector_product, -flat_grad)\n", + " shs = .5 * stepdir.dot(fisher_vector_product(stepdir))\n", + " lm = np.sqrt(shs / max_kl)\n", + " fullstep = stepdir / lm\n", + "\n", + " # Compute new weights with linesearch in the direction we found with CG\n", + "\n", + " def losses_f(flat_weights):\n", + " feed_dict[flat_weights_placeholder] = flat_weights\n", + " sess.run(load_flat_weights, feed_dict)\n", + " return sess.run(losses, feed_dict)\n", + "\n", + " new_weights = linesearch(losses_f, old_weights, fullstep, max_kl)\n", + " feed_dict[flat_weights_placeholder] = new_weights\n", + " sess.run(load_flat_weights, feed_dict)\n", + "\n", + " # Report current progress\n", + " L_surr, kl, entropy = sess.run(losses, feed_dict)\n", + " episode_rewards = np.array([path[\"rewards\"].sum() for path in paths])\n", + "\n", + " stats = OrderedDict()\n", + " numeptotal += len(episode_rewards)\n", + " stats[\"Total number of episodes\"] = numeptotal\n", + " stats[\"Average sum of rewards per episode\"] = episode_rewards.mean()\n", + " stats[\"Std of rewards per episode\"] = episode_rewards.std()\n", + " stats[\"Entropy\"] = entropy\n", + " stats[\"Time elapsed\"] = \"%.2f mins\" % ((time.time() - start_time)/60.)\n", + " stats[\"KL between old and new distribution\"] = kl\n", + " stats[\"Surrogate loss\"] = L_surr\n", + " for k, v in stats.items():\n", + " print(k + \": \" + \" \" * (40 - len(k)) + str(v))\n", + " i += 1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Homework option I: better sampling (10+pts)\n", + "\n", + "In this section, you're invited to implement a better rollout strategy called _vine_.\n", + "\n", + "![img](https://s17.postimg.org/i90chxgvj/vine.png)\n", + "\n", + "In most gym environments, you can 
actually backtrack by using states. You can find a wrapper that saves/loads states in [the mcts seminar](https://github.com/yandexdataschool/Practical_RL/blob/spring19/week10_planning/seminar_MCTS.ipynb).\n", + "\n", + "You can read more about it in the [TRPO article](https://arxiv.org/abs/1502.05477) in section 5.2.\n", + "\n", + "The goal here is to implement such a rollout policy (we recommend using a tree data structure like in the seminar above).\n", + "Then you can assign cumulative rewards similar to `get_cummulative_rewards`, but for a tree.\n", + "\n", + "__bonus task__ - parallelize samples using multiple cores" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "# Homework option II (10+pts)\n", + "\n", + "Let's use TRPO to train evil robots! (pick either of the two)\n", + "* [MuJoCo robots](https://gym.openai.com/envs#mujoco)\n", + "* [Box2d robot](https://gym.openai.com/envs/BipedalWalker-v2)\n", + "\n", + "The catch here is that those environments have continuous action spaces. \n", + "\n", + "Luckily, TRPO is a policy gradient method, so it's gonna work for any parametric $\\pi_\\theta(a|s)$. 
We recommend starting with a gaussian policy:\n", + "\n", + "$$\\pi_\\theta(a|s) = N(\\mu_\\theta(s),\\sigma^2_\\theta(s)) = {1 \\over \\sqrt { 2 \\pi {\\sigma^2}_\\theta(s) } } e^{ - { (a - \n", + "\\mu_\\theta(s))^2 \\over 2 {\\sigma^2}_\\theta(s) } } $$\n", + "\n", + "In the $\\sqrt { 2 \\pi {\\sigma^2}_\\theta(s) }$ clause, $\\pi$ means ~3.1415926, not the agent's policy.\n", + "\n", + "This essentially means that you will need two output layers:\n", + "* $\\mu_\\theta(s)$, a dense layer with linear activation\n", + "* ${\\sigma^2}_\\theta(s)$, a dense layer with activation tf.exp (to make it positive; like rho from bandits)\n", + "\n", + "For multidimensional actions, you can use a fully factorized gaussian (basically a vector of gaussians).\n", + "\n", + "__bonus task__: compare performance of continuous action space method to action space discretization" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/week09_policy_II/seminar_TRPO_theano.ipynb b/week09_policy_II/seminar_TRPO_theano.ipynb new file mode 100644 index 000000000..2585a2b33 --- /dev/null +++ b/week09_policy_II/seminar_TRPO_theano.ipynb @@ -0,0 +1,792 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# launch XVFB if you run on a server\n", + "import os\n", + "if type(os.environ.get(\"DISPLAY\")) is not str or len(os.environ.get(\"DISPLAY\")) == 0:\n", + " !bash ../xvfb start\n", + " %env DISPLAY = : 
1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Let's make a TRPO!\n", + "\n", + "In this notebook we will write the code for Trust Region Policy Optimization (TRPO).\n", + "As usual, it contains a few different parts which we are going to reproduce.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import theano\n", + "import theano.tensor as T" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Observation Space Box(6,)\n", + "Action Space Discrete(3)\n" + ] + } + ], + "source": [ + "import gym\n", + "\n", + "env = gym.make(\"Acrobot-v1\")\n", + "env.reset()\n", + "\n", + "observation_shape = env.observation_space.shape\n", + "n_actions = env.action_space.n\n", + "print(\"Observation Space\", env.observation_space)\n", + "print(\"Action Space\", env.action_space)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAQsAAAD8CAYAAABgtYFHAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAADktJREFUeJzt3W2opGd9x/Hvr5sHbRXXJMew7K5sxKXoizZmD3ElpdhE\nS0zFzYsEIlKXsLDQWlAs2E0LLUJfaF+YIBR1aaRrUZPUB7KEtDZsEkpfGHPWPLuNOUpqDhvclTzY\nItpG/30x19Fx9yTn2j0zZ2Z2vx8Y5rqv+7pn/hNmf7nue64zk6pCklbzG5MuQNJsMCwkdTEsJHUx\nLCR1MSwkdTEsJHUZS1gkuTrJk0kWk+wbx3NIWl8Z9TqLJBuA7wLvBpaAB4H3V9V3RvpEktbVOGYW\nlwOLVfX9qvpf4DZg1xieR9I6OmcMj7kZeGZoewl4+ysdcNFFF9W2bdvGUIqkZYcPH/5RVc2d7vHj\nCIus0HfSuU6SvcBegDe+8Y0sLCyMoRRJy5L811qOH8dpyBKwdWh7C3D0xEFVtb+q5qtqfm7utMNO\n0joZR1g8CGxPckmS84AbgINjeB5J62jkpyFV9VKSPwO+AWwAPl9VT4z6eSStr3Fcs6Cq7gbuHsdj\nS5oMV3BK6mJYSOpiWEjqYlhI6mJYSOpiWEjqYlhI6mJYSOpiWEjqYlhI6mJYSOpiWEjqYlhI6mJY\nSOpiWEjqYlhI6mJYSOpiWEjqYlhI6mJYSOpiWEjqYlhI6mJYSOpiWEjqYlhI6mJYSOpiWEjqYlhI\n6mJYSOpiWEjqYlhI6mJYSOpiWEjqYlhI6mJYSOqyalgk+XySY0keH+q7IMk9SZ5q969v/Uny6SSL\nSR5Nctk4i5e0fnpmFv8IXH1C3z7gUFVtBw61bYD3ANvbbS/wmdGUKWnSVg2Lqvp34LkTuncBB1r7\nAHDtUP8XauCbwMYkm0ZVrKTJOd1rFhdX1bMA7f4NrX8z8MzQuKXWd5Ike5MsJFk4fvz4aZYhab2M\n+gJnVuirlQZW1f6qmq+q+bm5uRGXIWnUTjcsfrh8etHuj7X+JWDr0LgtwNHTL0/StDjdsDgI7G7t\n3cCdQ/0fbJ+K7AReXD5dkTTbzlltQJIvA+8ELkqyBPwN8AngjiR7gB8A17fhdwPXAIvAT4Abx1Cz\npAlYNSyq6v0vs+uqFcYW8KG1FiVp+riCU1IXw0JSF8NCUhfDQlIXw0JSF8NCUhfDQlIXw0JSF8NC\nUpcMFl1OuIhk8kVIZ77DVTV/ugevutx7PezYsYOFhYVJlyGd0ZKVvkGin6chkroYFpK6GBaSuhgW\nkroYFpK6GBaSuhgWkroYFpK6GBaSuhgWkroYFpK6GBaSuhgWkroYFpK6GBaSuhgWkroYFpK6GBaS\nuhgWkroYFpK6GBaSuhgWkroYFpK6rBoWSbYmuS/JkSRPJPlw678gyT1Jnmr3r2/9SfLpJItJHk1y\n2bhfhKTx65lZvAT8eVW9BdgJfCjJW4F9wKGq2g4catsA7wG2t9te4DMjr1rSuls1LKrq2ar6dmv/\nN3AE2AzsAg60YQeAa1t7F/CFGvgmsDHJppFXLmldndI1iyTbgLcBDwAXV9WzMAgU4A1t2GbgmaHD\nllqfpBnWHRZJXgN8FfhIVf34lYau0HfSDx8n2ZtkIcnC8ePHe8uQNCFdYZHkXAZB8cWq+lrr/uHy\n6UW7P9b6l4CtQ4dvAY6e+JhVtb+q5qtqfm5u7nTrl7ROej4NCXArcKSqPjW06yCwu7V3A3cO9X+w\nfSqyE3hx+XRF0uw6p2PMFcAfA48lebj1/SXwCeCOJHuAHwDXt313A9cAi8BPgBtHWrGkiVg1LKrq\nP1j5OgTAVSuML+BDa6xL0pRxBaekLoaFpC6GhaQuhoWkLoaFpC6GhaQuhoWkLoaFpC6GhaQuhoWk\nLoaFpC49f0gm/dLhw7/+Z0I7dpz0VSU6QzmzULcTg+Ll+nRmM
izU5ZVCwcA4OxgWWlVPGBgYZz7D\nQlIXw0JSF8NCUhfDQquaZ2EkYzTbDAt1eaUwMCjODoaFuq0UCgbF2cMVnDolhsPZy5mFpC6GhaQu\nhoWkLoaFpC6GhaQuhoWkLoaFpC6GhaQuhoWkLoaFpC6GhaQuhoWkLoaFpC6rhkWSVyX5VpJHkjyR\n5OOt/5IkDyR5KsntSc5r/ee37cW2f9t4X4Kk9dAzs/gZcGVV/S5wKXB1kp3AJ4Gbq2o78Dywp43f\nAzxfVW8Gbm7jJM24VcOiBv6nbZ7bbgVcCXyl9R8Arm3tXW2btv+qJH5PvDTjuq5ZJNmQ5GHgGHAP\n8D3ghap6qQ1ZAja39mbgGYC2/0XgwhUec2+ShSQLx48fX9urkDR2XWFRVT+vqkuBLcDlwFtWGtbu\nV5pFnPSDmFW1v6rmq2p+bm6ut15JE3JKn4ZU1QvA/cBOYGOS5a/l2wIcbe0lYCtA2/864LlRFCtp\ncno+DZlLsrG1Xw28CzgC3Adc14btBu5s7YNtm7b/3qryp7alGdfzhb2bgANJNjAIlzuq6q4k3wFu\nS/K3wEPArW38rcA/JVlkMKO4YQx1S1pnq4ZFVT0KvG2F/u8zuH5xYv9PgetHUp2kqeEKTkldDAtJ\nXQwLrap27Jh0CZoChoWkLoaFpC6GhaQuhoWkLoaFpC6GhaQuhoWkLoaFpC6GhUYmhw9PugSNkWEh\nqYthIamLYSGpi2EhqYthodOywDwLzE+6DK0jw0Kn5MSQMDDOHoaFur1cMBgYZwfDQl1WCwQD48xn\nWGhkDIwzm2EhqYthIamLYaGRmWdh0iVojAwLdVktCAyKM59hoTUzKM4OPb91KgGGwtnOmYWkLoaF\npC6GhaQuhoWkLoaFpC6GhaQuhoWkLt1hkWRDkoeS3NW2L0nyQJKnktye5LzWf37bXmz7t42ndEnr\n6VRmFh8GjgxtfxK4uaq2A88De1r/HuD5qnozcHMbJ2nGdYVFki3AHwH/0LYDXAl8pQ05AFzb2rva\nNm3/VW28pBnWO7O4BfgY8Iu2fSHwQlW91LaXgM2tvRl4BqDtf7GNlzTDVg2LJO8FjlXV8G/TrTRT\nqI59w4+7N8lCkoXjx493FStpcnpmFlcA70vyNHAbg9OPW4CNSZb/EG0LcLS1l4CtAG3/64DnTnzQ\nqtpfVfNVNT83N7emFyFp/FYNi6q6qaq2VNU24Abg3qr6AHAfcF0bthu4s7UPtm3a/nur6qSZhaTZ\nspZ1Fn8BfDTJIoNrEre2/luBC1v/R4F9aytR0jQ4pe+zqKr7gftb+/vA5SuM+Slw/QhqkzRFXMEp\nqYthIamLYSGpi2EhqYthIamLYSGpi2EhqYthIamLYSGpi2EhqYthIamLYSGpi2EhqYthIamLYSGp\ni2EhqYthIamLYSGpi2GhLrVjx6RL0IQZFpK6GBaSuhgWkroYFpK6GBaSuhgWkroYFpK6GBYaqRw+\nPOkSNCaGhaQup/TDyDq7uYrz7ObMQlIXw0JSF8NCUhfDQlIXw0JSF8NCUpeusEjydJLHkjycZKH1\nXZDkniRPtfvXt/4k+XSSxSSPJrlsnC9A0vo4lZnFH1TVpVU137b3AYeqajtwqG0DvAfY3m57gc+M\nqlhJk7OW05BdwIHWPgBcO9T/hRr4JrAxyaY1PI+kKdC7grOAf0tSwOeqaj9wcVU9C1BVzyZ5Qxu7\nGXhm6Nil1vfs8AMm2ctg5gHwsySPn+ZrmISLgB9NuohOs1QrzFa9s1QrwG+v5eDesLiiqo62QLgn\nyX++wtis0FcndQwCZz9AkoWh05upN0v1zlKtMFv1zlKtMKh3Lcd3nYZU1dF2fwz4OnA58MPl04t2\nf6wNXwK2Dh2+BTi6liIlTd6qYZHkt5K8drkN/CHwOHAQ2N2G7QbubO2DwAfbpyI7gReXT1ckza6e\n05CLga8nWR7/par61yQPA
nck2QP8ALi+jb8buAZYBH4C3NjxHPtPtfAJm6V6Z6lWmK16Z6lWWGO9\nqTrpcoIkncQVnJK6TDwsklyd5Mm24nPf6keMvZ7PJzk2/FHuNK9WTbI1yX1JjiR5IsmHp7XmJK9K\n8q0kj7RaP976L0nyQKv19iTntf7z2/Zi279tvWodqnlDkoeS3DUDtY53pXVVTewGbAC+B7wJOA94\nBHjrhGv6feAy4PGhvr8D9rX2PuCTrX0N8C8MPi7eCTwwgXo3AZe19muB7wJvncaa23O+prXPBR5o\nNdwB3ND6Pwv8SWv/KfDZ1r4BuH0C/30/CnwJuKttT3OtTwMXndA3svfBur6YFV7cO4BvDG3fBNw0\nyZpaHdtOCIsngU2tvQl4srU/B7x/pXETrP1O4N3TXjPwm8C3gbczWNh0zonvCeAbwDta+5w2LutY\n4xYGf8pwJXBX+4c1lbW2510pLEb2Ppj0acjLrfacNr+2WhVYbbXqRLSp79sY/B97Kmtu0/qHGazL\nuYfBzPKFqnpphXp+WWvb/yJw4XrVCtwCfAz4Rdu+kOmtFX610vpwWyENI3wfTPoLe7tWe06xqak/\nyWuArwIfqaoft4+6Vxy6Qt+61VxVPwcuTbKRwQK/t7xCPROrNcl7gWNVdTjJOzvqmYb3wshXWg+b\n9MxiVlZ7TvVq1STnMgiKL1bV11r3VNdcVS8A9zM4X96YZPl/XMP1/LLWtv91wHPrVOIVwPuSPA3c\nxuBU5JYprRUY/0rrSYfFg8D2doX5PAYXhg5OuKaVTO1q1QymELcCR6rqU0O7pq7mJHNtRkGSVwPv\nAo4A9wHXvUyty6/hOuDeaifY41ZVN1XVlqraxuB9eW9VfWAaa4V1Wmm9nhdgXuaizDUMruB/D/ir\nKajnywz+Qvb/GKTvHgbnnoeAp9r9BW1sgL9vtT8GzE+g3t9jMH18FHi43a6ZxpqB3wEearU+Dvx1\n638T8C0Gq37/GTi/9b+qbS+2/W+a0Hvinfzq05CprLXV9Ui7PbH8b2mU7wNXcErqMunTEEkzwrCQ\n1MWwkNTFsJDUxbCQ1MWwkNTFsJDUxbCQ1OX/ASKF9tx4Ki+cAAAAAElFTkSuQmCC\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "%matplotlib inline\n", + "plt.imshow(env.render('rgb_array'))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 1: Defining a network\n", + "\n", + "With all it's complexity, at it's core TRPO is yet another policy gradient method. \n", + "\n", + "This essentially means we're actually training a stochastic policy $ \\pi_\\theta(a|s) $. \n", + "\n", + "And yes, it's gonna be a neural network. So let's start by defining one." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# input tensors\n", + "observations = T.matrix(name=\"obs\")\n", + "actions = T.ivector(name=\"action\")\n", + "cummulative_returns = T.vector(name=\"G = r + gamma*r' + gamma^2*r'' + ...\")\n", + "old_probs = T.matrix(name=\"action probabilities from previous iteration\")\n", + "\n", + "all_inputs = [observations, actions, cummulative_returns, old_probs]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Create neural network.\n", + "from lasagne.layers import *\n", + "\n", + "nn = InputLayer((None,)+observation_shape, input_var=observations)\n", + "\n", + "\n", + "\n", + "policy = \n", + "\n", + "probs = get_output(policy)\n", + "\n", + "weights = get_all_params(policy, trainable=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 2: Actions and rollouts\n", + "\n", + "In this section, we'll define functions that take actions $ a \\sim \\pi_\\theta(a|s) $ and rollouts $ $." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# compile function\n", + "get_policy = theano.function([observations], probs, allow_input_downcast=True)\n", + "\n", + "\n", + "def act(obs, sample=True):\n", + " \"\"\"\n", + " Samples action from policy distribution (sample = True) or takes most likely action (sample = False)\n", + " :param: obs - single observation vector\n", + " :param sample: if True, samples from \\pi, otherwise takes most likely action\n", + " :returns: action (single integer) and probabilities for all actions\n", + " \"\"\"\n", + "\n", + " policy = get_policy([obs])[0]\n", + "\n", + " if sample:\n", + " action = int(np.random.choice(n_actions, p=policy))\n", + " else:\n", + " action = int(np.argmax(policy))\n", + "\n", + " return action, policy" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# demo\n", + "print(\"sampled:\", [act(env.reset()) for _ in range(100)])\n", + "print(\"greedy:\", [act(env.reset(), sample=False) for _ in range(100)])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Compute cummulative reward just like you did in vanilla REINFORCE" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import scipy.signal\n", + "\n", + "\n", + "def get_cummulative_returns(r, gamma=1):\n", + " \"\"\"\n", + " Computes cummulative discounted rewards given immediate rewards\n", + " G_i = r_i + gamma*r_{i+1} + gamma^2*r_{i+2} + ...\n", + " Also known as R(s,a).\n", + " \"\"\"\n", + " r = np.array(r)\n", + " assert r.ndim >= 1\n", + " return scipy.signal.lfilter([1], [1, -gamma], r[::-1], axis=0)[::-1]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# simple demo on 
rewards [0,0,1,0,0,1]\n", + "get_cummulative_returns([0, 0, 1, 0, 0, 1], gamma=0.9)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Rollout**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def rollout(env, act, max_pathlength=2500, n_timesteps=50000):\n", + " \"\"\"\n", + " Generate rollouts for training.\n", + " :param: env - environment in which we will make actions to generate rollouts.\n", + " :param: act - the function that can return policy and action given observation.\n", + " :param: max_pathlength - maximum size of one path that we generate.\n", + " :param: n_timesteps - total sum of sizes of all pathes we generate.\n", + " \"\"\"\n", + " paths = []\n", + "\n", + " total_timesteps = 0\n", + " while total_timesteps < n_timesteps:\n", + " obervations, actions, rewards, action_probs = [], [], [], []\n", + " obervation = env.reset()\n", + " for _ in range(max_pathlength):\n", + " action, policy = act(obervation)\n", + " obervations.append(obervation)\n", + " actions.append(action)\n", + " action_probs.append(policy)\n", + " obervation, reward, done, _ = env.step(action)\n", + " rewards.append(reward)\n", + " total_timesteps += 1\n", + " if done or total_timesteps == n_timesteps:\n", + " path = {\"observations\": np.array(obervations),\n", + " \"policy\": np.array(action_probs),\n", + " \"actions\": np.array(actions),\n", + " \"rewards\": np.array(rewards),\n", + " \"cumulative_returns\": get_cummulative_returns(rewards),\n", + " }\n", + " paths.append(path)\n", + " break\n", + " return paths" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "paths = rollout(env, act, max_pathlength=5, n_timesteps=100)\n", + "print(paths[-1])\n", + "assert (paths[0]['policy'].shape == (5, n_actions))\n", + "assert (paths[0]['cumulative_returns'].shape == (5,))\n", + 
"assert (paths[0]['rewards'].shape == (5,))\n", + "assert (paths[0]['observations'].shape == (5,)+observation_shape)\n", + "assert (paths[0]['actions'].shape == (5,))\n", + "print('It\\'s ok')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 3: loss functions\n", + "\n", + "Now let's define the loss functions and constraints for actual TRPO training." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The surrogate reward should be\n", + "$$J_{surr}= {1 \\over N} \\sum\\limits_{i=0}^N \\frac{\\pi_{\\theta}(s_i, a_i)}{\\pi_{\\theta_{old}}(s_i, a_i)}A_{\\theta_{old}(s_i, a_i)}$$\n", + "\n", + "For simplicity, let's use cummulative returns instead of advantage for now:\n", + "$$J'_{surr}= {1 \\over N} \\sum\\limits_{i=0}^N \\frac{\\pi_{\\theta}(s_i, a_i)}{\\pi_{\\theta_{old}}(s_i, a_i)}G_{\\theta_{old}(s_i, a_i)}$$\n", + "\n", + "Or alternatively, minimize the surrogate loss:\n", + "$$ L_{surr} = - J'_{surr} $$" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# select probabilities of chosen actions\n", + "batch_size = actions.shape[0]\n", + "\n", + "probs_for_actions = probs[T.arange(batch_size), actions]\n", + "old_probs_for_actions = old_probs[T.arange(batch_size), actions]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Compute surrogate loss: negative importance-sampled policy gradient\n", + "\n", + "L_surr = " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# compute and return surrogate policy gradient\n", + "\n", + "\n", + "def get_flat_gradient(loss, var_list):\n", + " \"\"\"gradient of loss wrt var_list flattened into a large vector\"\"\"\n", + " grads = T.grad(loss, var_list)\n", + " return T.concatenate([grad.ravel() for grad in 
grads])\n", + "\n", + "\n", + "get_surrogate_gradients = theano.function(all_inputs, get_flat_gradient(L_surr, weights),\n", + " allow_input_downcast=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can ascend these gradients as long as our $\\pi_\\theta(a|s)$ satisfies the constraint\n", + "$$E_{s,\\pi_{\\Theta_{t}}}\\Big[KL(\\pi(\\Theta_{t}, s) \\:||\\:\\pi(\\Theta_{t+1}, s))\\Big]< \\alpha$$\n", + "\n", + "\n", + "where\n", + "\n", + "$$KL(p||q) = E_p \\log({p \\over q})$$" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Compute Kullback-Leibler divergence (see formula above)\n", + "# Note: you need to sum KL and entropy over all actions, not just the ones agent took\n", + "old_log_probs = T.log(old_probs + 1e-10)\n", + "\n", + "kl = \n", + "\n", + "# Compute policy entropy\n", + "entropy = \n", + "\n", + "compute_losses = theano.function(all_inputs, [L_surr, kl, entropy],\n", + " allow_input_downcast=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Line search**\n", + "\n", + "TRPO at its core involves ascending surrogate policy gradient constrained by KL divergence. \n", + "\n", + "In order to enforce this constraint, we're gonna use linesearch. 
You can find out more about it [here](https://en.wikipedia.org/wiki/Line_search)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def linesearch(f, x, fullstep, max_kl):\n", + " \"\"\"\n", + " Linesearch finds the best parameters of neural networks in the direction of fullstep contrainted by KL divergence.\n", + " :param: f - function that returns loss, kl and arbitrary third component.\n", + " :param: x - old parameters of neural network.\n", + " :param: fullstep - direction in which we make search.\n", + " :param: max_kl - constraint of KL divergence.\n", + " :returns:\n", + " \"\"\"\n", + " max_backtracks = 10\n", + " loss, _, _ = f(x)\n", + " for stepfrac in .5 ** np.arange(max_backtracks):\n", + " xnew = x + stepfrac * fullstep\n", + " new_loss, kl, _ = f(xnew)\n", + " actual_improve = new_loss - loss\n", + " if kl <= max_kl and actual_improve < 0:\n", + " x = xnew\n", + " loss = new_loss\n", + " return x" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 4: training\n", + "In this section we construct the remaining parts of our computational graph" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def slice_vector(vector, shapes):\n", + " \"\"\"\n", + " Slices symbolic vector into several symbolic tensors of given shapes.\n", + " Auxilary function used to un-flatten gradients, tangents etc.\n", + " :param vector: 1-dimensional symbolic vector\n", + " :param shapes: list or tuple of shapes (list, tuple or symbolic)\n", + " :returns: list of symbolic tensors of given shapes\n", + " \"\"\"\n", + " assert vector.ndim == 1, \"vector must be 1-dimensional\"\n", + " start = 0\n", + " tensors = []\n", + " for shape in shapes:\n", + " size = T.prod(shape)\n", + " tensor = vector[start:(start + size)].reshape(shape)\n", + " tensors.append(tensor)\n", + " start += 
size\n", + " return tensors" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "conjugate_grad_intermediate_vector = T.vector(\n", + " \"intermediate grad in conjugate_gradient\")\n", + "\n", + "# slice flat_tangent into chunks for each weight\n", + "weight_shapes = [var.get_value().shape for var in weights]\n", + "tangents = slice_vector(conjugate_grad_intermediate_vector, weight_shapes)\n", + "\n", + "# KL divergence where first arg is fixed\n", + "from theano.gradient import disconnected_grad as const\n", + "kl_firstfixed = (const(probs) * (const(T.log(probs)) -\n", + " T.log(probs))).sum(axis=-1).mean()\n", + "\n", + "# compute fisher information matrix (used for conjugate gradients and to estimate KL)\n", + "gradients = T.grad(kl_firstfixed, weights)\n", + "gradient_vector_product = [T.sum(g * t) for (g, t) in zip(gradients, tangents)]\n", + "\n", + "fisher_vector_product = get_flat_gradient(\n", + " sum(gradient_vector_product), weights)\n", + "\n", + "compute_fisher_vector_product = theano.function(\n", + " [observations, conjugate_grad_intermediate_vector], fisher_vector_product)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### TRPO helpers\n", + "\n", + "Here we define a few helper functions used in the main TRPO loop" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Conjugate gradients**\n", + "\n", + "Since TRPO includes contrainted optimization, we will need to solve Ax=b using conjugate gradients.\n", + "\n", + "In general, CG is an algorithm that solves Ax=b where A is positive-defined. A is Hessian matrix so A is positive-defined. 
You can find out more about them [here](https://en.wikipedia.org/wiki/Conjugate_gradient_method)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from numpy.linalg import inv\n", + "\n", + "\n", + "def conjugate_gradient(f_Ax, b, cg_iters=10, residual_tol=1e-10):\n", + " \"\"\"\n", + " This method solves system of equation Ax=b using iterative method called conjugate gradients\n", + " :f_Ax: function that returns Ax\n", + " :b: targets for Ax\n", + " :cg_iters: how many iterations this method should do\n", + " :residual_tol: epsilon for stability\n", + " \"\"\"\n", + " p = b.copy()\n", + " r = b.copy()\n", + " x = np.zeros_like(b)\n", + " rdotr = r.dot(r)\n", + " for i in range(cg_iters):\n", + " z = f_Ax(p)\n", + " v = rdotr / (p.dot(z) + 1e-8)\n", + " x += v * p\n", + " r -= v * z\n", + " newrdotr = r.dot(r)\n", + " mu = newrdotr / (rdotr + 1e-8)\n", + " p = r + mu * p\n", + " rdotr = newrdotr\n", + " if rdotr < residual_tol:\n", + " break\n", + " return x" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# This code validates conjugate gradients\n", + "A = np.random.rand(8, 8)\n", + "A = np.matmul(np.transpose(A), A)\n", + "\n", + "\n", + "def f_Ax(x):\n", + " return np.matmul(A, x.reshape(-1, 1)).reshape(-1)\n", + "\n", + "\n", + "b = np.random.rand(8)\n", + "\n", + "w = np.matmul(np.matmul(inv(np.matmul(np.transpose(A), A)),\n", + " np.transpose(A)), b.reshape((-1, 1))).reshape(-1)\n", + "print(w)\n", + "print(conjugate_gradient(f_Ax, b))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Compile a function that exports network weights as a vector\n", + "flat_weights = T.concatenate([var.ravel() for var in weights])\n", + "get_flat_weights = theano.function([], flat_weights)\n", + "\n", + "# ... 
and another function that imports vector back into network weights\n", + "flat_weights_placeholder = T.vector(\"flattened weights\")\n", + "assigns = slice_vector(flat_weights_placeholder, weight_shapes)\n", + "\n", + "load_flat_weights = theano.function(\n", + " [flat_weights_placeholder], updates=dict(zip(weights, assigns)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Step 5: Main TRPO loop\n", + "\n", + "Here we will train our network!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import time\n", + "from itertools import count\n", + "from collections import OrderedDict\n", + "\n", + "# this is hyperparameter of TRPO. It controls how big KL divergence may be between old and new policy every step.\n", + "max_kl = 0.01\n", + "cg_damping = 0.1 # This parameters regularize addition to\n", + "numeptotal = 0 # this is number of episodes that we played.\n", + "\n", + "start_time = time.time()\n", + "\n", + "for i in count(1):\n", + "\n", + " print(\"\\n********** Iteration %i ************\" % i)\n", + "\n", + " # Generating paths.\n", + " print(\"Rollout\")\n", + " paths = rollout(env, act)\n", + " print(\"Made rollout\")\n", + "\n", + " # Updating policy.\n", + " observations = np.concatenate([path[\"observations\"] for path in paths])\n", + " actions = np.concatenate([path[\"actions\"] for path in paths])\n", + " returns = np.concatenate([path[\"cumulative_returns\"] for path in paths])\n", + " old_probs = np.concatenate([path[\"policy\"] for path in paths])\n", + " inputs_batch = [observations, actions, returns, old_probs]\n", + "\n", + " old_weights = get_flat_weights()\n", + "\n", + " def fisher_vector_product(p):\n", + " \"\"\"gets intermediate grads (p) and computes fisher*vector \"\"\"\n", + " return compute_fisher_vector_product(observations, p) + cg_damping * p\n", + "\n", + " flat_grad = get_surrogate_gradients(*inputs_batch)\n", 
+ "\n", + " stepdir = conjugate_gradient(fisher_vector_product, -flat_grad)\n", + " shs = .5 * stepdir.dot(fisher_vector_product(stepdir))\n", + " lm = np.sqrt(shs / max_kl)\n", + " fullstep = stepdir / lm\n", + "\n", + " # Compute new weights with linesearch in the direction we found with CG\n", + "\n", + " def losses_f(flat_weights):\n", + " load_flat_weights(flat_weights)\n", + " return compute_losses(*inputs_batch)\n", + "\n", + " new_weights = linesearch(losses_f, old_weights, fullstep, max_kl)\n", + "\n", + " load_flat_weights(new_weights)\n", + "\n", + " # Report current progress\n", + " L_surr, kl, entropy = compute_losses(*inputs_batch)\n", + " episode_rewards = np.array([path[\"rewards\"].sum() for path in paths])\n", + "\n", + " stats = OrderedDict()\n", + " numeptotal += len(episode_rewards)\n", + " stats[\"Total number of episodes\"] = numeptotal\n", + " stats[\"Average sum of rewards per episode\"] = episode_rewards.mean()\n", + " stats[\"Std of rewards per episode\"] = episode_rewards.std()\n", + " stats[\"Entropy\"] = entropy\n", + " stats[\"Time elapsed\"] = \"%.2f mins\" % ((time.time() - start_time)/60.)\n", + " stats[\"KL between old and new distribution\"] = kl\n", + " stats[\"Surrogate loss\"] = L_surr\n", + " for k, v in stats.items():\n", + " print(k + \": \" + \" \" * (40 - len(k)) + str(v))\n", + " i += 1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Homework option I: better sampling (10+pts)\n", + "\n", + "In this section, you're invited to implement a better rollout strategy called _vine_.\n", + "\n", + "![img](https://s17.postimg.org/i90chxgvj/vine.png)\n", + "\n", + "In most gym environments, you can actually backtrack by using states. 
You can find a wrapper that saves/loads states in [the mcts seminar](https://github.com/yandexdataschool/Practical_RL/blob/spring19/week10_planning/seminar_MCTS.ipynb).\n", + "\n", + "You can read more about it in the [TRPO article](https://arxiv.org/abs/1502.05477) in section 5.2.\n", + "\n", + "The goal here is to implement such a rollout policy (we recommend using a tree data structure like in the seminar above).\n", + "Then you can assign cumulative rewards similar to `get_cummulative_returns`, but for a tree.\n", + "\n", + "__bonus task__ - parallelize samples using multiple cores" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "# Homework option II (10+pts)\n", + "\n", + "Let's use TRPO to train evil robots! (pick either of the two)\n", + "* [MuJoCo robots](https://gym.openai.com/envs#mujoco)\n", + "* [Box2d robot](https://gym.openai.com/envs/BipedalWalker-v2)\n", + "\n", + "The catch here is that those environments have continuous action spaces. \n", + "\n", + "Luckily, TRPO is a policy gradient method, so it's gonna work for any parametric $\\pi_\\theta(a|s)$. 
We recommend starting with a gaussian policy:\n", + "\n", + "$$\\pi_\\theta(a|s) = N(\\mu_\\theta(s),\\sigma^2_\\theta(s)) = {1 \\over \\sqrt { 2 \\pi {\\sigma^2}_\\theta(s) } } e^{ - { (a - \n", + "\\mu_\\theta(s))^2 \\over 2 {\\sigma^2}_\\theta(s) } } $$\n", + "\n", + "In the $\\sqrt { 2 \\pi {\\sigma^2}_\\theta(s) }$ clause, $\\pi$ means ~3.1415926, not the agent's policy.\n", + "\n", + "This essentially means that you will need two output layers:\n", + "* $\\mu_\\theta(s)$, a dense layer with linear activation\n", + "* ${\\sigma^2}_\\theta(s)$, a dense layer with activation T.exp (to make it positive; like rho from bandits)\n", + "\n", + "For multidimensional actions, you can use a fully factorized gaussian (basically a vector of gaussians).\n", + "\n", + "__bonus task__: compare performance of continuous action space method to action space discretization" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/week10_planning/README.md b/week10_planning/README.md new file mode 100644 index 000000000..ca7689341 --- /dev/null +++ b/week10_planning/README.md @@ -0,0 +1,26 @@ +## Assignments + +Just as usual, start with `seminar_MCTS.ipynb` +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/yandexdataschool/Practical_RL/blob/spring19/week10_planning/seminar_MCTS.ipynb) + +## Materials: planning + +* Planning by dynamic programming (D. 
Silver) - [video](https://www.youtube.com/watch?v=Nd1-UUMVfz4) +* Planning via tree search [videos 2-6 from CS188](https://www.youtube.com/channel/UCHBzJsIcRIVuzzHVYabikTQ) +* Our lecture: + * [Lecture(planning, MCTS, planning in POMDP)](https://yadi.sk/i/lOAUu7o13JBHFz) & [Seminar(MCTS)](https://yadi.sk/i/bkmjEZrk3JBHGF) + * Slides [part1](https://yadi.sk/i/3PM9zCP33J3ub3) (intro), [part2](https://yadi.sk/i/M03xvZ2y3JMQre) (pomdp) +* Monte-carlo tree search + * Monte-carlo tree search step-by-step by J.Levine (very intuitive) - [video](https://www.youtube.com/watch?v=UXW2yZndl7U) + * Udacity video on monte-carlo tree search (first part of a chain) - [video](https://www.youtube.com/watch?v=onBYsen2_eA) + * A Survey of Monte Carlo Tree Search Methods (2011-2012) [pdf](http://mcts.ai/pubs/mcts-survey-master.pdf) + * Reminder: UCB-1 - [slides](https://www.cs.bham.ac.uk/internal/courses/robotics/lectures/ucb1.pdf) + * Guide to MCTS - [url](https://jeffbradberry.com/posts/2015/09/intro-to-monte-carlo-tree-search/) + or [cached](https://webcache.googleusercontent.com/search?q=cache:jeffbradberry.com/posts/2015/09/intro-to-monte-carlo-tree-search/) + +## Supplementary materials + +* Integrating learning and planning (D. 
Silver) - [video](https://www.youtube.com/watch?v=ItMutbeOHtc) +* Approximating the MCTS optimal actions - 5vision solution for deephack.RL, code by Mikhail Pavlov - [repo](https://github.com/5vision/uct_atari) +* Alpha Go - [explanation](https://medium.com/@jonathan_hui/alphago-how-it-works-technically-26ddcc085319) +* Monte Carlo Planning in POMDP [pdf](https://papers.nips.cc/paper/4031-monte-carlo-planning-in-large-pomdps.pdf) diff --git a/week10_planning/seminar_MCTS.ipynb b/week10_planning/seminar_MCTS.ipynb new file mode 100644 index 000000000..a6571e6c3 --- /dev/null +++ b/week10_planning/seminar_MCTS.ipynb @@ -0,0 +1,686 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# In google collab, uncomment this:\n", + "# !wget https://bit.ly/2FMJP5K -q -O setup.py\n", + "# !bash setup.py 2>&1 1>stdout.log | tee stderr.log\n", + "\n", + "# This code creates a virtual display to draw game images on.\n", + "# If you are running locally, just ignore it\n", + "import os\n", + "if type(os.environ.get(\"DISPLAY\")) is not str or len(os.environ.get(\"DISPLAY\")) == 0:\n", + " !bash ../xvfb start\n", + " %env DISPLAY = : 1\n", + "\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Seminar: Monte-carlo tree search (5 pts)\n", + "\n", + "In this seminar, we'll implement a vanilla MCTS planning and use it to solve some Gym envs.\n", + "\n", + "But before we do that, we first need to modify gym env to allow saving and loading game states to facilitate backtracking." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import gym\n", + "from gym.core import Wrapper\n", + "from pickle import dumps, loads\n", + "from collections import namedtuple\n", + "\n", + "# a container for get_result function below. 
Works just like tuple, but prettier\n", + "ActionResult = namedtuple(\n", + " \"action_result\", (\"snapshot\", \"observation\", \"reward\", \"is_done\", \"info\"))\n", + "\n", + "\n", + "class WithSnapshots(Wrapper):\n", + " \"\"\"\n", + " Creates a wrapper that supports saving and loading environemnt states.\n", + " Required for planning algorithms.\n", + "\n", + " This class will have access to the core environment as self.env, e.g.:\n", + " - self.env.reset() #reset original env\n", + " - self.env.ale.cloneState() #make snapshot for atari. load with .restoreState()\n", + " - ...\n", + "\n", + " You can also use reset, step and render directly for convenience.\n", + " - s, r, done, _ = self.step(action) #step, same as self.env.step(action)\n", + " - self.render(close=True) #close window, same as self.env.render(close=True)\n", + " \"\"\"\n", + "\n", + " def get_snapshot(self, render=False):\n", + " \"\"\"\n", + " :returns: environment state that can be loaded with load_snapshot \n", + " Snapshots guarantee same env behaviour each time they are loaded.\n", + "\n", + " Warning! Snapshots can be arbitrary things (strings, integers, json, tuples)\n", + " Don't count on them being pickle strings when implementing MCTS.\n", + "\n", + " Developer Note: Make sure the object you return will not be affected by \n", + " anything that happens to the environment after it's saved.\n", + " You shouldn't, for example, return self.env. 
\n", + " In case of doubt, use pickle.dumps or deepcopy.\n", + "\n", + " \"\"\"\n", + " if render:\n", + " self.render() # close popup windows since we can't pickle them\n", + " self.close()\n", + " \n", + " if self.unwrapped.viewer is not None:\n", + " self.unwrapped.viewer.close()\n", + " self.unwrapped.viewer = None\n", + " return dumps(self.env)\n", + "\n", + " def load_snapshot(self, snapshot, render=False):\n", + " \"\"\"\n", + " Loads snapshot as current env state.\n", + " Should not change snapshot inplace (in case of doubt, deepcopy).\n", + " \"\"\"\n", + "\n", + " assert not hasattr(self, \"_monitor\") or hasattr(\n", + " self.env, \"_monitor\"), \"can't backtrack while recording\"\n", + "\n", + " if render:\n", + " self.render() # close popup windows since we can't load into them\n", + " self.close()\n", + " self.env = loads(snapshot)\n", + "\n", + " def get_result(self, snapshot, action):\n", + " \"\"\"\n", + " A convenience function that \n", + " - loads snapshot, \n", + " - commits action via self.step,\n", + " - and takes snapshot again :)\n", + "\n", + " :returns: next snapshot, next_observation, reward, is_done, info\n", + "\n", + " Basically it returns next snapshot and everything that env.step would have returned.\n", + " \"\"\"\n", + "\n", + " \n", + "\n", + " return ActionResult(< next_snapshot > , #fill in the variables\n", + " < next_observation > ,\n", + " < reward > , < is_done > , < info > )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Try out snapshots:\n", + "Let`s check our wrapper. At first, reset environment and save it, further randomly play some actions and restore our environment from the snapshot. It should be the same as our previous initial state." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# make env\n", + "env = WithSnapshots(gym.make(\"CartPole-v0\"))\n", + "env.reset()\n", + "\n", + "n_actions = env.action_space.n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"initial_state:\")\n", + "plt.imshow(env.render('rgb_array'))\n", + "env.close()\n", + "\n", + "# create first snapshot\n", + "snap0 = env.get_snapshot()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# play without making snapshots (faster)\n", + "while True:\n", + " is_done = env.step(env.action_space.sample())[2]\n", + " if is_done:\n", + " print(\"Whoops! We died!\")\n", + " break\n", + "\n", + "print(\"final state:\")\n", + "plt.imshow(env.render('rgb_array'))\n", + "env.close()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# reload initial state\n", + "env.load_snapshot(snap0)\n", + "\n", + "print(\"\\n\\nAfter loading snapshot\")\n", + "plt.imshow(env.render('rgb_array'))\n", + "env.close()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# get outcome (snapshot, observation, reward, is_done, info)\n", + "res = env.get_result(snap0, env.action_space.sample())\n", + "\n", + "snap1, observation, reward = res[:3]\n", + "\n", + "# second step\n", + "res2 = env.get_result(snap1, env.action_space.sample())" + ] + }, + { + "attachments": { + "MCTS.png": { + "image/png": "" + } + }, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# MCTS: Monte-Carlo tree search\n", + "\n", + "![MCTS.png](attachment:MCTS.png)\n", + "\n", + "In this section, we'll implement the vanilla MCTS algorithm with UCB1-based node selection.\n", + "\n", + "$$\n", + "\\dot{v_a} = v_a + C_n 
\\sqrt{\\frac{2 \\log {N}}{n_a}}\n", + "$$\n", + "\n", + "where: \n", + "- $N$ - number of time-steps so far,\n", + "- $n_a$ - number of times action $a$ was taken,\n", + "- $C_n$ - balance between exploration and exploitation; for rewards $R \\in [0,1]$ one can take $C_n = \\frac{1}{\\sqrt{2}}$ \n", + "[paper](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.374.1202&rep=rep1&type=pdf)\n", + "\n", + "We will start by implementing the `Node` class - a simple class that acts like an MCTS node and supports some of the MCTS algorithm steps.\n", + "\n", + "This MCTS implementation makes some assumptions about the environment; you can find those _in the notes section at the end of the notebook_." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "assert isinstance(env, WithSnapshots)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class Node:\n", + " \"\"\" a tree node for MCTS \"\"\"\n", + "\n", + " # metadata:\n", + " parent = None # parent Node\n", + " value_sum = 0. 
# sum of state values from all visits (numerator)\n", + " times_visited = 0 # counter of visits (denominator)\n", + "\n", + " def __init__(self, parent, action,):\n", + " \"\"\"\n", + " Creates and empty node with no children.\n", + " Does so by commiting an action and recording outcome.\n", + "\n", + " :param parent: parent Node\n", + " :param action: action to commit from parent Node\n", + "\n", + " \"\"\"\n", + "\n", + " self.parent = parent\n", + " self.action = action\n", + " self.children = set() # set of child nodes\n", + "\n", + " # get action outcome and save it\n", + " res = env.get_result(parent.snapshot, action)\n", + " self.snapshot, self.observation, self.immediate_reward, self.is_done, _ = res\n", + "\n", + " def is_leaf(self):\n", + " return len(self.children) == 0\n", + "\n", + " def is_root(self):\n", + " return self.parent is None\n", + "\n", + " def get_mean_value(self):\n", + " return self.value_sum / self.times_visited if self.times_visited != 0 else 0\n", + "\n", + " def ucb_score(self, scale=10, max_value=1e100):\n", + " \"\"\"\n", + " Computes ucb1 upper bound using current value and visit counts for node and it's parent.\n", + "\n", + " :param scale: Multiplies upper bound by that. 
From hoeffding inequality, assumes reward range to be [0,scale].\n", + " :param max_value: a value that represents infinity (for unvisited nodes)\n", + "\n", + " \"\"\"\n", + "\n", + " if self.times_visited == 0:\n", + " return max_value\n", + "\n", + " # compute ucb-1 additive component (to be added to mean value)\n", + " # hint: you can use self.parent.times_visited for N times node was considered,\n", + " # and self.times_visited for n times it was visited\n", + "\n", + " U = \n", + "\n", + " return self.get_mean_value() + scale*U\n", + "\n", + " # MCTS steps\n", + "\n", + " def select_best_leaf(self):\n", + " \"\"\"\n", + " Picks the leaf with highest priority to expand\n", + " Does so by recursively picking nodes with best UCB-1 score until it reaches the leaf.\n", + "\n", + " \"\"\"\n", + " if self.is_leaf():\n", + " return self\n", + "\n", + " children = self.children\n", + "\n", + " best_child = \n", + "\n", + " if node.is_done:\n", + " node.propagate(0)\n", + "\n", + " else: # node is not terminal\n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Plan and execute\n", + "In this section, we use the MCTS implementation to find optimal policy." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "env = WithSnapshots(gym.make(\"CartPole-v0\"))\n", + "root_observation = env.reset()\n", + "root_snapshot = env.get_snapshot()\n", + "root = Root(root_snapshot, root_observation)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# plan from root:\n", + "plan_mcts(root, n_iters=1000)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# import copy\n", + "# saved_root = copy.deepcopy(root)\n", + "# root = saved_root" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from IPython.display import clear_output\n", + "from itertools import count\n", + "from gym.wrappers import Monitor\n", + "\n", + "total_reward = 0 # sum of rewards\n", + "test_env = loads(root_snapshot) # env used to show progress\n", + "\n", + "for i in count():\n", + "\n", + " # get best child\n", + " best_child = ');\n", - " fmt_picker.addClass('mpl-toolbar-option ui-widget ui-widget-content');\n", - " fmt_picker_span.append(fmt_picker);\n", - " nav_element.append(fmt_picker_span);\n", - " this.format_dropdown = fmt_picker[0];\n", - "\n", - " for (var ind in mpl.extensions) {\n", - " var fmt = mpl.extensions[ind];\n", - " var option = $(\n", - " '