From edfdd1ccfd7f690debbe658e61a3f579d6dffba5 Mon Sep 17 00:00:00 2001
From: Michael Solotky <25284863+MichaelSolotky@users.noreply.github.com>
Date: Sun, 9 Aug 2020 14:23:28 +0300
Subject: [PATCH] Upgrade atari_wrapper to tf2 (#452)

* add actor-critic theory
* update tf functions for tf2 api
* bug + code style fix
* revert the import of tf in a method
* remove TFSummaries
* add tf2 summaries
* remove log_dir from TFSummaries
* bring back links to a2c algo description
* unify notation in formulas
* Add add_summary_scalar() stub to SummariesBase
* Replace default False value with a more idiomatic None

Co-authored-by: MichaelSolotky <>
Co-authored-by: Lionel Miller
---
 week06_policy_based/a2c-optional.ipynb | 74 ++++++++++++++++++++++++--
 week06_policy_based/atari_wrappers.py  | 56 ++++++++++++-------
 2 files changed, 107 insertions(+), 23 deletions(-)

diff --git a/week06_policy_based/a2c-optional.ipynb b/week06_policy_based/a2c-optional.ipynb
index 4cb4186ad..316a4e21e 100644
--- a/week06_policy_based/a2c-optional.ipynb
+++ b/week06_policy_based/a2c-optional.ipynb
@@ -144,10 +144,10 @@
     "To train the part of the model that predicts state values you will need to compute the value targets. \n",
     "Any callable could be passed to `EnvRunner` to be applied to each partial trajectory after it is collected. \n",
     "Thus, we can implement and use `ComputeValueTargets` callable. \n",
-    "The formula for the value targets is simple:\n",
+    "The formula for the value targets is simple; it is the right-hand side of the following equation:\n",
     "\n",
     "$$\n",
-    "\hat v(s_t) = \left( \sum_{t'=0}^{T - 1 - t} \gamma^{t'}r_{t+t'} \right) + \gamma^T \hat{v}(s_{t+T}),\n",
+    "V(s_t) = \left( \sum_{t'=0}^{T - 1 - t} \gamma^{t'} \cdot r(s_{t+t'}, a_{t+t'}) \right) + \gamma^{T - t} \cdot V(s_T),\n",
     "$$\n",
     "\n",
     "In implementation, however, do not forget to use \n",
@@ -165,7 +165,7 @@
     "class ComputeValueTargets:\n",
     "    def __init__(self, policy, gamma=0.99):\n",
     "        self.policy = policy\n",
-    "        \n",
+    "\n",
     "    def __call__(self, trajectory):\n",
     "        # This method should modify trajectory inplace by adding\n",
     "        # an item with key 'value_targets' to it.\n",
@@ -214,7 +214,58 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Now is the time to implement the advantage actor critic algorithm itself. You can look into your lecture,\n",
+    "# Actor-critic objective\n",
+    "\n",
+    "Here we define a loss function that uses the rollout above to train an advantage actor-critic agent.\n",
+    "\n",
+    "\n",
+    "Our loss consists of three components:\n",
+    "\n",
+    "* __The policy \"loss\"__\n",
+    "  $$ \hat J = {1 \over T} \cdot \sum_t { \log \pi(a_t | s_t) } \cdot A_{const}(s_t, a_t) $$\n",
+    "  * This function has no meaning in and of itself, but it was built such that\n",
+    "  * $ \nabla \hat J = {1 \over T} \cdot \sum_t { \nabla \log \pi(a_t | s_t) } \cdot A(s_t, a_t) \approx \nabla E_{s, a \sim \pi} R(s,a) $\n",
+    "  * Therefore if we __maximize__ $\hat J$ with gradient ascent we will maximize the expected reward\n",
+    "  \n",
+    "  \n",
+    "* __The value \"loss\"__\n",
+    "  $$ L_{td} = {1 \over T} \cdot \sum_t { [r(s_t, a_t) + \gamma \cdot V_{const}(s_{t+1}) - V(s_t)] ^ 2 }$$\n",
+    "  * Ye Olde TD loss from Q-learning and the like\n",
+    "  * If we minimize this loss, $V(s)$ will converge to $V_\pi(s) = E_{a \sim \pi(a | s)} R(s,a) $\n",
+    "\n",
+    "\n",
+    "* __Entropy Regularizer__\n",
+    "  $$ H = - {1 \over T} \sum_t \sum_a {\pi(a|s_t) \cdot \log \pi (a|s_t)}$$\n",
+    "  * If we __maximize__ entropy we discourage the agent from assigning zero probability to actions\n",
+    "    prematurely (a.k.a. exploration)\n",
+    "  \n",
+    "  \n",
+    "So we optimize a linear combination of $L_{td}$, $-\hat J$ and $-H$.\n",
+    "  \n",
+    "```\n",
+    "\n",
+    "```\n",
+    "\n",
+    "```\n",
+    "\n",
+    "```\n",
+    "\n",
+    "```\n",
+    "\n",
+    "```\n",
+    "\n",
+    "\n",
+    "__One more thing:__ since we train on T-step rollouts, we can use the N-step formula for the advantage for free:\n",
+    "  * At the last step, $A(s_t,a_t) = r(s_t, a_t) + \gamma \cdot V(s_{t+1}) - V(s_t) $\n",
+    "  * One step earlier, $A(s_t,a_t) = r(s_t, a_t) + \gamma \cdot r(s_{t+1}, a_{t+1}) + \gamma ^ 2 \cdot V(s_{t+2}) - V(s_t) $\n",
+    "  * Et cetera, et cetera. This way the agent starts training much faster, since its estimate of $A(s,a)$ depends less on its (imperfect) value function and more on actual rewards. There's also a [nice generalization](https://arxiv.org/abs/1506.02438) of this."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "You can also look into your lecture,\n",
     "[Mnih et al. 2016](https://arxiv.org/abs/1602.01783) paper, and [lecture](https://www.youtube.com/watch?v=Tol_jw5hWnI&list=PLkFD6_40KJIxJMR-j5A1mkxK26gh_qg37&index=20) by Sergey Levine."
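The markdown added above spells out the value-target formula and the three-term actor-critic objective. As a minimal illustration only (it is not part of the patch), the sketch below shows one way those formulas map to code. The `policy(obs)` interface returning a dict with `'logits'` and `'values'`, and the trajectory keys `'rewards'`, `'resets'` and `'state'['latest_observation']`, are assumed names in the spirit of the course's `EnvRunner`, not guaranteed APIs.

```python
import numpy as np
import tensorflow as tf


class ComputeValueTargets:
    """Sketch: add n-step value targets to a partial trajectory,
    v_hat(s_t) = sum_{t'=0}^{T-1-t} gamma^t' * r_{t+t'} + gamma^(T-t) * V(s_T),
    cutting the discounted sum whenever an episode ends inside the rollout."""

    def __init__(self, policy, gamma=0.99):
        self.policy = policy
        self.gamma = gamma

    def __call__(self, trajectory):
        # Bootstrap from the value of the first observation after the rollout.
        target = np.asarray(
            self.policy(trajectory['state']['latest_observation'])['values'])
        targets = []
        for rewards, resets in zip(reversed(trajectory['rewards']),
                                   reversed(trajectory['resets'])):
            # An episode reset stops the return from leaking across episodes.
            target = rewards + self.gamma * (1.0 - resets) * target
            targets.append(target)
        trajectory['value_targets'] = np.asarray(targets[::-1])


def a2c_loss(logits, values, actions, value_targets,
             value_loss_coef=0.5, entropy_coef=0.01):
    """Sketch of the linear combination  value_loss_coef * L_td - J_hat - entropy_coef * H."""
    value_targets = tf.cast(value_targets, values.dtype)
    log_probs = tf.nn.log_softmax(logits)
    probs = tf.nn.softmax(logits)
    log_pi_a = tf.reduce_sum(
        log_probs * tf.one_hot(actions, tf.shape(logits)[-1]), axis=-1)

    # The advantage is treated as a constant w.r.t. the policy parameters.
    advantages = tf.stop_gradient(value_targets - values)
    policy_objective = tf.reduce_mean(log_pi_a * advantages)               # J_hat
    value_loss = tf.reduce_mean(tf.square(value_targets - values))         # L_td
    entropy = -tf.reduce_mean(tf.reduce_sum(probs * log_probs, axis=-1))   # H

    return value_loss_coef * value_loss - policy_objective - entropy_coef * entropy
```

The coefficients are placeholders; A2C-style implementations commonly weight the value loss by about 0.5 and the entropy bonus by about 0.01.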
    ]
   },
@@ -288,9 +339,22 @@
   }
  ],
  "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
   "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
    "name": "python",
-   "pygments_lexer": "ipython3"
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.3"
   }
  },
  "nbformat": 4,
diff --git a/week06_policy_based/atari_wrappers.py b/week06_policy_based/atari_wrappers.py
index c3c45740b..4bc23f50a 100644
--- a/week06_policy_based/atari_wrappers.py
+++ b/week06_policy_based/atari_wrappers.py
@@ -213,12 +213,16 @@ def __init__(self, env, prefix=None, running_mean_size=100):
         self.episode_counter = 0
         self.prefix = prefix or self.env.spec.id
 
-        nenvs = getattr(self.env.unwrapped, "nenvs", 1)
-        self.rewards = np.zeros(nenvs)
-        self.had_ended_episodes = np.zeros(nenvs, dtype=np.bool)
-        self.episode_lengths = np.zeros(nenvs)
+        self.nenvs = getattr(self.env.unwrapped, "nenvs", 1)
+        self.rewards = np.zeros(self.nenvs)
+        self.had_ended_episodes = np.zeros(self.nenvs, dtype=np.bool)
+        self.episode_lengths = np.zeros(self.nenvs)
         self.reward_queues = [deque([], maxlen=running_mean_size)
-                              for _ in range(nenvs)]
+                              for _ in range(self.nenvs)]
+        self.global_step = 0
+
+    def add_summary_scalar(self, name, value):
+        raise NotImplementedError
 
     def should_write_summaries(self):
         """ Returns true if it's time to write summaries. """
@@ -260,6 +264,8 @@ def step(self, action):
                 self.reward_queues[i].append(self.rewards[i])
                 self.rewards[i] = 0
 
+        self.global_step += self.nenvs
+
         if self.should_write_summaries():
             self.add_summaries()
         return obs, rew, done, info
@@ -272,19 +278,22 @@ def reset(self, **kwargs):
 
 
 class TFSummaries(SummariesBase):
-    """ Writes env summaries using TensorFlow."""
+    """ Writes env summaries using TensorFlow.
+    To write summaries to a specific directory, define a writer
+    and set it as the default just before the training loop,
+    as in the example at
+    https://www.tensorflow.org/api_docs/python/tf/summary
+    Other summaries can be added in the A2C class or elsewhere.
+    """
 
-    def __init__(self, env, prefix=None, running_mean_size=100, step_var=None):
+    def __init__(self, env, prefix=None,
+                 running_mean_size=100, step_var=None):
         super().__init__(env, prefix, running_mean_size)
-        import tensorflow as tf
-        self.step_var = (step_var if step_var is not None
-                         else tf.train.get_global_step())
-
     def add_summary_scalar(self, name, value):
         import tensorflow as tf
-        tf.contrib.summary.scalar(name, value, step = self.step_var)
+        tf.summary.scalar(name, value, self.global_step)
 
 
 class NumpySummaries(SummariesBase):
@@ -304,7 +313,7 @@ def get_values(cls, name):
     def clear(cls):
         cls._summaries = defaultdict(list)
 
-    def __init__(self, env, prefix = None, running_mean_size = 100):
+    def __init__(self, env, prefix=None, running_mean_size=100):
         super().__init__(env, prefix, running_mean_size)
 
     def add_summary_scalar(self, name, value):
@@ -316,6 +325,7 @@ def nature_dqn_env(env_id, nenvs=None, seed=None,
     """ Wraps env as in Nature DQN paper. """
     if "NoFrameskip" not in env_id:
         raise ValueError(f"env_id must have 'NoFrameskip' but is {env_id}")
+
     if nenvs is not None:
         if seed is None:
             seed = list(range(nenvs))
@@ -327,20 +337,30 @@
         env = ParallelEnvBatch([
             lambda i=i, env_seed=env_seed: nature_dqn_env(
-                env_id, seed=env_seed, summaries=False, clip_reward=False)
+                env_id, seed=env_seed, summaries=None, clip_reward=False)
             for i, env_seed in enumerate(seed)
         ])
 
-        if summaries:
-            summaries_class = NumpySummaries if summaries == 'Numpy' else TFSummaries
-            env = summaries_class(env, prefix=env_id)
+        if summaries is not None:
+            if summaries == 'Numpy':
+                env = NumpySummaries(env, prefix=env_id)
+            elif summaries == 'TensorFlow':
+                env = TFSummaries(env, prefix=env_id)
+            else:
+                raise ValueError(
+                    f"Unknown `summaries` value: expected either 'Numpy' or 'TensorFlow', got {summaries}")
         if clip_reward:
            env = ClipReward(env)
        return env
 
     env = gym.make(env_id)
     env.seed(seed)
-    if summaries:
+    if summaries == 'Numpy':
+        env = NumpySummaries(env)
+    elif summaries == 'TensorFlow':
         env = TFSummaries(env)
+    elif summaries:
+        raise ValueError(f"summaries must be either Numpy, "
+                         f"or TensorFlow, or a falsy value, but is {summaries}")
     env = EpisodicLife(env)
     if "FIRE" in env.unwrapped.get_action_meanings():
         env = FireReset(env)