
Commit

Merge pull request #1 from Rufaim/tensorflow_2_x
Tensorflow 2 x
Rufaim committed Mar 13, 2020
2 parents 344bfb0 + 273f56c commit 7bcbee7
Showing 15 changed files with 390 additions and 4 deletions.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
28 changes: 24 additions & 4 deletions README.md
@@ -1,6 +1,26 @@
# Pendulum-problem
This is InvertedPendulum problem from classic Control theory solved with ActorCritic model from modern Reinforcement learning
This is an [Inverted Pendulum](https://en.wikipedia.org/wiki/Inverted_pendulum) problem from classic control theory solved with the [Deep Deterministic Policy Gradients (DDPG)](https://arxiv.org/pdf/1509.02971v2.pdf) model.

Links:
* [Original article](https://arxiv.org/pdf/1509.02971v2.pdf)
* [InvertedPendulum problem](https://en.wikipedia.org/wiki/Inverted_pendulum "Inverted pendulum")
The model was trained on [OpenAI's Pendulum-v0](https://gym.openai.com/envs/Pendulum-v0/).

![Learning Process](tensorboard_screenshot/screenshot_2020-03-13.png)

To change the model's hyperparameters, edit the global variables at the top of the train script.
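
For reference, these hyperparameters are plain module-level constants in `train_ddpg.py`; a few of them (values as of this commit) are:

```python
MINIBATCH_SIZE = 200          # transitions sampled per update step
TAU = 0.001                   # soft target-update rate
ACTOR_LEARNING_RATE = 0.0001
CRITIC_LEARNING_RATE = 0.001
DISCOUNT_FACTOR = 0.99
BUFFER_SIZE = 100000          # replay buffer capacity
```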

### Usage
*Note*: the commands below are for Ubuntu 18.04.
1. Clone the repository \
```git clone ```
2. Go to source root directory \
```cd pendulum_problem```
3. Run train script \
```python3 train_ddpg.py```
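
Training writes per-episode reward summaries to a `logs/` directory via `tf.summary`; assuming TensorBoard is installed alongside TensorFlow 2, the learning curve shown above can be inspected with `tensorboard --logdir logs`.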

The folder `(old)pendulum_problem1.x` contains an outdated implementation for TensorFlow 1.8.0.

### Requirements:
*Note*: tested with

* tensorflow == 2.0
* numpy == 1.18.1
* gym == 0.17.1
100 changes: 100 additions & 0 deletions pendulum_problem/ddpg.py
@@ -0,0 +1,100 @@
import tensorflow as tf
import numpy as np

class DeepDeterministicPolicyGradients(object):
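"""Deep Deterministic Policy Gradients (DDPG) agent.

Wraps an online and a target actor/critic pair, an exploration policy and a
replay buffer, and implements the DDPG update step: critic regression to the
bootstrapped target, deterministic policy gradient for the actor, and soft
target-network updates.
"""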
def __init__(self, actor_net, critic_net, action_exploration_func, replay_buffer, \
action_size, state_size, discount_factor,batch_size):
self.actor_net = actor_net
self.critic_net = critic_net
self.action_exploration_func = action_exploration_func
self.replay_buffer = replay_buffer
self.action_size = action_size
self.state_size = state_size
self.discount_factor = discount_factor
self.batch_size = batch_size

self._init_target_nets()

self.actor_optimizer = tf.keras.optimizers.Adam(self.actor_net.learning_rate)
self.critic_optimizer = tf.keras.optimizers.Adam(self.critic_net.learning_rate)

def _init_target_nets(self):
# train nets warmup
inp = tf.zeros((0, self.state_size))
a = self.actor_net(inp)
self.critic_net(inp, a)

self.target_actor_net = self.actor_net.clone()
self.target_critic_net = self.critic_net.clone()

# target nets warmup
a = self.target_actor_net(inp)
self.target_critic_net(inp, a)

for v2, v1 in zip(self.target_actor_net.trainable_variables, self.actor_net.trainable_variables):
v2.assign(v1)

for v2, v1 in zip(self.target_critic_net.trainable_variables, self.critic_net.trainable_variables):
v2.assign(v1)

def actor_predict(self, state):
s = np.atleast_2d(state)
return self.actor_net(s).numpy()

def critic_predict(self, state):
s = np.atleast_2d(state)
a = self.actor_net(s)
return self.critic_net(s, a).numpy()

def get_action(self, state):
return self.action_exploration_func(self.actor_predict(state))

def add_to_buffer(self, S, A, R, T, S1):
self.replay_buffer.add(S, A, R, T, S1)

def update(self):
if self.replay_buffer.size() >= self.batch_size:
states, actions, rewards, terminates, next_states = self.replay_buffer.sample_batch(self.batch_size)

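# Bootstrapped DDPG target computed with the *target* networks:
# y_i = r_i + gamma * Q'(s_{i+1}, mu'(s_{i+1})) for non-terminal transitions, y_i = r_i otherwise.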
target_actions = self.target_actor_net(next_states)
target_q_vals = self.target_critic_net(next_states, target_actions).numpy()

y_is = rewards.reshape((self.batch_size, 1))
terminates = terminates.reshape((self.batch_size, 1))
y_is[~terminates] += self.discount_factor * target_q_vals[~terminates]

self._update_critic(states, actions, y_is)
self._update_actor(states)

self._update_target_networks()

@tf.function
def _update_critic(self, states, actions, target_qs):
with tf.GradientTape() as tape:
predicted_q_value = self.critic_net(states, actions)
critic_loss = tf.reduce_sum((target_qs - predicted_q_value) ** 2)

grads = tape.gradient(critic_loss, self.critic_net.trainable_variables)
#grads = tf.clip_by_global_norm(grads, self.critic_net.grad_norm)
self.critic_optimizer.apply_gradients(zip(grads, self.critic_net.trainable_variables))

@tf.function
def _update_actor(self, states):
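# Deterministic policy gradient: ascend the critic's Q(s, mu(s)) with respect to the actor parameters.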
with tf.GradientTape() as tape:
actions = self.actor_net(states)
q_vals = self.critic_net(states, actions)
# minus since we are ascending
q_vals = -q_vals

grads = tape.gradient(q_vals, self.actor_net.trainable_variables)
# expectation from sum
grads = [g / self.batch_size for g in grads]
self.actor_optimizer.apply_gradients(zip(grads, self.actor_net.trainable_variables))

@tf.function
def _update_target_networks(self):
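# Soft (Polyak) target update: theta_target <- tau * theta_online + (1 - tau) * theta_target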
for v2, v1 in zip(self.target_actor_net.trainable_variables, self.actor_net.trainable_variables):
v2.assign(self.actor_net.tau * v1 + (1 - self.actor_net.tau) * v2)

for v2, v1 in zip(self.target_critic_net.trainable_variables, self.critic_net.trainable_variables):
v2.assign(self.critic_net.tau * v1 + (1 - self.critic_net.tau) * v2)
37 changes: 37 additions & 0 deletions pendulum_problem/exploration.py
@@ -0,0 +1,37 @@
import numpy as np

class RandomActionExploration(object):
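"""With probability p_rand_action, replaces the proposed action with a uniformly
random action drawn from [-action_bounds, action_bounds); otherwise the action
is returned unchanged.
"""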
def __init__(self,p_rand_action, action_size, action_bounds, seed=None):
self.p_rand_action = p_rand_action
self.action_size = action_size
self.action_bounds = np.array(action_bounds)
self._random_generator = np.random.RandomState(seed)

def __call__(self,action):
if self._random_generator.rand() < self.p_rand_action:
a = self.action_bounds * (2*self._random_generator.random(self.action_size)-1)
return a
return action

# Based on https://github.com/openai/baselines/blob/master/baselines/ddpg/noise.py
class OrnsteinUhlenbeckActionNoise(object):
def __init__(self, mu, sigma, theta=.15, dt=1e-2, x0=None,seed=None):
self.theta = theta
self.mu = mu
self.sigma = sigma
self.dt = dt
self.x0 = x0
self.seed = seed
self.reset()

def __call__(self,action):
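# Discretized Ornstein-Uhlenbeck step: x_{t+1} = x_t + theta * (mu - x_t) * dt + sigma * sqrt(dt) * N(0, I).
# The noise sample is added to the proposed action.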
x = self.x_prev + self.theta * (self.mu - self.x_prev) * self.dt + self.sigma * np.sqrt(self.dt) * self._random_generator.normal(size=self.mu.shape)
self.x_prev = x
return action + x

def reset(self):
self._random_generator = np.random.RandomState(seed=self.seed)
self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mu)

def __str__(self):
return 'OrnsteinUhlenbeckActionNoise(mu={}, sigma={})'.format(self.mu, self.sigma)
60 changes: 60 additions & 0 deletions pendulum_problem/neural_nets.py
@@ -0,0 +1,60 @@
import tensorflow as tf
from utils import clone_net_structure

class ActorNet(tf.keras.Model):
"""Actor network for Actor-Critic model.
It receives state as input and produces action vector
"""
def __init__(self, net_structure, action_bounds, tau, learning_rate):
super(ActorNet, self).__init__()
self.net_structure = net_structure
self.action_bounds = tf.constant(action_bounds, shape=[1, len(action_bounds)], dtype=tf.float32)
self.tau = tf.constant(tau, dtype=tf.float32)
self.learning_rate = learning_rate

@tf.function
def call(self, input, training=None, mask=None):
out = input
for layer in self.net_structure:
kwargs = {}
if training is not None:
kwargs['training'] = training
if mask is not None:
kwargs['mask'] = mask
out = layer(out, **kwargs)
scaled_out = out * self.action_bounds
return scaled_out

def clone(self):
structure = clone_net_structure(self.net_structure)
return ActorNet(structure, self.action_bounds, self.tau, self.learning_rate)


class CriticNet(tf.keras.Model):
"""Critic network for Actor-Critic model.
It receives state and action as input and produces Q-value
"""
def __init__(self, net_structure, tau, learning_rate, grad_norm):
super(CriticNet, self).__init__()
self.net_structure = net_structure
self.tau = tf.constant(tau, dtype=tf.float32)
self.learning_rate = learning_rate
self.grad_norm = grad_norm

@tf.function
def call(self, input, action, training=None, mask=None):
out = input
for i, layer in enumerate(self.net_structure):
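# The action vector is concatenated to the first layer's output features (i.e. injected before layer index 1).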
if i == 1:
out = tf.concat([out, action], axis=-1)
kwargs = {}
if training is not None:
kwargs['training'] = training
if mask is not None:
kwargs['mask'] = mask
out = layer(out, **kwargs)
return out

def clone(self):
structure = clone_net_structure(self.net_structure)
return CriticNet(structure, self.tau, self.learning_rate, self.grad_norm)
75 changes: 75 additions & 0 deletions pendulum_problem/replay_buffer.py
@@ -0,0 +1,75 @@
import numpy as np

class ReplayBuffer(object):
"""Classical Replay Buffer storing SARS tuples.
The right side of the buffer contains recent experiments
Parameters:
----------
buffer_size : int (default -1)
Maximum buffer capacity. If equal to -1, the size is unlimited.
seed : int (default None)
Seed to sample minibatches.
"""
def __init__(self, buffer_size=-1, seed=None):
self.buffer_size = buffer_size
self._random_generator = np.random.RandomState(seed)
self._s = []
self._a = []
self._r = []
self._t = []
self._s1 = []

def __str__(self):
if self.buffer_size > 0:
return "ReplayBuffer size {} of {}".format(self.size(),self.buffer_size)
else:
return f"ReplayBuffer size {self.size()}"

def size(self):
"""Returns current buffer size
"""
return len(self._s)

def add(self, s, a, r, t, s1):
"""Pushes a SARS-tuple to buffer
"""
self._s.append(s)
self._a.append(a)
self._r.append(r)
self._t.append(t)
self._s1.append(s1)

if self.buffer_size > 0 and self.size() > self.buffer_size:
self._s.pop(0)
self._a.pop(0)
self._r.pop(0)
self._t.pop(0)
self._s1.pop(0)

return self

def sample_batch(self, batch_size):
"""Returns minibatch sampled from the buffer with replacments.
"""
batch = min(self.size(),batch_size)
idx = self._random_generator.randint(0,self.size(),(batch,))

s_batch,a_batch,r_batch,t_batch,s1_batch = [],[],[],[],[]
for i in idx:
s_batch.append(self._s[i])
a_batch.append(self._a[i])
r_batch.append(self._r[i])
t_batch.append(self._t[i])
s1_batch.append(self._s1[i])

return np.array(s_batch,dtype=np.float32), np.array(a_batch,dtype=np.float32), np.array(r_batch,dtype=np.float32),\
np.array(t_batch,dtype=np.bool), np.array(s1_batch,dtype=np.float32)

def clear(self):
self._s.clear()
self._a.clear()
self._r.clear()
self._t.clear()
self._s1.clear()
return self
76 changes: 76 additions & 0 deletions pendulum_problem/train_ddpg.py
@@ -0,0 +1,76 @@
import datetime
import numpy as np
import tensorflow as tf
import gym
from ddpg import DeepDeterministicPolicyGradients
from replay_buffer import ReplayBuffer
from neural_nets import ActorNet, CriticNet
from exploration import OrnsteinUhlenbeckActionNoise

MINIBATCH_SIZE = 200
MAX_EPISODES = 1000
MAX_EP_STEPS = 1000

TAU = 0.001
ACTOR_LEARNING_RATE = 0.0001
CRITIC_LEARNING_RATE = 0.001
GRADIENT_MAX_NORM = 5
BUFFER_SIZE = 100000
DISCOUNT_FACTOR = 0.99
P_RAND_ACTION = 0.05
SEED = 42


kernel_init = tf.keras.initializers.glorot_normal(SEED)
environment = gym.make('Pendulum-v0')
state_size = environment.observation_space.shape[0]
action_size = environment.action_space.shape[0]
action_bound = environment.action_space.high[0]
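# Pendulum-v0: 3-dimensional observation (cos(theta), sin(theta), theta_dot) and a single torque action bounded to [-2.0, 2.0].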

CRITIC_NET_STRUCTURE = [tf.keras.layers.Dense(300,kernel_initializer=kernel_init),
tf.keras.layers.BatchNormalization(),
tf.keras.layers.ReLU(),
tf.keras.layers.Dense(400,kernel_initializer=kernel_init,activation=tf.nn.relu),
tf.keras.layers.Dense(1,kernel_initializer=kernel_init)
]
ACTOR_NET_STRUCTURE = [tf.keras.layers.Dense(300,kernel_initializer=kernel_init),
tf.keras.layers.BatchNormalization(),
tf.keras.layers.ReLU(),
tf.keras.layers.Dense(400,kernel_initializer=kernel_init,activation=tf.nn.relu),
tf.keras.layers.Dense(action_size,kernel_initializer=kernel_init,activation=tf.nn.tanh)
]


actor_net = ActorNet(ACTOR_NET_STRUCTURE,[action_bound],TAU,ACTOR_LEARNING_RATE)
critic_net = CriticNet(CRITIC_NET_STRUCTURE,TAU,CRITIC_LEARNING_RATE,GRADIENT_MAX_NORM)

action_noise = OrnsteinUhlenbeckActionNoise(np.zeros((action_size,)),0.2)

replay_buffer = ReplayBuffer(BUFFER_SIZE,SEED)
model = DeepDeterministicPolicyGradients(actor_net,critic_net,action_noise,\
replay_buffer,action_size,state_size,DISCOUNT_FACTOR,MINIBATCH_SIZE)

logdir = "logs/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
file_writer = tf.summary.create_file_writer(logdir)
file_writer.set_as_default()

for i in range(MAX_EPISODES):
state = environment.reset()
ep_reward = 0

for j in range(MAX_EP_STEPS):
a = model.get_action(state)  # apply the exploration noise on top of the deterministic policy
a = a.reshape((-1,))
next_state, r, t, _ = environment.step(a)
model.add_to_buffer(np.squeeze(state), a, r, t, np.squeeze(next_state))

model.update()

state = next_state.copy()
ep_reward += r
if t:
break
tf.summary.scalar('Episode reward', data=ep_reward, step=i)
file_writer.flush()
print('Episode: {:d} | Reward: {:.2f}'.format(i, ep_reward))
environment.close()
