
Commit

Merge pull request #1 from Rufaim/tensorflow_2_x
Tensorflow 2 x
Rufaim committed Mar 13, 2020
2 parents 344bfb0 + 273f56c commit 7bcbee7
Showing 15 changed files with 390 additions and 4 deletions.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
28 changes: 24 additions & 4 deletions README.md
@@ -1,6 +1,26 @@
# Pendulum-problem
This is InvertedPendulum problem from classic Control theory solved with ActorCritic model from modern Reinforcement learning
This is an [Inverted Pendulum](https://en.wikipedia.org/wiki/Inverted_pendulum) problem from classic control theory solved with the [Deep Deterministic Policy Gradients (DDPG)](https://arxiv.org/pdf/1509.02971v2.pdf) model.

Links:
* [Original article](https://arxiv.org/pdf/1509.02971v2.pdf)
* [InvertedPendulum problem](https://en.wikipedia.org/wiki/Inverted_pendulum "Inverted pendulum")
The model was trained on [OpenAI's Pendulum-v0](https://gym.openai.com/envs/Pendulum-v0/).

![Learning Process](tensorboard_screenshot/screenshot_2020-03-13.png)

To change the model's hyperparameters, edit the global variables at the top of the train script.
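
For reference, these hyperparameters are plain module-level constants in `train_ddpg.py`; a few of them (values as of this commit) are:

```python
MINIBATCH_SIZE = 200          # transitions sampled per update step
TAU = 0.001                   # soft target-update rate
ACTOR_LEARNING_RATE = 0.0001
CRITIC_LEARNING_RATE = 0.001
DISCOUNT_FACTOR = 0.99
BUFFER_SIZE = 100000          # replay buffer capacity
```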

### Usage
*Note*: the commands below are for Ubuntu 18.04.
1. Clone the repository \
```git clone ```
2. Go to source root directory \
```cd pendulum_problem```
3. Run train script \
```python3 train_ddpg.py```
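
Training writes per-episode reward summaries to a `logs/` directory via `tf.summary`; assuming TensorBoard is installed alongside TensorFlow 2, the learning curve shown above can be inspected with `tensorboard --logdir logs`.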

The folder `(old)pendulum_problem1.x` contains an outdated implementation for TensorFlow 1.8.0.

### Requirements:
*Note*: tested with

* tensorflow == 2.0
* numpy == 1.18.1
* gym == 0.17.1
100 changes: 100 additions & 0 deletions pendulum_problem/ddpg.py
@@ -0,0 +1,100 @@
import tensorflow as tf
import numpy as np

class DeepDeterministicPolicyGradients(object):
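"""Deep Deterministic Policy Gradients (DDPG) agent.

Wraps an online and a target actor/critic pair, an exploration policy and a
replay buffer, and implements the DDPG update step: critic regression to the
bootstrapped target, deterministic policy gradient for the actor, and soft
target-network updates.
"""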
def __init__(self, actor_net, critic_net, action_exploration_func, replay_buffer, \
action_size, state_size, discount_factor,batch_size):
self.actor_net = actor_net
self.critic_net = critic_net
self.action_exploration_func = action_exploration_func
self.replay_buffer = replay_buffer
self.action_size = action_size
self.state_size = state_size
self.discount_factor = discount_factor
self.batch_size = batch_size

self._init_target_nets()

self.actor_optimizer = tf.keras.optimizers.Adam(self.actor_net.learning_rate)
self.critic_optimizer = tf.keras.optimizers.Adam(self.critic_net.learning_rate)

def _init_target_nets(self):
# train nets warmup
inp = tf.zeros((0, self.state_size))
a = self.actor_net(inp)
self.critic_net(inp, a)

self.target_actor_net = self.actor_net.clone()
self.target_critic_net = self.critic_net.clone()

# target nets warmup
a = self.target_actor_net(inp)
self.target_critic_net(inp, a)

for v2, v1 in zip(self.target_actor_net.trainable_variables, self.actor_net.trainable_variables):
v2.assign(v1)

for v2, v1 in zip(self.target_critic_net.trainable_variables, self.critic_net.trainable_variables):
v2.assign(v1)

def actor_predict(self, state):
s = np.atleast_2d(state)
return self.actor_net(s).numpy()

def critic_predict(self, state):
s = np.atleast_2d(state)
a = self.actor_net(s)
return self.critic_net(s, a).numpy()

def get_action(self, state):
return self.action_exploration_func(self.actor_predict(state))

def add_to_buffer(self, S, A, R, T, S1):
self.replay_buffer.add(S, A, R, T, S1)

def update(self):
if self.replay_buffer.size() >= self.batch_size:
states, actions, rewards, terminates, next_states = self.replay_buffer.sample_batch(self.batch_size)

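# Bootstrapped DDPG target computed with the *target* networks:
# y_i = r_i + gamma * Q'(s_{i+1}, mu'(s_{i+1})) for non-terminal transitions, y_i = r_i otherwise.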
target_actions = self.target_actor_net(next_states)
target_q_vals = self.target_critic_net(next_states, target_actions).numpy()

y_is = rewards.reshape((self.batch_size, 1))
terminates = terminates.reshape((self.batch_size, 1))
y_is[~terminates] += self.discount_factor * target_q_vals[~terminates]

self._update_critic(states, actions, y_is)
self._update_actor(states)

self._update_target_networks()

@tf.function
def _update_critic(self, states, actions, target_qs):
with tf.GradientTape() as tape:
predicted_q_value = self.critic_net(states, actions)
critic_loss = tf.reduce_sum((target_qs - predicted_q_value) ** 2)

grads = tape.gradient(critic_loss, self.critic_net.trainable_variables)
#grads = tf.clip_by_global_norm(grads, self.critic_net.grad_norm)
self.critic_optimizer.apply_gradients(zip(grads, self.critic_net.trainable_variables))

@tf.function
def _update_actor(self, states):
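# Deterministic policy gradient: ascend the critic's Q(s, mu(s)) with respect to the actor parameters.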
with tf.GradientTape() as tape:
actions = self.actor_net(states)
q_vals = self.critic_net(states, actions)
# minus since we are ascending
q_vals = -q_vals

grads = tape.gradient(q_vals, self.actor_net.trainable_variables)
# expectation from sum
grads = [g / self.batch_size for g in grads]
self.actor_optimizer.apply_gradients(zip(grads, self.actor_net.trainable_variables))

@tf.function
def _update_target_networks(self):
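# Soft (Polyak) target update: theta_target <- tau * theta_online + (1 - tau) * theta_target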
for v2, v1 in zip(self.target_actor_net.trainable_variables, self.actor_net.trainable_variables):
v2.assign(self.actor_net.tau * v1 + (1 - self.actor_net.tau) * v2)

for v2, v1 in zip(self.target_critic_net.trainable_variables, self.critic_net.trainable_variables):
v2.assign(self.critic_net.tau * v1 + (1 - self.critic_net.tau) * v2)
37 changes: 37 additions & 0 deletions pendulum_problem/exploration.py
@@ -0,0 +1,37 @@
import numpy as np

class RandomActionExploration(object):
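"""With probability p_rand_action, replaces the proposed action with a uniformly
random action drawn from [-action_bounds, action_bounds); otherwise the action
is returned unchanged.
"""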
def __init__(self,p_rand_action, action_size, action_bounds, seed=None):
self.p_rand_action = p_rand_action
self.action_size = action_size
self.action_bounds = np.array(action_bounds)
self._random_generator = np.random.RandomState(seed)

def __call__(self,action):
if self._random_generator.rand() < self.p_rand_action:
a = self.action_bounds * (2*self._random_generator.random(self.action_size)-1)
return a
return action

# Based on https://github.com/openai/baselines/blob/master/baselines/ddpg/noise.py
class OrnsteinUhlenbeckActionNoise(object):
def __init__(self, mu, sigma, theta=.15, dt=1e-2, x0=None,seed=None):
self.theta = theta
self.mu = mu
self.sigma = sigma
self.dt = dt
self.x0 = x0
self.seed = seed
self.reset()

def __call__(self,action):
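# Discretized Ornstein-Uhlenbeck step: x_{t+1} = x_t + theta * (mu - x_t) * dt + sigma * sqrt(dt) * N(0, I).
# The noise sample is added to the proposed action.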
x = self.x_prev + self.theta * (self.mu - self.x_prev) * self.dt + self.sigma * np.sqrt(self.dt) * self._random_generator.normal(size=self.mu.shape)
self.x_prev = x
return action + x

def reset(self):
self._random_generator = np.random.RandomState(seed=self.seed)
self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mu)

def __str__(self):
return 'OrnsteinUhlenbeckActionNoise(mu={}, sigma={})'.format(self.mu, self.sigma)
60 changes: 60 additions & 0 deletions pendulum_problem/neural_nets.py
@@ -0,0 +1,60 @@
import tensorflow as tf
from utils import clone_net_structure

class ActorNet(tf.keras.Model):
"""Actor network for Actor-Critic model.
It receives state as input and produces action vector
"""
def __init__(self, net_structure, action_bounds, tau, learning_rate):
super(ActorNet, self).__init__()
self.net_structure = net_structure
self.action_bounds = tf.constant(action_bounds, shape=[1, len(action_bounds)], dtype=tf.float32)
self.tau = tf.constant(tau, dtype=tf.float32)
self.learning_rate = learning_rate

@tf.function
def call(self, input, training=None, mask=None):
out = input
for layer in self.net_structure:
kwargs = {}
if training is not None:
kwargs['training'] = training
if mask is not None:
kwargs['mask'] = mask
out = layer(out, **kwargs)
scaled_out = out * self.action_bounds
return scaled_out

def clone(self):
structure = clone_net_structure(self.net_structure)
return ActorNet(structure, self.action_bounds, self.tau, self.learning_rate)


class CriticNet(tf.keras.Model):
"""Critic network for Actor-Critic model.
It receives state and action as input and produces Q-value
"""
def __init__(self, net_structure, tau, learning_rate, grad_norm):
super(CriticNet, self).__init__()
self.net_structure = net_structure
self.tau = tf.constant(tau, dtype=tf.float32)
self.learning_rate = learning_rate
self.grad_norm = grad_norm

@tf.function
def call(self, input, action, training=None, mask=None):
out = input
for i, layer in enumerate(self.net_structure):
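# The action vector is concatenated to the first layer's output features (i.e. injected before layer index 1).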
if i == 1:
out = tf.concat([out, action], axis=-1)
kwargs = {}
if training is not None:
kwargs['training'] = training
if mask is not None:
kwargs['mask'] = mask
out = layer(out, **kwargs)
return out

def clone(self):
structure = clone_net_structure(self.net_structure)
return CriticNet(structure, self.tau, self.learning_rate, self.grad_norm)
75 changes: 75 additions & 0 deletions pendulum_problem/replay_buffer.py
@@ -0,0 +1,75 @@
import numpy as np

class ReplayBuffer(object):
"""Classical Replay Buffer storing SARS tuples.
The right side of the buffer contains recent experiments
Parameters:
----------
buffer_size : int (default -1)
Maximum buffer capacity. If equal to -1, the size is unlimited.
seed : int (default None)
Seed to sample minibatches.
"""
def __init__(self, buffer_size=-1, seed=None):
self.buffer_size = buffer_size
self._random_generator = np.random.RandomState(seed)
self._s = []
self._a = []
self._r = []
self._t = []
self._s1 = []

def __str__(self):
if self.buffer_size > 0:
return "ReplayBuffer size {} of {}".format(self.size(),self.buffer_size)
else:
return f"ReplayBuffer size {self.size()}"

def size(self):
"""Returns current buffer size
"""
return len(self._s)

def add(self, s, a, r, t, s1):
"""Pushes a SARS-tuple to buffer
"""
self._s.append(s)
self._a.append(a)
self._r.append(r)
self._t.append(t)
self._s1.append(s1)

if self.buffer_size > 0 and self.size() > self.buffer_size:
self._s.pop(0)
self._a.pop(0)
self._r.pop(0)
self._t.pop(0)
self._s1.pop(0)

return self

def sample_batch(self, batch_size):
"""Returns minibatch sampled from the buffer with replacments.
"""
batch = min(self.size(),batch_size)
idx = self._random_generator.randint(0,self.size(),(batch,))

s_batch,a_batch,r_batch,t_batch,s1_batch = [],[],[],[],[]
for i in idx:
s_batch.append(self._s[i])
a_batch.append(self._a[i])
r_batch.append(self._r[i])
t_batch.append(self._t[i])
s1_batch.append(self._s1[i])

return np.array(s_batch,dtype=np.float32), np.array(a_batch,dtype=np.float32), np.array(r_batch,dtype=np.float32),\
np.array(t_batch,dtype=np.bool), np.array(s1_batch,dtype=np.float32)

def clear(self):
self._s.clear()
self._a.clear()
self._r.clear()
self._t.clear()
self._s1.clear()
return self
76 changes: 76 additions & 0 deletions pendulum_problem/train_ddpg.py
@@ -0,0 +1,76 @@
import datetime
import numpy as np
import tensorflow as tf
import gym
from ddpg import DeepDeterministicPolicyGradients
from replay_buffer import ReplayBuffer
from neural_nets import ActorNet, CriticNet
from exploration import OrnsteinUhlenbeckActionNoise

MINIBATCH_SIZE = 200
MAX_EPISODES = 1000
MAX_EP_STEPS = 1000

TAU = 0.001
ACTOR_LEARNING_RATE = 0.0001
CRITIC_LEARNING_RATE = 0.001
GRADIENT_MAX_NORM = 5
BUFFER_SIZE = 100000
DISCOUNT_FACTOR = 0.99
P_RAND_ACTION = 0.05
SEED = 42


kernel_init = tf.keras.initializers.glorot_normal(SEED)
environment = gym.make('Pendulum-v0')
state_size = environment.observation_space.shape[0]
action_size = environment.action_space.shape[0]
action_bound = environment.action_space.high[0]
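# Pendulum-v0: 3-dimensional observation (cos(theta), sin(theta), theta_dot) and a single torque action bounded to [-2.0, 2.0].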

CRITIC_NET_STRUCTURE = [tf.keras.layers.Dense(300,kernel_initializer=kernel_init),
tf.keras.layers.BatchNormalization(),
tf.keras.layers.ReLU(),
tf.keras.layers.Dense(400,kernel_initializer=kernel_init,activation=tf.nn.relu),
tf.keras.layers.Dense(1,kernel_initializer=kernel_init)
]
ACTOR_NET_STRUCTURE = [tf.keras.layers.Dense(300,kernel_initializer=kernel_init),
tf.keras.layers.BatchNormalization(),
tf.keras.layers.ReLU(),
tf.keras.layers.Dense(400,kernel_initializer=kernel_init,activation=tf.nn.relu),
tf.keras.layers.Dense(action_size,kernel_initializer=kernel_init,activation=tf.nn.tanh)
]


actor_net = ActorNet(ACTOR_NET_STRUCTURE,[action_bound],TAU,ACTOR_LEARNING_RATE)
critic_net = CriticNet(CRITIC_NET_STRUCTURE,TAU,CRITIC_LEARNING_RATE,GRADIENT_MAX_NORM)

action_noise = OrnsteinUhlenbeckActionNoise(np.zeros((action_size,)),0.2)

replay_buffer = ReplayBuffer(BUFFER_SIZE,SEED)
model = DeepDeterministicPolicyGradients(actor_net,critic_net,action_noise,\
replay_buffer,action_size,state_size,DISCOUNT_FACTOR,MINIBATCH_SIZE)

logdir = "logs/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
file_writer = tf.summary.create_file_writer(logdir)
file_writer.set_as_default()

for i in range(MAX_EPISODES):
state = environment.reset()
ep_reward = 0

for j in range(MAX_EP_STEPS):
a = model.get_action(state)  # apply the exploration noise on top of the deterministic policy
a = a.reshape((-1,))
next_state, r, t, _ = environment.step(a)
model.add_to_buffer(np.squeeze(state), a, r, t, np.squeeze(next_state))

model.update()

state = next_state.copy()
ep_reward += r
if t:
break
tf.summary.scalar('Episode reward', data=ep_reward, step=i)
file_writer.flush()
print('Episode: {:d} | Reward: {:.2f}'.format(i, ep_reward))
environment.close()
