Merge pull request #1 from Rufaim/tensorflow_2_x
Tensorflow 2.x
Showing 15 changed files with 390 additions and 4 deletions.
7 files renamed without changes.
README.md
@@ -1,6 +1,26 @@
 # Pendulum-problem
-This is InvertedPendulum problem from classic Control theory solved with ActorCritic model from modern Reinforcement learning
+This is an [Inverted Pendulum](https://en.wikipedia.org/wiki/Inverted_pendulum) problem from classic Control theory solved with a [Deep Deterministic Policy Gradients](https://arxiv.org/pdf/1509.02971v2.pdf) model.
 
-Links:
-* [Original article](https://arxiv.org/pdf/1509.02971v2.pdf)
-* [InvertedPendulum problem](https://en.wikipedia.org/wiki/Inverted_pendulum "Inverted pendulum")
+The model was trained on [OpenAI's Pendulum-v0](https://gym.openai.com/envs/Pendulum-v0/).
+
+![Learning Process](tensorboard_screenshot/screenshot_2020-03-13.png)
+
+To change the model parameters, edit the global variables at the top of the train script.
+
+### Usage
+*Note*: commands are for Ubuntu 18.04
+1. Clone the repository \
+```git clone ```
+2. Go to the source root directory \
+```cd pendulum_problem```
+3. Run the train script \
+```python3 train_ddpg.py```
+
+Folder `(old)pendulum_problem1.x` contains an outdated implementation for tensorflow 1.8.0
+
+### Requirements:
+*Note*: tested on
+
+* tensorflow == 2.0
+* numpy == 1.18.1
+* gym == 0.17.1
ddpg.py
@@ -0,0 +1,100 @@
import tensorflow as tf
import numpy as np


class DeepDeterministicPolicyGradients(object):
    def __init__(self, actor_net, critic_net, action_exploration_func, replay_buffer,
                 action_size, state_size, discount_factor, batch_size):
        self.actor_net = actor_net
        self.critic_net = critic_net
        self.action_exploration_func = action_exploration_func
        self.replay_buffer = replay_buffer
        self.action_size = action_size
        self.state_size = state_size
        self.discount_factor = discount_factor
        self.batch_size = batch_size

        self._init_target_nets()

        self.actor_optimizer = tf.keras.optimizers.Adam(self.actor_net.learning_rate)
        self.critic_optimizer = tf.keras.optimizers.Adam(self.critic_net.learning_rate)

    def _init_target_nets(self):
        # train nets warmup: a zero-sized batch is enough to build the variables
        inp = tf.zeros((0, self.state_size))
        a = self.actor_net(inp)
        self.critic_net(inp, a)

        self.target_actor_net = self.actor_net.clone()
        self.target_critic_net = self.critic_net.clone()

        # target nets warmup
        a = self.target_actor_net(inp)
        self.target_critic_net(inp, a)

        # hard-copy the weights so each online/target pair starts identical
        for v2, v1 in zip(self.target_actor_net.trainable_variables, self.actor_net.trainable_variables):
            v2.assign(v1)

        for v2, v1 in zip(self.target_critic_net.trainable_variables, self.critic_net.trainable_variables):
            v2.assign(v1)

    def actor_predict(self, state):
        s = np.atleast_2d(state)
        return self.actor_net(s).numpy()

    def critic_predict(self, state):
        s = np.atleast_2d(state)
        a = self.actor_net(s)
        return self.critic_net(s, a).numpy()

    def get_action(self, state):
        return self.action_exploration_func(self.actor_predict(state))

    def add_to_buffer(self, S, A, R, T, S1):
        self.replay_buffer.add(S, A, R, T, S1)

    def update(self):
        if self.replay_buffer.size() >= self.batch_size:
            states, actions, rewards, terminates, next_states = self.replay_buffer.sample_batch(self.batch_size)

            target_actions = self.target_actor_net(next_states)
            target_q_vals = self.target_critic_net(next_states, target_actions).numpy()

            # Bellman targets: y_i = r + gamma * Q'(s', mu'(s')) for non-terminal transitions
            y_is = rewards.reshape((self.batch_size, 1))
            terminates = terminates.reshape((self.batch_size, 1))
            y_is[~terminates] += self.discount_factor * target_q_vals[~terminates]

            self._update_critic(states, actions, y_is)
            self._update_actor(states)

            self._update_target_networks()

    @tf.function
    def _update_critic(self, states, actions, target_qs):
        with tf.GradientTape() as tape:
            predicted_q_value = self.critic_net(states, actions)
            critic_loss = tf.reduce_sum((target_qs - predicted_q_value) ** 2)

        grads = tape.gradient(critic_loss, self.critic_net.trainable_variables)
        # grads = tf.clip_by_global_norm(grads, self.critic_net.grad_norm)
        self.critic_optimizer.apply_gradients(zip(grads, self.critic_net.trainable_variables))

    @tf.function
    def _update_actor(self, states):
        with tf.GradientTape() as tape:
            actions = self.actor_net(states)
            q_vals = self.critic_net(states, actions)
            # minus since we are ascending
            q_vals = -q_vals

        grads = tape.gradient(q_vals, self.actor_net.trainable_variables)
        # expectation from sum
        grads = [g / self.batch_size for g in grads]
        self.actor_optimizer.apply_gradients(zip(grads, self.actor_net.trainable_variables))

    @tf.function
    def _update_target_networks(self):
        # soft (Polyak) update: theta' <- tau * theta + (1 - tau) * theta'
        for v2, v1 in zip(self.target_actor_net.trainable_variables, self.actor_net.trainable_variables):
            v2.assign(self.actor_net.tau * v1 + (1 - self.actor_net.tau) * v2)

        for v2, v1 in zip(self.target_critic_net.trainable_variables, self.critic_net.trainable_variables):
            v2.assign(self.critic_net.tau * v1 + (1 - self.critic_net.tau) * v2)
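The `_update_target_networks` method above is the soft ("Polyak") update from the DDPG paper: each training step the target weights move a small fraction `tau` toward the online weights. A minimal numeric sketch of one such step (plain NumPy, illustrative values only, not part of the commit):

```python
import numpy as np

tau = 0.001                      # same value as TAU in train_ddpg.py
online = np.array([1.0, -2.0])   # stand-in for one online-network weight tensor
target = np.zeros(2)             # stand-in for the matching target weights

# one soft update: target <- tau * online + (1 - tau) * target
target = tau * online + (1 - tau) * target
print(target)  # [ 0.001 -0.002] -- the target net trails the online net slowly
```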
exploration.py
@@ -0,0 +1,37 @@
import numpy as np


class RandomActionExploration(object):
    def __init__(self, p_rand_action, action_size, action_bounds, seed=None):
        self.p_rand_action = p_rand_action
        self.action_size = action_size
        self.action_bounds = np.array(action_bounds)
        self._random_generator = np.random.RandomState(seed)

    def __call__(self, action):
        # with probability p_rand_action, replace the action with a uniform sample within bounds
        if self._random_generator.rand() < self.p_rand_action:
            a = self.action_bounds * (2 * self._random_generator.random(self.action_size) - 1)
            return a
        return action


# Based on https://github.com/openai/baselines/blob/master/baselines/ddpg/noise.py
class OrnsteinUhlenbeckActionNoise(object):
    def __init__(self, mu, sigma, theta=.15, dt=1e-2, x0=None, seed=None):
        self.theta = theta
        self.mu = mu
        self.sigma = sigma
        self.dt = dt
        self.x0 = x0
        self.seed = seed
        self.reset()

    def __call__(self, action):
        # Euler-Maruyama step of the Ornstein-Uhlenbeck process
        x = self.x_prev + self.theta * (self.mu - self.x_prev) * self.dt + self.sigma * np.sqrt(self.dt) * self._random_generator.normal(size=self.mu.shape)
        self.x_prev = x
        return action + x

    def reset(self):
        self._random_generator = np.random.RandomState(seed=self.seed)
        self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mu)

    def __str__(self):
        return 'OrnsteinUhlenbeckActionNoise(mu={}, sigma={})'.format(self.mu, self.sigma)
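A quick usage sketch for `OrnsteinUhlenbeckActionNoise` (a hypothetical standalone snippet, mirroring how `train_ddpg.py` wires it in; it assumes the class above is in scope):

```python
import numpy as np

noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(1), sigma=0.2, seed=0)
deterministic_action = np.array([0.5])

for step in range(3):
    # successive calls perturb the action with temporally correlated noise
    print(noise(deterministic_action))

noise.reset()  # restart (and re-seed) the process, e.g. between episodes
```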
neural_nets.py
@@ -0,0 +1,60 @@
import tensorflow as tf
from utils import clone_net_structure


class ActorNet(tf.keras.Model):
    """Actor network for the Actor-Critic model.
    It receives a state as input and produces an action vector.
    """
    def __init__(self, net_structure, action_bounds, tau, learning_rate):
        super(ActorNet, self).__init__()
        self.net_structure = net_structure
        self.action_bounds = tf.constant(action_bounds, shape=[1, len(action_bounds)], dtype=tf.float32)
        self.tau = tf.constant(tau, dtype=tf.float32)
        self.learning_rate = learning_rate

    @tf.function
    def call(self, input, training=None, mask=None):
        out = input
        for layer in self.net_structure:
            kwargs = {}
            if training is not None:
                kwargs['training'] = training
            if mask is not None:
                kwargs['mask'] = mask
            out = layer(out, **kwargs)
        # scale the final layer output into the environment's action range
        scaled_out = out * self.action_bounds
        return scaled_out

    def clone(self):
        structure = clone_net_structure(self.net_structure)
        return ActorNet(structure, self.action_bounds, self.tau, self.learning_rate)


class CriticNet(tf.keras.Model):
    """Critic network for the Actor-Critic model.
    It receives a state and an action as input and produces a Q-value.
    """
    def __init__(self, net_structure, tau, learning_rate, grad_norm):
        super(CriticNet, self).__init__()
        self.net_structure = net_structure
        self.tau = tf.constant(tau, dtype=tf.float32)
        self.learning_rate = learning_rate
        self.grad_norm = grad_norm

    @tf.function
    def call(self, input, action, training=None, mask=None):
        out = input
        for i, layer in enumerate(self.net_structure):
            if i == 1:
                # concatenate the action to the features before the second layer
                out = tf.concat([out, action], axis=-1)
            kwargs = {}
            if training is not None:
                kwargs['training'] = training
            if mask is not None:
                kwargs['mask'] = mask
            out = layer(out, **kwargs)
        return out

    def clone(self):
        structure = clone_net_structure(self.net_structure)
        return CriticNet(structure, self.tau, self.learning_rate, self.grad_norm)
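A minimal sketch of building and calling `ActorNet` on a dummy state (assumes TensorFlow 2.x; the layer sizes here are illustrative, not the ones used for training):

```python
import tensorflow as tf

layers = [tf.keras.layers.Dense(8, activation=tf.nn.relu),
          tf.keras.layers.Dense(1, activation=tf.nn.tanh)]
actor = ActorNet(layers, action_bounds=[2.0], tau=0.001, learning_rate=1e-4)

dummy_state = tf.zeros((1, 3))  # Pendulum-v0 observations have 3 components
action = actor(dummy_state)     # tanh output scaled into [-2, 2]
print(action.shape)             # (1, 1)
```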
replay_buffer.py
@@ -0,0 +1,75 @@
import numpy as np


class ReplayBuffer(object):
    """Classical replay buffer storing SARS tuples.
    The right side of the buffer holds the most recent experience.

    Parameters
    ----------
    buffer_size : int (default -1)
        Maximum buffer capacity. If equal to -1, the size is unlimited.
    seed : int (default None)
        Seed used to sample minibatches.
    """
    def __init__(self, buffer_size=-1, seed=None):
        self.buffer_size = buffer_size
        self._random_generator = np.random.RandomState(seed)
        self._s = []
        self._a = []
        self._r = []
        self._t = []
        self._s1 = []

    def __str__(self):
        if self.buffer_size > 0:
            return "ReplayBuffer size {} of {}".format(self.size(), self.buffer_size)
        else:
            return f"ReplayBuffer size {self.size()}"

    def size(self):
        """Returns the current buffer size.
        """
        return len(self._s)

    def add(self, s, a, r, t, s1):
        """Pushes a SARS-tuple to the buffer.
        """
        self._s.append(s)
        self._a.append(a)
        self._r.append(r)
        self._t.append(t)
        self._s1.append(s1)

        # evict the oldest tuple once capacity is exceeded
        if self.buffer_size > 0 and self.size() > self.buffer_size:
            self._s.pop(0)
            self._a.pop(0)
            self._r.pop(0)
            self._t.pop(0)
            self._s1.pop(0)

        return self

    def sample_batch(self, batch_size):
        """Returns a minibatch sampled from the buffer with replacement.
        """
        batch = min(self.size(), batch_size)
        idx = self._random_generator.randint(0, self.size(), (batch,))

        s_batch, a_batch, r_batch, t_batch, s1_batch = [], [], [], [], []
        for i in idx:
            s_batch.append(self._s[i])
            a_batch.append(self._a[i])
            r_batch.append(self._r[i])
            t_batch.append(self._t[i])
            s1_batch.append(self._s1[i])

        return np.array(s_batch, dtype=np.float32), np.array(a_batch, dtype=np.float32), np.array(r_batch, dtype=np.float32), \
               np.array(t_batch, dtype=np.bool), np.array(s1_batch, dtype=np.float32)

    def clear(self):
        self._s.clear()
        self._a.clear()
        self._r.clear()
        self._t.clear()
        self._s1.clear()
        return self
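A short round-trip sketch for `ReplayBuffer` (toy transitions, illustrative only; it assumes the class above is in scope):

```python
import numpy as np

buffer = ReplayBuffer(buffer_size=100, seed=0)
for step in range(5):
    s = np.ones(3) * step                           # toy 3-dimensional state
    buffer.add(s, np.zeros(1), float(step), False, s + 1)

print(buffer)                         # ReplayBuffer size 5 of 100
s, a, r, t, s1 = buffer.sample_batch(3)
print(s.shape, r.shape, t.dtype)      # (3, 3) (3,) bool
```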
train_ddpg.py
@@ -0,0 +1,76 @@
import datetime
import numpy as np
import tensorflow as tf
import gym
from ddpg import DeepDeterministicPolicyGradients
from replay_buffer import ReplayBuffer
from neural_nets import ActorNet, CriticNet
from exploration import OrnsteinUhlenbeckActionNoise

MINIBATCH_SIZE = 200
MAX_EPISODES = 1000
MAX_EP_STEPS = 1000

TAU = 0.001
ACTOR_LEARNING_RATE = 0.0001
CRITIC_LEARNING_RATE = 0.001
GRADIENT_MAX_NORM = 5
BUFFER_SIZE = 100000
DISCOUNT_FACTOR = 0.99
P_RAND_ACTION = 0.05
SEED = 42


kernel_init = tf.keras.initializers.glorot_normal(SEED)
environment = gym.make('Pendulum-v0')
state_size = environment.observation_space.shape[0]
action_size = environment.action_space.shape[0]
action_bound = environment.action_space.high[0]

CRITIC_NET_STRUCTURE = [tf.keras.layers.Dense(300, kernel_initializer=kernel_init),
                        tf.keras.layers.BatchNormalization(),
                        tf.keras.layers.ReLU(),
                        tf.keras.layers.Dense(400, kernel_initializer=kernel_init, activation=tf.nn.relu),
                        tf.keras.layers.Dense(1, kernel_initializer=kernel_init)
                        ]
ACTOR_NET_STRUCTURE = [tf.keras.layers.Dense(300, kernel_initializer=kernel_init),
                       tf.keras.layers.BatchNormalization(),
                       tf.keras.layers.ReLU(),
                       tf.keras.layers.Dense(400, kernel_initializer=kernel_init, activation=tf.nn.relu),
                       tf.keras.layers.Dense(action_size, kernel_initializer=kernel_init, activation=tf.nn.tanh)
                       ]


actor_net = ActorNet(ACTOR_NET_STRUCTURE, [action_bound], TAU, ACTOR_LEARNING_RATE)
critic_net = CriticNet(CRITIC_NET_STRUCTURE, TAU, CRITIC_LEARNING_RATE, GRADIENT_MAX_NORM)

action_noise = OrnsteinUhlenbeckActionNoise(np.zeros((action_size,)), 0.2)

replay_buffer = ReplayBuffer(BUFFER_SIZE, SEED)
model = DeepDeterministicPolicyGradients(actor_net, critic_net, action_noise,
                                         replay_buffer, action_size, state_size, DISCOUNT_FACTOR, MINIBATCH_SIZE)

logdir = "logs/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
file_writer = tf.summary.create_file_writer(logdir)
file_writer.set_as_default()

for i in range(MAX_EPISODES):
    state = environment.reset()
    ep_reward = 0

    for j in range(MAX_EP_STEPS):
        # apply exploration noise on top of the deterministic policy output
        a = model.get_action(state)
        a = a.reshape((-1,))
        next_state, r, t, _ = environment.step(a)
        model.add_to_buffer(np.squeeze(state), a, r, t, np.squeeze(next_state))

        model.update()

        state = next_state.copy()
        ep_reward += r
        if t:
            break
    tf.summary.scalar('Episode reward', data=ep_reward, step=i)
    file_writer.flush()
    print('Episode: {:d} | Reward: {:.2f}'.format(i, ep_reward))
environment.close()
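After training, a noise-free evaluation episode could look like the hypothetical snippet below (it reuses `model` from the script above and opens its own environment, since the script closes the training one):

```python
import gym

# hypothetical evaluation pass: greedy actions, no exploration noise
env = gym.make('Pendulum-v0')
state = env.reset()
total_reward, done = 0.0, False
while not done:
    action = model.actor_predict(state).reshape((-1,))  # deterministic policy
    state, reward, done, _ = env.step(action)
    total_reward += reward
print('Evaluation reward: {:.2f}'.format(total_reward))
env.close()
```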