
Commit 80fddff

Add models.
Add some comments.
1 parent 8f3e621 commit 80fddff

File tree

5 files changed (+27, -22 lines)

actor_network.h5 (20 KB, binary file not shown)

agents.py

14 additions & 12 deletions

@@ -4,26 +4,26 @@
 
 class DQNAgent:
 
-    def __init__(self, hidden_layers, state_spec, action_spec, buffer, learning_rate, is_prioritized, is_double):
+    def __init__(self, hidden_layers, state_spec, action_spec, buffer, learning_rate, is_prioritized):
         self.buffer = buffer
-        self.training_network = tf.keras.models.Sequential()
-        self.training_network.add(tf.keras.layers.InputLayer(input_shape=(state_spec,)))
+        self.network = tf.keras.models.Sequential()
+        self.network.add(tf.keras.layers.InputLayer(input_shape=(state_spec,)))
 
         for hidden_layer in hidden_layers:
-            self.training_network.add(tf.keras.layers.Dense(hidden_layer, activation='relu'))
+            self.network.add(tf.keras.layers.Dense(hidden_layer, activation='relu'))
 
-        self.training_network.add(tf.keras.layers.Dense(action_spec, activation='linear'))
+        self.network.add(tf.keras.layers.Dense(action_spec, activation='linear'))
 
         self.optimizer = tf.keras.optimizers.Adam(learning_rate)
         self.is_prioritized = is_prioritized
-        self.is_double = is_double
         self.loss = tf.keras.losses.mean_squared_error
 
+    # Epsilon-greedy policy
     def play_action(self, state, epsilon):
         if np.random.random() < epsilon:
-            return np.random.choice(self.training_network.output_shape[1])
+            return np.random.choice(self.network.output_shape[1])
         else:
-            return np.argmax(self.training_network.predict(np.atleast_2d(state))[0])
+            return np.argmax(self.network.predict(np.atleast_2d(state))[0])
 
     def train(self, gamma, batch_size):
         if self.is_prioritized:
@@ -32,12 +32,12 @@ def train(self, gamma, batch_size):
         else:
             states, actions, rewards, states_next, dones = self.buffer.sample(batch_size)
 
-        target = rewards + (1 - dones) * gamma * np.max(self.training_network.predict(states_next), axis=1)
+        target = rewards + (1 - dones) * gamma * np.max(self.network.predict(states_next), axis=1)
 
         # Custom training loop taken from the teaching material on the course website
-        mask = tf.one_hot(actions, self.training_network.output_shape[1])
+        mask = tf.one_hot(actions, self.network.output_shape[1])
         with tf.GradientTape() as tape:
-            q_values = self.training_network(states)
+            q_values = self.network(states)
             predicted = tf.reduce_sum(q_values * mask, axis=1)
 
         if self.is_prioritized:
@@ -48,7 +48,7 @@ def train(self, gamma, batch_size):
         if self.is_prioritized:
             self.buffer.update_priority(ids, (target - predicted))
 
-        variables = self.training_network.trainable_variables
+        variables = self.network.trainable_variables
         gradients = tape.gradient(loss, variables)
         self.optimizer.apply_gradients(zip(gradients, variables))
 
@@ -78,13 +78,15 @@ def __init__(self, hidden_layers_actor, hidden_layers_critic, state_spec, action
         self.optimizer_critic = tf.keras.optimizers.Adam(learning_rate_critic)
         self.loss = tf.keras.losses.mean_squared_error
 
+    # Play an action by following the policy (the output of the actor network)
     def play_action(self, state):
        probabilities = self.actor_network(np.atleast_2d(state))
        selection_probabilities = probabilities[0] / np.sum(probabilities[0])
        action = np.random.choice(self.actor_network.output_shape[1], p=selection_probabilities)
        return action
 
     def play_and_train(self, state, env, gamma):
+        # persistent=True is needed because tape.gradient is called twice: once for the critic and once for the actor
         with tf.GradientTape(persistent=True) as tape:
             probabilities = self.actor_network(np.atleast_2d(state))
             # I need to normalize the probabilities because numpy wants the sum to be 1 and the softmax gives me
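The persistent=True comment added to play_and_train is worth unpacking: a single tape has to serve two backward passes, one for the critic loss and one for the actor loss. Below is a minimal, self-contained sketch of that pattern, not this repository's code; the network shapes, the target value, and the sampled action index are made up purely for illustration.

import numpy as np
import tensorflow as tf

# Hypothetical shapes: 4-dimensional state, 2 actions, scalar value head.
actor = tf.keras.Sequential([tf.keras.layers.Dense(2, activation='softmax', input_shape=(4,))])
critic = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(4,))])

state = np.random.rand(1, 4).astype(np.float32)
target = tf.constant([[1.0]])  # stand-in for the bootstrapped return r + gamma * V(s')

with tf.GradientTape(persistent=True) as tape:
    value = critic(state)
    advantage = tf.stop_gradient(target - value)             # advantage estimate, no gradient through it
    critic_loss = tf.reduce_mean(tf.square(target - value))  # value regression loss
    probabilities = actor(state)
    log_prob = tf.math.log(probabilities[0, 0] + 1e-8)       # log-probability of a (here fixed) sampled action
    actor_loss = -log_prob * advantage                        # policy-gradient loss

# Two calls to tape.gradient on the same tape: only legal with persistent=True.
critic_grads = tape.gradient(critic_loss, critic.trainable_variables)
actor_grads = tape.gradient(actor_loss, actor.trainable_variables)
del tape  # free the persistent tape's resources once both gradients are taken

A non-persistent tape releases its recorded operations after the first tape.gradient call, so the second call would raise a RuntimeError; that is the whole reason for the flag.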

dqn_no_priority_network.h5 (17 KB, binary file not shown)

dqn_with_priority_network.h5 (17 KB, binary file not shown)

my_main.py

13 additions & 10 deletions

@@ -24,11 +24,12 @@
 }
 
 
+# Training method for Actor-Critic
 def start_training_ac():
     env = gym.make(hyperparams['environment'])
     state_spec = len(env.observation_space.sample())
     action_spec = env.action_space.n
-    log_name = 'final'
+    log_name = 'final_build'
     log_dir = 'logs/acrobotAC/' + log_name
     log_writer = tf.summary.create_file_writer(log_dir)
 
@@ -49,21 +50,25 @@ def start_training_ac():
         total_rewards[episode] = episode_reward
         avg_rewards = total_rewards[max(0, episode - 20):(episode + 1)].mean()
         env.reset()
+
         with log_writer.as_default():
             tf.summary.scalar('episode reward', episode_reward, step=episode)
             tf.summary.scalar('avg for 20 episodes', avg_rewards, step=episode)
 
+    agent.actor_network.save_weights('actor_network.h5')
+
 
+# Training method for DQN
 def start_training_dqn(is_prioritized):
     if is_prioritized:
-        prio = "Prio"
+        prio = "with_priority"
     else:
-        prio = ""
+        prio = "no_priority"
 
     env = gym.make(hyperparams['environment'])
     state_spec = len(env.observation_space.sample())
     action_spec = env.action_space.n
-    log_name = 'final' + prio
+    log_name = 'final_build' + prio
     log_dir = 'logs/acrobot/' + log_name
 
     log_writer = tf.summary.create_file_writer(log_dir)
@@ -95,15 +100,12 @@ def start_training_dqn(is_prioritized):
 
         total_rewards[episode] = episode_reward
         avg_rewards = total_rewards[max(0, episode - 20):(episode + 1)].mean()
-
         env.reset()
-        if episode % 100 == 0:
-            print(avg_rewards)
 
         with log_writer.as_default():
             tf.summary.scalar('episode reward', episode_reward, step=episode)
             tf.summary.scalar('avg for 20 episodes', avg_rewards, step=episode)
-
+    agent.network.save_weights('dqn_{}_network.h5'.format(prio))
     env.close()
 
 
@@ -119,10 +121,10 @@ def test_model(model, is_ac):
         agent.actor_network.load_weights(model)
 
     else:
-        agent = DQNAgent(hyperparams['network'], state_spec, action_spec, buffer, hyperparams['learning_rate'],
+        agent = DQNAgent(hyperparams['hidden_layer_dqn'], state_spec, action_spec, buffer, hyperparams['learning_rate_dqn'],
                          is_prioritized)
 
-        agent.training_network.load_weights(model)
+        agent.network.load_weights(model)
     obs = env.reset()
     env.render()
     # Play 20 episodes
@@ -157,6 +159,7 @@ def test_model(model, is_ac):
 if args.mode == 'train':
     print('TRAIN')
     print("PER", args.per)
+    print("Actor critic", args.ac)
     if args.ac:
         start_training_ac()
     else:
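The new save_weights calls store only parameters in the .h5 files checked in above, so test_model has to rebuild an agent with the same architecture before agent.network.load_weights(model) can succeed. A small illustrative sketch of that contract follows; the hidden-layer sizes are hypothetical and the helper below is not the repository's code, it only mirrors the Sequential construction used by DQNAgent.

import tensorflow as tf

def build_network(hidden_layers, state_spec, action_spec):
    # Input layer, ReLU hidden layers, linear Q-value output, as in DQNAgent.__init__.
    net = tf.keras.models.Sequential()
    net.add(tf.keras.layers.InputLayer(input_shape=(state_spec,)))
    for units in hidden_layers:
        net.add(tf.keras.layers.Dense(units, activation='relu'))
    net.add(tf.keras.layers.Dense(action_spec, activation='linear'))
    return net

# Hypothetical hidden sizes; Acrobot-v1 has a 6-dimensional observation and 3 discrete actions.
train_net = build_network([32, 32], state_spec=6, action_spec=3)
train_net.save_weights('dqn_no_priority_network.h5')

# Loading requires an identically shaped model; only the weights live in the .h5 file.
test_net = build_network([32, 32], state_spec=6, action_spec=3)
test_net.load_weights('dqn_no_priority_network.h5')

If the architecture passed to DQNAgent at test time does not match the one used when the weights were saved, load_weights fails with a shape mismatch.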
