class DQNAgent:

-    def __init__(self, hidden_layers, state_spec, action_spec, buffer, learning_rate, is_prioritized, is_double):
+    def __init__(self, hidden_layers, state_spec, action_spec, buffer, learning_rate, is_prioritized):
        self.buffer = buffer
-        self.training_network = tf.keras.models.Sequential()
-        self.training_network.add(tf.keras.layers.InputLayer(input_shape=(state_spec,)))
+        self.network = tf.keras.models.Sequential()
+        self.network.add(tf.keras.layers.InputLayer(input_shape=(state_spec,)))

        for hidden_layer in hidden_layers:
-            self.training_network.add(tf.keras.layers.Dense(hidden_layer, activation='relu'))
+            self.network.add(tf.keras.layers.Dense(hidden_layer, activation='relu'))

-        self.training_network.add(tf.keras.layers.Dense(action_spec, activation='linear'))
+        self.network.add(tf.keras.layers.Dense(action_spec, activation='linear'))

        self.optimizer = tf.keras.optimizers.Adam(learning_rate)
        self.is_prioritized = is_prioritized
-        self.is_double = is_double
        self.loss = tf.keras.losses.mean_squared_error

+    # Epsilon-greedy policy
    def play_action(self, state, epsilon):
        if np.random.random() < epsilon:
-            return np.random.choice(self.training_network.output_shape[1])
+            return np.random.choice(self.network.output_shape[1])
        else:
-            return np.argmax(self.training_network.predict(np.atleast_2d(state))[0])
+            return np.argmax(self.network.predict(np.atleast_2d(state))[0])

    def train(self, gamma, batch_size):
        if self.is_prioritized:
@@ -32,12 +32,12 @@ def train(self, gamma, batch_size):
        else:
            states, actions, rewards, states_next, dones = self.buffer.sample(batch_size)

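        # Bellman target: reward plus the discounted maximum Q-value of the next state, zeroed for terminal transitions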
-        target = rewards + (1 - dones) * gamma * np.max(self.training_network.predict(states_next), axis=1)
+        target = rewards + (1 - dones) * gamma * np.max(self.network.predict(states_next), axis=1)

        # Custom training loop adapted from the teaching material on the course website
-        mask = tf.one_hot(actions, self.training_network.output_shape[1])
+        mask = tf.one_hot(actions, self.network.output_shape[1])
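        # The one-hot mask keeps only the Q-value of the action actually taken in each sampled transition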
        with tf.GradientTape() as tape:
-            q_values = self.training_network(states)
+            q_values = self.network(states)
            predicted = tf.reduce_sum(q_values * mask, axis=1)

            if self.is_prioritized:
@@ -48,7 +48,7 @@ def train(self, gamma, batch_size):
        if self.is_prioritized:
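            # Use the TD error (target - predicted) as the updated priority of the sampled transitions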
            self.buffer.update_priority(ids, (target - predicted))

-        variables = self.training_network.trainable_variables
+        variables = self.network.trainable_variables
        gradients = tape.gradient(loss, variables)
        self.optimizer.apply_gradients(zip(gradients, variables))

@@ -78,13 +78,15 @@ def __init__(self, hidden_layers_actor, hidden_layers_critic, state_spec, action
        self.optimizer_critic = tf.keras.optimizers.Adam(learning_rate_critic)
        self.loss = tf.keras.losses.mean_squared_error

+    # Play an action by following the policy (the output of the actor network)
    def play_action(self, state):
        probabilities = self.actor_network(np.atleast_2d(state))
        selection_probabilities = probabilities[0] / np.sum(probabilities[0])
        action = np.random.choice(self.actor_network.output_shape[1], p=selection_probabilities)
        return action

    def play_and_train(self, state, env, gamma):
+        # persistent=True is needed because tape.gradient will be called twice: once for the critic and once for the actor
        with tf.GradientTape(persistent=True) as tape:
            probabilities = self.actor_network(np.atleast_2d(state))
            # I need to normalize the probabilities because numpy requires that they sum to 1 and the softmax gives me