actor.py
from collections import deque
import random

import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import SGD


class Actor(object):
"""
actor learns to predict a set of "deltas", which represent
the difference in value between a new state and the current
state. the actor will choose the action that will result in
the largest delta.
"""

    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)  # replay buffer of recent experiences
        self.brain = self._build_model()
        self.epsilon = 1.0                # exploration rate, annealed after each replay
        self.exploration_min = 0.01
        self.exploration_decay = 0.995
        self.n_batch = 32

    def _build_model(self):
        model = Sequential()
        model.add(Dense(164, input_shape=(self.state_size,), activation='relu'))
        model.add(Dense(150, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))  # output is the action space, not a single value prediction
        # Keras 2-style optimizer arguments; newer Keras spells these
        # learning_rate and handles decay through a learning-rate schedule.
        a_optimizer = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)
        model.compile(loss='mse', optimizer=a_optimizer)
        return model

    def remember(self, experience):
        # experience is a (state, action, value) tuple consumed by replay()
        self.memory.append(experience)

    def predict(self, state):
        return self.brain.predict(state)

    def act(self, state):
        # epsilon-greedy: explore with probability epsilon, otherwise exploit
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        else:
            return np.argmax(self.brain.predict(state)[0])

    def replay(self):
        """
        The actor learns the deltas between the current state and the next
        state. For every stored state, it predicts a value delta for each
        possible action; rather than moving to the highest-value state, it
        takes the action with the highest value difference.
        """
        if len(self.memory) > self.n_batch:
            X_train = []
            y_train = []
            minibatch = random.sample(self.memory, self.n_batch)
            for memory in minibatch:
                m_orig_state, m_action, m_value = memory
                # start from the network's current predictions and overwrite
                # only the entry for the action that was actually taken
                old_qval = self.brain.predict(m_orig_state.reshape(1, self.state_size))
                y = np.zeros((1, self.action_size))
                y[:] = old_qval[:]
                y[0][m_action] = m_value
                X_train.append(m_orig_state.reshape((self.state_size,)))
                y_train.append(y.reshape((self.action_size,)))
            X_train = np.array(X_train)
            y_train = np.array(y_train)
            self.brain.fit(X_train, y_train, batch_size=self.n_batch, epochs=1, verbose=0)

        # decrease epsilon over time
        if self.epsilon > self.exploration_min:
            self.epsilon *= self.exploration_decay
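

# A minimal usage sketch, not part of the original module: the state shapes
# and the random "value" signal below are assumptions made purely to
# exercise the Actor API defined above.
if __name__ == "__main__":
    state_size, action_size = 4, 3
    actor = Actor(state_size, action_size)
    for step in range(200):
        state = np.random.rand(state_size)    # hypothetical observation
        action = actor.act(state.reshape(1, state_size))
        value = np.random.rand()              # stand-in for the observed value delta
        actor.remember((state, action, value))
        actor.replay()  # trains once the buffer holds more than n_batch samples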