-- gpuqlearn.moon
-- forked from blakeMilner/DeepQLearning
require 'math'
require 'nnx'
require 'os'
require 'optim'
require 'cutorch'
require 'cunn'
math.randomseed os.time!
torch.setdefaulttensortype 'torch.FloatTensor'
Brain = {}
-- HELPER FUNCTIONS --
-- random float in the interval [s, e)
export randf = (s, e) ->
  return (math.random(0, (e - s) * 9999) / 10000) + s

-- new methods for table

-- append the elements of t2 to t1 (modifies and returns t1)
table.merge = (t1, t2) ->
  t = t1
  for i = 1, #t2
    t[#t + 1] = t2[i]
  return t

-- shallow copy that preserves the metatable
table.copy = (t) ->
  u = {k, v for k, v in pairs t}
  return setmetatable(u, getmetatable t)

-- number of entries in a table (works for non-sequential keys too)
table.length = (T) ->
  count = 0
  count += 1 for _ in pairs T
  return count
-- returns an experience table for a single network decision.
-- It contains the state, the action chosen, the reward obtained, and the
-- state that resulted from the action. This is later used to train the network.
-- Remember that the utility of an action is evaluated from the reward gained and
-- the utility of the state it led to (a recursive definition)
export Experience = (state0, action0, reward0, state1) ->
  NewExperience =
    state0: state0
    action0: action0
    reward0: reward0
    state1: state1
  return NewExperience
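
-- In Brain.backward each stored experience yields one Q-learning training target:
--   Q(state0, action0) <- reward0 + gamma * max_a Q(state1, a)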
-- BRAIN
Brain.init = (num_states, num_actions) ->
  -- number of past state/action pairs fed to the network. 0 = agent lives in-the-moment :)
  Brain.temporal_window = 2
  -- maximum number of experiences we will save for training
  Brain.experience_size = 30000
  -- number of experiences necessary before learning starts
  Brain.start_learn_threshold = 300
  -- gamma is a crucial parameter that controls how much the agent plans ahead. In [0, 1].
  -- It determines the weight placed on the utility of the state resulting from an action.
  Brain.gamma = 0.9
  -- number of steps we will learn for
  Brain.learning_steps_total = 100000
  -- how many of those steps use purely random actions (in the beginning)?
  Brain.learning_steps_burnin = 300
  -- controls the exploration/exploitation tradeoff and decays over time.
  -- A higher epsilon means we are more likely to choose random actions
  Brain.epsilon = 1.0
  -- what epsilon value do we bottom out on? 0.0 => purely deterministic policy at the end
  Brain.epsilon_min = 0.05
  -- what epsilon to use when learning is turned off. This is for testing
  Brain.epsilon_test_time = 0.01

  -- the network input is the current state plus the last temporal_window
  -- (state, action) pairs: (state0, action0), (state1, action1), ..., (stateN)
  Brain.net_inputs = (num_states + num_actions) * Brain.temporal_window + num_states
  Brain.hidden_nodes = 16
  Brain.num_states = num_states
  Brain.num_actions = num_actions
  Brain.net_outputs = Brain.num_actions

  -- window_size dictates how many states, actions, rewards, and net inputs we
  -- save over time. temporal_window states/actions are fed to the network, so
  -- temporal_window must be smaller than or equal to window_size
  Brain.window_size = math.max Brain.temporal_window, 2

  -- advanced feature: sometimes a random action should be biased towards certain values.
  -- For example, in Flappy Bird we may want to choose not to flap more often
  Brain.random_action_distribution = {}
  if table.length(Brain.random_action_distribution) > 0
    -- this must sum to 1 and have length Brain.num_actions
    if table.length(Brain.random_action_distribution) != Brain.num_actions
      print 'TROUBLE. random_action_distribution should be same length as num_actions.'
    s = 0.0
    for k = 1, table.length Brain.random_action_distribution
      s += Brain.random_action_distribution[k]
    if math.abs(s - 1.0) > 0.0001
      print 'TROUBLE. random_action_distribution should sum to 1!'

  -- define the network architecture
  Brain.net = nn.Sequential!
  Brain.net\add nn.Linear Brain.net_inputs, Brain.hidden_nodes
  Brain.net\add nn.Threshold 0, 0
  Brain.net\add nn.Linear Brain.hidden_nodes, Brain.hidden_nodes
  Brain.net\add nn.Threshold 0, 0
  Brain.net\add nn.Linear Brain.hidden_nodes, Brain.net_outputs
  Brain.net\cuda! -- move network to GPU
  Brain.criterion = nn.MSECriterion!\cuda!

  -- other learning parameters
  Brain.learning_rate = 0.01
  Brain.learning_rate_decay = 5e-7
  Brain.batch_size = 16
  Brain.momentum = 0.9

  -- various housekeeping variables
  Brain.age = 0 -- incremented on every call to Brain.backward
  -- number of times we've called forward - lets us know when our input temporal
  -- window is filled up
  Brain.forward_passes = 0
  Brain.learning = true

  -- coefficients for L1/L2 regularization
  Brain.coefL1 = 0.001
  Brain.coefL2 = 0.001

  -- flattened parameters and gradients for optim.sgd
  Brain.parameters, Brain.gradParameters = Brain.net\getParameters!

  -- These windows track old experiences, states, actions, rewards, and net inputs
  -- over time. They start out filled with empty tables at a fixed size and are
  -- shifted like a sliding window (oldest entry dropped, newest appended)
  Brain.experience = {}
  Brain.state_window = {}
  Brain.action_window = {}
  Brain.reward_window = {}
  Brain.net_window = {}
  for i = 1, Brain.window_size
    Brain.state_window[i] = {}
    Brain.action_window[i] = {}
    Brain.reward_window[i] = {}
    Brain.net_window[i] = {}
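
-- Example sizing (hypothetical numbers): Brain.init(4, 2) with temporal_window = 2
-- gives net_inputs = (4 + 2) * 2 + 4 = 16 and net_outputs = 2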
-- a bit of a helper function. It returns a random action.
-- We abstract this away because in the future we may want to do more
-- sophisticated things, e.g. some actions could be more or less likely
-- at the "rest"/default state.
Brain.random_action = ->
  -- if we don't have a random action distribution defined then sample uniformly
  if table.length(Brain.random_action_distribution) == 0
    return (torch.random! % Brain.net_outputs) + 1
  else
    -- okay, let's do some fancier sampling:
    p = randf 0, 1
    cumprob = 0.0
    for k = 1, Brain.num_actions
      cumprob += Brain.random_action_distribution[k]
      if p < cumprob
        return k
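
-- For example (hypothetical values), with num_actions = 2 a biased distribution such as
-- Brain.random_action_distribution = {0.8, 0.2} would pick action 2 only ~20% of the time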
-- compute the value of taking each action in this state
-- and return the argmax action and its value
Brain.policy = (state) ->
  tensor_state = torch.Tensor(state)\cuda!
  action_values = Brain.net\forward tensor_state
  maxval = action_values[1]
  max_index = 1
  -- find the maximum output and note its index and value
  for i = 2, Brain.net_outputs
    if action_values[i] > maxval
      maxval = action_values[i]
      max_index = i
  return action: max_index, value: maxval
-- This function assembles the input to the network by concatenating
-- the current state with old (state, chosen_action) pairs from history,
-- returning the state vector s = (xt, x, a, x, a, ...)
Brain.getNetInput = (xt) ->
  w = {}
  w = table.merge(w, xt) -- start with the current state
  -- now go backwards and append states and actions from history, temporal_window times
  n = Brain.window_size + 1
  for k = 1, Brain.temporal_window
    -- state
    w = table.merge w, Brain.state_window[n - k]
    -- action, encoded as a 1-of-k indicator vector. We scale it up a bit because
    -- we don't want weight regularization to undervalue this information, as it only appears once
    action1ofk = {}
    action1ofk[i] = 0 for i = 1, Brain.num_actions
    -- set the entry for the action taken in that state to a nonzero value, all others stay 0
    action1ofk[Brain.action_window[n - k]] = 1.0 * Brain.num_states
    w = table.merge w, action1ofk
  return w
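
-- Resulting layout for temporal_window = 2:
--   { xt..., x(t-1)..., onehot(a(t-1))..., x(t-2)..., onehot(a(t-2))... }
-- which has length (num_states + num_actions) * 2 + num_states = Brain.net_inputs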
-- This function computes an action by either:
-- 1. giving the current state and past (state, action) pairs to the network
--    and letting it choose the best action, or
-- 2. choosing a random action
Brain.forward = (input_array) ->
  Brain.forward_passes += 1
  local action, net_input

  -- if we have enough (state, action) pairs in our memory to fill up
  -- our network input then we'll proceed to let our network choose the action
  if Brain.forward_passes > Brain.temporal_window
    net_input = Brain.getNetInput input_array

    -- if learning is turned on then epsilon should be decaying
    if Brain.learning
      -- compute (linearly decaying) epsilon for the epsilon-greedy policy
      new_epsilon = 1.0 - (Brain.age - Brain.learning_steps_burnin) / (Brain.learning_steps_total - Brain.learning_steps_burnin)
      -- clamp epsilon to the range [epsilon_min, 1.0]
      Brain.epsilon = math.min(1.0, math.max(Brain.epsilon_min, new_epsilon))
    else
      -- if learning is turned off then use the epsilon we've specified for testing
      Brain.epsilon = Brain.epsilon_test_time

    -- with probability epsilon choose a random action, otherwise use the network's action
    if randf(0, 1) < Brain.epsilon
      action = Brain.random_action!
    else
      -- otherwise use our policy to make the decision
      best_action = Brain.policy net_input
      action = best_action.action -- this is the action number
  else
    -- pathological case that happens in the first few iterations, when we can't
    -- fill up our network inputs yet. Just default to a random action in this case
    net_input = {}
    action = Brain.random_action!

  -- shift the network input, state, and action chosen into our windows
  table.remove Brain.net_window, 1
  table.insert Brain.net_window, net_input
  table.remove Brain.state_window, 1
  table.insert Brain.state_window, input_array
  table.remove Brain.action_window, 1
  table.insert Brain.action_window, action

  return action
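
-- Epsilon schedule implied by the code above: epsilon stays at 1.0 until age reaches
-- learning_steps_burnin, then decays linearly toward 0 at learning_steps_total,
-- but is clamped below at epsilon_min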
-- This function trains the network using the reward resulting from the last action.
-- It first saves the past experience, which consists of:
-- the state, the action chosen, the reward obtained, and the
-- state that resulted from the action.
-- After that, it trains the network on a batch of experiences drawn at
-- random from our entire experience history.
Brain.backward = (reward) ->
  -- add the reward to our history
  table.remove Brain.reward_window, 1
  table.insert Brain.reward_window, reward

  -- if learning is turned off then don't do anything
  return unless Brain.learning

  Brain.age += 1

  -- if we've had enough states and actions to fill up our net input then add
  -- this new experience to our history
  if Brain.forward_passes > Brain.temporal_window + 1
    -- make an experience and fill it up
    e = Experience nil, nil, nil, nil
    n = Brain.window_size
    e.state0 = Brain.net_window[n - 1]
    e.action0 = Brain.action_window[n - 1]
    e.reward0 = Brain.reward_window[n - 1]
    e.state1 = Brain.net_window[n]

    -- if our experience table isn't larger than the max size then expand it
    if table.length(Brain.experience) < Brain.experience_size
      table.insert Brain.experience, e
    -- otherwise replace a random experience. finite memory!
    else
      ri = torch.random 1, Brain.experience_size
      Brain.experience[ri] = e

  -- if we have enough experience in memory then start training
  if table.length(Brain.experience) > Brain.start_learn_threshold
    inputs = torch.Tensor(Brain.batch_size, Brain.net_inputs)\cuda!
    targets = torch.Tensor(Brain.batch_size, Brain.net_outputs)\cuda!

    for k = 1, Brain.batch_size
      -- choose a random experience
      re = math.random 1, table.length Brain.experience
      e = Brain.experience[re]

      -- copy the state from the experience
      x = torch.Tensor(e.state0)\cuda!

      -- compute the best action for the new state
      best_action = Brain.policy e.state1

      -- get the current action output values.
      -- We want the target outputs to equal the actual outputs,
      -- except for the action that was chosen - that one we replace with
      -- the reward that was obtained plus the discounted utility of the resulting state
      all_outputs = Brain.net\forward x
      inputs[k] = x\clone!
      targets[k] = all_outputs\clone!
      targets[k][e.action0] = e.reward0 + Brain.gamma * best_action.value

    -- create the training closure to give to optim.sgd
    feval = (x) ->
      collectgarbage!

      -- get the new network parameters
      Brain.parameters\copy x unless x == Brain.parameters

      -- reset gradients
      Brain.gradParameters\zero!

      -- evaluate the loss for the complete mini-batch
      outputs = Brain.net\forward inputs
      f = Brain.criterion\forward outputs, targets

      -- estimate df/dW
      df_do = Brain.criterion\backward outputs, targets
      Brain.net\backward inputs, df_do

      -- penalties (L1 and L2):
      if Brain.coefL1 != 0 or Brain.coefL2 != 0
        -- locals:
        norm, sign = torch.norm, torch.sign

        -- loss:
        f += Brain.coefL1 * norm Brain.parameters, 1
        f += Brain.coefL2 * 0.5 * norm(Brain.parameters, 2) ^ 2

        -- gradients:
        Brain.gradParameters\add(sign(Brain.parameters)\mul(Brain.coefL1) + Brain.parameters\clone!\mul Brain.coefL2)

      -- return f and df/dX
      return f, Brain.gradParameters

    -- fire up optim.sgd
    sgdState =
      learningRate: Brain.learning_rate
      momentum: Brain.momentum
      learningRateDecay: Brain.learning_rate_decay

    optim.sgd feval, Brain.parameters, sgdState

-- export
return Brain
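
-- Example usage (a minimal sketch, not part of the original module).
-- `num_states`, `num_actions`, `get_state`, `do_action`, and `get_reward` are
-- hypothetical hooks into your environment:
--
--   Brain = require 'gpuqlearn'
--   Brain.init num_states, num_actions
--   while true
--     state  = get_state!          -- observe the environment
--     action = Brain.forward state -- epsilon-greedy action selection
--     do_action action             -- act on the environment
--     Brain.backward get_reward!   -- record the reward, store the experience, and train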