-- deepqlearn.lua
-- forked from blakeMilner/DeepQLearning
require 'math'
require 'nnx'
require 'os'
require 'optim'

math.randomseed(os.time())
torch.setdefaulttensortype('torch.FloatTensor')

local Brain = {}

--[[ HELPER FUNCTIONS --]]
-- returns a (roughly) uniform random float in [s, e)
function randf(s, e)
    return (math.random(0, (e - s) * 9999) / 10000) + s;
end
-- new methods for table

-- append the elements of t2 to t1 (note: this modifies t1 in place)
function table.merge(t1, t2)
    local t = t1
    for i = 1, #t2 do
        t[#t + 1] = t2[i]
    end
    return t
end
-- shallow-copy a table, preserving its metatable
function table.copy(t)
    local u = {}
    for k, v in pairs(t) do u[k] = v end
    return setmetatable(u, getmetatable(t))
end
-- count all entries of a table (unlike #, this also counts non-array keys)
function table.length(T)
    local count = 0
    for _ in pairs(T) do count = count + 1 end
    return count
end
--[[ Returns an experience table for a single network decision.
It contains the state, the action chosen, the reward obtained, and the
state that resulted from the action. This is later used to train the network.
Remember that the utility of an action is evaluated from the reward gained and
the utility of the state it led to (a recursive definition).
--]]
function Experience(state0, action0, reward0, state1)
    local e = {};
    e.state0 = state0;
    e.action0 = action0;
    e.reward0 = reward0;
    e.state1 = state1;
    return e;
end
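
--[[ For example (hypothetical values): an experience recording that taking
action 2 in some state earned a reward of 1.0. In practice, state0 and state1
are the full assembled network-input vectors (see Brain.getNetInput below):
local e = Experience({0.5, 0.1}, 2, 1.0, {0.7, 0.2})
--]]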
-- BRAIN
function Brain.init(num_states, num_actions)
    -- number of past state/action pairs input to the network. 0 = agent lives in-the-moment :)
    Brain.temporal_window = 2
    -- maximum number of experiences that we will save for training
    Brain.experience_size = 30000
    -- number of experiences necessary before learning starts
    Brain.start_learn_threshold = 300
    -- gamma is a crucial parameter that controls how far ahead the agent plans. In [0,1].
    -- It determines the weight placed on the utility of the state resulting from an action.
    Brain.gamma = 0.9;

    -- number of steps we will learn for
    Brain.learning_steps_total = 100000
    -- how many of those steps consist of purely random actions (at the beginning)?
    Brain.learning_steps_burnin = 300;
    -- controls the exploration/exploitation tradeoff; decays over time.
    -- A higher epsilon means we are more likely to choose random actions.
    Brain.epsilon = 1.0
    -- what epsilon value do we bottom out on? 0.0 => purely deterministic policy at the end
    Brain.epsilon_min = 0.05;
    -- what epsilon to use when learning is turned off (this is for testing)
    Brain.epsilon_test_time = 0.01;

    --[[ states and actions that go into the neural net:
    (state0,action0), (state1,action1), ..., (stateN)
    temporal_window controls the size of that window.
    --]]
    Brain.net_inputs = (num_states + num_actions) * Brain.temporal_window + num_states;
    Brain.hidden_nodes = 16
    Brain.num_states = num_states;
    Brain.num_actions = num_actions;
    Brain.net_outputs = Brain.num_actions;

    --[[ window_size dictates the number of states, actions, rewards, and net inputs
    that we save over time. temporal_window of them are input to the network,
    so temporal_window must be smaller than or equal to window_size.
    --]]
    Brain.window_size = math.max(Brain.temporal_window, 2);
    -- advanced feature: sometimes a random action should be biased towards certain values.
    -- For example, in Flappy Bird we may want to choose not to flap more often.
    Brain.random_action_distribution = {};
    if (table.length(Brain.random_action_distribution) > 0) then
        -- the distribution should sum to 1 and have num_actions entries
        if (table.length(Brain.random_action_distribution) ~= Brain.num_actions) then
            print('TROUBLE. random_action_distribution should be same length as num_actions.');
        end

        local s = 0.0;
        for k = 1, table.length(Brain.random_action_distribution) do
            s = s + Brain.random_action_distribution[k]
        end
        if (math.abs(s - 1.0) > 0.0001) then
            print('TROUBLE. random_action_distribution should sum to 1!');
        end
    end
    -- define the network architecture: two hidden layers with ReLU activations
    -- (nn.Threshold(0, 0) is equivalent to ReLU)
    Brain.net = nn.Sequential()
    Brain.net:add(nn.Linear(Brain.net_inputs, Brain.hidden_nodes))
    Brain.net:add(nn.Threshold(0, 0))
    Brain.net:add(nn.Linear(Brain.hidden_nodes, Brain.hidden_nodes))
    Brain.net:add(nn.Threshold(0, 0))
    Brain.net:add(nn.Linear(Brain.hidden_nodes, Brain.net_outputs))

    Brain.criterion = nn.MSECriterion()
    -- other learning parameters
    Brain.learning_rate = 0.01;
    Brain.learning_rate_decay = 5e-7
    Brain.batch_size = 16;
    Brain.momentum = 0.9;

    -- various housekeeping variables
    Brain.age = 0; -- incremented every backward()
    -- number of times we've called forward - lets us know when our input temporal
    -- window is filled up
    Brain.forward_passes = 0;
    Brain.learning = true;

    -- coefficients for L1/L2 regularization
    Brain.coefL1 = 0.001
    Brain.coefL2 = 0.001

    -- parameters for optim.sgd
    Brain.parameters, Brain.gradParameters = Brain.net:getParameters()

    -- These windows track old experiences, states, actions, rewards, and net inputs
    -- over time. They all start out empty with a fixed size.
    -- Each is a first-in, first-out queue that is shifted along with time.
    Brain.experience = {};
    Brain.state_window = {}
    Brain.action_window = {}
    Brain.reward_window = {}
    Brain.net_window = {}
    for i = 1, Brain.window_size do
        Brain.state_window[i] = {}
        Brain.action_window[i] = {}
        Brain.reward_window[i] = {}
        Brain.net_window[i] = {}
    end
end
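
--[[ A minimal sizing sketch (hypothetical problem sizes): for an agent with
4 state variables and 2 actions, with the default temporal_window of 2, the
input layer receives (4 + 2) * 2 + 4 = 16 values:
Brain.init(4, 2)
print(Brain.net_inputs)   -- 16
print(Brain.net_outputs)  -- 2
--]]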
-- A helper function that returns a random action.
-- We abstract this away because in the future we may want to do more
-- sophisticated things: for example, some actions could be more or less
-- likely at the "rest"/default state.
function Brain.random_action()
    -- if we don't have a random action distribution defined then sample evenly
    if (table.length(Brain.random_action_distribution) == 0) then
        return (torch.random() % Brain.net_outputs) + 1
    -- okay, let's do some fancier sampling:
    else
        local p = randf(0, 1);
        local cumprob = 0.0;
        for k = 1, Brain.num_actions do
            cumprob = cumprob + Brain.random_action_distribution[k];
            if (p < cumprob) then
                return k
            end
        end
    end
end
-- compute the Q-value of every action in the given state
-- and return the argmax action and its value
function Brain.policy(state)
    local tensor_state = torch.Tensor(state)
    local action_values = Brain.net:forward(tensor_state);

    -- find the maximum output and note its index and value
    local maxval = action_values[1]
    local max_index = 1
    for i = 2, Brain.net_outputs do
        if action_values[i] > maxval then
            maxval = action_values[i]
            max_index = i
        end
    end

    return {action = max_index, value = maxval};
end
-- This function assembles the input to the network by concatenating
-- the current state with old (state, chosen_action) pairs,
-- i.e. it returns w = (xt, x,a, x,a, ...) as a flat state vector.
function Brain.getNetInput(xt)
    local w = {};
    w = table.merge(w, xt); -- start with the current state

    -- and now go backwards and append states and actions from history, temporal_window times
    local n = Brain.window_size + 1;
    for k = 1, Brain.temporal_window do
        -- state
        w = table.merge(w, Brain.state_window[n - k]);
        -- action, encoded as a 1-of-k indicator vector. We scale it up a bit because
        -- we don't want weight regularization to undervalue this information, as it only occurs once
        local action1ofk = {};
        for i = 1, Brain.num_actions do
            action1ofk[i] = 0
        end
        -- set the entry for the action taken to num_states; all others stay 0
        action1ofk[Brain.action_window[n - k]] = 1.0 * Brain.num_states;

        w = table.merge(w, action1ofk);
    end

    return w;
end
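
--[[ Layout sketch (hypothetical sizes: num_states = 4, num_actions = 2,
temporal_window = 2). The assembled w comes out as
{ x_t (4), x_{t-1} (4), a_{t-1} (2, scaled 1-of-k), x_{t-2} (4), a_{t-2} (2) },
i.e. (4 + 2) * 2 + 4 = 16 entries, matching Brain.net_inputs.
--]]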
--[[ This function computes an action by either:
1. giving the current state and past (state, action) pairs to the network
   and letting it choose the best action, or
2. choosing a random action
--]]
function Brain.forward(input_array)
    Brain.forward_passes = Brain.forward_passes + 1;

    local action, net_input;

    -- if we have enough (state, action) pairs in our memory to fill up
    -- the network input then we let the network choose the action
    if (Brain.forward_passes > Brain.temporal_window) then
        net_input = Brain.getNetInput(input_array);

        -- if learning is turned on then epsilon should be decaying
        if (Brain.learning) then
            -- compute the (decaying) epsilon for the epsilon-greedy policy
            local new_epsilon = 1.0 - (Brain.age - Brain.learning_steps_burnin) / (Brain.learning_steps_total - Brain.learning_steps_burnin)
            -- clamp epsilon to [epsilon_min, 1.0]
            Brain.epsilon = math.min(1.0, math.max(Brain.epsilon_min, new_epsilon));
        else
            -- if learning is turned off then use the epsilon we've specified for testing
            Brain.epsilon = Brain.epsilon_test_time;
        end

        -- with probability epsilon choose a random action, otherwise the network's action
        if (randf(0, 1) < Brain.epsilon) then
            action = Brain.random_action();
        else
            -- otherwise use our policy to make the decision
            local best_action = Brain.policy(net_input);
            action = best_action.action; -- this is the action number
        end
    else
        -- pathological case that happens in the first few iterations, when we can't
        -- fill up our network inputs. Just default to a random action in this case
        net_input = {};
        action = Brain.random_action();
    end

    -- shift the network input, state, and chosen action into our windows
    table.remove(Brain.net_window, 1)
    table.insert(Brain.net_window, net_input)

    table.remove(Brain.state_window, 1)
    table.insert(Brain.state_window, input_array)

    table.remove(Brain.action_window, 1)
    table.insert(Brain.action_window, action)

    return action;
end
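
--[[ Worked example of the epsilon schedule above, with the default settings
(learning_steps_burnin = 300, learning_steps_total = 100000, epsilon_min = 0.05):
age = 300     -> epsilon = 1.0 (still fully random)
age = 50150   -> epsilon = 0.5 (halfway through the decay)
age >= 95015  -> epsilon clamps at epsilon_min = 0.05
--]]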
--[[
This function trains the network using the reward resulting from the last action.
It saves this past experience, which consists of:
the state, the action chosen, the reward obtained, and the
state that resulted from the action.
After that, it trains the network on a batch of experiences drawn by
random sampling from our entire experience history.
--]]
function Brain.backward(reward)
    -- add the reward to our history
    table.remove(Brain.reward_window, 1)
    table.insert(Brain.reward_window, reward)

    -- if learning is turned off then don't do anything
    if (not Brain.learning) then
        return;
    end

    Brain.age = Brain.age + 1;

    -- if we've had enough states and actions to fill up our net input then add
    -- this new experience to our history
    if (Brain.forward_passes > Brain.temporal_window + 1) then
        -- make an experience and fill it up
        local e = Experience(nil, nil, nil, nil);
        local n = Brain.window_size;
        e.state0 = Brain.net_window[n - 1];
        e.action0 = Brain.action_window[n - 1];
        e.reward0 = Brain.reward_window[n - 1];
        e.state1 = Brain.net_window[n];

        -- if our experience table isn't larger than the max size then expand it
        if (table.length(Brain.experience) < Brain.experience_size) then
            table.insert(Brain.experience, e)
        else
            -- otherwise replace a random experience. Finite memory!
            local ri = torch.random(1, Brain.experience_size);
            Brain.experience[ri] = e;
        end
    end
    -- if we have enough experience in memory then start training
    if (table.length(Brain.experience) > Brain.start_learn_threshold) then
        local inputs = torch.Tensor(Brain.batch_size, Brain.net_inputs)
        local targets = torch.Tensor(Brain.batch_size, Brain.net_outputs)

        for k = 1, Brain.batch_size do
            -- choose a random experience
            local re = math.random(1, table.length(Brain.experience));
            local e = Brain.experience[re];

            -- copy the state from the experience
            local x = torch.Tensor(e.state0);

            -- compute the best action for the new state
            local best_action = Brain.policy(e.state1);

            --[[ Get the current action output values. We want the target outputs
            to equal the actual outputs, except for the action that was chosen:
            that entry is replaced by the reward that was obtained plus the
            discounted utility of the resulting state, i.e. the Q-learning target
            reward0 + gamma * max_a Q(state1, a).
            --]]
            local all_outputs = Brain.net:forward(x);
            inputs[k] = x:clone();
            targets[k] = all_outputs:clone();
            targets[k][e.action0] = e.reward0 + Brain.gamma * best_action.value;
        end
        -- create the training function to give to optim.sgd
        local feval = function(x)
            collectgarbage()

            -- get the new network parameters
            if x ~= Brain.parameters then
                Brain.parameters:copy(x)
            end

            -- reset gradients
            Brain.gradParameters:zero()

            -- evaluate the function for the complete mini batch
            local outputs = Brain.net:forward(inputs)
            local f = Brain.criterion:forward(outputs, targets)

            -- estimate df/dW
            local df_do = Brain.criterion:backward(outputs, targets)
            Brain.net:backward(inputs, df_do)

            -- penalties (L1 and L2):
            if Brain.coefL1 ~= 0 or Brain.coefL2 ~= 0 then
                -- locals:
                local norm, sign = torch.norm, torch.sign

                -- loss:
                f = f + Brain.coefL1 * norm(Brain.parameters, 1)
                f = f + Brain.coefL2 * norm(Brain.parameters, 2) ^ 2 / 2

                -- gradients:
                Brain.gradParameters:add(sign(Brain.parameters):mul(Brain.coefL1) + Brain.parameters:clone():mul(Brain.coefL2))
            end

            -- return f and df/dX
            return f, Brain.gradParameters
        end

        -- fire up optim.sgd
        local sgdState = {
            learningRate = Brain.learning_rate,
            momentum = Brain.momentum,
            learningRateDecay = Brain.learning_rate_decay
        }

        optim.sgd(feval, Brain.parameters, sgdState)
    end
end
-- export
return Brain
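
--[[ A minimal usage sketch, assuming this file is on your Lua path as
'deepqlearn'. getState, applyAction, and getReward are hypothetical hooks
standing in for your own environment code:

local Brain = require 'deepqlearn'
Brain.init(4, 2)  -- e.g. 4 state variables, 2 possible actions

for step = 1, 1000 do
    local state = getState()              -- observe the environment
    local action = Brain.forward(state)   -- epsilon-greedy action selection
    applyAction(action)                   -- act on the environment
    Brain.backward(getReward())           -- learn from the resulting reward
end

-- after training, freeze the policy for evaluation:
Brain.learning = false
--]]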