forked from blakeMilner/DeepQLearning
Compiled gpuqlearn.moon into gpuqlearn.lua
Yoni Lerner committed on Jan 26, 2015
1 parent d58397a · commit bad4191
Showing 1 changed file with 249 additions and 0 deletions.
-- gpuqlearn.lua: deep Q-learning brain for Torch7, compiled from gpuqlearn.moon.
-- The network and training run on the GPU via cutorch/cunn.
require('math')
require('nnx')
require('os')
require('optim')
require('cutorch')
require('cunn')

math.randomseed(os.time())
torch.setdefaulttensortype('torch.FloatTensor')
local Brain = { }

-- Uniform random float in [s, e).
randf = function(s, e)
  return (math.random(0, (e - s) * 9999) / 10000) + s
end

-- Appends the elements of t2 to t1 (mutates t1 in place) and returns it.
table.merge = function(t1, t2)
  local t = t1
  for i = 1, #t2 do
    t[#t + 1] = t2[i]
  end
  return t
end

-- Shallow copy of a table, preserving its metatable.
table.copy = function(t)
  local u
  do
    local _tbl_0 = { }
    for k, v in pairs(t) do
      _tbl_0[k] = v
    end
    u = _tbl_0
  end
  return setmetatable(u, getmetatable(t))
end

-- Number of entries in a table (counts non-array keys as well).
table.length = function(T)
  local count = 0
  for _ in pairs(T) do
    count = count + 1
  end
  return count
end
-- A single experience-replay tuple: (state, action, reward, next state).
Experience = function(state0, action0, reward0, state1)
  local NewExperience = {
    state0 = state0,
    action0 = action0,
    reward0 = reward0,
    state1 = state1
  }
  return NewExperience
end
Brain.init = function(num_states, num_actions)
  Brain.temporal_window = 2            -- number of past (state, action) pairs fed to the net
  Brain.experience_size = 30000        -- maximum replay-buffer size
  Brain.start_learn_threshold = 300    -- start training once this many experiences are stored
  Brain.gamma = 0.9                    -- discount factor for future rewards
  Brain.learning_steps_total = 100000  -- epsilon is annealed over this many learning steps
  Brain.learning_steps_burnin = 300    -- act fully at random for this many learning steps first
  Brain.epsilon = 1.0                  -- current exploration probability
  Brain.epsilon_min = 0.05             -- epsilon floor while learning
  Brain.epsilon_test_time = 0.01       -- epsilon used when learning is disabled
  --[[ States and actions that go into the neural net:
       (state0, action0), (state1, action1), ..., (stateN)
       temporal_window controls the size of that temporal window. ]]
  Brain.net_inputs = (num_states + num_actions) * Brain.temporal_window + num_states
  Brain.hidden_nodes = 16
  Brain.num_states = num_states
  Brain.num_actions = num_actions
  Brain.net_outputs = Brain.num_actions
  --[[ window_size dictates how many past states, actions, rewards and net inputs are
       saved. temporal_window is the number of time steps whose states/actions are fed
       to the network and must be smaller than or equal to window_size. ]]
  Brain.window_size = math.max(Brain.temporal_window, 2)
  -- Optional non-uniform distribution over random actions; must sum to 1 if provided.
  Brain.random_action_distribution = { }
  if table.length(Brain.random_action_distribution) > 0 then
    if table.length(Brain.random_action_distribution) ~= Brain.num_actions then
      print('TROUBLE. random_action_distribution should be same length as num_actions.')
    end
    local s = 0.0
    for k = 1, table.length(Brain.random_action_distribution) do
      s = s + Brain.random_action_distribution[k]
    end
    if math.abs(s - 1.0) > 0.0001 then
      print('TROUBLE. random_action_distribution should sum to 1!')
    end
  end
  -- Q-network: two hidden layers with ReLU activations (nn.Threshold(0, 0)), moved to the GPU.
  Brain.net = nn.Sequential()
  Brain.net:add(nn.Linear(Brain.net_inputs, Brain.hidden_nodes))
  Brain.net:add(nn.Threshold(0, 0))
  Brain.net:add(nn.Linear(Brain.hidden_nodes, Brain.hidden_nodes))
  Brain.net:add(nn.Threshold(0, 0))
  Brain.net:add(nn.Linear(Brain.hidden_nodes, Brain.net_outputs))
  Brain.net:cuda()
  Brain.criterion = nn.MSECriterion():cuda()
  -- SGD and regularisation settings.
  Brain.learning_rate = 0.01
  Brain.learning_rate_decay = 5e-7
  Brain.batch_size = 16
  Brain.momentum = 0.9
  Brain.age = 0              -- number of learning (backward) steps taken so far
  Brain.forward_passes = 0   -- number of forward calls so far
  Brain.learning = true
  Brain.coefL1 = 0.001       -- L1 regularisation coefficient
  Brain.coefL2 = 0.001       -- L2 regularisation coefficient
  Brain.parameters, Brain.gradParameters = Brain.net:getParameters()
  -- Replay buffer and rolling windows of recent states, actions, rewards and net inputs.
  Brain.experience = { }
  Brain.state_window = { }
  Brain.action_window = { }
  Brain.reward_window = { }
  Brain.net_window = { }
  for i = 1, Brain.window_size do
    Brain.state_window[i] = { }
    Brain.action_window[i] = { }
    Brain.reward_window[i] = { }
    Brain.net_window[i] = { }
  end
end
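-- Worked sizing example (hypothetical numbers, not part of this commit): calling
-- Brain.init(4, 3) with the defaults above gives net_inputs = (4 + 3) * 2 + 4 = 18 and
-- net_outputs = 3, so the Q-network maps an 18-value history vector to 3 Q-values.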
-- Pick a random action, uniformly or from random_action_distribution if one was given.
Brain.random_action = function()
  if table.length(Brain.random_action_distribution) == 0 then
    return (torch.random() % Brain.net_outputs) + 1
  else
    -- Sample from the user-supplied categorical distribution.
    local p = randf(0, 1)
    local cumprob = 0.0
    for k = 1, Brain.num_actions do
      cumprob = cumprob + Brain.random_action_distribution[k]
      if p < cumprob then
        return k
      end
    end
    -- Guard against floating-point rounding leaving p just above the final cumprob.
    return Brain.num_actions
  end
end
-- Greedy policy: forward the state through the net and return the argmax action and its Q-value.
Brain.policy = function(state)
  local tensor_state = torch.Tensor(state):cuda()
  local action_values = Brain.net:forward(tensor_state)
  local maxval = action_values[1]
  local max_index = 1
  for i = 2, Brain.net_outputs do
    if action_values[i] > maxval then
      maxval = action_values[i]
      max_index = i
    end
  end
  return {
    action = max_index,
    value = maxval
  }
end
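-- Note: the explicit loop above could equivalently use Torch's built-in reduction.
-- A minimal sketch (assuming action_values is the 1-D output tensor returned by the net):
--   local maxval, max_index = action_values:max(1)
--   return { action = max_index[1], value = maxval[1] }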
-- Build the network input: the current state followed by the last temporal_window
-- states, each paired with a 1-of-k encoding of the action taken at that step.
Brain.getNetInput = function(xt)
  local w = { }
  w = table.merge(w, xt)
  local n = Brain.window_size + 1
  for k = 1, Brain.temporal_window do
    w = table.merge(w, Brain.state_window[n - k])
    local action1ofk = { }
    for i = 1, Brain.num_actions do
      action1ofk[i] = 0
    end
    -- 1-of-k action encoding, scaled by num_states to keep it on a scale similar to the state.
    action1ofk[Brain.action_window[n - k]] = 1.0 * Brain.num_states
    w = table.merge(w, action1ofk)
  end
  return w
end
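-- Layout example (same hypothetical Brain.init(4, 3) as above): w holds the 4 current
-- state values, then for each of the 2 previous steps its 4 state values followed by a
-- 3-slot action vector whose chosen slot is set to num_states (here 4) and the rest to 0.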
-- Forward pass: choose an action (epsilon-greedily while learning) for the given state
-- and record the state, action and net input in the rolling windows.
Brain.forward = function(input_array)
  Brain.forward_passes = Brain.forward_passes + 1
  local action, net_input
  if Brain.forward_passes > Brain.temporal_window then
    -- Enough history to build a full network input.
    net_input = Brain.getNetInput(input_array)
    if Brain.learning then
      -- Anneal epsilon linearly from 1.0 down to epsilon_min over the learning period.
      local new_epsilon = 1.0 - (Brain.age - Brain.learning_steps_burnin) / (Brain.learning_steps_total - Brain.learning_steps_burnin)
      Brain.epsilon = math.min(1.0, math.max(Brain.epsilon_min, new_epsilon))
    else
      Brain.epsilon = Brain.epsilon_test_time
    end
    if randf(0, 1) < Brain.epsilon then
      action = Brain.random_action()
    else
      local best_action = Brain.policy(net_input)
      action = best_action.action
    end
  else
    -- Not enough history yet: act at random.
    net_input = { }
    action = Brain.random_action()
  end
  -- Shift the rolling windows and append the newest entries.
  table.remove(Brain.net_window, 1)
  table.insert(Brain.net_window, net_input)
  table.remove(Brain.state_window, 1)
  table.insert(Brain.state_window, input_array)
  table.remove(Brain.action_window, 1)
  table.insert(Brain.action_window, action)
  return action
end
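-- Annealing example with the default schedule (burn-in 300, total 100000 steps): at age
-- 300 epsilon is 1.0 (pure exploration), by age 50150 it has fallen to 0.5, and from
-- roughly age 95015 onward it stays clamped at epsilon_min = 0.05.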
-- Receive the reward for the most recent action, store an experience tuple, and
-- (once enough experiences exist) train the network on a random minibatch.
Brain.backward = function(reward)
  table.remove(Brain.reward_window, 1)
  table.insert(Brain.reward_window, reward)
  if not (Brain.learning) then
    return
  end
  Brain.age = Brain.age + 1
  -- Store the transition (state0, action0, reward0, state1) once there is enough history.
  if Brain.forward_passes > Brain.temporal_window + 1 then
    local e = Experience(nil, nil, nil, nil)
    local n = Brain.window_size
    e.state0 = Brain.net_window[n - 1]
    e.action0 = Brain.action_window[n - 1]
    e.reward0 = Brain.reward_window[n - 1]
    e.state1 = Brain.net_window[n]
    if table.length(Brain.experience) < Brain.experience_size then
      table.insert(Brain.experience, e)
    else
      -- Replay buffer is full: overwrite a random slot.
      local ri = torch.random(1, Brain.experience_size)
      Brain.experience[ri] = e
    end
  end
  -- Train on a random minibatch once the replay buffer is large enough.
  if table.length(Brain.experience) > Brain.start_learn_threshold then
    local inputs = torch.Tensor(Brain.batch_size, Brain.net_inputs):cuda()
    local targets = torch.Tensor(Brain.batch_size, Brain.net_outputs):cuda()
    for k = 1, Brain.batch_size do
      local re = math.random(1, table.length(Brain.experience))
      local e = Brain.experience[re]
      local x = torch.Tensor(e.state0):cuda()
      local best_action = Brain.policy(e.state1)
      local all_outputs = Brain.net:forward(x)
      inputs[k] = x:clone()
      -- Regression target: keep the current predictions for every action except the one
      -- taken, whose target is the Q-learning value reward0 + gamma * max_a' Q(state1, a').
      targets[k] = all_outputs:clone()
      targets[k][e.action0] = e.reward0 + Brain.gamma * best_action.value
    end
    -- Closure computing the loss and gradient for optim.sgd.
    local feval
    feval = function(x)
      collectgarbage()
      if not (x == Brain.parameters) then
        Brain.parameters:copy(x)
      end
      Brain.gradParameters:zero()
      -- Forward/backward pass through the MSE criterion.
      local outputs = Brain.net:forward(inputs)
      local f = Brain.criterion:forward(outputs, targets)
      local df_do = Brain.criterion:backward(outputs, targets)
      Brain.net:backward(inputs, df_do)
      -- Optional L1/L2 regularisation added to the loss and the gradients.
      if Brain.coefL1 ~= 0 or Brain.coefL2 ~= 0 then
        local norm, sign = torch.norm, torch.sign
        f = f + (Brain.coefL1 * norm(Brain.parameters, 1))
        f = f + (Brain.coefL2 * 0.5 * norm(Brain.parameters, 2) ^ 2)
        Brain.gradParameters:add(sign(Brain.parameters):mul(Brain.coefL1) + Brain.parameters:clone():mul(Brain.coefL2))
      end
      return f, Brain.gradParameters
    end
    local sgdState = {
      learningRate = Brain.learning_rate,
      momentum = Brain.momentum,
      learningRateDecay = Brain.learning_rate_decay
    }
    return optim.sgd(feval, Brain.parameters, sgdState)
  end
end
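-- Target example (hypothetical numbers): with gamma = 0.9, a stored reward0 of 1.0 and a
-- best next-state Q-value of 2.0, the taken action's target becomes 1.0 + 0.9 * 2.0 = 2.8,
-- while every other action keeps the network's current prediction, so only the taken
-- action's Q-value is pushed toward the Bellman estimate.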
return Brain
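For reference, a minimal usage sketch of the compiled module (not part of this commit; the state size, action count, and the get_state/get_reward helpers below are hypothetical):

local Brain = require('gpuqlearn')   -- assumes gpuqlearn.lua is on the Lua package path
Brain.init(4, 3)                     -- hypothetical environment: 4 state values, 3 actions

for step = 1, 10000 do
  local state = get_state()          -- hypothetical helper returning a table of 4 numbers
  local action = Brain.forward(state)
  local reward = get_reward(action)  -- hypothetical helper returning a numeric reward
  Brain.backward(reward)             -- stores the experience and trains once warmed up
end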