Skip to content

Commit

Permalink
Added GPU version
Browse files Browse the repository at this point in the history
  • Loading branch information
Yoni Lerner committed Jan 26, 2015
1 parent e1260f9 commit d58397a
Showing 1 changed file with 372 additions and 0 deletions.
372 changes: 372 additions & 0 deletions gpuqlearn.moon
Original file line number Diff line number Diff line change
@@ -0,0 +1,372 @@
require 'math'
require 'nnx'
require 'os'
require 'optim'
require 'cutorch'
require 'cunn'

math.randomseed os.time!
torch.setdefaulttensortype 'torch.FloatTensor'

Brain = {}

-- HELPER FUNCTIONS --

export randf = (s, e) ->
return (math.random(0, (e - s) * 9999) / 10000) + s

-- new methods for table

table.merge = (t1, t2) ->
t = t1
for i = 1, #t2
t[#t + 1] = t2[i]
return t

table.copy = (t) ->
u = {k, v for k, v in pairs t}
return setmetatable(u, getmetatable t)

table.length = (T) ->
count = 0
count += 1 for _ in pairs T
return count

-- returns experience table for single network decision
-- contains the state, action chosen, whether a reward was obtained, and the
-- state that resulted from the action. This is later used to train the network
-- Remember that the utility of an action is evaluated from the reward gained and
-- the utility of the state it led to (recursive definition)
export Experience = (state0, action0, reward0, state1) ->
NewExperience =
state0: state0
action0: action0
reward0: reward0
state1: state1
return NewExperience

-- BRAIN

Brain.init = (num_states, num_actions) ->
-- Number of past state/action pairs input to the network. 0 = agent lives in-the-moment :)
Brain.temporal_window = 2
-- Maximum number of experiences that we will save for training
Brain.experience_size = 30000
-- experience necessary to start learning
Brain.start_learn_threshold = 300
-- gamma is a crucial parameter that controls how much plan-ahead the agent does. In [0,1]
-- Determines the amount of weight placed on the utility of the state resulting from an action.
Brain.gamma = 0.9
-- number of steps we will learn for
Brain.learning_steps_total = 100000
-- how many steps of the above to perform only random actions (in the beginning)?
Brain.learning_steps_burnin = 300
-- controls exploration exploitation tradeoff. Will decay over time
-- a higher epsilon means we are more likely to choose random actions
Brain.epsilon = 1.0
-- what epsilon value do we bottom out on? 0.0 => purely deterministic policy at end
Brain.epsilon_min = 0.05
-- what epsilon to use when learning is turned off. This is for testing
Brain.epsilon_test_time = 0.01

[[== states and actions that go into neural net:
(state0,action0),(state1,action1), ... , (stateN)
this variable controls the size of that temporal window.
]]
Brain.net_inputs = (num_states + num_actions) * Brain.temporal_window + num_states
Brain.hidden_nodes = 16
Brain.num_states = num_states
Brain.num_actions = num_actions
Brain.net_outputs = Brain.num_actions

[[== Window size dictates the number of states, actions, rewards, and net inputs that we
save. The temporal window size is the number of time states/actions that are input
to the network and must be smaller than or equal to window_size
]]
Brain.window_size = math.max Brain.temporal_window, 2

-- advanced feature. Sometimes a random action should be biased towards some values
-- for example in flappy bird, we may want to choose to not flap more often
Brain.random_action_distribution = {}
if table.length(Brain.random_action_distribution) > 0
-- this better sum to 1 by the way, and be of length this.num_actions
if table.length(Brain.random_action_distribution) != Brain.num_actions
print 'TROUBLE. random_action_distribution should be same length as num_actions.'

s = 0.0

for k = 1, table.length Brain.random_action_distribution
s += Brain.random_action_distribution[k]

if math.abs(s - 1.0) > 0.0001
print 'TROUBLE. random_action_distribution should sum to 1!'


-- define architecture
Brain.net = nn.Sequential!

Brain.net\add nn.Linear Brain.net_inputs, Brain.hidden_nodes
Brain.net\add nn.Threshold 0, 0

Brain.net\add nn.Linear Brain.hidden_nodes, Brain.hidden_nodes
Brain.net\add nn.Threshold 0, 0

Brain.net\add nn.Linear Brain.hidden_nodes, Brain.net_outputs

Brain.net\cuda! -- move network to GPU

Brain.criterion = nn.MSECriterion!\cuda!


-- other learning parameters
Brain.learning_rate = 0.01
Brain.learning_rate_decay = 5e-7
Brain.batch_size = 16
Brain.momentum = 0.9

-- various housekeeping variables
Brain.age = 0 -- incremented every backward!

-- number of times we've called forward - lets us know when our input temporal
-- window is filled up
Brain.forward_passes = 0
Brain.learning = true

-- coefficients for regression
Brain.coefL1 = 0.001
Brain.coefL2 = 0.001

-- parameters for optim.sgd
Brain.parameters, Brain.gradParameters = Brain.net\getParameters!

-- These windows track old experiences, states, actions, rewards, and net inputs
-- over time. They should all start out as empty with a fixed size.
-- This is a first in, last out data structure that is shifted along time
Brain.experience = {}
Brain.state_window = {}
Brain.action_window = {}
Brain.reward_window = {}
Brain.net_window = {}
for i = 1, Brain.window_size
Brain.state_window[i] = {}
Brain.action_window[i] = {}
Brain.reward_window[i] = {}
Brain.net_window[i] = {}

-- a bit of a helper function. It returns a random action
-- we are abstracting this away because in future we may want to
-- do more sophisticated things. For example some actions could be more
-- or less likely at "rest"/default state.
Brain.random_action = ->
-- if we don't have a random action distribution defined then sample evenly
if table.length(Brain.random_action_distribution) == 0
return (torch.random! % Brain.net_outputs) + 1

-- okay, lets do some fancier sampling:
else
p = randf 0, 1
cumprob = 0.0

for k = 1, Brain.num_actions
cumprob += Brain.random_action_distribution[k]

if p < cumprob
return k

-- compute the value of doing any action in this state
-- and return the argmax action and its value
Brain.policy = (state) ->
tensor_state = torch.Tensor(state)\cuda!
action_values = Brain.net\forward tensor_state

maxval = action_values[1]
max_index = 1

-- find maximum output and note its index and value
--max_index = i for i = 2, Brain.net_outputs when action_values[i] > action_values[max_index]
for i = 2, Brain.net_outputs
if action_values[i] > maxval
maxval = action_values[i]
max_index = i

return action: max_index, value: maxval

-- This function assembles the input to the network by concatenating
-- old (state, chosen_action) pairs along with the current state
-- return s = (x,a,x,a,x,a,xt) state vector.
Brain.getNetInput = (xt) ->
w = {}
w = table.merge(w, xt) -- start with current state

-- and now go backwards and append states and actions from history temporal_window times
n = Brain.window_size + 1
for k = 1, Brain.temporal_window do
-- state
w = table.merge w, Brain.state_window[n - k]
-- action, encoded as 1-of-k indicator vector. We scale it up a bit because
-- we don't want weight regularization to undervalue this information, as it only exists once
action1ofk = {}
action1ofk[i] = 0 for i = 1, Brain.num_actions

-- assign action taken for current state to be 1, all others are 0
action1ofk[Brain.action_window[n - k]] = 1.0 * Brain.num_states

w = table.merge w, action1ofk

return w

-- This function computes an action by either:
-- 1. Giving the current state and past (state, action) pairs to the network
-- and letting it choose the best acction
-- 2. Choosing a random action
Brain.forward = (input_array) ->
Brain.forward_passes += 1

local action, net_input

-- if we have enough (state, action) pairs in our memory to fill up
-- our network input then we'll proceed to let our network choose the action
if Brain.forward_passes > Brain.temporal_window
net_input = Brain.getNetInput input_array

-- if learning is turned on then epsilon should be decaying
if Brain.learning
-- compute (decaying) epsilon for the epsilon-greedy policy
new_epsilon = 1.0 - (Brain.age - Brain.learning_steps_burnin)/(Brain.learning_steps_total - Brain.learning_steps_burnin)

-- don't let epsilon go above 1.0
Brain.epsilon = math.min(1.0, math.max(Brain.epsilon_min, new_epsilon))
else
-- if learning is turned off then use the epsilon we've specified for testing
Brain.epsilon = Brain.epsilon_test_time

-- use epsilon probability to choose whether we use network action or random action
if randf(0, 1) < Brain.epsilon
action = Brain.random_action!
else
-- otherwise use our policy to make decision
best_action = Brain.policy net_input
action = best_action.action -- this is the action number
else
-- pathological case that happens first few iterations when we can't
-- fill up our network inputs. Just default to random action in this case
net_input = {}
action = Brain.random_action!

-- shift the network input, state, and action chosen into our windows
table.remove Brain.net_window, 1
table.insert Brain.net_window, net_input

table.remove Brain.state_window, 1
table.insert Brain.state_window, input_array

table.remove Brain.action_window, 1
table.insert Brain.action_window, action

return action

-- This function trains the network using the reward resulting from the last action
-- It will save this past experience which consists of:
-- the state, action chosen, whether a reward was obtained, and the
-- state that resulted from the action
-- After that, it will train the network (using a batch of experiences) using a
-- random sampling of our entire experience history.
Brain.backward = (reward) ->
-- add reward to our history
table.remove Brain.reward_window, 1
table.insert Brain.reward_window, reward

-- if learning is turned off then don't do anything
return unless Brain.learning

Brain.age += 1

-- if we've had enough states and actions to fill up our net input then add
-- this new experience to our history
if Brain.forward_passes > Brain.temporal_window + 1
-- make experience and fill it up
e = Experience nil, nil, nil, nil
n = Brain.window_size
e.state0 = Brain.net_window[n - 1]
e.action0 = Brain.action_window[n - 1]
e.reward0 = Brain.reward_window[n - 1]
e.state1 = Brain.net_window[n]

-- if our experience table isn't larger than the max size then expand
if table.length(Brain.experience) < Brain.experience_size
table.insert Brain.experience, e
-- Otherwise replace random experience. finite memory!
else
ri = torch.random 1, Brain.experience_size
Brain.experience[ri] = e

-- if we have enough experience in memory then start training
if table.length(Brain.experience) > Brain.start_learn_threshold
inputs = torch.Tensor(Brain.batch_size, Brain.net_inputs)\cuda!
targets = torch.Tensor(Brain.batch_size, Brain.net_outputs)\cuda!

for k = 1, Brain.batch_size
-- choose random experience
re = math.random 1, table.length Brain.experience
e = Brain.experience[re]

-- copy state from experience
x = torch.Tensor(e.state0)\cuda!

-- compute best action for the new state
best_action = Brain.policy e.state1

-- get current action output values
-- we want to make the target outputs the same as the actual outputs
-- expect for the action that was chose - we want to replace this with
-- the reward that was obtained + the utility of the resulting state
all_outputs = Brain.net\forward x
inputs[k] = x\clone!
targets[k] = all_outputs\clone!
targets[k][e.action0] = e.reward0 + Brain.gamma * best_action.value

-- create training function to give to optim.sgd
feval = (x) ->
collectgarbage!

-- get new network parameters
Brain.parameters\copy x unless x == Brain.parameters

-- reset gradients
Brain.gradParameters\zero!

-- evaluate function for complete mini batch
outputs = Brain.net\forward inputs
f = Brain.criterion\forward outputs, targets

-- estimate df/dW
df_do = Brain.criterion\backward outputs, targets
Brain.net\backward inputs, df_do

-- penalties (L1 and L2):
if Brain.coefL1 != 0 or Brain.coefL2 != 0
-- locals:
norm,sign = torch.norm, torch.sign

-- Loss:
f += Brain.coefL1 * norm Brain.parameters, 1
f += Brain.coefL2 * 0.5 * norm(Brain.parameters, 2) ^ 2

-- Gradients:
Brain.gradParameters\add(sign(Brain.parameters)\mul(Brain.coefL1) + Brain.parameters\clone!\mul Brain.coefL2)

-- return f and df/dX
return f, Brain.gradParameters

-- fire up optim.sgd
sgdState =
learningRate: Brain.learning_rate
momentum: Brain.momentum
learningRateDecay: Brain.learning_rate_decay

optim.sgd feval, Brain.parameters, sgdState



-- export
return Brain

0 comments on commit d58397a

Please sign in to comment.