diff --git a/deepqlearn.lua b/deepqlearn.lua
new file mode 100644
index 0000000..308e793
--- /dev/null
+++ b/deepqlearn.lua
@@ -0,0 +1,418 @@
+require 'math'
+require 'nnx'
+require 'os'
+require 'optim'
+
+
+math.randomseed( os.time() )
+torch.setdefaulttensortype('torch.FloatTensor')
+
+local Brain = {}
+
+--[[ HELPER FUNCTIONS --]]
+
+-- returns a random float, approximately uniform in [s, e)
+function randf(s, e)
+    return (math.random(0, (e - s) * 9999) / 10000) + s;
+end
+
+-- new methods for table
+
+-- appends the elements of t2 to t1 (in place) and returns t1
+function table.merge(t1, t2)
+    local t = t1
+    for i = 1, #t2 do
+        t[#t + 1] = t2[i]
+    end
+    return t
+end
+
+-- returns a shallow copy of t
+function table.copy(t)
+    local u = { }
+    for k, v in pairs(t) do u[k] = v end
+    return setmetatable(u, getmetatable(t))
+end
+
+-- counts every entry of a table, including non-integer keys
+function table.length(T)
+    local count = 0
+    for _ in pairs(T) do count = count + 1 end
+    return count
+end
+
+--[[ Returns an experience table for a single network decision.
+    It contains the state, the action chosen, the reward obtained, and the
+    state that resulted from the action. This is later used to train the network.
+    Remember that the utility of an action is evaluated from the reward gained and
+    the utility of the state it led to (recursive definition).
+--]]
+function Experience(state0, action0, reward0, state1)
+    local Experience = {};
+    Experience.state0 = state0;
+    Experience.action0 = action0;
+    Experience.reward0 = reward0;
+    Experience.state1 = state1;
+    return Experience;
+end
+
+-- BRAIN
+
+function Brain.init(num_states, num_actions)
+    -- Number of past state/action pairs input to the network. 0 = agent lives in-the-moment :)
+    Brain.temporal_window = 2
+    -- Maximum number of experiences that we will save for training
+    Brain.experience_size = 30000
+    -- experience necessary to start learning
+    Brain.start_learn_threshold = 300
+    -- gamma is a crucial parameter that controls how far ahead the agent plans. In [0,1].
+    -- It determines the weight placed on the utility of the state resulting from an action.
+    Brain.gamma = 0.9;
+    -- number of steps we will learn for
+    Brain.learning_steps_total = 100000
+    -- how many of those steps should use only random actions (at the beginning)?
+    Brain.learning_steps_burnin = 300;
+    -- epsilon controls the exploration/exploitation trade-off and decays over time.
+    -- A higher epsilon means we are more likely to choose random actions.
+    Brain.epsilon = 1.0
+    -- what epsilon value do we bottom out on? 0.0 => purely deterministic policy at the end
+    Brain.epsilon_min = 0.05;
+    -- what epsilon to use when learning is turned off. This is for testing
+    Brain.epsilon_test_time = 0.01;
+
+    --[[ states and actions that go into the neural net:
+        (state0,action0),(state1,action1), ... , (stateN)
+        temporal_window controls the size of that window.
+    --]]
+    Brain.net_inputs = (num_states + num_actions) * Brain.temporal_window + num_states;
+    Brain.hidden_nodes = 16
+    Brain.num_states = num_states;
+    Brain.num_actions = num_actions;
+    Brain.net_outputs = Brain.num_actions;
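+    -- For example, with the values used in test.lua (num_states = 3, num_actions = 3)
+    -- and temporal_window = 2, the network sees (3 + 3) * 2 + 3 = 15 inputs.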
+
+    --[[ Window size dictates the number of states, actions, rewards, and net inputs that we
+        save. The temporal window size is the number of past states/actions that are input
+        to the network and must be smaller than or equal to window_size.
+    --]]
+    Brain.window_size = math.max(Brain.temporal_window, 2);
+
+    -- Advanced feature: sometimes a random action should be biased towards some values.
+    -- For example, in Flappy Bird we may want to choose not to flap more often.
+    Brain.random_action_distribution = {};
+    if (table.length(Brain.random_action_distribution) > 0) then
+        -- it had better sum to 1 and be of length Brain.num_actions
+        if (table.length(Brain.random_action_distribution) ~= Brain.num_actions) then
+            print('TROUBLE. random_action_distribution should be same length as num_actions.');
+        end
+
+        local s = 0.0;
+
+        for k = 1, table.length(Brain.random_action_distribution) do
+            s = s + Brain.random_action_distribution[k]
+        end
+
+        if (math.abs(s - 1.0) > 0.0001) then
+            print('TROUBLE. random_action_distribution should sum to 1!');
+        end
+    end
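+    -- Hypothetical example: a two-action agent with actions {flap, don't flap} could set
+    -- the table above to {0.2, 0.8} so that random exploration flaps only 20% of the time.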
+
+
+    -- define architecture
+    Brain.net = nn.Sequential()
+
+    Brain.net:add(nn.Linear(Brain.net_inputs, Brain.hidden_nodes))
+    Brain.net:add(nn.Threshold(0, 0)) -- acts as a ReLU nonlinearity
+
+    Brain.net:add(nn.Linear(Brain.hidden_nodes, Brain.hidden_nodes))
+    Brain.net:add(nn.Threshold(0, 0))
+
+    Brain.net:add(nn.Linear(Brain.hidden_nodes, Brain.net_outputs))
+
+    Brain.criterion = nn.MSECriterion()
+
+
+    -- other learning parameters
+    Brain.learning_rate = 0.01;
+    Brain.learning_rate_decay = 5e-7
+    Brain.batch_size = 16;
+    Brain.momentum = 0.9;
+
+    -- various housekeeping variables
+    Brain.age = 0; -- incremented every backward()
+
+    -- number of times we've called forward - lets us know when our input temporal
+    -- window is filled up
+    Brain.forward_passes = 0;
+    Brain.learning = true;
+
+    -- coefficients for L1/L2 regularization
+    Brain.coefL1 = 0.001
+    Brain.coefL2 = 0.001
+
+    -- parameters for optim.sgd
+    Brain.parameters, Brain.gradParameters = Brain.net:getParameters()
+
+    -- These windows track old experiences, states, actions, rewards, and net inputs
+    -- over time. They all start out empty with a fixed size.
+    -- Each is a first-in, first-out queue that is shifted along in time.
+    Brain.experience = {};
+    Brain.state_window = {}
+    Brain.action_window = {}
+    Brain.reward_window = {}
+    Brain.net_window = {}
+    for i = 1, Brain.window_size do
+        Brain.state_window[i] = {}
+        Brain.action_window[i] = {}
+        Brain.reward_window[i] = {}
+        Brain.net_window[i] = {}
+    end
+end
+
+-- A small helper that returns a random action.
+-- We abstract this away because in the future we may want to do more sophisticated
+-- things, e.g. make some actions more or less likely at the "rest"/default state.
+function Brain.random_action()
+    -- if we don't have a random action distribution defined then sample evenly
+    if (table.length(Brain.random_action_distribution) == 0) then
+        return (torch.random() % Brain.net_outputs) + 1
+
+    -- okay, let's do some fancier sampling:
+    else
+        local p = randf(0, 1);
+        local cumprob = 0.0;
+
+        for k = 1, Brain.num_actions do
+            cumprob = cumprob + Brain.random_action_distribution[k];
+
+            if (p < cumprob) then
+                return k
+            end
+        end
+    end
+end
+
+-- compute the value of doing any action in this state
+-- and return the argmax action and its value
+function Brain.policy(state)
+    local tensor_state = torch.Tensor(state)
+    local action_values = Brain.net:forward(tensor_state);
+
+    local maxval = action_values[1]
+    local max_index = 1
+
+    -- find the maximum output and note its index and value
+    for i = 2, Brain.net_outputs do
+        if action_values[i] > maxval then
+            maxval = action_values[i]
+            max_index = i
+        end
+    end
+
+    return {action = max_index, value = maxval};
+end
+
+-- This function assembles the input to the network by concatenating
+-- old (state, chosen_action) pairs along with the current state.
+-- Returns s = (x,a,x,a,x,a,xt), the state vector.
+function Brain.getNetInput(xt)
+    local w = {};
+    w = table.merge(w, xt); -- start with current state
+
+    -- and now go backwards and append states and actions from history temporal_window times
+    local n = Brain.window_size + 1;
+    for k = 1, Brain.temporal_window do
+        -- state
+        w = table.merge(w, Brain.state_window[n - k]);
+        -- action, encoded as a 1-of-k indicator vector. We scale it up a bit because
+        -- we don't want weight regularization to undervalue this information, as it only exists once
+        local action1ofk = {};
+        for i = 1, Brain.num_actions do
+            action1ofk[i] = 0
+        end
+
+        -- set the entry for the action taken in that state to 1 (scaled), all others stay 0
+        action1ofk[Brain.action_window[n - k]] = 1.0 * Brain.num_states;
+
+        w = table.merge(w, action1ofk);
+    end
+
+    return w;
+end
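+-- Illustration, assuming the test.lua sizes (num_states = 3, num_actions = 3, temporal_window = 2):
+-- the assembled vector w is [current state (3), previous state (3), previous action as a scaled
+-- 1-of-3 vector (3), the state before that (3), its action (3)] = 15 values, matching Brain.net_inputs.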
+
+--[[ This function computes an action by either:
+    1. giving the current state and past (state, action) pairs to the network
+       and letting it choose the best action, or
+    2. choosing a random action
+--]]
+function Brain.forward(input_array)
+    Brain.forward_passes = Brain.forward_passes + 1;
+
+    local action, net_input;
+
+    -- if we have enough (state, action) pairs in our memory to fill up
+    -- our network input then we'll proceed to let our network choose the action
+    if (Brain.forward_passes > Brain.temporal_window) then
+        net_input = Brain.getNetInput(input_array);
+
+        -- if learning is turned on then epsilon should be decaying
+        if (Brain.learning) then
+            -- compute (decaying) epsilon for the epsilon-greedy policy
+            local new_epsilon = 1.0 - (Brain.age - Brain.learning_steps_burnin) / (Brain.learning_steps_total - Brain.learning_steps_burnin)
+
+            -- keep epsilon between epsilon_min and 1.0
+            Brain.epsilon = math.min(1.0, math.max(Brain.epsilon_min, new_epsilon));
+        else
+            -- if learning is turned off then use the epsilon we've specified for testing
+            Brain.epsilon = Brain.epsilon_test_time;
+        end
+
+        -- with probability epsilon choose a random action, otherwise ask the network
+        if (randf(0, 1) < Brain.epsilon) then
+            action = Brain.random_action();
+        else
+            -- otherwise use our policy to make the decision
+            local best_action = Brain.policy(net_input);
+            action = best_action.action; -- this is the action number
+        end
+    else
+        -- pathological case that happens in the first few iterations, when we can't yet
+        -- fill up our network inputs. Just default to a random action in this case
+        net_input = {};
+        action = Brain.random_action();
+    end
+
+    -- shift the network input, state, and action chosen into our windows
+    table.remove(Brain.net_window, 1)
+    table.insert(Brain.net_window, net_input)
+
+    table.remove(Brain.state_window, 1)
+    table.insert(Brain.state_window, input_array)
+
+    table.remove(Brain.action_window, 1)
+    table.insert(Brain.action_window, action)
+
+    return action;
+end
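+-- Rough sketch of the resulting epsilon schedule with the default settings: epsilon stays
+-- clamped at 1.0 (pure exploration) until Brain.age passes learning_steps_burnin (300),
+-- then decays linearly, e.g. to about 0.5 around age 50000, and bottoms out at
+-- epsilon_min (0.05) around age 95000, shortly before learning_steps_total (100000).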
+
+--[[
+    This function trains the network using the reward resulting from the last action.
+    It saves this past experience, which consists of:
+    the state, the action chosen, the reward obtained, and the
+    state that resulted from the action.
+    After that, it trains the network on a batch of experiences drawn at random
+    from our entire experience history.
+--]]
+function Brain.backward(reward)
+    -- add reward to our history
+    table.remove(Brain.reward_window, 1)
+    table.insert(Brain.reward_window, reward)
+
+    -- if learning is turned off then don't do anything
+    if (not Brain.learning) then
+        return;
+    end
+
+    Brain.age = Brain.age + 1;
+
+    -- if we've had enough states and actions to fill up our net input then add
+    -- this new experience to our history
+    if (Brain.forward_passes > Brain.temporal_window + 1) then
+        -- make an experience and fill it up
+        local e = Experience(nil, nil, nil, nil);
+        local n = Brain.window_size;
+        e.state0 = Brain.net_window[n - 1];
+        e.action0 = Brain.action_window[n - 1];
+        e.reward0 = Brain.reward_window[n - 1];
+        e.state1 = Brain.net_window[n];
+
+        -- if our experience table isn't larger than the max size then keep growing it
+        if (table.length(Brain.experience) < Brain.experience_size) then
+            table.insert(Brain.experience, e)
+        else
+            -- otherwise replace a random experience. finite memory!
+            local ri = torch.random(1, Brain.experience_size);
+            Brain.experience[ri] = e;
+        end
+    end
+
+    -- if we have enough experience in memory then start training
+    if (table.length(Brain.experience) > Brain.start_learn_threshold) then
+        local inputs = torch.Tensor(Brain.batch_size, Brain.net_inputs)
+        local targets = torch.Tensor(Brain.batch_size, Brain.net_outputs)
+
+        for k = 1, Brain.batch_size do
+            -- choose a random experience
+            local re = math.random(1, table.length(Brain.experience));
+            local e = Brain.experience[re];
+
+            -- copy state from experience
+            local x = torch.Tensor(e.state0);
+
+            -- compute best action for the new state
+            local best_action = Brain.policy(e.state1);
+
+            --[[ Get the current action output values.
+                We want the target outputs to be the same as the actual outputs,
+                except for the action that was chosen - that entry is replaced with
+                the reward that was obtained plus the utility of the resulting state.
+            --]]
+            local all_outputs = Brain.net:forward(x);
+            inputs[k] = x:clone();
+            targets[k] = all_outputs:clone();
+            targets[k][e.action0] = e.reward0 + Brain.gamma * best_action.value;
+        end
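+        -- The targets assembled above implement the standard Q-learning update:
+        --     target(state0, action0) = reward0 + gamma * max_a Q(state1, a)
+        -- i.e. the observed reward plus the discounted value of the best action available
+        -- in the resulting state (best_action.value returned by Brain.policy).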
+
+        -- create the training closure to give to optim.sgd
+        local feval = function(x)
+            collectgarbage()
+
+            -- get new network parameters
+            if x ~= Brain.parameters then
+                Brain.parameters:copy(x)
+            end
+
+            -- reset gradients
+            Brain.gradParameters:zero()
+
+            -- evaluate function for the complete mini-batch
+            local outputs = Brain.net:forward(inputs)
+            local f = Brain.criterion:forward(outputs, targets)
+
+            -- estimate df/dW
+            local df_do = Brain.criterion:backward(outputs, targets)
+            Brain.net:backward(inputs, df_do)
+
+            -- penalties (L1 and L2):
+            if Brain.coefL1 ~= 0 or Brain.coefL2 ~= 0 then
+                -- locals:
+                local norm, sign = torch.norm, torch.sign
+
+                -- loss:
+                f = f + Brain.coefL1 * norm(Brain.parameters, 1)
+                f = f + Brain.coefL2 * norm(Brain.parameters, 2)^2 / 2
+
+                -- gradients:
+                Brain.gradParameters:add(sign(Brain.parameters):mul(Brain.coefL1) + Brain.parameters:clone():mul(Brain.coefL2))
+            end
+
+            -- return f and df/dX
+            return f, Brain.gradParameters
+        end
+
+        -- fire up optim.sgd
+        local sgdState = {
+            learningRate = Brain.learning_rate,
+            momentum = Brain.momentum,
+            learningRateDecay = Brain.learning_rate_decay
+        }
+
+        optim.sgd(feval, Brain.parameters, sgdState)
+
+    end
+end
+
+
+
+-- export
+return Brain
+
+
+
+
diff --git a/test.lua b/test.lua
new file mode 100644
index 0000000..f2848dd
--- /dev/null
+++ b/test.lua
@@ -0,0 +1,57 @@
+require 'xlua'
+local Brain = require 'deepqlearn'
+
+-- returns a table of `size` random floats in [startnum, endnum)
+function randtable(size, startnum, endnum)
+    local rtable = {}
+    for i = 1, size do
+        rtable[i] = randf(startnum, endnum)
+    end
+
+    return rtable
+end
+
+-- simple test found in readme.md
+num_outcomes = 3
+
+
+Brain.init(num_outcomes, num_outcomes)
+nb_train = 1000
+nb_test = 1000
+
+for k = 0, nb_train do
+    rand_outcome = math.random(1, num_outcomes)
+    state = randtable(num_outcomes, rand_outcome, rand_outcome + 1)
+
+    xlua.progress(k, nb_train)
+
+    newstate = table.copy(state) -- make a copy (shallow copy is enough for a flat table of numbers)
+    action = Brain.forward(newstate); -- returns index of chosen action
+
+    reward = (action == rand_outcome) and 1 or 0
+
+    Brain.backward(reward); -- learning magic happens here
+end
+
+Brain.epsilon_test_time = 0.0; -- don't make any more random choices
+Brain.learning = false;
+
+
+-- query the learned policy and count how often it picks the right action
+local cnt = 0
+for k = 1, nb_test do
+    xlua.progress(k, nb_test)
+
+    rand_outcome = math.random(1, num_outcomes)
+    state = randtable(num_outcomes, rand_outcome, rand_outcome + 1)
+
+
+    newstate = table.copy(state)
+    output = Brain.forward(newstate)
+    if rand_outcome == output then
+        cnt = cnt + 1
+    end
+
+end
+
+print("Test cases correct: " .. tostring(100 * cnt / nb_test) .. " %")