diff --git a/deepqlearn.moon b/deepqlearn.moon new file mode 100644 index 0000000..7491643 --- /dev/null +++ b/deepqlearn.moon @@ -0,0 +1,368 @@ +require 'math' +require 'nnx' +require 'os' +require 'optim' + +math.randomseed os.time! +torch.setdefaulttensortype 'torch.FloatTensor' + +Brain = {} + +-- HELPER FUNCTIONS -- + +export randf = (s, e) -> + return (math.random(0, (e - s) * 9999) / 10000) + s + +-- new methods for table + +table.merge = (t1, t2) -> + t = t1 + for i = 1, #t2 + t[#t + 1] = t2[i] + return t + +table.copy = (t) -> + u = {k, v for k, v in pairs t} + return setmetatable(u, getmetatable t) + +table.length = (T) -> + count = 0 + count += 1 for _ in pairs T + return count + +-- returns experience table for single network decision +-- contains the state, action chosen, whether a reward was obtained, and the +-- state that resulted from the action. This is later used to train the network +-- Remember that the utility of an action is evaluated from the reward gained and +-- the utility of the state it led to (recursive definition) +export Experience = (state0, action0, reward0, state1) -> + NewExperience = + state0: state0 + action0: action0 + reward0: reward0 + state1: state1 + return NewExperience + +-- BRAIN + +Brain.init = (num_states, num_actions) -> + -- Number of past state/action pairs input to the network. 0 = agent lives in-the-moment :) + Brain.temporal_window = 2 + -- Maximum number of experiences that we will save for training + Brain.experience_size = 30000 + -- experience necessary to start learning + Brain.start_learn_threshold = 300 + -- gamma is a crucial parameter that controls how much plan-ahead the agent does. In [0,1] + -- Determines the amount of weight placed on the utility of the state resulting from an action. + Brain.gamma = 0.9 + -- number of steps we will learn for + Brain.learning_steps_total = 100000 + -- how many steps of the above to perform only random actions (in the beginning)? + Brain.learning_steps_burnin = 300 + -- controls exploration exploitation tradeoff. Will decay over time + -- a higher epsilon means we are more likely to choose random actions + Brain.epsilon = 1.0 + -- what epsilon value do we bottom out on? 0.0 => purely deterministic policy at end + Brain.epsilon_min = 0.05 + -- what epsilon to use when learning is turned off. This is for testing + Brain.epsilon_test_time = 0.01 + + [[== states and actions that go into neural net: + (state0,action0),(state1,action1), ... , (stateN) + this variable controls the size of that temporal window. + ]] + Brain.net_inputs = (num_states + num_actions) * Brain.temporal_window + num_states + Brain.hidden_nodes = 16 + Brain.num_states = num_states + Brain.num_actions = num_actions + Brain.net_outputs = Brain.num_actions + + [[== Window size dictates the number of states, actions, rewards, and net inputs that we + save. The temporal window size is the number of time states/actions that are input + to the network and must be smaller than or equal to window_size + ]] + Brain.window_size = math.max Brain.temporal_window, 2 + + -- advanced feature. Sometimes a random action should be biased towards some values + -- for example in flappy bird, we may want to choose to not flap more often + Brain.random_action_distribution = {} + if table.length(Brain.random_action_distribution) > 0 + -- this better sum to 1 by the way, and be of length this.num_actions + if table.length(Brain.random_action_distribution) != Brain.num_actions + print 'TROUBLE. random_action_distribution should be same length as num_actions.' + + s = 0.0 + + for k = 1, table.length Brain.random_action_distribution + s += Brain.random_action_distribution[k] + + if math.abs(s - 1.0) > 0.0001 + print 'TROUBLE. random_action_distribution should sum to 1!' + + + -- define architecture + Brain.net = nn.Sequential! + + Brain.net\add nn.Linear Brain.net_inputs, Brain.hidden_nodes + Brain.net\add nn.Threshold 0, 0 + + Brain.net\add nn.Linear Brain.hidden_nodes, Brain.hidden_nodes + Brain.net\add nn.Threshold 0, 0 + + Brain.net\add nn.Linear Brain.hidden_nodes, Brain.net_outputs + + Brain.criterion = nn.MSECriterion! + + + -- other learning parameters + Brain.learning_rate = 0.01 + Brain.learning_rate_decay = 5e-7 + Brain.batch_size = 16 + Brain.momentum = 0.9 + + -- various housekeeping variables + Brain.age = 0 -- incremented every backward! + + -- number of times we've called forward - lets us know when our input temporal + -- window is filled up + Brain.forward_passes = 0 + Brain.learning = true + + -- coefficients for regression + Brain.coefL1 = 0.001 + Brain.coefL2 = 0.001 + + -- parameters for optim.sgd + Brain.parameters, Brain.gradParameters = Brain.net\getParameters! + + -- These windows track old experiences, states, actions, rewards, and net inputs + -- over time. They should all start out as empty with a fixed size. + -- This is a first in, last out data structure that is shifted along time + Brain.experience = {} + Brain.state_window = {} + Brain.action_window = {} + Brain.reward_window = {} + Brain.net_window = {} + for i = 1, Brain.window_size + Brain.state_window[i] = {} + Brain.action_window[i] = {} + Brain.reward_window[i] = {} + Brain.net_window[i] = {} + +-- a bit of a helper function. It returns a random action +-- we are abstracting this away because in future we may want to +-- do more sophisticated things. For example some actions could be more +-- or less likely at "rest"/default state. +Brain.random_action = -> + -- if we don't have a random action distribution defined then sample evenly + if table.length(Brain.random_action_distribution) == 0 + return (torch.random! % Brain.net_outputs) + 1 + + -- okay, lets do some fancier sampling: + else + p = randf 0, 1 + cumprob = 0.0 + + for k = 1, Brain.num_actions + cumprob += Brain.random_action_distribution[k] + + if p < cumprob + return k + +-- compute the value of doing any action in this state +-- and return the argmax action and its value +Brain.policy = (state) -> + tensor_state = torch.Tensor state + action_values = Brain.net\forward tensor_state + + maxval = action_values[1] + max_index = 1 + + -- find maximum output and note its index and value + --max_index = i for i = 2, Brain.net_outputs when action_values[i] > action_values[max_index] + for i = 2, Brain.net_outputs + if action_values[i] > maxval + maxval = action_values[i] + max_index = i + + return action: max_index, value: maxval + +-- This function assembles the input to the network by concatenating +-- old (state, chosen_action) pairs along with the current state + -- return s = (x,a,x,a,x,a,xt) state vector. +Brain.getNetInput = (xt) -> + w = {} + w = table.merge(w, xt) -- start with current state + + -- and now go backwards and append states and actions from history temporal_window times + n = Brain.window_size + 1 + for k = 1, Brain.temporal_window do + -- state + w = table.merge w, Brain.state_window[n - k] + -- action, encoded as 1-of-k indicator vector. We scale it up a bit because + -- we don't want weight regularization to undervalue this information, as it only exists once + action1ofk = {} + action1ofk[i] = 0 for i = 1, Brain.num_actions + + -- assign action taken for current state to be 1, all others are 0 + action1ofk[Brain.action_window[n - k]] = 1.0 * Brain.num_states + + w = table.merge w, action1ofk + + return w + +-- This function computes an action by either: +-- 1. Giving the current state and past (state, action) pairs to the network +-- and letting it choose the best acction +-- 2. Choosing a random action +Brain.forward = (input_array) -> + Brain.forward_passes += 1 + + local action, net_input + + -- if we have enough (state, action) pairs in our memory to fill up + -- our network input then we'll proceed to let our network choose the action + if Brain.forward_passes > Brain.temporal_window + net_input = Brain.getNetInput input_array + + -- if learning is turned on then epsilon should be decaying + if Brain.learning + -- compute (decaying) epsilon for the epsilon-greedy policy + new_epsilon = 1.0 - (Brain.age - Brain.learning_steps_burnin)/(Brain.learning_steps_total - Brain.learning_steps_burnin) + + -- don't let epsilon go above 1.0 + Brain.epsilon = math.min(1.0, math.max(Brain.epsilon_min, new_epsilon)) + else + -- if learning is turned off then use the epsilon we've specified for testing + Brain.epsilon = Brain.epsilon_test_time + + -- use epsilon probability to choose whether we use network action or random action + if randf(0, 1) < Brain.epsilon + action = Brain.random_action! + else + -- otherwise use our policy to make decision + best_action = Brain.policy net_input + action = best_action.action -- this is the action number + else + -- pathological case that happens first few iterations when we can't + -- fill up our network inputs. Just default to random action in this case + net_input = {} + action = Brain.random_action! + + -- shift the network input, state, and action chosen into our windows + table.remove Brain.net_window, 1 + table.insert Brain.net_window, net_input + + table.remove Brain.state_window, 1 + table.insert Brain.state_window, input_array + + table.remove Brain.action_window, 1 + table.insert Brain.action_window, action + + return action + +-- This function trains the network using the reward resulting from the last action +-- It will save this past experience which consists of: +-- the state, action chosen, whether a reward was obtained, and the +-- state that resulted from the action +-- After that, it will train the network (using a batch of experiences) using a +-- random sampling of our entire experience history. +Brain.backward = (reward) -> + -- add reward to our history + table.remove Brain.reward_window, 1 + table.insert Brain.reward_window, reward + + -- if learning is turned off then don't do anything + return unless Brain.learning + + Brain.age += 1 + + -- if we've had enough states and actions to fill up our net input then add + -- this new experience to our history + if Brain.forward_passes > Brain.temporal_window + 1 + -- make experience and fill it up + e = Experience nil, nil, nil, nil + n = Brain.window_size + e.state0 = Brain.net_window[n - 1] + e.action0 = Brain.action_window[n - 1] + e.reward0 = Brain.reward_window[n - 1] + e.state1 = Brain.net_window[n] + + -- if our experience table isn't larger than the max size then expand + if table.length(Brain.experience) < Brain.experience_size + table.insert Brain.experience, e + -- Otherwise replace random experience. finite memory! + else + ri = torch.random 1, Brain.experience_size + Brain.experience[ri] = e + + -- if we have enough experience in memory then start training + if table.length(Brain.experience) > Brain.start_learn_threshold + inputs = torch.Tensor Brain.batch_size, Brain.net_inputs + targets = torch.Tensor Brain.batch_size, Brain.net_outputs + + for k = 1, Brain.batch_size + -- choose random experience + re = math.random 1, table.length Brain.experience + e = Brain.experience[re] + + -- copy state from experience + x = torch.Tensor e.state0 + + -- compute best action for the new state + best_action = Brain.policy e.state1 + + -- get current action output values + -- we want to make the target outputs the same as the actual outputs + -- expect for the action that was chose - we want to replace this with + -- the reward that was obtained + the utility of the resulting state + all_outputs = Brain.net\forward x + inputs[k] = x\clone! + targets[k] = all_outputs\clone! + targets[k][e.action0] = e.reward0 + Brain.gamma * best_action.value + + -- create training function to give to optim.sgd + feval = (x) -> + collectgarbage! + + -- get new network parameters + Brain.parameters\copy x unless x == Brain.parameters + + -- reset gradients + Brain.gradParameters\zero! + + -- evaluate function for complete mini batch + outputs = Brain.net\forward inputs + f = Brain.criterion\forward outputs, targets + + -- estimate df/dW + df_do = Brain.criterion\backward outputs, targets + Brain.net\backward inputs, df_do + + -- penalties (L1 and L2): + if Brain.coefL1 != 0 or Brain.coefL2 != 0 + -- locals: + norm,sign = torch.norm, torch.sign + + -- Loss: + f += Brain.coefL1 * norm Brain.parameters, 1 + f += Brain.coefL2 * 0.5 * norm(Brain.parameters, 2) ^ 2 + + -- Gradients: + Brain.gradParameters\add(sign(Brain.parameters)\mul(Brain.coefL1) + Brain.parameters\clone!\mul Brain.coefL2) + + -- return f and df/dX + return f, Brain.gradParameters + + -- fire up optim.sgd + sgdState = + learningRate: Brain.learning_rate + momentum: Brain.momentum + learningRateDecay: Brain.learning_rate_decay + + optim.sgd feval, Brain.parameters, sgdState + + + +-- export +return Brain \ No newline at end of file diff --git a/test.moon b/test.moon new file mode 100644 index 0000000..d18ab51 --- /dev/null +++ b/test.moon @@ -0,0 +1,49 @@ +require 'xlua' +Brain = require 'deepqlearn' + +randtable = (size, startnum, endnum) -> + rtable = {} + for i = 1, size + rtable[i + 1] = randf startnum, endnum + + return rtable + +-- simple test found in readme.md +num_outcomes = 3 + + +Brain.init num_outcomes, num_outcomes +nb_train = 1000 +nb_test = 1000 + +for k = 0, nb_train + rand_outcome = math.random 1, num_outcomes + state = randtable num_outcomes, rand_outcome, rand_outcome + 1 + + xlua.progress k, nb_train + + newstate = table.copy state -- make a deep copy + action = Brain.forward newstate -- returns index of chosen action + + reward = (action == rand_outcome) and 1 or 0 + + Brain.backward reward -- learning magic happens + +Brain.epsilon_test_time = 0.0 -- don't make any more random choices +Brain.learning = false + + +-- get an optimal action from the learned policy +cnt = 0 +for k = 1, nb_test + xlua.progress k, nb_test + + rand_outcome = math.random 1, num_outcomes + state = randtable num_outcomes, rand_outcome, rand_outcome + 1 + + newstate = table.copy state + output = Brain.forward newstate + + cnt += 1 if rand_outcome == output + +print "Test cases correct: #{tostring(100 * cnt/nb_test)} %" \ No newline at end of file