From d58397aa3f7ee6670f630ef22ef5493538500c1e Mon Sep 17 00:00:00 2001
From: Yoni Lerner
Date: Sun, 25 Jan 2015 16:07:18 -0800
Subject: [PATCH] Added GPU version

---
 gpuqlearn.moon | 372 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 372 insertions(+)
 create mode 100644 gpuqlearn.moon

diff --git a/gpuqlearn.moon b/gpuqlearn.moon
new file mode 100644
index 0000000..a3753ac
--- /dev/null
+++ b/gpuqlearn.moon
@@ -0,0 +1,372 @@
+require 'math'
+require 'nnx'
+require 'os'
+require 'optim'
+require 'cutorch'
+require 'cunn'
+
+math.randomseed os.time!
+torch.setdefaulttensortype 'torch.FloatTensor'
+
+Brain = {}
+
+-- HELPER FUNCTIONS --
+
+export randf = (s, e) ->
+    return (math.random(0, (e - s) * 9999) / 10000) + s
+
+-- new methods for table
+
+table.merge = (t1, t2) ->
+    t = t1
+    for i = 1, #t2
+        t[#t + 1] = t2[i]
+    return t
+
+table.copy = (t) ->
+    u = {k, v for k, v in pairs t}
+    return setmetatable(u, getmetatable t)
+
+table.length = (T) ->
+    count = 0
+    count += 1 for _ in pairs T
+    return count
+
+-- returns experience table for single network decision
+-- contains the state, action chosen, whether a reward was obtained, and the
+-- state that resulted from the action. This is later used to train the network.
+-- Remember that the utility of an action is evaluated from the reward gained and
+-- the utility of the state it led to (recursive definition)
+export Experience = (state0, action0, reward0, state1) ->
+    NewExperience =
+        state0: state0
+        action0: action0
+        reward0: reward0
+        state1: state1
+    return NewExperience
+
+-- BRAIN
+
+Brain.init = (num_states, num_actions) ->
+    -- Number of past state/action pairs input to the network. 0 = agent lives in-the-moment :)
+    Brain.temporal_window = 2
+    -- Maximum number of experiences that we will save for training
+    Brain.experience_size = 30000
+    -- experience necessary to start learning
+    Brain.start_learn_threshold = 300
+    -- gamma is a crucial parameter that controls how much plan-ahead the agent does. In [0,1].
+    -- Determines the amount of weight placed on the utility of the state resulting from an action.
+    Brain.gamma = 0.9
+    -- number of steps we will learn for
+    Brain.learning_steps_total = 100000
+    -- how many steps of the above to perform only random actions (in the beginning)?
+    Brain.learning_steps_burnin = 300
+    -- controls the exploration/exploitation tradeoff. Will decay over time.
+    -- a higher epsilon means we are more likely to choose random actions
+    Brain.epsilon = 1.0
+    -- what epsilon value do we bottom out on? 0.0 => purely deterministic policy at end
+    Brain.epsilon_min = 0.05
+    -- what epsilon to use when learning is turned off. This is for testing
+    Brain.epsilon_test_time = 0.01
+
+    [[== states and actions that go into neural net:
+    (state0,action0),(state1,action1), ... , (stateN)
+    this variable controls the size of that temporal window.
+    ]]
+    Brain.net_inputs = (num_states + num_actions) * Brain.temporal_window + num_states
+    Brain.hidden_nodes = 16
+    Brain.num_states = num_states
+    Brain.num_actions = num_actions
+    Brain.net_outputs = Brain.num_actions
+
+    [[== Window size dictates the number of states, actions, rewards, and net inputs that we
+    save. The temporal window size is the number of time states/actions that are input
+    to the network and must be smaller than or equal to window_size
+    ]]
+    Brain.window_size = math.max Brain.temporal_window, 2
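+
+    -- Worked example (hypothetical sizes, for illustration only): with
+    -- num_states = 4, num_actions = 2 and temporal_window = 2, the input layer
+    -- receives (4 + 2) * 2 + 4 = 16 units (two past state/scaled-1-of-k-action
+    -- pairs plus the current state), and window_size = math.max(2, 2) = 2.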
+
+    -- advanced feature. Sometimes a random action should be biased towards some values
+    -- for example in flappy bird, we may want to choose to not flap more often
+    Brain.random_action_distribution = {}
+    if table.length(Brain.random_action_distribution) > 0
+        -- this better sum to 1 by the way, and be of length Brain.num_actions
+        if table.length(Brain.random_action_distribution) != Brain.num_actions
+            print 'TROUBLE. random_action_distribution should be same length as num_actions.'
+
+        s = 0.0
+
+        for k = 1, table.length Brain.random_action_distribution
+            s += Brain.random_action_distribution[k]
+
+        if math.abs(s - 1.0) > 0.0001
+            print 'TROUBLE. random_action_distribution should sum to 1!'
+
+
+    -- define architecture
+    Brain.net = nn.Sequential!
+
+    Brain.net\add nn.Linear Brain.net_inputs, Brain.hidden_nodes
+    Brain.net\add nn.Threshold 0, 0
+
+    Brain.net\add nn.Linear Brain.hidden_nodes, Brain.hidden_nodes
+    Brain.net\add nn.Threshold 0, 0
+
+    Brain.net\add nn.Linear Brain.hidden_nodes, Brain.net_outputs
+
+    Brain.net\cuda! -- move network to GPU
+
+    Brain.criterion = nn.MSECriterion!\cuda!
+
+
+    -- other learning parameters
+    Brain.learning_rate = 0.01
+    Brain.learning_rate_decay = 5e-7
+    Brain.batch_size = 16
+    Brain.momentum = 0.9
+
+    -- various housekeeping variables
+    Brain.age = 0 -- incremented every backward!
+
+    -- number of times we've called forward - lets us know when our input temporal
+    -- window is filled up
+    Brain.forward_passes = 0
+    Brain.learning = true
+
+    -- coefficients for regression
+    Brain.coefL1 = 0.001
+    Brain.coefL2 = 0.001
+
+    -- parameters for optim.sgd
+    Brain.parameters, Brain.gradParameters = Brain.net\getParameters!
+
+    -- These windows track old experiences, states, actions, rewards, and net inputs
+    -- over time. They should all start out as empty with a fixed size.
+    -- This is a first in, first out data structure that is shifted along time
+    Brain.experience = {}
+    Brain.state_window = {}
+    Brain.action_window = {}
+    Brain.reward_window = {}
+    Brain.net_window = {}
+    for i = 1, Brain.window_size
+        Brain.state_window[i] = {}
+        Brain.action_window[i] = {}
+        Brain.reward_window[i] = {}
+        Brain.net_window[i] = {}
+
+-- a bit of a helper function. It returns a random action.
+-- we are abstracting this away because in the future we may want to
+-- do more sophisticated things. For example some actions could be more
+-- or less likely at "rest"/default state.
+Brain.random_action = ->
+    -- if we don't have a random action distribution defined then sample evenly
+    if table.length(Brain.random_action_distribution) == 0
+        return (torch.random! % Brain.net_outputs) + 1
+
+    -- okay, let's do some fancier sampling:
+    else
+        p = randf 0, 1
+        cumprob = 0.0
+
+        for k = 1, Brain.num_actions
+            cumprob += Brain.random_action_distribution[k]
+
+            if p < cumprob
+                return k
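+
+-- Illustrative sketch (hypothetical values, not part of this patch): if the
+-- distribution defined in Brain.init were set to, say,
+--     Brain.random_action_distribution = {0.8, 0.2}
+-- for a two-action agent, random_action! would draw p = randf(0, 1) and walk
+-- the cumulative sums 0.8 and 1.0, returning action 1 when p < 0.8 and
+-- action 2 otherwise, i.e. roughly an 80/20 split of the random moves.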
+
+-- compute the value of doing any action in this state
+-- and return the argmax action and its value
+Brain.policy = (state) ->
+    tensor_state = torch.Tensor(state)\cuda!
+    action_values = Brain.net\forward tensor_state
+
+    maxval = action_values[1]
+    max_index = 1
+
+    -- find maximum output and note its index and value
+    --max_index = i for i = 2, Brain.net_outputs when action_values[i] > action_values[max_index]
+    for i = 2, Brain.net_outputs
+        if action_values[i] > maxval
+            maxval = action_values[i]
+            max_index = i
+
+    return action: max_index, value: maxval
+
+-- This function assembles the input to the network by concatenating
+-- old (state, chosen_action) pairs along with the current state
+-- return s = (x,a,x,a,x,a,xt) state vector.
+Brain.getNetInput = (xt) ->
+    w = {}
+    w = table.merge(w, xt) -- start with current state
+
+    -- and now go backwards and append states and actions from history temporal_window times
+    n = Brain.window_size + 1
+    for k = 1, Brain.temporal_window do
+        -- state
+        w = table.merge w, Brain.state_window[n - k]
+        -- action, encoded as 1-of-k indicator vector. We scale it up a bit because
+        -- we don't want weight regularization to undervalue this information, as it only exists once
+        action1ofk = {}
+        action1ofk[i] = 0 for i = 1, Brain.num_actions
+
+        -- assign action taken for current state to be 1, all others are 0
+        action1ofk[Brain.action_window[n - k]] = 1.0 * Brain.num_states
+
+        w = table.merge w, action1ofk
+
+    return w
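+
+-- Layout sketch of the assembled input, reusing the hypothetical sizes above
+-- (num_states = 4, num_actions = 2, temporal_window = 2): getNetInput returns
+--     { xt[1..4],                                   (current state)
+--       state[t-1][1..4], action[t-1] as 1-of-2,    (chosen entry = 1.0 * 4)
+--       state[t-2][1..4], action[t-2] as 1-of-2 }   (chosen entry = 1.0 * 4)
+-- for a total of #w == Brain.net_inputs == 16 numbers; the chosen-action entry
+-- is scaled by num_states so weight regularization does not wash it out.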
+
+-- This function computes an action by either:
+-- 1. Giving the current state and past (state, action) pairs to the network
+--    and letting it choose the best action
+-- 2. Choosing a random action
+Brain.forward = (input_array) ->
+    Brain.forward_passes += 1
+
+    local action, net_input
+
+    -- if we have enough (state, action) pairs in our memory to fill up
+    -- our network input then we'll proceed to let our network choose the action
+    if Brain.forward_passes > Brain.temporal_window
+        net_input = Brain.getNetInput input_array
+
+        -- if learning is turned on then epsilon should be decaying
+        if Brain.learning
+            -- compute (decaying) epsilon for the epsilon-greedy policy
+            new_epsilon = 1.0 - (Brain.age - Brain.learning_steps_burnin)/(Brain.learning_steps_total - Brain.learning_steps_burnin)
+
+            -- don't let epsilon go above 1.0
+            Brain.epsilon = math.min(1.0, math.max(Brain.epsilon_min, new_epsilon))
+        else
+            -- if learning is turned off then use the epsilon we've specified for testing
+            Brain.epsilon = Brain.epsilon_test_time
+
+        -- use epsilon probability to choose whether we use network action or random action
+        if randf(0, 1) < Brain.epsilon
+            action = Brain.random_action!
+        else
+            -- otherwise use our policy to make decision
+            best_action = Brain.policy net_input
+            action = best_action.action -- this is the action number
+    else
+        -- pathological case that happens first few iterations when we can't
+        -- fill up our network inputs. Just default to random action in this case
+        net_input = {}
+        action = Brain.random_action!
+
+    -- shift the network input, state, and action chosen into our windows
+    table.remove Brain.net_window, 1
+    table.insert Brain.net_window, net_input
+
+    table.remove Brain.state_window, 1
+    table.insert Brain.state_window, input_array
+
+    table.remove Brain.action_window, 1
+    table.insert Brain.action_window, action
+
+    return action
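+
+-- Epsilon schedule sketch (derived from the defaults set in Brain.init): while
+-- learning, epsilon = 1 - (age - 300) / (100000 - 300), clamped to [0.05, 1.0].
+-- So roughly the first 300 learning steps act fully at random, age = 50150
+-- gives epsilon = 0.5, and late in training the agent still explores 5% of the
+-- time. With Brain.learning = false, epsilon stays at 0.01.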
+
+-- This function trains the network using the reward resulting from the last action
+-- It will save this past experience which consists of:
+-- the state, action chosen, whether a reward was obtained, and the
+-- state that resulted from the action
+-- After that, it will train the network (using a batch of experiences) using a
+-- random sampling of our entire experience history.
+Brain.backward = (reward) ->
+    -- add reward to our history
+    table.remove Brain.reward_window, 1
+    table.insert Brain.reward_window, reward
+
+    -- if learning is turned off then don't do anything
+    return unless Brain.learning
+
+    Brain.age += 1
+
+    -- if we've had enough states and actions to fill up our net input then add
+    -- this new experience to our history
+    if Brain.forward_passes > Brain.temporal_window + 1
+        -- make experience and fill it up
+        e = Experience nil, nil, nil, nil
+        n = Brain.window_size
+        e.state0 = Brain.net_window[n - 1]
+        e.action0 = Brain.action_window[n - 1]
+        e.reward0 = Brain.reward_window[n - 1]
+        e.state1 = Brain.net_window[n]
+
+        -- if our experience table isn't larger than the max size then expand
+        if table.length(Brain.experience) < Brain.experience_size
+            table.insert Brain.experience, e
+        -- otherwise replace a random experience. finite memory!
+        else
+            ri = torch.random 1, Brain.experience_size
+            Brain.experience[ri] = e
+
+    -- if we have enough experience in memory then start training
+    if table.length(Brain.experience) > Brain.start_learn_threshold
+        inputs = torch.Tensor(Brain.batch_size, Brain.net_inputs)\cuda!
+        targets = torch.Tensor(Brain.batch_size, Brain.net_outputs)\cuda!
+
+        for k = 1, Brain.batch_size
+            -- choose a random experience
+            re = math.random 1, table.length Brain.experience
+            e = Brain.experience[re]
+
+            -- copy state from experience
+            x = torch.Tensor(e.state0)\cuda!
+
+            -- compute best action for the new state
+            best_action = Brain.policy e.state1
+
+            -- get current action output values
+            -- we want to make the target outputs the same as the actual outputs
+            -- except for the action that was chosen - we want to replace this with
+            -- the reward that was obtained + the utility of the resulting state
+            all_outputs = Brain.net\forward x
+            inputs[k] = x\clone!
+            targets[k] = all_outputs\clone!
+            targets[k][e.action0] = e.reward0 + Brain.gamma * best_action.value
+
+        -- create training function to give to optim.sgd
+        feval = (x) ->
+            collectgarbage!
+
+            -- get new network parameters
+            Brain.parameters\copy x unless x == Brain.parameters
+
+            -- reset gradients
+            Brain.gradParameters\zero!
+
+            -- evaluate function for complete mini batch
+            outputs = Brain.net\forward inputs
+            f = Brain.criterion\forward outputs, targets
+
+            -- estimate df/dW
+            df_do = Brain.criterion\backward outputs, targets
+            Brain.net\backward inputs, df_do
+
+            -- penalties (L1 and L2):
+            if Brain.coefL1 != 0 or Brain.coefL2 != 0
+                -- locals:
+                norm, sign = torch.norm, torch.sign
+
+                -- Loss:
+                f += Brain.coefL1 * norm Brain.parameters, 1
+                f += Brain.coefL2 * 0.5 * norm(Brain.parameters, 2) ^ 2
+
+                -- Gradients:
+                Brain.gradParameters\add(sign(Brain.parameters)\mul(Brain.coefL1) + Brain.parameters\clone!\mul Brain.coefL2)
+
+            -- return f and df/dX
+            return f, Brain.gradParameters
+
+        -- fire up optim.sgd
+        sgdState =
+            learningRate: Brain.learning_rate
+            momentum: Brain.momentum
+            learningRateDecay: Brain.learning_rate_decay
+
+        optim.sgd feval, Brain.parameters, sgdState
+
+
+-- export
+return Brain
\ No newline at end of file
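
Usage sketch (illustrative only, not part of the patch): assuming the compiled
module is on the package path and `env` is a hypothetical environment exposing
a state table and a reward signal, the intended training loop looks like:

    Brain = require 'gpuqlearn'
    Brain.init 4, 2                    -- e.g. 4 state variables, 2 actions

    for step = 1, 1000
        state = env\get_state!         -- hypothetical: table of 4 numbers
        action = Brain.forward state   -- epsilon-greedy action index (1-based)
        reward = env\act action        -- hypothetical: apply action, get reward
        Brain.backward reward          -- store the experience and train on the GPU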