-- deepqlearn.lua
-- forked from blakeMilner/DeepQLearning
require 'math'
require 'nnx'
require 'os'
require 'optim'

math.randomseed(os.time())
torch.setdefaulttensortype('torch.FloatTensor')

local Brain = {}

--[[ HELPER FUNCTIONS --]]
-- returns a (roughly) uniform random float in [s, e)
function randf(s, e)
    return (math.random(0, (e - s) * 9999) / 10000) + s;
end
-- new methods for table

-- append the elements of t2 to t1 (note: this modifies t1 in place)
function table.merge(t1, t2)
    local t = t1
    for i = 1, #t2 do
        t[#t + 1] = t2[i]
    end
    return t
end
-- shallow-copy a table, preserving its metatable
function table.copy(t)
    local u = {}
    for k, v in pairs(t) do u[k] = v end
    return setmetatable(u, getmetatable(t))
end
-- count all entries of a table (unlike #, this also counts non-array keys)
function table.length(T)
    local count = 0
    for _ in pairs(T) do count = count + 1 end
    return count
end
--[[ Returns an experience table for a single network decision.
It contains the state, the action chosen, the reward obtained, and the
state that resulted from the action. This is later used to train the network.
Remember that the utility of an action is evaluated from the reward gained and
the utility of the state it led to (a recursive definition).
--]]
function Experience(state0, action0, reward0, state1)
    local e = {};
    e.state0 = state0;
    e.action0 = action0;
    e.reward0 = reward0;
    e.state1 = state1;
    return e;
end
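
--[[ For example (hypothetical values): an experience recording that taking
action 2 in some state earned a reward of 1.0. In practice, state0 and state1
are the full assembled network-input vectors (see Brain.getNetInput below):
local e = Experience({0.5, 0.1}, 2, 1.0, {0.7, 0.2})
--]]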
-- BRAIN
function Brain.init(num_states, num_actions)
    -- number of past state/action pairs input to the network. 0 = agent lives in-the-moment :)
    Brain.temporal_window = 2
    -- maximum number of experiences that we will save for training
    Brain.experience_size = 30000
    -- number of experiences necessary before learning starts
    Brain.start_learn_threshold = 300
    -- gamma is a crucial parameter that controls how far ahead the agent plans. In [0,1].
    -- It determines the weight placed on the utility of the state resulting from an action.
    Brain.gamma = 0.9;

    -- number of steps we will learn for
    Brain.learning_steps_total = 100000
    -- how many of those steps consist of purely random actions (at the beginning)?
    Brain.learning_steps_burnin = 300;
    -- controls the exploration/exploitation tradeoff; decays over time.
    -- A higher epsilon means we are more likely to choose random actions.
    Brain.epsilon = 1.0
    -- what epsilon value do we bottom out on? 0.0 => purely deterministic policy at the end
    Brain.epsilon_min = 0.05;
    -- what epsilon to use when learning is turned off (this is for testing)
    Brain.epsilon_test_time = 0.01;

    --[[ states and actions that go into the neural net:
    (state0,action0), (state1,action1), ..., (stateN)
    temporal_window controls the size of that window.
    --]]
    Brain.net_inputs = (num_states + num_actions) * Brain.temporal_window + num_states;
    Brain.hidden_nodes = 16
    Brain.num_states = num_states;
    Brain.num_actions = num_actions;
    Brain.net_outputs = Brain.num_actions;

    --[[ window_size dictates the number of states, actions, rewards, and net inputs
    that we save over time. temporal_window of them are input to the network,
    so temporal_window must be smaller than or equal to window_size.
    --]]
    Brain.window_size = math.max(Brain.temporal_window, 2);
    -- advanced feature: sometimes a random action should be biased towards certain values.
    -- For example, in Flappy Bird we may want to choose not to flap more often.
    Brain.random_action_distribution = {};
    if (table.length(Brain.random_action_distribution) > 0) then
        -- the distribution should sum to 1 and have num_actions entries
        if (table.length(Brain.random_action_distribution) ~= Brain.num_actions) then
            print('TROUBLE. random_action_distribution should be same length as num_actions.');
        end

        local s = 0.0;
        for k = 1, table.length(Brain.random_action_distribution) do
            s = s + Brain.random_action_distribution[k]
        end
        if (math.abs(s - 1.0) > 0.0001) then
            print('TROUBLE. random_action_distribution should sum to 1!');
        end
    end
    -- define the network architecture: two hidden layers with ReLU activations
    -- (nn.Threshold(0, 0) is equivalent to ReLU)
    Brain.net = nn.Sequential()
    Brain.net:add(nn.Linear(Brain.net_inputs, Brain.hidden_nodes))
    Brain.net:add(nn.Threshold(0, 0))
    Brain.net:add(nn.Linear(Brain.hidden_nodes, Brain.hidden_nodes))
    Brain.net:add(nn.Threshold(0, 0))
    Brain.net:add(nn.Linear(Brain.hidden_nodes, Brain.net_outputs))

    Brain.criterion = nn.MSECriterion()
    -- other learning parameters
    Brain.learning_rate = 0.01;
    Brain.learning_rate_decay = 5e-7
    Brain.batch_size = 16;
    Brain.momentum = 0.9;

    -- various housekeeping variables
    Brain.age = 0; -- incremented every backward()
    -- number of times we've called forward - lets us know when our input temporal
    -- window is filled up
    Brain.forward_passes = 0;
    Brain.learning = true;

    -- coefficients for L1/L2 regularization
    Brain.coefL1 = 0.001
    Brain.coefL2 = 0.001

    -- parameters for optim.sgd
    Brain.parameters, Brain.gradParameters = Brain.net:getParameters()

    -- These windows track old experiences, states, actions, rewards, and net inputs
    -- over time. They all start out empty with a fixed size.
    -- Each is a first-in, first-out queue that is shifted along with time.
    Brain.experience = {};
    Brain.state_window = {}
    Brain.action_window = {}
    Brain.reward_window = {}
    Brain.net_window = {}
    for i = 1, Brain.window_size do
        Brain.state_window[i] = {}
        Brain.action_window[i] = {}
        Brain.reward_window[i] = {}
        Brain.net_window[i] = {}
    end
end
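
--[[ A minimal sizing sketch (hypothetical problem sizes): for an agent with
4 state variables and 2 actions, with the default temporal_window of 2, the
input layer receives (4 + 2) * 2 + 4 = 16 values:
Brain.init(4, 2)
print(Brain.net_inputs)   -- 16
print(Brain.net_outputs)  -- 2
--]]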
-- A helper function that returns a random action.
-- We abstract this away because in the future we may want to do more
-- sophisticated things: for example, some actions could be more or less
-- likely at the "rest"/default state.
function Brain.random_action()
    -- if we don't have a random action distribution defined then sample evenly
    if (table.length(Brain.random_action_distribution) == 0) then
        return (torch.random() % Brain.net_outputs) + 1
    -- okay, let's do some fancier sampling:
    else
        local p = randf(0, 1);
        local cumprob = 0.0;
        for k = 1, Brain.num_actions do
            cumprob = cumprob + Brain.random_action_distribution[k];
            if (p < cumprob) then
                return k
            end
        end
    end
end
-- compute the Q-value of every action in the given state
-- and return the argmax action and its value
function Brain.policy(state)
    local tensor_state = torch.Tensor(state)
    local action_values = Brain.net:forward(tensor_state);

    -- find the maximum output and note its index and value
    local maxval = action_values[1]
    local max_index = 1
    for i = 2, Brain.net_outputs do
        if action_values[i] > maxval then
            maxval = action_values[i]
            max_index = i
        end
    end

    return {action = max_index, value = maxval};
end
-- This function assembles the input to the network by concatenating
-- the current state with old (state, chosen_action) pairs,
-- i.e. it returns w = (xt, x,a, x,a, ...) as a flat state vector.
function Brain.getNetInput(xt)
    local w = {};
    w = table.merge(w, xt); -- start with the current state

    -- and now go backwards and append states and actions from history, temporal_window times
    local n = Brain.window_size + 1;
    for k = 1, Brain.temporal_window do
        -- state
        w = table.merge(w, Brain.state_window[n - k]);
        -- action, encoded as a 1-of-k indicator vector. We scale it up a bit because
        -- we don't want weight regularization to undervalue this information, as it only occurs once
        local action1ofk = {};
        for i = 1, Brain.num_actions do
            action1ofk[i] = 0
        end
        -- set the entry for the action taken to num_states; all others stay 0
        action1ofk[Brain.action_window[n - k]] = 1.0 * Brain.num_states;

        w = table.merge(w, action1ofk);
    end

    return w;
end
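
--[[ Layout sketch (hypothetical sizes: num_states = 4, num_actions = 2,
temporal_window = 2). The assembled w comes out as
{ x_t (4), x_{t-1} (4), a_{t-1} (2, scaled 1-of-k), x_{t-2} (4), a_{t-2} (2) },
i.e. (4 + 2) * 2 + 4 = 16 entries, matching Brain.net_inputs.
--]]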
--[[ This function computes an action by either:
1. giving the current state and past (state, action) pairs to the network
   and letting it choose the best action, or
2. choosing a random action
--]]
function Brain.forward(input_array)
    Brain.forward_passes = Brain.forward_passes + 1;

    local action, net_input;

    -- if we have enough (state, action) pairs in our memory to fill up
    -- the network input then we let the network choose the action
    if (Brain.forward_passes > Brain.temporal_window) then
        net_input = Brain.getNetInput(input_array);

        -- if learning is turned on then epsilon should be decaying
        if (Brain.learning) then
            -- compute the (decaying) epsilon for the epsilon-greedy policy
            local new_epsilon = 1.0 - (Brain.age - Brain.learning_steps_burnin) / (Brain.learning_steps_total - Brain.learning_steps_burnin)
            -- clamp epsilon to [epsilon_min, 1.0]
            Brain.epsilon = math.min(1.0, math.max(Brain.epsilon_min, new_epsilon));
        else
            -- if learning is turned off then use the epsilon we've specified for testing
            Brain.epsilon = Brain.epsilon_test_time;
        end

        -- with probability epsilon choose a random action, otherwise the network's action
        if (randf(0, 1) < Brain.epsilon) then
            action = Brain.random_action();
        else
            -- otherwise use our policy to make the decision
            local best_action = Brain.policy(net_input);
            action = best_action.action; -- this is the action number
        end
    else
        -- pathological case that happens in the first few iterations, when we can't
        -- fill up our network inputs. Just default to a random action in this case
        net_input = {};
        action = Brain.random_action();
    end

    -- shift the network input, state, and chosen action into our windows
    table.remove(Brain.net_window, 1)
    table.insert(Brain.net_window, net_input)

    table.remove(Brain.state_window, 1)
    table.insert(Brain.state_window, input_array)

    table.remove(Brain.action_window, 1)
    table.insert(Brain.action_window, action)

    return action;
end
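
--[[ Worked example of the epsilon schedule above, with the default settings
(learning_steps_burnin = 300, learning_steps_total = 100000, epsilon_min = 0.05):
age = 300     -> epsilon = 1.0 (still fully random)
age = 50150   -> epsilon = 0.5 (halfway through the decay)
age >= 95015  -> epsilon clamps at epsilon_min = 0.05
--]]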
--[[
This function trains the network using the reward resulting from the last action.
It saves this past experience, which consists of:
the state, the action chosen, the reward obtained, and the
state that resulted from the action.
After that, it trains the network on a batch of experiences drawn by
random sampling from our entire experience history.
--]]
function Brain.backward(reward)
    -- add the reward to our history
    table.remove(Brain.reward_window, 1)
    table.insert(Brain.reward_window, reward)

    -- if learning is turned off then don't do anything
    if (not Brain.learning) then
        return;
    end

    Brain.age = Brain.age + 1;

    -- if we've had enough states and actions to fill up our net input then add
    -- this new experience to our history
    if (Brain.forward_passes > Brain.temporal_window + 1) then
        -- make an experience and fill it up
        local e = Experience(nil, nil, nil, nil);
        local n = Brain.window_size;
        e.state0 = Brain.net_window[n - 1];
        e.action0 = Brain.action_window[n - 1];
        e.reward0 = Brain.reward_window[n - 1];
        e.state1 = Brain.net_window[n];

        -- if our experience table isn't larger than the max size then expand it
        if (table.length(Brain.experience) < Brain.experience_size) then
            table.insert(Brain.experience, e)
        else
            -- otherwise replace a random experience. Finite memory!
            local ri = torch.random(1, Brain.experience_size);
            Brain.experience[ri] = e;
        end
    end
    -- if we have enough experience in memory then start training
    if (table.length(Brain.experience) > Brain.start_learn_threshold) then
        local inputs = torch.Tensor(Brain.batch_size, Brain.net_inputs)
        local targets = torch.Tensor(Brain.batch_size, Brain.net_outputs)

        for k = 1, Brain.batch_size do
            -- choose a random experience
            local re = math.random(1, table.length(Brain.experience));
            local e = Brain.experience[re];

            -- copy the state from the experience
            local x = torch.Tensor(e.state0);

            -- compute the best action for the new state
            local best_action = Brain.policy(e.state1);

            --[[ Get the current action output values. We want the target outputs
            to equal the actual outputs, except for the action that was chosen:
            that entry is replaced by the reward that was obtained plus the
            discounted utility of the resulting state, i.e. the Q-learning target
            reward0 + gamma * max_a Q(state1, a).
            --]]
            local all_outputs = Brain.net:forward(x);
            inputs[k] = x:clone();
            targets[k] = all_outputs:clone();
            targets[k][e.action0] = e.reward0 + Brain.gamma * best_action.value;
        end
        -- create the training function to give to optim.sgd
        local feval = function(x)
            collectgarbage()

            -- get the new network parameters
            if x ~= Brain.parameters then
                Brain.parameters:copy(x)
            end

            -- reset gradients
            Brain.gradParameters:zero()

            -- evaluate the function for the complete mini batch
            local outputs = Brain.net:forward(inputs)
            local f = Brain.criterion:forward(outputs, targets)

            -- estimate df/dW
            local df_do = Brain.criterion:backward(outputs, targets)
            Brain.net:backward(inputs, df_do)

            -- penalties (L1 and L2):
            if Brain.coefL1 ~= 0 or Brain.coefL2 ~= 0 then
                -- locals:
                local norm, sign = torch.norm, torch.sign

                -- loss:
                f = f + Brain.coefL1 * norm(Brain.parameters, 1)
                f = f + Brain.coefL2 * norm(Brain.parameters, 2) ^ 2 / 2

                -- gradients:
                Brain.gradParameters:add(sign(Brain.parameters):mul(Brain.coefL1) + Brain.parameters:clone():mul(Brain.coefL2))
            end

            -- return f and df/dX
            return f, Brain.gradParameters
        end

        -- fire up optim.sgd
        local sgdState = {
            learningRate = Brain.learning_rate,
            momentum = Brain.momentum,
            learningRateDecay = Brain.learning_rate_decay
        }

        optim.sgd(feval, Brain.parameters, sgdState)
    end
end
-- export
return Brain
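
--[[ A minimal usage sketch, assuming this file is on your Lua path as
'deepqlearn'. getState, applyAction, and getReward are hypothetical hooks
standing in for your own environment code:

local Brain = require 'deepqlearn'
Brain.init(4, 2)  -- e.g. 4 state variables, 2 possible actions

for step = 1, 1000 do
    local state = getState()              -- observe the environment
    local action = Brain.forward(state)   -- epsilon-greedy action selection
    applyAction(action)                   -- act on the environment
    Brain.backward(getReward())           -- learn from the resulting reward
end

-- after training, freeze the policy for evaluation:
Brain.learning = false
--]]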