train.py
from unityagents import UnityEnvironment
import numpy as np
import torch
from dqn_agent import Agent
from collections import deque
import matplotlib
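# use a non-interactive backend so the scores plot can be saved without a display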
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import sys
import time
if len(sys.argv) == 1:
    print('Please provide the path to the Banana environment: python train.py PATH')
    sys.exit()
else:
    BANANA_PATH = sys.argv[1] # path to the Unity environment executable
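# example invocation (the path is illustrative; it depends on where the Unity build was downloaded):
#   python train.py ./Banana_Linux/Banana.x86_64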
# Training Parameters
# -----------------------------------------------------------------------------------------------------------
N_EPISODES = 1000 # maximum number of training episodes
MAX_T = 1000 # maximum number of timesteps per episode
EPS_START = 1.0 # starting value of epsilon, for epsilon-greedy action selection
EPS_END = 0.01 # minimum value of epsilon
EPS_DECAY = 0.985 # multiplicative factor (per episode) for decreasing epsilon
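# note: with EPS_START=1.0 and EPS_DECAY=0.985, epsilon falls below EPS_END after
# about log(0.01)/log(0.985), i.e. roughly 305 episodes, and is clamped there afterwards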
SAVE_AGENT = True # whether to save trained agent
AGENT_PATH = 'output/checkpoint.pth' # path to save agent
SAVE_PLOT = True # whether to save the scores plot
PLOT_PATH = 'output/plot_scores.png' # path to save the scores plot
# -----------------------------------------------------------------------------------------------------------
# initializing environment
env = UnityEnvironment(file_name=BANANA_PATH)
# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]
# reset the environment
env_info = env.reset(train_mode=True)[brain_name]
# number of actions
action_size = brain.vector_action_space_size
# examine the state space
state = env_info.vector_observations[0]
state_size = len(state)
# dqn agent
agent = Agent(state_size=state_size, action_size=action_size, seed=0)
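# the Agent (from dqn_agent.py) is used below through act(state, eps) for epsilon-greedy
# action selection, step(...) to store a transition and learn, and qnetwork_local,
# whose weights are saved at the end of training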
t1 = time.time()
# Start Training
print() # blank line before the training log
scores = [] # list containing scores from each episode
scores_window = deque(maxlen=100) # rolling window of the last 100 scores
scores_window_ = np.zeros((N_EPISODES,)) # per-episode record of the 100-episode average, filled in blocks of 100 for plotting
eps = EPS_START # initialize epsilon
for i_episode in range(1, N_EPISODES+1):
    env_info = env.reset(train_mode=True)[brain_name] # reset the environment
    state = env_info.vector_observations[0]           # get the current state
    score = 0
    for t in range(MAX_T):
        action = agent.act(state, eps)
        env_info = env.step(action)[brain_name]        # send the action to the environment
        next_state = env_info.vector_observations[0]   # get the next state
        reward = env_info.rewards[0]                   # get the reward
        done = env_info.local_done[0]                  # check whether the episode has finished
        agent.step(state, action, reward, next_state, done)
        state = next_state
        score += reward
        if done:
            break
    scores_window.append(score)       # save most recent score in the rolling window
    scores.append(score)              # save most recent score
    eps = max(EPS_END, EPS_DECAY*eps) # decrease epsilon
    print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)), end="")
    if i_episode % 100 == 0:
        t2 = time.time() - t1
        print('\rEpisode {}\tAverage Score: {:.2f}\tCumulative Time: {:02d}:{:02d}:{:02d}'.format(
            i_episode, np.mean(scores_window),
            int(t2/3600), int((t2%3600)/60), int(t2%60)))
        scores_window_[i_episode-100:i_episode] = np.mean(scores_window)
# End Training
env.close()
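# an average score of +13 over the last 100 episodes is the benchmark used here
# to decide whether the environment is solved, hence the threshold below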
if SAVE_AGENT:
    if np.mean(scores_window) >= 13:
        torch.save(agent.qnetwork_local.state_dict(), AGENT_PATH)
    else:
        print('Environment not solved (100-episode average below 13); model not saved')
if SAVE_PLOT:
    # plot the scores
    fig = plt.figure()
    ax = fig.add_subplot(111)
    plt.plot(np.arange(len(scores)), scores, label='Score per episode')
    plt.plot(np.arange(len(scores)), scores_window_, 'r-', label='Average Score per 100 episodes')
    plt.ylabel('Score')
    plt.xlabel('Episode #')
    plt.legend()
    plt.savefig(PLOT_PATH)
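# to reload the saved weights later (sketch, assuming the same Agent class and state/action sizes used above):
#   agent = Agent(state_size=state_size, action_size=action_size, seed=0)
#   agent.qnetwork_local.load_state_dict(torch.load(AGENT_PATH))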