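"""MADDPG training script for the Unity ML-Agents Tennis environment.

Trains a multi-agent actor-critic agent (TennisAgent from agent.py), logs
episode scores to TensorBoard, and saves the network weights once the
100-episode mean score reaches 1.0.
"""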
import torch
import numpy as np
from collections import deque
from tensorboardX import SummaryWriter
from unityagents import UnityEnvironment

from agent import TennisAgent

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
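# Note: `device` is not referenced again in this file; presumably the networks
# defined in agent.py are moved to it.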

RUN_NAME = "Test2"

# Initializing the Unity Tennis environment (headless Linux build)
env = UnityEnvironment(file_name="Tennis_Linux_NoVis/Tennis.x86_64", worker_id=1)
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# Initializing the state_size and action_size for the environment
t_env_info = env.reset(train_mode=True)[brain_name]
t_state = t_env_info.vector_observations[0]
num_agents = len(t_env_info.vector_observations)
state_size = len(t_state)
action_size = brain.vector_action_space_size
print('State size: ', state_size)
print('Number of actions: ', action_size)
print('Number of agents: ', num_agents)
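# For the standard two-player Tennis build these print 24 (state size),
# 2 (continuous actions per agent), and 2 (agents), though the exact numbers
# depend on the environment binary.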

# Creating the TennisAgent object
agent = TennisAgent(state_size, action_size, num_agents)
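
# Note: TennisAgent (defined in agent.py, not shown here) is assumed to expose
# act(state, add_noise, step) and step(...) for acting and training, plus a
# shared `common_critic` network and a per-agent `actor` list that are used
# for checkpointing below; this matches a MADDPG layout with a centralized
# critic and decentralized actors.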


def maddpg():
    """Collect experience from the environment and pass it to the agent for training.

    Training curves are logged under "log/tensorboard/" and trained network
    weights are saved under "checkpoints/".
    """
    scores = []                        # scores from each episode
    scores_window = deque(maxlen=100)  # rolling window over the last 100 episodes
    writer = SummaryWriter(log_dir="log/tensorboard/" + RUN_NAME)  # TensorBoard writer
    i_episode = 0
    step = 0            # total environment steps, passed through to agent.act
    solved_episode = 0  # first episode at which the 100-episode mean reached 0.5
    while True:
        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations
        score = np.array([0.0 for i in range(num_agents)])
        while True:
            action = agent.act(state, add_noise=True, step=step)
            env_info = env.step(action)[brain_name]
            next_state = env_info.vector_observations
            reward = env_info.rewards
            done = env_info.local_done
            done_numpy = np.array(done).astype(np.float32)
            agent.step(state, action, reward, next_state, done_numpy)
            state = next_state
            score += np.array(reward)
            step += 1
            if np.any(done):
                break
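        # The episode score is the maximum over the agents' individual returns,
        # matching the Tennis benchmark's scoring rule.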
        score = np.max(score)
        if len(scores_window) > 0:
            writer.add_scalar("score_mean_100", np.mean(scores_window), i_episode)
        writer.add_scalar("score", score, i_episode)
        writer.flush()
        scores_window.append(score)
        scores.append(score)
        print('\rEpisode {}\tAverage Score: {:.3f}'.format(i_episode, np.mean(scores_window)), end="")
        if i_episode > 0 and i_episode % 500 == 0:
            print()
        # If the mean score over the window reaches 1.0, training is finished
        if np.mean(scores_window) >= 1.0:
            if solved_episode == 0:
                solved_episode = i_episode
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.3f}'
                  .format(i_episode, np.mean(scores_window)))
            print('\nReached 0.5 mean on episode {:d}'.format(solved_episode))
            torch.save(agent.common_critic.state_dict(), 'checkpoints/' + RUN_NAME + 'critic.pth')
            for i in range(num_agents):
                torch.save(agent.actor[i].state_dict(), 'checkpoints/' + RUN_NAME + 'actor' + str(i + 1) + '.pth')
            break
        if np.mean(scores_window) >= 0.5 and solved_episode == 0:
            solved_episode = i_episode
        i_episode += 1
    writer.close()
    env.close()


maddpg()
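
# A minimal sketch of how the saved weights could be reloaded for evaluation,
# assuming TennisAgent keeps the `common_critic` and `actor` attributes used
# for saving above (hypothetical usage, not part of the original script):
#
#   eval_agent = TennisAgent(state_size, action_size, num_agents)
#   eval_agent.common_critic.load_state_dict(
#       torch.load('checkpoints/' + RUN_NAME + 'critic.pth', map_location=device))
#   for i in range(num_agents):
#       eval_agent.actor[i].load_state_dict(
#           torch.load('checkpoints/' + RUN_NAME + 'actor' + str(i + 1) + '.pth',
#                      map_location=device))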