breakout_ppo_main.py
import gym
import os
from time import time

import numpy as np
from tensorboardX import SummaryWriter

from agent.ppo_cnn_agent import PPOCNNAgent
from utils.atari_wrappers import make_env

# Unique log directory per run, keyed by wall-clock time.
time_str = str(time())
log_dir = 'logs/breakout_main_' + time_str
# model_dir = 'save_model/mpe_main_' + time_str
# os.mkdir(model_dir)

# Number of environment steps collected before each PPO update.
T_horizon = 256

if __name__ == "__main__":
    env = gym.make('BreakoutNoFrameskip-v4')
    env = make_env(env)  # Atari preprocessing wrappers from utils.atari_wrappers
    env.seed(42)

    # 4 stacked frames as input channels, 4 discrete Breakout actions.
    agent = PPOCNNAgent(4, action_space=4)
    summary_writer = SummaryWriter(log_dir)

    score = 0.0
    global_step = 0

    for i in range(10000000000):  # effectively train forever
        state = env.reset()
        # Convert the HWC observation to the CHW layout the CNN expects.
        state = np.asarray(state).transpose((2, 0, 1))
        done = False
        local_step = 0

        while not done:
            # Collect up to T_horizon transitions, then run one PPO update.
            for t in range(T_horizon):
                action, action_probs = agent.get_action(state)
                # summary_writer.add_scalar('Episode/action', action, global_step)
                next_state, reward, done, info = env.step(action)
                reward += 0.02  # small per-step bonus as reward shaping
                next_state = np.asarray(next_state).transpose((2, 0, 1))
                # print('next_state : {}, action : {}, reward : {}, done : {}, info : {}'.format(next_state, action, reward, done, info))
                agent.save_xp((state, next_state, action, action_probs, reward, done))
                state = next_state
                score += reward
                local_step += 1
                if done:
                    break

            pi_loss, value_loss, dist_entropy, approx_kl, approx_ent, clipfrac = agent.train()
            summary_writer.add_scalar('Loss/pi_loss', pi_loss, global_step)
            summary_writer.add_scalar('Loss/value_loss', value_loss, global_step)
            summary_writer.add_scalar('Loss/dist_entropy', dist_entropy, global_step)
            summary_writer.add_scalar('Loss/approx_kl', approx_kl, global_step)
            summary_writer.add_scalar('Loss/approx_ent', approx_ent, global_step)
            summary_writer.add_scalar('Loss/clipfrac', clipfrac, global_step)
            global_step += 1
            # print('pi_loss : {}, value_loss : {}, dist_entropy : {}'.format(pi_loss, value_loss, dist_entropy))

        # Per-episode logging.
        # if i % 10 == 0:
        summary_writer.add_scalar('Episode/score', score, i)
        summary_writer.add_scalar('Episode/game_len', local_step, i)
        # print("{} episode avg score : {:.1f}".format(i+1, score/10))
        score = 0.0

    env.close()
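
For reference, the training loop above only relies on three methods of PPOCNNAgent: get_action(state) returning (action, action_probs), save_xp(transition) storing one rollout step, and train() returning six scalars for logging. The class below is a hypothetical random-policy stand-in that matches this interface, useful for smoke-testing the loop without the repository's actual PPO implementation; it is a sketch, not the real agent.ppo_cnn_agent.PPOCNNAgent.

import numpy as np

class DummyPPOCNNAgent:
    """Hypothetical stand-in exposing the methods breakout_ppo_main.py calls.

    It samples uniformly random actions and returns zeroed training
    statistics; swap in the real PPOCNNAgent for actual learning.
    """

    def __init__(self, frame_stack, action_space):
        self.frame_stack = frame_stack      # number of stacked input frames (channels)
        self.action_space = action_space    # number of discrete actions
        self.buffer = []                    # rollout storage filled by save_xp()

    def get_action(self, state):
        # Uniform random policy: every action has probability 1 / action_space.
        probs = np.full(self.action_space, 1.0 / self.action_space)
        action = np.random.choice(self.action_space, p=probs)
        return action, probs

    def save_xp(self, transition):
        # transition = (state, next_state, action, action_probs, reward, done)
        self.buffer.append(transition)

    def train(self):
        # A real PPO agent would compute advantages and run clipped policy/value
        # updates over the collected rollout here; the stub just clears the buffer
        # and reports zeros for
        # (pi_loss, value_loss, dist_entropy, approx_kl, approx_ent, clipfrac).
        self.buffer.clear()
        return 0.0, 0.0, 0.0, 0.0, 0.0, 0.0

Either way, the run writes TensorBoard event files to logs/breakout_main_<timestamp>, which can be inspected with `tensorboard --logdir logs`.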