experiment.py
import numpy as np
from datetime import datetime
import datetime as dtime
import time
import os
import pickle
import torch
from utils.config import Config
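
# Experiment drives the train/eval loop for a single agent/environment pair: it
# runs training episodes until the environment's TOTAL_STEPS_LIMIT, optionally
# evaluates on a separate test environment at fixed step intervals, tracks
# reward and timing statistics, and can checkpoint/resume a run in progress.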
class Experiment(object):
    def __init__(self, agent, train_environment, test_environment, seed, write_log, write_plot, resume_params, has_eval=False):
        self.has_eval = has_eval
        self.agent = agent
        self.train_environment = train_environment
        self.train_environment.set_random_seed(seed)

        # for eval purposes
        self.test_environment = test_environment  # copy.deepcopy(environment) didn't work for Box2D envs
        self.test_environment.set_random_seed(seed)

        self.train_rewards_per_episode = []
        self.train_cum_steps = []
        self.eval_mean_rewards_per_episode = []
        self.eval_std_rewards_per_episode = []
        self.total_step_count = 0

        # booleans to log results for tensorboard
        self.write_log = write_log
        self.write_plot = write_plot

        self.cum_train_time = 0.0
        self.cum_eval_time = 0.0

        self.right_exit_count = 0
        self.bad_exit_count = 0
        self.right_exit_global_count = []
        self.bad_exit_global_count = []

        self.first_load = False
        self.is_maze = ("ContinuousMaze" in self.train_environment.name or "ContinuousWorld" in self.train_environment.name)

        # save/resume params
        self.resume_training = resume_params['resume_training']
        self.save_data_bdir = resume_params['save_data_bdir']
        self.save_data_minutes = resume_params['save_data_minutes']
        self.save_data_fname = resume_params['save_data_fname']

        # save params for ContinuousMaze
        self.steps_per_netsave = resume_params['steps_per_netsave']
        self.no_netsave = resume_params['no_netsave']
        self.netsave_data_bdir = resume_params['netsave_data_bdir']
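
    # Main entry point: alternates training episodes with periodic evaluation
    # until TOTAL_STEPS_LIMIT is reached, then returns per-episode train rewards,
    # eval mean/std rewards, and the cumulative step count at each episode end.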
    def run(self):
        self.episode_count = 0

        # For total time
        start_run = datetime.now()
        self.run_start_time = time.time()
        print("Start run at: " + str(start_run) + '\n')

        # evaluate once at the beginning; eval() already accumulates into
        # cum_eval_time itself, so adding its return value here would double-count
        if self.has_eval:
            self.eval()

        if self.resume_training:
            self.link_variables_and_names()
            self.first_load = self.load_data()
            self.last_time_saved = time.time()

        while self.total_step_count < self.train_environment.TOTAL_STEPS_LIMIT:
            # runs a single episode and returns the accumulated reward for that episode
            if self.first_load is False:
                self.train_start_time = time.time()
            force_terminated = self.run_episode_train(is_train=True)

            train_end_time = time.time()
            train_ep_time = train_end_time - self.train_start_time
            if self.has_eval:
                train_ep_time -= self.eval_session_time
            self.cum_train_time += train_ep_time

            print("Train:: ep: " + str(self.episode_count) + ", r: " + str(self.episode_reward) + ", n_steps: " + str(self.episode_step_count) + ", elapsed: " + time.strftime("%H:%M:%S", time.gmtime(train_ep_time)) + ', Step:{}'.format(self.total_step_count))

            if not force_terminated:
                # keep the reward and step lists aligned entry for entry
                self.train_rewards_per_episode.append(self.episode_reward)
                self.train_cum_steps.append(self.total_step_count)
            self.episode_count += 1

        self.train_environment.close()  # clear environment memory

        end_run = datetime.now()
        print("End run at: " + str(end_run) + '\n')
        print("Total Time taken: " + str(end_run - start_run) + '\n')
        print("Training Time: " + time.strftime("%H:%M:%S", time.gmtime(self.cum_train_time)))
        print("Evaluation Time: " + time.strftime("%H:%M:%S", time.gmtime(self.cum_eval_time)))

        return self.train_rewards_per_episode, self.eval_mean_rewards_per_episode, self.eval_std_rewards_per_episode, self.train_cum_steps

# Runs a single episode (TRAIN)
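    # Besides the agent/environment interaction, this also handles in-episode
    # housekeeping: timed checkpoint saves when resuming is enabled, periodic net
    # snapshots for maze environments, a one-off speed estimate at
    # measure_speed_step, and flagging episodes cut short by the total step limit.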
    def run_episode_train(self, is_train, measure_speed_step=3000):
        if self.first_load is False:
            self.eval_session_time = 0.0
            self.episode_reward = 0.
            self.episode_step_count = 0
            self.obs = self.train_environment.reset()
            self.Aold = self.agent.start(self.obs, is_train)
            self.agent.reset()  # be careful in Agent not to reset the weights
        else:
            self.first_load = False

        done = False
        while not (done or self.episode_step_count == self.train_environment.EPISODE_STEPS_LIMIT or self.total_step_count == self.train_environment.TOTAL_STEPS_LIMIT):
            if self.resume_training:
                time_since_save = time.time() - self.last_time_saved
                if time_since_save >= self.save_data_minutes * 60:
                    self.save_data()
                    self.last_time_saved = time.time()
                    print("#########SAVED#########")

            if self.episode_step_count >= 1000 and self.episode_step_count % 1000 == 0:
                print('\t\tOngoing episode, Step: {}/{}'.format(self.total_step_count, int(self.train_environment.TOTAL_STEPS_LIMIT)))

            if self.is_maze and self.total_step_count % self.steps_per_netsave == 0 and self.no_netsave is False:
                netsave_dir = os.path.join(self.netsave_data_bdir, os.path.splitext(self.save_data_fname)[0], '{}'.format(self.total_step_count))
                os.makedirs(netsave_dir, exist_ok=True)
                self.save_nets_custom_path(netsave_dir)

            self.episode_step_count += 1
            self.total_step_count += 1

            if self.total_step_count == measure_speed_step:
                speed = self.total_step_count / (time.time() - self.run_start_time)
                total_estimated_time = self.train_environment.TOTAL_STEPS_LIMIT / speed
                print("Current speed: {} steps/second".format(speed))
                print("Total estimated time: {}".format(dtime.timedelta(seconds=total_estimated_time)))

            obs_n, reward, done, info = self.train_environment.step(self.Aold)
            self.episode_reward += reward

            if self.is_maze:
                if isinstance(info, dict):
                    try:
                        if info["ending_goal"]:
                            assert done
                            self.right_exit_count += 1
                        elif info["misleading_goal"]:
                            assert done
                            self.bad_exit_count += 1
                    except KeyError:
                        pass
                if self.total_step_count % self.train_environment.x_axis_steps == 0:
                    self.right_exit_global_count.append(self.right_exit_count)
                    self.bad_exit_global_count.append(self.bad_exit_count)

            # flag transitions ended by the episode step limit so the agent's update
            # can distinguish a timeout from a true terminal state (ContinuousBandits
            # episodes are a single step, so they are never treated as truncated)
            if self.train_environment.name.startswith('ContinuousBandits'):
                is_truncated = False
            else:
                is_truncated = (done and self.episode_step_count == self.train_environment.EPISODE_STEPS_LIMIT)

            self.agent.update(self.obs, obs_n, float(reward), self.Aold, done, is_truncated)

            if not done:
                self.Aold = self.agent.step(obs_n, is_train)
            self.obs = obs_n

            if self.total_step_count % self.train_environment.eval_interval == 0 and self.has_eval:
                self.eval_session_time += self.eval()

        # check whether this episode was cut short by the Total Training Step Limit
        force_terminated = not (done or self.episode_step_count == self.train_environment.EPISODE_STEPS_LIMIT)
        return force_terminated

# TODO: update this function to run saving/reloading with GMM
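    # Builds parallel lists of names and live objects so save_data/load_data can
    # serialize and restore them generically. Entries whose values are fetched by
    # name at save time (plain counters, RNG states) hold None placeholders here.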
    def link_variables_and_names(self):
        # Diverse counters
        self.sr_diverse_names = ['cum_eval_time', 'cum_train_time', 'total_step_count', 'episode_count',
                                 'train_cum_steps', 'train_rewards_per_episode', 'train_start_time',
                                 'eval_session_time', 'episode_reward', 'episode_step_count', 'obs', 'Aold',
                                 'right_exit_count', 'bad_exit_count',
                                 'right_exit_global_count', 'bad_exit_global_count']
        self.sr_diverse_vars = [None] * len(self.sr_diverse_names)

        # Networks
        self.sr_nets_names = ['pi_net', 'q_net', 'v_net']
        self.sr_nets_vars = [self.agent.network_manager.network.pi_net,
                             self.agent.network_manager.network.q_net,
                             self.agent.network_manager.network.v_net]
        if self.agent.network_manager.use_target:
            self.sr_nets_names.append('target_v_net')
            self.sr_nets_vars.append(self.agent.network_manager.network.target_v_net)

        # Optimizers
        self.sr_optimizers_names = ['pi_optimizer', 'q_optimizer', 'v_optimizer']
        self.sr_optimizers_vars = [self.agent.network_manager.network.pi_optimizer,
                                   self.agent.network_manager.network.q_optimizer,
                                   self.agent.network_manager.network.v_optimizer]

        # Replay buffer
        self.sr_buffer_names = ['replay_buffer']
        self.sr_buffer_vars = [self.agent.replay_buffer]

        # Episode vars
        self.sr_pkl_episode_names = ['train_environment']
        self.sr_pkl_episode_vars = [self.train_environment]

        # Episode agent vars (live in self.agent.network_manager)
        self.sr_episode_names = ['train_ep_count', 'eval_ep_count']
        self.sr_episode_vars = [None, None]

        # torch and numpy RNG states
        self.sr_random_state_names = ['np_state', 'torch_state']
        self.sr_random_state_vars = [None, None]

        # Join all variables
        self.sr_all_names = (self.sr_diverse_names + self.sr_nets_names + self.sr_optimizers_names +
                             self.sr_buffer_names + self.sr_pkl_episode_names + self.sr_episode_names +
                             self.sr_random_state_names)
        self.sr_all_vars = (self.sr_diverse_vars + self.sr_nets_vars + self.sr_optimizers_vars +
                            self.sr_buffer_vars + self.sr_pkl_episode_vars + self.sr_episode_vars +
                            self.sr_random_state_vars)

# TODO: update this function to run saving/reloading with GMM
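    # Collects every registered variable into one dict: plain attributes by name,
    # nets/optimizers via state_dict(), the replay buffer and environment via
    # pickle, plus the numpy/torch RNG states. Saving to a temp file and renaming
    # it into place keeps the old checkpoint intact if the save is interrupted.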
    def save_data(self):
        sr_all_vars_state_dicts = ([getattr(self, n) for n in self.sr_diverse_names] +
                                   [a.state_dict() for a in self.sr_nets_vars] +
                                   [a.state_dict() for a in self.sr_optimizers_vars] +
                                   [pickle.dumps(a) for a in self.sr_buffer_vars] +
                                   [pickle.dumps(a) for a in self.sr_pkl_episode_vars] +
                                   [getattr(self.agent.network_manager, n) for n in self.sr_episode_names] +
                                   [np.random.get_state(), torch.get_rng_state()])
        out_dict = dict(zip(self.sr_all_names, sr_all_vars_state_dicts))
        out_temp_fname = os.path.join(self.save_data_bdir, 'temp_' + self.save_data_fname)
        out_fname = os.path.join(self.save_data_bdir, self.save_data_fname)
        torch.save(out_dict, out_temp_fname)
        os.rename(out_temp_fname, out_fname)

# TODO: update this function to run saving/reloading with GMM
    def save_nets_custom_path(self, cpath):
        sr_nets_names = ['pi_net', 'q_net', 'v_net']
        sr_nets_vars = [self.agent.network_manager.network.pi_net,
                        self.agent.network_manager.network.q_net,
                        self.agent.network_manager.network.v_net]
        sr_all_vars_state_dicts = [a.state_dict() for a in sr_nets_vars]
        out_dict = dict(zip(sr_nets_names, sr_all_vars_state_dicts))
        out_temp_fname = os.path.join(cpath, 'temp_' + self.save_data_fname)
        out_fname = os.path.join(cpath, self.save_data_fname)
        torch.save(out_dict, out_temp_fname)
        os.rename(out_temp_fname, out_fname)

# TODO: update this function to run saving/reloading with GMM
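    # Counterpart of save_data: restores each registered variable from the
    # checkpoint according to its group (state_dict, pickle, plain attribute, or
    # RNG state). Returns True if a checkpoint was found and loaded, else False.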
    def load_data(self):
        in_fname = os.path.join(self.save_data_bdir, self.save_data_fname)
        if os.path.isfile(in_fname):
            checkpoint = torch.load(in_fname)
            for name, var in zip(self.sr_all_names, self.sr_all_vars):
                if name in self.sr_optimizers_names or name in self.sr_nets_names:
                    var.load_state_dict(checkpoint[name])
                elif name in self.sr_buffer_names or name in self.sr_pkl_episode_names:
                    tmp = pickle.loads(checkpoint[name])
                    for ii in dir(var):
                        if ii.startswith('__'):
                            continue
                        if hasattr(var, ii) and hasattr(tmp, ii):
                            attr = getattr(tmp, ii)
                            # copy data attributes only; copying bound methods from
                            # the unpickled object would leave them bound to tmp
                            if not callable(attr):
                                setattr(var, ii, attr)
                elif name in self.sr_diverse_names:
                    setattr(self, name, checkpoint[name])
                elif name in self.sr_episode_names:
                    setattr(self.agent.network_manager, name, checkpoint[name])
                elif name in self.sr_random_state_names:
                    if 'torch' in name:
                        torch.set_rng_state(checkpoint[name])
                    elif 'np' in name:
                        np.random.set_state(checkpoint[name])
                else:
                    raise NotImplementedError
            return True
        return False
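
    # Runs eval_episodes episodes on the test environment with is_train=False,
    # records the mean/std episode reward, and returns the wall-clock time spent
    # so callers can exclude it from training-time accounting.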
    def eval(self):
        temp_rewards_per_episode = []
        eval_session_time = 0.0
        for i in range(self.test_environment.eval_episodes):
            eval_start_time = time.time()
            episode_reward, num_steps = self.run_episode_eval(self.test_environment, is_train=False)
            eval_end_time = time.time()
            temp_rewards_per_episode.append(episode_reward)
            eval_elapsed_time = eval_end_time - eval_start_time
            eval_session_time += eval_elapsed_time
            print("=== EVAL :: ep: " + str(i) + ", r: " + str(episode_reward) + ", n_steps: " + str(num_steps) + ", elapsed: " + time.strftime("%H:%M:%S", time.gmtime(eval_elapsed_time)))
        mean = np.mean(temp_rewards_per_episode)
        self.eval_mean_rewards_per_episode.append(mean)
        self.eval_std_rewards_per_episode.append(np.std(temp_rewards_per_episode))
        self.cum_eval_time += eval_session_time
        return eval_session_time

# Runs a single episode (EVAL)
    def run_episode_eval(self, test_env, is_train):
        obs = test_env.reset()
        self.agent.reset()
        episode_reward = 0.
        done = False
        Aold = self.agent.start(obs, is_train)
        episode_step_count = 0
        steps_limit = (test_env.EPISODE_STEPS_LIMIT if not hasattr(test_env, 'timeout_steps')
                       else min(test_env.EPISODE_STEPS_LIMIT, test_env.timeout_steps))
        while not (done or episode_step_count == steps_limit):
            obs_n, reward, done, info = test_env.step(Aold)
            episode_reward += reward
            if not done:
                Aold = self.agent.step(obs_n, is_train)
            obs = obs_n
            episode_step_count += 1
        return episode_reward, episode_step_count
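

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative, not part of the original module).
# DummyEnv and DummyAgent below are hypothetical stand-ins that implement just
# the interface Experiment actually calls above (reset/step/set_random_seed,
# the step-limit attributes, and the agent's start/step/reset/update); the
# project's real agents and environments live in their own modules.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    class DummyEnv(object):
        name = 'DummyEnv'
        TOTAL_STEPS_LIMIT = 50       # end training after 50 steps total
        EPISODE_STEPS_LIMIT = 10     # truncate each episode at 10 steps
        eval_interval = 25
        eval_episodes = 2
        x_axis_steps = 10

        def set_random_seed(self, seed):
            np.random.seed(seed)

        def reset(self):
            return np.zeros(2)

        def step(self, action):
            # random reward; episodes only end via the step limits
            return np.zeros(2), float(np.random.randn()), False, {}

        def close(self):
            pass

    class DummyAgent(object):
        def start(self, obs, is_train):
            return 0.0

        def step(self, obs, is_train):
            return 0.0

        def reset(self):
            pass

        def update(self, obs, obs_n, reward, action, done, is_truncated):
            pass

    resume_params = {
        'resume_training': False,
        'save_data_bdir': '.',
        'save_data_minutes': 60,
        'save_data_fname': 'ckpt.pt',
        'steps_per_netsave': 10 ** 9,
        'no_netsave': True,
        'netsave_data_bdir': '.',
    }
    exp = Experiment(DummyAgent(), DummyEnv(), DummyEnv(), seed=0,
                     write_log=False, write_plot=False,
                     resume_params=resume_params, has_eval=False)
    exp.run()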