Commit 2c616dc

[E] RL agent refactoring I (#410)
* moved experience_buffer to q_learning
* renamed callback and moved to superfolder
* refactored training to work with callback
* deleted interim networks and deleted mocking
* adjusted tqdm output
* updated docstrings and renamed Callback to RecommerceCallback
* renamed mean_reward to mean_return according to rl terminology
* fixed assert
* changed naming
* included small comments
1 parent 091eee6 commit 2c616dc

12 files changed (+100, -201 lines)

recommerce/configuration/hyperparameter_config.py

Lines changed: 1 addition & 1 deletion
@@ -275,7 +275,7 @@ def _set_sim_market_variables(self, config: dict) -> None:
         self.production_price = config['production_price']
         self.storage_cost_per_product = config['storage_cost_per_product']
 
-        self.mean_reward_bound = self.episode_length * self.max_price * self.number_of_customers
+        self.mean_return_bound = self.episode_length * self.max_price * self.number_of_customers
 
 
 class HyperparameterConfigLoader():
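The renamed `mean_return_bound` is simply the product of three values that already exist in the config, so only the name changes, not the arithmetic. A minimal sketch with invented numbers (not the project's defaults):

# Invented config values, only to show how the bound is computed.
episode_length = 50
max_price = 10
number_of_customers = 20

mean_return_bound = episode_length * max_price * number_of_customers
print(mean_return_bound)  # 10000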

recommerce/rl/actorcritic/actorcritic_agent.py

Lines changed: 5 additions & 20 deletions
@@ -1,4 +1,3 @@
-import os
 from abc import ABC, abstractmethod
 
 import numpy as np
@@ -71,26 +70,16 @@ def policy(self, observation, verbose=False, raw_action=False) -> None: # pragm
         """
         raise NotImplementedError('This method is abstract. Use a subclass')
 
-    def sync_to_best_interim(self):
-        self.best_interim_actor_net.load_state_dict(self.actor_net.state_dict())
-        self.best_interim_critic_net.load_state_dict(self.critic_net.state_dict())
-
-    def save(self, model_path, model_name) -> None:
+    def save(self, model_path: str) -> None:
         """
         Save a trained model to the specified folder within 'trainedModels'.
-        For each model an actor and a critic net will be saved.
-        This method is copied from our Q-Learning Agent
+        For each model only the actor net will be saved.
 
         Args:
-            model_path (str): The path to the folder within 'trainedModels' where the model should be saved.
-            model_name (str): The name of the .dat file of this specific model.
+            model_path (str): The path including the name where the model should be saved.
         """
-        assert model_name.endswith('.dat'), f'the modelname must end in ".dat": {model_name}'
-        assert os.path.exists(model_path), f'the specified path does not exist: {model_path}'
-        actor_path = os.path.join(model_path, f'actor_parameters{model_name}')
-        torch.save(self.best_interim_actor_net.state_dict(), actor_path)
-        torch.save(self.best_interim_critic_net.state_dict(), os.path.join(model_path, 'critic_parameters' + model_name))
-        return actor_path
+        assert model_path.endswith('.dat'), f'the modelname must end in ".dat": {model_path}'
+        torch.save(self.actor_net.state_dict(), model_path)
 
     def train_batch(self, states, actions, rewards, next_states, regularization=False):
         """
@@ -176,11 +165,9 @@ class DiscreteActorCriticAgent(ActorCriticAgent):
     def initialize_models_and_optimizer(self, n_observations, n_actions, network_architecture):
         self.actor_net = network_architecture(n_observations, n_actions).to(self.device)
         self.actor_optimizer = torch.optim.Adam(self.actor_net.parameters(), lr=0.0000025)
-        self.best_interim_actor_net = network_architecture(n_observations, n_actions).to(self.device)
         self.critic_net = network_architecture(n_observations, 1).to(self.device)
         self.critic_optimizer = torch.optim.Adam(self.critic_net.parameters(), lr=0.00025)
         self.critic_tgt_net = network_architecture(n_observations, 1).to(self.device)
-        self.best_interim_critic_net = self.critic_tgt_net = network_architecture(n_observations, 1).to(self.device)
 
     def policy(self, observation, verbose=False, raw_action=False):
         observation = torch.Tensor(np.array(observation)).to(self.device)
@@ -235,11 +222,9 @@ def initialize_models_and_optimizer(self, n_observations, n_actions, network_arc
         self.n_actions = n_actions
         self.actor_net = network_architecture(n_observations, self.n_actions).to(self.device)
         self.actor_optimizer = torch.optim.Adam(self.actor_net.parameters(), lr=0.0002)
-        self.best_interim_actor_net = network_architecture(n_observations, self.n_actions).to(self.device)
         self.critic_net = network_architecture(n_observations, 1).to(self.device)
         self.critic_optimizer = torch.optim.Adam(self.critic_net.parameters(), lr=0.002)
         self.critic_tgt_net = network_architecture(n_observations, 1).to(self.device)
-        self.best_interim_critic_net = network_architecture(n_observations, 1).to(self.device)
 
     @abstractmethod
     def transform_network_output(self, number_outputs, network_result):
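The save() change above turns a folder-plus-filename API into a single full path ending in '.dat', and only the live actor net is written now that the interim copies are gone. A minimal stand-alone sketch of that contract, using a plain torch module in place of the repo's actor net and a hypothetical file name:

import torch

net = torch.nn.Linear(4, 2)           # stand-in for the actor net
model_path = 'actor_parameters.dat'   # hypothetical path; only the '.dat' suffix is enforced

assert model_path.endswith('.dat'), f'the modelname must end in ".dat": {model_path}'
torch.save(net.state_dict(), model_path)   # mirrors torch.save(self.actor_net.state_dict(), model_path)

# Reloading works the usual state_dict way.
restored = torch.nn.Linear(4, 2)
restored.load_state_dict(torch.load(model_path))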

recommerce/rl/actorcritic/actorcritic_training.py

Lines changed: 15 additions & 14 deletions
@@ -2,7 +2,6 @@
 
 import numpy as np
 import torch
-from tqdm.auto import trange
 
 import recommerce.configuration.utils as ut
 import recommerce.rl.actorcritic.actorcritic_agent as actorcritic_agent
@@ -12,7 +11,7 @@
 
 class ActorCriticTrainer(RLTrainer):
     def trainer_agent_fit(self) -> bool:
-        return isinstance(self.RL_agent, actorcritic_agent.ActorCriticAgent)
+        return issubclass(self.agent_class, actorcritic_agent.ActorCriticAgent)
 
     def choose_random_envs(self, total_envs) -> set:
         """
@@ -41,6 +40,7 @@ def train_agent(self, number_of_training_steps=200, verbose=False, total_envs=12
             verbose (bool, optional): Should additional information about agent steps be written to the tensorboard? Defaults to False.
             total_envs (int, optional): The number of environments you use in parallel to fulfill the iid assumption. Defaults to 128.
         """
+        self.initialize_callback(number_of_training_steps * config.batch_size)
 
         all_dicts = []
         if verbose:
@@ -50,25 +50,28 @@ def train_agent(self, number_of_training_steps=200, verbose=False, total_envs=12
             all_policy_losses = []
 
         finished_episodes = 0
+        mean_return = -np.inf
+        self.callback.num_timesteps = 0
         environments = [self.marketplace_class() for _ in range(total_envs)]
         info_accumulators = [None for _ in range(total_envs)]
 
-        for step_number in trange(number_of_training_steps, unit=' frames', leave=False):
+        for step_number in range(number_of_training_steps):
            chosen_envs = self.choose_random_envs(total_envs)
 
            states = []
            actions = []
            rewards = []
            states_dash = []
            for env in chosen_envs:
+                self.callback.num_timesteps += 1
                state = environments[env]._observation()
                if not verbose:
-                    action = self.RL_agent.policy(state, verbose=False, raw_action=True)
+                    action = self.callback.model.policy(state, verbose=False, raw_action=True)
                else:
-                    action, net_output, v_estimate = self.RL_agent.policy(state, verbose=True, raw_action=True)
+                    action, net_output, v_estimate = self.callback.model.policy(state, verbose=True, raw_action=True)
                    all_network_outputs.append(net_output.reshape(-1))
                    all_v_estimates.append(v_estimate)
-                next_state, reward, is_done, info = environments[env].step(self.RL_agent.agent_output_to_market_form(action))
+                next_state, reward, is_done, info = environments[env].step(self.callback.model.agent_output_to_market_form(action))
 
                states.append(state)
                actions.append(action)
@@ -92,16 +95,16 @@ def train_agent(self, number_of_training_steps=200, verbose=False, total_envs=12
                        averaged_info[f'verbose/min/information_{str(action_num)}'] = np.min(myactions[:, action_num])
                        averaged_info[f'verbose/max/information_{str(action_num)}'] = np.max(myactions[:, action_num])
 
-                    ut.write_dict_to_tensorboard(self.writer, averaged_info, finished_episodes, is_cumulative=True)
+                    ut.write_dict_to_tensorboard(self.callback.writer, averaged_info, finished_episodes, is_cumulative=True)
 
                    environments[env].reset()
                    info_accumulators[env] = None
 
-                    self.consider_print_info(step_number, finished_episodes, averaged_info)
-                    self.consider_update_best_model(averaged_info)
-                    self.consider_save_model(finished_episodes)
+                    mean_return = averaged_info['profits/all']['vendor_0']
 
-            policy_loss, valueloss = self.RL_agent.train_batch(
+            self.callback._on_step(finished_episodes, mean_return)
+
+            policy_loss, valueloss = self.callback.model.train_batch(
                torch.Tensor(np.array(states)),
                torch.from_numpy(np.array(actions, dtype=np.int64)),
                torch.Tensor(np.array(rewards)),
@@ -112,6 +115,4 @@ def train_agent(self, number_of_training_steps=200, verbose=False, total_envs=12
 
            self.consider_sync_tgt_net(step_number)
 
-        self.consider_save_model(finished_episodes, force=True)
-        self.analyze_trained_agents()
-        self._end_of_training()
+        self.callback._on_training_end()
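With this refactoring, the actor-critic training loop delegates progress tracking, printing, checkpointing, and post-training analysis to the callback instead of the trainer's own consider_* helpers. A condensed, hypothetical sketch of that control flow follows; DummyCallback only mirrors the calls made above and is not the real RecommerceCallback:

class DummyCallback:
    def __init__(self):
        self.num_timesteps = 0

    def _on_step(self, finished_episodes, mean_return):
        # The real callback prints progress, snapshots the best model and returns True on success.
        print(f'{self.num_timesteps} steps: {finished_episodes} episodes, mean return {mean_return:.1f}')
        return True

    def _on_training_end(self):
        # The real callback saves the final parameters and starts the monitoring run.
        print('training finished')


callback = DummyCallback()
finished_episodes, mean_return = 0, float('-inf')
for step_number in range(3):          # stands in for number_of_training_steps
    callback.num_timesteps += 1       # incremented once per environment step in the real loop
    finished_episodes += 1            # the real loop advances this whenever an episode ends
    mean_return = 42.0                # the real loop reads averaged_info['profits/all']['vendor_0']
    callback._on_step(finished_episodes, mean_return)
callback._on_training_end()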

recommerce/rl/stable_baselines/stable_baselines_callback.py renamed to recommerce/rl/callback.py

Lines changed: 41 additions & 20 deletions
@@ -22,26 +22,30 @@
 warnings.filterwarnings('ignore')
 
 
-class PerStepCheck(BaseCallback):
+class RecommerceCallback(BaseCallback):
     """
     Callback for saving a model (the check is done every `check_freq` steps)
     based on the training reward (in practice, we recommend using `EvalCallback`).
     """
-    def __init__(self, agent_class, marketplace_class, log_dir_prepend='', training_steps=10000, iteration_length=500):
+    def __init__(self, agent_class, marketplace_class, log_dir_prepend='', training_steps=10000,
+            iteration_length=500, file_ending='zip', signature='training'):
        assert issubclass(agent_class, ReinforcementLearningAgent)
        assert issubclass(marketplace_class, SimMarket)
        assert isinstance(log_dir_prepend, str), \
            f'log_dir_prepend should be a string, but {log_dir_prepend} is {type(log_dir_prepend)}'
        assert isinstance(training_steps, int) and training_steps > 0
        assert isinstance(iteration_length, int) and iteration_length > 0
-        super(PerStepCheck, self).__init__(True)
+        super(RecommerceCallback, self).__init__(True)
        self.best_mean_interim_reward = None
        self.best_mean_overall_reward = None
        self.marketplace_class = marketplace_class
        self.agent_class = agent_class
        self.iteration_length = iteration_length
+        self.file_ending = file_ending
+        self.signature = signature
        self.tqdm_instance = trange(training_steps)
        self.saved_parameter_paths = []
+        self.last_finished_episode = 0
        signal.signal(signal.SIGINT, self._signal_handler)
 
        self.initialize_io_related(log_dir_prepend)
@@ -63,34 +67,51 @@ def initialize_io_related(self, log_dir_prepend) -> None:
        """
        ut.ensure_results_folders_exist()
        self.curr_time = time.strftime('%b%d_%H-%M-%S')
-        self.signature = 'Stable_Baselines_Training'
        self.writer = SummaryWriter(log_dir=os.path.join(PathManager.results_path, 'runs', f'{log_dir_prepend}training_{self.curr_time}'))
        path_name = f'{self.signature}_{self.curr_time}'
        self.save_path = os.path.join(PathManager.results_path, 'trainedModels', log_dir_prepend + path_name)
        os.makedirs(os.path.abspath(self.save_path), exist_ok=True)
-        self.tmp_parameters = os.path.join(self.save_path, 'tmp_model.zip')
+        self.tmp_parameters = os.path.join(self.save_path, f'tmp_model.{self.file_ending}')
 
-    def _on_step(self) -> bool:
+    def _on_step(self, finished_episodes: int = None, mean_return: float = None) -> bool:
        """
-        This method is called at every step by the stable baselines agents.
+        This method is called during training, after a step in the environment has been taken.
+        If you don't provide finished_episodes and mean_return, the callback derives them from the number of timesteps.
+        Note that you must provide finished_episodes if and only if you provide mean_return.
+
+        Args:
+            finished_episodes (int, optional): The episodes that are already finished. Defaults to None.
+            mean_return (float, optional): The mean return received over the last episodes. Defaults to None.
+
+        Returns:
+            bool: True should be returned. False will be interpreted as an error.
        """
+        assert (finished_episodes is None) == (mean_return is None), 'finished_episodes must be exactly None if mean_return is None'
        self.tqdm_instance.update()
-        if (self.num_timesteps - 1) % config.episode_length != 0 or self.num_timesteps <= config.episode_length:
+        if finished_episodes is None:
+            finished_episodes = self.num_timesteps // config.episode_length
+            x, y = ts2xy(load_results(self.save_path), 'timesteps')
+            if len(x) <= 0:
+                return True
+            assert len(x) == len(y)
+            mean_return = np.mean(y[-100:])
+        assert isinstance(finished_episodes, int)
+        assert isinstance(mean_return, float)
+
+        assert finished_episodes >= self.last_finished_episode
+        if finished_episodes == self.last_finished_episode or finished_episodes < 5:
            return True
-        self.tqdm_instance.refresh()
-        finished_episodes = self.num_timesteps // config.episode_length
-        x, y = ts2xy(load_results(self.save_path), 'timesteps')
-        assert len(x) > 0 and len(x) == len(y)
-        mean_reward = np.mean(y[-100:])
+        else:
+            self.last_finished_episode = finished_episodes
 
        # consider print info
        if (finished_episodes) % 10 == 0:
-            tqdm.write(f'{self.num_timesteps}: {finished_episodes} episodes trained, mean return {mean_reward:.3f}')
+            tqdm.write(f'{self.num_timesteps}: {finished_episodes} episodes trained, mean return {mean_return:.3f}')
 
        # consider update best model
-        if self.best_mean_interim_reward is None or mean_reward > self.best_mean_interim_reward + 15:
+        if self.best_mean_interim_reward is None or mean_return > self.best_mean_interim_reward + 15:
            self.model.save(self.tmp_parameters)
-            self.best_mean_interim_reward = mean_reward
+            self.best_mean_interim_reward = mean_return
        if self.best_mean_overall_reward is None or self.best_mean_interim_reward > self.best_mean_overall_reward:
            if self.best_mean_overall_reward is not None:
                tqdm.write(f'Best overall reward updated {self.best_mean_overall_reward:.3f} -> {self.best_mean_interim_reward:.3f}')
@@ -105,23 +126,23 @@ def _on_step(self) -> bool:
    def _on_training_end(self) -> None:
        self.tqdm_instance.close()
        if self.best_mean_interim_reward is not None:
-            finished_episodes = self.num_timesteps // config.episode_length
-            self.save_parameters(finished_episodes)
+            self.save_parameters(self.last_finished_episode)
 
        # analyze trained agents
        if len(self.saved_parameter_paths) == 0:
            print('No agents saved! Nothing to monitor.')
            return
        monitor = Monitor()
        agent_list = [(self.agent_class, [parameter_path]) for parameter_path in self.saved_parameter_paths]
-        monitor.configurator.setup_monitoring(False, 250, 250, self.marketplace_class, agent_list, support_continuous_action_space=True)
+        monitor.configurator.setup_monitoring(False, 250, 250, self.marketplace_class, agent_list,
+            support_continuous_action_space=hasattr(self.model, 'env'))
        rewards = monitor.run_marketplace()
        episode_numbers = [int(parameter_path[-9:][:5]) for parameter_path in self.saved_parameter_paths]
        Evaluator(monitor.configurator).evaluate_session(rewards, episode_numbers)
 
    def save_parameters(self, finished_episodes: int):
        assert isinstance(finished_episodes, int)
-        path_to_parameters = os.path.join(self.save_path, f'{self.signature}_{finished_episodes:05d}.zip')
+        path_to_parameters = os.path.join(self.save_path, f'{self.signature}_{finished_episodes:05d}.{self.file_ending}')
        os.rename(self.tmp_parameters, path_to_parameters)
        self.saved_parameter_paths.append(path_to_parameters)
        tqdm.write(f'I write the interim model after {finished_episodes} episodes to the disk.')
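After the rename, RecommerceCallback._on_step works in two modes: the stable-baselines path calls it without arguments and the episode count and mean return are derived from num_timesteps and the monitor logs, while the hand-written trainers pass both values explicitly. A small illustrative function (not the repo's class) showing only that argument-resolution rule:

def resolve_progress(finished_episodes=None, mean_return=None, num_timesteps=0, episode_length=50):
    # Either both values are supplied (custom trainers) or neither (stable-baselines path).
    assert (finished_episodes is None) == (mean_return is None)
    if finished_episodes is None:
        finished_episodes = num_timesteps // episode_length
        mean_return = 0.0  # the real code averages the last 100 monitored returns via ts2xy(load_results(...))
    return finished_episodes, mean_return

print(resolve_progress(num_timesteps=500))                       # derived: (10, 0.0)
print(resolve_progress(finished_episodes=7, mean_return=123.4))  # passed through: (7, 123.4)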

recommerce/rl/q_learning/q_learning_agent.py

Lines changed: 5 additions & 14 deletions
@@ -1,5 +1,4 @@
 import collections
-import os
 import random
 from abc import ABC, abstractmethod
 
@@ -11,7 +10,7 @@
 from recommerce.market.circular.circular_vendors import CircularAgent
 from recommerce.market.linear.linear_vendors import LinearAgent
 from recommerce.market.sim_market import SimMarket
-from recommerce.rl.experience_buffer import ExperienceBuffer
+from recommerce.rl.q_learning.experience_buffer import ExperienceBuffer
 from recommerce.rl.reinforcement_learning_agent import ReinforcementLearningAgent
 
 
@@ -48,7 +47,6 @@ def __init__(
        self.name = name
        print(f'I initiate a QLearningAgent using {self.device} device')
        self.net = network_architecture(n_observations, n_actions).to(self.device)
-        self.best_interim_net = network_architecture(n_observations, n_actions)
        if load_path:
            self.net.load_state_dict(torch.load(load_path, map_location=self.device))
        if optim:
@@ -120,22 +118,15 @@ def calc_loss(self, batch, device='cpu'):
        expected_state_action_values = next_state_values * config.gamma + rewards_v
        return torch.nn.MSELoss()(state_action_values, expected_state_action_values), state_action_values.mean()
 
-    def sync_to_best_interim(self):
-        self.best_interim_net.load_state_dict(self.net.state_dict())
-
-    def save(self, model_path: str, model_name: str) -> None:
+    def save(self, model_path: str) -> None:
        """
        Save a trained model to the specified folder within 'trainedModels'.
 
        Args:
-            model_path (str): The path to the folder within 'trainedModels' where the model should be saved.
-            model_name (str): The name of the .dat file of this specific model.
+            model_path (str): The path including the name where the model should be saved.
        """
-        assert model_name.endswith('.dat'), f'the modelname must end in ".dat": {model_name}'
-        assert os.path.exists(model_path), f'the specified path does not exist: {model_path}'
-        parameters_path = os.path.join(model_path, model_name)
-        torch.save(self.best_interim_net.state_dict(), parameters_path)
-        return parameters_path
+        assert model_path.endswith('.dat'), f'the modelname must end in ".dat": {model_path}'
+        torch.save(self.net.state_dict(), model_path)
 
 
 class QLearningLEAgent(QLearningAgent, LinearAgent):
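Deleting best_interim_net and sync_to_best_interim from QLearningAgent works because the callback now decides when the current weights are worth persisting: on a sufficient improvement it calls model.save(tmp_parameters) and later renames that file in save_parameters. A toy sketch of that decision rule, where the '+ 15' threshold comes from the callback code above and the returns are invented:

best_mean_interim_return = None
for mean_return in [10.0, 12.0, 40.0, 38.0, 90.0]:   # invented training progression
    if best_mean_interim_return is None or mean_return > best_mean_interim_return + 15:
        best_mean_interim_return = mean_return
        print(f'snapshot current weights at mean return {mean_return}')  # model.save(tmp_parameters) in the real code
# snapshots happen at 10.0, 40.0 and 90.0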
