From 5a543ba572f4da0480190e4dd7b824ecb2ea02d3 Mon Sep 17 00:00:00 2001
From: Stefan Heid
Date: Tue, 3 Nov 2020 10:52:01 +0100
Subject: [PATCH] #51 extended stable baselines experiment

---
 experiments/issue51_new/stable_baselines.py | 39 +++++++++++++--------
 1 file changed, 24 insertions(+), 15 deletions(-)

diff --git a/experiments/issue51_new/stable_baselines.py b/experiments/issue51_new/stable_baselines.py
index 0e8b82cc..cf06a210 100644
--- a/experiments/issue51_new/stable_baselines.py
+++ b/experiments/issue51_new/stable_baselines.py
@@ -5,6 +5,7 @@
 import gym
 import numpy as np
 from stable_baselines3 import PPO
+from stable_baselines3.common.callbacks import BaseCallback, CheckpointCallback, EveryNTimesteps
 from stable_baselines3.common.monitor import Monitor
 
 from openmodelica_microgrid_gym.env import PlotTmpl
@@ -35,7 +36,7 @@ def set_idx(self, obs):
             lambda n: obs.index(n),
             [[f'lc1.inductor{k}.i' for k in '123'], [f'inverter1.i_ref.{k}' for k in '012']])
 
-    def rew_fun(self, cols: List[str], data: np.ndarray) -> float:
+    def rew_fun(self, cols: List[str], data: np.ndarray, risk) -> float:
         """
         Defines the reward function for the environment. Uses the observations and setpoints to evaluate the quality of
         the used parameters.
@@ -57,8 +58,8 @@ def rew_fun(self, cols: List[str], data: np.ndarray) -> float:
         # better, i.e. more significant, gradients)
         # plus barrier penalty for violating the current constraint
         error = np.sum((np.abs((ISPabc_master - Iabc_master)) / iLimit) ** 0.5, axis=0) \
-            # + -np.sum(mu * np.log(1 - np.maximum(np.abs(Iabc_master) - iNominal, 0) / (iLimit - iNominal)), axis=0) \
-        # * max_episode_steps
+            # + -np.sum(mu * np.log(1 - np.maximum(np.abs(Iabc_master) - iNominal, 0) / (iLimit - iNominal)), axis=0)
+        error /= max_episode_steps
 
         return -np.clip(error.squeeze(), 0, 1e5)
 
@@ -74,7 +75,7 @@ def xylables(fig):
 env = gym.make('openmodelica_microgrid_gym:ModelicaEnv_test-v1',
                reward_fun=Reward().rew_fun,
                viz_cols=[
-                   PlotTmpl([[f'lc.inductor{i}.i' for i in '123'], [f'inverter1.i_ref.{k}' for k in '012']],
+                   PlotTmpl([[f'lc1.inductor{i}.i' for i in '123'], [f'inverter1.i_ref.{k}' for k in '012']],
                             callback=xylables,
                             color=[['b', 'r', 'g'], ['b', 'r', 'g']],
                             style=[[None], ['--']]
@@ -89,15 +90,23 @@ def xylables(fig):
     print(str(env), file=f)
 env = Monitor(env)
 
+
+class RecordEnvCallback(BaseCallback):
+    def _on_step(self) -> bool:
+        obs = env.reset()
+        for _ in range(max_episode_steps):
+            env.render()
+            action, _states = model.predict(obs, deterministic=True)
+            obs, reward, done, info = env.step(action)
+            if done:
+                break
+        env.close()
+        env.reset()
+        return True
+
+
 model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=f'{timestamp}/')
-model.learn(total_timesteps=1000000)
-model.save(f'{timestamp}/model')
-
-obs = env.reset()
-for _ in range(1000):
-    env.render()
-    action, _states = model.predict(obs, deterministic=True)
-    obs, reward, done, info = env.step(action)
-    if done:
-        break
-env.close()
+checkpoint_on_event = CheckpointCallback(save_freq=100000, save_path=f'{timestamp}/checkpoints/')
+record_env = RecordEnvCallback()
+plot_callback = EveryNTimesteps(n_steps=20000, callback=record_env)
+model.learn(total_timesteps=5000000, callback=[checkpoint_on_event, plot_callback])
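
For reference, a minimal sketch (not part of the patch) of how a checkpoint written by the new CheckpointCallback could be loaded and replayed for a single deterministic episode, mirroring what RecordEnvCallback does during training. It reuses env, timestamp and max_episode_steps from the script; the checkpoint file name assumes stable-baselines3's default 'rl_model_<steps>_steps' prefix, so adjust it to whatever files actually appear in the checkpoints directory.

    # Sketch only: load a saved checkpoint and replay one deterministic episode.
    # Assumes env, timestamp and max_episode_steps exist as in the script, and
    # that CheckpointCallback used its default name_prefix ('rl_model').
    from stable_baselines3 import PPO

    model = PPO.load(f'{timestamp}/checkpoints/rl_model_100000_steps', env=env)

    obs = env.reset()
    for _ in range(max_episode_steps):
        env.render()
        action, _states = model.predict(obs, deterministic=True)
        obs, reward, done, info = env.step(action)
        if done:
            break
    env.close()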