
Commit

rmse calculation fix for ppo and sac
svsawant committed Aug 14, 2024
1 parent 0d9c780 commit 54fb070
Showing 8 changed files with 384 additions and 761 deletions.
1,000 changes: 295 additions & 705 deletions examples/rl/data_analysis.ipynb

Large diffs are not rendered by default.

43 changes: 22 additions & 21 deletions examples/rl/train_rl_model.sh
@@ -24,28 +24,28 @@ fi
# Removed the temporary data used to train the new unsafe model.
# rm -r -f ./${ALGO}_data_2/

if [ "$ALGO" == 'safe_explorer_ppo' ]; then
# Pretrain the unsafe controller/agent.
python3 ../../safe_control_gym/experiments/train_rl_controller.py \
--algo ${ALGO} \
--task ${SYS_NAME} \
--overrides \
./config_overrides/${SYS}/${ALGO}_${SYS}_pretrain.yaml \
./config_overrides/${SYS}/${SYS}_${TASK}.yaml \
--output_dir ./unsafe_rl_temp_data/ \
--seed 2 \
--kv_overrides \
task_config.init_state=None

# Move the newly trained unsafe model.
mv ./unsafe_rl_temp_data/model_latest.pt ./models/${ALGO}/${ALGO}_pretrain_${SYS}_${TASK}.pt

# Removed the temporary data used to train the new unsafe model.
rm -r -f ./unsafe_rl_temp_data/
fi
#if [ "$ALGO" == 'safe_explorer_ppo' ]; then
# # Pretrain the unsafe controller/agent.
# python3 ../../safe_control_gym/experiments/train_rl_controller.py \
# --algo ${ALGO} \
# --task ${SYS_NAME} \
# --overrides \
# ./config_overrides/${SYS}/${ALGO}_${SYS}_pretrain.yaml \
# ./config_overrides/${SYS}/${SYS}_${TASK}.yaml \
# --output_dir ./unsafe_rl_temp_data/ \
# --seed 2 \
# --kv_overrides \
# task_config.init_state=None
#
# # Move the newly trained unsafe model.
# mv ./unsafe_rl_temp_data/model_latest.pt ./models/${ALGO}/${ALGO}_pretrain_${SYS}_${TASK}.pt
#
# # Removed the temporary data used to train the new unsafe model.
# rm -r -f ./unsafe_rl_temp_data/
#fi

# Train the unsafe controller/agent.
for SEED in {0..0}
for SEED in {1..1}
do
python3 ../../safe_control_gym/experiments/train_rl_controller.py \
--algo ${ALGO} \
@@ -56,7 +56,8 @@ do
--output_dir ./Results/${SYS}_${ALGO}_data/${SEED}/ \
--seed ${SEED} \
--kv_overrides \
task_config.randomized_init=True
task_config.randomized_init=True
# --pretrain_path ./models/${ALGO}/model_latest.pt
done

# Move the newly trained unsafe model.
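Note: the seed loop above now covers only a single seed (`{1..1}`). For reference, a minimal Python sketch of sweeping several seeds with the same training entry point is shown below. The variable values and override flags are assumptions for illustration (the full flag list, including `--overrides`, is abbreviated and would need to match the script), not verified against the repository.

```python
# Hypothetical multi-seed sweep mirroring the bash loop above.
# ALGO/SYS/SYS_NAME values are assumed; in the script they come from $ALGO, $SYS, $SYS_NAME.
import subprocess

ALGO = 'ppo'            # assumed
SYS = 'cartpole'        # assumed
SYS_NAME = 'cartpole'   # assumed

for seed in range(5):   # e.g. seeds 0..4 instead of the single seed in the script
    subprocess.run(
        ['python3', '../../safe_control_gym/experiments/train_rl_controller.py',
         '--algo', ALGO,
         '--task', SYS_NAME,
         # --overrides <config files> omitted here; see the script above.
         '--output_dir', f'./Results/{SYS}_{ALGO}_data/{seed}/',
         '--seed', str(seed),
         '--kv_overrides', 'task_config.randomized_init=True'],
        check=True)
```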
37 changes: 28 additions & 9 deletions safe_control_gym/controllers/ppo/ppo.py
@@ -153,13 +153,22 @@ def learn(self,
):
"""Performs learning (pre-training, training, fine-tuning, etc.)."""

# Initial Evaluation.
eval_results = self.run(env=self.eval_env, n_episodes=self.eval_batch_size)
self.logger.info('Eval | ep_lengths {:.2f} +/- {:.2f} | ep_return {:.3f} +/- {:.3f}'.format(
eval_results['ep_lengths'].mean(),
eval_results['ep_lengths'].std(),
eval_results['ep_returns'].mean(),
eval_results['ep_returns'].std()))

if self.num_checkpoints > 0:
step_interval = np.linspace(0, self.max_env_steps, self.num_checkpoints)
interval_save = np.zeros_like(step_interval, dtype=bool)
while self.total_steps < self.max_env_steps:
results = self.train_step()
# Checkpoint.
if self.total_steps >= self.max_env_steps or (self.save_interval and self.total_steps % self.save_interval == 0):
if (self.total_steps >= self.max_env_steps
or (self.save_interval and self.total_steps % self.save_interval == 0)):
# Latest/final checkpoint.
self.save(self.checkpoint_path)
self.logger.info(f'Checkpoint | {self.checkpoint_path}')
@@ -176,10 +185,11 @@ def learn(self,
if self.eval_interval and self.total_steps % self.eval_interval == 0:
eval_results = self.run(env=self.eval_env, n_episodes=self.eval_batch_size)
results['eval'] = eval_results
self.logger.info('Eval | ep_lengths {:.2f} +/- {:.2f} | ep_return {:.3f} +/- {:.3f}'.format(eval_results['ep_lengths'].mean(),
eval_results['ep_lengths'].std(),
eval_results['ep_returns'].mean(),
eval_results['ep_returns'].std()))
self.logger.info('Eval | ep_lengths {:.2f} +/- {:.2f} | ep_return {:.3f} +/- {:.3f}'.format(
eval_results['ep_lengths'].mean(),
eval_results['ep_lengths'].std(),
eval_results['ep_returns'].mean(),
eval_results['ep_returns'].std()))
# Save best model.
eval_score = eval_results['ep_returns'].mean()
eval_best_score = getattr(self, 'eval_best_score', -np.infty)
@@ -209,7 +219,7 @@ def select_action(self, obs, info=None):
def run(self,
env=None,
render=False,
n_episodes=50,
n_episodes=1,
verbose=False,
):
"""Runs evaluation with current policy."""
@@ -229,24 +239,31 @@ def run(self,
obs = self.obs_normalizer(obs)
ep_returns, ep_lengths, eval_return = [], [], 0.0
frames = []
mse, ep_rmse_mean, ep_rmse_std = [], [], []
while len(ep_returns) < n_episodes:
action = self.select_action(obs=obs, info=info)
obs, _, done, info = env.step(action)
mse.append(info["mse"])
if render:
env.render()
frames.append(env.render('rgb_array'))
if verbose:
print(f'obs {obs} | act {action}')
if done:
assert 'episode' in info
ep_rmse_mean.append(np.array(mse).mean()**0.5)
ep_rmse_std.append(np.array(mse).std()**0.5)
mse = []
ep_returns.append(info['episode']['r'])
ep_lengths.append(info['episode']['l'])
obs, _ = env.reset()
obs = self.obs_normalizer(obs)
# Collect evaluation results.
ep_lengths = np.asarray(ep_lengths)
ep_returns = np.asarray(ep_returns)
eval_results = {'ep_returns': ep_returns, 'ep_lengths': ep_lengths}
eval_results = {'ep_returns': ep_returns, 'ep_lengths': ep_lengths,
'rmse': np.array(ep_rmse_mean).mean(),
'rmse_std': np.array(ep_rmse_std).mean()}
if len(frames) > 0:
eval_results['frames'] = frames
# Other episodic stats from evaluation env.
@@ -344,15 +361,17 @@ def log_step(self,
eval_ep_lengths = results['eval']['ep_lengths']
eval_ep_returns = results['eval']['ep_returns']
eval_constraint_violation = results['eval']['constraint_violation']
eval_mse = results['eval']['mse']
eval_rmse = results['eval']['rmse']
eval_rmse_std = results['eval']['rmse_std']
self.logger.add_scalars(
{
'ep_length': eval_ep_lengths.mean(),
'ep_return': eval_ep_returns.mean(),
'ep_return_std': eval_ep_returns.std(),
'ep_reward': (eval_ep_returns / eval_ep_lengths).mean(),
'constraint_violation': eval_constraint_violation.mean(),
'mse': eval_mse.mean()
'rmse': eval_rmse,
'rmse_std': eval_rmse_std
},
step,
prefix='stat_eval')
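The core of this commit is the switch in `run()` from logging the raw per-step `mse` values to per-episode RMSE statistics, which are then averaged into the `rmse` and `rmse_std` scalars in `log_step()`. Below is a standalone sketch of that aggregation, mirroring the new PPO/SAC logic; the function name and data layout are illustrative, only numpy is assumed.

```python
import numpy as np

def aggregate_eval_rmse(per_episode_mse):
    """per_episode_mse: list with one inner list of per-step MSE values per episode."""
    ep_rmse_mean, ep_rmse_std = [], []
    for mse in per_episode_mse:
        mse = np.asarray(mse)
        ep_rmse_mean.append(mse.mean() ** 0.5)  # sqrt of the episode's mean MSE
        ep_rmse_std.append(mse.std() ** 0.5)    # sqrt of the episode's MSE spread, as in the diff
    # The logged scalars are averages of the per-episode values, not a pooled RMSE.
    return {'rmse': np.mean(ep_rmse_mean), 'rmse_std': np.mean(ep_rmse_std)}

# Example: two evaluation episodes with per-step MSE values.
print(aggregate_eval_rmse([[0.01, 0.04, 0.04], [0.09, 0.16]]))
```

The same pattern is added to `sac.py` further down in this commit.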
@@ -1,4 +1,4 @@
'''Model Predictive Control.'''
"""Q learning for Model Predictive Control."""

from copy import deepcopy

@@ -14,7 +14,7 @@


class Qlearning_MPC(BaseController):
'''MPC with full nonlinear model.'''
"""MPC with full nonlinear model."""

def __init__(
self,
@@ -34,7 +34,7 @@ def __init__(
seed: int = 0,
**kwargs
):
'''Creates task and controller.
"""Creates task and controller.
Args:
env_func (Callable): function to instantiate task/environment.
@@ -49,7 +49,7 @@ def __init__(
additional_constraints (list): List of additional constraints
use_gpu (bool): False (use cpu) True (use cuda).
seed (int): random seed.
'''
"""
super().__init__(env_func, output_dir, use_gpu, seed, **kwargs)
for k, v in locals().items():
if k != 'self' and k != 'kwargs' and '__' not in k:
@@ -88,21 +88,21 @@ def __init__(
def add_constraints(self,
constraints
):
'''Add the constraints (from a list) to the system.
"""Add the constraints (from a list) to the system.
Args:
constraints (list): List of constraints controller is subject too.
'''
"""
self.constraints, self.state_constraints_sym, self.input_constraints_sym = reset_constraints(constraints + self.constraints.constraints)

def remove_constraints(self,
constraints
):
'''Remove constraints from the current constraint list.
"""Remove constraints from the current constraint list.
Args:
constraints (list): list of constraints to be removed.
'''
"""
old_constraints_list = self.constraints.constraints
for constraint in constraints:
assert constraint in self.constraints.constraints, \
@@ -111,11 +111,11 @@ def remove_constraints(self,
self.constraints, self.state_constraints_sym, self.input_constraints_sym = reset_constraints(old_constraints_list)

def close(self):
'''Cleans up resources.'''
"""Cleans up resources."""
self.env.close()

def reset(self):
'''Prepares for training or evaluation.'''
"""Prepares for training or evaluation."""
# Setup reference input.
if self.env.TASK == Task.STABILIZATION:
self.mode = 'stabilization'
@@ -136,7 +136,7 @@ def reset(self):
self.setup_results_dict()

def set_dynamics_func(self):
'''Updates symbolic dynamics with actual control frequency.'''
"""Updates symbolic dynamics with actual control frequency."""
# self.dynamics_func = cs.integrator('fd', 'rk',
# {
# 'x': self.model.x_sym,
@@ -150,7 +150,7 @@ def set_dynamics_func(self):
self.dt)

def compute_initial_guess(self, init_state, goal_states, x_lin, u_lin):
'''Use LQR to get an initial guess of the '''
"""Use LQR to get an initial guess of the """
dfdxdfdu = self.model.df_func(x=x_lin, u=u_lin)
dfdx = dfdxdfdu['dfdx'].toarray()
dfdu = dfdxdfdu['dfdu'].toarray()
@@ -168,7 +168,7 @@ def compute_initial_guess(self, init_state, goal_states, x_lin, u_lin):
return x_guess, u_guess

def setup_optimizer(self):
'''Sets up nonlinear optimization problem.'''
"""Sets up nonlinear optimization problem."""
nx, nu = self.model.nx, self.model.nu
T = self.T
# Define optimizer and variables.
@@ -253,15 +253,15 @@ def select_action(self,
obs,
info=None
):
'''Solves nonlinear mpc problem to get next action.
"""Solves nonlinear mpc problem to get next action.
Args:
obs (ndarray): Current state/observation.
info (dict): Current info
Returns:
action (ndarray): Input/action to the task/env.
'''
"""

opti_dict = self.opti_dict
opti = opti_dict['opti']
@@ -308,7 +308,7 @@ def select_action(self,
return action

def get_references(self):
'''Constructs reference states along mpc horizon.(nx, T+1).'''
"""Constructs reference states along mpc horizon.(nx, T+1)."""
if self.env.TASK == Task.STABILIZATION:
# Repeat goal state for horizon steps.
goal_states = np.tile(self.env.X_GOAL.reshape(-1, 1), (1, self.T + 1))
@@ -326,7 +326,7 @@ def get_references(self):
return goal_states # (nx, T+1).

def setup_results_dict(self):
'''Setup the results dictionary to store run information.'''
"""Setup the results dictionary to store run information."""
self.results_dict = {'obs': [],
'reward': [],
'done': [],
@@ -350,15 +350,15 @@ def run(self,
max_steps=None,
terminate_run_on_done=None
):
'''Runs evaluation with current policy.
"""Runs evaluation with current policy.
Args:
render (bool): if to do real-time rendering.
logging (bool): if to log on terminal.
Returns:
dict: evaluation statistics, rendered frames.
'''
"""
if env is None:
env = self.env
if terminate_run_on_done is None:
@@ -440,16 +440,17 @@ def run(self,
self.results_dict['total_rmse_state_error'] = compute_state_rmse(self.results_dict['state'])
self.results_dict['total_rmse_obs_error'] = compute_state_rmse(self.results_dict['obs'])
except ValueError:
raise Exception('[ERROR] mpc.run().py: MPC could not find a solution for the first step given the initial conditions. '
raise Exception('[ERROR] mpc.run().py: MPC could not find a solution for '
'the first step given the initial conditions. '
'Check to make sure initial conditions are feasible.')
return deepcopy(self.results_dict)

def reset_before_run(self, obs, info=None, env=None):
'''Reinitialize just the controller before a new run.
"""Reinitialize just the controller before a new run.
Args:
obs (ndarray): The initial observation for the new run.
info (dict): The first info of the new run.
env (BenchmarkEnv): The environment to be used for the new run.
'''
"""
self.reset()
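`compute_initial_guess` is only partially shown above; its docstring indicates it uses LQR to warm-start the nonlinear optimizer. A generic sketch of that idea is given below, assuming a discrete-time linearization (A, B) about (x_lin, u_lin) and quadratic weights Q, R; deviation coordinates about the linearization point are glossed over, and this is not the repository's exact implementation.

```python
import numpy as np
from scipy.linalg import solve_discrete_are

def lqr_initial_guess(A, B, Q, R, x0, goal_states, T):
    """Roll out a discrete-time LQR policy to seed x_guess/u_guess over an MPC horizon.

    A, B: discrete-time linearization (however the repository discretizes df_func's Jacobians).
    goal_states: (nx, T+1) reference trajectory, e.g. from get_references().
    """
    P = solve_discrete_are(A, B, Q, R)
    # Discrete LQR gain: K = (R + B^T P B)^{-1} B^T P A, applied as u = -K (x - x_ref).
    K = np.linalg.solve(R + B.T @ P @ B, B.T @ P @ A)
    nx, nu = A.shape[0], B.shape[1]
    x_guess = np.zeros((nx, T + 1))
    u_guess = np.zeros((nu, T))
    x_guess[:, 0] = x0
    for t in range(T):
        u_guess[:, t] = -K @ (x_guess[:, t] - goal_states[:, t])
        x_guess[:, t + 1] = A @ x_guess[:, t] + B @ u_guess[:, t]
    return x_guess, u_guess
```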
16 changes: 12 additions & 4 deletions safe_control_gym/controllers/sac/sac.py
@@ -236,11 +236,12 @@ def run(self, env=None, render=False, n_episodes=10, verbose=False, **kwargs):
obs = self.obs_normalizer(obs)
ep_returns, ep_lengths = [], []
frames = []

mse, ep_rmse_mean, ep_rmse_std = [], [], []
while len(ep_returns) < n_episodes:
action = self.select_action(obs=obs, info=info)

obs, _, done, info = env.step(action)
mse.append(info["mse"])
if render:
env.render()
frames.append(env.render('rgb_array'))
Expand All @@ -249,6 +250,9 @@ def run(self, env=None, render=False, n_episodes=10, verbose=False, **kwargs):

if done:
assert 'episode' in info
ep_rmse_mean.append(np.array(mse).mean()**0.5)
ep_rmse_std.append(np.array(mse).std()**0.5)
mse = []
ep_returns.append(info['episode']['r'])
ep_lengths.append(info['episode']['l'])
obs, info = env.reset()
Expand All @@ -257,7 +261,9 @@ def run(self, env=None, render=False, n_episodes=10, verbose=False, **kwargs):
# collect evaluation results
ep_lengths = np.asarray(ep_lengths)
ep_returns = np.asarray(ep_returns)
eval_results = {'ep_returns': ep_returns, 'ep_lengths': ep_lengths}
eval_results = {'ep_returns': ep_returns, 'ep_lengths': ep_lengths,
'rmse': np.array(ep_rmse_mean).mean(),
'rmse_std': np.array(ep_rmse_std).mean()}
if len(frames) > 0:
eval_results['frames'] = frames
# Other episodic stats from evaluation env.
@@ -383,15 +389,17 @@ def log_step(self, results):
eval_ep_lengths = results['eval']['ep_lengths']
eval_ep_returns = results['eval']['ep_returns']
eval_constraint_violation = results['eval']['constraint_violation']
eval_mse = results['eval']['mse']
eval_rmse = results['eval']['rmse']
eval_rmse_std = results['eval']['rmse_std']
self.logger.add_scalars(
{
'ep_length': eval_ep_lengths.mean(),
'ep_return': eval_ep_returns.mean(),
'ep_return_std': eval_ep_returns.std(),
'ep_reward': (eval_ep_returns / eval_ep_lengths).mean(),
'constraint_violation': eval_constraint_violation.mean(),
'mse': eval_mse.mean()
'rmse': eval_rmse,
'rmse_std': eval_rmse_std
},
step,
prefix='stat_eval')
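One design note that applies to both ppo.py and sac.py above: the logged `rmse` is the mean of per-episode RMSE values, which is generally not the same as the RMSE of all evaluation steps pooled together. A small numeric illustration (numpy only; the MSE values are made up):

```python
import numpy as np

ep1 = np.array([0.04, 0.09, 0.01])   # per-step MSE, episode 1
ep2 = np.array([0.16, 0.04])         # per-step MSE, episode 2

per_episode_rmse = [np.sqrt(ep.mean()) for ep in (ep1, ep2)]
mean_of_episode_rmse = np.mean(per_episode_rmse)          # what the diff logs as 'rmse'
pooled_rmse = np.sqrt(np.concatenate([ep1, ep2]).mean())  # alternative: pool all steps first

print(mean_of_episode_rmse, pooled_rmse)  # ~0.266 vs ~0.261: the two aggregations differ
```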
