From 88415629e919dae55f0837bd4fdeebbc302e3726 Mon Sep 17 00:00:00 2001
From: Shambhuraj Sawant
Date: Fri, 3 Nov 2023 17:27:23 +0100
Subject: [PATCH] initial commit for rlmpc

---
 examples/rl/rl_experiment.sh                  |   4 +-
 .../cartpole/cartpole_stab.yaml               |  72 +++
 .../cartpole/qlearning_mpc_cartpole.yaml      |   1 +
 examples/rlmpc/rlmpc_experiment.py            |  52 ++
 examples/rlmpc/rlmpc_experiment.sh            |  22 +
 safe_control_gym/controllers/__init__.py      |   4 +
 .../controllers/mpc/qlearning_mpc.py          | 455 ++++++++++++++++++
 .../controllers/mpc/qlearning_mpc.yaml        |  13 +
 8 files changed, 621 insertions(+), 2 deletions(-)
 create mode 100644 examples/rlmpc/config_overrides/cartpole/cartpole_stab.yaml
 create mode 100644 examples/rlmpc/config_overrides/cartpole/qlearning_mpc_cartpole.yaml
 create mode 100644 examples/rlmpc/rlmpc_experiment.py
 create mode 100755 examples/rlmpc/rlmpc_experiment.sh
 create mode 100644 safe_control_gym/controllers/mpc/qlearning_mpc.py
 create mode 100644 safe_control_gym/controllers/mpc/qlearning_mpc.yaml

diff --git a/examples/rl/rl_experiment.sh b/examples/rl/rl_experiment.sh
index 203bc14e4..f646920ac 100755
--- a/examples/rl/rl_experiment.sh
+++ b/examples/rl/rl_experiment.sh
@@ -1,8 +1,8 @@
 #!/bin/bash
 
-# SYS='cartpole'
+SYS='cartpole'
 # SYS='quadrotor_2D'
-SYS='quadrotor_3D'
+# SYS='quadrotor_3D'
 
 # TASK='stab'
 TASK='track'
diff --git a/examples/rlmpc/config_overrides/cartpole/cartpole_stab.yaml b/examples/rlmpc/config_overrides/cartpole/cartpole_stab.yaml
new file mode 100644
index 000000000..bc1daacaf
--- /dev/null
+++ b/examples/rlmpc/config_overrides/cartpole/cartpole_stab.yaml
@@ -0,0 +1,72 @@
+task_config:
+  seed: 42
+  info_in_reset: True
+  ctrl_freq: 15
+  pyb_freq: 750
+  physics: pyb
+
+  # state initialization
+  init_state:
+    init_x: 0.1
+    init_x_dot: -1.5
+    init_theta: -0.175
+    init_theta_dot: 0.5
+  randomized_init: True
+  randomized_inertial_prop: False
+  normalized_rl_action_space: True
+
+  init_state_randomization_info:
+    init_x:
+      distrib: "uniform"
+      low: -1
+      high: 1
+    init_x_dot:
+      distrib: "uniform"
+      low: -0.1
+      high: 0.1
+    init_theta:
+      distrib: "uniform"
+      low: -0.2
+      high: 0.2
+    init_theta_dot:
+      distrib: "uniform"
+      low: -0.1
+      high: 0.1
+
+  task: stabilization
+  task_info:
+    stabilization_goal: [0.7, 0]
+    stabilization_goal_tolerance: 0.05
+
+  inertial_prop:
+    pole_length: 0.5
+    cart_mass: 1
+    pole_mass: 0.1
+
+  episode_len_sec: 10
+  cost: quadratic
+  obs_goal_horizon: 1
+
+  # RL Reward
+  rew_state_weight: [1, 1, 1, 1]
+  rew_act_weight: 0.1
+  rew_exponential: True
+
+  # constraints
+  constraints:
+    - constraint_form: default_constraint
+      constrained_variable: state
+      upper_bounds:
+        - 2
+        - 2
+        - 0.18
+        - 2
+      lower_bounds:
+        - -2
+        - -2
+        - -0.18
+        - -2
+    - constraint_form: default_constraint
+      constrained_variable: input
+  done_on_out_of_bound: True
+  done_on_violation: False
diff --git a/examples/rlmpc/config_overrides/cartpole/qlearning_mpc_cartpole.yaml b/examples/rlmpc/config_overrides/cartpole/qlearning_mpc_cartpole.yaml
new file mode 100644
index 000000000..8b1378917
--- /dev/null
+++ b/examples/rlmpc/config_overrides/cartpole/qlearning_mpc_cartpole.yaml
@@ -0,0 +1 @@
+
diff --git a/examples/rlmpc/rlmpc_experiment.py b/examples/rlmpc/rlmpc_experiment.py
new file mode 100644
index 000000000..d6b39932f
--- /dev/null
+++ b/examples/rlmpc/rlmpc_experiment.py
@@ -0,0 +1,52 @@
+'''This script tests the RL-MPC implementation.'''
+
+import os
+import shutil
+from functools import partial
+
+import matplotlib.pyplot as plt
+import numpy as np
+
+from safe_control_gym.envs.benchmark_env import Cost, Environment, Task
+from safe_control_gym.experiments.base_experiment import BaseExperiment
+from safe_control_gym.utils.configuration import ConfigFactory
+from safe_control_gym.utils.registration import make
+
+
+def run(plot=True, training=False, n_episodes=1, n_steps=None, curr_path='.'):
+    '''Main function to run RL-MPC experiments.
+
+    Args:
+        plot (bool): Whether to plot the results.
+        training (bool): Whether to train the RL-MPC controller or load pre-trained values.
+        n_episodes (int): The number of episodes to execute.
+        n_steps (int): How many steps to run the experiment.
+        curr_path (str): The current relative path to the experiment folder.
+    '''
+
+    # Create the configuration dictionary.
+    fac = ConfigFactory()
+    config = fac.merge()
+    system = config.task
+
+    # Create an environment.
+    env_func = partial(make,
+                       config.task,
+                       **config.task_config)
+    env = env_func()
+
+    # Setup controller.
+    ctrl = make(config.algo,
+                env_func,
+                **config.algo_config,
+                output_dir=curr_path + '/temp')
+
+    # Run the experiment.
+    experiment = BaseExperiment(env, ctrl)
+    results, metrics = experiment.run_evaluation(n_episodes=n_episodes, n_steps=n_steps)
+    elapsed_time = results['timestamp'][0][-1] - results['timestamp'][0][0]
+
+
+
+if __name__ == '__main__':
+    run()
diff --git a/examples/rlmpc/rlmpc_experiment.sh b/examples/rlmpc/rlmpc_experiment.sh
new file mode 100755
index 000000000..b63aac140
--- /dev/null
+++ b/examples/rlmpc/rlmpc_experiment.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+
+SYS='cartpole'
+# SYS='quadrotor_2D'
+
+TASK='stab'
+# TASK='track'
+
+ALGO='qlearning_mpc'
+
+if [ "$SYS" == 'cartpole' ]; then
+    SYS_NAME=$SYS
+else
+    SYS_NAME='quadrotor'
+fi
+
+# Run the RL-MPC experiment on the chosen system and task.
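+# Note: ConfigFactory merges the override file below on top of the algo defaults
+# registered in safe_control_gym/controllers/mpc/qlearning_mpc.yaml.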
+python3 ./rlmpc_experiment.py \
+    --task ${SYS_NAME} \
+    --algo ${ALGO} \
+    --overrides \
+        ./config_overrides/${SYS}/${SYS}_${TASK}.yaml
diff --git a/safe_control_gym/controllers/__init__.py b/safe_control_gym/controllers/__init__.py
index f08271727..192f5c4c8 100644
--- a/safe_control_gym/controllers/__init__.py
+++ b/safe_control_gym/controllers/__init__.py
@@ -49,3 +49,7 @@
 register(idx='rap',
          entry_point='safe_control_gym.controllers.rarl.rap:RAP',
          config_entry_point='safe_control_gym.controllers.rarl:rap.yaml')
+
+register(idx='qlearning_mpc',
+         entry_point='safe_control_gym.controllers.mpc.qlearning_mpc:Qlearning_MPC',
+         config_entry_point='safe_control_gym.controllers.mpc:qlearning_mpc.yaml')
diff --git a/safe_control_gym/controllers/mpc/qlearning_mpc.py b/safe_control_gym/controllers/mpc/qlearning_mpc.py
new file mode 100644
index 000000000..da8809f3b
--- /dev/null
+++ b/safe_control_gym/controllers/mpc/qlearning_mpc.py
@@ -0,0 +1,455 @@
+'''Q-learning MPC.'''
+
+from copy import deepcopy
+
+import casadi as cs
+import numpy as np
+
+from safe_control_gym.controllers.base_controller import BaseController
+from safe_control_gym.controllers.mpc.mpc_utils import (compute_discrete_lqr_gain_from_cont_linear_system,
+                                                        compute_state_rmse, get_cost_weight_matrix,
+                                                        reset_constraints, rk_discrete)
+from safe_control_gym.envs.benchmark_env import Task
+from safe_control_gym.envs.constraints import GENERAL_CONSTRAINTS, create_constraint_list
+
+
+class Qlearning_MPC(BaseController):
+    '''MPC with full nonlinear model.'''
+
+    def __init__(
+            self,
+            env_func,
+            horizon: int = 5,
+            q_mpc: list = [1],
+            r_mpc: list = [1],
+            warmstart: bool = True,
+            soft_constraints: bool = False,
+            terminate_run_on_done: bool = True,
+            constraint_tol: float = 1e-6,
+            # runner args
+            # shared/base args
+            output_dir: str = 'results/temp',
+            additional_constraints: list = None,
+            use_gpu: bool = False,
+            seed: int = 0,
+            **kwargs
+    ):
+        '''Creates task and controller.
+
+        Args:
+            env_func (Callable): Function to instantiate task/environment.
+            horizon (int): MPC planning horizon.
+            q_mpc (list): Diagonals of state cost weight.
+            r_mpc (list): Diagonals of input/action cost weight.
+            warmstart (bool): Whether to initialize from the previous iteration's solution.
+            soft_constraints (bool): Formulate the constraints as soft constraints.
+            terminate_run_on_done (bool): Terminate the run when the environment returns done or not.
+            constraint_tol (float): Tolerance added to the constraints, as solvers are sometimes not exact.
+            output_dir (str): Output directory to write logs and results.
+            additional_constraints (list): List of additional constraints.
+            use_gpu (bool): False (use cpu) True (use cuda).
+            seed (int): Random seed.
+        '''
+        super().__init__(env_func, output_dir, use_gpu, seed, **kwargs)
+        for k, v in locals().items():
+            if k != 'self' and k != 'kwargs' and '__' not in k:
+                self.__dict__.update({k: v})
+
+        # if prior_info is None:
+        #     self.prior_info = {}
+        # Task.
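+        # The environment instance below supplies the prior model, the constraint
+        # list and the task goal from which the MPC problem is constructed.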
+        self.env = env_func()
+        if additional_constraints is not None:
+            additional_ConstraintsList = create_constraint_list(additional_constraints,
+                                                                GENERAL_CONSTRAINTS,
+                                                                self.env)
+            self.additional_constraints = additional_ConstraintsList.constraints
+            self.constraints, self.state_constraints_sym, self.input_constraints_sym = reset_constraints(self.env.constraints.constraints + self.additional_constraints)
+        else:
+            self.constraints, self.state_constraints_sym, self.input_constraints_sym = reset_constraints(self.env.constraints.constraints)
+            self.additional_constraints = []
+        # Model parameters
+        self.model = self.get_prior(self.env)
+        self.dt = self.model.dt
+        self.T = horizon
+        self.Q = get_cost_weight_matrix(self.q_mpc, self.model.nx)
+        self.R = get_cost_weight_matrix(self.r_mpc, self.model.nu)
+        # self.prior_info = prior_info
+
+        self.soft_constraints = soft_constraints
+        self.warmstart = warmstart
+        self.terminate_run_on_done = terminate_run_on_done
+
+        # self.X_EQ = self.env.X_EQ
+        # self.U_EQ = self.env.U_EQ
+        # logging
+        # self.logger = ExperimentLogger(output_dir)
+
+    def add_constraints(self,
+                        constraints
+                        ):
+        '''Add the constraints (from a list) to the system.
+
+        Args:
+            constraints (list): List of constraints the controller is subject to.
+        '''
+        self.constraints, self.state_constraints_sym, self.input_constraints_sym = reset_constraints(constraints + self.constraints.constraints)
+
+    def remove_constraints(self,
+                           constraints
+                           ):
+        '''Remove constraints from the current constraint list.
+
+        Args:
+            constraints (list): List of constraints to be removed.
+        '''
+        old_constraints_list = self.constraints.constraints
+        for constraint in constraints:
+            assert constraint in self.constraints.constraints, \
+                ValueError('This constraint is not in the current list of constraints')
+            old_constraints_list.remove(constraint)
+        self.constraints, self.state_constraints_sym, self.input_constraints_sym = reset_constraints(old_constraints_list)
+
+    def close(self):
+        '''Cleans up resources.'''
+        self.env.close()
+
+    def reset(self):
+        '''Prepares for training or evaluation.'''
+        # Setup reference input.
+        if self.env.TASK == Task.STABILIZATION:
+            self.mode = 'stabilization'
+            self.x_goal = self.env.X_GOAL
+        elif self.env.TASK == Task.TRAJ_TRACKING:
+            self.mode = 'tracking'
+            self.traj = self.env.X_GOAL.T
+            # Step along the reference.
+            self.traj_step = 0
+        # Dynamics model.
+        self.set_dynamics_func()
+        # CasADi optimizer.
+        self.setup_optimizer()
+        # Previously solved states & inputs, useful for warm start.
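+        # (select_action() shifts these one step forward to warm-start the next solve.)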
+        self.x_prev = None
+        self.u_prev = None
+
+        self.setup_results_dict()
+
+    def set_dynamics_func(self):
+        '''Updates symbolic dynamics with actual control frequency.'''
+        # self.dynamics_func = cs.integrator('fd', 'rk',
+        #                                    {
+        #                                        'x': self.model.x_sym,
+        #                                        'p': self.model.u_sym,
+        #                                        'ode': self.model.x_dot
+        #                                    },
+        #                                    {'tf': self.dt})
+        self.dynamics_func = rk_discrete(self.model.fc_func,
+                                         self.model.nx,
+                                         self.model.nu,
+                                         self.dt)
+
+    def compute_initial_guess(self, init_state, goal_states, x_lin, u_lin):
+        '''Use LQR to get an initial guess of the state and input trajectories.'''
+        dfdxdfdu = self.model.df_func(x=x_lin, u=u_lin)
+        dfdx = dfdxdfdu['dfdx'].toarray()
+        dfdu = dfdxdfdu['dfdu'].toarray()
+        lqr_gain, _, _ = compute_discrete_lqr_gain_from_cont_linear_system(dfdx, dfdu, self.Q, self.R, self.dt)
+
+        x_guess = np.zeros((self.model.nx, self.T + 1))
+        u_guess = np.zeros((self.model.nu, self.T))
+        x_guess[:, 0] = init_state
+
+        for i in range(self.T):
+            u = lqr_gain @ (x_guess[:, i] - goal_states[:, i]) + u_lin
+            u_guess[:, i] = u
+            x_guess[:, i + 1, None] = self.dynamics_func(x0=x_guess[:, i], p=u)['xf'].toarray()
+
+        return x_guess, u_guess
+
+    def setup_optimizer(self):
+        '''Sets up nonlinear optimization problem.'''
+        nx, nu = self.model.nx, self.model.nu
+        T = self.T
+        # Define optimizer and variables.
+        opti = cs.Opti()
+        # States.
+        x_var = opti.variable(nx, T + 1)
+        # Inputs.
+        u_var = opti.variable(nu, T)
+        # Initial state.
+        x_init = opti.parameter(nx, 1)
+        # Reference (equilibrium point or trajectory, last step for terminal cost).
+        x_ref = opti.parameter(nx, T + 1)
+        # Add slack variables.
+        state_slack = opti.variable(len(self.state_constraints_sym))
+        input_slack = opti.variable(len(self.input_constraints_sym))
+
+        # cost (cumulative)
+        cost = 0
+        cost_func = self.model.loss
+        for i in range(T):
+            # Can ignore the first state cost since the first x_var == x_init.
+            cost += cost_func(x=x_var[:, i],
+                              u=u_var[:, i],
+                              Xr=x_ref[:, i],
+                              Ur=np.zeros((nu, 1)),
+                              Q=self.Q,
+                              R=self.R)['l']
+        # Terminal cost.
+        cost += cost_func(x=x_var[:, -1],
+                          u=np.zeros((nu, 1)),
+                          Xr=x_ref[:, -1],
+                          Ur=np.zeros((nu, 1)),
+                          Q=self.Q,
+                          R=self.R)['l']
+        # Constraints
+        for i in range(self.T):
+            # Dynamics constraints.
+            next_state = self.dynamics_func(x0=x_var[:, i], p=u_var[:, i])['xf']
+            opti.subject_to(x_var[:, i + 1] == next_state)
+
+            for sc_i, state_constraint in enumerate(self.state_constraints_sym):
+                if self.soft_constraints:
+                    opti.subject_to(state_constraint(x_var[:, i]) <= state_slack[sc_i])
+                    cost += 10000 * state_slack[sc_i]**2
+                    opti.subject_to(state_slack[sc_i] >= 0)
+                else:
+                    opti.subject_to(state_constraint(x_var[:, i]) < -self.constraint_tol)
+            for ic_i, input_constraint in enumerate(self.input_constraints_sym):
+                if self.soft_constraints:
+                    opti.subject_to(input_constraint(u_var[:, i]) <= input_slack[ic_i])
+                    cost += 10000 * input_slack[ic_i]**2
+                    opti.subject_to(input_slack[ic_i] >= 0)
+                else:
+                    opti.subject_to(input_constraint(u_var[:, i]) < -self.constraint_tol)
+
+        # Final state constraints.
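+        # The terminal state x_var[:, -1] is subject to the same state constraints,
+        # softened by the shared slack variables when soft_constraints is set.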
+        for sc_i, state_constraint in enumerate(self.state_constraints_sym):
+            if self.soft_constraints:
+                opti.subject_to(state_constraint(x_var[:, -1]) <= state_slack[sc_i])
+                cost += 10000 * state_slack[sc_i]**2
+                opti.subject_to(state_slack[sc_i] >= 0)
+            else:
+                opti.subject_to(state_constraint(x_var[:, -1]) <= -self.constraint_tol)
+        # initial condition constraints
+        opti.subject_to(x_var[:, 0] == x_init)
+
+        opti.minimize(cost)
+        # Create solver (IPOPT solver in this version).
+        # opts = {'ipopt.print_level': 0, 'ipopt.sb': 'yes', 'print_time': 0}
+        opts = {'expand': True}
+        opti.solver('ipopt', opts)
+        self.opti_dict = {
+            'opti': opti,
+            'x_var': x_var,
+            'u_var': u_var,
+            'x_init': x_init,
+            'x_ref': x_ref,
+            'cost': cost
+        }
+
+    def select_action(self,
+                      obs,
+                      info=None
+                      ):
+        '''Solves the nonlinear MPC problem to get the next action.
+
+        Args:
+            obs (ndarray): Current state/observation.
+            info (dict): Current info.
+
+        Returns:
+            action (ndarray): Input/action to the task/env.
+        '''
+
+        opti_dict = self.opti_dict
+        opti = opti_dict['opti']
+        x_var = opti_dict['x_var']
+        u_var = opti_dict['u_var']
+        x_init = opti_dict['x_init']
+        x_ref = opti_dict['x_ref']
+
+        # Assign the initial state.
+        opti.set_value(x_init, obs)
+        # Assign reference trajectory within horizon.
+        goal_states = self.get_references()
+        opti.set_value(x_ref, goal_states)
+        if self.mode == 'tracking':
+            self.traj_step += 1
+        # if self.warmstart and self.x_prev is None and self.u_prev is None:
+        #     x_guess, u_guess = self.compute_initial_guess(obs, goal_states, self.X_EQ, self.U_EQ)
+        #     opti.set_initial(x_var, x_guess)
+        #     opti.set_initial(u_var, u_guess)  # Initial guess for optimization problem.
+        # elif self.warmstart and self.x_prev is not None and self.u_prev is not None:
+        if self.warmstart and self.x_prev is not None and self.u_prev is not None:
+            # Shift previous solutions by 1 step along the time (last) axis.
+            x_guess = deepcopy(self.x_prev)
+            u_guess = deepcopy(self.u_prev)
+            x_guess[:, :-1] = x_guess[:, 1:]
+            u_guess[..., :-1] = u_guess[..., 1:]  # Works for both 1D (nu == 1) and 2D input sequences.
+            opti.set_initial(x_var, x_guess)
+            opti.set_initial(u_var, u_guess)
+        # Solve the optimization problem.
+        sol = opti.solve()
+        x_val, u_val = sol.value(x_var), sol.value(u_var)
+        self.x_prev = x_val
+        self.u_prev = u_val
+        self.results_dict['horizon_states'].append(deepcopy(self.x_prev))
+        self.results_dict['horizon_inputs'].append(deepcopy(self.u_prev))
+        self.results_dict['goal_states'].append(deepcopy(goal_states))
+        self.results_dict['t_wall'].append(opti.stats()['t_wall_total'])
+        # Take the first action from the solved action sequence.
+        if u_val.ndim > 1:
+            action = u_val[:, 0]
+        else:
+            action = np.array([u_val[0]])
+        self.prev_action = action
+        return action
+
+    def get_references(self):
+        '''Constructs reference states along the MPC horizon, with shape (nx, T+1).'''
+        if self.env.TASK == Task.STABILIZATION:
+            # Repeat goal state for horizon steps.
+            goal_states = np.tile(self.env.X_GOAL.reshape(-1, 1), (1, self.T + 1))
+        elif self.env.TASK == Task.TRAJ_TRACKING:
+            # Slice trajectory for horizon steps; if not long enough, repeat last state.
+            start = min(self.traj_step, self.traj.shape[-1])
+            end = min(self.traj_step + self.T + 1, self.traj.shape[-1])
+            remain = max(0, self.T + 1 - (end - start))
+            goal_states = np.concatenate([
+                self.traj[:, start:end],
+                np.tile(self.traj[:, -1:], (1, remain))
+            ], -1)
+        else:
+            raise Exception('Reference for this mode is not implemented.')
+        return goal_states  # (nx, T+1).
+
+    def setup_results_dict(self):
+        '''Sets up the results dictionary to store run information.'''
+        self.results_dict = {'obs': [],
+                             'reward': [],
+                             'done': [],
+                             'info': [],
+                             'action': [],
+                             'horizon_inputs': [],
+                             'horizon_states': [],
+                             'goal_states': [],
+                             'frames': [],
+                             'state_mse': [],
+                             'common_cost': [],
+                             'state': [],
+                             'state_error': [],
+                             't_wall': []
+                             }
+
+    def run(self,
+            env=None,
+            render=False,
+            logging=False,
+            max_steps=None,
+            terminate_run_on_done=None
+            ):
+        '''Runs evaluation with current policy.
+
+        Args:
+            render (bool): Whether to do real-time rendering.
+            logging (bool): Whether to log on terminal.
+
+        Returns:
+            dict: Evaluation statistics, rendered frames.
+        '''
+        if env is None:
+            env = self.env
+        if terminate_run_on_done is None:
+            terminate_run_on_done = self.terminate_run_on_done
+
+        self.x_prev = None
+        self.u_prev = None
+        if not env.initial_reset:
+            env.set_cost_function_param(self.Q, self.R)
+        # obs, info = env.reset()
+        obs = env.reset()
+        print('Init State:')
+        print(obs)
+        ep_returns, ep_lengths = [], []
+        frames = []
+        self.setup_results_dict()
+        self.results_dict['obs'].append(obs)
+        self.results_dict['state'].append(env.state)
+        i = 0
+        if env.TASK == Task.STABILIZATION:
+            if max_steps is None:
+                MAX_STEPS = int(env.CTRL_FREQ * env.EPISODE_LEN_SEC)
+            else:
+                MAX_STEPS = max_steps
+        elif env.TASK == Task.TRAJ_TRACKING:
+            if max_steps is None:
+                MAX_STEPS = self.traj.shape[1]
+            else:
+                MAX_STEPS = max_steps
+        else:
+            raise Exception('Undefined Task')
+        self.terminate_loop = False
+        done = False
+        common_metric = 0
+        while not (done and terminate_run_on_done) and i < MAX_STEPS and not (self.terminate_loop):
+            action = self.select_action(obs)
+            if self.terminate_loop:
+                print('Infeasible MPC Problem')
+                break
+            # Repeat input for more efficient control.
+            obs, reward, done, info = env.step(action)
+            self.results_dict['obs'].append(obs)
+            self.results_dict['reward'].append(reward)
+            self.results_dict['done'].append(done)
+            self.results_dict['info'].append(info)
+            self.results_dict['action'].append(action)
+            self.results_dict['state'].append(env.state)
+            self.results_dict['state_mse'].append(info['mse'])
+            # self.results_dict['state_error'].append(env.state - env.X_GOAL[i,:])
+
+            common_metric += info['mse']
+            print(i, '-th step.')
+            print(action)
+            print(obs)
+            print(reward)
+            print(done)
+            print(info)
+            print()
+            if render:
+                env.render()
+                frames.append(env.render('rgb_array'))
+            i += 1
+        # Collect evaluation results.
+        ep_lengths = np.asarray(ep_lengths)
+        ep_returns = np.asarray(ep_returns)
+        if logging:
+            msg = '****** Evaluation ******\n'
+            msg += 'eval_ep_length {:.2f} +/- {:.2f} | eval_ep_return {:.3f} +/- {:.3f}\n'.format(
+                ep_lengths.mean(), ep_lengths.std(), ep_returns.mean(),
+                ep_returns.std())
+        if len(frames) != 0:
+            self.results_dict['frames'] = frames
+        self.results_dict['obs'] = np.vstack(self.results_dict['obs'])
+        self.results_dict['state'] = np.vstack(self.results_dict['state'])
+        try:
+            self.results_dict['reward'] = np.vstack(self.results_dict['reward'])
+            self.results_dict['action'] = np.vstack(self.results_dict['action'])
+            self.results_dict['full_traj_common_cost'] = common_metric
+            self.results_dict['total_rmse_state_error'] = compute_state_rmse(self.results_dict['state'])
+            self.results_dict['total_rmse_obs_error'] = compute_state_rmse(self.results_dict['obs'])
+        except ValueError:
+            raise Exception('[ERROR] qlearning_mpc.run(): MPC could not find a solution for the first step given the initial conditions. '
+                            'Check to make sure initial conditions are feasible.')
+        return deepcopy(self.results_dict)
+
+    def reset_before_run(self, obs, info=None, env=None):
+        '''Reinitialize just the controller before a new run.
+
+        Args:
+            obs (ndarray): The initial observation for the new run.
+            info (dict): The first info of the new run.
+            env (BenchmarkEnv): The environment to be used for the new run.
+        '''
+        self.reset()
diff --git a/safe_control_gym/controllers/mpc/qlearning_mpc.yaml b/safe_control_gym/controllers/mpc/qlearning_mpc.yaml
new file mode 100644
index 000000000..407961e6b
--- /dev/null
+++ b/safe_control_gym/controllers/mpc/qlearning_mpc.yaml
@@ -0,0 +1,13 @@
+# MPC args
+horizon: 50
+r_mpc:
+  - 0.1
+q_mpc:
+  - 1.
+warmstart: True
+soft_constraints: False
+constraint_tol: 1.0e-6
+
+# Runner args
+deque_size: 10
+eval_batch_size: 10
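
For reference, a minimal usage sketch of the new controller outside the shell script
(not part of the patch; it assumes the 'qlearning_mpc' registration added above, the
default arguments from qlearning_mpc.yaml, and the cartpole task as configured here,
mirroring rlmpc_experiment.py):

    from functools import partial

    from safe_control_gym.experiments.base_experiment import BaseExperiment
    from safe_control_gym.utils.registration import make

    # Build the cartpole stabilization task and the Q-learning MPC controller
    # registered by this patch (kwargs would normally come from the yaml overrides).
    env_func = partial(make, 'cartpole', task='stabilization')
    env = env_func()
    ctrl = make('qlearning_mpc', env_func,
                horizon=50, q_mpc=[1.0], r_mpc=[0.1],  # defaults from qlearning_mpc.yaml
                output_dir='./temp')

    # Evaluate for one episode, as rlmpc_experiment.py does.
    experiment = BaseExperiment(env, ctrl)
    results, metrics = experiment.run_evaluation(n_episodes=1)
    ctrl.close()
    env.close()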