diff --git a/gym_pybullet_drones/examples/learn.py b/gym_pybullet_drones/examples/learn.py index b0cfb1336..de3765c1d 100644 --- a/gym_pybullet_drones/examples/learn.py +++ b/gym_pybullet_drones/examples/learn.py @@ -18,54 +18,149 @@ import argparse import gymnasium as gym import numpy as np +import torch from stable_baselines3 import PPO +from stable_baselines3.common.env_util import make_vec_env +from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold +from stable_baselines3.common.policies import ActorCriticPolicy as a2cppoMlpPolicy +from stable_baselines3.common.policies import ActorCriticCnnPolicy as a2cppoCnnPolicy +from stable_baselines3.common.evaluation import evaluate_policy from gym_pybullet_drones.utils.Logger import Logger from gym_pybullet_drones.envs.HoverAviary import HoverAviary from gym_pybullet_drones.envs.LeaderFollowerAviary import LeaderFollowerAviary from gym_pybullet_drones.utils.utils import sync, str2bool +from gym_pybullet_drones.utils.enums import ObservationType, ActionType DEFAULT_GUI = True DEFAULT_RECORD_VIDEO = False DEFAULT_OUTPUT_FOLDER = 'results' DEFAULT_COLAB = False +DEFAULT_ALGO = 'ppo' +DEFAULT_OBS = ObservationType('kin') +DEFAULT_ACT = ActionType('rpm') + def run(output_folder=DEFAULT_OUTPUT_FOLDER, gui=DEFAULT_GUI, plot=True, colab=DEFAULT_COLAB, record_video=DEFAULT_RECORD_VIDEO): + MULTI_AGENT = False + + sa_env_kwargs = dict(obs=DEFAULT_OBS, act=DEFAULT_ACT) + + if not MULTI_AGENT: + # train_env = gym.make('hover-aviary-v0') + train_env = make_vec_env(HoverAviary, + env_kwargs=sa_env_kwargs, + n_envs=2, + seed=0 + ) + eval_env = gym.make('hover-aviary-v0') + else: + train_env = gym.make('leaderfollower-aviary-v0') #### Check the environment's spaces ######################## - env = gym.make('hover-aviary-v0') - # env = gym.make('leaderfollower-aviary-v0') - print('[INFO] Action space:', env.action_space) - print('[INFO] Observation space:', env.observation_space) + print('[INFO] Action space:', train_env.action_space) + print('[INFO] Observation space:', train_env.observation_space) #### Train the model ####################################### - model = PPO('MlpPolicy', - env, + onpolicy_kwargs = dict(activation_fn=torch.nn.ReLU, + net_arch=[512, 512, dict(vf=[256, 128], pi=[256, 128])] + ) # or None + + # model = PPO('MlpPolicy', + # train_env, + # verbose=1 + # ) + model = PPO(a2cppoMlpPolicy, # or a2cppoCnnPolicy + train_env, + # policy_kwargs=onpolicy_kwargs, + # tensorboard_log=filename+'/tb/', verbose=1 ) - model.learn(total_timesteps=10000) # Typically not enough + callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=1000, + verbose=1 + ) + eval_callback = EvalCallback(eval_env, + callback_on_new_best=callback_on_best, + verbose=1, + # best_model_save_path=filename+'/', + # log_path=filename+'/', + eval_freq=int(2000), + deterministic=True, + render=False + ) + model.learn(total_timesteps=10000, #int(1e12), + callback=eval_callback, + log_interval=100, + ) + # model.learn(total_timesteps=10000) # Typically not enough + + #### Save the model ######################################## + # model.save(filename+'/success_model.zip') + # print(filename) + + # #### Print training progression ############################ + # with np.load(filename+'/evaluations.npz') as data: + # for j in range(data['timesteps'].shape[0]): + # print(str(data['timesteps'][j])+","+str(data['results'][j][0])) + + + + + + + + + # if os.path.isfile(exp+'/success_model.zip'): + # path = exp+'/success_model.zip' + # elif os.path.isfile(exp+'/best_model.zip'): + # path = exp+'/best_model.zip' + # else: + # print("[ERROR]: no model under the specified path", exp) + # model = PPO.load(path) + + #### Show (and record a video of) the model's performance ## - env = HoverAviary(gui=gui, + if not MULTI_AGENT: + test_env = HoverAviary(gui=gui, record=record_video ) - logger = Logger(logging_freq_hz=int(env.CTRL_FREQ), + test_env_nogui = HoverAviary() + logger = Logger(logging_freq_hz=int(test_env.CTRL_FREQ), num_drones=1, output_folder=output_folder, colab=colab ) - obs, info = env.reset(seed=42, options={}) + else: + test_env = LeaderFollowerAviary(gui=gui, + record=record_video + ) + test_env_nogui = LeaderFollowerAviary() + logger = Logger(logging_freq_hz=int(test_env.CTRL_FREQ), + num_drones=2, + output_folder=output_folder, + colab=colab + ) + + mean_reward, std_reward = evaluate_policy(model, + test_env_nogui, + n_eval_episodes=10 + ) + print("\n\n\nMean reward ", mean_reward, " +- ", std_reward, "\n\n") + + obs, info = test_env.reset(seed=42, options={}) start = time.time() - for i in range(3*env.CTRL_FREQ): + for i in range(3*test_env.CTRL_FREQ): action, _states = model.predict(obs, deterministic=True ) - obs, reward, terminated, truncated, info = env.step(action) + obs, reward, terminated, truncated, info = test_env.step(action) obs2 = obs.squeeze() act2 = action.squeeze() print("Obs:", obs, "\tAction", action, "\tReward:", reward, "\tTerminated:", terminated, "\tTruncated:", truncated) - logger.log(drone=0, - timestamp=i/env.CTRL_FREQ, + if not MULTI_AGENT: + logger.log(drone=0, + timestamp=i/test_env.CTRL_FREQ, state=np.hstack([obs2[0:3], np.zeros(4), obs2[3:15], @@ -73,12 +168,23 @@ def run(output_folder=DEFAULT_OUTPUT_FOLDER, gui=DEFAULT_GUI, plot=True, colab=D ]), control=np.zeros(12) ) - env.render() + else: + for d in range(2): + logger.log(drone=d, + timestamp=i/test_env.CTRL_FREQ, + state=np.hstack([obs2[d][0:3], + np.zeros(4), + obs2[d][3:15], + act2[d] + ]), + control=np.zeros(12) + ) + test_env.render() print(terminated) - sync(i, start, env.CTRL_TIMESTEP) + sync(i, start, test_env.CTRL_TIMESTEP) if terminated: - obs = env.reset(seed=42, options={}) - env.close() + obs = test_env.reset(seed=42, options={}) + test_env.close() if plot: logger.plot() diff --git a/gym_pybullet_drones/examples/learn2.py b/gym_pybullet_drones/examples/learn2.py deleted file mode 100644 index 8cd620cc5..000000000 --- a/gym_pybullet_drones/examples/learn2.py +++ /dev/null @@ -1,258 +0,0 @@ -"""Learning script for single agent problems. - -Agents are based on `stable_baselines3`'s implementation of A2C, PPO SAC, TD3, DDPG. - -Example -------- -To run the script, type in a terminal: - - $ python singleagent.py --env --algo --obs --act --cpu - -Notes ------ -Use: - - $ tensorboard --logdir ./results/save-----/tb/ - -To check the tensorboard results at: - - http://localhost:6006/ - -""" -import os -import time -from datetime import datetime -from sys import platform -import argparse -import subprocess -import numpy as np -import gymnasium as gym -import torch -from stable_baselines3.common.env_checker import check_env -from stable_baselines3.common.cmd_util import make_vec_env # Module cmd_util will be renamed to env_util https://github.com/DLR-RM/stable-baselines3/pull/197 -from stable_baselines3.common.vec_env import SubprocVecEnv, VecTransposeImage -from stable_baselines3.common.utils import set_random_seed -from stable_baselines3 import A2C -from stable_baselines3 import PPO -from stable_baselines3 import SAC -from stable_baselines3 import TD3 -from stable_baselines3.common.policies import ActorCriticPolicy as a2cppoMlpPolicy -from stable_baselines3.common.policies import ActorCriticCnnPolicy as a2cppoCnnPolicy -from stable_baselines3.sac.policies import SACPolicy as sacMlpPolicy -from stable_baselines3.sac import CnnPolicy as sacCnnPolicy -from stable_baselines3.td3 import MlpPolicy as td3ddpgMlpPolicy -from stable_baselines3.td3 import CnnPolicy as td3ddpgCnnPolicy -from stable_baselines3.common.callbacks import CheckpointCallback, EvalCallback, StopTrainingOnRewardThreshold - -from gym_pybullet_drones.envs.single_agent_rl.TakeoffAviary import TakeoffAviary -from gym_pybullet_drones.envs.single_agent_rl.HoverAviary import HoverAviary -from gym_pybullet_drones.envs.single_agent_rl.FlyThruGateAviary import FlyThruGateAviary -from gym_pybullet_drones.envs.single_agent_rl.TuneAviary import TuneAviary -from gym_pybullet_drones.envs.single_agent_rl.BaseSingleAgentAviary import ActionType, ObservationType - -import shared_constants - -EPISODE_REWARD_THRESHOLD = -0 # Upperbound: rewards are always negative, but non-zero -"""float: Reward threshold to halt the script.""" - -DEFAULT_ENV = 'hover' -DEFAULT_ALGO = 'ppo' -DEFAULT_OBS = ObservationType('kin') -DEFAULT_ACT = ActionType('one_d_rpm') -DEFAULT_CPU = 1 -DEFAULT_STEPS = 35000 -DEFAULT_OUTPUT_FOLDER = 'results' - -def run( - env=DEFAULT_ENV, - algo=DEFAULT_ALGO, - obs=DEFAULT_OBS, - act=DEFAULT_ACT, - cpu=DEFAULT_CPU, - steps=DEFAULT_STEPS, - output_folder=DEFAULT_OUTPUT_FOLDER -): - - #### Save directory ######################################## - filename = os.path.join(output_folder, 'save-'+env+'-'+algo+'-'+obs.value+'-'+act.value+'-'+datetime.now().strftime("%m.%d.%Y_%H.%M.%S")) - if not os.path.exists(filename): - os.makedirs(filename+'/') - - #### Print out current git commit hash ##################### - if (platform == "linux" or platform == "darwin") and ('GITHUB_ACTIONS' not in os.environ.keys()): - git_commit = subprocess.check_output(["git", "describe", "--tags"]).strip() - with open(filename+'/git_commit.txt', 'w+') as f: - f.write(str(git_commit)) - - #### Warning ############################################### - if env == 'tune' and act != ActionType.TUN: - print("\n\n\n[WARNING] TuneAviary is intended for use with ActionType.TUN\n\n\n") - if act == ActionType.ONE_D_RPM or act == ActionType.ONE_D_DYN or act == ActionType.ONE_D_PID: - print("\n\n\n[WARNING] Simplified 1D problem for debugging purposes\n\n\n") - #### Errors ################################################ - if not env in ['takeoff', 'hover']: - print("[ERROR] 1D action space is only compatible with Takeoff and HoverAviary") - exit() - if act == ActionType.TUN and env != 'tune' : - print("[ERROR] ActionType.TUN is only compatible with TuneAviary") - exit() - if algo in ['sac', 'td3', 'ddpg'] and cpu!=1: - print("[ERROR] The selected algorithm does not support multiple environments") - exit() - - #### Uncomment to debug slurm scripts ###################### - # exit() - - env_name = env+"-aviary-v0" - sa_env_kwargs = dict(aggregate_phy_steps=shared_constants.AGGR_PHY_STEPS, obs=obs, act=act) - # train_env = gym.make(env_name, aggregate_phy_steps=shared_constants.AGGR_PHY_STEPS, obs=obs, act=act) # single environment instead of a vectorized one - if env_name == "takeoff-aviary-v0": - train_env = make_vec_env(TakeoffAviary, - env_kwargs=sa_env_kwargs, - n_envs=cpu, - seed=0 - ) - if env_name == "hover-aviary-v0": - train_env = make_vec_env(HoverAviary, - env_kwargs=sa_env_kwargs, - n_envs=cpu, - seed=0 - ) - if env_name == "flythrugate-aviary-v0": - train_env = make_vec_env(FlyThruGateAviary, - env_kwargs=sa_env_kwargs, - n_envs=cpu, - seed=0 - ) - if env_name == "tune-aviary-v0": - train_env = make_vec_env(TuneAviary, - env_kwargs=sa_env_kwargs, - n_envs=cpu, - seed=0 - ) - print("[INFO] Action space:", train_env.action_space) - print("[INFO] Observation space:", train_env.observation_space) - # check_env(train_env, warn=True, skip_render_check=True) - - #### On-policy algorithms ################################## - onpolicy_kwargs = dict(activation_fn=torch.nn.ReLU, - net_arch=[512, 512, dict(vf=[256, 128], pi=[256, 128])] - ) # or None - if algo == 'a2c': - model = A2C(a2cppoMlpPolicy, - train_env, - policy_kwargs=onpolicy_kwargs, - tensorboard_log=filename+'/tb/', - verbose=1 - ) if obs == ObservationType.KIN else A2C(a2cppoCnnPolicy, - train_env, - policy_kwargs=onpolicy_kwargs, - tensorboard_log=filename+'/tb/', - verbose=1 - ) - if algo == 'ppo': - model = PPO(a2cppoMlpPolicy, - train_env, - policy_kwargs=onpolicy_kwargs, - tensorboard_log=filename+'/tb/', - verbose=1 - ) if obs == ObservationType.KIN else PPO(a2cppoCnnPolicy, - train_env, - policy_kwargs=onpolicy_kwargs, - tensorboard_log=filename+'/tb/', - verbose=1 - ) - - #### Off-policy algorithms ################################# - offpolicy_kwargs = dict(activation_fn=torch.nn.ReLU, - net_arch=[512, 512, 256, 128] - ) # or None # or dict(net_arch=dict(qf=[256, 128, 64, 32], pi=[256, 128, 64, 32])) - if algo == 'sac': - model = SAC(sacMlpPolicy, - train_env, - policy_kwargs=offpolicy_kwargs, - tensorboard_log=filename+'/tb/', - verbose=1 - ) if obs==ObservationType.KIN else SAC(sacCnnPolicy, - train_env, - policy_kwargs=offpolicy_kwargs, - tensorboard_log=filename+'/tb/', - verbose=1 - ) - if algo == 'td3': - model = TD3(td3ddpgMlpPolicy, - train_env, - policy_kwargs=offpolicy_kwargs, - tensorboard_log=filename+'/tb/', - verbose=1 - ) if obs==ObservationType.KIN else TD3(td3ddpgCnnPolicy, - train_env, - policy_kwargs=offpolicy_kwargs, - tensorboard_log=filename+'/tb/', - verbose=1 - ) - - #### Create eveluation environment ######################### - if obs == ObservationType.KIN: - eval_env = gym.make(env_name, - aggregate_phy_steps=shared_constants.AGGR_PHY_STEPS, - obs=obs, - act=act - ) - elif obs == ObservationType.RGB: - if env_name == "hover-aviary-v0": - eval_env = make_vec_env(HoverAviary, - env_kwargs=sa_env_kwargs, - n_envs=1, - seed=0 - ) - if env_name == "leaderfollower-aviary-v0": - eval_env = make_vec_env(LeaderFollowerAviary, - env_kwargs=sa_env_kwargs, - n_envs=1, - seed=0 - ) - eval_env = VecTransposeImage(eval_env) - - #### Train the model ####################################### - # checkpoint_callback = CheckpointCallback(save_freq=1000, save_path=filename+'-logs/', name_prefix='rl_model') - callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=EPISODE_REWARD_THRESHOLD, - verbose=1 - ) - eval_callback = EvalCallback(eval_env, - callback_on_new_best=callback_on_best, - verbose=1, - best_model_save_path=filename+'/', - log_path=filename+'/', - eval_freq=int(2000/cpu), - deterministic=True, - render=False - ) - model.learn(total_timesteps=steps, #int(1e12), - callback=eval_callback, - log_interval=100, - ) - - #### Save the model ######################################## - model.save(filename+'/success_model.zip') - print(filename) - - #### Print training progression ############################ - with np.load(filename+'/evaluations.npz') as data: - for j in range(data['timesteps'].shape[0]): - print(str(data['timesteps'][j])+","+str(data['results'][j][0])) - - -if __name__ == "__main__": - #### Define and parse (optional) arguments for the script ## - parser = argparse.ArgumentParser(description='Single agent reinforcement learning experiments script') - parser.add_argument('--env', default=DEFAULT_ENV, type=str, choices=['takeoff', 'hover', 'flythrugate', 'tune'], help='Task (default: hover)', metavar='') - parser.add_argument('--algo', default=DEFAULT_ALGO, type=str, choices=['a2c', 'ppo', 'sac', 'td3'], help='RL agent (default: ppo)', metavar='') - parser.add_argument('--obs', default=DEFAULT_OBS, type=ObservationType, help='Observation space (default: kin)', metavar='') - parser.add_argument('--act', default=DEFAULT_ACT, type=ActionType, help='Action space (default: one_d_rpm)', metavar='') - parser.add_argument('--cpu', default=DEFAULT_CPU, type=int, help='Number of training environments (default: 1)', metavar='') - parser.add_argument('--steps', default=DEFAULT_STEPS, type=int, help='Number of training time steps (default: 35000)', metavar='') - parser.add_argument('--output_folder', default=DEFAULT_OUTPUT_FOLDER, type=str, help='Folder where to save logs (default: "results")', metavar='') - ARGS = parser.parse_args() - - run(**vars(ARGS)) diff --git a/gym_pybullet_drones/examples/play_policy.py b/gym_pybullet_drones/examples/play_policy.py deleted file mode 100644 index dc7e8c033..000000000 --- a/gym_pybullet_drones/examples/play_policy.py +++ /dev/null @@ -1,141 +0,0 @@ -"""Test script for single agent problems. - -This scripts runs the best model found by one of the executions of `singleagent.py` - -Example -------- -To run the script, type in a terminal: - - $ python test_singleagent.py --exp ./results/save----- - -""" -import os -import time -from datetime import datetime -import argparse -import re -import numpy as np -import gym -import torch -from stable_baselines3.common.env_checker import check_env -from stable_baselines3 import A2C -from stable_baselines3 import PPO -from stable_baselines3 import SAC -from stable_baselines3 import TD3 -from stable_baselines3 import DDPG -from stable_baselines3.common.policies import ActorCriticPolicy as a2cppoMlpPolicy -from stable_baselines3.common.policies import ActorCriticCnnPolicy as a2cppoCnnPolicy -from stable_baselines3.sac.policies import SACPolicy as sacMlpPolicy -from stable_baselines3.sac import CnnPolicy as sacCnnPolicy -from stable_baselines3.td3 import MlpPolicy as td3ddpgMlpPolicy -from stable_baselines3.td3 import CnnPolicy as td3ddpgCnnPolicy -from stable_baselines3.common.evaluation import evaluate_policy - -from gym_pybullet_drones.utils.utils import sync -from gym_pybullet_drones.utils.Logger import Logger -from gym_pybullet_drones.envs.single_agent_rl.TakeoffAviary import TakeoffAviary -from gym_pybullet_drones.envs.single_agent_rl.HoverAviary import HoverAviary -from gym_pybullet_drones.envs.single_agent_rl.FlyThruGateAviary import FlyThruGateAviary -from gym_pybullet_drones.envs.single_agent_rl.TuneAviary import TuneAviary -from gym_pybullet_drones.envs.single_agent_rl.BaseSingleAgentAviary import ActionType, ObservationType -from gym_pybullet_drones.utils.utils import sync, str2bool - -import shared_constants - -DEFAULT_GUI = True -DEFAULT_PLOT = True -DEFAULT_OUTPUT_FOLDER = 'results' - -def run(exp, gui=DEFAULT_GUI, plot=DEFAULT_PLOT, output_folder=DEFAULT_OUTPUT_FOLDER): - #### Load the model from file ############################## - algo = exp.split("-")[2] - - if os.path.isfile(exp+'/success_model.zip'): - path = exp+'/success_model.zip' - elif os.path.isfile(exp+'/best_model.zip'): - path = exp+'/best_model.zip' - else: - print("[ERROR]: no model under the specified path", exp) - if algo == 'a2c': - model = A2C.load(path) - if algo == 'ppo': - model = PPO.load(path) - if algo == 'sac': - model = SAC.load(path) - if algo == 'td3': - model = TD3.load(path) - if algo == 'ddpg': - model = DDPG.load(path) - - #### Parameters to recreate the environment ################ - env_name = exp.split("-")[1]+"-aviary-v0" - OBS = ObservationType.KIN if exp.split("-")[3] == 'kin' else ObservationType.RGB - - # Parse ActionType instance from file name - action_name = exp.split("-")[4] - ACT = [action for action in ActionType if action.value == action_name] - if len(ACT) != 1: - raise AssertionError("Result file could have gotten corrupted. Extracted action type does not match any of the existing ones.") - ACT = ACT.pop() - - #### Evaluate the model #################################### - eval_env = gym.make(env_name, - aggregate_phy_steps=shared_constants.AGGR_PHY_STEPS, - obs=OBS, - act=ACT - ) - mean_reward, std_reward = evaluate_policy(model, - eval_env, - n_eval_episodes=10 - ) - print("\n\n\nMean reward ", mean_reward, " +- ", std_reward, "\n\n") - - #### Show, record a video, and log the model's performance # - test_env = gym.make(env_name, - gui=gui, - record=False, - aggregate_phy_steps=shared_constants.AGGR_PHY_STEPS, - obs=OBS, - act=ACT - ) - logger = Logger(logging_freq_hz=int(test_env.SIM_FREQ/test_env.AGGR_PHY_STEPS), - num_drones=1, - output_folder=output_folder - ) - obs = test_env.reset() - start = time.time() - for i in range(6*int(test_env.SIM_FREQ/test_env.AGGR_PHY_STEPS)): # Up to 6'' - action, _states = model.predict(obs, - deterministic=True # OPTIONAL 'deterministic=False' - ) - obs, reward, done, info = test_env.step(action) - test_env.render() - if OBS==ObservationType.KIN: - logger.log(drone=0, - timestamp=i/test_env.SIM_FREQ, - state= np.hstack([obs[0:3], np.zeros(4), obs[3:15], np.resize(action, (4))]), - control=np.zeros(12) - ) - sync(np.floor(i*test_env.AGGR_PHY_STEPS), start, test_env.TIMESTEP) - # if done: obs = test_env.reset() # OPTIONAL EPISODE HALT - test_env.close() - logger.save_as_csv("sa") # Optional CSV save - if plot: - logger.plot() - - # with np.load(exp+'/evaluations.npz') as data: - # print(data.files) - # print(data['timesteps']) - # print(data['results']) - # print(data['ep_lengths']) - -if __name__ == "__main__": - #### Define and parse (optional) arguments for the script ## - parser = argparse.ArgumentParser(description='Single agent reinforcement learning example script using TakeoffAviary') - parser.add_argument('--exp', type=str, help='The experiment folder written as ./results/save-----', metavar='') - parser.add_argument('--gui', default=DEFAULT_GUI, type=str2bool, help='Whether to use PyBullet GUI (default: False)', metavar='') - parser.add_argument('--plot', default=DEFAULT_PLOT, type=str2bool, help='Whether to plot the simulation results (default: True)', metavar='') - parser.add_argument('--output_folder', default=DEFAULT_OUTPUT_FOLDER, type=str, help='Folder where to save logs (default: "results")', metavar='') - ARGS = parser.parse_args() - - run(**vars(ARGS)) \ No newline at end of file