diff --git a/README.md b/README.md
index 48fcc90be..012b472f6 100644
--- a/README.md
+++ b/README.md
@@ -43,6 +43,13 @@ python -m rl_baselines.train --algo rl_algo --env env1 --log-dir logs/ --srl-mod
 To use the robot's position as input instead of pixels, just pass `--srl-model ground_truth` instead of `--srl-model raw_pixels`
+To perform a cross evaluation of the different SRL models, run:
+
+```
+python -m rl_baselines.pipeline_cross --algo ppo2 --log-dir logs/ --srl-model srl_combination ground_truth --num-iteration 5 --num-timesteps 1000000 --task cc sqc sc --srl-config-file config/srl_models1.yaml config/srl_models2.yaml config/srl_models3.yaml
+```
+This will output the learning results into the repository's `logs/` folder.
+
 ## Installation
@@ -191,6 +198,11 @@ If you have troubles installing mpi4py, make sure you the following installed:
 sudo apt-get install libopenmpi-dev openmpi-bin openmpi-doc
 ```
+If you have trouble building the wheel for ```atari```, you can fix it by running:
+```
+sudo apt-get install cmake libz-dev
+```
+
 ## Known issues
 The inverse kinematics function has trouble finding a solution when the arm is fully straight and the arm must bend to reach the requested point.
diff --git a/config/srl_models_circular.yaml b/config/srl_models_circular.yaml
new file mode 100644
index 000000000..e4f493867
--- /dev/null
+++ b/config/srl_models_circular.yaml
@@ -0,0 +1,8 @@
+
+OmnirobotEnv-v0:
+  # Base path to SRL log folder
+  # log_folder: srl_zoo/logs/Omnibot_random_simple/
+  log_folder: srl_zoo/logs/Omnibot_circular/
+  autoencoder: 19-02-04_23h27_22_custom_cnn_ST_DIM200_autoencoder_reward_inverse_forward/srl_model.pth
+
+
diff --git a/config/srl_models_escape.yaml b/config/srl_models_escape.yaml
new file mode 100644
index 000000000..a53351a42
--- /dev/null
+++ b/config/srl_models_escape.yaml
@@ -0,0 +1,9 @@
+
+OmnirobotEnv-v0:
+  # Base path to SRL log folder
+  # log_folder: srl_zoo/logs/escape_agent/
+  log_folder: srl_zoo/logs/escape_agent/
+  autoencoder: 19-02-04_23h27_22_custom_cnn_ST_DIM200_autoencoder_reward_inverse_forward/srl_model.pth
+  srl_combination: 19-06-03_18h38_59_custom_cnn_ST_DIM200_autoencoder_inverse/srl_model.pth
+
+
diff --git a/config/srl_models_merged.yaml b/config/srl_models_merged.yaml
new file mode 100644
index 000000000..b299a8396
--- /dev/null
+++ b/config/srl_models_merged.yaml
@@ -0,0 +1,8 @@
+
+OmnirobotEnv-v0:
+  # Base path to SRL log folder
+  # log_folder: srl_zoo/logs/Omnibot_random_simple/
+  log_folder: srl_zoo/logs/merge_CC_SC/
+  autoencoder: 19-02-04_23h27_22_custom_cnn_ST_DIM200_autoencoder_reward_inverse_forward/srl_model.pth
+
+
diff --git a/config/srl_models_simple.yaml b/config/srl_models_simple.yaml
new file mode 100644
index 000000000..8c07f243d
--- /dev/null
+++ b/config/srl_models_simple.yaml
@@ -0,0 +1,8 @@
+
+OmnirobotEnv-v0:
+  # Base path to SRL log folder
+  # log_folder: srl_zoo/logs/Omnibot_random_simple/
+  log_folder: srl_zoo/logs/Omnibot_random_simple/
+  autoencoder: 19-02-04_23h27_22_custom_cnn_ST_DIM200_autoencoder_reward_inverse_forward/srl_model.pth
+
+
diff --git a/environments/dataset_generator.py b/environments/dataset_generator.py
index 283cf0438..dd43251fd 100644
--- a/environments/dataset_generator.py
+++ b/environments/dataset_generator.py
@@ -9,17 +9,58 @@ import numpy as np
 from stable_baselines import PPO2
+from stable_baselines.common import set_global_seeds
 from stable_baselines.common.vec_env import DummyVecEnv, VecNormalize
 from stable_baselines.common.policies import CnnPolicy
+import 
tensorflow as tf +import torch as th +from torch.autograd import Variable from environments import ThreadingType from environments.registry import registered_env -from real_robots.constants import USING_OMNIROBOT +from environments.utils import makeEnv +from real_robots.constants import * +from replay.enjoy_baselines import createEnv, loadConfigAndSetup +from rl_baselines.utils import MultiprocessSRLModel, loadRunningAverage from srl_zoo.utils import printRed, printYellow +from srl_zoo.preprocessing.utils import deNormalize +from state_representation.models import loadSRLModel, getSRLDim + +RENDER_HEIGHT = 224 +RENDER_WIDTH = 224 +VALID_MODELS = ["forward", "inverse", "reward", "priors", "episode-prior", "reward-prior", "triplet", + "autoencoder", "vae", "dae", "random"] +VALID_POLICIES = ['walker', 'random', 'ppo2', 'custom'] os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # used to remove debug info of tensorflow +def latestPath(path): + """ + :param path: path to the log folder (defined in srl_model.yaml) (str) + :return: path to latest learned model in the same dataset folder (str) + """ + return max([path + d for d in os.listdir(path) if os.path.isdir(path + "/" + d)], key=os.path.getmtime) + '/' + + +def walkerPath(): + """ + + :return: + """ + eps = 0.01 + N_times = 14 + path = [] + left = [0 for _ in range(N_times)] + right = [1 for _ in range(N_times)] + + for idx in range(N_times * 2): + path += left if idx % 2 == 0 else right + path += [3] if idx < N_times else [2] + + return path + + def convertImagePath(args, path, record_id_start): """ Used to convert an image path, from one location, to another @@ -35,13 +76,25 @@ def convertImagePath(args, path, record_id_start): return args.name + "/record_{:03d}".format(new_record_id) + "/" + image_name -def env_thread(args, thread_num, partition=True, use_ppo2=False): +def vecEnv(env_kwargs_local, env_class): + """ + Local Env Wrapper + :param env_kwargs_local: arguments related to the environment wrapper + :param env_class: class of the env + :return: env for the pretrained algo + """ + train_env = env_class(**{**env_kwargs_local, "record_data": False, "renders": False}) + train_env = DummyVecEnv([lambda: train_env]) + train_env = VecNormalize(train_env, norm_obs=True, norm_reward=False) + return train_env + + +def env_thread(args, thread_num, partition=True): """ Run a session of an environment :param args: (ArgumentParser object) :param thread_num: (int) The thread ID of the environment session :param partition: (bool) If the output should be in multiple parts (default=True) - :param use_ppo2: (bool) Use ppo2 to generate the dataset """ env_kwargs = { "max_distance": args.max_distance, @@ -52,7 +105,13 @@ def env_thread(args, thread_num, partition=True, use_ppo2=False): "record_data": not args.no_record_data, "multi_view": args.multi_view, "save_path": args.save_path, - "shape_reward": args.shape_reward + "shape_reward": args.shape_reward, + "simple_continual_target": args.simple_continual, + "circular_continual_move": args.circular_continual, + "square_continual_move": args.square_continual, + "chasing_continual_move":args.chasing_continual, + "escape_continual_move": args.escape_continual, + "short_episodes": args.short_episodes } if partition: @@ -60,39 +119,131 @@ def env_thread(args, thread_num, partition=True, use_ppo2=False): else: env_kwargs["name"] = args.name + load_path, train_args, algo_name, algo_class = None, None, None, None + model = None + srl_model = None + srl_state_dim = 0 + generated_obs = None + + if args.run_policy in 
["walker", "custom"]: + if args.latest: + args.log_dir = latestPath(args.log_custom_policy) + else: + args.log_dir = args.log_custom_policy + args.log_dir = args.log_custom_policy + args.render = args.display + args.plotting, args.action_proba = False, False + + train_args, load_path, algo_name, algo_class, _, env_kwargs_extra = loadConfigAndSetup(args) + env_kwargs["srl_model"] = env_kwargs_extra["srl_model"] + env_kwargs["random_target"] = env_kwargs_extra.get("random_target", False) + env_kwargs["use_srl"] = env_kwargs_extra.get("use_srl", False) + + # TODO REFACTOR + env_kwargs["simple_continual_target"] = env_kwargs_extra.get("simple_continual_target", False) + env_kwargs["circular_continual_move"] = env_kwargs_extra.get("circular_continual_move", False) + env_kwargs["square_continual_move"] = env_kwargs_extra.get("square_continual_move", False) + env_kwargs["eight_continual_move"] = env_kwargs_extra.get("eight_continual_move", False) + env_kwargs["chasing_continual_move"] = env_kwargs_extra.get("chasing_continual_move",False) + env_kwargs["escape_continual_move"] = env_kwargs_extra.get("escape_continual_move", False) + + + eps = 0.2 + env_kwargs["state_init_override"] = np.array([MIN_X + eps, MAX_X - eps]) \ + if args.run_policy == 'walker' else None + if env_kwargs["use_srl"]: + env_kwargs["srl_model_path"] = env_kwargs_extra.get("srl_model_path", None) + env_kwargs["state_dim"] = getSRLDim(env_kwargs_extra.get("srl_model_path", None)) + srl_model = MultiprocessSRLModel(num_cpu=args.num_cpu, env_id=args.env, env_kwargs=env_kwargs) + env_kwargs["srl_pipe"] = srl_model.pipe + env_class = registered_env[args.env][0] env = env_class(**env_kwargs) - using_real_omnibot = args.env == "OmnirobotEnv-v0" and USING_OMNIROBOT - model = None - if use_ppo2: - # Additional env when using a trained ppo agent to generate data - # instead of a random agent - train_env = env_class(**{**env_kwargs, "record_data": False, "renders": False}) - train_env = DummyVecEnv([lambda: train_env]) - train_env = VecNormalize(train_env, norm_obs=True, norm_reward=False) + if env_kwargs.get('srl_model', None) not in ["raw_pixels", None]: + # TODO: Remove env duplication + # This is a dirty trick to normalize the obs. + # So for as we override SRL environment functions (step, reset) for on-policy generation & generative replay + # using stable-baselines' normalisation wrappers (step & reset) breaks... 
+ env_norm = [makeEnv(args.env, args.seed, i, args.log_dir, allow_early_resets=False, env_kwargs=env_kwargs) + for i in range(args.num_cpu)] + env_norm = DummyVecEnv(env_norm) + env_norm = VecNormalize(env_norm, norm_obs=True, norm_reward=False) + env_norm = loadRunningAverage(env_norm, load_path_normalise=args.log_custom_policy) + using_real_omnibot = args.env == "OmnirobotEnv-v0" and USING_OMNIROBOT - model = PPO2(CnnPolicy, train_env).learn(args.ppo2_timesteps) + walker_path = None + action_walker = None + state_init_for_walker = None + kwargs_reset, kwargs_step = {}, {} + + if args.run_policy in ['custom', 'ppo2', 'walker']: + # Additional env when using a trained agent to generate data + train_env = vecEnv(env_kwargs, env_class) + + if args.run_policy == 'ppo2': + model = PPO2(CnnPolicy, train_env).learn(args.ppo2_timesteps) + else: + _, _, algo_args = createEnv(args, train_args, algo_name, algo_class, env_kwargs) + tf.reset_default_graph() + set_global_seeds(args.seed % 2 ^ 32) + printYellow("Compiling Policy function....") + model = algo_class.load(load_path, args=algo_args) + if args.run_policy == 'walker': + walker_path = walkerPath() + + if len(args.replay_generative_model) > 0: + srl_model = loadSRLModel(args.log_generative_model, th.cuda.is_available()) + srl_state_dim = srl_model.state_dim + srl_model = srl_model.model.model frames = 0 start_time = time.time() + # divide evenly, then do an extra one for only some of them in order to get the right count for i_episode in range(args.num_episode // args.num_cpu + 1 * (args.num_episode % args.num_cpu > thread_num)): + # seed + position in this slice + size of slice (with reminder if uneven partitions) seed = args.seed + i_episode + args.num_episode // args.num_cpu * thread_num + \ (thread_num if thread_num <= args.num_episode % args.num_cpu else args.num_episode % args.num_cpu) + seed = seed % 2 ^ 32 + if not (args.run_policy in ['custom', 'walker']): + env.seed(seed) + env.action_space.seed(seed) # this is for the sample() function from gym.space + + if len(args.replay_generative_model) > 0: + + sample = Variable(th.randn(1, srl_state_dim)) + if th.cuda.is_available(): + sample = sample.cuda() + + generated_obs = srl_model.decode(sample) + generated_obs = generated_obs[0].detach().cpu().numpy() + generated_obs = deNormalize(generated_obs) - env.seed(seed) - env.action_space.seed(seed) # this is for the sample() function from gym.space - obs = env.reset() + kwargs_reset['generated_observation'] = generated_obs + obs = env.reset(**kwargs_reset) done = False + action_proba = None t = 0 episode_toward_target_on = False + while not done: + env.render() - if use_ppo2: + # Policy to run on the fly - to be trained before generation + if args.run_policy == 'ppo2': action, _ = model.predict([obs]) + + # Custom pre-trained Policy (SRL or End-to-End) + elif args.run_policy in['custom', 'walker']: + obs = env_norm._normalize_observation(obs) + action = [model.getAction(obs, done)] + action_proba = model.getActionProba(obs, done) + if args.run_policy == 'walker': + action_walker = np.array(walker_path[t]) + # Random Policy else: # Using a target reaching policy (untrained, from camera) when collecting data from real OmniRobot if episode_toward_target_on and np.random.rand() < args.toward_target_timesteps_proportion and \ @@ -101,8 +252,30 @@ def env_thread(args, thread_num, partition=True, use_ppo2=False): else: action = [env.action_space.sample()] + # Generative replay +/- for on-policy action + if len(args.replay_generative_model) > 0: + + if 
args.run_policy == 'custom': + obs = obs.reshape(1, srl_state_dim) + obs = th.from_numpy(obs.astype(np.float32)).cuda() + z = obs + generated_obs = srl_model.decode(z) + else: + sample = Variable(th.randn(1, srl_state_dim)) + + if th.cuda.is_available(): + sample = sample.cuda() + + generated_obs = srl_model.decode(sample) + generated_obs = generated_obs[0].detach().cpu().numpy() + generated_obs = deNormalize(generated_obs) + action_to_step = action[0] - _, _, done, _ = env.step(action_to_step) + kwargs_step = {k: v for (k, v) in [("generated_observation", generated_obs), + ("action_proba", action_proba), + ("action_grid_walker", action_walker)] if v is not None} + + obs, _, done, _ = env.step(action_to_step, **kwargs_step) frames += 1 t += 1 @@ -117,6 +290,7 @@ def env_thread(args, thread_num, partition=True, use_ppo2=False): print("{:.2f} FPS".format(frames * args.num_cpu / (time.time() - start_time))) + def main(): parser = argparse.ArgumentParser(description='Deteministic dataset generator for SRL training ' + '(can be used for environment testing)') @@ -143,12 +317,42 @@ def main(): help='Shape the reward (reward = - distance) instead of a sparse reward') parser.add_argument('--reward-dist', action='store_true', default=False, help='Prints out the reward distribution when the dataset generation is finished') - parser.add_argument('--run-ppo2', action='store_true', default=False, - help='runs a ppo2 agent instead of a random agent') + parser.add_argument('--run-policy', type=str, default="random", + choices=VALID_POLICIES, + help='Policy to run for data collection ' + + '(random, localy pretrained ppo2, pretrained custom policy)') + parser.add_argument('--log-custom-policy', type=str, default='', + help='Logs of the custom pretained policy to run for data collection') + parser.add_argument('--latest', action='store_true', default=False, + help='load the latest learned model (location: args.log-custom-policy)') + parser.add_argument('-rgm', '--replay-generative-model', type=str, default="", choices=['vae'], + help='Generative model to replay for generating a dataset (for Continual Learning purposes)') + parser.add_argument('--log-generative-model', type=str, default='', + help='Logs of the custom pretained policy to run for data collection') parser.add_argument('--ppo2-timesteps', type=int, default=1000, help='number of timesteps to run PPO2 on before generating the dataset') parser.add_argument('--toward-target-timesteps-proportion', type=float, default=0.0, help="propotion of timesteps that use simply towards target policy, should be 0.0 to 1.0") + parser.add_argument('-sc', '--simple-continual', action='store_true', default=False, + help='Simple red square target for task 1 of continual learning scenario. ' + + 'The task is: robot should reach the target.') + parser.add_argument('-cc', '--circular-continual', action='store_true', default=False, + help='Blue square target for task 2 of continual learning scenario. ' + + 'The task is: robot should turn in circle around the target.') + parser.add_argument('-sqc', '--square-continual', action='store_true', default=False, + help='Green square target for task 3 of continual learning scenario. 
' + + 'The task is: robot should turn in square around the target.') + parser.add_argument('-chc', '--chasing-continual', action='store_true', default=False, + help='Two chasing robots in the same domain of environment' + + 'The task is: one robot should keep a certain distance towars the other.') + parser.add_argument('-esc', '--escape-continual', action='store_true', default=False, + help='Two chasing robots in the same domain of environment' + + 'The task is: the trainable agent tries to escape from the "zombie" robot.') + parser.add_argument('--short-episodes', action='store_true', default=False, + help='Generate short episodes (only 10 contacts with the target allowed).') + parser.add_argument('--episode', type=int, default=-1, + help='Model saved at episode N that we want to load') + args = parser.parse_args() assert (args.num_cpu > 0), "Error: number of cpu must be positive and non zero" @@ -162,6 +366,18 @@ def main(): args.num_cpu = args.num_episode printYellow("num_cpu cannot be greater than num_episode, defaulting to {} cpus.".format(args.num_cpu)) + assert sum([args.simple_continual, args.circular_continual, args.square_continual]) <= 1, \ + "For continual SRL and RL, please provide only one scenario at the time !" + + assert not (args.log_custom_policy == '' and args.run_policy in ['walker', 'custom']), \ + "If using a custom policy, please specify a valid log folder for loading it." + + assert not (args.log_generative_model == '' and args.replay_generative_model == 'custom'), \ + "If using a custom policy, please specify a valid log folder for loading it." + + if not os.path.exists(args.save_path): + os.makedirs(args.save_path) + # this is done so seed 0 and 1 are different and not simply offset of the same datasets. args.seed = np.random.RandomState(args.seed).randint(int(1e10)) @@ -177,13 +393,13 @@ def main(): os.mkdir(args.save_path + args.name) if args.num_cpu == 1: - env_thread(args, 0, partition=False, use_ppo2=args.run_ppo2) + env_thread(args, 0, partition=False) else: # try and divide into multiple processes, with an environment each try: jobs = [] for i in range(args.num_cpu): - process = multiprocessing.Process(target=env_thread, args=(args, i, True, args.run_ppo2)) + process = multiprocessing.Process(target=env_thread, args=(args, i, True)) jobs.append(process) for j in jobs: diff --git a/environments/dataset_fusioner.py b/environments/dataset_merger.py similarity index 56% rename from environments/dataset_fusioner.py rename to environments/dataset_merger.py index de5decb72..aa64ad366 100644 --- a/environments/dataset_fusioner.py +++ b/environments/dataset_merger.py @@ -8,11 +8,23 @@ import numpy as np from tqdm import tqdm +# List of all possible labels identifying a task, +# for experiments in Continual Learning scenari. +CONTINUAL_LEARNING_LABELS = ['CC', 'SC', 'EC', 'SQC', 'ESC'] +CL_LABEL_KEY = "continual_learning_label" + def main(): parser = argparse.ArgumentParser(description='Dataset Manipulator: useful to merge two datasets by concatenating ' + 'episodes. 
PS: source datasets are deleted after merging only if the --remove flag is passed.')
+    parser.add_argument('--continual-learning-labels', type=str, nargs=2, metavar=('label_1', 'label_2'),
+                        default=argparse.SUPPRESS, help='Labels for the continual learning RL distillation task.')
+    parser.add_argument('-f', '--force', action='store_true', default=False,
+                        help='Force the merge, even if it overrides something else,'
+                             ' including the destination if it exists')
+    parser.add_argument('-rm', '--remove', action='store_true', default=False,
+                        help='Remove the original datasets after merging.')
     group = parser.add_mutually_exclusive_group()
     group.add_argument('--merge', type=str, nargs=3, metavar=('source_1', 'source_2', 'destination'),
                        default=argparse.SUPPRESS,
@@ -23,20 +35,32 @@ def main():
     if 'merge' in args:
         # let make sure everything is in order
         assert os.path.exists(args.merge[0]), "Error: dataset '{}' could not be found".format(args.merge[0])
-        assert (not os.path.exists(args.merge[2])), \
-            "Error: dataset '{}' already exists, cannot rename '{}' to '{}'".format(args.merge[2], args.merge[0],
-                                                                                    args.merge[2])
+        assert os.path.exists(args.merge[1]), "Error: dataset '{}' could not be found".format(args.merge[1])
+
+        # If the destination dataset already exists, delete it so that the student policy's dataset can be updated
+        if os.path.exists(args.merge[2]) or os.path.exists(args.merge[2] + '/'):
+            assert args.force, "Error: destination directory '{}' already exists".format(args.merge[2])
+            shutil.rmtree(args.merge[2])
+
+        if 'continual_learning_labels' in args:
+            assert args.continual_learning_labels[0] in CONTINUAL_LEARNING_LABELS \
+                   and args.continual_learning_labels[1] in CONTINUAL_LEARNING_LABELS, \
+                "Please specify a valid continual learning label for each dataset to be used for RL distillation!"
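+        # Example invocation (sketch; the dataset paths below are hypothetical placeholders,
+        # only the flags defined above are real):
+        #   python -m environments.dataset_merger --merge data/circular_task data/simple_task data/merge_CC_SC \
+        #       --continual-learning-labels CC SC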
+
         # create the output
         os.mkdir(args.merge[2])
 
         # copy files from first source
-        os.rename(args.merge[0] + "/dataset_config.json", args.merge[2] + "/dataset_config.json")
-        os.rename(args.merge[0] + "/env_globals.json", args.merge[2] + "/env_globals.json")
-
+        shutil.copy2(args.merge[0] + "/dataset_config.json", args.merge[2] + "/dataset_config.json")
+        shutil.copy2(args.merge[0] + "/env_globals.json", args.merge[2] + "/env_globals.json")
+        record = ''
         for record in sorted(glob.glob(args.merge[0] + "/record_[0-9]*/*")):
             s = args.merge[2] + "/" + record.split("/")[-2] + '/' + record.split("/")[-1]
-            os.renames(record, s)
-
+            try:
+                shutil.copy2(record, s)
+            except FileNotFoundError:  # the destination episode folder does not exist yet, create it first
+                os.mkdir(os.path.dirname(s))
+                shutil.copy2(record, s)
             num_episode_dataset_1 = int(record.split("/")[-2][7:]) + 1
 
         # copy files from second source
@@ -44,7 +68,11 @@ def main():
             episode = str(num_episode_dataset_1 + int(record.split("/")[-2][7:]))
             new_episode = record.split("/")[-2][:-len(episode)] + episode
             s = args.merge[2] + "/" + new_episode + '/' + record.split("/")[-1]
-            os.renames(record, s)
+            try:
+                shutil.copy2(record, s)
+            except FileNotFoundError:  # the destination episode folder does not exist yet, create it first
+                os.mkdir(os.path.dirname(s))
+                shutil.copy2(record, s)
             num_episode_dataset_2 = int(record.split("/")[-2][7:]) + 1
 
         # load and correct ground_truth
@@ -101,20 +129,40 @@ def main():
         preprocessed_load = np.load(args.merge[0] + "/preprocessed_data.npz")
         preprocessed_load_2 = np.load(args.merge[1] + "/preprocessed_data.npz")
 
-        for prepro_load in [preprocessed_load, preprocessed_load_2]:
+        dataset_1_size = preprocessed_load["actions"].shape[0]
+        dataset_2_size = preprocessed_load_2["actions"].shape[0]
+
+        # Concatenating additional information: indices of episode start, action probabilities, CL labels...
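+        # (Assumption, inferred from the keys handled below: preprocessed_data.npz stores per-timestep
+        #  arrays such as "actions", "rewards", "episode_starts" and, when available, "actions_proba".)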
+ for idx, prepro_load in enumerate([preprocessed_load, preprocessed_load_2]): for arr in prepro_load.files: pr_arr = prepro_load[arr] - preprocessed[arr] = np.concatenate((preprocessed.get(arr, []), pr_arr), axis=0) + if arr == "episode_starts": - preprocessed[arr] = preprocessed[arr].astype(bool) + to_class = bool + elif arr == "actions_proba" or arr == "rewards": + to_class = float + else: + to_class = int + if preprocessed.get(arr, None) is None: + preprocessed[arr] = pr_arr.astype(to_class) + else: + preprocessed[arr] = np.concatenate((preprocessed[arr].astype(to_class), + pr_arr.astype(to_class)), axis=0) + if 'continual_learning_labels' in args: + if preprocessed.get(CL_LABEL_KEY, None) is None: + preprocessed[CL_LABEL_KEY] = \ + np.array([args.continual_learning_labels[idx] for _ in range(dataset_1_size)]) else: - preprocessed[arr] = preprocessed[arr].astype(int) + preprocessed[CL_LABEL_KEY] = \ + np.concatenate((preprocessed[CL_LABEL_KEY], np.array([args.continual_learning_labels[idx] + for _ in range(dataset_2_size)])), axis=0) np.savez(args.merge[2] + "/preprocessed_data.npz", ** preprocessed) # remove the old folders - shutil.rmtree(args.merge[0]) - shutil.rmtree(args.merge[1]) + if args.remove: + shutil.rmtree(args.merge[0]) + shutil.rmtree(args.merge[1]) if __name__ == '__main__': diff --git a/environments/omnirobot_gym/omnirobot_env.py b/environments/omnirobot_gym/omnirobot_env.py index f5e71f0d4..027adcbea 100644 --- a/environments/omnirobot_gym/omnirobot_env.py +++ b/environments/omnirobot_gym/omnirobot_env.py @@ -29,7 +29,7 @@ def recvMatrix(socket): RENDER_HEIGHT = 224 RENDER_WIDTH = 224 RELATIVE_POS = True -N_CONTACTS_BEFORE_TERMINATION = 3 +N_CONTACTS_BEFORE_TERMINATION = 15 #10 DELTA_POS = 0.1 # DELTA_POS for continuous actions N_DISCRETE_ACTIONS = 4 @@ -72,7 +72,10 @@ class OmniRobotEnv(SRLGymEnv): def __init__(self, renders=False, name="Omnirobot", is_discrete=True, save_path='srl_zoo/data/', state_dim=-1, learn_states=False, srl_model="raw_pixels", record_data=False, action_repeat=1, random_target=True, - shape_reward=False, env_rank=0, srl_pipe=None, **_): + shape_reward=False, simple_continual_target=False, circular_continual_move=False, + escape_continual_move=False, square_continual_move=False, eight_continual_move=False, + chasing_continual_move=False, short_episodes=False, + state_init_override=None, env_rank=0, srl_pipe=None, **_): super(OmniRobotEnv, self).__init__(srl_model=srl_model, relative_pos=RELATIVE_POS, @@ -87,7 +90,7 @@ def __init__(self, renders=False, name="Omnirobot", is_discrete=True, save_path= self.use_srl = use_srl or use_ground_truth self.use_ground_truth = use_ground_truth self.use_joints = False - self.relative_pos = RELATIVE_POS + self.relative_pos = RELATIVE_POS and (not escape_continual_move) self._is_discrete = is_discrete self.observation = [] # Start simulation with first observation @@ -101,6 +104,13 @@ def __init__(self, renders=False, name="Omnirobot", is_discrete=True, save_path= self.target_pos = None self.saver = None self._random_target = random_target + self.simple_continual_target = simple_continual_target + self.circular_continual_move = circular_continual_move + self.square_continual_move = square_continual_move + self.eight_continual_move = eight_continual_move + self.chasing_continual_move = chasing_continual_move + self.escape_continual_move = escape_continual_move + self.short_episodes = short_episodes if self._is_discrete: self.action_space = spaces.Discrete(N_DISCRETE_ACTIONS) @@ -128,8 +138,15 @@ def 
__init__(self, renders=False, name="Omnirobot", is_discrete=True, save_path= learn_states=learn_states, path=save_path) if USING_OMNIROBOT_SIMULATOR: - self.socket = OmniRobotSimulatorSocket( - output_size=[RENDER_WIDTH, RENDER_HEIGHT], random_target=self._random_target) + self.socket = OmniRobotSimulatorSocket(simple_continual_target=simple_continual_target, + circular_continual_move=circular_continual_move, + square_continual_move=square_continual_move, + eight_continual_move=eight_continual_move, + chasing_continual_move=chasing_continual_move, + escape_continual_move=escape_continual_move, + output_size=[RENDER_WIDTH, RENDER_HEIGHT], + random_target=self._random_target, + state_init_override=state_init_override) else: # Initialize Baxter effector by connecting to the Gym bridge ROS node: self.context = zmq.Context() @@ -172,44 +189,85 @@ def actionPolicyTowardTarget(self): else: return DELTA_POS if self.robot_pos[1] < self.target_pos[1] else -DELTA_POS - def step(self, action): + def actionPolicyAwayTarget(self): """ - :action: (int) + :return: (int) action + """ + if abs(self.robot_pos[0] - self.target_pos[0]) > abs(self.robot_pos[1] - self.target_pos[1]): + + if self._is_discrete: + return int(Move.BACKWARD) if self.robot_pos[0] < self.target_pos[0] else int(Move.FORWARD) + # forward # backward + else: + return -DELTA_POS if self.robot_pos[0] < self.target_pos[0] else +DELTA_POS + else: + if self._is_discrete: + # left # right + return int(Move.RIGHT) if self.robot_pos[1] < self.target_pos[1] else int(Move.LEFT) + else: + return -DELTA_POS if self.robot_pos[1] < self.target_pos[1] else +DELTA_POS + + def step(self, action, generated_observation=None, action_proba=None, action_grid_walker=None): + """ + :param :action: (int) + :param generated_observation: + :param action_proba: + :param action_grid_walker: # Whether or not we want to override the action with the one from a grid walker :return: (tensor (np.ndarray)) observation, int reward, bool done, dict extras) """ + if action_grid_walker is None: + action_to_step = action + action_from_teacher = None + else: + action_to_step = action_grid_walker + action_from_teacher = action + assert self.action_space.contains(action_from_teacher) + if not self._is_discrete: - action = np.array(action) - assert self.action_space.contains(action) + action_to_step = np.array(action_to_step) + assert self.action_space.contains(action_to_step) # Convert int action to action in (x,y,z) space # serialize the action - if isinstance(action, np.ndarray): - self.action = action.tolist() - elif hasattr(action, 'dtype'): # convert numpy type to python type + if isinstance(action_to_step, np.ndarray): + self.action = action_to_step.tolist() + elif hasattr(action_to_step, 'dtype'): # convert numpy type to python type self.action = action.item() else: - self.action = action + self.action = action_to_step self._env_step_counter += 1 # Send the action to the server self.socket.send_json( - {"command": "action", "action": self.action, "is_discrete": self._is_discrete}) + {"command": "action", "action": self.action, "is_discrete": self._is_discrete, + "step_counter": self._env_step_counter}) # Receive state data (position, etc), important to update state related values self.getEnvState() # Receive a camera image from the server - self.observation = self.getObservation() + self.observation = self.getObservation() if generated_observation is None else generated_observation * 255 done = self._hasEpisodeTerminated() self.render() if self.saver is not None: - 
self.saver.step(self.observation, action, - self.reward, done, self.getGroundTruth()) + # Dynamic environment + if self.chasing_continual_move or self.escape_continual_move: + self.saver.step(self.observation, action_from_teacher if action_grid_walker is not None else + action_to_step, + self.reward, done, self.getGroundTruth(), + action_proba=action_proba, target_pos=self.getTargetPos()) + else: + self.saver.step(self.observation, action_from_teacher if action_grid_walker is not None else + action_to_step, + self.reward, done, self.getGroundTruth(), + action_proba=action_proba) + old_observation = self.getObservation() if self.use_srl: - return self.getSRLState(self.observation), self.reward, done, {} + return self.getSRLState(self.observation + if generated_observation is None else old_observation), self.reward, done, {} else: return self.observation, self.reward, done, {} @@ -223,7 +281,6 @@ def getEnvState(self): self.reward = state_data["reward"] self.target_pos = np.array(state_data["target_pos"]) self.robot_pos = np.array(state_data["position"]) - return state_data def getObservation(self): @@ -247,8 +304,14 @@ def getTargetPos(self): @staticmethod def getGroundTruthDim(): """ + The convergence is slow for escape task with dimension 2 :return: (int) """ + # if(not self.escape_continual_move): + # return 2 + # else: + # #The position of the robot, target + # return 4 return 2 def getGroundTruth(self): @@ -256,6 +319,11 @@ def getGroundTruth(self): Alias for getRobotPos for compatibility between envs :return: (numpy array) """ + # + # if(not self.escape_continual_move): + # return np.array(self.getRobotPos()) + # else: + # return np.append(self.getRobotPos(),self.getTargetPos()) return np.array(self.getRobotPos()) def getRobotPos(self): @@ -264,9 +332,11 @@ def getRobotPos(self): """ return self.robot_pos - def reset(self): + def reset(self, generated_observation=None, state_override=None): """ Reset the environment + :param generated_observation: + :param state_override: :return: (numpy ndarray) first observation of the env """ self.episode_terminated = False @@ -274,16 +344,19 @@ def reset(self): self._env_step_counter = 0 # set n contact count self.n_contacts = 0 - self.socket.send_json({"command": "reset"}) + self.socket.send_json({"command": "reset", "step_counter": self._env_step_counter}) # Update state related variables, important step to get both data and # metadata that allow reading the observation image self.getEnvState() - self.observation = self.getObservation() + self.robot_pos = np.array([0, 0]) if state_override is None else state_override + self.observation = self.getObservation() if generated_observation is None else generated_observation * 255 if self.saver is not None: self.saver.reset(self.observation, self.getTargetPos(), self.getGroundTruth()) + old_observation = self.getObservation() + if self.use_srl: - return self.getSRLState(self.observation) + return self.getSRLState(self.observation if generated_observation is None else old_observation) else: return self.observation @@ -291,13 +364,17 @@ def _hasEpisodeTerminated(self): """ Returns True if the episode is over and False otherwise """ - if self.episode_terminated or self._env_step_counter > MAX_STEPS: + if (self.episode_terminated or self._env_step_counter > MAX_STEPS) or \ + (self.n_contacts >= N_CONTACTS_BEFORE_TERMINATION and self.short_episodes and + self.simple_continual_target) or \ + (self._env_step_counter > MAX_STEPS_CIRCULAR_TASK_SHORT_EPISODES and self.short_episodes and + 
self.circular_continual_move): return True if np.abs(self.reward - REWARD_TARGET_REACH) < 0.000001: # reach the target self.n_contacts += 1 else: - self.n_contacts = 0 + self.n_contacts += 0 return False def closeServerConnection(self): @@ -325,7 +402,7 @@ def render(self, mode='rgb_array'): self.visualizeBoundary() self.image_plot = plt.imshow(self.observation_with_boundary, cmap='gray') self.image_plot.axes.grid(False) - + else: self.visualizeBoundary() self.image_plot.set_data(self.observation_with_boundary) @@ -333,7 +410,7 @@ def render(self, mode='rgb_array'): # Wait a bit, so that plot is visible plt.pause(0.0001) return self.observation - + def initVisualizeBoundary(self): with open(CAMERA_INFO_PATH, 'r') as stream: try: @@ -364,19 +441,81 @@ def initVisualizeBoundary(self): # transform the corresponding points into cropped image self.boundary_coner_pixel_pos = self.boundary_coner_pixel_pos - (np.array(ORIGIN_SIZE) - np.array(CROPPED_SIZE)).reshape(2, 1) / 2.0 - + # transform the corresponding points into resized image (RENDER_WIDHT, RENDER_HEIGHT) self.boundary_coner_pixel_pos[0, :] *= RENDER_WIDTH/CROPPED_SIZE[0] self.boundary_coner_pixel_pos[1, :] *= RENDER_HEIGHT/CROPPED_SIZE[1] - + self.boundary_coner_pixel_pos = np.around(self.boundary_coner_pixel_pos).astype(np.int) + # Create square for vizu of objective in continual square task + if self.square_continual_move: + + self.boundary_coner_pixel_pos_continual = np.zeros((2, 4)) + # assume that image is undistorted + self.boundary_coner_pixel_pos_continual[:, 0] = \ + pos_transformer.phyPosGround2PixelPos([-RADIUS, -RADIUS], return_distort_image_pos=False).squeeze() + self.boundary_coner_pixel_pos_continual[:, 1] = \ + pos_transformer.phyPosGround2PixelPos([RADIUS, -RADIUS], return_distort_image_pos=False).squeeze() + self.boundary_coner_pixel_pos_continual[:, 2] = \ + pos_transformer.phyPosGround2PixelPos([RADIUS, RADIUS], return_distort_image_pos=False).squeeze() + self.boundary_coner_pixel_pos_continual[:, 3] = \ + pos_transformer.phyPosGround2PixelPos([-RADIUS, RADIUS], return_distort_image_pos=False).squeeze() + + # transform the corresponding points into cropped image + self.boundary_coner_pixel_pos_continual = self.boundary_coner_pixel_pos_continual - ( + np.array(ORIGIN_SIZE) - np.array(CROPPED_SIZE)).reshape(2, 1) / 2.0 + + # transform the corresponding points into resized image (RENDER_WIDHT, RENDER_HEIGHT) + self.boundary_coner_pixel_pos_continual[0, :] *= RENDER_WIDTH / CROPPED_SIZE[0] + self.boundary_coner_pixel_pos_continual[1, :] *= RENDER_HEIGHT / CROPPED_SIZE[1] + + self.boundary_coner_pixel_pos_continual = np.around(self.boundary_coner_pixel_pos_continual).astype(np.int) + + elif self.circular_continual_move: + self.center_coordinates = \ + pos_transformer.phyPosGround2PixelPos([0, 0], return_distort_image_pos=False).squeeze() + self.center_coordinates = self.center_coordinates - ( + np.array(ORIGIN_SIZE) - np.array(CROPPED_SIZE)) / 2.0 + # transform the corresponding points into resized image (RENDER_WIDHT, RENDER_HEIGHT) + self.center_coordinates[0] *= RENDER_WIDTH / CROPPED_SIZE[0] + self.center_coordinates[1] *= RENDER_HEIGHT / CROPPED_SIZE[1] + + self.center_coordinates = np.around(self.center_coordinates).astype(np.int) + + # Points to convert radisu in env space + self.boundary_coner_pixel_pos_continual = \ + pos_transformer.phyPosGround2PixelPos([0, RADIUS], return_distort_image_pos=False).squeeze() + + # transform the corresponding points into cropped image + self.boundary_coner_pixel_pos_continual = 
self.boundary_coner_pixel_pos_continual - ( + np.array(ORIGIN_SIZE) - np.array(CROPPED_SIZE)) / 2.0 + + # transform the corresponding points into resized image (RENDER_WIDHT, RENDER_HEIGHT) + self.boundary_coner_pixel_pos_continual[0] *= RENDER_WIDTH / CROPPED_SIZE[0] + self.boundary_coner_pixel_pos_continual[1] *= RENDER_HEIGHT / CROPPED_SIZE[1] + + self.boundary_coner_pixel_pos_continual = np.around(self.boundary_coner_pixel_pos_continual).astype(np.int) + def visualizeBoundary(self): """ - visualize the unvisible boundary, should call initVisualizeBoundary firstly + visualize the unvisible boundary, should call initVisualizeBoundary first """ self.observation_with_boundary = self.observation.copy() + # Add boundary continual + if self.square_continual_move: + for idx in range(4): + idx_next = idx + 1 + cv2.line(self.observation_with_boundary, tuple(self.boundary_coner_pixel_pos_continual[:, idx]), + tuple(self.boundary_coner_pixel_pos_continual[:, idx_next % 4]), (0, 0, 200), 2) + + elif self.circular_continual_move: + radius_converted = np.linalg.norm(self.center_coordinates - self.boundary_coner_pixel_pos_continual) + cv2.circle(self.observation_with_boundary, tuple(self.center_coordinates), np.float32(radius_converted), + (0, 0, 200), 2) + + # Add boundary of env for idx in range(4): idx_next = idx + 1 cv2.line(self.observation_with_boundary, tuple(self.boundary_coner_pixel_pos[:, idx]), diff --git a/environments/srl_env.py b/environments/srl_env.py index 4c6be4f34..cf1636388 100644 --- a/environments/srl_env.py +++ b/environments/srl_env.py @@ -81,13 +81,13 @@ def close(self): # TODO: implement close function to close GUI pass - def step(self, action): + def step(self, action, generated_observation=None, action_proba=None, action_grid_walker=None): """ :param action: (int or [float]) """ raise NotImplementedError() - def reset(self): + def reset(self, generated_observation=None, state_override=None): """ Reset the environment :return: (numpy tensor) first observation of the env diff --git a/real_robots/constants.py b/real_robots/constants.py index d2fdce901..cfcd7fd58 100644 --- a/real_robots/constants.py +++ b/real_robots/constants.py @@ -88,6 +88,7 @@ class Move(Enum): # Max number of steps per episode MAX_STEPS = 250 + MAX_STEPS_CIRCULAR_TASK_SHORT_EPISODES = 75 # Boundaries MIN_X, MAX_X = -0.85, 0.85 # center of robot should be in this interval MIN_Y, MAX_Y = -0.85, 0.85 @@ -119,7 +120,7 @@ class Move(Enum): STOP = 4 STEP_DISTANCE = 0.1 # meter, distance for each step - + STEP_DISTANCE_TARGET = 0.05 # moving distance for each step # For continuous action, # Define the action_bounds ACTION_POSITIVE_LOW = 0.0 @@ -136,6 +137,10 @@ class Move(Enum): CAMERA_ROT_EULER_COORD_GROUND = [0, 180, 0] ORIGIN_SIZE = [640, 480] # camera's original resolution CROPPED_SIZE = [480, 480] # cropped to a square, attention, this is not the output image size (RENDER_SIZE) + + # Constants for Continual setup + RADIUS = 0.6125 # Radius of square or circle in continual scenarios: square-continual and circle-continual + # Gazebo else: LEFT_ARM_INIT_POS = [0.6, 0.30, 0.20] diff --git a/real_robots/omnirobot_server.py b/real_robots/omnirobot_server.py index 07a494d84..9e75fb7c2 100755 --- a/real_robots/omnirobot_server.py +++ b/real_robots/omnirobot_server.py @@ -368,6 +368,7 @@ def imageCallback(self, msg): except CvBridgeError as e: print("CvBridgeError:", e) + def saveSecondCamImage(im, episode_folder, episode_step, path="omnirobot_2nd_cam"): """ Write an image to disk diff --git 
a/real_robots/omnirobot_simulator_server.py b/real_robots/omnirobot_simulator_server.py index 879eae4a7..af6a39226 100755 --- a/real_robots/omnirobot_simulator_server.py +++ b/real_robots/omnirobot_simulator_server.py @@ -27,13 +27,14 @@ def __init__(self, init_x, init_y, init_yaw, origin_size, cropped_size, back_ground_path, camera_info_path, robot_marker_path, robot_marker_margin, target_marker_path, target_marker_margin, robot_marker_code, target_marker_code, - robot_marker_length, target_marker_length, output_size, **_): + robot_marker_length, target_marker_length, output_size, history_size=10, **_): """ Class for rendering Omnirobot environment :param init_x: (float) initial x position of robot :param init_y: (float) initial y position of robot :param init_yaw: (float) initial yaw position of robot - :param origin_size: (list of int) original camera's size (eg. [640,480]), the camera matrix should be corresponding to this size + :param origin_size: (list of int) original camera's size (eg. [640,480]), + the camera matrix should be corresponding to this size :param cropped_size: (list of int) cropped image's size (eg. [480,480]) :param back_ground_path: (str) back ground image's path, the image should be undistorted. :param camera_info_path: (str) camera info file's path (containing camera matrix) @@ -41,8 +42,10 @@ def __init__(self, init_x, init_y, init_yaw, origin_size, cropped_size, :param robot_marker_margin: (list of int) marker's margin (eg. [3,3,3,3]) :param target_marker_path: (str) target maker's path, the marker should have a margin with several pixels :param target_marker_margin: (list of int) marker's margin (eg. [3,3,3,3]) - :param robot_marker_code: (currently not supported, should be "None" by default) (numpy ndarray) optional, the code of robot marker, only used for detecting position directly from the image. - :param target_marker_code: (currently not supported, should be "None" by default) (numpy ndarray) optional, the code of target marker, only used for detecting position directly from the image. + :param robot_marker_code: (currently not supported, should be "None" by default) (numpy ndarray) optional, + the code of robot marker, only used for detecting position directly from the image. + :param target_marker_code: (currently not supported, should be "None" by default) (numpy ndarray) optional, + the code of target marker, only used for detecting position directly from the image. :param robot_marker_length: (float) the physical length of the marker (in meter) :param target_marker_length: (float) the physical length of the marker (in meter) :param output_size: (list of int) the output image's size (eg. 
[224,224]) @@ -63,6 +66,9 @@ def __init__(self, init_x, init_y, init_yaw, origin_size, cropped_size, self.robot_pos = np.float32([0, 0]) self.robot_yaw = 0 # in rad + self.history_size = history_size + self.robot_pos_past_k_steps = [] + # Last velocity command, used for simulating the controlling of velocity directly self.last_linear_velocity_cmd = np.float32( [0, 0]) # in m/s, in robot local frame @@ -81,7 +87,7 @@ def __init__(self, init_x, init_y, init_yaw, origin_size, cropped_size, self.target_yaw_cmd = 0.0 # Target's real position on the grid - self.target_pos = np.float32([0, 0]) + self.target_pos = np.float32([0,0]) self.target_yaw = 0 # status of moving @@ -90,6 +96,8 @@ def __init__(self, init_x, init_y, init_yaw, origin_size, cropped_size, # Distance for each step self.step_distance = STEP_DISTANCE + # Distance for the escape task, zombie speed + self.step_distance_target =STEP_DISTANCE_TARGET with open(camera_info_path, 'r') as stream: try: @@ -114,7 +122,7 @@ def __init__(self, init_x, init_y, init_yaw, origin_size, cropped_size, self.cropped_margin[1]+self.cropped_size[1]]).astype(np.int) back_ground_img = cv2.imread(back_ground_path) - if(back_ground_img.shape[0:2] != self.cropped_size): + if back_ground_img.shape[0:2] != self.cropped_size: print("input back ground image's size: ", back_ground_img.shape) print("resize to ", self.cropped_size) self.bg_img[self.cropped_range[0]:self.cropped_range[1], self.cropped_range[2]:self.cropped_range[3], :] \ @@ -176,7 +184,6 @@ def renderTarget(self): self.pos_transformer.phyPosGround2PixelPos( self.target_pos.reshape(2, 1)), self.target_yaw, np.random.randn() * NOISE_VAR_TARGET_SIZE_PROPOTION + 1.0) - def renderRobot(self): """ render the image. @@ -186,6 +193,18 @@ def renderRobot(self): self.robot_pos.reshape(2, 1)), self.robot_yaw, self.robot_marker_size_proprotion) + def getHistorySize(self): + return self.history_size + + def appendToHistory(self, pos): + self.robot_pos_past_k_steps.append(pos) + + def popOfHistory(self): + self.robot_pos_past_k_steps.pop(0) + + def emptyHistory(self): + self.robot_pos_past_k_steps = [] + def getCroppedImage(self): return self.image[self.cropped_range[0]:self.cropped_range[1], self.cropped_range[2]:self.cropped_range[3], :] @@ -240,28 +259,28 @@ def setTargetCmd(self, x, y, yaw): self.target_pos = self.target_pos_cmd self.target_yaw = self.normalizeAngle(self.target_yaw_cmd) - def forward(self, action=None): + def forward(self): """ Move one step forward (Translation) """ self.setRobotCmd( self.robot_pos_cmd[0] + self.step_distance, self.robot_pos_cmd[1], self.robot_yaw_cmd) - def backward(self, action=None): + def backward(self): """ Move one step backward """ self.setRobotCmd( self.robot_pos_cmd[0] - self.step_distance, self.robot_pos_cmd[1], self.robot_yaw_cmd) - def left(self, action=None): + def left(self): """ Translate to the left """ self.setRobotCmd( self.robot_pos_cmd[0], self.robot_pos_cmd[1] + self.step_distance, self.robot_yaw_cmd) - def right(self, action=None): + def right(self): """ Translate to the right """ @@ -275,6 +294,49 @@ def moveContinous(self, action): self.setRobotCmd( self.robot_pos_cmd[0] + action[0], self.robot_pos_cmd[1] + action[1], self.robot_yaw_cmd) + def targetMoveDiscrete(self, target_yaw): + pi = np.pi + assert target_yaw > -pi and target_yaw < pi + target_yaw += pi + if target_yaw >= pi/4 and target_yaw < pi*3/4:#Up + self.setTargetCmd( + self.target_pos_cmd[0], self.target_pos_cmd[1] - self.step_distance_target, target_yaw) + elif target_yaw >= 3/4*pi and 
target_yaw < 5/4 *pi: + self.setTargetCmd( + self.target_pos_cmd[0] + self.step_distance_target, self.target_pos_cmd[1], target_yaw) + elif target_yaw >= 5/4 * pi and target_yaw < 7/4 * pi: + self.setTargetCmd( + self.target_pos_cmd[0], self.target_pos_cmd[1]+self.step_distance_target, target_yaw) + else: + self.setTargetCmd( + self.target_pos_cmd[0] - self.step_distance_target, self.target_pos_cmd[1], target_yaw) + + + def targetMove(self, action): + if action == "forward": + self.setTargetCmd( + self.target_pos_cmd[0] + self.step_distance_target, self.target_pos_cmd[1], self.target_yaw) + elif action == "backward": + self.setTargetCmd( + self.target_pos_cmd[0] - self.step_distance_target, self.target_pos_cmd[1], self.target_yaw) + elif action == "left": + self.setTargetCmd( + self.target_pos_cmd[0] , self.target_pos_cmd[1] + self.step_distance_target, self.target_yaw) + else: + self.setTargetCmd( + self.target_pos_cmd[0] , self.target_pos_cmd[1] - self.step_distance_target, self.target_yaw) + + def targetMoveContinous(self, target_yaw): + """ + Perform a continuous displacement of dx, dy + """ + self.target_yaw = target_yaw + action = ( + self.step_distance_target * np.cos(target_yaw), self.step_distance_target * np.sin(target_yaw)) + self.setTargetCmd( + self.target_pos_cmd[0] +action[0] , self.target_pos_cmd[1] + action[1], target_yaw) + + def moveByVelocityCmd(self, speed_x, speed_y, speed_yaw): """ simuate the robot moved by velocity command @@ -297,8 +359,7 @@ def moveByVelocityCmd(self, speed_x, speed_y, speed_yaw): cos_direction + self.last_linear_velocity_cmd[0] * sin_direction)/RL_CONTROL_FREQ ground_yaw_cmd = self.robot_yaw + self.last_rot_velocity_cmd/RL_CONTROL_FREQ self.setRobotCmd(ground_pos_cmd_x, ground_pos_cmd_y, ground_yaw_cmd) - - # save the command of this moment + # save the command of this moment self.last_linear_velocity_cmd[0] = speed_x self.last_linear_velocity_cmd[1] = speed_y self.last_rot_velocity_cmd = speed_yaw @@ -312,7 +373,6 @@ def moveByWheelsCmd(self, left_speed, front_speed, right_speed): :param front_speed: (float) linear speed of front wheel (meter/s) :param right_speed: (float) linear speed of right wheel (meter/s) """ - # calculate the robot position by omnirobot's kinematic equations # Assume in 1/RL_CONTROL_FREQ, the heading remains the same (not true, # but should be approximately work if RL_CONTROL_FREQ is high enough) @@ -326,7 +386,7 @@ def moveByWheelsCmd(self, left_speed, front_speed, right_speed): local_rot_speed = - self.last_wheel_speeds_cmd[1] / (3.0 * OMNIROBOT_L) \ - self.last_wheel_speeds_cmd[0] / (3.0 * OMNIROBOT_L) \ - self.last_wheel_speeds_cmd[2] / (3.0 * OMNIROBOT_L) - + # translate the last velocity cmd in robot local coordiante to position cmd in gound coordiante cos_direction = np.cos(self.robot_yaw) sin_direction = np.sin(self.robot_yaw) @@ -357,18 +417,16 @@ def normalizeAngle(angle): class OmniRobotSimulatorSocket(OmnirobotManagerBase): def __init__(self, **args): ''' - Simulate the zmq socket like real omnirobot server - :param **args arguments + Simulate the zmq socket like real omnirobot server + :param **args arguments ''' super(OmniRobotSimulatorSocket, self).__init__() - defalt_args = { + default_args = { "back_ground_path": "real_robots/omnirobot_utils/back_ground.jpg", "camera_info_path": CAMERA_INFO_PATH, "robot_marker_path": "real_robots/omnirobot_utils/robot_margin3_pixel_only_tag.png", "robot_marker_margin": [3, 3, 3, 3], - # for black target, use target_margin4_pixel.png", - "target_marker_path": 
"real_robots/omnirobot_utils/red_target_margin4_pixel_480x480.png", "target_marker_margin": [4, 4, 4, 4], "robot_marker_code": None, "target_marker_code": None, @@ -379,30 +437,56 @@ def __init__(self, **args): "init_y": 0, "init_yaw": 0, "origin_size": ORIGIN_SIZE, - "cropped_size": CROPPED_SIZE + "cropped_size": CROPPED_SIZE, + "circular_move": False } # overwrite the args if it exists - self.new_args = {**defalt_args, **args} + self.new_args = {**default_args, **args} + + if self.new_args["simple_continual_target"]: + self.new_args["target_marker_path"] = "real_robots/omnirobot_utils/red_square.png" + + elif self.new_args["circular_continual_move"]: + self.new_args["target_marker_path"] = "real_robots/omnirobot_utils/blue_square.png" + + elif self.new_args["square_continual_move"] or self.new_args["eight_continual_move"]: + self.new_args["target_marker_path"] = "real_robots/omnirobot_utils/green_square.png" + elif self.new_args["chasing_continual_move"] or self.new_args["escape_continual_move"]: + self.new_args["target_marker_path"] = "real_robots/omnirobot_utils/yellow_T.png" + else: + # for black target, use target_margin4_pixel.png", + self.new_args["target_marker_path"] = "real_robots/omnirobot_utils/red_target_margin4_pixel_480x480.png" + + super(OmniRobotSimulatorSocket, self).__init__(simple_continual_target=self.new_args["simple_continual_target"], + circular_continual_move=self.new_args["circular_continual_move"], + square_continual_move=self.new_args["square_continual_move"], + eight_continual_move=self.new_args["eight_continual_move"], + chasing_continual_move=self.new_args["chasing_continual_move"], + escape_continual_move=self.new_args["escape_continual_move"]) assert len(self.new_args['robot_marker_margin']) == 4 assert len(self.new_args['target_marker_margin']) == 4 assert len(self.new_args['output_size']) == 2 - self.robot = OmniRobotEnvRender(**self.new_args) self.episode_idx = 0 self._random_target = self.new_args["random_target"] + if sum((self.new_args["simple_continual_target"] + , self.new_args["chasing_continual_move"] , self.new_args["escape_continual_move"]))>0: + self._random_target = True + self.state_init_override = self.new_args['state_init_override'] self.resetEpisode() # for a random target initial position def resetEpisode(self): """ override the original method - Give the correct sequance of commands to the robot + Give the correct sequence of commands to the robot to rest environment between the different episodes """ if self.second_cam_topic is not None: assert NotImplementedError # Env reset - random_init_position = self.sampleRobotInitalPosition() + random_init_position = self.sampleRobotInitalPosition() if self.state_init_override is None \ + else self.state_init_override self.robot.setRobotCmd( random_init_position[0], random_init_position[1], 0) @@ -410,7 +494,7 @@ def resetEpisode(self): ) * NOISE_VAR_ROBOT_SIZE_PROPOTION + 1.0 # target reset - if self._random_target or self.episode_idx == 0: + if self._random_target: random_init_x = np.random.random_sample() * (TARGET_MAX_X - TARGET_MIN_X) + \ TARGET_MIN_X random_init_y = np.random.random_sample() * (TARGET_MAX_Y - TARGET_MIN_Y) + \ @@ -425,8 +509,9 @@ def resetEpisode(self): def send_json(self, msg): # env send msg to render self.processMsg(msg) - self.robot.renderRobot() + # As for the escape task, the target is moving, we need to update in every time step + self.robot.renderTarget() self.img = self.robot.getCroppedImage() self.img = self.robot.renderEnvLuminosityNoise(self.img, 
noise_var=NOISE_VAR_ENVIRONMENT, in_RGB=False, diff --git a/real_robots/omnirobot_utils/blue_square.png b/real_robots/omnirobot_utils/blue_square.png new file mode 100644 index 000000000..db3e9a12b Binary files /dev/null and b/real_robots/omnirobot_utils/blue_square.png differ diff --git a/real_robots/omnirobot_utils/blue_square_480_480.png b/real_robots/omnirobot_utils/blue_square_480_480.png new file mode 100644 index 000000000..fff1e5622 Binary files /dev/null and b/real_robots/omnirobot_utils/blue_square_480_480.png differ diff --git a/real_robots/omnirobot_utils/green_square.png b/real_robots/omnirobot_utils/green_square.png new file mode 100644 index 000000000..29624d4d7 Binary files /dev/null and b/real_robots/omnirobot_utils/green_square.png differ diff --git a/real_robots/omnirobot_utils/green_triangle_480_480.png b/real_robots/omnirobot_utils/green_triangle_480_480.png new file mode 100644 index 000000000..6bd42711f Binary files /dev/null and b/real_robots/omnirobot_utils/green_triangle_480_480.png differ diff --git a/real_robots/omnirobot_utils/omnirobot_manager_base.py b/real_robots/omnirobot_utils/omnirobot_manager_base.py index c7ca56e17..259783b20 100644 --- a/real_robots/omnirobot_utils/omnirobot_manager_base.py +++ b/real_robots/omnirobot_utils/omnirobot_manager_base.py @@ -1,18 +1,29 @@ from __future__ import division, print_function, absolute_import - +import math from real_robots.constants import * class OmnirobotManagerBase(object): - def __init__(self): + def __init__(self, simple_continual_target=False, circular_continual_move=False, square_continual_move=False, + eight_continual_move=False, chasing_continual_move=False, escape_continual_move=False, + lambda_c=10.0, second_cam_topic=None, state_init_override=None): """ This class is the basic class for omnirobot server, and omnirobot simulator's server. 
This class takes omnirobot position at instant t, and takes the action at instant t, to determinate the position it should go at instant t+1, and the immediate reward it can get at instant t """ super(OmnirobotManagerBase, self).__init__() - self.second_cam_topic = SECOND_CAM_TOPIC + self.second_cam_topic = second_cam_topic self.episode_idx = 0 + self.simple_continual_target = simple_continual_target + self.circular_continual_move = circular_continual_move + self.square_continual_move = square_continual_move + self.eight_continual_move = eight_continual_move + self.chasing_continual_move = chasing_continual_move + self.escape_continual_move = escape_continual_move + self.lambda_c = lambda_c + self.state_init_override = state_init_override + self.step_counter = 0 # the abstract object for robot, # can be the real robot (Omnirobot class) @@ -21,8 +32,8 @@ def __init__(self): def rightAction(self): """ - Let robot excute right action, and checking the boudary - :return has_bumped: (bool) + Let robot execute right action, and checking the boundary + :return has_bumped: (bool) """ if self.robot.robot_pos[1] - STEP_DISTANCE > MIN_Y: self.robot.right() @@ -33,8 +44,8 @@ def rightAction(self): def leftAction(self): """ - Let robot excute left action, and checking the boudary - :return has_bumped: (bool) + Let robot execute left action, and checking the boundary + :return has_bumped: (bool) """ if self.robot.robot_pos[1] + STEP_DISTANCE < MAX_Y: self.robot.left() @@ -45,8 +56,8 @@ def leftAction(self): def forwardAction(self): """ - Let robot excute forward action, and checking the boudary - :return has_bumped: (bool) + Let robot execute forward action, and checking the boundary + :return has_bumped: (bool) """ if self.robot.robot_pos[0] + STEP_DISTANCE < MAX_X: self.robot.forward() @@ -57,8 +68,8 @@ def forwardAction(self): def backwardAction(self): """ - Let robot excute backward action, and checking the boudary - :return has_bumped: (bool) + Let robot execute backward action, and checking the boundary + :return has_bumped: (bool) """ if self.robot.robot_pos[0] - STEP_DISTANCE > MIN_X: self.robot.backward() @@ -69,8 +80,8 @@ def backwardAction(self): def moveContinousAction(self, msg): """ - Let robot excute continous action, and checking the boudary - :return has_bumped: (bool) + Let robot execute continous action, and checking the boundary + :return has_bumped: (bool) """ if MIN_X < self.robot.robot_pos[0] + msg['action'][0] < MAX_X and \ MIN_Y < self.robot.robot_pos[1] + msg['action'][1] < MAX_Y: @@ -80,6 +91,54 @@ def moveContinousAction(self, msg): has_bumped = True return has_bumped + def targetMoveContinousAction(self, target_yaw): + """ + Let robot execute continous action, and checking the boundary + :return has_bumped: (bool) + """ + action = ( + self.robot.step_distance_target * np.cos(target_yaw), self.robot.step_distance_target * np.sin(target_yaw)) + if MIN_X < self.robot.target_pos[0] + action[0] < MAX_X and \ + MIN_Y < self.robot.target_pos[1] + action[1] < MAX_Y: + self.robot.targetMoveContinous(target_yaw) + has_bumped = False + else: + has_bumped = True + return has_bumped + + def targetMoveDiscreteAction(self, target_yaw): + self.robot.targetMoveDiscrete(target_yaw) + + def targetPolicy(self, directed = False): + """ + The policy for the target + :param directed: directed to the robot(agent) + :return: the angle to go for the target + """ + if directed: + dy = self.robot.robot_pos[1] - self.robot.target_pos[1] + np.random.rand() * abs( + self.robot.robot_pos[1] - 
self.robot.target_pos[1]) + dx = self.robot.robot_pos[0] - self.robot.target_pos[0] + np.random.rand() * abs( + self.robot.robot_pos[0] - self.robot.target_pos[0]) + r = math.sqrt(dy**2 + dx**2) + dy /= r + dx /= r + yaw = math.atan2(dy, dx) + #return yaw + if abs(dy) > abs(dx): + if dy > 0: + self.robot.targetMove("left") + else: + self.robot.targetMove("right") + else: + if dx > 0: + self.robot.targetMove("forward") + else: + self.robot.targetMove("backward") + period = 70 + yaw = (2*(self.step_counter % period )/period-1)*np.pi + return yaw + def sampleRobotInitalPosition(self): """ @@ -88,10 +147,10 @@ def sampleRobotInitalPosition(self): random_init_x = np.random.random_sample() * (INIT_MAX_X - INIT_MIN_X) + INIT_MIN_X random_init_y = np.random.random_sample() * (INIT_MAX_Y - INIT_MIN_Y) + INIT_MIN_Y return [random_init_x, random_init_y] - + def resetEpisode(self): """ - Give the correct sequance of commands to the robot + Give the correct sequance of commands to the robot to rest environment between the different episodes """ if self.second_cam_topic is not None: @@ -111,6 +170,10 @@ def processMsg(self, msg): if command == 'reset': action = None self.episode_idx += 1 + self.step_counter = 0 + # empty list of previous states + self.robot.emptyHistory() + self.resetEpisode() elif command == 'action': @@ -124,6 +187,7 @@ def processMsg(self, msg): exit(0) else: raise ValueError("Unknown command: {}".format(msg)) + self.step_counter += 1 has_bumped = False # We are always facing North @@ -139,20 +203,76 @@ def processMsg(self, msg): has_bumped = self.backwardAction() elif action == 'Continuous': has_bumped = self.moveContinousAction(msg) - elif action == None: + elif action is None: pass else: print("Unsupported action: ", action) # Determinate the reward for this step - - # Consider that we reached the target if we are close enough - # we detect that computing the difference in area between TARGET_INITIAL_AREA - # current detected area of the target - if np.linalg.norm(np.array(self.robot.robot_pos) - np.array(self.robot.target_pos)) \ - < DIST_TO_TARGET_THRESHOLD: - self.reward = REWARD_TARGET_REACH - elif has_bumped: - self.reward = REWARD_BUMP_WALL + + if self.circular_continual_move or self.square_continual_move or self.eight_continual_move: + step_counter = msg.get("step_counter", None) + assert step_counter is not None + + self.robot.appendToHistory(self.robot.robot_pos) + + ord = None + if self.square_continual_move or self.eight_continual_move: + ord = np.inf + + if self.circular_continual_move or self.square_continual_move: + self.reward = self.lambda_c * (1 - (np.linalg.norm(self.robot.robot_pos, ord=ord) - RADIUS) ** 2) + + elif self.eight_continual_move: + plus = self.robot.robot_pos[0]**2 + self.robot.robot_pos[1]**2 + minus = 2 * (RADIUS ** 2) * (self.robot.robot_pos[0] ** 2 - self.robot.robot_pos[1] ** 2) + self.reward = self.lambda_c * (1 - (plus - minus) ** 2) + + else: + pass + + if step_counter < self.robot.getHistorySize(): + pass + else: + self.robot.popOfHistory() + self.reward *= np.linalg.norm(self.robot.robot_pos - self.robot.robot_pos_past_k_steps[0]) + + if has_bumped: + self.reward += self.lambda_c * self.lambda_c * REWARD_BUMP_WALL + + elif self.chasing_continual_move: + # The action for target agent + target_yaw = self.targetPolicy() + self.targetMoveContinousAction(target_yaw) + dis = np.linalg.norm(np.array(self.robot.robot_pos) - np.array(self.robot.target_pos)) + if dis < 0.4 and dis > 0.3: + self.reward = REWARD_TARGET_REACH + elif has_bumped: + 
self.reward = REWARD_BUMP_WALL + else: + self.reward = REWARD_NOTHING + elif self.escape_continual_move: + dis = np.linalg.norm(np.array(self.robot.robot_pos) - np.array(self.robot.target_pos)) + if has_bumped or dis < 0.4: + self.reward = REWARD_BUMP_WALL + # elif(dis<0.2): + # self.reward = REWARD_BUMP_WALL + elif(dis >= 0.4): + self.reward =REWARD_TARGET_REACH + else: + self.reward = REWARD_NOTHING + + self.targetPolicy(directed=True) + # self.targetMoveContinousAction(target_yaw) + # self.targetMoveDiscreteAction(target_yaw) else: - self.reward = REWARD_NOTHING + # Consider that we reached the target if we are close enough + # we detect that computing the difference in area between TARGET_INITIAL_AREA + # current detected area of the target + if np.linalg.norm(np.array(self.robot.robot_pos) - np.array(self.robot.target_pos)) \ + < DIST_TO_TARGET_THRESHOLD: + self.reward = REWARD_TARGET_REACH + elif has_bumped: + self.reward = REWARD_BUMP_WALL + else: + self.reward = REWARD_NOTHING diff --git a/real_robots/omnirobot_utils/red_dot_480_480.png b/real_robots/omnirobot_utils/red_dot_480_480.png new file mode 100644 index 000000000..0243e13e9 Binary files /dev/null and b/real_robots/omnirobot_utils/red_dot_480_480.png differ diff --git a/real_robots/omnirobot_utils/red_square.png b/real_robots/omnirobot_utils/red_square.png new file mode 100644 index 000000000..5b6cb4fe3 Binary files /dev/null and b/real_robots/omnirobot_utils/red_square.png differ diff --git a/real_robots/omnirobot_utils/red_target_margin4_pixel_480x4801.png b/real_robots/omnirobot_utils/red_target_margin4_pixel_480x4801.png new file mode 100644 index 000000000..d49298b80 Binary files /dev/null and b/real_robots/omnirobot_utils/red_target_margin4_pixel_480x4801.png differ diff --git a/real_robots/omnirobot_utils/yellow_T.jpg b/real_robots/omnirobot_utils/yellow_T.jpg new file mode 100644 index 000000000..6f7496527 Binary files /dev/null and b/real_robots/omnirobot_utils/yellow_T.jpg differ diff --git a/real_robots/omnirobot_utils/yellow_T.png b/real_robots/omnirobot_utils/yellow_T.png new file mode 100644 index 000000000..79c359c1f Binary files /dev/null and b/real_robots/omnirobot_utils/yellow_T.png differ diff --git a/real_robots/omnirobot_utils/yellow_Ttt.png b/real_robots/omnirobot_utils/yellow_Ttt.png new file mode 100644 index 000000000..d48fad228 Binary files /dev/null and b/real_robots/omnirobot_utils/yellow_Ttt.png differ diff --git a/real_robots/omnirobot_utils/zombie.png b/real_robots/omnirobot_utils/zombie.png new file mode 100644 index 000000000..84ad6d673 Binary files /dev/null and b/real_robots/omnirobot_utils/zombie.png differ diff --git a/real_robots/omnirobot_utils/zombie2.png b/real_robots/omnirobot_utils/zombie2.png new file mode 100644 index 000000000..39b49325b Binary files /dev/null and b/real_robots/omnirobot_utils/zombie2.png differ diff --git a/replay/aggregate_plots.py b/replay/aggregate_plots.py index 0fe7d5f8a..cd8197f08 100644 --- a/replay/aggregate_plots.py +++ b/replay/aggregate_plots.py @@ -206,3 +206,4 @@ def plotGatheredExperiments(folders, algo, y_limits, window=40, title="", min_nu plotGatheredExperiments(folders, train_args['algo'], y_limits=y_limits, window=args.episode_window, title=title, min_num_x=args.min_x, no_display=args.no_display, timesteps=args.timesteps, output_file=args.output_file) +#python -m replay.aggregate_plots --log-dir logs/OmnirobotEnv-v0/srl_combination/ppo2/ --timesteps --min-x 1000 -o logs/plot/ \ No newline at end of file diff --git a/replay/cross_eval_plot.py 
b/replay/cross_eval_plot.py new file mode 100644 index 000000000..4fc2d42dc --- /dev/null +++ b/replay/cross_eval_plot.py @@ -0,0 +1,156 @@ +""" +Plot past experiment in visdom +""" +import argparse +import os +import pickle +import matplotlib.pyplot as plt +import numpy as np +import seaborn as sns +from matplotlib import rc +from matplotlib.ticker import FuncFormatter +from replay.aggregate_plots import lightcolors, darkcolors, Y_LIM_SHAPED_REWARD, Y_LIM_SPARSE_REWARD, millions +from rl_baselines.cross_eval import dict2array +from rl_baselines.visualize import smoothRewardCurve +# Init seaborn +sns.set() +# Style for the title +fontstyle = {'fontname': 'DejaVu Sans', 'fontsize': 24, 'fontweight': 'bold'} +rc('font', weight='bold') + + +def crossEvalPlot(res, tasks, title, y_limits , truncate_x, plot_mean=False, timesteps=False): + + index_x = -1 + episodes = res[:,:,0][0] + if (truncate_x>-1): + for eps in episodes: + index_x += 1 + if(eps >= truncate_x): + break + if(index_x ==-1 ): + y_array = res[:, :, 1:] + x = res[:, :, 0][0] + else: + y_array = res[:, :index_x, 1:] + x = res[:, :index_x, 0][0] + if(timesteps): + x = 250 * x + sum_mean = [] + sum_s = [] + sum_n = [] + + fig = plt.figure(title) + for i in range(len(y_array)): + label = tasks[i] + y = y_array[i].T + print('{}: {} experiments'.format(label, len(y))) + # Compute mean for different seeds + m = np.mean(y, axis=0) + # Compute standard error + s = np.squeeze(np.asarray(np.std(y, axis=0))) + n = y.shape[0] + sum_mean +=[m] + sum_s +=[s] + sum_n +=[n] + plt.fill_between(x, m - s / np.sqrt(n), m + s / np.sqrt(n), color=lightcolors[i % len(lightcolors)], alpha=0.5) + plt.plot(x, m, color=darkcolors[i % len(darkcolors)], label=label, linewidth=2) + + #reward_sum = np.concatenate([res[0, :index_x, 1:], res[1, :index_x, 1:]], axis=1) + + if(plot_mean): + + m = np.mean(sum_mean, axis=0) + # Compute standard error + s = np.mean(sum_s) + n = np.mean(n) + plt.fill_between(x, m - s / np.sqrt(n), m + s / np.sqrt(n), color=lightcolors[4 % len(lightcolors)], alpha=0.5) + plt.plot(x, m, color=darkcolors[4 % len(darkcolors)], label='mean reward', linewidth=2) + + if timesteps: + formatter = FuncFormatter(millions) + plt.xlabel('Number of Timesteps') + fig.axes[0].xaxis.set_major_formatter(formatter) + else: + plt.xlabel('Number of Episodes') + plt.ylabel('Normalized Rewards', fontsize=15 , fontweight='bold') + + plt.title(title, **fontstyle) + if (y_limits[0] != y_limits[1]): + plt.ylim(y_limits) + + plt.legend(framealpha=0.8, frameon=True, labelspacing=0.01, loc='lower right', fontsize=20) + plt.show() + + +def smoothPlot(res, tasks, title, y_limits,timesteps): + y = np.mean(res[:, :, 1:], axis=2) + # y = np.sort(res[:, :, 1:], axis=2) + # y = np.mean(y[:,:,1:],axis=2) + x = res[:, :, 0][0] + if(timesteps): + x = x*250 + print(y.shape, x.shape) + fig = plt.figure(title) + for i in range(len(y)): + label = tasks[i] + tmp_x, tmp_y = smoothRewardCurve(x, y[i], conv_len=2) + plt.plot(tmp_x, tmp_y, color=darkcolors[i % len(darkcolors)], label=label, linewidth=2) + if (timesteps): + formatter = FuncFormatter(millions) + plt.xlabel('Number of Timesteps') + fig.axes[0].xaxis.set_major_formatter(formatter) + else: + plt.xlabel('Number of Episodes') + plt.ylabel('Rewards', fontsize=20, fontweight='bold') + + plt.title(title, **fontstyle) + if (y_limits[0] != y_limits[1]): + plt.ylim(y_limits) + + plt.legend(framealpha=0.8, frameon=True, labelspacing=0.01, loc='lower right', fontsize=18) + plt.show() + + + +#Example command: +# python -m 
replay.cross_eval_plot -i logs/sc2cc/OmnirobotEnv-v0/srl_combination/ppo2/19-04-29_14h59_35/eval.pkl +if __name__ == '__main__': + parser = argparse.ArgumentParser(description="Plot the learning curve during a training for different tasks") + parser.add_argument('-i', '--input-path', help='folder with the plots as pkl files', type=str, required=True) + parser.add_argument('-t', '--title', help='Plot title', type=str, default='Learning Curve') + + parser.add_argument('--y-lim', nargs=2, type=float, default=[-1, -1], help="limits for the y axis") + parser.add_argument('--truncate-x', type=int, default=-1, + help="Truncate the experiments after n ticks on the x-axis (default: -1, no truncation)") + parser.add_argument('--eval-tasks', type=str, nargs='+', default=['cc', 'esc','sc'], + help='A cross evaluation from the latest stored model to all tasks') + parser.add_argument('-s','--smooth', action='store_true', default=False, + help='Plot with a smooth mode') + parser.add_argument('--timesteps', action = 'store_true',default=False, + help='Plot with timesteps') + + args = parser.parse_args() + + + load_path = args.input_path + title = args.title + y_limits = args.y_lim + tasks = args.eval_tasks + truncate_x= args.truncate_x + + + assert (os.path.isfile(load_path) and load_path.split('.')[-1]=='pkl'), 'Please load a valid .pkl file' + + + with open(load_path, "rb") as file: + data = pickle.load(file) + + + res = dict2array(args.eval_tasks, data) + + print("{} episodes evaluations to plot".format(res.shape[1])) + if(args.smooth): + smoothPlot(res[:,1:],tasks,title,y_limits,args.timesteps) + else: + crossEvalPlot(res[:,:], tasks, title,y_limits, truncate_x,args.timesteps) diff --git a/replay/enjoy_baselines.py b/replay/enjoy_baselines.py index 2405e31dd..78c09dccf 100644 --- a/replay/enjoy_baselines.py +++ b/replay/enjoy_baselines.py @@ -6,12 +6,10 @@ import os from datetime import datetime -import yaml import numpy as np import tensorflow as tf from stable_baselines.common import set_global_seeds import matplotlib.pyplot as plt -from mpl_toolkits.mplot3d import Axes3D from sklearn.decomposition import PCA import seaborn as sns @@ -60,6 +58,24 @@ def parseArguments(): help='display in the latent space the current observation.') parser.add_argument('--action-proba', action='store_true', default=False, help='display the probability of actions') + parser.add_argument('-sc', '--simple-continual', action='store_true', default=False, + help='Simple red square target for task 1 of continual learning scenario. ' + + 'The task is: robot should reach the target.') + parser.add_argument('-cc', '--circular-continual', action='store_true', default=False, + help='Blue square target for task 2 of continual learning scenario. ' + + 'The task is: robot should turn in circle around the target.') + parser.add_argument('-sqc', '--square-continual', action='store_true', default=False, + help='Green square target for task 3 of continual learning scenario. 
' + + 'The task is: robot should turn in square around the target.') + parser.add_argument('-chc', '--chasing-continual', action='store_true', default=False, + help='Two chasing robots in the same domain of environment' + + 'The task is: one robot should keep a certain distance towards the other.') + parser.add_argument('-esc', '--escape-continual', action='store_true', default=False, + help='Two chasing robots in the same domain of environment' + + 'The task is: the trainable agent tries to escape from the "zombie" robot.') + args, unknown = parser.parse_known_args() + assert sum([args.simple_continual, args.circular_continual, args.square_continual]) <= 1, \ + "For continual SRL and RL, please provide only one scenario at the time and use OmnirobotEnv-v0 environment !" return parser.parse_args() @@ -79,7 +95,19 @@ def loadConfigAndSetup(load_args): raise ValueError(algo_name + " is not supported for replay") printGreen("\n" + algo_name + "\n") - load_path = "{}/{}_model.pkl".format(load_args.log_dir, algo_name) + try: # If args contains episode information, this is for student_evaluation (distillation) + if not load_args.episode == -1: + load_path = "{}/{}_{}_model.pkl".format(load_args.log_dir, algo_name, load_args.episode,) + else: + load_path = "{}/{}_model.pkl".format(load_args.log_dir, algo_name) + except: + printYellow( + "No episode of checkpoint specified, go for the default policy model: {}_model.pkl".format(algo_name)) + if load_args.log_dir[-3:] != 'pkl': + load_path = "{}/{}_model.pkl".format(load_args.log_dir, algo_name) + else: + load_path = load_args.log_dir + load_args.log_dir = os.path.dirname(load_path)+'/' env_globals = json.load(open(load_args.log_dir + "env_globals.json", 'r')) train_args = json.load(open(load_args.log_dir + "args.json", 'r')) @@ -103,6 +131,24 @@ def loadConfigAndSetup(load_args): else: env_kwargs["force_down"] = env_globals.get('force_down', False) + if train_args["env"] == "OmnirobotEnv-v0": + env_kwargs["simple_continual_target"] = env_globals.get("simple_continual_target", False) + env_kwargs["circular_continual_move"] = env_globals.get("circular_continual_move", False) + env_kwargs["square_continual_move"] = env_globals.get("square_continual_move", False) + env_kwargs["eight_continual_move"] = env_globals.get("eight_continual_move", False) + env_kwargs["chasing_continual_move"] = env_globals.get("chasing_continual_move", False) + env_kwargs["escape_continual_move"] = env_globals.get("escape_continual_move", False) + # If overriding the environment for specific Continual Learning tasks + if sum([load_args.simple_continual, load_args.circular_continual, + load_args.escape_continual, load_args.chasing_continual, + load_args.square_continual]) >= 1: + env_kwargs["simple_continual_target"] = load_args.simple_continual + env_kwargs["circular_continual_move"] = load_args.circular_continual + env_kwargs["square_continual_move"] = load_args.square_continual + env_kwargs["chasing_continual_move"] =load_args.chasing_continual + env_kwargs["escape_continual_move"] = load_args.escape_continual + env_kwargs["random_target"] = not (load_args.circular_continual or load_args.square_continual) + srl_model_path = None if train_args["srl_model"] != "raw_pixels": train_args["policy"] = "mlp" @@ -111,7 +157,8 @@ def loadConfigAndSetup(load_args): if path is not None: env_kwargs["use_srl"] = True # Check that the srl saved model exists on the disk - assert os.path.isfile(env_globals['srl_model_path']), "{} does not exist".format(env_globals['srl_model_path']) + 
assert os.path.isfile(env_globals['srl_model_path']), \ + "{} does not exist".format(env_globals['srl_model_path']) srl_model_path = env_globals['srl_model_path'] env_kwargs["srl_model_path"] = srl_model_path @@ -161,6 +208,7 @@ def main(): # createTensorflowSession() printYellow("Compiling Policy function....") + printYellow(load_path) method = algo_class.load(load_path, args=algo_args) dones = [False for _ in range(load_args.num_cpu)] @@ -325,10 +373,11 @@ def main(): n_done += np.sum(dones) if (n_done - last_n_done) > 1: + last_n_done = n_done _, mean_reward = computeMeanReward(log_dir, n_done) print("{} episodes - Mean reward: {:.2f}".format(n_done, mean_reward)) - + print("print: ", n_done, log_dir) _, mean_reward = computeMeanReward(log_dir, n_done) print("{} episodes - Mean reward: {:.2f}".format(n_done, mean_reward)) diff --git a/replay/pipeline.py b/replay/pipeline.py new file mode 100644 index 000000000..91939bdc3 --- /dev/null +++ b/replay/pipeline.py @@ -0,0 +1,250 @@ +import argparse +import os + +import matplotlib.pyplot as plt +import numpy as np +import seaborn as sns +from matplotlib.ticker import FuncFormatter +import json + +from rl_baselines.visualize import movingAverage, loadCsv,loadData +from replay.aggregate_plots import lightcolors, darkcolors, Y_LIM_SHAPED_REWARD, Y_LIM_SPARSE_REWARD, millions +from srl_zoo.utils import printGreen, printRed, printYellow + +# Init seaborn +sns.set() +# Style for the title +fontstyle = {'fontname': 'DejaVu Sans', 'fontsize': 16} + + + + +def loadEpisodesData(folder): + """ + :param folder: (str) + :return: (numpy array, numpy array) or (None, None) + """ + result, _ = loadCsv(folder) + + if len(result) == 0: + return None, None + + y = np.array(result)[:, 1] + x = np.arange(len(y)) + return x, y + + +def plotGatheredData(x_list,y_list,y_limits, timesteps,title,legends,no_display,truncate_x=-1,normalization=False): + assert len(legends)==len(y_list) + printGreen("{} Experiments".format(len(y_list))) + + lengths = list(map(len, x_list)) + min_x, max_x = np.min(lengths), np.max(lengths) + if truncate_x > 0: + min_x = min(truncate_x, min_x) + x = np.array(x_list[0][:min_x]) + #To reformulize the data by the min_x + for i in range(len(y_list)): + y_list[i]=y_list[i][:, :min_x] + y_list=np.array(y_list) + + #print("Min, Max rewards:", np.min(y_list), np.max(y_list)) + + + #Normalize the data between 0 and 1. 
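+    # The rescaling below is a global min-max normalization:
+    #   y_norm = (y - min(Y)) / (max(Y) - min(Y))
+    # computed over all curves at once (not per curve), so every curve is mapped
+    # into [0, 1] while the relative ordering between experiments is preserved.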
+ if (normalization): + y_limits = [-0.05, 1.05] + y_list =(y_list-np.min(y_list))/(np.max(y_list)-np.min(y_list)) + + fig = plt.figure(title) + for i in range(len(y_list)): + label = legends[i] + y = y_list[i][:, :min_x] + + print('{}: {} experiments'.format(label, len(y))) + # Compute mean for different seeds + m = np.mean(y, axis=0) + # Compute standard error + s = np.squeeze(np.asarray(np.std(y, axis=0))) + n = y.shape[0] + plt.fill_between(x, m - s / np.sqrt(n), m + s / np.sqrt(n), color=lightcolors[i % len(lightcolors)], alpha=0.5) + plt.plot(x, m, color=darkcolors[i % len(darkcolors)], label=label, linewidth=2) + + if timesteps: + formatter = FuncFormatter(millions) + plt.xlabel('Number of Timesteps') + fig.axes[0].xaxis.set_major_formatter(formatter) + else: + plt.xlabel('Number of Episodes') + if(normalization): + plt.ylabel('Normalized Rewards') + else: + plt.ylabel('Rewards') + plt.title(title, **fontstyle) + plt.ylim(y_limits) + + plt.legend(framealpha=0.8, frameon=True, labelspacing=0.01, loc='lower right', fontsize=16) + + if not no_display: + plt.show() + + + + +def GatherExperiments(folders, algo, window=40, title="", min_num_x=-1, + timesteps=False, output_file="",): + """ + Compute mean and standard error for several experiments and plot the learning curve + :param folders: ([str]) Log folders, where the monitor.csv are stored + :param window: (int) Smoothing window + :param algo: (str) name of the RL algo + :param title: (str) plot title + :param min_num_x: (int) Minimum number of episode/timesteps to keep an experiment (default: -1, no minimum) + :param timesteps: (bool) Plot timesteps instead of episodes + :param y_limits: ([float]) y-limits for the plot + :param output_file: (str) Path to a file where the plot data will be saved + :param no_display: (bool) Set to true, the plot won't be displayed (useful when only saving plot) + """ + y_list = [] + x_list = [] + ok = False + for folder in folders: + printRed("folder name:{}".format(folder)) + if timesteps: + x, y = loadData(folder, smooth=1, bin_size=100) + if x is not None: + x, y = np.array(x), np.array(y) + else: + x, y = loadEpisodesData(folder) + + if x is None or (min_num_x > 0 and y.shape[0] < min_num_x): + printYellow("Skipping {}".format(folder)) + continue + + if y.shape[0] <= window: + printYellow("Folder {}".format(folder)) + printYellow("Not enough episodes for current window size = {}".format(window)) + continue + ok = True + y = movingAverage(y, window) + y_list.append(y) + print(len(x)) + # Truncate x + x = x[len(x) - len(y):] + x_list.append(x) + + if not ok: + printRed("Not enough data to plot anything with current config." 
+ + " Consider decreasing --min-x") + return + + lengths = list(map(len, x_list)) + min_x, max_x = np.min(lengths), np.max(lengths) + + print("Min x: {}".format(min_x)) + print("Max x: {}".format(max_x)) + + for i in range(len(x_list)): + x_list[i] = x_list[i][:min_x] + y_list[i] = y_list[i][:min_x] + + x = np.array(x_list)[0] + y = np.array(y_list) + # if output_file != "": + # printGreen("Saving aggregated data to {}.npz".format(output_file)) + # np.savez(output_file, x=x, y=y) + return x,y + + +def comparePlots(path, algo,y_limits,title="Learning Curve", + timesteps=False, truncate_x=-1, no_display=False,normalization=False): + """ + :param path: (str) path to the folder where the plots are stored + :param plots: ([str]) List of saved plots as npz file + :param y_limits: ([float]) y-limits for the plot + :param title: (str) plot title + :param timesteps: (bool) Plot timesteps instead of episodes + :param truncate_x: (int) Truncate the experiments after n ticks on the x-axis + :param no_display: (bool) Set to true, the plot won't be displayed (useful when only saving plot) + """ + + folders = [] + other = [] + legends=[] + for folder in os.listdir(path): + folders_srl=[] + other_srl=[] + tmp_path = "{}/{}/{}/".format(path, folder, algo) + legends.append(folder) + for f in os.listdir(tmp_path): + paths = "{}/{}/{}/{}/".format(path, folder, algo,f) + env_globals = json.load(open(paths + "env_globals.json", 'r')) + train_args = json.load(open(paths + "args.json", 'r')) + if train_args["shape_reward"] == args.shape_reward: + folders_srl.append(paths) + else: + other_srl.append(paths) + folders.append(folders_srl) + other.append(other_srl) + + + x_list,y_list=[],[] + for folders_srl in folders: + printGreen("Folder name {}".format(folders_srl)) + x,y=GatherExperiments(folders_srl, algo, window=40, title=title, min_num_x=-1, + timesteps=timesteps, output_file="") + print(len(x)) + x_list.append(x) + y_list.append(y) + printGreen(np.array(x_list).shape) + # printGreen('y_list shape {}'.format(np.array(y_list[1]).shape)) + + plotGatheredData(x_list,y_list,y_limits,timesteps,title,legends,no_display,truncate_x,normalization) + + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description="Plot trained agent") + parser.add_argument('-i', '--input-dir', help='folder with the plots as npz files', type=str, required=True) + parser.add_argument('-t', '--title', help='Plot title', type=str, default='Learning Curve') + parser.add_argument('--episode_window', type=int, default=40, + help='Episode window for moving average plot (default: 40)') + parser.add_argument('--shape-reward', action='store_true', default=False, + help='Change the y_limit to correspond shaped reward bounds') + parser.add_argument('--y-lim', nargs=2, type=float, default=[-1, -1], help="limits for the y axis") + parser.add_argument('--truncate-x', type=int, default=-1, + help="Truncate the experiments after n ticks on the x-axis (default: -1, no truncation)") + parser.add_argument('--timesteps', action='store_true', default=False, + help='Plot timesteps instead of episodes') + parser.add_argument('--no-display', action='store_true', default=False, help='Do not display plot') + parser.add_argument('--algo',type=str,default='ppo2',help='The RL algorithms result to show') + parser.add_argument('--norm', action='store_true', default=False, help='To normalize the output by the maximum reward') + # + # parser.add_argument('--tasks', type=str, nargs='+', default=["cc"], + # help='The tasks for the robot', + # choices=["cc", 
"ec", "sqc", "sc"]) + + + + args = parser.parse_args() + + y_limits = args.y_lim + if y_limits[0] == y_limits[1]: + if args.shape_reward: + y_limits = Y_LIM_SHAPED_REWARD + else: + y_limits = Y_LIM_SPARSE_REWARD + print("Using default limits:", y_limits) + + + ALGO_NAME=args.algo + + + x_list=[] + y_list=[] + + comparePlots(args.input_dir, args.algo, title=args.title, y_limits=y_limits, no_display=args.no_display, + timesteps=args.timesteps, truncate_x=args.truncate_x,normalization=args.norm) + + +#python -m replay.pipeline -i logs/OmnirobotEnv-v0 --algo ppo2 --title cc --timesteps \ No newline at end of file diff --git a/rl_baselines/base_classes.py b/rl_baselines/base_classes.py index 9e417c053..a1b06d901 100644 --- a/rl_baselines/base_classes.py +++ b/rl_baselines/base_classes.py @@ -1,6 +1,6 @@ import os import pickle - +import re from stable_baselines.common.policies import CnnPolicy, CnnLstmPolicy, CnnLnLstmPolicy, MlpPolicy, MlpLstmPolicy, \ MlpLnLstmPolicy @@ -124,10 +124,15 @@ def save(self, save_path, _locals=None): :param save_path: (str) :param _locals: (dict) local variable from callback, if present """ + assert self.model is not None, "Error: must train or load model before use" - model_save_name = self.name + ".pkl" - if os.path.basename(save_path) == model_save_name: - model_save_name = self.name + "_model.pkl" + + episode = os.path.basename(save_path).split('_')[-2] + if bool(re.search('[a-z]', episode)): + # That means this is not a episode, it is a algo name + model_save_name = self.name + ".pkl" + else: + model_save_name = self.name + '_' + episode + ".pkl" self.model.save(os.path.dirname(save_path) + "/" + model_save_name) save_param = { @@ -160,9 +165,12 @@ def load(cls, load_path, args=None): loaded_model = cls() loaded_model.__dict__ = {**loaded_model.__dict__, **save_param} - model_save_name = loaded_model.name + ".pkl" - if os.path.basename(load_path) == model_save_name: - model_save_name = loaded_model.name + "_model.pkl" + episode = os.path.basename(load_path).split('_')[-2] + if bool(re.search('[a-z]', episode)): + # That means this is not a episode, it is a algo name + model_save_name = loaded_model.name + ".pkl" + else: + model_save_name = loaded_model.name + '_' + episode + ".pkl" loaded_model.model = loaded_model.model_class.load(os.path.dirname(load_path) + "/" + model_save_name) loaded_model.states = loaded_model.model.initial_state @@ -219,7 +227,11 @@ def train(self, args, callback, env_kwargs=None, train_kwargs=None): :param env_kwargs: (dict) The extra arguments for the environment :param train_kwargs: (dict) The list of all training agruments (used in hyperparameter search) """ - envs = self.makeEnv(args, env_kwargs=env_kwargs) + if self.load_rl_model_path is not None: + load_path_normalise = os.path.dirname(self.load_rl_model_path) + envs = self.makeEnv(args, env_kwargs=env_kwargs,load_path_normalise=load_path_normalise) + else: + envs = self.makeEnv(args, env_kwargs=env_kwargs) if train_kwargs is None: train_kwargs = {} diff --git a/rl_baselines/cross_eval.py b/rl_baselines/cross_eval.py new file mode 100644 index 000000000..9b34862d3 --- /dev/null +++ b/rl_baselines/cross_eval.py @@ -0,0 +1,190 @@ + +import subprocess +import numpy as np +import pickle +import argparse +import os + +from rl_baselines.student_eval import allPolicyFiles +from srl_zoo.utils import printRed, printGreen +from rl_baselines.evaluation.cross_eval_utils import EnvsKwargs, loadConfigAndSetup, policyEval,createEnv +from shutil import copyfile + + +def dict2array(tasks,data): 
+ """ + Convert the dictionary data set into a plotable array + :param tasks: ['sc','cc'], the task key in the dictionary + :param data: the dict itself + :return: + """ + res=[] + for t in tasks: + if(t!='cc'): + max_reward=250 + min_reward = 0 + else: + max_reward = 1920 + min_reward = 0 + + data[t]=np.array(data[t]).astype(float) + data[t][:,1:]=(data[t][:,1:]-min_reward)/max_reward + res.append(data[t]) + res=np.array(res) + return res + +def episodeEval(log_dir, tasks,num_timesteps=1000,num_cpu=1): + for t in tasks: + eval_args=['--log-dir', log_dir, '--num-timesteps', str(num_timesteps), '--num-cpu',str(5)] + task_args=['--task',t] + + subprocess.call(['python', '-m', 'rl_baselines.cross_eval_utils']+eval_args+task_args) + file_name=log_dir+'episode_eval.pkl' + with open(file_name, 'rb') as f: + eval_reward = pickle.load(f) + #Trasfer the data from dict into a numpy array and save + eval_reward=dict2array(tasks,eval_reward) + file_name=log_dir+'episode_eval.npy' + np.save(file_name, eval_reward) + + +def policyCrossEval(log_dir,task,episode,model_path, num_timesteps=2000,num_cpu=1): + """ + To do a cross evaluation for a certain policy for different tasks + A version of real time evaluation but with some bugs to fix + :param log_dir: + :param task: + :param episode: + :param model_path: + :param num_timesteps: How many timesteps to evaluate the policy + :param num_cpu: + :return: + """ + train_args, algo_name, algo_class, srl_model_path, env_kwargs = loadConfigAndSetup(log_dir) + env_kwargs = EnvsKwargs(task, env_kwargs) + + OK = True + if (not OK): + # no latest model saved yet + return None, False + else: + pass + printGreen( + "Evaluation from the model saved at: {}, with evaluation time steps: {}".format(model_path, num_timesteps)) + + log_dir, environment, algo_args = createEnv(log_dir, train_args, algo_name, algo_class, env_kwargs, num_cpu=num_cpu) + + reward = policyEval(environment, model_path, log_dir, algo_class, algo_args, num_timesteps, num_cpu) + + # Just a trick to save the episode number of the reward,but need a little bit more space to store + reward = np.append(episode, reward) + return reward, True + + +#Example commands: +#python -m rl_baselines.cross_eval --log-dir logs/sc2cc/OmnirobotEnv-v0/srl_combination/ppo2/19-05-03_11h35_10/ --num-iteration 5 + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description="Evaluation after training") + parser.add_argument('--log-dir',type=str, default='' + ,help='RL algo to use') + parser.add_argument('--num-iteration', type=int, default=5, + help='number of time each algorithm should be run the eval (N seeds).') + parser.add_argument('--scheduler',type = int, default=1, + help='A step scheduler for the evaluation') + + args, unknown = parser.parse_known_args() + + + log_dir = args.log_dir + #log_dir = 'logs/sc2cc/OmnirobotEnv-v0/srl_combination/ppo2/19-05-03_11h35_10/' + + episodes, policy_paths = allPolicyFiles(log_dir) + index_to_begin =0 + # The interval to skip, how many times we skip the evaluate + # For example: if interval = 4 and episode, then the evaluation be be performed each 4* saved_checkpoint_episode + interval_len = args.scheduler + + + #To verify if the episodes have been evaluated before + if(os.path.isfile(args.log_dir+'/eval.pkl')): + with open(args.log_dir+'/eval.pkl', "rb") as file: + rewards = pickle.load(file) + + max_eps = max(np.array(rewards['episode']).astype(int)) + index_to_begin = episodes.astype(int).tolist().index(max_eps)+1 + + + else: + task_labels = ['cc', 'sc','esc'] + 
rewards = {} + rewards['episode'] = [] + rewards['policy'] = [] + for t in ['cc', 'sc','esc']: + rewards[t] = [] + + + for policy_path in policy_paths[index_to_begin:]: + copyfile(log_dir+'/args.json', policy_path+'/args.json') + copyfile(log_dir + '/env_globals.json', policy_path + '/env_globals.json') + + printGreen("The evaluation will begin from {}".format(episodes[index_to_begin])) + + last_mean = [250.,250,1900] + run_mean = [0,0,0] + + + for k in range(index_to_begin, len(episodes) ,interval_len): + # if(interval_len > 1 and int(episodes[k])>=episode_schedule): + # k += interval_len-1 + printGreen("Evaluation for episode: {}".format(episodes[k])) + increase_interval = True + + model_path=policy_paths[k] + + for t , task_label in enumerate(["-esc","-sc", "-cc" ]): + + local_reward = [int(episodes[k])] + + for seed_i in range(args.num_iteration): + command_line_enjoy_student = ['python', '-m', 'replay.enjoy_baselines', '--num-timesteps', '251', + '--log-dir', model_path, task_label, "--seed", str(seed_i)] + ok = subprocess.check_output(command_line_enjoy_student) + ok = ok.decode('utf-8') + str_before = "Mean reward: " + str_after = "\npybullet" + idx_before = ok.find(str_before) + len(str_before) + idx_after = ok.find(str_after) + seed_reward = float(ok[idx_before: idx_after]) + + local_reward.append(seed_reward) + printGreen(local_reward) + printRed("current rewards {} at episode {} with random seed: {} for task {}".format( + np.mean(seed_reward), episodes[k], seed_i, task_label)) + + rewards[task_label[1:]].append(local_reward) + run_mean[t] = np.mean(local_reward[1:]) + + + + # If one of the two mean rewards varies more thant 1%, then we do not increase the evaluation interval + for t in range(len(run_mean)): + if(run_mean[t] > 1.01 * last_mean[t] or run_mean[t] <0.99 *last_mean[t]): + increase_interval = False + + printGreen("Reward now: {}, last Rewards: {} for sc and cc respectively".format(run_mean, last_mean)) + # If the mean reward varies slowly, we increase the length of the evaluation interval + if (increase_interval): + current_eps = episodes[k] + k = k + 5 + printGreen("Reward at current episode {} varies slow, change to episode {} for next evaluation" + .format(current_eps, episodes[k])) + + last_mean = run_mean.copy() + + rewards['episode'].append(int(episodes[k])) + rewards['policy'].append(model_path) + with open(args.log_dir+'/eval.pkl', 'wb') as f: + pickle.dump(rewards, f, pickle.HIGHEST_PROTOCOL) + + diff --git a/rl_baselines/evaluation/cross_eval_utils.py b/rl_baselines/evaluation/cross_eval_utils.py new file mode 100644 index 000000000..b57856cb3 --- /dev/null +++ b/rl_baselines/evaluation/cross_eval_utils.py @@ -0,0 +1,304 @@ + +import glob +import os +import json +import numpy as np +import tensorflow as tf +import pickle +from datetime import datetime + + +from rl_baselines.utils import WrapFrameStack,computeMeanReward,printGreen +from srl_zoo.utils import printRed +from stable_baselines.common import set_global_seeds +from rl_baselines import AlgoType +from rl_baselines.registry import registered_rl + + + + +def loadConfigAndSetup(log_dir): + """ + load training variable from a pre-trained model + :param log_dir: the path where the model is located, + example: logs/sc2cc/OmnirobotEnv-v0/srl_combination/ppo2/19-05-07_11h32_39 + :return: train_args, algo_name, algo_class(stable_baselines.PPO2), srl_model_path, env_kwargs + """ + algo_name = "" + for algo in list(registered_rl.keys()): + if algo in log_dir: + algo_name = algo + break + algo_class, algo_type, _ 
= registered_rl[algo_name] + if algo_type == AlgoType.OTHER: + raise ValueError(algo_name + " is not supported for evaluation") + + env_globals = json.load(open(log_dir + "env_globals.json", 'r')) + train_args = json.load(open(log_dir + "args.json", 'r')) + env_kwargs = { + "renders":False, + "shape_reward": False, #TODO, since we dont use simple target, we should elimanate this choice? + "action_joints": train_args["action_joints"], + "is_discrete": not train_args["continuous_actions"], + "random_target": train_args.get('random_target', False), + "srl_model": train_args["srl_model"] + } + + # load it, if it was defined + if "action_repeat" in env_globals: + env_kwargs["action_repeat"] = env_globals['action_repeat'] + + # Remove up action + if train_args["env"] == "Kuka2ButtonGymEnv-v0": + env_kwargs["force_down"] = env_globals.get('force_down', True) + else: + env_kwargs["force_down"] = env_globals.get('force_down', False) + + if train_args["env"] == "OmnirobotEnv-v0": + env_kwargs["simple_continual_target"] = env_globals.get("simple_continual_target", False) + env_kwargs["circular_continual_move"] = env_globals.get("circular_continual_move", False) + env_kwargs["square_continual_move"] = env_globals.get("square_continual_move", False) + env_kwargs["eight_continual_move"] = env_globals.get("eight_continual_move", False) + + srl_model_path = None + if train_args["srl_model"] != "raw_pixels": + train_args["policy"] = "mlp" + path = env_globals.get('srl_model_path') + + if path is not None: + env_kwargs["use_srl"] = True + # Check that the srl saved model exists on the disk + assert os.path.isfile(env_globals['srl_model_path']), "{} does not exist".format(env_globals['srl_model_path']) + srl_model_path = env_globals['srl_model_path'] + env_kwargs["srl_model_path"] = srl_model_path + + return train_args, algo_name, algo_class, srl_model_path, env_kwargs + +def EnvsKwargs(task,env_kwargs): + """ + create several environments kwargs + :param tasks: the task we need the omnirobot to perform + :param env_kwargs: the original env_kwargs from previous pre-trained odel + :return: a list of env_kwargs that has the same length as tasks + """ + t=task + tmp=env_kwargs.copy() + tmp['simple_continual_target'] = False + tmp['circular_continual_move'] = False + tmp['square_continual_move'] = False + tmp['eight_continual_move'] = False + + + if (t=='sc'): + tmp['simple_continual_target']=True + + elif (t=='cc'): + tmp['circular_continual_move']=True + elif (t=='sqc'): + tmp['square_continual_move']=True + elif (t=='ec'): + tmp['eight_continual_move']=True + + return tmp + +def createEnv( model_dir,train_args, algo_name, algo_class, env_kwargs, log_dir="/tmp/gym/test/",num_cpu=1,seed=0): + """ + create the environment from env)kwargs + :param model_dir: The file name of the file which contains the pkl + :param train_args: + :param algo_name: + :param algo_class: + :param env_kwargs: + :param log_dir: + :param num_cpu: + :param seed: + :return: + """ + # Log dir for testing the agent + log_dir += "{}/{}/".format(algo_name, datetime.now().strftime("%y-%m-%d_%Hh%M_%S_%f")) + os.makedirs(log_dir, exist_ok=True) + args = { + "env": train_args['env'], + "seed":seed, + "num_cpu": num_cpu, + "num_stack": train_args["num_stack"], + "srl_model": train_args["srl_model"], + "algo_type": train_args.get('algo_type', None), + "log_dir": log_dir + } + algo_args = type('attrib_dict', (), args)() # anonymous class so the dict looks like Arguments object + envs = algo_class.makeEnv(algo_args, env_kwargs=env_kwargs, 
load_path_normalise=model_dir) + + return log_dir, envs, algo_args + + + + +def policyEval(envs,model_path,log_dir,algo_class,algo_args,num_timesteps=251,num_cpu=1): + """ + evaluation for the policy in the given envs + :param envs: the environment we want to evaluate + :param model_path: (str)the path to the policy ckp + :param log_dir: (str) the path from a gym temporal file + :param algo_class: + :param algo_args: + :param num_timesteps: (int) numbers of the timesteps we want to evaluate the policy + :param num_cpu: + :return: + """ + tf.reset_default_graph() + + method = algo_class.load(model_path, args=algo_args) + + using_custom_vec_env = isinstance(envs, WrapFrameStack) + + obs = envs.reset() + + + if using_custom_vec_env: + obs = obs.reshape((1,) + obs.shape) + n_done = 0 + last_n_done = 0 + episode_reward=[] + dones = [False for _ in range(num_cpu)] + + for i in range(num_timesteps): + actions=method.getAction(obs,dones) + obs, rewards, dones, _ = envs.step(actions) + if using_custom_vec_env: + obs = obs.reshape((1,) + obs.shape) + if using_custom_vec_env: + if dones: + obs = envs.reset() + obs = obs.reshape((1,) + obs.shape) + + n_done += np.sum(dones) + if (n_done - last_n_done) > 1: + last_n_done = n_done + _, mean_reward = computeMeanReward(log_dir, n_done) + episode_reward.append(mean_reward) + printRed('Episode:{} Reward:{}'.format(n_done,mean_reward)) + _, mean_reward = computeMeanReward(log_dir, n_done) + printRed('Episode:{} Reward:{}'.format(n_done, mean_reward)) + + episode_reward.append(mean_reward) + + episode_reward=np.array(episode_reward) + envs.close() + return episode_reward + +def latestPolicy(log_dir,algo_name): + """ + Get the latest saved model from a file + :param log_dir: (str) a path leads to the model saved path + :param algo_name: + :return: the file name of the latest saved policy and a flag + """ + files= glob.glob(os.path.join(log_dir+algo_name+'_*_model.pkl')) + files_list = [] + for file in files: + eps=int((file.split('_')[-2])) + files_list.append((eps,file)) + + def sortFirst(val): + return val[0] + + files_list.sort(key=sortFirst) + if len(files_list)>0: + #episode,latest model file path, OK + return files_list[-1][0],files_list[-1][1],True + else: + #No model saved yet + return 0,'',False + +def policyCrossEval(log_dir,task,num_timesteps=2000,num_cpu=1,seed=0): + train_args, algo_name, algo_class, srl_model_path, env_kwargs = loadConfigAndSetup(log_dir) + episode, model_path, OK = latestPolicy(log_dir, algo_name) + env_kwargs = EnvsKwargs(task, env_kwargs) + + OK = True + if (not OK): + # no latest model saved yet + return None, False + else: + pass + printGreen( + "Evaluation from the model saved at: {}, with evaluation time steps: {}".format(model_path, num_timesteps)) + + log_dir, environment, algo_args = createEnv(log_dir, train_args, algo_name, algo_class, env_kwargs, num_cpu=num_cpu,seed=seed) + + reward = policyEval(environment, model_path, log_dir, algo_class, algo_args, num_timesteps, num_cpu) + + # Just a trick to save the episode number of the reward,but need a little bit more space to store + reward = np.append(episode, reward) + return reward, True + + + + + +def episodeEval(log_dir,task,save_name='episode_eval.pkl',num_timesteps=800,num_cpu=1): + """ + given a log_dir to the model path and different tasks for ominirobot, save the reward of the latest saved model + it will not save anything if the old policy has already been evaluated + :param log_dir: + :param tasks: + :param save_name: the file name to save the policy evaluation 
result. + :return: + """ + + file_name=log_dir+save_name + + #can be changed accordingly + if(os.path.isfile(file_name)): + + #eval_reward=np.load(file_name) + with open(file_name, 'rb') as f: + eval_reward= pickle.load(f) + + + reward, ok = policyCrossEval(log_dir, task, num_timesteps,num_cpu=num_cpu) + if (ok): + if (task in eval_reward.keys()): + episodes = np.unique(eval_reward[task][:, 0]) + + current_episode =reward[0] + #Check if the latest episodes policy is already saved + if (current_episode not in episodes): + eval_reward[task]=np.append(eval_reward[task],[reward],axis=0) + with open(file_name, 'wb') as f: + pickle.dump(eval_reward, f, pickle.HIGHEST_PROTOCOL) + else:# The task is not in the file yet + eval_reward[task] =reward[None,:] + with open(file_name, 'wb') as f: + pickle.dump(eval_reward, f, pickle.HIGHEST_PROTOCOL) + + + else: #There is still not a episodes rewards evaluation registered + reward, ok = policyCrossEval(log_dir, task, num_timesteps,num_cpu=num_cpu) + eval_reward = {} + if(ok): + eval_reward[task]=reward[None,:] + + + with open(file_name, 'wb') as f: + pickle.dump(eval_reward, f, pickle.HIGHEST_PROTOCOL) + + return + + +if __name__ == '__main__': + + import argparse + + parser = argparse.ArgumentParser(description="several runtime for cross evaluation", + epilog='') + parser.add_argument('--task', type=str, default='cc', help='task',choices=['cc','sc','sqc']) + parser.add_argument('--log-dir', type=str, default='', help='the directory to policy',required=True) + parser.add_argument('--num-timesteps', type=int, default=1000, help='time steps to evaluate the policy', ) + parser.add_argument('--num-cpu', type=int, default=1, help='number of cpu to perform the evaluation') + + args,_= parser.parse_known_args() + task=args.task + log_dir=args.log_dir + episodeEval(log_dir,task,save_name='episode_eval.pkl',num_timesteps=args.num_timesteps,num_cpu=args.num_cpu) diff --git a/rl_baselines/evaluation/eval_post.py b/rl_baselines/evaluation/eval_post.py new file mode 100644 index 000000000..7bbaeab63 --- /dev/null +++ b/rl_baselines/evaluation/eval_post.py @@ -0,0 +1,134 @@ + +import subprocess +import numpy as np +import pickle +import argparse +import os + +from rl_baselines.student_eval import allPolicy +from srl_zoo.utils import printRed, printGreen +from rl_baselines.evaluation.cross_eval_utils import EnvsKwargs, loadConfigAndSetup, policyEval,createEnv + +def dict2array(tasks,data): + res=[] + + for t in tasks: + if(t=='sc'): + max_reward=250 + else: + max_reward=1850 + + data[t][:,1:]=data[t][:,1:]/max_reward + res.append(data[t]) + res=np.array(res) + return res + +def episodeEval(log_dir, tasks,num_timesteps=1000): + for t in tasks: + eval_args=['--log-dir', log_dir, '--num-timesteps', str(num_timesteps), '--num-cpu',str(5)] + task_args=['--task',t] + + subprocess.call(['python', '-m', 'rl_baselines.cross_eval_utils']+eval_args+task_args) + file_name=log_dir+'episode_eval.pkl' + with open(file_name, 'rb') as f: + eval_reward = pickle.load(f) + + #Trasfer the data from dict into a numpy array and save + eval_reward=dict2array(tasks,eval_reward) + file_name=log_dir+'episode_eval.npy' + np.save(file_name, eval_reward) + + + + +def policyCrossEval(log_dir,task,episode,model_path, num_timesteps=2000,num_cpu=1,seed=0): + train_args, algo_name, algo_class, srl_model_path, env_kwargs = loadConfigAndSetup(log_dir) + env_kwargs = EnvsKwargs(task, env_kwargs) + + OK = True + if (not OK): + # no latest model saved yet + return None, False + else: + pass + printGreen( + 
"Evaluation from the model saved at: {}, with evaluation time steps: {}".format(model_path, num_timesteps)) + + log_dir, environment, algo_args = createEnv(log_dir, train_args, algo_name, algo_class, env_kwargs, num_cpu=num_cpu,seed=seed) + + reward = policyEval(environment, model_path, log_dir, algo_class, algo_args, num_timesteps, num_cpu) + + # Just a trick to save the episode number of the reward,but need a little bit more space to store + reward = np.append(episode, reward) + return reward, True + + + + + +def saveReward(log_dir,reward, task,save_name='episode_eval.pkl'): + reward = reward.astype(float) + + file_name=log_dir+save_name + + #can be changed accordingly + if(os.path.isfile(file_name)): + + + with open(file_name, 'rb') as f: + eval_reward= pickle.load(f) + + if (task in eval_reward.keys()): + episodes = eval_reward[task][0] + #The fisrt dimension of reward is the episode + current_episode =reward[0] + #Check if the latest episodes policy is already saved + if (current_episode not in episodes): + # # eval_reward[task]=np.append(eval_reward[task],[reward],axis=0) + eval_reward[task][0].append(reward[0]) + eval_reward[task][1].append(reward.tolist()) + else: + index = episodes.index(current_episode) + eval_reward[task][1][index].extend(reward[1:]) + with open(file_name, 'wb') as f: + pickle.dump(eval_reward, f, pickle.HIGHEST_PROTOCOL) + else:# The task is not in the file yet + eval_reward[task]=([reward[0]],[reward.tolist()]) + with open(file_name, 'wb') as f: + pickle.dump(eval_reward, f, pickle.HIGHEST_PROTOCOL) + else: #There is still not a episodes rewards evaluation registered + + eval_reward = {} + eval_reward[task]=([reward[0]],[reward.tolist()]) + with open(file_name, 'wb') as f: + pickle.dump(eval_reward, f, pickle.HIGHEST_PROTOCOL) + + return + + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description="Evaluation after training") + parser.add_argument('--log-dir',type=str, default='' + ,help='RL algo to use') + parser.add_argument('--task-label', type=str, default='', + help='task to evaluate') + parser.add_argument('--episode', type=str, default='', + help='evaluation for the policy saved at this episode') + parser.add_argument('--policy-path', type=str, default='', + help='policy path') + parser.add_argument('--seed', type=int, default=0, + help='policy path') + args, unknown = parser.parse_known_args() + + + + + reward, _ = policyCrossEval(args.log_dir, args.task_label, episode=args.episode, model_path=args.policy_path, + num_timesteps=251,seed=args.seed) + saveReward(args.log_dir, reward, args.task_label, save_name='episode_eval.pkl') + + + + + diff --git a/rl_baselines/pipeline_cross.py b/rl_baselines/pipeline_cross.py new file mode 100644 index 000000000..6b788f111 --- /dev/null +++ b/rl_baselines/pipeline_cross.py @@ -0,0 +1,187 @@ +""" +baseline benchmark script for openAI RL Baselines +""" +import os +import argparse +import subprocess + +import yaml +import numpy as np + +from rl_baselines.registry import registered_rl +from environments.registry import registered_env +from state_representation.registry import registered_srl +from state_representation import SRLType +from srl_zoo.utils import printGreen, printRed, printYellow + +os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # used to remove debug info of tensorflow + + +''' +example: +------------------------------------------------------------------------------ +python -m rl_baselines.pipeline_cross --algo ppo2 --log-dir logs/ +--srl-model srl_combination ground_truth --num-iteration 1 
--num-timesteps 100000 +--task sc cc +--srl-config-file config/srl_models.yaml config/srl_models_test.yaml +------------------------------------------------------------------------------ +--srl-config-file : a list of config_file which should have the same number as tasks. (or only one, it will take this one for all tasks by default) +''' + +def main(): + parser = argparse.ArgumentParser(description="OpenAI RL Baselines Benchmark", + epilog='After the arguments are parsed, the rest are assumed to be arguments for' + + ' rl_baselines.train') + parser.add_argument('--algo', type=str, default='ppo2', help='OpenAI baseline to use', + choices=list(registered_rl.keys())) + parser.add_argument('--env', type=str, nargs='+', default=["OmnirobotEnv-v0"], help='environment ID(s)', + choices=["OmnirobotEnv-v0"])#list(registered_env.keys())) + parser.add_argument('--srl-model', type=str, nargs='+', default=["ground_truth"], + help='SRL model(s) to use', + choices=list(registered_srl.keys())) + parser.add_argument('--num-timesteps', type=int, default=1e6, help='number of timesteps the baseline should run') + parser.add_argument('-v', '--verbose', action='store_true', default=False, help='Display baseline STDOUT') + parser.add_argument('--num-iteration', type=int, default=15, + help='number of time each algorithm should be run for each unique combination of environment ' + + ' and srl-model.') + parser.add_argument('--seed', type=int, default=0, + help='initial seed for each unique combination of environment and srl-model.') + parser.add_argument('--srl-config-file', nargs='+', type=str, default=["config/srl_models.yaml"], + help='Set the location of the SRL model path configuration.') + + parser.add_argument('--tasks', type=str, nargs='+', default=["cc"], + help='The tasks for the robot', + choices=["cc","ec","sqc","sc"]) +# parser.add_argument('--srl-modell', type=str, default="",help='') + # returns the parsed arguments, and the rest are assumed to be arguments for rl_baselines.train + args, train_args = parser.parse_known_args() + + + + + + # Sanity check + assert args.num_timesteps >= 1, "Error: --num-timesteps cannot be less than 1" + assert args.num_iteration >= 1, "Error: --num-iteration cannot be less than 1" + + # Removing duplicates and sort + srl_models = list(set(args.srl_model)) + envs = list(set(args.env)) + tasks=args.tasks + srl_models.sort() + envs.sort() + tasks=['-'+t for t in tasks] + config_files=args.srl_config_file + + # LOAD SRL models list + + + if len(config_files)==1: + printYellow("Your are using the same config file: {} for all training tasks".format(config_files[0])) + + for i in range(len(tasks)-1): + config_files.append(config_files[0]) + else: + assert len(config_files)==len(tasks), \ + "Error: {} config files given for {} tasks".format(len(config_files),len(tasks)) + + for file in config_files: + assert os.path.exists(file), \ + "Error: cannot load \"--srl-config-file {}\", file not found!".format(file) + + for file in config_files: + with open(file, 'rb') as f: + all_models = yaml.load(f) + # Checking definition and presence of all requested srl_models + valid = True + for env in envs: + # validated the env definition + if env not in all_models: + printRed("Error: 'srl_models.yaml' missing definition for environment {}".format(env)) + valid = False + continue # skip to the next env, this one is not valid + + # checking log_folder for current env + missing_log = "log_folder" not in all_models[env] + if missing_log: + printRed("Error: '{}' missing definition for 
log_folder in environment {}".format(file, env)) + valid = False + + # validate each model for the current env definition + for model in srl_models: + if registered_srl[model][0] == SRLType.ENVIRONMENT: + continue # not an srl model, skip to the next model + elif model not in all_models[env]: + printRed("Error: '{}' missing srl_model {} for environment {}".format(file, model, env)) + valid = False + elif (not missing_log) and (not os.path.exists(all_models[env]["log_folder"] + all_models[env][model])): + # checking presence of srl_model path, if and only if log_folder exists + printRed("Error: srl_model {} for environment {} was defined in ".format(model, env) + + "'{}', however the file {} it was tagetting does not exist.".format( + file, all_models[env]["log_folder"] + all_models[env][model])) + valid = False + + assert valid, "Errors occurred due to malformed {}, cannot continue.".format(file) + + + + # check that all the SRL_models can be run on all the environments + valid = True + for env in envs: + for model in srl_models: + if registered_srl[model][1] is not None: + found = False + for compatible_class in registered_srl[model][1]: + if issubclass(compatible_class, registered_env[env][0]): + found = True + break + if not found: + valid = False + printRed("Error: srl_model {}, is not compatible with the {} environment.".format(model, env)) + assert valid, "Errors occured due to an incompatible combination of srl_model and environment, cannot continue." + + # the seeds used in training the baseline. + seeds = list(np.arange(args.num_iteration) + args.seed) + + if args.verbose: + # None here means stdout of terminal for subprocess.call + stdout = None + else: + stdout = open(os.devnull, 'w') + + printGreen("\nRunning {} benchmarks {} times...".format(args.algo, args.num_iteration)) + print("\nSRL-Models:\t{}".format(srl_models)) + print("environments:\t{}".format(envs)) + print("verbose:\t{}".format(args.verbose)) + print("timesteps:\t{}".format(args.num_timesteps)) + + num_tasks=len(tasks) + print(num_tasks) + + printGreen("The tasks that will be exacuted: {}".format(args.tasks)) + printGreen("with following config files: {}".format(config_files)) + + + for model in srl_models: + + for env in envs: + for iter_task in range(num_tasks): + + for i in range(args.num_iteration): + printGreen( + "\nIteration_num={} (seed: {}), Environment='{}', SRL-Model='{}' , Task='{}',Config_file='{}'".format(i, seeds[i], env, model, tasks[iter_task],config_files[iter_task])) + + # redefine the parsed args for rl_baselines.train + loop_args = ['--srl-model', model, '--seed', str(seeds[i]), + '--algo', args.algo, '--env', env, + '--num-timesteps', str(int(args.num_timesteps)), + '--srl-config-file', config_files[iter_task], tasks[iter_task]] + ok = subprocess.call(['python', '-m', 'rl_baselines.train'] + train_args + loop_args, stdout=stdout) + + if ok != 0: + # throw the error down to the terminal + raise ChildProcessError("An error occured, error code: {}".format(ok)) + + +if __name__ == '__main__': + main() diff --git a/rl_baselines/registry.py b/rl_baselines/registry.py index 8fb8824d6..94534fddf 100644 --- a/rl_baselines/registry.py +++ b/rl_baselines/registry.py @@ -12,6 +12,7 @@ from rl_baselines.random_agent import RandomAgentModel from rl_baselines.rl_algorithm.sac import SACModel from rl_baselines.rl_algorithm.trpo import TRPOModel +from rl_baselines.supervised_rl.policy_distillation import PolicyDistillationModel # Register, name: (algo class, algo type, list of action types) registered_rl = { 
@@ -26,7 +27,8 @@ "ppo2": (PPO2Model, AlgoType.REINFORCEMENT_LEARNING, [ActionType.DISCRETE, ActionType.CONTINUOUS]), "random_agent": (RandomAgentModel, AlgoType.OTHER, [ActionType.DISCRETE, ActionType.CONTINUOUS]), "sac": (SACModel, AlgoType.REINFORCEMENT_LEARNING, [ActionType.CONTINUOUS]), - "trpo": (TRPOModel, AlgoType.REINFORCEMENT_LEARNING, [ActionType.DISCRETE, ActionType.CONTINUOUS]) + "trpo": (TRPOModel, AlgoType.REINFORCEMENT_LEARNING, [ActionType.DISCRETE, ActionType.CONTINUOUS]), + "distillation": (PolicyDistillationModel, AlgoType.REINFORCEMENT_LEARNING, [ActionType.DISCRETE]) } # Checking validity of the registered RL algorithms diff --git a/rl_baselines/student_eval.py b/rl_baselines/student_eval.py new file mode 100644 index 000000000..2119f072c --- /dev/null +++ b/rl_baselines/student_eval.py @@ -0,0 +1,321 @@ +import argparse +import datetime +import glob +import os +import subprocess +import time + +import json +import numpy as np + +from environments.registry import registered_env +from rl_baselines.evaluation.cross_eval_utils import loadConfigAndSetup, latestPolicy +from srl_zoo.utils import printRed, printYellow, printBlue +from state_representation.registry import registered_srl + +CONTINUAL_LEARNING_LABELS = ['CC', 'SC', 'EC', 'SQC', 'ESC'] +CL_LABEL_KEY = "continual_learning_label" + +def OnPolicyDatasetGenerator(teacher_path, output_name, task_id, episode=-1, env_name='OmnirobotEnv-v0', num_cpu=1, + num_eps=200, test_mode=False): + """ + + :param teacher_path: + :param output_name: + :param task_id: + :param episode: + :param env_name: + :param num_cpu: + :param num_eps: + :return: + """ + assert task_id in CONTINUAL_LEARNING_LABELS + command_line = ['python', '-m', 'environments.dataset_generator', '--run-policy', 'custom'] + cpu_command = ['--num-cpu', str(num_cpu)] + name_command = ['--name', output_name] + save_path = ['--save-path', "data/"] + env_command = ['--env', env_name] + task_command = ['-' + task_id.lower()] + if task_id == 'SC': + episode_command = ['--num-episode', str(10 if test_mode else 400)] + else: + episode_command = ['--num-episode', str(10 if test_mode else 60)] + + policy_command = ['--log-custom-policy', teacher_path] + if episode == -1: + eps_policy = [] + else: + eps_policy = ['--episode', str(episode)] + + command = command_line + cpu_command + policy_command + name_command + env_command + task_command + \ + episode_command + eps_policy + save_path + ['-f'] + ['--seed', str(2)] + + if task_id == 'SC': + command += ['--short-episodes'] + + ok = subprocess.call(command) + assert ok == 0, "Teacher dataset for task " + task_id + " was not generated !" 
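+
+# Illustrative note (not part of the original pipeline): a call such as
+#   OnPolicyDatasetGenerator("logs/teacher_cc/", "circular_on_policy/", task_id="CC")
+# (teacher path hypothetical) assembles and runs roughly the following command,
+# with the episode count and seed fixed by the defaults above; for task_id='SC',
+# '--short-episodes' is appended:
+#   python -m environments.dataset_generator --run-policy custom --num-cpu 1 \
+#       --log-custom-policy logs/teacher_cc/ --name circular_on_policy/ \
+#       --env OmnirobotEnv-v0 -cc --num-episode 60 --save-path data/ -f --seed 2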
+ + +def allPolicy(log_dir): + """ + + :param log_dir: + :return: + """ + train_args, algo_name, algo_class, srl_model_path, env_kwargs = loadConfigAndSetup(log_dir) + files = glob.glob(os.path.join(log_dir + algo_name + '_*_model.pkl')) + files_list = [] + for file in files: + eps = int((file.split('_')[-2])) + files_list.append((eps, file)) + + def sortFirst(val): + """ + + :param val: + :return: + """ + return val[0] + + files_list.sort(key=sortFirst) + res = np.array(files_list) + print(res) + return res[:, 0], res[:, 1] + + +def allPolicyFiles(log_dir): + """ + + :param log_dir: + :return: + """ + printYellow(log_dir) + files = glob.glob(log_dir + '/model_*') + + files_list = [] + for file in files: + eps = int((file.split('_')[-1])) + files_list.append((eps, file+'/')) + + def sortFirst(val): + """ + + :param val: + :return: + """ + return val[0] + + files_list.sort(key=sortFirst) + res = np.array(files_list) + return res[:, 0], res[:, 1] + + +def newPolicy(episodes, file_path): + """ + + :param episodes: + :param file_path: + :return: + """ + train_args, algo_name, algo_class, srl_model_path, env_kwargs = loadConfigAndSetup(file_path) + episode, model_path, OK = latestPolicy(file_path, algo_name) + if episode in episodes: + return -1, '', False + else: + return episode, model_path, True + + +def trainStudent(teacher_data_path, task_id, yaml_file='config/srl_models.yaml', log_dir='logs/', + srl_model='srl_combination', env_name='OmnirobotEnv-v0', training_size=40000, epochs=20): + """ + + :param teacher_data_path: + :param task_id: Environment ID + :param yaml_file: + :param log_dir: + :param srl_model: + :param env_name: + :param training_size: + :param epochs: + :return: + """ + command_line = ['python', '-m', 'rl_baselines.train', '--latest', '--algo', 'distillation', '--log-dir', log_dir] + srl_command = ['--srl-model', srl_model] + env_command = ['--env', env_name] + policy_command = ['--teacher-data-folder', teacher_data_path] + size_epochs = ['--distillation-training-set-size', str(training_size), '--epochs-distillation', str(epochs)] + task_command = ['-' + task_id.lower()] + ok = subprocess.call(command_line + srl_command + + env_command + policy_command + size_epochs + task_command + ['--srl-config-file', yaml_file]) + assert ok == 0 + + +def mergeData(teacher_dataset_1, teacher_dataset_2, merge_dataset, force=False): + """ + + :param teacher_dataset_1: + :param teacher_dataset_2: + :param merge_dataset: + :return: + """ + merge_command = ['--merge', teacher_dataset_1, teacher_dataset_2, merge_dataset] + if force: + merge_command.append('-f') + # -rm is to remove the original dataset after training, this would be of the same effect as the old version + # which will remove the dataset automatically. 
+    ok = subprocess.call(['python', '-m', 'environments.dataset_merger', '-rm'] + merge_command)
+    assert ok == 0
+
+
+def main():
+    # Global variables for callback
+    parser = argparse.ArgumentParser(description="Evaluation script for distillation from two teacher policies")
+    parser.add_argument('--seed', type=int, default=0, help='random seed (default: 0)')
+    parser.add_argument('--env', type=str, help='environment ID', default='OmnirobotEnv-v0',
+                        choices=list(registered_env.keys()))
+    parser.add_argument('--episode_window', type=int, default=40,
+                        help='Episode window for moving average plot (default: 40)')
+    parser.add_argument('--log-dir-teacher-one', default='/tmp/gym/', type=str,
+                        help='directory to load an optimal agent for task 1 (default: /tmp/gym)')
+    parser.add_argument('--log-dir-teacher-two', default='/tmp/gym/', type=str,
+                        help='directory to load an optimal agent for task 2 (default: /tmp/gym)')
+    parser.add_argument('--log-dir-student', default='/tmp/gym/', type=str,
+                        help='directory to save the student agent logs and model (default: /tmp/gym)')
+    parser.add_argument('--srl-config-file-one', type=str, default="config/srl_models_one.yaml",
+                        help='Set the location of the SRL model path configuration.')
+    parser.add_argument('--srl-config-file-two', type=str, default="config/srl_models_two.yaml",
+                        help='Set the location of the SRL model path configuration.')
+    parser.add_argument('--epochs-distillation', type=int, default=30, metavar='N',
+                        help='number of epochs to train for distillation (default: 30)')
+    parser.add_argument('--distillation-training-set-size', type=int, default=-1,
+                        help='Limit size (number of samples) of the training set (default: -1)')
+    parser.add_argument('--eval-tasks', type=str, nargs='+', default=['cc', 'sqc', 'sc'],
+                        help='Tasks on which to cross-evaluate the latest stored model')
+    parser.add_argument('--continual-learning-labels', type=str, nargs=2, metavar=('label_1', 'label_2'),
+                        default=argparse.SUPPRESS,
+                        help='Labels for the continual learning RL distillation task.')
+    parser.add_argument('--student-srl-model', type=str, default='raw_pixels', choices=list(registered_srl.keys()),
+                        help='SRL model to use for the student RL policy')
+    parser.add_argument('--epochs-teacher-datasets', type=int, default=30, metavar='N',
+                        help='number of epochs for generating both RL teacher datasets (default: 30)')
+    parser.add_argument('--num-iteration', type=int, default=1,
+                        help='number of times each algorithm should be run for the evaluation (N seeds).')
+    parser.add_argument('--eval-episode-window', type=int, default=400, metavar='N',
+                        help='Episode window for saving each policy checkpoint for future distillation (default: 400)')
+
+    args, unknown = parser.parse_known_args()
+
+    if 'continual_learning_labels' in args:
+        assert args.continual_learning_labels[0] in CONTINUAL_LEARNING_LABELS and args.continual_learning_labels[1] \
+               in CONTINUAL_LEARNING_LABELS, "Please specify a valid continual learning label for each dataset to be " \
+                                             "used for RL distillation!"
+    print(args.continual_learning_labels)
+    assert os.path.exists(args.srl_config_file_one), \
+        "Error: cannot load \"--srl-config-file-one {}\", file not found!".format(args.srl_config_file_one)
+
+    assert os.path.exists(args.srl_config_file_two), \
+        "Error: cannot load \"--srl-config-file-two {}\", file not found!".format(args.srl_config_file_two)
+    if not (args.log_dir_teacher_one == "None"):
+        assert os.path.exists(args.log_dir_teacher_one), \
+            "Error: cannot load \"--log-dir-teacher-one {}\", directory not found!".format(args.log_dir_teacher_one)
+    assert os.path.exists(args.log_dir_teacher_two), \
+        "Error: cannot load \"--log-dir-teacher-two {}\", directory not found!".format(args.log_dir_teacher_two)
+
+    teacher_pro = args.log_dir_teacher_one
+    teacher_learn = args.log_dir_teacher_two
+
+    # Output dataset folders are named after the continual learning labels
+    teacher_pro_data = args.continual_learning_labels[0] + '/'
+    teacher_learn_data = args.continual_learning_labels[1] + '/'
+    merge_path = "data/on_policy_merged"
+
+    print(teacher_pro_data, teacher_learn_data)
+    episodes, policy_path = allPolicy(teacher_learn)
+
+    rewards_at_episode = {}
+
+    if args.continual_learning_labels[1] == "CC":
+        episodes_to_test = [e for e in episodes if (int(e) < 2000 and int(e) % 200 == 0) or
+                            (int(e) > 2000 and int(e) % 1000 == 0)]
+    else:
+        episodes_to_test = [e for e in episodes if (int(e) <= 5000 and int(e) % 1000 == 0) or
+                            (int(e) > 5000 and int(e) % 10000 == 0)]
+
+    if not (args.log_dir_teacher_one == "None"):
+        # Generate data from the professional (optimal) teacher
+        printYellow("\nGenerating on-policy data for the optimal teacher: " + args.continual_learning_labels[0])
+
+        OnPolicyDatasetGenerator(teacher_pro, args.continual_learning_labels[0] + '_copy/',
+                                 task_id=args.continual_learning_labels[0], num_eps=args.epochs_teacher_datasets,
+                                 episode=-1, env_name=args.env)
+    print("Eval on eps list: ", episodes_to_test)
+    for eps in episodes_to_test:
+        student_path = args.log_dir_student
+        printBlue("\n\nEvaluation at episode " + str(eps))
+
+        if not (args.log_dir_teacher_one == "None"):
+            # Re-use a copy of the optimal teacher's dataset
+            ok = subprocess.call(
+                ['cp', '-r', 'data/' + args.continual_learning_labels[0] + '_copy/', 'data/' + teacher_pro_data, '-f'])
+            assert ok == 0
+            time.sleep(2)
+
+        # Generate data from the learning teacher
+        printYellow("\nGenerating on-policy data from the learning teacher: " + args.continual_learning_labels[1])
+        OnPolicyDatasetGenerator(teacher_learn, teacher_learn_data, task_id=args.continual_learning_labels[1],
+                                 num_eps=args.epochs_teacher_datasets, episode=eps, env_name=args.env)
+
+        # If performing policy distillation from a single (learning) teacher at multiple checkpoints
+        if args.log_dir_teacher_one == "None":
+            merge_path = 'data/' + teacher_learn_data
+            ok = subprocess.call(
+                ['cp', '-r', merge_path, 'srl_zoo/data/', '-f'])
+        else:
+            # Merge the two teacher datasets
+            mergeData('data/' + teacher_pro_data, 'data/' + teacher_learn_data, merge_path, force=True)
+
+            ok = subprocess.call(
+                ['cp', '-r', 'data/on_policy_merged/', 'srl_zoo/data/', '-f'])
+        assert ok == 0
+        time.sleep(2)
+
+        # Train a policy with distillation on the merged teachers' datasets
+        trainStudent('srl_zoo/' + merge_path, args.continual_learning_labels[1], yaml_file=args.srl_config_file_one,
+                     log_dir=args.log_dir_student,
+                     srl_model=args.student_srl_model, env_name=args.env,
+                     training_size=args.distillation_training_set_size, epochs=args.epochs_distillation)
+        student_path += args.env + '/' + args.student_srl_model + "/distillation/"
+        latest_student_path = max([student_path + "/" + d
for d in os.listdir(student_path) + if os.path.isdir(student_path + "/" + d)], key=os.path.getmtime) + '/' + rewards = {} + printRed("\nSaving the student at path: " + latest_student_path) + for task_label in ["-sc", "-cc"]: + rewards[task_label] = [] + + for seed_i in range(args.num_iteration): + printYellow("\nEvaluating student on task: " + task_label + " for seed: " + str(seed_i)) + command_line_enjoy_student = ['python', '-m', 'replay.enjoy_baselines', '--num-timesteps', '251', + '--log-dir', latest_student_path, task_label, "--seed", str(seed_i)] + ok = subprocess.check_output(command_line_enjoy_student) + ok = ok.decode('utf-8') + str_before = "Mean reward: " + str_after = "\npybullet" + idx_before = ok.find(str_before) + len(str_before) + idx_after = ok.find(str_after) + seed_reward = float(ok[idx_before: idx_after]) + rewards[task_label].append(seed_reward) + print("rewards at eps ", eps, ": ", rewards) + rewards_at_episode[eps] = rewards + print("All rewards: ", rewards_at_episode) + json_dict = json.dumps(rewards_at_episode) + json_dict_name = \ + args.log_dir_student + "/reward_at_episode_" + datetime.datetime.now().strftime("%y-%m-%d_%Hh%M_%S") + '.json' + f = open(json_dict_name, "w") + f.write(json_dict) + f.close() + printRed("\nSaving the evalation at path: " + json_dict_name) + + +if __name__ == '__main__': + main() diff --git a/rl_baselines/supervised_rl/Readme.md b/rl_baselines/supervised_rl/Readme.md new file mode 100644 index 000000000..16c273f3a --- /dev/null +++ b/rl_baselines/supervised_rl/Readme.md @@ -0,0 +1,117 @@ + + + +# Steps for Distillation + + + + + +# 1 - Train Baselines + + +### 0 - Generate datasets for SRL (random policy) + +``` +cd robotics-rl-srl +# Dataset 1 (random reaching target) +python -m environments.dataset_generator --num-cpu 6 --name Omnibot_random_simple --env OmnirobotEnv-v0 --simple-continual --num-episode 250 -f +# Dataset 2 (Circular task) +python -m environments.dataset_generator --num-cpu 6 --name Omnibot_circular --env OmnirobotEnv-v0 --circular-continual --num-episode 250 -f +``` + +### 1.1) Train SRL + +``` +cd srl_zoo +# Dataset 1 (random reaching target) +python train.py --data-folder data/Omnibot_random_simple -bs 32 --epochs 20 --state-dim 200 --training-set-size 20000 --losses autoencoder inverse +# Dataset 2 (Circular task) +python train.py --data-folder data/Omnibot_circular -bs 32 --epochs 20 --state-dim 200 --training-set-size 20000 --losses autoencoder inverse +``` + + +### 1.2) Train policy + +Train + +``` +cd .. 
+
+# save config file
+cp config/srl_models.yaml config/srl_models_temp.yaml
+
+# Dataset 1 (random reaching target)
+python -m rl_baselines.train --algo ppo2 --srl-model srl_combination --srl-config-file config/srl_models_simple.yaml --num-timesteps 5000000 --env OmnirobotEnv-v0 --log-dir logs/simple/ --num-cpu 8 --simple-continual --latest
+
+# Dataset 2 (Circular task)
+python -m rl_baselines.train --algo ppo2 --srl-model srl_combination --srl-config-file config/srl_models_circular.yaml --num-timesteps 5000000 --env OmnirobotEnv-v0 --log-dir logs/circular/ --num-cpu 6 --circular-continual --latest
+
+# restore config file
+cp config/srl_models_temp.yaml config/srl_models.yaml
+```
+
+Visualize and plot
+
+```
+# Visualize episodes
+python -m replay.enjoy_baselines --log-dir *file* --num-timesteps 10000 --render --action-proba
+# example:
+# python -m replay.enjoy_baselines --log-dir logs/simple/OmnirobotEnv-v0/srl_combination/ppo2/19-04-25_10h19_42/ --num-timesteps 10000 --render --action-proba
+
+# plot results
+python -m replay.plots --log-dir logs/simple/OmnirobotEnv-v0/srl_combination/ppo2/ --latest
+
+python -m replay.plots --log-dir logs/circular/OmnirobotEnv-v0/srl_combination/ppo2/ --latest
+```
+
+# 2 - Train Distillation
+
+
+### 2.1) Generate on-policy datasets
+
+(The "data" folder is not created automatically, so run "mkdir data" first if needed.)
+
+(This step is not fully automated: the "--log-custom-policy" path must be set manually.)
+
+```
+# Dataset 1 (random reaching target)
+python -m environments.dataset_generator --env OmnirobotEnv-v0 --num-episode 100 --run-policy custom --log-custom-policy logs/*path2policy* --short-episodes --save-path data/ --name reaching_on_policy -sc
+
+# Dataset 2 (Circular task)
+python -m environments.dataset_generator --env OmnirobotEnv-v0 --num-episode 100 --run-policy custom --log-custom-policy logs/*path2policy* --short-episodes --save-path data/ --name circular_on_policy -cc
+
+# Merge Datasets
+# (/!\ this removes the generated datasets 1 and 2)
+python -m environments.dataset_merger --merge data/circular_on_policy/ data/reaching_on_policy/ data/merge_CC_SC
+
+# Copy the merged Dataset to srl_zoo repository
+cp -r data/merge_CC_SC srl_zoo/data/merge_CC_SC
+```
+
+### 2.2) Train SRL 1&2
+
+```
+cd srl_zoo
+# Merged dataset
+python train.py --data-folder data/merge_CC_SC -bs 32 --epochs 20 --state-dim 200 --training-set-size 30000 --losses autoencoder inverse
+
+# Update your RL logs to load the proper SRL model for future distillation, i.e. distillation: new-log/srl_model.pth
+```
+
+### 2.3) Run Distillation
+
+```
+# make a new log folder
+mkdir logs/CL_SC_CC
+cp config/srl_models_merged.yaml config/srl_models.yaml
+
+# Merged Dataset
+python -m rl_baselines.train --algo distillation --srl-model raw_pixels --env OmnirobotEnv-v0 --log-dir logs/CL_SC_CC --teacher-data-folder srl_zoo/data/merge_CC_SC -cc --distillation-training-set-size 40000 --epochs-distillation 20 --latest
+```
diff --git a/rl_baselines/supervised_rl/__init__.py b/rl_baselines/supervised_rl/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/rl_baselines/supervised_rl/policy_distillation.py b/rl_baselines/supervised_rl/policy_distillation.py
new file mode 100644
index 000000000..88543d929
--- /dev/null
+++ b/rl_baselines/supervised_rl/policy_distillation.py
@@ -0,0 +1,320 @@
+import numpy as np
+import pickle
+import torch as th
+
+from tqdm import tqdm
+from torch import nn
+from torch.nn import functional as F
+
+from rl_baselines.base_classes import BaseRLObject
+from srl_zoo.models.models import CustomCNN
+from srl_zoo.preprocessing.data_loader import SupervisedDataLoader, DataLoader
+from srl_zoo.utils import loadData
+from state_representation.models import loadSRLModel, getSRLDim
+
+N_WORKERS = 4
+BATCH_SIZE = 32
+TEST_BATCH_SIZE = 256
+VALIDATION_SIZE = 0.2  # 20% of training data for validation
+MAX_BATCH_SIZE_GPU = 256  # For plotting, max batch_size before having memory issues
+RENDER_HEIGHT = 224
+RENDER_WIDTH = 224
+FINE_TUNING = False
+
+CONTINUAL_LEARNING_LABELS = ['CC', 'SC', 'EC', 'SQC']
+CL_LABEL_KEY = "continual_learning_label"
+USE_ADAPTIVE_TEMPERATURE = False
+TEMPERATURES = {'CC': 0.1, 'SC': 0.1, 'EC': 0.1, 'SQC': 0.1, "default": 0.1}
+# run with 0.1 to have good results!
+# 0.01 worse reward for CC, better SC + + +class MLPPolicy(nn.Module): + def __init__(self, output_size, input_size, hidden_size=16): + super(MLPPolicy, self).__init__() + + self.input_size = input_size + self.hidden_size = hidden_size + self.output_size = output_size + + self.fc = nn.Sequential(nn.Linear(self.input_size, self.hidden_size), + nn.ReLU(inplace=True), + nn.Linear(self.hidden_size, self.output_size) + ) + + def forward(self, input): + input = input.view(-1, self.input_size) + return F.softmax(self.fc(input), dim=1) + + +class CNNPolicy(nn.Module): + def __init__(self, output_size): + super(CNNPolicy, self).__init__() + self.model = CustomCNN(state_dim=output_size) + + def forward(self, input): + return F.softmax(self.model(input), dim=1) + + +class PolicyDistillationModel(BaseRLObject): + """ + Implementation of PolicyDistillation + """ + def __init__(self): + super(PolicyDistillationModel, self).__init__() + + def save(self, save_path, _locals=None): + assert self.model is not None, "Error: must train or load model before use" + with open(save_path, "wb") as f: + pickle.dump(self.__dict__, f) + + @classmethod + def load(cls, load_path, args=None): + with open(load_path, "rb") as f: + class_dict = pickle.load(f) + loaded_model = PolicyDistillationModel() + loaded_model.__dict__ = class_dict + + return loaded_model + + def customArguments(self, parser): + parser.add_argument('--nothing4instance', help='Number of population (each one has 2 threads)', type=bool, + default=True) + + return parser + + def getActionProba(self, observation, dones=None, delta=0): + """ + returns the action probability distribution, from a given observation. + :param observation: (numpy int or numpy float) + :param dones: ([bool]) + :param delta: (numpy float or float) The exploration noise applied to the policy, set to 0 for no noise. + :return: (numpy float) + """ + assert self.model is not None, "Error: must train or load model before use" + if len(observation.shape) > 2: + observation = np.transpose(observation, (0, 3, 2, 1)) + observation = th.from_numpy(observation).float().requires_grad_(False).to(self.device) + action = self.model.forward(observation).detach().cpu().numpy() + return action + + def getAction(self, observation, dones=None, delta=0, sample=False): + """ + From an observation returns the associated action + :param observation: (numpy int or numpy float) + :param dones: ([bool]) + :param delta: (numpy float or float) The exploration noise applied to the policy, set to 0 for no noise. 
+ :return: (numpy float) + """ + assert self.model is not None, "Error: must train or load model before use" + + self.model.eval() + if len(observation.shape) > 2: + observation = np.transpose(observation, (0, 3, 2, 1)) + observation = th.from_numpy(observation).float().requires_grad_(False).to(self.device) + + if sample: + proba_actions = self.model.forward(observation).detach().cpu().numpy().flatten() + return np.random.choice(range(len(proba_actions)), 1, p=proba_actions) + else: + return [np.argmax(self.model.forward(observation).detach().cpu().numpy())] + + def loss_fn_kd(self, outputs, teacher_outputs, labels=None, adaptive_temperature=False): + """ + Hyperparameters: temperature and alpha + :param outputs: output from the student model + :param teacher_outputs: output from the teacher_outputs model + :return: loss + """ + if labels is not None and adaptive_temperature: + T = th.from_numpy(np.array([TEMPERATURES[labels[idx_elm]] for idx_elm in range(BATCH_SIZE)])).cuda().float() + + KD_loss = F.softmax(th.div(teacher_outputs.transpose(1, 0), T), dim=1) * \ + th.log((F.softmax(th.div(teacher_outputs.transpose(1, 0), T), dim=1) / F.softmax(outputs, dim=1))) + else: + T = TEMPERATURES["default"] + KD_loss = F.softmax(teacher_outputs/T, dim=1) * \ + th.log((F.softmax(teacher_outputs/T, dim=1) / F.softmax(outputs, dim=1))) + return KD_loss.mean() + + def loss_mse(self, outputs, teacher_outputs): + return (outputs - teacher_outputs).pow(2).sum(1).mean() + + def train(self, args, callback, env_kwargs=None, train_kwargs=None): + + N_EPOCHS = args.epochs_distillation + self.seed = args.seed + self.batch_size = BATCH_SIZE + print("We assumed SRL training already done") + + print('Loading data for distillation ') + + training_data, ground_truth, true_states, _ = loadData(args.teacher_data_folder, absolute_path=True) + rewards, episode_starts = training_data['rewards'], training_data['episode_starts'] + + images_path = ground_truth['images_path'] + actions = training_data['actions'] + actions_proba = training_data['actions_proba'] + if USE_ADAPTIVE_TEMPERATURE: + cl_labels = training_data[CL_LABEL_KEY] + else: + cl_labels_st = None + + if args.distillation_training_set_size > 0: + limit = args.distillation_training_set_size + actions = actions[:limit] + images_path = images_path[:limit] + episode_starts = episode_starts[:limit] + + images_path_copy = ["srl_zoo/data/" + images_path[k] for k in range(images_path.shape[0])] + images_path = np.array(images_path_copy) + + num_samples = images_path.shape[0] - 1 # number of samples + + # indices for all time steps where the episode continues + indices = np.array([i for i in range(num_samples) if not episode_starts[i + 1]], dtype='int64') + np.random.shuffle(indices) + + # split indices into minibatches. 
minibatchlis t is a list of lists; each + # list is the id of the observation preserved through the training + minibatchlist = [np.array(sorted(indices[start_idx:start_idx + self.batch_size])) + for start_idx in range(0, len(indices) - self.batch_size + 1, self.batch_size)] + data_loader = DataLoader(minibatchlist, images_path, n_workers=N_WORKERS, multi_view=False, + use_triplets=False, is_training=True, absolute_path=True) + + test_minibatchlist = DataLoader.createTestMinibatchList(len(images_path), MAX_BATCH_SIZE_GPU) + test_data_loader = DataLoader(test_minibatchlist, images_path, n_workers=N_WORKERS, multi_view=False, + use_triplets=False, max_queue_len=1, is_training=False, absolute_path=True) + + # Number of minibatches used for validation: + n_val_batches = np.round(VALIDATION_SIZE * len(minibatchlist)).astype(np.int64) + val_indices = np.random.permutation(len(minibatchlist))[:n_val_batches] + # Print some info + print("{} minibatches for training, {} samples".format(len(minibatchlist) - n_val_batches, + (len(minibatchlist) - n_val_batches) * BATCH_SIZE)) + print("{} minibatches for validation, {} samples".format(n_val_batches, n_val_batches * BATCH_SIZE)) + assert n_val_batches > 0, "Not enough sample to create a validation set" + + # Stats about actions + if not args.continuous_actions: + print('Discrete action space:') + action_set = set(actions) + n_actions = int(np.max(actions) + 1) + print("{} unique actions / {} actions".format(len(action_set), n_actions)) + n_obs_per_action = np.zeros(n_actions, dtype=np.int64) + for i in range(n_actions): + n_obs_per_action[i] = np.sum(actions == i) + + print("Number of observations per action") + print(n_obs_per_action) + + else: + print('Continuous action space:') + print('Action dimension: {}'.format(self.dim_action)) + + # Here the default SRL model is assumed to be raw_pixels + self.state_dim = RENDER_HEIGHT * RENDER_WIDTH * 3 + self.srl_model = None + + # TODO: add sanity checks & test for all possible SRL for distillation + if env_kwargs["srl_model"] == "raw_pixels": + self.model = CNNPolicy(n_actions) + learnable_params = self.model.parameters() + learning_rate = 1e-3 + + else: + self.state_dim = getSRLDim(env_kwargs.get("srl_model_path", None)) + self.srl_model = loadSRLModel(env_kwargs.get("srl_model_path", None), + th.cuda.is_available(), self.state_dim, env_object=None) + + self.model = MLPPolicy(output_size=n_actions, input_size=self.state_dim) + for param in self.model.parameters(): + param.requires_grad = True + learnable_params = [param for param in self.model.parameters()] + + if FINE_TUNING and self.srl_model is not None: + for param in self.srl_model.model.parameters(): + param.requires_grad = True + learnable_params += [param for param in self.srl_model.model.parameters()] + + learning_rate = 1e-3 + self.device = th.device("cuda" if th.cuda.is_available() else "cpu") + + if th.cuda.is_available(): + self.model.cuda() + + self.optimizer = th.optim.Adam(learnable_params, lr=learning_rate) + best_error = np.inf + best_model_path = "{}/{}_model.pkl".format(args.log_dir, args.algo) + + for epoch in range(N_EPOCHS): + # In each epoch, we do a full pass over the training data: + epoch_loss, epoch_batches = 0, 0 + val_loss = 0 + pbar = tqdm(total=len(minibatchlist)) + + for minibatch_num, (minibatch_idx, obs, _, _, _) in enumerate(data_loader): + self.optimizer.zero_grad() + + obs = obs.to(self.device) + validation_mode = minibatch_idx in val_indices + if validation_mode: + self.model.eval() + if FINE_TUNING and self.srl_model 
is not None: + self.srl_model.model.eval() + else: + self.model.train() + if FINE_TUNING and self.srl_model is not None: + self.srl_model.model.train() + + # Actions associated to the observations of the current minibatch + actions_st = actions[minibatchlist[minibatch_idx]] + actions_proba_st = actions_proba[minibatchlist[minibatch_idx]] + + if USE_ADAPTIVE_TEMPERATURE: + cl_labels_st = cl_labels[minibatchlist[minibatch_idx]] + + if not args.continuous_actions: + # Discrete actions, rearrange action to have n_minibatch ligns and one column, + # containing the int action + actions_st = th.from_numpy(actions_st).requires_grad_(False).to(self.device) + actions_proba_st = th.from_numpy(actions_proba_st).requires_grad_(False).to(self.device) + else: + # Continuous actions, rearrange action to have n_minibatch ligns and dim_action columns + actions_st = th.from_numpy(actions_st).view(-1, self.dim_action).requires_grad_(False).to( + self.device) + + if self.srl_model is not None: + state = self.srl_model.model.getStates(obs).to(self.device).detach() + if "autoencoder" in self.srl_model.model.losses: + use_ae = True + decoded_obs = self.srl_model.model.model.decode(state).to(self.device).detach() + else: + state = obs.detach() + pred_action = self.model.forward(state) + + loss = self.loss_fn_kd(pred_action, + actions_proba_st.float(), + labels=cl_labels_st, adaptive_temperature=USE_ADAPTIVE_TEMPERATURE) + + loss.backward() + if validation_mode: + val_loss += loss.item() + # We do not optimize on validation data + # so optimizer.step() is not called + else: + self.optimizer.step() + epoch_loss += loss.item() + epoch_batches += 1 + pbar.update(1) + + train_loss = epoch_loss / float(epoch_batches) + val_loss /= float(n_val_batches) + + pbar.close() + print("Epoch {:3}/{}, train_loss:{:.6f} val_loss:{:.6f}".format(epoch + 1, N_EPOCHS, train_loss, val_loss)) + + # Save best model + if val_loss < best_error: + best_error = val_loss + self.save(best_model_path) diff --git a/rl_baselines/train.py b/rl_baselines/train.py index b4239d2bc..672b7f9a6 100644 --- a/rl_baselines/train.py +++ b/rl_baselines/train.py @@ -20,7 +20,8 @@ from rl_baselines.registry import registered_rl from rl_baselines.utils import computeMeanReward from rl_baselines.utils import filterJSONSerializableObjects -from rl_baselines.visualize import timestepsPlot, episodePlot +from rl_baselines.visualize import timestepsPlot, episodePlot,episodesEvalPlot +from rl_baselines.cross_eval import episodeEval from srl_zoo.utils import printGreen, printYellow from state_representation import SRLType from state_representation.registry import registered_srl @@ -33,15 +34,20 @@ ENV_NAME = "" PLOT_TITLE = "" EPISODE_WINDOW = 40 # For plotting moving average +CROSS_EVAL = False +EPISODE_WINDOW_DISTILLATION_WIN = 20 +NEW_LR=0.001 + + viz = None n_steps = 0 SAVE_INTERVAL = 0 # initialised during loading of the algorithm N_EPISODES_EVAL = 100 # Evaluate the performance on the last 100 episodes -MIN_EPISODES_BEFORE_SAVE = 100 # Number of episodes to train on before saving best model +MIN_EPISODES_BEFORE_SAVE = 1000 # Number of episodes to train on before saving best model params_saved = False best_mean_reward = -10000 -win, win_smooth, win_episodes = None, None, None +win, win_smooth, win_episodes, win_crossEval= None, None, None, None os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # used to remove debug info of tensorflow @@ -98,6 +104,14 @@ def configureEnvAndLogFolder(args, env_kwargs, all_models): srl_model_path = models['log_folder'] + path 
env_kwargs["srl_model_path"] = srl_model_path + # Use of continual learning env + env_kwargs["simple_continual_target"] = args.simple_continual + env_kwargs["circular_continual_move"] = args.circular_continual + env_kwargs["square_continual_move"] = args.square_continual + env_kwargs["eight_continual_move"] = args.eight_continual + env_kwargs["chasing_continual_move"] = args.chasing_continual + env_kwargs["escape_continual_move"] = args.escape_continual + # Add date + current time args.log_dir += "{}/{}/".format(ALGO_NAME, datetime.now().strftime("%y-%m-%d_%Hh%M_%S")) LOG_DIR = args.log_dir @@ -114,7 +128,7 @@ def callback(_locals, _globals): :param _locals: (dict) :param _globals: (dict) """ - global win, win_smooth, win_episodes, n_steps, viz, params_saved, best_mean_reward + global win, win_smooth, win_episodes, win_crossEval, n_steps, viz, params_saved, best_mean_reward # Create vizdom object only if needed if viz is None: viz = Visdom(port=VISDOM_PORT) @@ -158,6 +172,28 @@ def callback(_locals, _globals): printGreen("Saving new best model") ALGO.save(LOG_DIR + ALGO_NAME + "_model.pkl", _locals) + if CROSS_EVAL: # If we want to do the cross evaluation after the training + if n_episodes >= 0: + # For every checkpoint, we create one directory for saving logs file (policy and run mean std) + if EPISODE_WINDOW_DISTILLATION_WIN > 0: + if n_episodes % EPISODE_WINDOW_DISTILLATION_WIN == 0: + ALGO.save(LOG_DIR + ALGO_NAME + '_' + str(n_episodes) + "_model.pkl", _locals) + eps_path = LOG_DIR + "model_" + str(n_episodes) + try: + os.mkdir(LOG_DIR + "model_" + str(n_episodes)) + except OSError: + pass + #print("Creation of the directory {} failed".format(eps_path)) + + ALGO.save("{}/{}".format(eps_path, ALGO_NAME + "_model.pkl"), _locals) + try: + if 'env' in _locals: + _locals['env'].save_running_average(eps_path) + else: + _locals['self'].env.save_running_average(eps_path) + except AttributeError: + pass + # Plots in visdom if viz and (n_steps + 1) % LOG_INTERVAL == 0: win = timestepsPlot(viz, win, LOG_DIR, ENV_NAME, ALGO_NAME, bin_size=1, smooth=0, title=PLOT_TITLE, is_es=is_es) @@ -179,7 +215,7 @@ def main(): parser.add_argument('--env', type=str, help='environment ID', default='KukaButtonGymEnv-v0', choices=list(registered_env.keys())) parser.add_argument('--seed', type=int, default=0, help='random seed (default: 0)') - parser.add_argument('--episode_window', type=int, default=40, + parser.add_argument('--episode-window', type=int, default=40, help='Episode window for moving average plot (default: 40)') parser.add_argument('--log-dir', default='/tmp/gym/', type=str, help='directory to save agent logs and model (default: /tmp/gym)') @@ -207,7 +243,37 @@ def main(): help='load the latest learned model (location:srl_zoo/logs/DatasetName/)') parser.add_argument('--load-rl-model-path', type=str, default=None, help="load the trained RL model, should be with the same algorithm type") - + parser.add_argument('-sc', '--simple-continual', action='store_true', default=False, + help='Simple red square target for task 1 of continual learning scenario. ' + + 'The task is: robot should reach the target.') + parser.add_argument('-cc', '--circular-continual', action='store_true', default=False, + help='Blue square target for task 2 of continual learning scenario. ' + + 'The task is: robot should turn in circle around the target.') + parser.add_argument('-sqc', '--square-continual', action='store_true', default=False, + help='Green square target for task 3 of continual learning scenario. 
' + + 'The task is: robot should turn in square around the target.') + parser.add_argument('-ec', '--eight-continual', action='store_true', default=False, + help='Green square target for task 4 of continual learning scenario. ' + + 'The task is: robot should do the eigth with the target as center of the shape.') + parser.add_argument('-chc', '--chasing-continual', action='store_true', default=False, + help='Two chasing robots in the same domain of environment' + + 'The task is: one robot should keep a certain distance towars the other.') + parser.add_argument('-esc', '--escape-continual', action='store_true', default=False, + help='Two chasing robots in the same domain of environment' + + 'The task is: the trainable agent tries to escape from the "zombie" robot.') + parser.add_argument('--teacher-data-folder', type=str, default="", + help='Dataset folder of the teacher(s) policy(ies)', required=False) + parser.add_argument('--epochs-distillation', type=int, default=30, metavar='N', + help='number of epochs to train for distillation(default: 30)') + parser.add_argument('--distillation-training-set-size', type=int, default=-1, + help='Limit size (number of samples) of the training set (default: -1)') + parser.add_argument('--perform-cross-evaluation-cc', action='store_true', default=False, + help='A cross evaluation from the latest stored model to all tasks') + parser.add_argument('--eval-episode-window', type=int, default=400, metavar='N', + help='Episode window for saving each policy checkpoint for future distillation(default: 100)') + parser.add_argument('--new-lr', type=float, default=1.e-4, + help="New learning rate ratio to train a pretrained agent") + # Ignore unknown args for now args, unknown = parser.parse_known_args() env_kwargs = {} @@ -236,11 +302,22 @@ def main(): break assert found, "Error: srl_model {}, is not compatible with the {} environment.".format(args.srl_model, args.env) + assert not(sum([args.simple_continual, args.circular_continual, args.square_continual, args.eight_continual, + args.chasing_continual, args.escape_continual]) > 1 and args.env == "OmnirobotEnv-v0"), \ + "For continual SRL and RL, please provide only one scenario at the time and use OmnirobotEnv-v0 environment !" + + assert not(args.algo == "distillation" and (args.teacher_data_folder == '' or args.continuous_actions is True)), \ + "For performing policy distillation, make sure use specify a valid teacher dataset and discrete actions !" + ENV_NAME = args.env ALGO_NAME = args.algo VISDOM_PORT = args.port EPISODE_WINDOW = args.episode_window MIN_EPISODES_BEFORE_SAVE = args.min_episodes_save + CROSS_EVAL = args.perform_cross_evaluation_cc + EPISODE_WINDOW_DISTILLATION_WIN = args.eval_episode_window + NEW_LR =args.new_lr + print("EPISODE_WINDOW_DISTILLATION_WIN: ", EPISODE_WINDOW_DISTILLATION_WIN) if args.no_vis: viz = False @@ -249,7 +326,6 @@ def main(): algo = algo_class() ALGO = algo - # if callback frequency needs to be changed LOG_INTERVAL = algo.LOG_INTERVAL SAVE_INTERVAL = algo.SAVE_INTERVAL @@ -268,6 +344,11 @@ def main(): env_kwargs["action_repeat"] = args.action_repeat # Random init position for button env_kwargs["random_target"] = args.random_target + + # If in simple continual scenario, then the target should be initialized randomly. 
+ if args.simple_continual is True: + env_kwargs["random_target"] = True + # Allow up action # env_kwargs["force_down"] = False @@ -291,7 +372,8 @@ def main(): globals_env_param = sys.modules[env_class.__module__].getGlobals() super_class = registered_env[args.env][1] - # reccursive search through all the super classes of the asked environment, in order to get all the arguments. + + # recursive search through all the super classes of the asked environment, in order to get all the arguments. rec_super_class_lookup = {dict_class: dict_super_class for _, (dict_class, dict_super_class, _, _) in registered_env.items()} while super_class != SRLGymEnv: @@ -322,12 +404,11 @@ def main(): hyperparams = algo.parserHyperParam(hyperparams) if args.load_rl_model_path is not None: - #use a small learning rate - print("use a small learning rate: {:f}".format(1.0e-4)) - hyperparams["learning_rate"] = lambda f: f * 1.0e-4 + # use a small learning rate + print("use a small learning rate: {:f}".format(NEW_LR)) + hyperparams["learning_rate"] = lambda f: f * NEW_LR # Train the agent - if args.load_rl_model_path is not None: algo.setLoadPath(args.load_rl_model_path) algo.train(args, callback, env_kwargs=env_kwargs, train_kwargs=hyperparams) diff --git a/rl_baselines/visualize.py b/rl_baselines/visualize.py index 16b308105..e2e1270c9 100644 --- a/rl_baselines/visualize.py +++ b/rl_baselines/visualize.py @@ -9,14 +9,15 @@ from scipy.signal import medfilt -def smoothRewardCurve(x, y): +def smoothRewardCurve(x, y, conv_len=30): """ :param x: (numpy array) :param y: (numpy array) + :param conv_len: an integer, kernel size of the convolution :return: (numpy array, numpy array) """ # Halfwidth of our smoothing convolution - halfwidth = min(31, int(np.ceil(len(x) / 30))) + halfwidth = min(conv_len+1, int(np.ceil(len(x) / conv_len))) k = halfwidth xsmoo = x[k:-k] ysmoo = np.convolve(y, np.ones(2 * k + 1), mode='valid') / \ @@ -213,3 +214,36 @@ def timestepsPlot(viz, win, folder, game, name, bin_size=100, smooth=1, title="" "legend": [name] } return viz.line(ty, tx, win=win, opts=opts) + +def episodesEvalPlot(viz, win, folder, game, name, window=1, title=""): + + folder+='episode_eval.npy' + if(os.path.isfile(folder)): + result = np.load(folder) + else: + return win + + if len(result) == 0: + return win + print(result.shape) + + y = np.mean(result[:, :, 1:], axis=2).T + + + x = result[:, :, 0].T + + if y.shape[0] < window: + return win + + if len(y) == 0: + return win + + + opts = { + "title": "{}\n{}".format(game, title), + "xlabel": "Episodes", + "ylabel": "Rewards", + "legend": name + } + + return viz.line(y, x, win=win, opts=opts) \ No newline at end of file diff --git a/run_policy.sh b/run_policy.sh new file mode 100644 index 000000000..bcb1ccede --- /dev/null +++ b/run_policy.sh @@ -0,0 +1,83 @@ +#!/usr/bin/env bash + + + + + +policy="ppo2" +env="OmnirobotEnv-v0" + + +# those name can not be reuse for a unique run, in other case some folder need to be manually removed +name_circular_policy_folder="circular_on_policy" +name_reaching_policy_folder="reaching_on_policy" +merging_file="merge_CC_SC" + + + +### 0 - Generate datasets for SRL (random policy) +# Dataset 1 (random reaching target) +python -m environments.dataset_generator --num-cpu 8 --name Omnibot_random_simple --env $env --simple-continual --num-episode 250 -f +# Dataset 2 (Circular task) +python -m environments.dataset_generator --num-cpu 8 --name Omnibot_random_circular --env $env --circular-continual --num-episode 250 -f + + +### 1.1) Train SRL + +cd 
srl_zoo +# Dataset 1 (random reaching target) +python train.py --data-folder data/Omnibot_random_simple -bs 32 --epochs 20 --state-dim 200 --training-set-size 20000 --losses autoencoder inverse +# Dataset 2 (Circular task) +python train.py --data-folder data/Omnibot_random_circular -bs 32 --epochs 20 --state-dim 200 --training-set-size 20000 --losses autoencoder inverse + +### 1.2) Train policy +cd .. + +# Dataset 1 (random reaching target) +cp config/srl_models_simple.yaml config/srl_models.yaml +python -m rl_baselines.train --algo ppo2 --srl-model srl_combination --num-timesteps 500000 --env OmnirobotEnv-v0 --log-dir logs/simple/ --num-cpu 8 --simple-continual --latest + +# Dataset 2 (Circular task) +cp config/srl_models_circular.yaml config/srl_models.yaml +python -m rl_baselines.train --algo ppo2 --srl-model srl_combination --num-timesteps 1000000 --env OmnirobotEnv-v0 --log-dir logs/circular/ --num-cpu 8 --circular-continual --latest + + + + +# Dataset 1 (random reaching target) +path2policy="logs/simple/OmnirobotEnv-v0/srl_combination/ppo2/" +python -m environments.dataset_generator --env OmnirobotEnv-v0 --num-episode 100 --run-policy custom --log-custom-policy $path2policy --short-episodes --save-path data/ --name reaching_on_policy -sc --latest + + +# Dataset 2 (Circular task) +path2policy="logs/circular/OmnirobotEnv-v0/srl_combination/ppo2/" +python -m environments.dataset_generator --env OmnirobotEnv-v0 --num-episode 100 --run-policy custom --log-custom-policy $path2policy --short-episodes --save-path data/ --name circular_on_policy -cc --latest + + +# Merge Datasets + +#(/ ! \ it removes the generated dataset for dataset 1 and 2) + +python -m environments.dataset_merger --merge data/$name_circular_policy_folder\/ data/$name_reaching_policy_folder\/ data/$merging_file + +# Copy the merged Dataset to srl_zoo repository +cp -r data/$merging_file srl_zoo/data/$merging_file + + +### 2.3) Train SRL 1&2 + +cd srl_zoo +# Dataset 1 +python train.py --data-folder data/$merging_file -bs 32 --epochs 20 --state-dim 200 --training-set-size 30000 --losses autoencoder inverse + + +### 2.3) Run Distillation + +# make a new log folder +mkdir logs/CL_SC_CC +cp config/srl_models_merged.yaml config/srl_models.yaml + +# Merged Dataset +cd .. 
+python -m rl_baselines.train --algo distillation --srl-model srl_combination --env OmnirobotEnv-v0 --log-dir logs/CL_SC_CC --teacher-data-folder srl_zoo/data/$merging_file -cc --distillation-training-set-size 40000 --epochs-distillation 20 --latest + diff --git a/run_policy_dry_run.sh b/run_policy_dry_run.sh new file mode 100755 index 000000000..2ec72e826 --- /dev/null +++ b/run_policy_dry_run.sh @@ -0,0 +1,75 @@ + + + + + +policy="ppo2" +env="OmnirobotEnv-v0" + + +### 0 - Generate datasets for SRL (random policy) +# Dataset 1 (random reaching target) +python -m environments.dataset_generator --num-cpu 8 --name Omnibot_random_simple --env $env --simple-continual --num-episode 1 -f +# Dataset 2 (Circular task) +python -m environments.dataset_generator --num-cpu 8 --name Omnibot_random_circular --env $env --circular-continual --num-episode 1 -f + + +### 1.1) Train SRL + +cd srl_zoo +# Dataset 1 (random reaching target) +python train.py --data-folder data/Omnibot_random_simple -bs 32 --epochs 2 --state-dim 200 --training-set-size 20000 --losses autoencoder inverse +# Dataset 2 (Circular task) +python train.py --data-folder data/Omnibot_random_circular -bs 32 --epochs 2 --state-dim 200 --training-set-size 20000 --losses autoencoder inverse + +### 1.2) Train policy +cd .. + +# Dataset 1 (random reaching target) +cp config/srl_models_simple.yaml config/srl_models.yaml +python -m rl_baselines.train --algo ppo2 --srl-model srl_combination --num-timesteps 30000 --env OmnirobotEnv-v0 --log-dir logs/simple/ --num-cpu 8 --simple-continual --latest + +# Dataset 2 (Circular task) +cp config/srl_models_circular.yaml config/srl_models.yaml +python -m rl_baselines.train --algo ppo2 --srl-model srl_combination --num-timesteps 30000 --env OmnirobotEnv-v0 --log-dir logs/circular/ --num-cpu 8 --circular-continual --latest + + + +# Dataset 1 (random reaching target) + +# Dataset 1 (random reaching target) +path2policy="logs/simple/OmnirobotEnv-v0/srl_combination/ppo2/" +python -m environments.dataset_generator --env OmnirobotEnv-v0 --num-episode 2 --run-policy custom --log-custom-policy $path2policy --short-episodes --save-path data/ --name reaching_on_policy -sc --latest + + +# Dataset 2 (Circular task) +path2policy="logs/circular/OmnirobotEnv-v0/srl_combination/ppo2/" +python -m environments.dataset_generator --env OmnirobotEnv-v0 --num-episode 2 --run-policy custom --log-custom-policy $path2policy --short-episodes --save-path data/ --name circular_on_policy -cc --latest + +# Merge Datasets + +#(/ ! \ it removes the generated dataset for dataset 1 and 2) + +python -m environments.dataset_merger --merge data/circular_on_policy/ data/reaching_on_policy/ data/merge_CC_SC + +# Copy the merged Dataset to srl_zoo repository +cp -r data/merge_CC_SC srl_zoo/data/merge_CC_SC + + +### 2.3) Train SRL 1&2 + +cd srl_zoo +# Dataset 1 +python train.py --data-folder data/merge_CC_SC -bs 32 --epochs 2 --state-dim 200 --training-set-size 30000 --losses autoencoder inverse + + +### 2.3) Run Distillation + +# make a new log folder +mkdir logs/CL_SC_CC +cp config/srl_models_merged.yaml config/srl_models.yaml + +# Merged Dataset +cd .. 
+python -m rl_baselines.train --algo distillation --srl-model srl_combination --env OmnirobotEnv-v0 --log-dir logs/CL_SC_CC --teacher-data-folder srl_zoo/data/merge_CC_SC -cc --distillation-training-set-size 40000 --epochs-distillation 2 --latest + diff --git a/srl_zoo b/srl_zoo index 438a05ab6..f391c6c1a 160000 --- a/srl_zoo +++ b/srl_zoo @@ -1 +1 @@ -Subproject commit 438a05ab625a2c5ada573b47f73469d92de82132 +Subproject commit f391c6c1a782fce78139c7e826654354aadd4899 diff --git a/state_representation/episode_saver.py b/state_representation/episode_saver.py index 842cd3582..ca9439c8b 100644 --- a/state_representation/episode_saver.py +++ b/state_representation/episode_saver.py @@ -36,6 +36,7 @@ def __init__(self, name, max_dist, state_dim=-1, globals_=None, learn_every=3, l printYellow("Folder already exist") self.actions = [] + self.actions_proba = [] self.rewards = [] self.images = [] self.target_positions = [] @@ -111,10 +112,11 @@ def reset(self, observation, target_pos, ground_truth): self.ground_truth_states.append(ground_truth) self.saveImage(observation) - def step(self, observation, action, reward, done, ground_truth_state): + def step(self, observation, action, reward, done, ground_truth_state, action_proba=None, target_pos=[]): """ :param observation: (numpy matrix) BGR Image :param action: (int) + :param action_proba: (list float) probability of taking each action :param reward: (float) :param done: (bool) whether the episode is done or not :param ground_truth_state: (numpy array) @@ -124,11 +126,16 @@ def step(self, observation, action, reward, done, ground_truth_state): self.n_steps += 1 self.rewards.append(reward) self.actions.append(action) + if action_proba is not None: + self.actions_proba.append(action_proba) + if reward > 0: self.episode_success = True if not done: self.episode_starts.append(False) + if len(target_pos) != 0: + self.target_positions.append(target_pos) self.ground_truth_states.append(ground_truth_state) self.saveImage(observation) else: @@ -144,11 +151,14 @@ def save(self): assert len(self.actions) == len(self.episode_starts) assert len(self.actions) == len(self.images_path) assert len(self.actions) == len(self.ground_truth_states) - assert len(self.target_positions) == self.episode_idx + 1 + # change this assertion since the dynamic environment needs to save the target position at each step + assert len(self.target_positions) == self.episode_idx + 1 or len(self.target_positions) == len(self.actions) + assert len(self.actions_proba) == 0 or len(self.actions_proba) == len(self.actions) data = { 'rewards': np.array(self.rewards), 'actions': np.array(self.actions), + 'actions_proba': np.array(self.actions_proba), 'episode_starts': np.array(self.episode_starts) } diff --git a/tests/test_dataset_manipulation.py b/tests/test_dataset_manipulation.py index 22ccc0be0..87053e4d7 100644 --- a/tests/test_dataset_manipulation.py +++ b/tests/test_dataset_manipulation.py @@ -31,7 +31,7 @@ def testDataGenForFusion(): args_3 = ['--merge', PATH_SRL + DATA_FOLDER_NAME_1, PATH_SRL + DATA_FOLDER_NAME_2, PATH_SRL + DATA_FOLDER_NAME_3] args_3 = list(map(str, args_3)) - ok = subprocess.call(['python', '-m', 'environments.dataset_fusioner'] + args_3) + ok = subprocess.call(['python', '-m', 'environments.dataset_merger'] + args_3) assertEq(ok, 0) # Checking inexistance of original datasets to be merged diff --git a/tests/test_distillation_pipeline.py b/tests/test_distillation_pipeline.py new file mode 100644 index 000000000..0a11930c4 --- /dev/null +++ 
b/tests/test_distillation_pipeline.py @@ -0,0 +1,108 @@ +import subprocess +import pytest +import os +import shutil +import time + +from rl_baselines.student_eval import OnPolicyDatasetGenerator, mergeData, trainStudent + + +ENV_NAME = 'OmnirobotEnv-v0' +PATH_SRL = "srl_zoo/data/" +DEFAULT_SRL_TEACHERS = "ground_truth" +DEFAULT_SRL_STUDENT = "raw_pixels" +NUM_TIMESTEP = 25000 +NUM_CPU = 4 + + +def assertEq(left, right): + assert left == right, "{} != {}".format(left, right) + + +@pytest.mark.fast +def testOnPolicyDatasetGeneration(): + + # # Train Ground_truth teacher policies for each env + + # do not write distillation in path to prevent loading irrelevant algo based on folder name + test_log_dir = "logs/test_dist/" + test_log_dir_teacher_sc = test_log_dir + 'teacher_sc/' + test_log_dir_teacher_cc = test_log_dir + 'teacher_cc/' + test_log_dir_teacher_esc = test_log_dir + 'teacher_esc/' + teacher_log_dirs = [test_log_dir_teacher_sc, test_log_dir_teacher_cc, test_log_dir_teacher_esc] + + if os.path.exists(test_log_dir): + print("Destination log directory '{}' already exists - removing it before re-creating it".format(test_log_dir)) + shutil.rmtree(test_log_dir) + if not os.path.exists("logs"): + os.mkdir("logs/") + os.mkdir(test_log_dir) + + for log_dir in teacher_log_dirs: + os.mkdir(log_dir) + + print("Teachers training") + # First Teacher for circular task + args = ['--algo', "ppo2", '--env', ENV_NAME, '--srl-model', DEFAULT_SRL_TEACHERS, + '--num-timesteps', NUM_TIMESTEP, '--num-cpu', NUM_CPU, '--no-vis', '--log-dir', + teacher_log_dirs[0], '-sc'] + args = list(map(str, args)) + + ok = subprocess.call(['python', '-m', 'rl_baselines.train'] + args) + assertEq(ok, 0) + # Second teacher for circular task + args = ['--algo', "ppo2", '--env', ENV_NAME, '--srl-model', DEFAULT_SRL_TEACHERS, + '--num-timesteps', NUM_TIMESTEP, '--num-cpu', NUM_CPU, '--no-vis', '--log-dir', + teacher_log_dirs[1], '-cc'] + args = list(map(str, args)) + + ok = subprocess.call(['python', '-m', 'rl_baselines.train'] + args) + assertEq(ok, 0) + # Third teacher for circular task + args = ['--algo', "ppo2", '--env', ENV_NAME, '--srl-model', DEFAULT_SRL_TEACHERS, + '--num-timesteps', NUM_TIMESTEP, '--num-cpu', NUM_CPU, '--no-vis', '--log-dir', + teacher_log_dirs[2], '-esc'] + args = list(map(str, args)) + + ok = subprocess.call(['python', '-m', 'rl_baselines.train'] + args) + assertEq(ok, 0) + + # Generate on-policy datasets from each teacher + teacher_log_dirs[0] += ENV_NAME + '/' + DEFAULT_SRL_TEACHERS + "/ppo2" + teacher_one_path = \ + max([teacher_log_dirs[0] + "/" + d for d in os.listdir(teacher_log_dirs[0]) + if os.path.isdir(teacher_log_dirs[0] + "/" + d)], key=os.path.getmtime) + '/' + + OnPolicyDatasetGenerator(teacher_path=teacher_one_path, + output_name='test_SC_copy/', task_id='SC', episode=-1, env_name=ENV_NAME, test_mode=True) + + teacher_log_dirs[1] += ENV_NAME + '/' + DEFAULT_SRL_TEACHERS + "/ppo2" + teacher_two_path = \ + max([teacher_log_dirs[1] + "/" + d for d in os.listdir(teacher_log_dirs[1]) + if os.path.isdir(teacher_log_dirs[1] + "/" + d)], key=os.path.getmtime) + '/' + + OnPolicyDatasetGenerator(teacher_path=teacher_two_path, output_name='test_CC_copy/', + task_id='CC', episode=-1, env_name=ENV_NAME, test_mode=True) + + teacher_log_dirs[2] += ENV_NAME + '/' + DEFAULT_SRL_TEACHERS + "/ppo2" + teacher_three_path = \ + max([teacher_log_dirs[2] + "/" + d for d in os.listdir(teacher_log_dirs[2]) + if os.path.isdir(teacher_log_dirs[2] + "/" + d)], key=os.path.getmtime) + '/' + + 
OnPolicyDatasetGenerator(teacher_path=teacher_three_path, output_name='test_ESC_copy/', + task_id='ESC', episode=-1, env_name=ENV_NAME, test_mode=True) + + # Merge those on-policy datasets + merge_path = "data/on_policy_merged_test" + tmp_path = 'data/on_policy_merged_sc_cc/' + mergeData('data/test_SC_copy/', 'data/test_CC_copy/', tmp_path, force=True) + mergeData(tmp_path, 'data/test_ESC_copy/', merge_path, force=True) + + ok = subprocess.call(['cp', '-r', merge_path, 'srl_zoo/data/', '-f']) + assert ok == 0 + time.sleep(1) + + # Train a raw_pixels student policy via distillation + trainStudent(merge_path, "CC", log_dir=test_log_dir, srl_model=DEFAULT_SRL_STUDENT, + env_name=ENV_NAME, training_size=500, epochs=3) + print("Distillation test performed!") diff --git a/tests/test_end_to_end.py b/tests/test_end_to_end.py index 01122a023..1f3286f2e 100644 --- a/tests/test_end_to_end.py +++ b/tests/test_end_to_end.py @@ -167,7 +167,8 @@ def testSrlCombiningTrain(): assertEq(ok, 0) -@pytest.mark.parametrize("model_type", ['vae', 'autoencoder', "robotic_priors", "inverse", "forward", "srl_combination", "multi_view_srl"]) +@pytest.mark.parametrize("model_type", ['vae', 'autoencoder', "robotic_priors", "inverse", "forward", + "srl_combination", "multi_view_srl"]) def testAllRLOnSrlTrain(model_type): """ Testing all the previously learned srl models on the RL pipeline @@ -182,7 +183,8 @@ def testAllRLOnSrlTrain(model_type): assertEq(ok, 0) -@pytest.mark.parametrize("algo", ['a2c', 'acer', 'acktr', 'ars', 'cma-es', 'ddpg', 'deepq', 'ppo1', 'ppo2', 'random_agent', 'sac', 'trpo']) +@pytest.mark.parametrize("algo", ['a2c', 'acer', 'acktr', 'ars', 'cma-es', 'ddpg', 'deepq', 'ppo1', 'ppo2', + 'random_agent', 'sac', 'trpo']) def testAllSrlonRLTrain(algo): """ Testing RL pipeline on previously learned models diff --git a/tests/test_eval.py b/tests/test_eval.py new file mode 100644 index 000000000..44782010b --- /dev/null +++ b/tests/test_eval.py @@ -0,0 +1,80 @@ +from __future__ import print_function, division, absolute_import + +import subprocess +import shutil +import pytest +import os +from environments import ThreadingType +from environments.registry import registered_env + +DEFAULT_ALGO = "ppo2" +DEFAULT_SRL = "raw_pixels" +NUM_ITERATION = 1 +NUM_TIMESTEP = 251 # this should be long enough to call a reset of the environment +SEED = 0 +DEFAULT_ENV = "OmnirobotEnv-v0" +DEFAULT_LOG = "logs/test_eval/" +EPOCH_DATA = 2 +EPISODE_WINS =40 +DIR_STUDENT = 'logs/test_students/' +EPOCH_DISTILLATION = 5 +def assertEq(left, right): + assert left == right, "{} != {}".format(left, right) + +@pytest.mark.fast +@pytest.mark.parametrize("tasks", [['-cc','-sc']]) +def testStudentEval(tasks,teacher_folder_one='logs/teacher_one/', teacher_folder_two='logs/teacher_two/' ): + + + teacher_args_one = ['--algo', DEFAULT_ALGO, '--srl-model', DEFAULT_SRL, + '--num-timesteps', NUM_TIMESTEP, '--seed', SEED, '--no-vis', + '--episode-window', EPISODE_WINS, + '--env', DEFAULT_ENV, '--log-dir', teacher_folder_one , tasks[0], + '--min-episodes-save', 0] + + teacher_args_one = list(map(str, teacher_args_one)) + + teacher_args_two = ['--algo', DEFAULT_ALGO, '--srl-model', DEFAULT_SRL, + '--num-timesteps', NUM_TIMESTEP, '--seed', SEED, '--no-vis', + '--episode-window', EPISODE_WINS, + '--env', DEFAULT_ENV, '--log-dir', teacher_folder_two , tasks[1]] + teacher_args_two = list(map(str, teacher_args_two)) + + ok = subprocess.call(['python', '-m', 'rl_baselines.train'] + teacher_args_one) + assertEq(ok, 0) + ok = subprocess.call(['python', 
'-m', 'rl_baselines.train'] + teacher_args_two) + assertEq(ok, 0) + + folder2remove = [teacher_folder_one, teacher_folder_two] + + for i in range(4):#Go into the folder that contains the policy file + teacher_folder_two += os.listdir(teacher_folder_two)[-1] + '/' + teacher_folder_one += os.listdir(teacher_folder_one)[-1] + '/' + + + #Distillation part + args = ['--num-iteration', NUM_ITERATION, '--epochs-teacher-datasets', EPOCH_DATA, + '--env', DEFAULT_ENV, '--log-dir-student', DIR_STUDENT, + '--log-dir-teacher-one', teacher_folder_one,'--log-dir-teacher-two', teacher_folder_two, + '--epochs-distillation', EPOCH_DISTILLATION] + if(tasks ==['-cc','-sc']): + args+=['--srl-config-file-one', 'config/srl_models_circular.yaml', + '--srl-config-file-two','config/srl_models_simple.yaml', + '--continual-learning-labels', 'CC', 'SC'] + + else: + args += ['--srl-config-file-one', 'config/srl_models_simple.yaml', + '--srl-config-file-two', 'config/srl_models_circular.yaml', + '--continual-learning-labels', 'SC', 'CC'] + + args = list(map(str, args)) + ok = subprocess.call(['python', '-m', 'rl_baselines.student_eval'] + args) + assertEq(ok, 0) + for i in range(10): + print("OK 1 ") + #Remove test files + shutil.rmtree(folder2remove[0]) + shutil.rmtree(folder2remove[1]) + shutil.rmtree(DIR_STUDENT) + for i in range(10): + print("test finished")
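+
+# Note: assuming a standard pytest setup for this repository, this evaluation test can be
+# run on its own with, for example:
+#   pytest tests/test_eval.py -k testStudentEval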