From c588b47bf416756df8b9ab38ae125a2eb750c513 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Wed, 13 Jul 2022 23:28:48 +0200 Subject: [PATCH 001/108] Update Omniverse Isaac Gym examples --- .../examples/omniisaacgym/ppo_allegro_hand.py | 5 +++++ docs/source/examples/omniisaacgym/ppo_ant.py | 5 +++++ docs/source/examples/omniisaacgym/ppo_ant_mt.py | 5 +++++ docs/source/examples/omniisaacgym/ppo_cartpole.py | 5 +++++ .../examples/omniisaacgym/ppo_cartpole_mt.py | 5 +++++ docs/source/examples/omniisaacgym/ppo_humanoid.py | 5 +++++ .../examples/omniisaacgym/ppo_shadow_hand.py | 5 +++++ docs/source/intro/examples.rst | 14 +++++++------- 8 files changed, 42 insertions(+), 7 deletions(-) diff --git a/docs/source/examples/omniisaacgym/ppo_allegro_hand.py b/docs/source/examples/omniisaacgym/ppo_allegro_hand.py index 04fd3a3b..02e06628 100644 --- a/docs/source/examples/omniisaacgym/ppo_allegro_hand.py +++ b/docs/source/examples/omniisaacgym/ppo_allegro_hand.py @@ -6,6 +6,7 @@ from skrl.memories.torch import RandomMemory from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG from skrl.resources.schedulers.torch import KLAdaptiveRL +from skrl.resources.preprocessors.torch import RunningStandardScaler from skrl.trainers.torch import SequentialTrainer from skrl.envs.torch import wrap_env from skrl.envs.torch import load_omniverse_isaacgym_env @@ -97,6 +98,10 @@ def compute(self, states, taken_actions): cfg_ppo["value_loss_scale"] = 2.0 cfg_ppo["kl_threshold"] = 0 cfg_ppo["rewards_shaper"] = lambda rewards, timestep, timesteps: rewards * 0.01 +cfg_ppo["state_preprocessor"] = RunningStandardScaler +cfg_ppo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device} +cfg_ppo["value_preprocessor"] = RunningStandardScaler +cfg_ppo["value_preprocessor_kwargs"] = {"size": 1, "device": device} # logging to TensorBoard and write checkpoints each 200 and 2000 timesteps respectively cfg_ppo["experiment"]["write_interval"] = 200 cfg_ppo["experiment"]["checkpoint_interval"] = 2000 diff --git a/docs/source/examples/omniisaacgym/ppo_ant.py b/docs/source/examples/omniisaacgym/ppo_ant.py index 2244d1be..dc446032 100644 --- a/docs/source/examples/omniisaacgym/ppo_ant.py +++ b/docs/source/examples/omniisaacgym/ppo_ant.py @@ -6,6 +6,7 @@ from skrl.memories.torch import RandomMemory from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG from skrl.resources.schedulers.torch import KLAdaptiveRL +from skrl.resources.preprocessors.torch import RunningStandardScaler from skrl.trainers.torch import SequentialTrainer from skrl.envs.torch import wrap_env from skrl.envs.torch import load_omniverse_isaacgym_env @@ -97,6 +98,10 @@ def compute(self, states, taken_actions): cfg_ppo["value_loss_scale"] = 1.0 cfg_ppo["kl_threshold"] = 0 cfg_ppo["rewards_shaper"] = lambda rewards, timestep, timesteps: rewards * 0.01 +cfg_ppo["state_preprocessor"] = RunningStandardScaler +cfg_ppo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device} +cfg_ppo["value_preprocessor"] = RunningStandardScaler +cfg_ppo["value_preprocessor_kwargs"] = {"size": 1, "device": device} # logging to TensorBoard and write checkpoints each 40 and 400 timesteps respectively cfg_ppo["experiment"]["write_interval"] = 40 cfg_ppo["experiment"]["checkpoint_interval"] = 400 diff --git a/docs/source/examples/omniisaacgym/ppo_ant_mt.py b/docs/source/examples/omniisaacgym/ppo_ant_mt.py index 3aea2fb5..e576af3b 100644 --- a/docs/source/examples/omniisaacgym/ppo_ant_mt.py +++ 
b/docs/source/examples/omniisaacgym/ppo_ant_mt.py @@ -8,6 +8,7 @@ from skrl.memories.torch import RandomMemory from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG from skrl.resources.schedulers.torch import KLAdaptiveRL +from skrl.resources.preprocessors.torch import RunningStandardScaler from skrl.trainers.torch import SequentialTrainer from skrl.envs.torch import wrap_env from skrl.envs.torch import load_omniverse_isaacgym_env @@ -99,6 +100,10 @@ def compute(self, states, taken_actions): cfg_ppo["value_loss_scale"] = 1.0 cfg_ppo["kl_threshold"] = 0 cfg_ppo["rewards_shaper"] = lambda rewards, timestep, timesteps: rewards * 0.01 +cfg_ppo["state_preprocessor"] = RunningStandardScaler +cfg_ppo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device} +cfg_ppo["value_preprocessor"] = RunningStandardScaler +cfg_ppo["value_preprocessor_kwargs"] = {"size": 1, "device": device} # logging to TensorBoard and write checkpoints each 40 and 400 timesteps respectively cfg_ppo["experiment"]["write_interval"] = 40 cfg_ppo["experiment"]["checkpoint_interval"] = 400 diff --git a/docs/source/examples/omniisaacgym/ppo_cartpole.py b/docs/source/examples/omniisaacgym/ppo_cartpole.py index 034b31c0..096e67a9 100644 --- a/docs/source/examples/omniisaacgym/ppo_cartpole.py +++ b/docs/source/examples/omniisaacgym/ppo_cartpole.py @@ -6,6 +6,7 @@ from skrl.memories.torch import RandomMemory from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG from skrl.resources.schedulers.torch import KLAdaptiveRL +from skrl.resources.preprocessors.torch import RunningStandardScaler from skrl.trainers.torch import SequentialTrainer from skrl.envs.torch import wrap_env from skrl.envs.torch import load_omniverse_isaacgym_env @@ -93,6 +94,10 @@ def compute(self, states, taken_actions): cfg_ppo["value_loss_scale"] = 2.0 cfg_ppo["kl_threshold"] = 0 cfg_ppo["rewards_shaper"] = lambda rewards, timestep, timesteps: rewards * 0.1 +cfg_ppo["state_preprocessor"] = RunningStandardScaler +cfg_ppo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device} +cfg_ppo["value_preprocessor"] = RunningStandardScaler +cfg_ppo["value_preprocessor_kwargs"] = {"size": 1, "device": device} # logging to TensorBoard and write checkpoints each 16 and 80 timesteps respectively cfg_ppo["experiment"]["write_interval"] = 16 cfg_ppo["experiment"]["checkpoint_interval"] = 80 diff --git a/docs/source/examples/omniisaacgym/ppo_cartpole_mt.py b/docs/source/examples/omniisaacgym/ppo_cartpole_mt.py index 888caaeb..8f1928e6 100644 --- a/docs/source/examples/omniisaacgym/ppo_cartpole_mt.py +++ b/docs/source/examples/omniisaacgym/ppo_cartpole_mt.py @@ -8,6 +8,7 @@ from skrl.memories.torch import RandomMemory from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG from skrl.resources.schedulers.torch import KLAdaptiveRL +from skrl.resources.preprocessors.torch import RunningStandardScaler from skrl.trainers.torch import SequentialTrainer from skrl.envs.torch import wrap_env from skrl.envs.torch import load_omniverse_isaacgym_env @@ -95,6 +96,10 @@ def compute(self, states, taken_actions): cfg_ppo["value_loss_scale"] = 2.0 cfg_ppo["kl_threshold"] = 0 cfg_ppo["rewards_shaper"] = lambda rewards, timestep, timesteps: rewards * 0.1 +cfg_ppo["state_preprocessor"] = RunningStandardScaler +cfg_ppo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device} +cfg_ppo["value_preprocessor"] = RunningStandardScaler +cfg_ppo["value_preprocessor_kwargs"] = {"size": 1, "device": device} # logging to TensorBoard 
and write checkpoints each 16 and 80 timesteps respectively cfg_ppo["experiment"]["write_interval"] = 16 cfg_ppo["experiment"]["checkpoint_interval"] = 80 diff --git a/docs/source/examples/omniisaacgym/ppo_humanoid.py b/docs/source/examples/omniisaacgym/ppo_humanoid.py index ad9cd876..9fd60330 100644 --- a/docs/source/examples/omniisaacgym/ppo_humanoid.py +++ b/docs/source/examples/omniisaacgym/ppo_humanoid.py @@ -6,6 +6,7 @@ from skrl.memories.torch import RandomMemory from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG from skrl.resources.schedulers.torch import KLAdaptiveRL +from skrl.resources.preprocessors.torch import RunningStandardScaler from skrl.trainers.torch import SequentialTrainer from skrl.envs.torch import wrap_env from skrl.envs.torch import load_omniverse_isaacgym_env @@ -97,6 +98,10 @@ def compute(self, states, taken_actions): cfg_ppo["value_loss_scale"] = 2.0 cfg_ppo["kl_threshold"] = 0 cfg_ppo["rewards_shaper"] = lambda rewards, timestep, timesteps: rewards * 0.01 +cfg_ppo["state_preprocessor"] = RunningStandardScaler +cfg_ppo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device} +cfg_ppo["value_preprocessor"] = RunningStandardScaler +cfg_ppo["value_preprocessor_kwargs"] = {"size": 1, "device": device} # logging to TensorBoard and write checkpoints each 160 and 1600 timesteps respectively cfg_ppo["experiment"]["write_interval"] = 160 cfg_ppo["experiment"]["checkpoint_interval"] = 1600 diff --git a/docs/source/examples/omniisaacgym/ppo_shadow_hand.py b/docs/source/examples/omniisaacgym/ppo_shadow_hand.py index 90648118..e7b66041 100644 --- a/docs/source/examples/omniisaacgym/ppo_shadow_hand.py +++ b/docs/source/examples/omniisaacgym/ppo_shadow_hand.py @@ -6,6 +6,7 @@ from skrl.memories.torch import RandomMemory from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG from skrl.resources.schedulers.torch import KLAdaptiveRL +from skrl.resources.preprocessors.torch import RunningStandardScaler from skrl.trainers.torch import SequentialTrainer from skrl.envs.torch import wrap_env from skrl.envs.torch import load_omniverse_isaacgym_env @@ -101,6 +102,10 @@ def compute(self, states, taken_actions): cfg_ppo["value_loss_scale"] = 2.0 cfg_ppo["kl_threshold"] = 0 cfg_ppo["rewards_shaper"] = lambda rewards, timestep, timesteps: rewards * 0.01 +cfg_ppo["state_preprocessor"] = RunningStandardScaler +cfg_ppo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device} +cfg_ppo["value_preprocessor"] = RunningStandardScaler +cfg_ppo["value_preprocessor_kwargs"] = {"size": 1, "device": device} # logging to TensorBoard and write checkpoints each 200 and 2000 timesteps respectively cfg_ppo["experiment"]["write_interval"] = 200 cfg_ppo["experiment"]["checkpoint_interval"] = 2000 diff --git a/docs/source/intro/examples.rst b/docs/source/intro/examples.rst index a9c48ad1..bf3cd29b 100644 --- a/docs/source/intro/examples.rst +++ b/docs/source/intro/examples.rst @@ -608,7 +608,7 @@ The PPO agent configuration is mapped, as far as possible, from the rl_games' A2 .. literalinclude:: ../examples/omniisaacgym/ppo_allegro_hand.py :language: python :linenos: - :emphasize-lines: 10-11, 57-58 + :emphasize-lines: 11-12, 58-59 .. tab:: Ant @@ -617,7 +617,7 @@ The PPO agent configuration is mapped, as far as possible, from the rl_games' A2 .. literalinclude:: ../examples/omniisaacgym/ppo_ant.py :language: python :linenos: - :emphasize-lines: 10-11, 57-58 + :emphasize-lines: 11-12, 58-59 .. 
tab:: Ant (multi-threaded) @@ -626,7 +626,7 @@ The PPO agent configuration is mapped, as far as possible, from the rl_games' A2 .. literalinclude:: ../examples/omniisaacgym/ppo_ant_mt.py :language: python :linenos: - :emphasize-lines: 1, 12-13, 59-60, 119, 123 + :emphasize-lines: 1, 13-14, 60-61, 124, 128 .. tab:: Cartpole @@ -635,7 +635,7 @@ The PPO agent configuration is mapped, as far as possible, from the rl_games' A2 .. literalinclude:: ../examples/omniisaacgym/ppo_cartpole.py :language: python :linenos: - :emphasize-lines: 10-11, 53-54 + :emphasize-lines: 11-12, 54-55 .. tab:: Cartpole (multi-threaded) @@ -644,7 +644,7 @@ The PPO agent configuration is mapped, as far as possible, from the rl_games' A2 .. literalinclude:: ../examples/omniisaacgym/ppo_cartpole_mt.py :language: python :linenos: - :emphasize-lines: 1, 12-13, 55-56, 115, 119 + :emphasize-lines: 1, 13-14, 56-57, 120, 124 .. tab:: Humanoid @@ -653,7 +653,7 @@ The PPO agent configuration is mapped, as far as possible, from the rl_games' A2 .. literalinclude:: ../examples/omniisaacgym/ppo_humanoid.py :language: python :linenos: - :emphasize-lines: 10-11, 57-58 + :emphasize-lines: 11-12, 58-59 .. tab:: ShadowHand @@ -662,7 +662,7 @@ The PPO agent configuration is mapped, as far as possible, from the rl_games' A2 .. literalinclude:: ../examples/omniisaacgym/ppo_shadow_hand.py :language: python :linenos: - :emphasize-lines: 10-11, 61-62 + :emphasize-lines: 11-12, 62-63 .. raw:: html From c7e44c73d18097142af065acaa15392b8f4ea88f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Sun, 24 Jul 2022 23:49:50 +0200 Subject: [PATCH 002/108] Add AMP agent --- skrl/agents/torch/amp/__init__.py | 1 + skrl/agents/torch/amp/amp.py | 553 ++++++++++++++++++++++++++++++ 2 files changed, 554 insertions(+) create mode 100644 skrl/agents/torch/amp/__init__.py create mode 100644 skrl/agents/torch/amp/amp.py diff --git a/skrl/agents/torch/amp/__init__.py b/skrl/agents/torch/amp/__init__.py new file mode 100644 index 00000000..9a6ca76e --- /dev/null +++ b/skrl/agents/torch/amp/__init__.py @@ -0,0 +1 @@ +from .amp import AMP, AMP_DEFAULT_CONFIG \ No newline at end of file diff --git a/skrl/agents/torch/amp/amp.py b/skrl/agents/torch/amp/amp.py new file mode 100644 index 00000000..e2befc77 --- /dev/null +++ b/skrl/agents/torch/amp/amp.py @@ -0,0 +1,553 @@ +from typing import Callable, Union, Tuple, Dict, Any + +import gym +import math +import copy +import itertools + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from ....memories.torch import Memory +from ....models.torch import Model + +from .. import Agent + + +AMP_DEFAULT_CONFIG = { + "rollouts": 16, # number of rollouts before updating + "learning_epochs": 6, # number of learning epochs during each update + "mini_batches": 2, # number of mini batches during each learning epoch + + "discount_factor": 0.99, # discount factor (gamma) + "lambda": 0.95, # TD(lambda) coefficient (lam) for computing returns and advantages + + "learning_rate": 5e-5, # learning rate + "discriminator_learning_rate": 5e-5, # discriminator learning rate + "learning_rate_scheduler": None, # learning rate scheduler class (see torch.optim.lr_scheduler) + "learning_rate_scheduler_kwargs": {}, # learning rate scheduler's kwargs (e.g. {"step_size": 1e-3}) + + "state_preprocessor": None, # state preprocessor class (see skrl.resources.preprocessors) + "state_preprocessor_kwargs": {}, # state preprocessor's kwargs (e.g. 
{"size": env.observation_space}) + "value_preprocessor": None, # value preprocessor class (see skrl.resources.preprocessors) + "value_preprocessor_kwargs": {}, # value preprocessor's kwargs (e.g. {"size": 1}) + "amp_state_preprocessor": None, # AMP state preprocessor class (see skrl.resources.preprocessors) + "amp_state_preprocessor_kwargs": {}, # AMP state preprocessor's kwargs (e.g. {"size": env.amp_observation_space}) + + "random_timesteps": 0, # random exploration steps + "learning_starts": 0, # learning starts after this many steps + + "grad_norm_clip": 0.0, # clipping coefficient for the norm of the gradients + "ratio_clip": 0.2, # clipping coefficient for computing the clipped surrogate objective + "value_clip": 0.2, # clipping coefficient for computing the value loss (if clip_predicted_values is True) + "clip_predicted_values": False, # clip predicted values during value loss computation + + "entropy_loss_scale": 0.0, # entropy loss scaling factor + "value_loss_scale": 2.5, # value loss scaling factor + "discriminator_loss_scale": 5.0, # discriminator loss scaling factor + + "amp_batch_size": 512, # batch size for updating the reference motion dataset + "task_reward_weight": 0.0, # task-reward weight (wG) + "style_reward_weight": 1.0, # style-reward weight (wS) + "discriminator_reward_scale": 2, # discriminator reward scaling factor + "discriminator_logit_regularization_scale": 0.05, # logit regularization scale factor for the discriminator loss + "discriminator_gradient_penalty_scale": 5, # gradient penalty scaling factor for the discriminator loss + "discriminator_weight_decay_scale": 0.0001, # weight decay scaling factor for the discriminator loss + + "rewards_shaper": None, # rewards shaping function: Callable(reward, timestep, timesteps) -> reward + + "experiment": { + "directory": "", # experiment's parent directory + "experiment_name": "", # experiment name + "write_interval": 250, # TensorBoard writing interval (timesteps) + + "checkpoint_interval": 1000, # interval for checkpoints (timesteps) + "checkpoint_policy_only": True, # checkpoint for policy only + } +} + + +class AMP(Agent): + def __init__(self, + models: Dict[str, Model], + memory: Union[Memory, Tuple[Memory], None] = None, + observation_space: Union[int, Tuple[int], gym.Space, None] = None, + action_space: Union[int, Tuple[int], gym.Space, None] = None, + device: Union[str, torch.device] = "cuda:0", + cfg: dict = {}, + amp_observation_space: Union[int, Tuple[int], gym.Space, None] = None, + motion_dataset: Union[Memory, None] = None, + reply_buffer: Union[Memory, None] = None, + collect_reference_motions: Union[Callable[[int], torch.Tensor], None] = None, + collect_observation: Union[Callable[[], torch.Tensor], None] = None) -> None: + """Adversarial Motion Priors (AMP) + + https://arxiv.org/abs/2104.02180 + + The implementation is adapted from the NVIDIA IsaacGymEnvs + (https://github.com/NVIDIA-Omniverse/IsaacGymEnvs/blob/main/isaacgymenvs/learning/amp_continuous.py) + + :param models: Models used by the agent + :type models: dictionary of skrl.models.torch.Model + :param memory: Memory to storage the transitions. 
+ If it is a tuple, the first element will be used for training and + for the rest only the environment transitions will be added + :type memory: skrl.memory.torch.Memory, list of skrl.memory.torch.Memory or None + :param observation_space: Observation/state space or shape (default: None) + :type observation_space: int, tuple or list of integers, gym.Space or None, optional + :param action_space: Action space or shape (default: None) + :type action_space: int, tuple or list of integers, gym.Space or None, optional + :param device: Computing device (default: "cuda:0") + :type device: str or torch.device, optional + :param cfg: Configuration dictionary + :type cfg: dict + :param amp_observation_space: AMP observation/state space or shape (default: None) + :type amp_observation_space: int, tuple or list of integers, gym.Space or None + :param motion_dataset: Reference motion dataset: M (default: None) + :type motion_dataset: skrl.memory.torch.Memory or None + :param reply_buffer: Reply buffer for preventing discriminator overfitting: B (default: None) + :type reply_buffer: skrl.memory.torch.Memory or None + :param collect_reference_motions: Callable to collect reference motions (default: None) + :type collect_reference_motions: Callable[[int], torch.Tensor] or None + :param collect_observation: Callable to collect observation (default: None) + :type collect_observation: Callable[[], torch.Tensor] or None + + :raises KeyError: If the models dictionary is missing a required key + """ + _cfg = copy.deepcopy(AMP_DEFAULT_CONFIG) + _cfg.update(cfg) + super().__init__(models=models, + memory=memory, + observation_space=observation_space, + action_space=action_space, + device=device, + cfg=_cfg) + + self.amp_observation_space = amp_observation_space + self.motion_dataset = motion_dataset + self.reply_buffer = reply_buffer + self.collect_reference_motions = collect_reference_motions + self.collect_observation = collect_observation + + # models + self.policy = self.models.get("policy", None) + self.value = self.models.get("value", None) + self.discriminator = self.models.get("discriminator", None) + + # checkpoint models + self.checkpoint_models = {"policy": self.policy} if self.checkpoint_policy_only else self.models + + # configuration + self._learning_epochs = self.cfg["learning_epochs"] + self._mini_batches = self.cfg["mini_batches"] + self._rollouts = self.cfg["rollouts"] + self._rollout = 0 + + self._grad_norm_clip = self.cfg["grad_norm_clip"] + self._ratio_clip = self.cfg["ratio_clip"] + self._value_clip = self.cfg["value_clip"] + self._clip_predicted_values = self.cfg["clip_predicted_values"] + + self._value_loss_scale = self.cfg["value_loss_scale"] + self._entropy_loss_scale = self.cfg["entropy_loss_scale"] + self._discriminator_loss_scale = self.cfg["discriminator_loss_scale"] + + self._learning_rate = self.cfg["learning_rate"] + self._discriminator_learning_rate = self.cfg["discriminator_learning_rate"] + self._learning_rate_scheduler = self.cfg["learning_rate_scheduler"] + + self._state_preprocessor = self.cfg["state_preprocessor"] + self._value_preprocessor = self.cfg["value_preprocessor"] + self._amp_state_preprocessor = self.cfg["amp_state_preprocessor"] + + self._discount_factor = self.cfg["discount_factor"] + self._lambda = self.cfg["lambda"] + + self._random_timesteps = self.cfg["random_timesteps"] + self._learning_starts = self.cfg["learning_starts"] + + self._amp_batch_size = self.cfg["amp_batch_size"] + self._task_reward_weight = self.cfg["task_reward_weight"] + 
self._style_reward_weight = self.cfg["style_reward_weight"] + + self._discriminator_reward_scale = self.cfg["discriminator_reward_scale"] + self._discriminator_logit_regularization_scale = self.cfg["discriminator_logit_regularization_scale"] + self._discriminator_gradient_penalty_scale = self.cfg["discriminator_gradient_penalty_scale"] + self._discriminator_weight_decay_scale = self.cfg["discriminator_weight_decay_scale"] + + self._rewards_shaper = self.cfg["rewards_shaper"] + + # set up optimizer and learning rate scheduler + if self.policy is not None and self.value is not None and self.discriminator is not None: + self.optimizer = torch.optim.Adam(itertools.chain(self.policy.parameters(), + self.value.parameters(), + self.discriminator.parameters()), + lr=self._learning_rate) + if self._learning_rate_scheduler is not None: + self.scheduler = self._learning_rate_scheduler(self.optimizer, **self.cfg["learning_rate_scheduler_kwargs"]) + + # set up preprocessors + self._state_preprocessor = self._state_preprocessor(**self.cfg["state_preprocessor_kwargs"]) if self._state_preprocessor \ + else self._empty_preprocessor + self._value_preprocessor = self._value_preprocessor(**self.cfg["value_preprocessor_kwargs"]) if self._value_preprocessor \ + else self._empty_preprocessor + self._amp_state_preprocessor = self._amp_state_preprocessor(**self.cfg["amp_state_preprocessor_kwargs"]) \ + if self._amp_state_preprocessor else self._empty_preprocessor + + def init(self) -> None: + """Initialize the agent + """ + super().init() + self.set_mode("eval") + + # create tensors in memory + if self.memory is not None: + self.memory.create_tensor(name="states", size=self.observation_space, dtype=torch.float32) + self.memory.create_tensor(name="next_states", size=self.observation_space, dtype=torch.float32) + self.memory.create_tensor(name="actions", size=self.action_space, dtype=torch.float32) + self.memory.create_tensor(name="rewards", size=1, dtype=torch.float32) + self.memory.create_tensor(name="dones", size=1, dtype=torch.bool) + self.memory.create_tensor(name="log_prob", size=self.action_space, dtype=torch.float32) + self.memory.create_tensor(name="values", size=1, dtype=torch.float32) + self.memory.create_tensor(name="returns", size=1, dtype=torch.float32) + self.memory.create_tensor(name="advantages", size=1, dtype=torch.float32) + + self.memory.create_tensor(name="amp_states", size=self.amp_observation_space, dtype=torch.float32) + self.memory.create_tensor(name="next_values", size=1, dtype=torch.float32) + + self.tensors_names = ["states", "actions", "rewards", "next_states", "dones", \ + "log_prob", "values", "returns", "advantages", "amp_states", "next_values"] + + # create tensors for motion dataset and reply buffer + self.motion_dataset.create_tensor(name="states", size=self.amp_observation_space, dtype=torch.float32) + self.reply_buffer.create_tensor(name="states", size=self.amp_observation_space, dtype=torch.float32) + + # initialize motion dataset + for _ in range(math.ceil(self.motion_dataset.memory_size / self._amp_batch_size)): + self.motion_dataset.add_samples(states=self.collect_reference_motions(self._amp_batch_size)) + + # create temporary variables needed for storage and computation + self._current_log_prob = None + self._current_states = None + + def act(self, + states: torch.Tensor, + timestep: int, + timesteps: int, + inference: bool = False) -> torch.Tensor: + """Process the environment's states to make a decision (actions) using the main policy + + :param states: Environment's 
states + :type states: torch.Tensor + :param timestep: Current timestep + :type timestep: int + :param timesteps: Number of timesteps + :type timesteps: int + :param inference: Flag to indicate whether the model is making inference + :type inference: bool + + :return: Actions + :rtype: torch.Tensor + """ + # use collected states + if self._current_states is not None: + states = self._current_states + + states = self._state_preprocessor(states) + + # sample random actions + # TODO, check for stochasticity + if timestep < self._random_timesteps: + return self.policy.random_act(states) + + # sample stochastic actions + actions, log_prob, actions_mean = self.policy.act(states, inference=inference) + self._current_log_prob = log_prob + + return actions, log_prob, actions_mean + + def record_transition(self, + states: torch.Tensor, + actions: torch.Tensor, + rewards: torch.Tensor, + next_states: torch.Tensor, + dones: torch.Tensor, + infos: Any, + timestep: int, + timesteps: int) -> None: + """Record an environment transition in memory + + :param states: Observations/states of the environment used to make the decision + :type states: torch.Tensor + :param actions: Actions taken by the agent + :type actions: torch.Tensor + :param rewards: Instant rewards achieved by the current actions + :type rewards: torch.Tensor + :param next_states: Next observations/states of the environment + :type next_states: torch.Tensor + :param dones: Signals to indicate that episodes have ended + :type dones: torch.Tensor + :param infos: Additional information about the environment + :type infos: Any type supported by the environment + :param timestep: Current timestep + :type timestep: int + :param timesteps: Number of timesteps + :type timesteps: int + """ + # use collected states + if self._current_states is not None: + states = self._current_states + + super().record_transition(states, actions, rewards, next_states, dones, infos, timestep, timesteps) + + # reward shaping + if self._rewards_shaper is not None: + rewards = self._rewards_shaper(rewards, timestep, timesteps) + + amp_states = infos["amp_obs"] + + if self.memory is not None: + values, _, _ = self.value.act(states=self._state_preprocessor(states), inference=True) + values = self._value_preprocessor(values, inverse=True) + + next_values, _, _ = self.value.act(states=self._state_preprocessor(next_states), inference=True) + next_values = self._value_preprocessor(next_values, inverse=True) + next_values *= infos['terminate'].view(-1, 1).logical_not() + + self.memory.add_samples(states=states, actions=actions, rewards=rewards, next_states=next_states, dones=dones, + log_prob=self._current_log_prob, values=values, amp_states=amp_states, next_values=next_values) + for memory in self.secondary_memories: + memory.add_samples(states=states, actions=actions, rewards=rewards, next_states=next_states, dones=dones, + log_prob=self._current_log_prob, values=values, amp_states=amp_states, next_values=next_values) + + def pre_interaction(self, timestep: int, timesteps: int) -> None: + """Callback called before the interaction with the environment + + :param timestep: Current timestep + :type timestep: int + :param timesteps: Number of timesteps + :type timesteps: int + """ + if self.collect_observation is not None: + self._current_states = self.collect_observation() + + def post_interaction(self, timestep: int, timesteps: int) -> None: + """Callback called after the interaction with the environment + + :param timestep: Current timestep + :type timestep: int + :param 
timesteps: Number of timesteps + :type timesteps: int + """ + self._rollout += 1 + if not self._rollout % self._rollouts and timestep >= self._learning_starts: + self.set_mode("train") + self._update(timestep, timesteps) + self.set_mode("eval") + + # write tracking data and checkpoints + super().post_interaction(timestep, timesteps) + + def _update(self, timestep: int, timesteps: int) -> None: + """Algorithm's main update step + + :param timestep: Current timestep + :type timestep: int + :param timesteps: Number of timesteps + :type timesteps: int + """ + def compute_gae(rewards: torch.Tensor, + dones: torch.Tensor, + values: torch.Tensor, + next_values: torch.Tensor, + discount_factor: float = 0.99, + lambda_coefficient: float = 0.95) -> torch.Tensor: + """Compute the Generalized Advantage Estimator (GAE) + + :param rewards: Rewards obtained by the agent + :type rewards: torch.Tensor + :param dones: Signals to indicate that episodes have ended + :type dones: torch.Tensor + :param values: Values obtained by the agent + :type values: torch.Tensor + :param next_values: Next values obtained by the agent + :type next_values: torch.Tensor + :param discount_factor: Discount factor + :type discount_factor: float + :param lambda_coefficient: Lambda coefficient + :type lambda_coefficient: float + + :return: Generalized Advantage Estimator + :rtype: torch.Tensor + """ + advantage = 0 + advantages = torch.zeros_like(rewards) + not_dones = dones.logical_not() + memory_size = rewards.shape[0] + + # advantages computation + for i in reversed(range(memory_size)): + advantage = rewards[i] - values[i] + discount_factor * (next_values[i] + lambda_coefficient * not_dones[i] * advantage) + advantages[i] = advantage + # returns computation + returns = advantages + values + # normalize advantages + advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8) + + return returns, advantages + + # update dataset of reference motions + self.motion_dataset.add_samples(states=self.collect_reference_motions(self._amp_batch_size)) + + # compute combined rewards + rewards = self.memory.get_tensor_by_name("rewards") + amp_states = self.memory.get_tensor_by_name("amp_states") + + with torch.no_grad(): + amp_logits, _, _ = self.discriminator.act(self._amp_state_preprocessor(amp_states)) + style_reward = -torch.log(torch.maximum(1 - 1 / (1 + torch.exp(-amp_logits)), torch.tensor(0.0001, device=self.device))) + style_reward *= self._discriminator_reward_scale + + combined_rewards = self._task_reward_weight * rewards + self._style_reward_weight * style_reward + + # compute returns and advantages + values = self.memory.get_tensor_by_name("values") + next_values=self.memory.get_tensor_by_name("next_values") + returns, advantages = compute_gae(rewards=combined_rewards, + dones=self.memory.get_tensor_by_name("dones"), + values=values, + next_values=next_values, + discount_factor=self._discount_factor, + lambda_coefficient=self._lambda) + + self.memory.set_tensor_by_name("values", self._value_preprocessor(values, train=True)) + self.memory.set_tensor_by_name("returns", self._value_preprocessor(returns, train=True)) + self.memory.set_tensor_by_name("advantages", advantages) + + # sample mini-batches from memory + sampled_batches = self.memory.sample_all(names=self.tensors_names, mini_batches=self._mini_batches) + sampled_motion_batches = self.motion_dataset.sample(names=["states"], + batch_size=self.memory.memory_size * self.memory.num_envs, + mini_batches=self._mini_batches) + if len(self.reply_buffer): + 
sampled_replay_batches = self.reply_buffer.sample(names=["states"], + batch_size=self.memory.memory_size * self.memory.num_envs, + mini_batches=self._mini_batches) + else: + sampled_replay_batches = [[batches[self.tensors_names.index("amp_states")]] for batches in sampled_batches] + + cumulative_policy_loss = 0 + cumulative_entropy_loss = 0 + cumulative_value_loss = 0 + cumulative_discriminator_loss = 0 + + # learning epochs + for epoch in range(self._learning_epochs): + + # mini-batches loop + for batch_index, (sampled_states, sampled_actions, _, _, _, \ + sampled_log_prob, sampled_values, sampled_returns, sampled_advantages, \ + sampled_amp_states, _) in enumerate(sampled_batches): + + sampled_states = self._state_preprocessor(sampled_states, train=True) + + _, next_log_prob, _ = self.policy.act(states=sampled_states, taken_actions=sampled_actions) + + # compute entropy loss + if self._entropy_loss_scale: + entropy_loss = -self._entropy_loss_scale * self.policy.get_entropy().mean() + else: + entropy_loss = 0 + + # compute policy loss + ratio = torch.exp(next_log_prob - sampled_log_prob) + surrogate = sampled_advantages * ratio + surrogate_clipped = sampled_advantages * torch.clip(ratio, 1.0 - self._ratio_clip, 1.0 + self._ratio_clip) + + policy_loss = -torch.min(surrogate, surrogate_clipped).mean() + + # compute value loss + predicted_values, _, _ = self.value.act(states=sampled_states) + + if self._clip_predicted_values: + predicted_values = sampled_values + torch.clip(predicted_values - sampled_values, + min=-self._value_clip, + max=self._value_clip) + value_loss = self._value_loss_scale * F.mse_loss(sampled_returns, predicted_values) + + # compute discriminator loss + amp_logits, _, _ = self.discriminator.act(states=self._amp_state_preprocessor(sampled_amp_states[0:4096], train=True)) + + amp_replay_logits, _, _ = self.discriminator.act(states=self._amp_state_preprocessor(sampled_replay_batches[batch_index][0][0:4096], train=True)) + + sampled_amp_motion_states = self._amp_state_preprocessor(sampled_motion_batches[batch_index][0][0:4096], train=True) + sampled_amp_motion_states.requires_grad_(True) + amp_motion_logits, _, _ = self.discriminator.act(states=sampled_amp_motion_states) + + amp_cat_logits = torch.cat([amp_logits, amp_replay_logits], dim=0) + + # discriminator prediction loss + discriminator_loss = 0.5 * (nn.BCEWithLogitsLoss()(amp_cat_logits, torch.zeros_like(amp_cat_logits)) \ + + torch.nn.BCEWithLogitsLoss()(amp_motion_logits, torch.ones_like(amp_motion_logits))) + + # discriminator logit regularization + if self._discriminator_logit_regularization_scale: + logit_weights = torch.flatten(list(self.discriminator.modules())[-1].weight) + discriminator_loss += self._discriminator_logit_regularization_scale * torch.sum(torch.square(logit_weights)) + + # discriminator gradient penalty + if self._discriminator_gradient_penalty_scale: + amp_motion_gradient = torch.autograd.grad(amp_motion_logits, + sampled_amp_motion_states, + grad_outputs=torch.ones_like(amp_motion_logits), + create_graph=True, + retain_graph=True, + only_inputs=True) + gradient_penalty = torch.sum(torch.square(amp_motion_gradient[0]), dim=-1).mean() + discriminator_loss += self._discriminator_gradient_penalty_scale * gradient_penalty + + # discriminator weight decay + if self._discriminator_weight_decay_scale: + weights = [torch.flatten(module.weight) for module in self.discriminator.modules() \ + if isinstance(module, torch.nn.Linear)] + weight_decay = torch.sum(torch.square(torch.cat(weights, dim=-1))) + 
discriminator_loss += self._discriminator_weight_decay_scale * weight_decay + + discriminator_loss *= self._discriminator_loss_scale + + # optimization step + self.optimizer.zero_grad() + (policy_loss + entropy_loss + value_loss + discriminator_loss).backward() + if self._grad_norm_clip > 0: + nn.utils.clip_grad_norm_(itertools.chain(self.policy.parameters(), + self.value.parameters(), + self.discriminator.parameters()), + max_norm=self._grad_norm_clip) + self.optimizer.step() + + # update cumulative losses + cumulative_policy_loss += policy_loss.item() + cumulative_value_loss += value_loss.item() + if self._entropy_loss_scale: + cumulative_entropy_loss += entropy_loss.item() + cumulative_discriminator_loss += discriminator_loss.item() + + # update learning rate + if self._learning_rate_scheduler: + self.scheduler.step() + + # update AMP replay buffer + self.reply_buffer.add_samples(states=amp_states) + + # record data + self.track_data("Loss / Policy loss", cumulative_policy_loss / (self._learning_epochs * self._mini_batches)) + self.track_data("Loss / Value loss", cumulative_value_loss / (self._learning_epochs * self._mini_batches)) + if self._entropy_loss_scale: + self.track_data("Loss / Entropy loss", cumulative_entropy_loss / (self._learning_epochs * self._mini_batches)) + self.track_data("Loss / Discriminator loss", cumulative_discriminator_loss / (self._learning_epochs * self._mini_batches)) + + self.track_data("Policy / Standard deviation", self.policy.distribution().stddev.mean().item()) + + if self._learning_rate_scheduler: + self.track_data("Learning / Learning rate", self.scheduler.get_last_lr()[0]) From 45df8389944dc4ebd6497266df97c9e2f869182c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Sun, 24 Jul 2022 23:54:04 +0200 Subject: [PATCH 003/108] Add AMP agent to docs --- docs/source/index.rst | 2 + docs/source/modules/skrl.agents.amp.rst | 155 ++++++++++++++++++++++++ 2 files changed, 157 insertions(+) create mode 100644 docs/source/modules/skrl.agents.amp.rst diff --git a/docs/source/index.rst b/docs/source/index.rst index 9245c30e..69dc9550 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -56,6 +56,7 @@ Agents Definition of reinforcement learning algorithms that compute an optimal policy. All agents inherit from one and only one :doc:`base class ` (that defines a uniform interface and provides for common functionalities) but which is not tied to the implementation details of the algorithms * :doc:`Advantage Actor Critic ` (**A2C**) + * :doc:`Adversarial Motion Priors ` (**AMP**) * :doc:`Cross-Entropy Method ` (**CEM**) * :doc:`Deep Deterministic Policy Gradient ` (**DDPG**) * :doc:`Double Deep Q-Network ` (**DDQN**) @@ -74,6 +75,7 @@ Agents modules/skrl.agents.base_class A2C + AMP CEM DDPG DDQN diff --git a/docs/source/modules/skrl.agents.amp.rst b/docs/source/modules/skrl.agents.amp.rst new file mode 100644 index 00000000..e8e7c228 --- /dev/null +++ b/docs/source/modules/skrl.agents.amp.rst @@ -0,0 +1,155 @@ +Adversarial Motion Priors (AMP) +=============================== + +AMP is a **model-free**, **stochastic** **on-policy** **policy gradient** algorithm (trained using a combination of GAIL and PPO) for adversarial learning of physics-based character animation. 
It enables characters to imitate diverse behaviors from large unstructured datasets, without the need for motion planners or other mechanisms for clip selection + +Paper: `AMP: Adversarial Motion Priors for Stylized Physics-Based Character Control `_ + +Algorithm implementation +^^^^^^^^^^^^^^^^^^^^^^^^ + +| Main notation/symbols: +| - policy (:math:`\pi_\theta`), value (:math:`V_\phi`) and discriminator (:math:`D_\psi`) function approximators +| - states (:math:`s`), actions (:math:`a`), rewards (:math:`r`), next states (:math:`s'`), dones (:math:`d`) +| - values (:math:`V`), next values (:math:`V'`), advantages (:math:`A`), returns (:math:`R`) +| - log probabilities (:math:`logp`) +| - loss (:math:`L`) +| - reference motion dataset (:math:`M`), AMP replay buffer (:math:`B`) +| - AMP states (:math:`s_{_{AMP}}`), reference motion states (:math:`s_{_{AMP}}^{^M}`), AMP states from replay buffer (:math:`s_{_{AMP}}^{^B}`) + +**Learning algorithm** (:literal:`_update(...)`) + +| :literal:`compute_gae(...)` +| :blue:`def` :math:`\;f_{GAE} (r, d, V, V') \;\rightarrow\; R, A:` +| :math:`adv \leftarrow 0` +| :math:`A \leftarrow \text{zeros}(r)` +| :green:`# advantages computation` +| **FOR** each reverse iteration :math:`i` up to the number of rows in :math:`r` **DO** +| :math:`adv \leftarrow r_i - V_i \, +` :guilabel:`discount_factor` :math:`(V' \, +` :guilabel:`lambda` :math:`\neg d_i \; adv)` +| :math:`A_i \leftarrow adv` +| :green:`# returns computation` +| :math:`R \leftarrow A + V` +| :green:`# normalize advantages` +| :math:`A \leftarrow \dfrac{A - \bar{A}}{A_\sigma + 10^{-8}}` + +| :green:`# update dataset of reference motions` +| :math:`\text{collect reference motions of}` :guilabel:`amp_batch_size` :math:`\rightarrow\;` :math:`\text{append}(M)` +| :green:`# compute combined rewards` +| :math:`r_D \leftarrow -log(\text{max}( 1 - \hat{y}(D_\psi(s_{_{AMP}})), \, 10^{-4})) \qquad` with :math:`\; \hat{y}(x) = \dfrac{1}{1 + e^{-x}}` +| :math:`r' \leftarrow` :guilabel:`task_reward_weight` :math:`r \, +` :guilabel:`style_reward_weight` :guilabel:`discriminator_reward_scale` :math:`r_D` +| :green:`# compute returns and advantages` +| :math:`R, A \leftarrow f_{GAE}(r', d, V, V')` +| :green:`# sample mini-batches from memory` +| [[:math:`s, a, logp, V, R, A, s_{_{AMP}}`]] :math:`\leftarrow` states, actions, log_prob, values, returns, advantages, AMP states +| [[:math:`s_{_{AMP}}^{^M}`]] :math:`\leftarrow` AMP states from :math:`M` +| **IF** :math:`B` is not empty **THEN** +| [[:math:`s_{_{AMP}}^{^B}`]] :math:`\leftarrow` AMP states from :math:`B` +| **ELSE** +| [[:math:`s_{_{AMP}}^{^B}`]] :math:`\leftarrow` [[:math:`s_{_{AMP}}`]] +| :green:`# learning epochs` +| **FOR** each learning epoch up to :guilabel:`learning_epochs` **DO** +| :green:`# mini-batches loop` +| **FOR** each mini-batch [:math:`s, a, logp, V, R, A, s_{_{AMP}}, s_{_{AMP}}^{^B}, s_{_{AMP}}^{^M}`] up to :guilabel:`mini_batches` **DO** +| :math:`logp' \leftarrow \pi_\theta(s, a)` +| :green:`# compute entropy loss` +| **IF** entropy computation is enabled **THEN** +| :math:`{L}_{entropy} \leftarrow \, -` :guilabel:`entropy_loss_scale` :math:`\frac{1}{N} \sum_{i=1}^N \pi_{\theta_{entropy}}` +| **ELSE** +| :math:`{L}_{entropy} \leftarrow 0` +| :green:`# compute policy loss` +| :math:`ratio \leftarrow e^{logp' - logp}` +| :math:`L_{_{surrogate}} \leftarrow A \; ratio` +| :math:`L_{_{clipped\,surrogate}} \leftarrow A \; \text{clip}(ratio, 1 - c, 1 + c) \qquad` with :math:`c` as :guilabel:`ratio_clip` +| :math:`L^{clip}_{\pi_\theta} 
\leftarrow - \frac{1}{N} \sum_{i=1}^N \min(L_{_{surrogate}}, L_{_{clipped\,surrogate}})` | :green:`# compute value loss` | :math:`V_{_{predicted}} \leftarrow V_\phi(s)` | **IF** :guilabel:`clip_predicted_values` is enabled **THEN** | :math:`V_{_{predicted}} \leftarrow V + \text{clip}(V_{_{predicted}} - V, -c, c) \qquad` with :math:`c` as :guilabel:`value_clip` | :math:`L_{V_\phi} \leftarrow` :guilabel:`value_loss_scale` :math:`\frac{1}{N} \sum_{i=1}^N (R - V_{_{predicted}})^2` | :green:`# compute discriminator loss` | :math:`{logit}_{_{AMP}} \leftarrow D_\psi(s_{_{AMP}})` | :math:`{logit}_{_{AMP}}^{^B} \leftarrow D_\psi(s_{_{AMP}}^{^B})` | :math:`{logit}_{_{AMP}}^{^M} \leftarrow D_\psi(s_{_{AMP}}^{^M})` | :green:`# discriminator prediction loss` | :math:`L_{D_\psi} \leftarrow \dfrac{1}{2}(BCE({logit}_{_{AMP}}` ++ :math:`{logit}_{_{AMP}}^{^B}, \, 0) + BCE({logit}_{_{AMP}}^{^M}, \, 1))` | with :math:`\; BCE(x,y)=-\frac{1}{N} \sum_{i=1}^N [y \; log(\hat{y}) + (1-y) \, log(1-\hat{y})] \;` and :math:`\; \hat{y} = \dfrac{1}{1 + e^{-x}}` | :green:`# discriminator logit regularization` | :math:`L_{D_\psi} \leftarrow L_{D_\psi} +` :guilabel:`discriminator_logit_regularization_scale` :math:`\sum_{i=1}^N \text{flatten}(\psi_w[-1])^2` | :green:`# discriminator gradient penalty` | :math:`L_{D_\psi} \leftarrow L_{D_\psi} +` :guilabel:`discriminator_gradient_penalty_scale` :math:`\frac{1}{N} \sum_{i=1}^N \sum (\nabla_\psi {logit}_{_{AMP}}^{^M})^2` | :green:`# discriminator weight decay` | :math:`L_{D_\psi} \leftarrow L_{D_\psi} +` :guilabel:`discriminator_weight_decay_scale` :math:`\sum_{i=1}^N \text{flatten}(\psi_w)^2` | :green:`# optimization step` | reset :math:`\text{optimizer}_{\theta, \phi, \psi}` | :math:`\nabla_{\theta, \, \phi, \, \psi} (L^{clip}_{\pi_\theta} + {L}_{entropy} + L_{V_\phi} + L_{D_\psi})` | :math:`\text{clip}(\lVert \nabla_{\theta, \, \phi, \, \psi} \rVert)` with :guilabel:`grad_norm_clip` | step :math:`\text{optimizer}_{\theta, \phi, \psi}` | :green:`# update learning rate` | **IF** there is a :guilabel:`learning_rate_scheduler` **THEN** | step :math:`\text{scheduler}_{\theta, \phi, \psi} (\text{optimizer}_{\theta, \phi, \psi})` | :green:`# update AMP replay buffer` | :math:`s_{_{AMP}} \rightarrow\;` :math:`\text{append}(B)` + +Configuration and hyperparameters +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. py:data:: skrl.agents.torch.amp.amp.AMP_DEFAULT_CONFIG + +.. literalinclude:: ../../../skrl/agents/torch/amp/amp.py + :language: python + :lines: 18-67 + :linenos: + +Spaces and models +^^^^^^^^^^^^^^^^^ + +The implementation supports the following `Gym spaces `_ + +.. list-table:: + :header-rows: 1 + + * - Gym spaces + - .. centered:: Observation + - .. centered:: Action + * - Discrete + - .. centered:: :math:`\square` + - .. centered:: :math:`\square` + * - Box + - .. centered:: :math:`\blacksquare` + - .. centered:: :math:`\blacksquare` + * - Dict + - .. centered:: :math:`\square` + - .. centered:: :math:`\square` + +The implementation uses 1 stochastic (continuous) and 2 deterministic function approximators. These function approximators (models) must be collected in a dictionary and passed to the constructor of the class under the argument :literal:`models` + +.. 
list-table:: + :header-rows: 1 + + * - Notation + - Concept + - Key + - Type + * - :math:`\pi_\theta(s)` + - Policy + - :literal:`"policy"` + - :ref:`Gaussian ` + * - :math:`V_\phi(s)` + - Value + - :literal:`"value"` + - :ref:`Deterministic ` + * - :math:`D_\psi(s)` + - Discriminator + - :literal:`"discriminator"` + - :ref:`Deterministic ` + +API +^^^ + +.. autoclass:: skrl.agents.torch.amp.amp.AMP + :undoc-members: + :show-inheritance: + :private-members: _update + :members: + + .. automethod:: __init__ From 9cb5d0c68161f4ea288fe61565e8ea69183a9437 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Sun, 24 Jul 2022 23:59:46 +0200 Subject: [PATCH 004/108] Update CHANGELOG with unreleased modifications --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index fce6847f..196a69bc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,10 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). +## [0.8.0] - Unreleased +### Added +- AMP agent for physics-based character animation + ## [0.7.0] - 2022-07-11 ### Added - A2C agent From 7abf7836adcc4d5e2ebba240c7212cf159822ca2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Mon, 25 Jul 2022 12:41:25 +0200 Subject: [PATCH 005/108] List input and output shapes for each agent model in docs --- docs/source/modules/skrl.agents.a2c.rst | 6 ++++++ docs/source/modules/skrl.agents.amp.rst | 14 +++++++++++++- docs/source/modules/skrl.agents.cem.rst | 4 ++++ docs/source/modules/skrl.agents.ddpg.rst | 10 ++++++++++ docs/source/modules/skrl.agents.ddqn.rst | 6 ++++++ docs/source/modules/skrl.agents.dqn.rst | 6 ++++++ docs/source/modules/skrl.agents.ppo.rst | 6 ++++++ docs/source/modules/skrl.agents.q_learning.rst | 4 ++++ docs/source/modules/skrl.agents.sac.rst | 12 ++++++++++++ docs/source/modules/skrl.agents.sarsa.rst | 4 ++++ docs/source/modules/skrl.agents.td3.rst | 14 ++++++++++++++ docs/source/modules/skrl.agents.trpo.rst | 6 ++++++ 12 files changed, 91 insertions(+), 1 deletion(-) diff --git a/docs/source/modules/skrl.agents.a2c.rst b/docs/source/modules/skrl.agents.a2c.rst index 4898bd61..da63d444 100644 --- a/docs/source/modules/skrl.agents.a2c.rst +++ b/docs/source/modules/skrl.agents.a2c.rst @@ -110,14 +110,20 @@ The implementation uses 1 stochastic (discrete or continuous) and 1 deterministi - Concept - Key - Type + - Input shape + - Output shape * - :math:`\pi_\theta(s)` - Policy - :literal:`"policy"` - :ref:`Categorical ` / :ref:`Gaussian ` + - observation + - action * - :math:`V_\phi(s)` - Value - :literal:`"value"` - :ref:`Deterministic ` + - observation + - 1 API ^^^ diff --git a/docs/source/modules/skrl.agents.amp.rst b/docs/source/modules/skrl.agents.amp.rst index e8e7c228..2256f4d9 100644 --- a/docs/source/modules/skrl.agents.amp.rst +++ b/docs/source/modules/skrl.agents.amp.rst @@ -109,17 +109,21 @@ The implementation supports the following `Gym spaces ` + - observation + - action * - :math:`V_\phi(s)` - Value - :literal:`"value"` - :ref:`Deterministic ` - * - :math:`D_\psi(s)` + - observation + - 1 + * - :math:`D_\psi(s_{_{AMP}})` - Discriminator - :literal:`"discriminator"` - :ref:`Deterministic ` + - AMP observation + - 1 API ^^^ diff --git a/docs/source/modules/skrl.agents.cem.rst b/docs/source/modules/skrl.agents.cem.rst index cfe6fa29..0001678b 100644 --- a/docs/source/modules/skrl.agents.cem.rst +++ b/docs/source/modules/skrl.agents.cem.rst @@ -65,10 +65,14 @@ The implementation uses 1 discrete 
function approximator. This function approxim - Concept - Key - Type + - Input shape + - Output shape * - :math:`\pi(s)` - Policy - :literal:`"policy"` - :ref:`Categorical ` + - observation + - action API ^^^ diff --git a/docs/source/modules/skrl.agents.ddpg.rst b/docs/source/modules/skrl.agents.ddpg.rst index 96de2054..51dfc657 100644 --- a/docs/source/modules/skrl.agents.ddpg.rst +++ b/docs/source/modules/skrl.agents.ddpg.rst @@ -93,22 +93,32 @@ The implementation uses 4 deterministic function approximators. These function a - Concept - Key - Type + - Input shape + - Output shape * - :math:`\mu_\theta(s)` - Policy (actor) - :literal:`"policy"` - :ref:`Deterministic ` + - observation + - action * - :math:`\mu_{\theta_{target}}(s)` - Target policy - :literal:`"target_policy"` - :ref:`Deterministic ` + - observation + - action * - :math:`Q_\phi(s, a)` - Q-network (critic) - :literal:`"critic"` - :ref:`Deterministic ` + - observation + action + - 1 * - :math:`Q_{\phi_{target}}(s, a)` - Target Q-network - :literal:`"target_critic"` - :ref:`Deterministic ` + - observation + action + - 1 API ^^^ diff --git a/docs/source/modules/skrl.agents.ddqn.rst b/docs/source/modules/skrl.agents.ddqn.rst index 76b53ca1..afeddc73 100644 --- a/docs/source/modules/skrl.agents.ddqn.rst +++ b/docs/source/modules/skrl.agents.ddqn.rst @@ -72,14 +72,20 @@ The implementation uses 2 deterministic function approximators. These function a - Concept - Key - Type + - Input shape + - Output shape * - :math:`Q_\phi(s, a)` - Q-network - :literal:`"q_network"` - :ref:`Deterministic ` + - observation + - action * - :math:`Q_{\phi_{target}}(s, a)` - Target Q-network - :literal:`"target_q_network"` - :ref:`Deterministic ` + - observation + - action API ^^^ diff --git a/docs/source/modules/skrl.agents.dqn.rst b/docs/source/modules/skrl.agents.dqn.rst index f1d01871..77eb0e89 100644 --- a/docs/source/modules/skrl.agents.dqn.rst +++ b/docs/source/modules/skrl.agents.dqn.rst @@ -72,14 +72,20 @@ The implementation uses 2 deterministic function approximators. These function a - Concept - Key - Type + - Input shape + - Output shape * - :math:`Q_\phi(s, a)` - Q-network - :literal:`"q_network"` - :ref:`Deterministic ` + - observation + - action * - :math:`Q_{\phi_{target}}(s, a)` - Target Q-network - :literal:`"target_q_network"` - :ref:`Deterministic ` + - observation + - action API ^^^ diff --git a/docs/source/modules/skrl.agents.ppo.rst b/docs/source/modules/skrl.agents.ppo.rst index f53bb403..5260bddc 100644 --- a/docs/source/modules/skrl.agents.ppo.rst +++ b/docs/source/modules/skrl.agents.ppo.rst @@ -125,14 +125,20 @@ The implementation uses 1 stochastic (discrete or continuous) and 1 deterministi - Concept - Key - Type + - Input shape + - Output shape * - :math:`\pi_\theta(s)` - Policy - :literal:`"policy"` - :ref:`Categorical ` / :ref:`Gaussian ` + - observation + - action * - :math:`V_\phi(s)` - Value - :literal:`"value"` - :ref:`Deterministic ` + - observation + - 1 API ^^^ diff --git a/docs/source/modules/skrl.agents.q_learning.rst b/docs/source/modules/skrl.agents.q_learning.rst index f4cc8a00..b5dee107 100644 --- a/docs/source/modules/skrl.agents.q_learning.rst +++ b/docs/source/modules/skrl.agents.q_learning.rst @@ -63,10 +63,14 @@ The implementation uses 1 table. 
This table (model) must be collected in a dicti - Concept - Key - Type + - Input shape + - Output shape * - :math:`\pi_{Q[s,a]}(s)` - Policy (:math:`\epsilon`-greedy) - :literal:`"policy"` - :ref:`Tabular ` + - observation + - action API ^^^ diff --git a/docs/source/modules/skrl.agents.sac.rst b/docs/source/modules/skrl.agents.sac.rst index 5fe11b9e..599c9ff0 100644 --- a/docs/source/modules/skrl.agents.sac.rst +++ b/docs/source/modules/skrl.agents.sac.rst @@ -100,26 +100,38 @@ The implementation uses 1 stochastic and 4 deterministic function approximators. - Concept - Key - Type + - Input shape + - Output shape * - :math:`\pi_\theta(s)` - Policy (actor) - :literal:`"policy"` - :ref:`Gaussian ` + - observation + - action * - :math:`Q_{\phi 1}(s, a)` - Q1-network (critic 1) - :literal:`"critic_1"` - :ref:`Deterministic ` + - observation + action + - 1 * - :math:`Q_{\phi 2}(s, a)` - Q2-network (critic 2) - :literal:`"critic_2"` - :ref:`Deterministic ` + - observation + action + - 1 * - :math:`Q_{{\phi 1}_{target}}(s, a)` - Target Q1-network - :literal:`"target_critic_1"` - :ref:`Deterministic ` + - observation + action + - 1 * - :math:`Q_{{\phi 2}_{target}}(s, a)` - Target Q2-network - :literal:`"target_critic_2"` - :ref:`Deterministic ` + - observation + action + - 1 API ^^^ diff --git a/docs/source/modules/skrl.agents.sarsa.rst b/docs/source/modules/skrl.agents.sarsa.rst index 7688e26a..1c5fba3f 100644 --- a/docs/source/modules/skrl.agents.sarsa.rst +++ b/docs/source/modules/skrl.agents.sarsa.rst @@ -62,10 +62,14 @@ The implementation uses 1 table. This table (model) must be collected in a dicti - Concept - Key - Type + - Input shape + - Output shape * - :math:`\pi_{Q[s,a]}(s)` - Policy (:math:`\epsilon`-greedy) - :literal:`"policy"` - :ref:`Tabular ` + - observation + - action API ^^^ diff --git a/docs/source/modules/skrl.agents.td3.rst b/docs/source/modules/skrl.agents.td3.rst index f7ab9423..2b8108fb 100644 --- a/docs/source/modules/skrl.agents.td3.rst +++ b/docs/source/modules/skrl.agents.td3.rst @@ -103,30 +103,44 @@ The implementation uses 6 deterministic function approximators. These function a - Concept - Key - Type + - Input shape + - Output shape * - :math:`\mu_\theta(s)` - Policy (actor) - :literal:`"policy"` - :ref:`Deterministic ` + - observation + - action * - :math:`\mu_{\theta_{target}}(s)` - Target policy - :literal:`"target_policy"` - :ref:`Deterministic ` + - observation + - action * - :math:`Q_{\phi 1}(s, a)` - Q1-network (critic 1) - :literal:`"critic_1"` - :ref:`Deterministic ` + - observation + action + - 1 * - :math:`Q_{\phi 2}(s, a)` - Q2-network (critic 2) - :literal:`"critic_2"` - :ref:`Deterministic ` + - observation + action + - 1 * - :math:`Q_{{\phi 1}_{target}}(s, a)` - Target Q1-network - :literal:`"target_critic_1"` - :ref:`Deterministic ` + - observation + action + - 1 * - :math:`Q_{{\phi 2}_{target}}(s, a)` - Target Q2-network - :literal:`"target_critic_2"` - :ref:`Deterministic ` + - observation + action + - 1 API ^^^ diff --git a/docs/source/modules/skrl.agents.trpo.rst b/docs/source/modules/skrl.agents.trpo.rst index 8a60ae8a..d4e87b78 100644 --- a/docs/source/modules/skrl.agents.trpo.rst +++ b/docs/source/modules/skrl.agents.trpo.rst @@ -163,14 +163,20 @@ The implementation uses 1 stochastic and 1 deterministic function approximator. 
- Concept - Key - Type + - Input shape + - Output shape * - :math:`\pi_\theta(s)` - Policy - :literal:`"policy"` - :ref:`Gaussian ` + - observation + - action * - :math:`V_\phi(s)` - Value - :literal:`"value"` - :ref:`Deterministic ` + - observation + - 1 API ^^^ From adc49551efc68878af7b76411ca51632cf84a512 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Mon, 25 Jul 2022 13:43:11 +0200 Subject: [PATCH 006/108] Add AMP example to docs --- docs/source/examples/isaacgym/amp_humanoid.py | 143 ++++++++++++++++++ docs/source/intro/examples.rst | 10 ++ 2 files changed, 153 insertions(+) create mode 100644 docs/source/examples/isaacgym/amp_humanoid.py diff --git a/docs/source/examples/isaacgym/amp_humanoid.py b/docs/source/examples/isaacgym/amp_humanoid.py new file mode 100644 index 00000000..6fe006d9 --- /dev/null +++ b/docs/source/examples/isaacgym/amp_humanoid.py @@ -0,0 +1,143 @@ +import isaacgym + +import torch +import torch.nn as nn + +# Import the skrl components to build the RL system +from skrl.models.torch import GaussianModel, DeterministicModel +from skrl.memories.torch import RandomMemory +from skrl.agents.torch.amp import AMP, AMP_DEFAULT_CONFIG +from skrl.resources.preprocessors.torch import RunningStandardScaler +from skrl.trainers.torch import SequentialTrainer +from skrl.envs.torch import wrap_env +from skrl.envs.torch import load_isaacgym_env_preview4 +from skrl.utils import set_seed + + +# set the seed for reproducibility +set_seed(42) + + +# Define the models (stochastic and deterministic models) for the agent using helper classes. +# - Policy: takes as input the environment's observation/state and returns an action +# - Value: takes the state as input and provides a value to guide the policy +# - Discriminator: differentiates between policy-generated behaviors and behaviors from the motion dataset +class Policy(GaussianModel): + def __init__(self, observation_space, action_space, device, clip_actions=False, + clip_log_std=True, min_log_std=-20, max_log_std=2): + super().__init__(observation_space, action_space, device, clip_actions, + clip_log_std, min_log_std, max_log_std) + + self.net = nn.Sequential(nn.Linear(self.num_observations, 1024), + nn.ReLU(), + nn.Linear(1024, 512), + nn.ReLU(), + nn.Linear(512, self.num_actions)) + + # set a fixed log standard deviation for the policy + self.log_std_parameter = nn.Parameter(torch.full((self.num_actions,), fill_value=-2.9), requires_grad=False) + + def compute(self, states, taken_actions): + return torch.tanh(self.net(states)), self.log_std_parameter + +class Value(DeterministicModel): + def __init__(self, observation_space, action_space, device, clip_actions=False): + super().__init__(observation_space, action_space, device, clip_actions) + + self.net = nn.Sequential(nn.Linear(self.num_observations, 1024), + nn.ReLU(), + nn.Linear(1024, 512), + nn.ReLU(), + nn.Linear(512, 1)) + + def compute(self, states, taken_actions): + return self.net(states) + +class Discriminator(DeterministicModel): + def __init__(self, observation_space, action_space, device, clip_actions=False): + super().__init__(observation_space, action_space, device, clip_actions) + + self.net = nn.Sequential(nn.Linear(self.num_observations, 1024), + nn.ReLU(), + nn.Linear(1024, 512), + nn.ReLU(), + nn.Linear(512, 1)) + + def compute(self, states, taken_actions): + return self.net(states) + + +# Load and wrap the Isaac Gym environment +env = load_isaacgym_env_preview4(task_name="HumanoidAMP") # preview 3 and 4 use the same loader +env 
= wrap_env(env) + +device = env.device + + +# Instantiate a RandomMemory as rollout buffer (any memory can be used for this) +memory = RandomMemory(memory_size=16, num_envs=env.num_envs, device=device) + + +# Instantiate the agent's models (function approximators). +# AMP requires 3 models, visit its documentation for more details +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.amp.html#spaces-and-models +models_amp = {"policy": Policy(env.observation_space, env.action_space, device), + "value": Value(env.observation_space, env.action_space, device), + "discriminator": Discriminator(env.amp_observation_space, env.action_space, device)} + + +# Configure and instantiate the agent. +# Only modify some of the default configuration, visit its documentation to see all the options +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.amp.html#configuration-and-hyperparameters +cfg_amp = AMP_DEFAULT_CONFIG.copy() +cfg_amp["rollouts"] = 16 +cfg_amp["learning_epochs"] = 6 +cfg_amp["mini_batches"] = 2 # 16 * 4096 / 32768 +cfg_amp["discount_factor"] = 0.99 +cfg_amp["lambda"] = 0.95 +cfg_amp["learning_rate"] = 5e-5 +cfg_amp["random_timesteps"] = 0 +cfg_amp["learning_starts"] = 0 +cfg_amp["grad_norm_clip"] = 0.0 +cfg_amp["ratio_clip"] = 0.2 +cfg_amp["value_clip"] = 0.2 +cfg_amp["clip_predicted_values"] = False +cfg_amp["entropy_loss_scale"] = 0.0 +cfg_amp["value_loss_scale"] = 2.5 +cfg_amp["discriminator_loss_scale"] = 5.0 +cfg_amp["amp_batch_size"] = 512 +cfg_amp["task_reward_weight"] = 0.0 +cfg_amp["style_reward_weight"] = 1.0 +cfg_amp["discriminator_reward_scale"] = 2 +cfg_amp["discriminator_logit_regularization_scale"] = 0.05 +cfg_amp["discriminator_gradient_penalty_scale"] = 5 +cfg_amp["discriminator_weight_decay_scale"] = 0.0001 +cfg_amp["state_preprocessor"] = RunningStandardScaler +cfg_amp["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device} +cfg_amp["value_preprocessor"] = RunningStandardScaler +cfg_amp["value_preprocessor_kwargs"] = {"size": 1, "device": device} +cfg_amp["amp_state_preprocessor"] = RunningStandardScaler +cfg_amp["amp_state_preprocessor_kwargs"] = {"size": env.amp_observation_space, "device": device} +# logging to TensorBoard and write checkpoints each 16 and 4000 timesteps respectively +cfg_amp["experiment"]["write_interval"] = 160 +cfg_amp["experiment"]["checkpoint_interval"] = 4000 + +agent = AMP(models=models_amp, + memory=memory, + cfg=cfg_amp, + observation_space=env.observation_space, + action_space=env.action_space, + device=device, + amp_observation_space=env.amp_observation_space, + motion_dataset=RandomMemory(memory_size=200000, device=device), + reply_buffer=RandomMemory(memory_size=1000000, device=device), + collect_reference_motions=lambda num_samples: env.fetch_amp_obs_demo(num_samples), + collect_observation=lambda: env.reset_done()[0]["obs"]) + + +# Configure and instantiate the RL trainer +cfg_trainer = {"timesteps": 80000, "headless": True} +trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent) + +# start training +trainer.train() diff --git a/docs/source/intro/examples.rst b/docs/source/intro/examples.rst index bf3cd29b..cf8f8f2e 100644 --- a/docs/source/intro/examples.rst +++ b/docs/source/intro/examples.rst @@ -273,6 +273,7 @@ The following components or practices are exemplified (highlighted): - Set a random seed for reproducibility: **Cartpole** - Set a learning rate scheduler: **FrankaCabinet**, **Humanoid** - Define a reward shaping function: **Quadcopter**, **ShadowHand**, **Trifinger** + - 
Access to environment-specific properties and methods: **Humanoid (AMP)** - Load a checkpoint during evaluation: **Cartpole** The PPO agent configuration is mapped, as far as possible, from the rl_games' A2C-PPO `configuration for Isaac Gym preview environments `_. The following list shows the mapping between the two configurations @@ -389,6 +390,15 @@ The PPO agent configuration is mapped, as far as possible, from the rl_games' A2 :linenos: :emphasize-lines: 10, 97-98 + .. tab:: Humanoid (AMP) + + View the raw code: `ppo_humanoid.py `_ + + .. literalinclude:: ../examples/isaacgym/amp_humanoid.py + :language: python + :linenos: + :emphasize-lines: 86, 120, 131, 134-135 + .. tab:: Ingenuity View the raw code: `ppo_ingenuity.py `_ From 851f90770f3f4bd0c8e8212cc0009f1bc5932f3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Mon, 25 Jul 2022 22:15:18 +0200 Subject: [PATCH 007/108] Use a normal distribution in the gaussian model --- skrl/models/torch/gaussian.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/skrl/models/torch/gaussian.py b/skrl/models/torch/gaussian.py index 6b1e00c1..a5913d32 100644 --- a/skrl/models/torch/gaussian.py +++ b/skrl/models/torch/gaussian.py @@ -3,7 +3,7 @@ import gym import torch -from torch.distributions import MultivariateNormal +from torch.distributions import Normal from . import Model @@ -17,7 +17,7 @@ def __init__(self, clip_log_std: bool = True, min_log_std: float = -20, max_log_std: float = 2) -> None: - """Diagonal Gaussian model (stochastic model) + """Gaussian model (stochastic model) :param observation_space: Observation/state space or shape (default: None). If it is not None, the num_observations property will contain the size of that space @@ -90,10 +90,7 @@ def act(self, self._num_samples = actions_mean.shape[0] # distribution - covariance = torch.diag(log_std.exp() * log_std.exp()) - self._distribution = MultivariateNormal(actions_mean, scale_tril=covariance) - # self._distribution.loc = actions_mean - # self._distribution._unbroadcasted_scale_tril = covariance + self._distribution = Normal(actions_mean, log_std.exp()) # sample using the reparameterization trick actions = self._distribution.rsample() @@ -107,8 +104,6 @@ def act(self, # log of the probability density function log_prob = self._distribution.log_prob(actions if taken_actions is None else taken_actions) - if log_prob.dim() != actions.dim(): - log_prob = log_prob.unsqueeze(-1) if inference: return actions.detach(), log_prob.detach(), actions_mean.detach() @@ -132,10 +127,10 @@ def get_log_std(self) -> torch.Tensor: """ return self._log_std.repeat(self._num_samples, 1) - def distribution(self) -> torch.distributions.MultivariateNormal: + def distribution(self) -> torch.distributions.Normal: """Get the current distribution of the model :return: Distribution of the model - :rtype: torch.distributions.MultivariateNormal + :rtype: torch.distributions.Normal """ return self._distribution From 62ba65633a87eefee7e863c57467129a6198ac59 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Mon, 25 Jul 2022 23:34:23 +0200 Subject: [PATCH 008/108] Define the multivariate gaussian model in a separate file --- skrl/models/torch/__init__.py | 2 +- skrl/models/torch/multivariate_gaussian.py | 139 +++++++++++++++++++++ 2 files changed, 140 insertions(+), 1 deletion(-) create mode 100644 skrl/models/torch/multivariate_gaussian.py diff --git a/skrl/models/torch/__init__.py b/skrl/models/torch/__init__.py index 
c3b94db1..de02edaa 100644 --- a/skrl/models/torch/__init__.py +++ b/skrl/models/torch/__init__.py @@ -1,7 +1,7 @@ from .base import Model from .tabular import TabularModel - from .gaussian import GaussianModel from .categorical import CategoricalModel from .deterministic import DeterministicModel +from .multivariate_gaussian import MultivariateGaussianModel diff --git a/skrl/models/torch/multivariate_gaussian.py b/skrl/models/torch/multivariate_gaussian.py new file mode 100644 index 00000000..1f61f275 --- /dev/null +++ b/skrl/models/torch/multivariate_gaussian.py @@ -0,0 +1,139 @@ +from typing import Union, Tuple + +import gym + +import torch +from torch.distributions import MultivariateNormal + +from . import Model + + +class MultivariateGaussianModel(Model): + def __init__(self, + observation_space: Union[int, Tuple[int], gym.Space, None] = None, + action_space: Union[int, Tuple[int], gym.Space, None] = None, + device: Union[str, torch.device] = "cuda:0", + clip_actions: bool = False, + clip_log_std: bool = True, + min_log_std: float = -20, + max_log_std: float = 2) -> None: + """Multivariate Gaussian model (stochastic model) + + :param observation_space: Observation/state space or shape (default: None). + If it is not None, the num_observations property will contain the size of that space + :type observation_space: int, tuple or list of integers, gym.Space or None, optional + :param action_space: Action space or shape (default: None). + If it is not None, the num_actions property will contain the size of that space + :type action_space: int, tuple or list of integers, gym.Space or None, optional + :param device: Device on which a torch tensor is or will be allocated (default: "cuda:0") + :type device: str or torch.device, optional + :param clip_actions: Flag to indicate whether the actions should be clipped to the action space (default: False) + :type clip_actions: bool, optional + :param clip_log_std: Flag to indicate whether the log standard deviations should be clipped (default: True) + :type clip_log_std: bool, optional + :param min_log_std: Minimum value of the log standard deviation if clip_log_std is True (default: -20) + :type min_log_std: float, optional + :param max_log_std: Maximum value of the log standard deviation if clip_log_std is True (default: 2) + :type max_log_std: float, optional + """ + super(MultivariateGaussianModel, self).__init__(observation_space, action_space, device) + + self.clip_actions = clip_actions and issubclass(type(self.action_space), gym.Space) + + if self.clip_actions: + self.clip_actions_min = torch.tensor(self.action_space.low, device=self.device) + self.clip_actions_max = torch.tensor(self.action_space.high, device=self.device) + + # backward compatibility: torch < 1.9 clamp method does not support tensors + self._backward_compatibility = tuple(map(int, (torch.__version__.split(".")[:2]))) < (1, 9) + + self.clip_log_std = clip_log_std + self.log_std_min = min_log_std + self.log_std_max = max_log_std + + self._log_std = None + self._num_samples = None + self._distribution = None + + def act(self, + states: torch.Tensor, + taken_actions: Union[torch.Tensor, None] = None, + inference=False) -> Tuple[torch.Tensor]: + """Act stochastically in response to the state of the environment + + :param states: Observation/state of the environment used to make the decision + :type states: torch.Tensor + :param taken_actions: Actions taken by a policy to the given states (default: None). + The use of these actions only makes sense in critical models, e.g. 
+ :type taken_actions: torch.Tensor or None, optional + :param inference: Flag to indicate whether the model is making inference (default: False). + If True, the returned tensors will be detached from the current graph + :type inference: bool, optional + + :return: Action to be taken by the agent given the state of the environment. + The tuple's components are the actions, the log of the probability density function and mean actions + :rtype: tuple of torch.Tensor + """ + # map from states/observations to mean actions and log standard deviations + if self._instantiator_net is None: + actions_mean, log_std = self.compute(states.to(self.device), + taken_actions.to(self.device) if taken_actions is not None else taken_actions) + else: + actions_mean, log_std = self._get_instantiator_output(states.to(self.device), \ + taken_actions.to(self.device) if taken_actions is not None else taken_actions) + + # clamp log standard deviations + if self.clip_log_std: + log_std = torch.clamp(log_std, self.log_std_min, self.log_std_max) + + self._log_std = log_std + self._num_samples = actions_mean.shape[0] + + # distribution + covariance = torch.diag(log_std.exp() * log_std.exp()) + self._distribution = MultivariateNormal(actions_mean, scale_tril=covariance) + + # sample using the reparameterization trick + actions = self._distribution.rsample() + + # clip actions + if self.clip_actions: + if self._backward_compatibility: + actions = torch.max(torch.min(actions, self.clip_actions_max), self.clip_actions_min) + else: + actions = torch.clamp(actions, min=self.clip_actions_min, max=self.clip_actions_max) + + # log of the probability density function + log_prob = self._distribution.log_prob(actions if taken_actions is None else taken_actions) + if log_prob.dim() != actions.dim(): + log_prob = log_prob.unsqueeze(-1) + + if inference: + return actions.detach(), log_prob.detach(), actions_mean.detach() + return actions, log_prob, actions_mean + + def get_entropy(self) -> torch.Tensor: + """Compute and return the entropy of the model + + :return: Entropy of the model + :rtype: torch.Tensor + """ + if self._distribution is None: + return torch.tensor(0.0, device=self.device) + return self._distribution.entropy().to(self.device) + + def get_log_std(self) -> torch.Tensor: + """Return the log standard deviation of the model + + :return: Log standard deviation of the model + :rtype: torch.Tensor + """ + return self._log_std.repeat(self._num_samples, 1) + + def distribution(self) -> torch.distributions.MultivariateNormal: + """Get the current distribution of the model + + :return: Distribution of the model + :rtype: torch.distributions.MultivariateNormal + """ + return self._distribution From afdc25fc1be970fd283fc8a4ca78312d24f3eb3a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Mon, 25 Jul 2022 23:40:17 +0200 Subject: [PATCH 009/108] Add reduction methods for returning the log probability density function --- skrl/models/torch/gaussian.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/skrl/models/torch/gaussian.py b/skrl/models/torch/gaussian.py index a5913d32..24b117f0 100644 --- a/skrl/models/torch/gaussian.py +++ b/skrl/models/torch/gaussian.py @@ -16,7 +16,8 @@ def __init__(self, clip_actions: bool = False, clip_log_std: bool = True, min_log_std: float = -20, - max_log_std: float = 2) -> None: + max_log_std: float = 2, + reduction: str = "sum") -> None: """Gaussian model (stochastic model) :param observation_space: Observation/state space or shape 
(default: None). @@ -35,6 +36,12 @@ def __init__(self, :type min_log_std: float, optional :param max_log_std: Maximum value of the log standard deviation if clip_log_std is True (default: 2) :type max_log_std: float, optional + :param reduction: Reduction method for returning the log probability density function: (default: "sum"). + Supported values are "mean", "sum", "prod" and "none". If "none", the log probability density + function is returned as a tensor of shape (num_samples, num_actions) instead of (num_samples, 1) + :type reduction: str, optional + + :raises ValueError: If the reduction method is not valid """ super(GaussianModel, self).__init__(observation_space, action_space, device) @@ -55,6 +62,11 @@ def __init__(self, self._num_samples = None self._distribution = None + if reduction not in ["mean", "sum", "prod", "none"]: + raise ValueError("reduction must be one of 'mean', 'sum', 'prod' or 'none'") + self._reduction = torch.mean if reduction == "mean" else torch.sum if reduction == "sum" \ + else torch.prod if reduction == "prod" else None + def act(self, states: torch.Tensor, taken_actions: Union[torch.Tensor, None] = None, @@ -104,6 +116,10 @@ def act(self, # log of the probability density function log_prob = self._distribution.log_prob(actions if taken_actions is None else taken_actions) + if self._reduction is not None: + log_prob = self._reduction(log_prob, dim=-1) + if log_prob.dim() != actions.dim(): + log_prob = log_prob.unsqueeze(-1) if inference: return actions.detach(), log_prob.detach(), actions_mean.detach() From 2dd0fb96d6a3d56c05aed2924a6cc08e96a3e041 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Tue, 26 Jul 2022 18:29:33 +0200 Subject: [PATCH 010/108] Show the changelog in docs --- docs/source/intro/installation.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/source/intro/installation.rst b/docs/source/intro/installation.rst index 5e0dacec..1663cccb 100644 --- a/docs/source/intro/installation.rst +++ b/docs/source/intro/installation.rst @@ -77,3 +77,9 @@ Known issues .. code-block:: text AttributeError: 'Adam' object has no attribute '_warned_capturable_if_run_uncaptured' + +Changelog +--------- + +.. 
literalinclude:: ../../../CHANGELOG.md + :language: markdown From ea42867f0c58d35751f334c4a1fdf244a1db47b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Tue, 26 Jul 2022 22:39:27 +0200 Subject: [PATCH 011/108] Add multivariate gaussian model schema --- docs/source/_static/imgs/model_gaussian.svg | 2 +- docs/source/_static/imgs/model_multivariate_gaussian.svg | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) mode change 100644 => 100755 docs/source/_static/imgs/model_gaussian.svg create mode 100644 docs/source/_static/imgs/model_multivariate_gaussian.svg diff --git a/docs/source/_static/imgs/model_gaussian.svg b/docs/source/_static/imgs/model_gaussian.svg old mode 100644 new mode 100755 index 19bd5771..92fa89ca --- a/docs/source/_static/imgs/model_gaussian.svg +++ b/docs/source/_static/imgs/model_gaussian.svg @@ -1 +1 @@ -inputhiddenoutput.compute(…)states (𝒔𝒕)with or withoutactions (𝒂𝒕)log standarddeviations(𝑙𝑜𝑔(𝜎))mean actions(𝑎𝑡+1)multivariategaussian distribution𝒩(𝜇,𝛴)actions (𝒂𝒕+𝟏)log prob evaluated at𝑎𝑡+1mean actions (𝒂𝒕+𝟏)paramclip_log_stdclip_actions \ No newline at end of file +inputhiddenoutput.compute(…)states (𝒔𝒕)with or withoutactions (𝒂𝒕)log standarddeviations(𝑙𝑜𝑔(𝜎))mean actions(𝑎𝑡+1)gaussiandistribution𝒩(𝜇,𝜎)actions (𝒂𝒕+𝟏)log prob evaluated at𝑎𝑡+1mean actions (𝒂𝒕+𝟏)paramclip_log_stdclip_actionsreduction \ No newline at end of file diff --git a/docs/source/_static/imgs/model_multivariate_gaussian.svg b/docs/source/_static/imgs/model_multivariate_gaussian.svg new file mode 100644 index 00000000..19bd5771 --- /dev/null +++ b/docs/source/_static/imgs/model_multivariate_gaussian.svg @@ -0,0 +1 @@ +inputhiddenoutput.compute(…)states (𝒔𝒕)with or withoutactions (𝒂𝒕)log standarddeviations(𝑙𝑜𝑔(𝜎))mean actions(𝑎𝑡+1)multivariategaussian distribution𝒩(𝜇,𝛴)actions (𝒂𝒕+𝟏)log prob evaluated at𝑎𝑡+1mean actions (𝒂𝒕+𝟏)paramclip_log_stdclip_actions \ No newline at end of file From 91c71eb47039ce0bc18a7abd9503c8ef997143be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Tue, 26 Jul 2022 22:45:31 +0200 Subject: [PATCH 012/108] Add multivariate gaussian model to docs --- docs/source/index.rst | 2 + .../skrl.models.multivariate_gaussian.rst | 43 +++++++ docs/source/snippets/gaussian_model.py | 14 ++- .../snippets/multivariate_gaussian_model.py | 116 ++++++++++++++++++ 4 files changed, 169 insertions(+), 6 deletions(-) create mode 100644 docs/source/modules/skrl.models.multivariate_gaussian.rst create mode 100644 docs/source/snippets/multivariate_gaussian_model.py diff --git a/docs/source/index.rst b/docs/source/index.rst index 69dc9550..e26408f7 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -128,6 +128,7 @@ Models * :doc:`Tabular model ` (discrete domain) * :doc:`Categorical model ` (discrete domain) * :doc:`Gaussian model ` (continuous domain) + * :doc:`Multivariate Gaussian model ` (continuous domain) * :doc:`Deterministic model ` (continuous domain) .. toctree:: @@ -139,6 +140,7 @@ Models modules/skrl.models.tabular modules/skrl.models.categorical modules/skrl.models.gaussian + modules/skrl.models.multivariate_gaussian modules/skrl.models.deterministic Trainers diff --git a/docs/source/modules/skrl.models.multivariate_gaussian.rst b/docs/source/modules/skrl.models.multivariate_gaussian.rst new file mode 100644 index 00000000..3ad21398 --- /dev/null +++ b/docs/source/modules/skrl.models.multivariate_gaussian.rst @@ -0,0 +1,43 @@ +.. 
_models_multivariate_gaussian: + +Multivariate Gaussian model +=========================== + +Concept +^^^^^^^ + +.. image:: ../_static/imgs/model_multivariate_gaussian.svg + :width: 100% + :align: center + :alt: Multivariate Gaussian model + +Basic usage +^^^^^^^^^^^ + +.. tabs:: + + .. tab:: Multi-Layer Perceptron (MLP) + + .. literalinclude:: ../snippets/multivariate_gaussian_model.py + :language: python + :linenos: + :start-after: [start-mlp] + :end-before: [end-mlp] + + .. tab:: Convolutional Neural Network (CNN) + + .. literalinclude:: ../snippets/multivariate_gaussian_model.py + :language: python + :linenos: + :start-after: [start-cnn] + :end-before: [end-cnn] + +API +^^^ + +.. autoclass:: skrl.models.torch.multivariate_gaussian.MultivariateGaussianModel + :show-inheritance: + :members: + + .. automethod:: __init__ + .. automethod:: compute diff --git a/docs/source/snippets/gaussian_model.py b/docs/source/snippets/gaussian_model.py index 6e7f3f8e..0b1d5f85 100644 --- a/docs/source/snippets/gaussian_model.py +++ b/docs/source/snippets/gaussian_model.py @@ -18,9 +18,9 @@ class DummyEnv: # define the model class MLP(GaussianModel): def __init__(self, observation_space, action_space, device, clip_actions=False, - clip_log_std=True, min_log_std=-20, max_log_std=2): + clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"): super().__init__(observation_space, action_space, device, clip_actions, - clip_log_std, min_log_std, max_log_std) + clip_log_std, min_log_std, max_log_std, reduction) self.linear_layer_1 = nn.Linear(self.num_observations, 128) self.linear_layer_2 = nn.Linear(128, 64) @@ -42,7 +42,8 @@ def compute(self, states, taken_actions): clip_actions=True, clip_log_std=True, min_log_std=-20, - max_log_std=2) + max_log_std=2, + reduction="sum") # [end-mlp] import torch @@ -72,9 +73,9 @@ class DummyEnv: # define the model class CNN(GaussianModel): def __init__(self, observation_space, action_space, device, clip_actions=False, - clip_log_std=True, min_log_std=-20, max_log_std=2): + clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"): super().__init__(observation_space, action_space, device, clip_actions, - clip_log_std, min_log_std, max_log_std) + clip_log_std, min_log_std, max_log_std, reduction) self.net = nn.Sequential(nn.Conv2d(1, 64, kernel_size=4, stride=2), nn.ReLU(), @@ -107,7 +108,8 @@ def compute(self, states, taken_actions): clip_actions=True, clip_log_std=True, min_log_std=-20, - max_log_std=2) + max_log_std=2, + reduction="sum") # [end-cnn] import torch diff --git a/docs/source/snippets/multivariate_gaussian_model.py b/docs/source/snippets/multivariate_gaussian_model.py new file mode 100644 index 00000000..69fbad38 --- /dev/null +++ b/docs/source/snippets/multivariate_gaussian_model.py @@ -0,0 +1,116 @@ +import gym + +class DummyEnv: + observation_space = gym.spaces.Box(low=-1, high=1, shape=(5,)) + action_space = gym.spaces.Box(low=-1, high=1, shape=(3,)) + device = "cuda:0" + +env = DummyEnv() + +# [start-mlp] +import torch +import torch.nn as nn +import torch.nn.functional as F + +from skrl.models.torch import MultivariateGaussianModel + + +# define the model +class MLP(MultivariateGaussianModel): + def __init__(self, observation_space, action_space, device, clip_actions=False, + clip_log_std=True, min_log_std=-20, max_log_std=2): + super().__init__(observation_space, action_space, device, clip_actions, + clip_log_std, min_log_std, max_log_std) + + self.linear_layer_1 = nn.Linear(self.num_observations, 128) + self.linear_layer_2 = 
nn.Linear(128, 64) + self.linear_layer_3 = nn.Linear(64, 32) + self.mean_action_layer = nn.Linear(32, self.num_actions) + + self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + def compute(self, states, taken_actions): + x = F.relu(self.linear_layer_1(states)) + x = F.relu(self.linear_layer_2(x)) + x = F.relu(self.linear_layer_3(x)) + return torch.tanh(self.mean_action_layer(x)), self.log_std_parameter + +# instantiate the model (assumes there is a wrapped environment: env) +policy = MLP(observation_space=env.observation_space, + action_space=env.action_space, + device=env.device, + clip_actions=True, + clip_log_std=True, + min_log_std=-20, + max_log_std=2) +# [end-mlp] + +import torch +policy.to(env.device) +actions = policy.act(torch.randn(10, 5, device=env.device), torch.randn(10, 3, device=env.device)) +assert actions[0].shape == torch.Size([10, env.action_space.shape[0]]) + +# ============================================================================= + +import gym + +class DummyEnv: + observation_space = gym.spaces.Box(low=0, high=255, shape=(256, 256, 1)) + action_space = gym.spaces.Box(low=-1, high=1, shape=(2,)) + device = "cuda:0" + +env = DummyEnv() + +# [start-cnn] +import torch +import torch.nn as nn +import torch.nn.functional as F + +from skrl.models.torch import MultivariateGaussianModel + + +# define the model +class CNN(MultivariateGaussianModel): + def __init__(self, observation_space, action_space, device, clip_actions=False, + clip_log_std=True, min_log_std=-20, max_log_std=2): + super().__init__(observation_space, action_space, device, clip_actions, + clip_log_std, min_log_std, max_log_std) + + self.net = nn.Sequential(nn.Conv2d(1, 64, kernel_size=4, stride=2), + nn.ReLU(), + nn.Conv2d(64, 32, kernel_size=4, stride=2), + nn.ReLU(), + nn.Conv2d(32, 16, kernel_size=2, stride=2), + nn.ReLU(), + nn.Conv2d(16, 8, kernel_size=2, stride=2), + nn.ReLU(), + nn.Flatten(), + nn.Linear(1800, 256), + nn.ReLU(), + nn.Linear(256, 16), + nn.Tanh(), + nn.Linear(16, 32), + nn.Tanh(), + nn.Linear(32, self.num_actions)) + + self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + def compute(self, states, taken_actions): + # permute (samples, width, height, channels) -> (samples, channels, width, height) + return self.net(states.permute(0, 3, 1, 2)), self.log_std_parameter + + +# instantiate the model (assumes there is a wrapped environment: env) +policy = CNN(observation_space=env.observation_space, + action_space=env.action_space, + device=env.device, + clip_actions=True, + clip_log_std=True, + min_log_std=-20, + max_log_std=2) +# [end-cnn] + +import torch +policy.to(env.device) +actions = policy.act(torch.randn(10, 256, 256, 1, device=env.device), torch.randn(10, 2, device=env.device)) +assert actions[0].shape == torch.Size([10, env.action_space.shape[0]]) From eb46aaeb04c5efb048e52793bf3d20bc752b0cee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Tue, 26 Jul 2022 23:16:50 +0200 Subject: [PATCH 013/108] Define discriminator batch size in agent config --- skrl/agents/torch/amp/amp.py | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/skrl/agents/torch/amp/amp.py b/skrl/agents/torch/amp/amp.py index e2befc77..1b1df03d 100644 --- a/skrl/agents/torch/amp/amp.py +++ b/skrl/agents/torch/amp/amp.py @@ -24,7 +24,6 @@ "lambda": 0.95, # TD(lambda) coefficient (lam) for computing returns and advantages "learning_rate": 5e-5, # learning rate - "discriminator_learning_rate": 5e-5, # 
discriminator learning rate "learning_rate_scheduler": None, # learning rate scheduler class (see torch.optim.lr_scheduler) "learning_rate_scheduler_kwargs": {}, # learning rate scheduler's kwargs (e.g. {"step_size": 1e-3}) @@ -50,6 +49,7 @@ "amp_batch_size": 512, # batch size for updating the reference motion dataset "task_reward_weight": 0.0, # task-reward weight (wG) "style_reward_weight": 1.0, # style-reward weight (wS) + "discriminator_batch_size": 0, # batch size for computing the discriminator loss (all samples if 0) "discriminator_reward_scale": 2, # discriminator reward scaling factor "discriminator_logit_regularization_scale": 0.05, # logit regularization scale factor for the discriminator loss "discriminator_gradient_penalty_scale": 5, # gradient penalty scaling factor for the discriminator loss @@ -154,7 +154,6 @@ def __init__(self, self._discriminator_loss_scale = self.cfg["discriminator_loss_scale"] self._learning_rate = self.cfg["learning_rate"] - self._discriminator_learning_rate = self.cfg["discriminator_learning_rate"] self._learning_rate_scheduler = self.cfg["learning_rate_scheduler"] self._state_preprocessor = self.cfg["state_preprocessor"] @@ -171,6 +170,7 @@ def __init__(self, self._task_reward_weight = self.cfg["task_reward_weight"] self._style_reward_weight = self.cfg["style_reward_weight"] + self._discriminator_batch_size = self.cfg["discriminator_batch_size"] self._discriminator_reward_scale = self.cfg["discriminator_reward_scale"] self._discriminator_logit_regularization_scale = self.cfg["discriminator_logit_regularization_scale"] self._discriminator_gradient_penalty_scale = self.cfg["discriminator_gradient_penalty_scale"] @@ -208,7 +208,7 @@ def init(self) -> None: self.memory.create_tensor(name="actions", size=self.action_space, dtype=torch.float32) self.memory.create_tensor(name="rewards", size=1, dtype=torch.float32) self.memory.create_tensor(name="dones", size=1, dtype=torch.bool) - self.memory.create_tensor(name="log_prob", size=self.action_space, dtype=torch.float32) + self.memory.create_tensor(name="log_prob", size=1, dtype=torch.float32) self.memory.create_tensor(name="values", size=1, dtype=torch.float32) self.memory.create_tensor(name="returns", size=1, dtype=torch.float32) self.memory.create_tensor(name="advantages", size=1, dtype=torch.float32) @@ -477,12 +477,20 @@ def compute_gae(rewards: torch.Tensor, value_loss = self._value_loss_scale * F.mse_loss(sampled_returns, predicted_values) # compute discriminator loss - amp_logits, _, _ = self.discriminator.act(states=self._amp_state_preprocessor(sampled_amp_states[0:4096], train=True)) - - amp_replay_logits, _, _ = self.discriminator.act(states=self._amp_state_preprocessor(sampled_replay_batches[batch_index][0][0:4096], train=True)) + if self._discriminator_batch_size: + sampled_amp_states = self._amp_state_preprocessor(sampled_amp_states[0:self._discriminator_batch_size], train=True) + sampled_amp_replay_states = self._amp_state_preprocessor( + sampled_replay_batches[batch_index][0][0:self._discriminator_batch_size], train=True) + sampled_amp_motion_states = self._amp_state_preprocessor( + sampled_motion_batches[batch_index][0][0:self._discriminator_batch_size], train=True) + else: + sampled_amp_states = self._amp_state_preprocessor(sampled_amp_states, train=True) + sampled_amp_replay_states = self._amp_state_preprocessor(sampled_replay_batches[batch_index][0], train=True) + sampled_amp_motion_states = self._amp_state_preprocessor(sampled_motion_batches[batch_index][0], train=True) - 
sampled_amp_motion_states = self._amp_state_preprocessor(sampled_motion_batches[batch_index][0][0:4096], train=True) sampled_amp_motion_states.requires_grad_(True) + amp_logits, _, _ = self.discriminator.act(states=sampled_amp_states) + amp_replay_logits, _, _ = self.discriminator.act(states=sampled_amp_replay_states) amp_motion_logits, _, _ = self.discriminator.act(states=sampled_amp_motion_states) amp_cat_logits = torch.cat([amp_logits, amp_replay_logits], dim=0) @@ -538,7 +546,7 @@ def compute_gae(rewards: torch.Tensor, self.scheduler.step() # update AMP repaly buffer - self.reply_buffer.add_samples(states=amp_states) + self.reply_buffer.add_samples(states=amp_states.view(-1, amp_states.shape[-1])) # record data self.track_data("Loss / Policy loss", cumulative_policy_loss / (self._learning_epochs * self._mini_batches)) From f34dbfe619ba83d8e6151b0fbbdd0613d5ac3141 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Tue, 26 Jul 2022 23:30:59 +0200 Subject: [PATCH 014/108] Update AMP agent in docs --- docs/source/examples/isaacgym/amp_humanoid.py | 5 +++-- docs/source/intro/examples.rst | 2 +- docs/source/modules/skrl.agents.amp.rst | 10 +++++----- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/docs/source/examples/isaacgym/amp_humanoid.py b/docs/source/examples/isaacgym/amp_humanoid.py index 6fe006d9..8b3c4f05 100644 --- a/docs/source/examples/isaacgym/amp_humanoid.py +++ b/docs/source/examples/isaacgym/amp_humanoid.py @@ -24,9 +24,9 @@ # - Discriminator: differentiate between police-generated behaviors and behaviors from the motion dataset class Policy(GaussianModel): def __init__(self, observation_space, action_space, device, clip_actions=False, - clip_log_std=True, min_log_std=-20, max_log_std=2): + clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"): super().__init__(observation_space, action_space, device, clip_actions, - clip_log_std, min_log_std, max_log_std) + clip_log_std, min_log_std, max_log_std, reduction) self.net = nn.Sequential(nn.Linear(self.num_observations, 1024), nn.ReLU(), @@ -108,6 +108,7 @@ def compute(self, states, taken_actions): cfg_amp["amp_batch_size"] = 512 cfg_amp["task_reward_weight"] = 0.0 cfg_amp["style_reward_weight"] = 1.0 +cfg_amp["discriminator_batch_size"] = 4096 cfg_amp["discriminator_reward_scale"] = 2 cfg_amp["discriminator_logit_regularization_scale"] = 0.05 cfg_amp["discriminator_gradient_penalty_scale"] = 5 diff --git a/docs/source/intro/examples.rst b/docs/source/intro/examples.rst index cf8f8f2e..02adee87 100644 --- a/docs/source/intro/examples.rst +++ b/docs/source/intro/examples.rst @@ -392,7 +392,7 @@ The PPO agent configuration is mapped, as far as possible, from the rl_games' A2 .. tab:: Humanoid (AMP) - View the raw code: `ppo_humanoid.py `_ + View the raw code: `amp_humanoid.py `_ .. 
literalinclude:: ../examples/isaacgym/amp_humanoid.py :language: python diff --git a/docs/source/modules/skrl.agents.amp.rst b/docs/source/modules/skrl.agents.amp.rst index 2256f4d9..314ecf6a 100644 --- a/docs/source/modules/skrl.agents.amp.rst +++ b/docs/source/modules/skrl.agents.amp.rst @@ -33,7 +33,7 @@ Algorithm implementation | :math:`A \leftarrow \dfrac{A - \bar{A}}{A_\sigma + 10^{-8}}` | :green:`# update dataset of reference motions` -| :math:`\text{collect reference motions of}` :guilabel:`amp_batch_size` :math:`\rightarrow\;` :math:`\text{append}(M)` +| collect reference motions of size :guilabel:`amp_batch_size` :math:`\rightarrow\;` :math:`\text{append}(M)` | :green:`# compute combined rewards` | :math:`r_D \leftarrow -log(\text{max}( 1 - \hat{y}(D_\psi(s_{_{AMP}})), \, 10^{-4})) \qquad` with :math:`\; \hat{y}(x) = \dfrac{1}{1 + e^{-x}}` | :math:`r' \leftarrow` :guilabel:`task_reward_weight` :math:`r \, +` :guilabel:`style_reward_weight` :guilabel:`discriminator_reward_scale` :math:`r_D` @@ -67,9 +67,9 @@ Algorithm implementation | :math:`V_{_{predicted}} \leftarrow V + \text{clip}(V_{_{predicted}} - V, -c, c) \qquad` with :math:`c` as :guilabel:`value_clip` | :math:`L_{V_\phi} \leftarrow` :guilabel:`value_loss_scale` :math:`\frac{1}{N} \sum_{i=1}^N (R - V_{_{predicted}})^2` | :green:`# compute discriminator loss` -| :math:`{logit}_{_{AMP}} \leftarrow D_\psi(s_{_{AMP}})` -| :math:`{logit}_{_{AMP}}^{^B} \leftarrow D_\psi(s_{_{AMP}}^{^B})` -| :math:`{logit}_{_{AMP}}^{^M} \leftarrow D_\psi(s_{_{AMP}}^{^M})` +| :math:`{logit}_{_{AMP}} \leftarrow D_\psi(s_{_{AMP}}) \qquad` with :math:`s_{_{AMP}}` of size :guilabel:`discriminator_batch_size` +| :math:`{logit}_{_{AMP}}^{^B} \leftarrow D_\psi(s_{_{AMP}}^{^B}) \qquad` with :math:`s_{_{AMP}}^{^B}` of size :guilabel:`discriminator_batch_size` +| :math:`{logit}_{_{AMP}}^{^M} \leftarrow D_\psi(s_{_{AMP}}^{^M}) \qquad` with :math:`s_{_{AMP}}^{^M}` of size :guilabel:`discriminator_batch_size` | :green:`# discriminator prediction loss` | :math:`L_{D_\psi} \leftarrow \dfrac{1}{2}(BCE({logit}_{_{AMP}}` ++ :math:`{logit}_{_{AMP}}^{^B}, \, 0) + BCE({logit}_{_{AMP}}^{^M}, \, 1))` | with :math:`\; BCE(x,y)=-\frac{1}{N} \sum_{i=1}^N [y \; log(\hat{y}) + (1-y) \, log(1-\hat{y})] \;` and :math:`\; \hat{y} = \dfrac{1}{1 + e^{-x}}` @@ -97,7 +97,7 @@ Configuration and hyperparameters .. literalinclude:: ../../../skrl/agents/torch/amp/amp.py :language: python - :lines: 18-67 + :lines: 18-68 :linenos: Spaces and models From 2eb42cd3745776d6dbe4b629bee96b66e1f55768 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Tue, 26 Jul 2022 23:42:56 +0200 Subject: [PATCH 015/108] Add gaussian models modification to CHANGELOG --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 196a69bc..cd35cd6e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,10 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). 
## [0.8.0] - Unreleased ### Added - AMP agent for physics-based character animation +- Gaussian model + +### Changed +- Multivariate Gaussian model (`GaussianModel` until 0.7.0) to `MultivariateGaussianModel` ## [0.7.0] - 2022-07-11 ### Added From 1c7e986a50e19095b779f948a59a82ab4750aaf3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Tue, 26 Jul 2022 23:49:52 +0200 Subject: [PATCH 016/108] Update MINOR version --- skrl/version.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skrl/version.txt b/skrl/version.txt index bcaffe19..8adc70fd 100644 --- a/skrl/version.txt +++ b/skrl/version.txt @@ -1 +1 @@ -0.7.0 \ No newline at end of file +0.8.0 \ No newline at end of file From d1809889e19df9dcf7c09e80af7eaf2f43369236 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Wed, 27 Jul 2022 15:13:16 +0200 Subject: [PATCH 017/108] Map model device location during loading --- skrl/models/torch/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skrl/models/torch/base.py b/skrl/models/torch/base.py index 7cd77c5d..ad810ce0 100644 --- a/skrl/models/torch/base.py +++ b/skrl/models/torch/base.py @@ -302,7 +302,7 @@ def load(self, path: str) -> None: :param path: Path to load the model from :type path: str """ - self.load_state_dict(torch.load(path)) + self.load_state_dict(torch.load(path, map_location=self.device)) self.eval() def freeze_parameters(self, freeze: bool = True) -> None: From 9e9281c553ede45a554c1faefa5918ef16bf5982 Mon Sep 17 00:00:00 2001 From: Toni-SM Date: Sat, 9 Jul 2022 00:07:08 +0200 Subject: [PATCH 018/108] Create python-publish.yml --- .github/workflows/python-publish.yml | 47 ++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 .github/workflows/python-publish.yml diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml new file mode 100644 index 00000000..6609624f --- /dev/null +++ b/.github/workflows/python-publish.yml @@ -0,0 +1,47 @@ +# This workflow will upload a Python Package using Twine when a release is created +# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries + +# This workflow uses actions that are not certified by GitHub. +# They are provided by a third-party and are governed by +# separate terms of service, privacy policy, and support +# documentation. 
+ +name: Upload Python Package + +on: + release: + types: [published] + +permissions: + contents: read + +jobs: + deploy: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + - name: Set up Python + uses: actions/setup-python@v3 + with: + python-version: '3.x' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install build + - name: Build package + run: python -m build + - name: Publish package to PyPI + if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags') + uses: pypa/gh-action-pypi-publish@release/v1 + with: + user: __token__ + password: ${{ secrets.PYPI_API_TOKEN }} + verbose: true + - name: Publish package to TestPyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + user: __token__ + password: ${{ secrets.TEST_PYPI_API_TOKEN }} + repository_url: https://test.pypi.org/legacy/ From 9407a0d84fb7dd12100fab629779476e664549c8 Mon Sep 17 00:00:00 2001 From: Toni-SM Date: Sat, 9 Jul 2022 00:17:07 +0200 Subject: [PATCH 019/108] Create manual.yml --- .github/workflows/manual.yml | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 .github/workflows/manual.yml diff --git a/.github/workflows/manual.yml b/.github/workflows/manual.yml new file mode 100644 index 00000000..9a98e4e9 --- /dev/null +++ b/.github/workflows/manual.yml @@ -0,0 +1,24 @@ +name: Manually triggered workflow +on: + workflow_dispatch: + inputs: + job: + description: 'Job to run.' + required: true + default: 'prod-deploy' +jobs: + dev-deploy: + name: Deploy dev-deploy + runs-on: ubuntu-latest + if: ${{ github.event.inputs.job == 'dev-deploy'}} + steps: + - name: ${{ github.event.inputs.job }} - step 1 + run: echo "${{ github.event.inputs.job }} triggered!" + + prod-deploy: + name: Deploy prod-deploy + runs-on: ubuntu-latest + if: ${{ github.event.inputs.job == 'prod-deploy'}} + steps: + - name: ${{ github.event.inputs.job }} - step 1 + run: echo "${{ github.event.inputs.job }} triggered!" From a9f0f02e20fe83a934fc7680472002df9f645aff Mon Sep 17 00:00:00 2001 From: Toni-SM Date: Sat, 9 Jul 2022 00:27:35 +0200 Subject: [PATCH 020/108] Update and rename manual.yml to python-publish-manual.yml --- .github/workflows/manual.yml | 24 ------------- .github/workflows/python-publish-manual.yml | 38 +++++++++++++++++++++ 2 files changed, 38 insertions(+), 24 deletions(-) delete mode 100644 .github/workflows/manual.yml create mode 100644 .github/workflows/python-publish-manual.yml diff --git a/.github/workflows/manual.yml b/.github/workflows/manual.yml deleted file mode 100644 index 9a98e4e9..00000000 --- a/.github/workflows/manual.yml +++ /dev/null @@ -1,24 +0,0 @@ -name: Manually triggered workflow -on: - workflow_dispatch: - inputs: - job: - description: 'Job to run.' - required: true - default: 'prod-deploy' -jobs: - dev-deploy: - name: Deploy dev-deploy - runs-on: ubuntu-latest - if: ${{ github.event.inputs.job == 'dev-deploy'}} - steps: - - name: ${{ github.event.inputs.job }} - step 1 - run: echo "${{ github.event.inputs.job }} triggered!" - - prod-deploy: - name: Deploy prod-deploy - runs-on: ubuntu-latest - if: ${{ github.event.inputs.job == 'prod-deploy'}} - steps: - - name: ${{ github.event.inputs.job }} - step 1 - run: echo "${{ github.event.inputs.job }} triggered!" 
diff --git a/.github/workflows/python-publish-manual.yml b/.github/workflows/python-publish-manual.yml new file mode 100644 index 00000000..58ae055d --- /dev/null +++ b/.github/workflows/python-publish-manual.yml @@ -0,0 +1,38 @@ +name: Upload Python Package (manually triggered workflow) + +on: + workflow_dispatch: + inputs: + job: + description: 'Upload Python Package to PyPI/TestPyPI' + required: true + default: 'test-pypi' + +permissions: + contents: read + +jobs: + pypi: + name: Publish package to PyPI + runs-on: ubuntu-latest + if: ${{ github.event.inputs.job == 'dev-deploy'}} + steps: + - name: Publish package to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + user: __token__ + password: ${{ secrets.PYPI_API_TOKEN }} + verbose: true + + test-pypi: + name: Publish package to TestPyPI + runs-on: ubuntu-latest + if: ${{ github.event.inputs.job == 'prod-deploy'}} + steps: + - name: Publish package to TestPyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + user: __token__ + password: ${{ secrets.TEST_PYPI_API_TOKEN }} + repository_url: https://test.pypi.org/legacy/ + verbose: true From f0bc89300abf520ac34a44ae8f1bf354e5a25d0c Mon Sep 17 00:00:00 2001 From: Toni-SM Date: Sat, 9 Jul 2022 00:32:24 +0200 Subject: [PATCH 021/108] Update python-publish-manual.yml --- .github/workflows/python-publish-manual.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/python-publish-manual.yml b/.github/workflows/python-publish-manual.yml index 58ae055d..caec0aa6 100644 --- a/.github/workflows/python-publish-manual.yml +++ b/.github/workflows/python-publish-manual.yml @@ -15,7 +15,7 @@ jobs: pypi: name: Publish package to PyPI runs-on: ubuntu-latest - if: ${{ github.event.inputs.job == 'dev-deploy'}} + if: ${{ github.event.inputs.job == 'pypi'}} steps: - name: Publish package to PyPI uses: pypa/gh-action-pypi-publish@release/v1 @@ -27,7 +27,7 @@ jobs: test-pypi: name: Publish package to TestPyPI runs-on: ubuntu-latest - if: ${{ github.event.inputs.job == 'prod-deploy'}} + if: ${{ github.event.inputs.job == 'test-pypi'}} steps: - name: Publish package to TestPyPI uses: pypa/gh-action-pypi-publish@release/v1 From c0fd1d22432b2970002d1af385dde8907b3b6841 Mon Sep 17 00:00:00 2001 From: Toni-SM Date: Sat, 9 Jul 2022 00:37:13 +0200 Subject: [PATCH 022/108] Update python-publish-manual.yml --- .github/workflows/python-publish-manual.yml | 22 +++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/.github/workflows/python-publish-manual.yml b/.github/workflows/python-publish-manual.yml index caec0aa6..c3caae87 100644 --- a/.github/workflows/python-publish-manual.yml +++ b/.github/workflows/python-publish-manual.yml @@ -17,6 +17,17 @@ jobs: runs-on: ubuntu-latest if: ${{ github.event.inputs.job == 'pypi'}} steps: + - uses: actions/checkout@v3 + - name: Set up Python + uses: actions/setup-python@v3 + with: + python-version: '3.7' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install build + - name: Build package + run: python -m build - name: Publish package to PyPI uses: pypa/gh-action-pypi-publish@release/v1 with: @@ -29,6 +40,17 @@ jobs: runs-on: ubuntu-latest if: ${{ github.event.inputs.job == 'test-pypi'}} steps: + - uses: actions/checkout@v3 + - name: Set up Python + uses: actions/setup-python@v3 + with: + python-version: '3.7' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install build + - name: Build package + run: python -m build - name: Publish package to 
TestPyPI uses: pypa/gh-action-pypi-publish@release/v1 with: From 195708ff0994a8ea5c7c2494af2ab3f05551a3fa Mon Sep 17 00:00:00 2001 From: Toni-SM Date: Mon, 11 Jul 2022 23:58:48 +0200 Subject: [PATCH 023/108] Delete python-publish.yml --- .github/workflows/python-publish.yml | 47 ---------------------------- 1 file changed, 47 deletions(-) delete mode 100644 .github/workflows/python-publish.yml diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml deleted file mode 100644 index 6609624f..00000000 --- a/.github/workflows/python-publish.yml +++ /dev/null @@ -1,47 +0,0 @@ -# This workflow will upload a Python Package using Twine when a release is created -# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries - -# This workflow uses actions that are not certified by GitHub. -# They are provided by a third-party and are governed by -# separate terms of service, privacy policy, and support -# documentation. - -name: Upload Python Package - -on: - release: - types: [published] - -permissions: - contents: read - -jobs: - deploy: - - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v3 - - name: Set up Python - uses: actions/setup-python@v3 - with: - python-version: '3.x' - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install build - - name: Build package - run: python -m build - - name: Publish package to PyPI - if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags') - uses: pypa/gh-action-pypi-publish@release/v1 - with: - user: __token__ - password: ${{ secrets.PYPI_API_TOKEN }} - verbose: true - - name: Publish package to TestPyPI - uses: pypa/gh-action-pypi-publish@release/v1 - with: - user: __token__ - password: ${{ secrets.TEST_PYPI_API_TOKEN }} - repository_url: https://test.pypi.org/legacy/ From a5cd534218435c5acf4e94c854f343d45f303fa3 Mon Sep 17 00:00:00 2001 From: Johann Christensen Date: Sun, 24 Jul 2022 13:22:56 +0200 Subject: [PATCH 024/108] Replace custom progress with cleaner tqdm --- .../examples/isaacgym/ppo_ball_balance.py | 12 +-- docs/source/examples/isaacgym/ppo_cartpole.py | 12 +-- .../examples/isaacgym/ppo_quadcopter.py | 12 +-- .../source/examples/isaacgym/trpo_cartpole.py | 12 +-- .../examples/isaacsim/isaacsim_jetbot_ppo.py | 22 +++--- .../examples/omniisaacgym/ppo_cartpole.py | 10 +-- .../examples/omniisaacgym/ppo_cartpole_mt.py | 10 +-- setup.py | 1 + skrl/trainers/torch/base.py | 76 ++++++------------- skrl/trainers/torch/parallel.py | 55 ++++++-------- skrl/trainers/torch/sequential.py | 63 ++++++++------- 11 files changed, 123 insertions(+), 162 deletions(-) diff --git a/docs/source/examples/isaacgym/ppo_ball_balance.py b/docs/source/examples/isaacgym/ppo_ball_balance.py index 1baf3895..84e9999a 100644 --- a/docs/source/examples/isaacgym/ppo_ball_balance.py +++ b/docs/source/examples/isaacgym/ppo_ball_balance.py @@ -57,7 +57,7 @@ def compute(self, states, taken_actions): # Load and wrap the Isaac Gym environment. -# The following lines are intended to support all versions (preview 2, 3 and 4). +# The following lines are intended to support all versions (preview 2, 3 and 4). 
# It tries to load from preview 3/4, but if it fails, it will try to load from preview 2 try: env = load_isaacgym_env_preview4(task_name="BallBalance") # preview 3 and 4 use the same loader @@ -81,7 +81,7 @@ def compute(self, states, taken_actions): # Initialize the models' parameters (weights and biases) using a Gaussian distribution for model in models_ppo.values(): - model.init_parameters(method_name="normal_", mean=0.0, std=0.1) + model.init_parameters(method_name="normal_", mean=0.0, std=0.1) # Configure and instantiate the agent. @@ -115,15 +115,15 @@ def compute(self, states, taken_actions): cfg_ppo["experiment"]["checkpoint_interval"] = 200 agent = PPO(models=models_ppo, - memory=memory, - cfg=cfg_ppo, - observation_space=env.observation_space, + memory=memory, + cfg=cfg_ppo, + observation_space=env.observation_space, action_space=env.action_space, device=device) # Configure and instantiate the RL trainer -cfg_trainer = {"timesteps": 4000, "headless": True, "progress_interval": 400} +cfg_trainer = {"timesteps": 4000, "headless": True} trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent) # start training diff --git a/docs/source/examples/isaacgym/ppo_cartpole.py b/docs/source/examples/isaacgym/ppo_cartpole.py index 8933f319..260a298e 100644 --- a/docs/source/examples/isaacgym/ppo_cartpole.py +++ b/docs/source/examples/isaacgym/ppo_cartpole.py @@ -53,7 +53,7 @@ def compute(self, states, taken_actions): # Load and wrap the Isaac Gym environment. -# The following lines are intended to support all versions (preview 2, 3 and 4). +# The following lines are intended to support all versions (preview 2, 3 and 4). # It tries to load from preview 3/4, but if it fails, it will try to load from preview 2 try: env = load_isaacgym_env_preview4(task_name="Cartpole") # preview 3 and 4 use the same loader @@ -77,7 +77,7 @@ def compute(self, states, taken_actions): # Initialize the models' parameters (weights and biases) using a Gaussian distribution for model in models_ppo.values(): - model.init_parameters(method_name="normal_", mean=0.0, std=0.1) + model.init_parameters(method_name="normal_", mean=0.0, std=0.1) # Configure and instantiate the agent. @@ -111,15 +111,15 @@ def compute(self, states, taken_actions): cfg_ppo["experiment"]["checkpoint_interval"] = 80 agent = PPO(models=models_ppo, - memory=memory, - cfg=cfg_ppo, - observation_space=env.observation_space, + memory=memory, + cfg=cfg_ppo, + observation_space=env.observation_space, action_space=env.action_space, device=device) # Configure and instantiate the RL trainer -cfg_trainer = {"timesteps": 1600, "headless": True, "progress_interval": 160} +cfg_trainer = {"timesteps": 1600, "headless": True} trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent) # start training diff --git a/docs/source/examples/isaacgym/ppo_quadcopter.py b/docs/source/examples/isaacgym/ppo_quadcopter.py index b7ace921..883e31f4 100644 --- a/docs/source/examples/isaacgym/ppo_quadcopter.py +++ b/docs/source/examples/isaacgym/ppo_quadcopter.py @@ -57,7 +57,7 @@ def compute(self, states, taken_actions): # Load and wrap the Isaac Gym environment. -# The following lines are intended to support all versions (preview 2, 3 and 4). +# The following lines are intended to support all versions (preview 2, 3 and 4). 
# It tries to load from preview 3/4, but if it fails, it will try to load from preview 2 try: env = load_isaacgym_env_preview4(task_name="Quadcopter") # preview 3 and 4 use the same loader @@ -81,7 +81,7 @@ def compute(self, states, taken_actions): # Initialize the models' parameters (weights and biases) using a Gaussian distribution for model in models_ppo.values(): - model.init_parameters(method_name="normal_", mean=0.0, std=0.1) + model.init_parameters(method_name="normal_", mean=0.0, std=0.1) # Configure and instantiate the agent. @@ -115,15 +115,15 @@ def compute(self, states, taken_actions): cfg_ppo["experiment"]["checkpoint_interval"] = 200 agent = PPO(models=models_ppo, - memory=memory, - cfg=cfg_ppo, - observation_space=env.observation_space, + memory=memory, + cfg=cfg_ppo, + observation_space=env.observation_space, action_space=env.action_space, device=device) # Configure and instantiate the RL trainer -cfg_trainer = {"timesteps": 4000, "headless": True, "progress_interval": 400} +cfg_trainer = {"timesteps": 4000, "headless": True} trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent) # start training diff --git a/docs/source/examples/isaacgym/trpo_cartpole.py b/docs/source/examples/isaacgym/trpo_cartpole.py index 8a51f840..31266a55 100644 --- a/docs/source/examples/isaacgym/trpo_cartpole.py +++ b/docs/source/examples/isaacgym/trpo_cartpole.py @@ -52,7 +52,7 @@ def compute(self, states, taken_actions): # Load and wrap the Isaac Gym environment. -# The following lines are intended to support all versions (preview 2, 3 and 4). +# The following lines are intended to support all versions (preview 2, 3 and 4). # It tries to load from preview 3/4, but if it fails, it will try to load from preview 2 try: env = load_isaacgym_env_preview4(task_name="Cartpole") # preview 3 and 4 use the same loader @@ -76,7 +76,7 @@ def compute(self, states, taken_actions): # Initialize the models' parameters (weights and biases) using a Gaussian distribution for model in models_trpo.values(): - model.init_parameters(method_name="normal_", mean=0.0, std=0.1) + model.init_parameters(method_name="normal_", mean=0.0, std=0.1) # Configure and instantiate the agent. @@ -98,15 +98,15 @@ def compute(self, states, taken_actions): cfg_trpo["experiment"]["checkpoint_interval"] = 125 agent = TRPO(models=models_trpo, - memory=memory, - cfg=cfg_trpo, - observation_space=env.observation_space, + memory=memory, + cfg=cfg_trpo, + observation_space=env.observation_space, action_space=env.action_space, device=device) # Configure and instantiate the RL trainer -cfg_trainer = {"timesteps": 2500, "headless": True, "progress_interval": 250} +cfg_trainer = {"timesteps": 2500, "headless": True} trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent) # start training diff --git a/docs/source/examples/isaacsim/isaacsim_jetbot_ppo.py b/docs/source/examples/isaacsim/isaacsim_jetbot_ppo.py index 672fa3c6..539d69b9 100644 --- a/docs/source/examples/isaacsim/isaacsim_jetbot_ppo.py +++ b/docs/source/examples/isaacsim/isaacsim_jetbot_ppo.py @@ -12,7 +12,7 @@ from skrl.envs.torch import wrap_env -# Define the models (stochastic and deterministic models) for the agent using helper classes +# Define the models (stochastic and deterministic models) for the agent using helper classes # and programming with two approaches (layer by layer and torch.nn.Sequential class). 
# - Policy: takes as input the environment's observation/state and returns an action # - Value: takes the state as input and provides a value to guide the policy @@ -41,8 +41,8 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) def compute(self, states, taken_actions): - # view (samples, width * height * channels) -> (samples, width, height, channels) - # permute (samples, width, height, channels) -> (samples, channels, width, height) + # view (samples, width * height * channels) -> (samples, width, height, channels) + # permute (samples, width, height, channels) -> (samples, channels, width, height) x = self.net(states.view(-1, *self.observation_space.shape).permute(0, 3, 1, 2)) return 10 * torch.tanh(x), self.log_std_parameter # JetBotEnv action_space is -10 to 10 @@ -66,10 +66,10 @@ def __init__(self, observation_space, action_space, device, clip_actions = False nn.Linear(64, 32), nn.Tanh(), nn.Linear(32, 1)) - + def compute(self, states, taken_actions): - # view (samples, width * height * channels) -> (samples, width, height, channels) - # permute (samples, width, height, channels) -> (samples, channels, width, height) + # view (samples, width * height * channels) -> (samples, width, height, channels) + # permute (samples, width, height, channels) -> (samples, channels, width, height) return self.net(states.view(-1, *self.observation_space.shape).permute(0, 3, 1, 2)) @@ -92,7 +92,7 @@ def compute(self, states, taken_actions): # Initialize the models' parameters (weights and biases) using a Gaussian distribution for model in models_ppo.values(): - model.init_parameters(method_name="normal_", mean=0.0, std=0.1) + model.init_parameters(method_name="normal_", mean=0.0, std=0.1) # Configure and instantiate the agent. @@ -120,15 +120,15 @@ def compute(self, states, taken_actions): cfg_ppo["experiment"]["checkpoint_interval"] = 10000 agent = PPO(models=models_ppo, - memory=memory, - cfg=cfg_ppo, - observation_space=env.observation_space, + memory=memory, + cfg=cfg_ppo, + observation_space=env.observation_space, action_space=env.action_space, device=device) # Configure and instanciate the RL trainer -cfg_trainer = {"timesteps": 500000, "headless": True, "progress_interval": 10000} +cfg_trainer = {"timesteps": 500000, "headless": True} trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent) # start training diff --git a/docs/source/examples/omniisaacgym/ppo_cartpole.py b/docs/source/examples/omniisaacgym/ppo_cartpole.py index 034b31c0..789dfbff 100644 --- a/docs/source/examples/omniisaacgym/ppo_cartpole.py +++ b/docs/source/examples/omniisaacgym/ppo_cartpole.py @@ -68,7 +68,7 @@ def compute(self, states, taken_actions): # Initialize the models' parameters (weights and biases) using a Gaussian distribution for model in models_ppo.values(): - model.init_parameters(method_name="normal_", mean=0.0, std=0.1) + model.init_parameters(method_name="normal_", mean=0.0, std=0.1) # Configure and instantiate the agent. 
@@ -98,15 +98,15 @@ def compute(self, states, taken_actions): cfg_ppo["experiment"]["checkpoint_interval"] = 80 agent = PPO(models=models_ppo, - memory=memory, - cfg=cfg_ppo, - observation_space=env.observation_space, + memory=memory, + cfg=cfg_ppo, + observation_space=env.observation_space, action_space=env.action_space, device=device) # Configure and instantiate the RL trainer -cfg_trainer = {"timesteps": 1600, "headless": True, "progress_interval": 160} +cfg_trainer = {"timesteps": 1600, "headless": True} trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent) # start training diff --git a/docs/source/examples/omniisaacgym/ppo_cartpole_mt.py b/docs/source/examples/omniisaacgym/ppo_cartpole_mt.py index 888caaeb..065b7826 100644 --- a/docs/source/examples/omniisaacgym/ppo_cartpole_mt.py +++ b/docs/source/examples/omniisaacgym/ppo_cartpole_mt.py @@ -70,7 +70,7 @@ def compute(self, states, taken_actions): # Initialize the models' parameters (weights and biases) using a Gaussian distribution for model in models_ppo.values(): - model.init_parameters(method_name="normal_", mean=0.0, std=0.1) + model.init_parameters(method_name="normal_", mean=0.0, std=0.1) # Configure and instantiate the agent. @@ -100,15 +100,15 @@ def compute(self, states, taken_actions): cfg_ppo["experiment"]["checkpoint_interval"] = 80 agent = PPO(models=models_ppo, - memory=memory, - cfg=cfg_ppo, - observation_space=env.observation_space, + memory=memory, + cfg=cfg_ppo, + observation_space=env.observation_space, action_space=env.action_space, device=device) # Configure and instantiate the RL trainer -cfg_trainer = {"timesteps": 1600, "headless": True, "progress_interval": 160} +cfg_trainer = {"timesteps": 1600, "headless": True} trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent) # start training in a separate thread diff --git a/setup.py b/setup.py index c48d99ac..d9bf4aa1 100644 --- a/setup.py +++ b/setup.py @@ -12,6 +12,7 @@ "gym", "torch", "tensorboard", + "tqdm", ] # installation diff --git a/skrl/trainers/torch/base.py b/skrl/trainers/torch/base.py index 1b7560b4..9635c42d 100644 --- a/skrl/trainers/torch/base.py +++ b/skrl/trainers/torch/base.py @@ -3,6 +3,7 @@ import time import torch +from tqdm import tqdm from ...envs.torch import Wrapper from ...agents.torch import Agent @@ -31,10 +32,10 @@ def generate_equally_spaced_scopes(num_envs: int, num_agents: int) -> List[int]: class Trainer(): - def __init__(self, - cfg: dict, - env: Wrapper, - agents: Union[Agent, List[Agent], List[List[Agent]]], + def __init__(self, + cfg: dict, + env: Wrapper, + agents: Union[Agent, List[Agent], List[List[Agent]]], agents_scope : List[int] = []) -> None: """Base class for trainers @@ -51,11 +52,10 @@ def __init__(self, self.env = env self.agents = agents self.agents_scope = agents_scope - + # get configuration self.timesteps = self.cfg.get('timesteps', 0) self.headless = self.cfg.get("headless", False) - self.progress_interval = self.cfg.get("progress_interval", 1000) self.initial_timestep = 0 @@ -116,7 +116,7 @@ def _setup_agents(self) -> None: raise ValueError("The scopes ({}) don't cover the number of parallelizable environments ({})" \ .format(sum(self.agents_scope), self.env.num_envs)) # generate agents' scopes - index = 0 + index = 0 for i in range(len(self.agents_scope)): index += self.agents_scope[i] self.agents_scope[i] = (index - self.agents_scope[i], index) @@ -124,34 +124,6 @@ def _setup_agents(self) -> None: raise ValueError("A list of agents is expected") else: self.num_agents = 1 - - def 
show_progress(self, timestep: int, timesteps: int) -> None: - """Show training progress - - :param timestep: Current timestep - :type timestep: int - :param timesteps: Number of timesteps - :type timesteps: int - """ - if timestep > 0: - timestep += 1 - - if not timestep % self.progress_interval: - current_timestamp = time.time() - if self._timestamp is None: - self._timestamp = current_timestamp - self._timestamp_elapsed = self._timestamp - - delta = current_timestamp - self._timestamp - elapsed = current_timestamp - self._timestamp_elapsed if timestep else 0.0 - remaining = elapsed * (self.timesteps / timestep - 1) if timestep else 0.0 - - self._timestamp = current_timestamp - - print("|--------------------------|--------------------------|") - print("| timestep / timesteps | {} / {}".format(timestep, self.timesteps)) - print("| timesteps per second |", round(self.progress_interval / delta, 2) if timestep else 0.0) - print("| elapsed / remaining time | {} sec / {} sec".format(round(elapsed, 2), round(remaining, 2))) def train(self) -> None: """Train the agents @@ -170,7 +142,7 @@ def eval(self) -> None: def start(self) -> None: """Start training - This method is deprecated in favour of the '.train()' method + This method is deprecated in favour of the '.train()' method """ # TODO: remove this method in future versions print("[WARNING] Trainer.start() method is deprecated in favour of the '.train()' method") @@ -193,27 +165,25 @@ def single_agent_train(self) -> None: # reset env states = self.env.reset() - for timestep in range(self.initial_timestep, self.timesteps): - # show progress - self.show_progress(timestep=timestep, timesteps=self.timesteps) + for timestep in tqdm(range(self.initial_timestep, self.timesteps)): # pre-interaction self.agents.pre_interaction(timestep=timestep, timesteps=self.timesteps) - + # compute actions with torch.no_grad(): actions, _, _ = self.agents.act(states, inference=True, timestep=timestep, timesteps=self.timesteps) - + # step the environments next_states, rewards, dones, infos = self.env.step(actions) - + # render scene if not self.headless: self.env.render() # record the environments' transitions with torch.no_grad(): - self.agents.record_transition(states=states, + self.agents.record_transition(states=states, actions=actions, rewards=rewards, next_states=next_states, @@ -221,10 +191,10 @@ def single_agent_train(self) -> None: infos=infos, timestep=timestep, timesteps=self.timesteps) - + # post-interaction self.agents.post_interaction(timestep=timestep, timesteps=self.timesteps) - + # reset environments with torch.no_grad(): if dones.any(): @@ -239,7 +209,7 @@ def single_agent_eval(self) -> None: """Evaluate the agents sequentially This method executes the following steps in loop: - + - Compute actions (sequentially) - Interact with the environments - Render scene @@ -250,24 +220,22 @@ def single_agent_eval(self) -> None: # reset env states = self.env.reset() - for timestep in range(self.initial_timestep, self.timesteps): - # show progress - self.show_progress(timestep=timestep, timesteps=self.timesteps) - + for timestep in tqdm(range(self.initial_timestep, self.timesteps)): + # compute actions with torch.no_grad(): actions, _, _ = self.agents.act(states, inference=True, timestep=timestep, timesteps=self.timesteps) - + # step the environments next_states, rewards, dones, infos = self.env.step(actions) - + # render scene if not self.headless: self.env.render() - + with torch.no_grad(): # write data to TensorBoard - super(type(self.agents), 
self.agents).record_transition(states=states, + super(type(self.agents), self.agents).record_transition(states=states, actions=actions, rewards=rewards, next_states=next_states, diff --git a/skrl/trainers/torch/parallel.py b/skrl/trainers/torch/parallel.py index 60742017..17d1e72a 100644 --- a/skrl/trainers/torch/parallel.py +++ b/skrl/trainers/torch/parallel.py @@ -38,7 +38,7 @@ def fn_processor(process_index, *args): agent.init() print("[INFO] Processor {}: init agent {} with scope {}".format(process_index, type(agent).__name__, scope)) barrier.wait() - + # execute agent's pre-interaction step elif task == "pre_interaction": agent.pre_interaction(timestep=msg['timestep'], timesteps=msg['timesteps']) @@ -48,9 +48,9 @@ def fn_processor(process_index, *args): elif task == "act": _states = queue.get()[scope[0]:scope[1]] with torch.no_grad(): - _actions = agent.act(_states, + _actions = agent.act(_states, inference=True, - timestep=msg['timestep'], + timestep=msg['timestep'], timesteps=msg['timesteps'])[0] if not _actions.is_cuda: _actions.share_memory_() @@ -60,7 +60,7 @@ def fn_processor(process_index, *args): # record agent's experience elif task == "record_transition": with torch.no_grad(): - agent.record_transition(states=_states, + agent.record_transition(states=_states, actions=_actions, rewards=queue.get()[scope[0]:scope[1]], next_states=queue.get()[scope[0]:scope[1]], @@ -69,7 +69,7 @@ def fn_processor(process_index, *args): timestep=msg['timestep'], timesteps=msg['timesteps']) barrier.wait() - + # execute agent's post-interaction step elif task == "post_interaction": agent.post_interaction(timestep=msg['timestep'], timesteps=msg['timesteps']) @@ -78,7 +78,7 @@ def fn_processor(process_index, *args): # write data to TensorBoard (evaluation) elif task == "eval-record_transition-post_interaction": with torch.no_grad(): - super(type(agent), agent).record_transition(states=_states, + super(type(agent), agent).record_transition(states=_states, actions=_actions, rewards=queue.get()[scope[0]:scope[1]], next_states=queue.get()[scope[0]:scope[1]], @@ -91,13 +91,13 @@ def fn_processor(process_index, *args): class ParallelTrainer(Trainer): - def __init__(self, - cfg: dict, - env: Wrapper, - agents: Union[Agent, List[Agent], List[List[Agent]]], + def __init__(self, + cfg: dict, + env: Wrapper, + agents: Union[Agent, List[Agent], List[List[Agent]]], agents_scope : List[int] = []) -> None: """Parallel trainer - + Train agents in parallel using multiple processes :param cfg: Configuration dictionary @@ -138,7 +138,7 @@ def train(self) -> None: consumer_pipes = [] barrier = mp.Barrier(self.num_agents + 1) processes = [] - + for i in range(self.num_agents): pipe_read, pipe_write = mp.Pipe(duplex=False) producer_pipes.append(pipe_write) @@ -175,9 +175,7 @@ def train(self) -> None: if not states.is_cuda: states.share_memory_() - for timestep in range(self.initial_timestep, self.timesteps): - # show progress - self.show_progress(timestep=timestep, timesteps=self.timesteps) + for timestep in tqdm(range(self.initial_timestep, self.timesteps)): # pre-interaction for pipe in producer_pipes: @@ -192,10 +190,10 @@ def train(self) -> None: barrier.wait() actions = torch.vstack([queue.get() for queue in queues]) - + # step the environments next_states, rewards, dones, infos = self.env.step(actions) - + # render scene if not self.headless: self.env.render() @@ -208,7 +206,7 @@ def train(self) -> None: next_states.share_memory_() if not dones.is_cuda: dones.share_memory_() - + for pipe, queue in 
zip(producer_pipes, queues): pipe.send({"task": "record_transition", "timestep": timestep, "timesteps": self.timesteps}) queue.put(rewards) @@ -234,7 +232,7 @@ def train(self) -> None: # terminate processes for pipe in producer_pipes: pipe.send({"task": "terminate"}) - + # join processes for process in processes: process.join() @@ -264,7 +262,7 @@ def eval(self) -> None: consumer_pipes = [] barrier = mp.Barrier(self.num_agents + 1) processes = [] - + for i in range(self.num_agents): pipe_read, pipe_write = mp.Pipe(duplex=False) producer_pipes.append(pipe_write) @@ -302,9 +300,7 @@ def eval(self) -> None: if not states.is_cuda: states.share_memory_() - for timestep in range(self.initial_timestep, self.timesteps): - # show progress - self.show_progress(timestep=timestep, timesteps=self.timesteps) + for timestep in tqdm(range(self.initial_timestep, self.timesteps)): # compute actions with torch.no_grad(): @@ -314,10 +310,10 @@ def eval(self) -> None: barrier.wait() actions = torch.vstack([queue.get() for queue in queues]) - + # step the environments next_states, rewards, dones, infos = self.env.step(actions) - + # render scene if not self.headless: self.env.render() @@ -330,10 +326,10 @@ def eval(self) -> None: next_states.share_memory_() if not dones.is_cuda: dones.share_memory_() - + for pipe, queue in zip(producer_pipes, queues): - pipe.send({"task": "eval-record_transition-post_interaction", - "timestep": timestep, + pipe.send({"task": "eval-record_transition-post_interaction", + "timestep": timestep, "timesteps": self.timesteps}) queue.put(rewards) queue.put(next_states) @@ -352,11 +348,10 @@ def eval(self) -> None: # terminate processes for pipe in producer_pipes: pipe.send({"task": "terminate"}) - + # join processes for process in processes: process.join() # close the environment self.env.close() - \ No newline at end of file diff --git a/skrl/trainers/torch/sequential.py b/skrl/trainers/torch/sequential.py index 22618d41..eed888e8 100644 --- a/skrl/trainers/torch/sequential.py +++ b/skrl/trainers/torch/sequential.py @@ -1,6 +1,7 @@ from typing import Union, List import torch +from tqdm import tqdm from ...envs.torch import Wrapper from ...agents.torch import Agent @@ -9,13 +10,13 @@ class SequentialTrainer(Trainer): - def __init__(self, - cfg: dict, - env: Wrapper, - agents: Union[Agent, List[Agent], List[List[Agent]]], + def __init__(self, + cfg: dict, + env: Wrapper, + agents: Union[Agent, List[Agent], List[List[Agent]]], agents_scope : List[int] = []) -> None: """Sequential trainer - + Train agents sequentially (i.e., one after the other in each interaction with the environment) :param cfg: Configuration dictionary @@ -57,25 +58,23 @@ def train(self) -> None: # reset env states = self.env.reset() - for timestep in range(self.initial_timestep, self.timesteps): - # show progress - self.show_progress(timestep=timestep, timesteps=self.timesteps) + for timestep in tqdm(range(self.initial_timestep, self.timesteps)): # pre-interaction for agent in self.agents: agent.pre_interaction(timestep=timestep, timesteps=self.timesteps) - + # compute actions with torch.no_grad(): - actions = torch.vstack([agent.act(states[scope[0]:scope[1]], + actions = torch.vstack([agent.act(states[scope[0]:scope[1]], inference=True, - timestep=timestep, + timestep=timestep, timesteps=self.timesteps)[0] \ for agent, scope in zip(self.agents, self.agents_scope)]) - + # step the environments next_states, rewards, dones, infos = self.env.step(actions) - + # render scene if not self.headless: self.env.render() @@ 
-83,15 +82,15 @@ def train(self) -> None: # record the environments' transitions with torch.no_grad(): for agent, scope in zip(self.agents, self.agents_scope): - agent.record_transition(states=states[scope[0]:scope[1]], - actions=actions[scope[0]:scope[1]], - rewards=rewards[scope[0]:scope[1]], - next_states=next_states[scope[0]:scope[1]], + agent.record_transition(states=states[scope[0]:scope[1]], + actions=actions[scope[0]:scope[1]], + rewards=rewards[scope[0]:scope[1]], + next_states=next_states[scope[0]:scope[1]], dones=dones[scope[0]:scope[1]], infos=infos, timestep=timestep, timesteps=self.timesteps) - + # post-interaction for agent in self.agents: agent.post_interaction(timestep=timestep, timesteps=self.timesteps) @@ -110,7 +109,7 @@ def eval(self) -> None: """Evaluate the agents sequentially This method executes the following steps in loop: - + - Compute actions (sequentially) - Interact with the environments - Render scene @@ -124,32 +123,30 @@ def eval(self) -> None: # reset env states = self.env.reset() - for timestep in range(self.initial_timestep, self.timesteps): - # show progress - self.show_progress(timestep=timestep, timesteps=self.timesteps) - + for timestep in tqdm(range(self.initial_timestep, self.timesteps)): + # compute actions with torch.no_grad(): - actions = torch.vstack([agent.act(states[scope[0]:scope[1]], + actions = torch.vstack([agent.act(states[scope[0]:scope[1]], inference=True, - timestep=timestep, + timestep=timestep, timesteps=self.timesteps)[0] \ for agent, scope in zip(self.agents, self.agents_scope)]) - + # step the environments next_states, rewards, dones, infos = self.env.step(actions) - + # render scene if not self.headless: self.env.render() - + with torch.no_grad(): # write data to TensorBoard for agent, scope in zip(self.agents, self.agents_scope): - super(type(agent), agent).record_transition(states=states[scope[0]:scope[1]], - actions=actions[scope[0]:scope[1]], - rewards=rewards[scope[0]:scope[1]], - next_states=next_states[scope[0]:scope[1]], + super(type(agent), agent).record_transition(states=states[scope[0]:scope[1]], + actions=actions[scope[0]:scope[1]], + rewards=rewards[scope[0]:scope[1]], + next_states=next_states[scope[0]:scope[1]], dones=dones[scope[0]:scope[1]], infos=infos, timestep=timestep, @@ -168,7 +165,7 @@ def eval(self) -> None: def start(self) -> None: """Start training - This method is deprecated in favour of the '.train()' method + This method is deprecated in favour of the '.train()' method """ super().start() self.train() From 1002e0ec440049cca4c0be39f5a155ac819783a7 Mon Sep 17 00:00:00 2001 From: Johann Christensen Date: Thu, 28 Jul 2022 09:02:02 +0200 Subject: [PATCH 025/108] Remove remnant files from wrong git operations --- .github/workflows/python-publish-manual.yml | 60 --------------------- 1 file changed, 60 deletions(-) delete mode 100644 .github/workflows/python-publish-manual.yml diff --git a/.github/workflows/python-publish-manual.yml b/.github/workflows/python-publish-manual.yml deleted file mode 100644 index c3caae87..00000000 --- a/.github/workflows/python-publish-manual.yml +++ /dev/null @@ -1,60 +0,0 @@ -name: Upload Python Package (manually triggered workflow) - -on: - workflow_dispatch: - inputs: - job: - description: 'Upload Python Package to PyPI/TestPyPI' - required: true - default: 'test-pypi' - -permissions: - contents: read - -jobs: - pypi: - name: Publish package to PyPI - runs-on: ubuntu-latest - if: ${{ github.event.inputs.job == 'pypi'}} - steps: - - uses: actions/checkout@v3 - - name: Set 
up Python - uses: actions/setup-python@v3 - with: - python-version: '3.7' - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install build - - name: Build package - run: python -m build - - name: Publish package to PyPI - uses: pypa/gh-action-pypi-publish@release/v1 - with: - user: __token__ - password: ${{ secrets.PYPI_API_TOKEN }} - verbose: true - - test-pypi: - name: Publish package to TestPyPI - runs-on: ubuntu-latest - if: ${{ github.event.inputs.job == 'test-pypi'}} - steps: - - uses: actions/checkout@v3 - - name: Set up Python - uses: actions/setup-python@v3 - with: - python-version: '3.7' - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install build - - name: Build package - run: python -m build - - name: Publish package to TestPyPI - uses: pypa/gh-action-pypi-publish@release/v1 - with: - user: __token__ - password: ${{ secrets.TEST_PYPI_API_TOKEN }} - repository_url: https://test.pypi.org/legacy/ - verbose: true From a54f69b3848e8de150b4cace1f22e38b7d3784c7 Mon Sep 17 00:00:00 2001 From: Johann Christensen Date: Thu, 28 Jul 2022 13:29:13 +0200 Subject: [PATCH 026/108] Reorder imports according to skrl standards --- skrl/trainers/torch/base.py | 2 +- skrl/trainers/torch/sequential.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/skrl/trainers/torch/base.py b/skrl/trainers/torch/base.py index 9635c42d..676215ab 100644 --- a/skrl/trainers/torch/base.py +++ b/skrl/trainers/torch/base.py @@ -1,9 +1,9 @@ from typing import Union, List import time +from tqdm import tqdm import torch -from tqdm import tqdm from ...envs.torch import Wrapper from ...agents.torch import Agent diff --git a/skrl/trainers/torch/sequential.py b/skrl/trainers/torch/sequential.py index eed888e8..b9a3ca57 100644 --- a/skrl/trainers/torch/sequential.py +++ b/skrl/trainers/torch/sequential.py @@ -1,8 +1,9 @@ from typing import Union, List -import torch from tqdm import tqdm +import torch + from ...envs.torch import Wrapper from ...agents.torch import Agent From 323b8ea8e199202a5b42f5c284e4bc3f55473752 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Sun, 31 Jul 2022 09:58:28 +0200 Subject: [PATCH 027/108] Update CONTRIBUTING.md --- CONTRIBUTING.md | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 1abbf65a..53004b68 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -3,13 +3,15 @@ First of all, **thank you**... For what? Because you are dedicating some time to
-### I don't want to contribute (for now), I just want to ask a question! +### I just want to ask a question! If you have a question, please do not open an issue for this. Instead, use the following resources for it (you will get a faster response): - [skrl's GitHub discussions](https://github.com/Toni-SM/skrl/discussions), a place to ask questions and discuss the project -- [Isaac Gym's forum](https://forums.developer.nvidia.com/c/agx-autonomous-machines/isaac/isaac-gym/322), , a place to post your questions, find past answers, or just chat with other members of the community about Isaac Gym topics +- [Isaac Gym's forum](https://forums.developer.nvidia.com/c/agx-autonomous-machines/isaac/isaac-gym/322), a place to post your questions, find past answers, or just chat with other members of the community about Isaac Gym topics + +- [Omniverse Isaac Sim's forum](https://forums.developer.nvidia.com/c/agx-autonomous-machines/isaac/simulation/69), a place to post your questions, find past answers, or just chat with other members of the community about Omniverse Isaac Sim/Gym topics ### I have found a (good) bug. What can I do? @@ -21,10 +23,16 @@ Open an issue on [skrl's GitHub issues](https://github.com/Toni-SM/skrl/issues) - A link to the source code of the library that you are using (some problems may be due to the use of older versions. If possible, always use the latest version) - Any other information that you think may be useful or help to reproduce/describe the problem -Note: Changes that are cosmetic in nature (code formatting, removing whitespace, etc.) or that correct grammatical, spelling or typo errors, and that do not add anything substantial to the functionality of the library will generally not be accepted as a pull request - ### I want to contribute, but I don't know how +There is a [board](https://github.com/users/Toni-SM/projects/2/views/8) containing relevant future implementations which can be a good starting place to identify contributions. Please consider the following points: + +#### Notes about contributing + +- Try to **communicate your change first** to [discuss](https://github.com/Toni-SM/skrl/discussions) the implementation if you want to add a new feature or change an existing one +- Modify only the minimum amount of code required and the files needed to make the change +- Changes that are cosmetic in nature (code formatting, removing whitespace, etc.) or that correct grammatical, spelling or typo errors, and that do not add anything substantial to the functionality of the library will generally not be accepted as a pull request + #### Coding conventions **skrl** is designed with a focus on modularity, readability, simplicity and transparency of algorithm implementation. The file system structure groups components according to their functionality. Library components only inherit (and must inherit) from a single base class (no multilevel or multiple inheritance) that provides a uniform interface and implements common functionality that is not tied to the implementation details of the algorithms Read the code a little bit and you will understand it at first glance... Also - Capitalize (the first letter) and omit any trailing punctuation - Write it in the imperative tense - Aim for about 50 (or 72) characters +- Add import statements at the top of each module as follows: + + ```ini + function annotation (e.g. typing) + # insert an empty line + python libraries and other libraries (e.g. gym, numpy, time, etc.) 
+ # insert an empty line + machine learning framework modules (e.g. torch, torch.nn) + # insert an empty line + skrl components + ```
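For illustration only, a module header that follows this import-ordering convention might look like the sketch below (the particular imports are examples chosen for this snippet, not a requirement of the library):

```python
# hypothetical module header following the import-ordering convention described above

# function annotations (typing)
from typing import Union, List, Optional

# python libraries and other libraries
import copy
import gym
import tqdm

# machine learning framework modules
import torch
import torch.nn as nn

# skrl components
from skrl.envs.torch import Wrapper
from skrl.agents.torch import Agent
```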
From 2008209a2c17f43394abda547e3806d2620d1483 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Sun, 31 Jul 2022 12:54:47 +0200 Subject: [PATCH 028/108] Add manual trainer --- skrl/trainers/torch/__init__.py | 3 +- skrl/trainers/torch/manual.py | 219 ++++++++++++++++++++++++++++++++ 2 files changed, 221 insertions(+), 1 deletion(-) create mode 100644 skrl/trainers/torch/manual.py diff --git a/skrl/trainers/torch/__init__.py b/skrl/trainers/torch/__init__.py index 26232c81..1077b277 100644 --- a/skrl/trainers/torch/__init__.py +++ b/skrl/trainers/torch/__init__.py @@ -2,4 +2,5 @@ from .base import generate_equally_spaced_scopes from .sequential import SequentialTrainer -from .parallel import ParallelTrainer \ No newline at end of file +from .parallel import ParallelTrainer +from .manual import ManualTrainer diff --git a/skrl/trainers/torch/manual.py b/skrl/trainers/torch/manual.py new file mode 100644 index 00000000..e07ab75b --- /dev/null +++ b/skrl/trainers/torch/manual.py @@ -0,0 +1,219 @@ +from typing import Union, List, Optional + +import tqdm + +import torch + +from ...envs.torch import Wrapper +from ...agents.torch import Agent + +from . import Trainer + + +class ManualTrainer(Trainer): + def __init__(self, + cfg: dict, + env: Wrapper, + agents: Union[Agent, List[Agent], List[List[Agent]]], + agents_scope : List[int] = []) -> None: + """Manual trainer + + Train agents by manually controlling the training/evaluation loop + + :param cfg: Configuration dictionary + :type cfg: dict + :param env: Environment to train on + :type env: skrl.env.torch.Wrapper + :param agents: Agents to train + :type agents: Union[Agent, List[Agent]] + :param agents_scope: Number of environments for each agent to train on (default: []) + :type agents_scope: tuple or list of integers + """ + # TODO: close the environment + super().__init__(cfg, env, agents, agents_scope) + + # init agents + if self.num_agents > 1: + for agent in self.agents: + agent.init() + else: + self.agents.init() + + self._progress = None + + self.states = None + + def train(self, timestep: int, timesteps: Optional[int] = None) -> None: + """Execute a training iteration + + This method executes the following steps once: + + - Pre-interaction (sequentially if num_agents > 1) + - Compute actions (sequentially if num_agents > 1) + - Interact with the environments + - Render scene + - Record transitions (sequentially if num_agents > 1) + - Post-interaction (sequentially if num_agents > 1) + - Reset environments + + :param timestep: Current timestep + :type timestep: int + :param timesteps: Total number of timesteps (default: None). 
+ If None, the total number of timesteps is obtained from the trainer's config + :type timesteps: int, optional + """ + timesteps = self.timesteps if timesteps is None else timesteps + + if self._progress is None: + self._progress = tqdm.tqdm(total=timesteps) + self._progress.update(n=1) + + # reset env + if self.states is None: + self.states = self.env.reset() + + if self.num_agents == 1: + # pre-interaction + self.agents.pre_interaction(timestep=timestep, timesteps=timesteps) + + # compute actions + with torch.no_grad(): + actions, _, _ = self.agents.act(self.states, inference=True, timestep=timestep, timesteps=timesteps) + + else: + # pre-interaction + for agent in self.agents: + agent.pre_interaction(timestep=timestep, timesteps=timesteps) + + # compute actions + with torch.no_grad(): + actions = torch.vstack([agent.act(self.states[scope[0]:scope[1]], + inference=True, + timestep=timestep, + timesteps=timesteps)[0] \ + for agent, scope in zip(self.agents, self.agents_scope)]) + + # step the environments + next_states, rewards, dones, infos = self.env.step(actions) + + # render scene + if not self.headless: + self.env.render() + + if self.num_agents == 1: + # record the environments' transitions + with torch.no_grad(): + self.agents.record_transition(states=self.states, + actions=actions, + rewards=rewards, + next_states=next_states, + dones=dones, + infos=infos, + timestep=timestep, + timesteps=timesteps) + + # post-interaction + self.agents.post_interaction(timestep=timestep, timesteps=timesteps) + + else: + # record the environments' transitions + with torch.no_grad(): + for agent, scope in zip(self.agents, self.agents_scope): + agent.record_transition(states=self.states[scope[0]:scope[1]], + actions=actions[scope[0]:scope[1]], + rewards=rewards[scope[0]:scope[1]], + next_states=next_states[scope[0]:scope[1]], + dones=dones[scope[0]:scope[1]], + infos=infos, + timestep=timestep, + timesteps=timesteps) + + # post-interaction + for agent in self.agents: + agent.post_interaction(timestep=timestep, timesteps=timesteps) + + # reset environments + with torch.no_grad(): + if dones.any(): + self.states = self.env.reset() + else: + self.states.copy_(next_states) + + + def eval(self, timestep: int, timesteps: Optional[int] = None) -> None: + """Evaluate the agents sequentially + + This method executes the following steps in loop: + + - Compute actions (sequentially if num_agents > 1) + - Interact with the environments + - Render scene + - Reset environments + + :param timestep: Current timestep + :type timestep: int + :param timesteps: Total number of timesteps (default: None). 
+ If None, the total number of timesteps is obtained from the trainer's config + :type timesteps: int, optional + """ + timesteps = self.timesteps if timesteps is None else timesteps + + if self._progress is None: + self._progress = tqdm.tqdm(total=timesteps) + self._progress.update(n=1) + + # reset env + if self.states is None: + self.states = self.env.reset() + + with torch.no_grad(): + if self.num_agents == 1: + # compute actions + actions, _, _ = self.agents.act(self.states, inference=True, timestep=timestep, timesteps=timesteps) + + else: + # compute actions + actions = torch.vstack([agent.act(self.states[scope[0]:scope[1]], + inference=True, + timestep=timestep, + timesteps=timesteps)[0] \ + for agent, scope in zip(self.agents, self.agents_scope)]) + + # step the environments + next_states, rewards, dones, infos = self.env.step(actions) + + # render scene + if not self.headless: + self.env.render() + + with torch.no_grad(): + if self.num_agents == 1: + # write data to TensorBoard + super(type(self.agents), self.agents).record_transition(states=self.states, + actions=actions, + rewards=rewards, + next_states=next_states, + dones=dones, + infos=infos, + timestep=timestep, + timesteps=timesteps) + super(type(self.agents), self.agents).post_interaction(timestep=timestep, timesteps=timesteps) + + else: + # write data to TensorBoard + for agent, scope in zip(self.agents, self.agents_scope): + super(type(agent), agent).record_transition(states=self.states[scope[0]:scope[1]], + actions=actions[scope[0]:scope[1]], + rewards=rewards[scope[0]:scope[1]], + next_states=next_states[scope[0]:scope[1]], + dones=dones[scope[0]:scope[1]], + infos=infos, + timestep=timestep, + timesteps=timesteps) + super(type(agent), agent).post_interaction(timestep=timestep, timesteps=timesteps) + + # reset environments + if dones.any(): + self.states = self.env.reset() + else: + self.states.copy_(next_states) From 43b660e4288c15298eabb095b1604293a22fbf23 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Sun, 31 Jul 2022 14:33:48 +0200 Subject: [PATCH 029/108] Change trainer's cfg parameter position and use default config --- skrl/trainers/torch/base.py | 20 ++++++++------------ skrl/trainers/torch/manual.py | 23 ++++++++++++++++------- skrl/trainers/torch/parallel.py | 28 ++++++++++++++++++++-------- skrl/trainers/torch/sequential.py | 28 +++++++++++++++++++--------- 4 files changed, 63 insertions(+), 36 deletions(-) diff --git a/skrl/trainers/torch/base.py b/skrl/trainers/torch/base.py index 676215ab..538e08a1 100644 --- a/skrl/trainers/torch/base.py +++ b/skrl/trainers/torch/base.py @@ -1,7 +1,6 @@ from typing import Union, List -import time -from tqdm import tqdm +import tqdm import torch @@ -33,20 +32,20 @@ def generate_equally_spaced_scopes(num_envs: int, num_agents: int) -> List[int]: class Trainer(): def __init__(self, - cfg: dict, env: Wrapper, - agents: Union[Agent, List[Agent], List[List[Agent]]], - agents_scope : List[int] = []) -> None: + agents: Union[Agent, List[Agent]], + agents_scope : List[int] = [], + cfg: dict = {}) -> None: """Base class for trainers - :param cfg: Configuration dictionary - :type cfg: dict :param env: Environment to train on :type env: skrl.env.torch.Wrapper :param agents: Agents to train :type agents: Union[Agent, List[Agent]] :param agents_scope: Number of environments for each agent to train on (default: []) :type agents_scope: tuple or list of integers + :param cfg: Configuration dictionary (default: {}) + :type cfg: dict, optional """ self.cfg = 
cfg self.env = env @@ -59,9 +58,6 @@ def __init__(self, self.initial_timestep = 0 - self._timestamp = None - self._timestamp_elapsed = None - # setup agents self.num_agents = 0 self._setup_agents() @@ -165,7 +161,7 @@ def single_agent_train(self) -> None: # reset env states = self.env.reset() - for timestep in tqdm(range(self.initial_timestep, self.timesteps)): + for timestep in tqdm.tqdm(range(self.initial_timestep, self.timesteps)): # pre-interaction self.agents.pre_interaction(timestep=timestep, timesteps=self.timesteps) @@ -220,7 +216,7 @@ def single_agent_eval(self) -> None: # reset env states = self.env.reset() - for timestep in tqdm(range(self.initial_timestep, self.timesteps)): + for timestep in tqdm.tqdm(range(self.initial_timestep, self.timesteps)): # compute actions with torch.no_grad(): diff --git a/skrl/trainers/torch/manual.py b/skrl/trainers/torch/manual.py index e07ab75b..938fdc80 100644 --- a/skrl/trainers/torch/manual.py +++ b/skrl/trainers/torch/manual.py @@ -1,5 +1,6 @@ from typing import Union, List, Optional +import copy import tqdm import torch @@ -10,27 +11,35 @@ from . import Trainer +MANUAL_TRAINER_DEFAULT_CONFIG = { + "timesteps": 100000, # number of timesteps to train for + "headless": False, # whether to use headless mode (no rendering) +} + + class ManualTrainer(Trainer): def __init__(self, - cfg: dict, env: Wrapper, - agents: Union[Agent, List[Agent], List[List[Agent]]], - agents_scope : List[int] = []) -> None: + agents: Union[Agent, List[Agent]], + agents_scope : List[int] = [], + cfg: dict = {}) -> None: """Manual trainer Train agents by manually controlling the training/evaluation loop - :param cfg: Configuration dictionary - :type cfg: dict :param env: Environment to train on :type env: skrl.env.torch.Wrapper :param agents: Agents to train :type agents: Union[Agent, List[Agent]] :param agents_scope: Number of environments for each agent to train on (default: []) :type agents_scope: tuple or list of integers + :param cfg: Configuration dictionary (default: {}). + See MANUAL_TRAINER_DEFAULT_CONFIG for default values + :type cfg: dict, optional """ - # TODO: close the environment - super().__init__(cfg, env, agents, agents_scope) + _cfg = copy.deepcopy(MANUAL_TRAINER_DEFAULT_CONFIG) + _cfg.update(cfg) + super().__init__(env=env, agents=agents, agents_scope=agents_scope, cfg=_cfg) # init agents if self.num_agents > 1: diff --git a/skrl/trainers/torch/parallel.py b/skrl/trainers/torch/parallel.py index 17d1e72a..c5526223 100644 --- a/skrl/trainers/torch/parallel.py +++ b/skrl/trainers/torch/parallel.py @@ -1,5 +1,8 @@ from typing import Union, List +import copy +import tqdm + import torch import torch.multiprocessing as mp @@ -9,6 +12,12 @@ from . 
import Trainer +PARALLEL_TRAINER_DEFAULT_CONFIG = { + "timesteps": 100000, # number of timesteps to train for + "headless": False, # whether to use headless mode (no rendering) +} + + def fn_processor(process_index, *args): print("[INFO] Processor {}: started".format(process_index)) @@ -92,24 +101,27 @@ def fn_processor(process_index, *args): class ParallelTrainer(Trainer): def __init__(self, - cfg: dict, env: Wrapper, - agents: Union[Agent, List[Agent], List[List[Agent]]], - agents_scope : List[int] = []) -> None: + agents: Union[Agent, List[Agent]], + agents_scope : List[int] = [], + cfg: dict = {}) -> None: """Parallel trainer Train agents in parallel using multiple processes - :param cfg: Configuration dictionary - :type cfg: dict :param env: Environment to train on :type env: skrl.env.torch.Wrapper :param agents: Agents to train :type agents: Union[Agent, List[Agent]] :param agents_scope: Number of environments for each agent to train on (default: []) :type agents_scope: tuple or list of integers + :param cfg: Configuration dictionary (default: {}). + See PARALLEL_TRAINER_DEFAULT_CONFIG for default values + :type cfg: dict, optional """ - super().__init__(cfg, env, agents, agents_scope) + _cfg = copy.deepcopy(PARALLEL_TRAINER_DEFAULT_CONFIG) + _cfg.update(cfg) + super().__init__(env=env, agents=agents, agents_scope=agents_scope, cfg=_cfg) mp.set_start_method(method='spawn', force=True) @@ -175,7 +187,7 @@ def train(self) -> None: if not states.is_cuda: states.share_memory_() - for timestep in tqdm(range(self.initial_timestep, self.timesteps)): + for timestep in tqdm.tqdm(range(self.initial_timestep, self.timesteps)): # pre-interaction for pipe in producer_pipes: @@ -300,7 +312,7 @@ def eval(self) -> None: if not states.is_cuda: states.share_memory_() - for timestep in tqdm(range(self.initial_timestep, self.timesteps)): + for timestep in tqdm.tqdm(range(self.initial_timestep, self.timesteps)): # compute actions with torch.no_grad(): diff --git a/skrl/trainers/torch/sequential.py b/skrl/trainers/torch/sequential.py index b9a3ca57..42114ad0 100644 --- a/skrl/trainers/torch/sequential.py +++ b/skrl/trainers/torch/sequential.py @@ -1,6 +1,7 @@ from typing import Union, List -from tqdm import tqdm +import copy +import tqdm import torch @@ -10,26 +11,35 @@ from . import Trainer +SEQUENTIAL_TRAINER_DEFAULT_CONFIG = { + "timesteps": 100000, # number of timesteps to train for + "headless": False, # whether to use headless mode (no rendering) +} + + class SequentialTrainer(Trainer): def __init__(self, - cfg: dict, env: Wrapper, - agents: Union[Agent, List[Agent], List[List[Agent]]], - agents_scope : List[int] = []) -> None: + agents: Union[Agent, List[Agent]], + agents_scope : List[int] = [], + cfg: dict = {}) -> None: """Sequential trainer Train agents sequentially (i.e., one after the other in each interaction with the environment) - :param cfg: Configuration dictionary - :type cfg: dict :param env: Environment to train on :type env: skrl.env.torch.Wrapper :param agents: Agents to train :type agents: Union[Agent, List[Agent]] :param agents_scope: Number of environments for each agent to train on (default: []) :type agents_scope: tuple or list of integers + :param cfg: Configuration dictionary (default: {}). 
+ See SEQUENTIAL_TRAINER_DEFAULT_CONFIG for default values + :type cfg: dict, optional """ - super().__init__(cfg, env, agents, agents_scope) + _cfg = copy.deepcopy(SEQUENTIAL_TRAINER_DEFAULT_CONFIG) + _cfg.update(cfg) + super().__init__(env=env, agents=agents, agents_scope=agents_scope, cfg=_cfg) # init agents if self.num_agents > 1: @@ -59,7 +69,7 @@ def train(self) -> None: # reset env states = self.env.reset() - for timestep in tqdm(range(self.initial_timestep, self.timesteps)): + for timestep in tqdm.tqdm(range(self.initial_timestep, self.timesteps)): # pre-interaction for agent in self.agents: @@ -124,7 +134,7 @@ def eval(self) -> None: # reset env states = self.env.reset() - for timestep in tqdm(range(self.initial_timestep, self.timesteps)): + for timestep in tqdm.tqdm(range(self.initial_timestep, self.timesteps)): # compute actions with torch.no_grad(): From 5596883c42ef5edf062b45822d6aa1e4c639140b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Sun, 31 Jul 2022 23:11:05 +0200 Subject: [PATCH 030/108] Add manual trainer to docs --- docs/source/index.rst | 2 + docs/source/modules/skrl.trainers.manual.rst | 44 ++++++++++++++++++ .../source/modules/skrl.trainers.parallel.rst | 10 +++++ .../modules/skrl.trainers.sequential.rst | 10 +++++ docs/source/snippets/trainer.py | 45 ++++++++++++++++--- 5 files changed, 104 insertions(+), 7 deletions(-) create mode 100644 docs/source/modules/skrl.trainers.manual.rst diff --git a/docs/source/index.rst b/docs/source/index.rst index e26408f7..f9c96941 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -150,6 +150,7 @@ Trainers * :doc:`Sequential trainer ` * :doc:`Parallel trainer ` + * :doc:`Manual trainer ` .. toctree:: :maxdepth: 1 @@ -159,6 +160,7 @@ Trainers modules/skrl.trainers.base_class modules/skrl.trainers.sequential modules/skrl.trainers.parallel + modules/skrl.trainers.manual Resources ^^^^^^^^^ diff --git a/docs/source/modules/skrl.trainers.manual.rst b/docs/source/modules/skrl.trainers.manual.rst new file mode 100644 index 00000000..61c43c86 --- /dev/null +++ b/docs/source/modules/skrl.trainers.manual.rst @@ -0,0 +1,44 @@ +Manual trainer +============== + +Concept +^^^^^^^ + +.. image:: ../_static/imgs/manual_trainer.svg + :width: 100% + :align: center + :alt: Manual trainer + +Basic usage +^^^^^^^^^^^ + +.. tabs:: + + .. tab:: Snippet + + .. literalinclude:: ../snippets/trainer.py + :language: python + :linenos: + :start-after: [start-manual] + :end-before: [end-manual] + +Configuration +^^^^^^^^^^^^^ + +.. py:data:: skrl.trainers.torch.manual.MANUAL_TRAINER_DEFAULT_CONFIG + +.. literalinclude:: ../../../skrl/trainers/torch/manual.py + :language: python + :lines: 14-17 + :linenos: + +API +^^^ + +.. autoclass:: skrl.trainers.torch.manual.ManualTrainer + :undoc-members: + :show-inheritance: + :inherited-members: + :members: + + .. automethod:: __init__ diff --git a/docs/source/modules/skrl.trainers.parallel.rst b/docs/source/modules/skrl.trainers.parallel.rst index 1e8a0f46..4e3751e3 100644 --- a/docs/source/modules/skrl.trainers.parallel.rst +++ b/docs/source/modules/skrl.trainers.parallel.rst @@ -30,6 +30,16 @@ Basic usage :start-after: [start-parallel] :end-before: [end-parallel] +Configuration +^^^^^^^^^^^^^ + +.. py:data:: skrl.trainers.torch.parallel.PARALLEL_TRAINER_DEFAULT_CONFIG + +.. 
literalinclude:: ../../../skrl/trainers/torch/parallel.py + :language: python + :lines: 15-18 + :linenos: + API ^^^ diff --git a/docs/source/modules/skrl.trainers.sequential.rst b/docs/source/modules/skrl.trainers.sequential.rst index 14c23b1b..9866a420 100644 --- a/docs/source/modules/skrl.trainers.sequential.rst +++ b/docs/source/modules/skrl.trainers.sequential.rst @@ -22,6 +22,16 @@ Basic usage :start-after: [start-sequential] :end-before: [end-sequential] +Configuration +^^^^^^^^^^^^^ + +.. py:data:: skrl.trainers.torch.sequential.SEQUENTIAL_TRAINER_DEFAULT_CONFIG + +.. literalinclude:: ../../../skrl/trainers/torch/sequential.py + :language: python + :lines: 14-17 + :linenos: + API ^^^ diff --git a/docs/source/snippets/trainer.py b/docs/source/snippets/trainer.py index 2ff4c9e4..2e54ad44 100644 --- a/docs/source/snippets/trainer.py +++ b/docs/source/snippets/trainer.py @@ -1,29 +1,39 @@ # [start-base] from typing import Union, List +import copy + from skrl.envs.torch import Wrapper # from ...envs.torch import Wrapper from skrl.agents.torch import Agent # from ...agents.torch import Agent from skrl.trainers.torch import Trainer # from . import Trainer +CUSTOM_DEFAULT_CONFIG = { + "timesteps": 100000, # number of timesteps to train for + "headless": False, # whether to use headless mode (no rendering) +} + + class CustomTrainer(Trainer): def __init__(self, - cfg: dict, env: Wrapper, agents: Union[Agent, List[Agent], List[List[Agent]]], - agents_scope : List[int] = []) -> None: + agents_scope : List[int] = [], + cfg: dict = {}) -> None: """ - :param cfg: Configuration dictionary - :type cfg: dict :param env: Environment to train on :type env: skrl.env.torch.Wrapper :param agents: Agents to train :type agents: Union[Agent, List[Agent]] :param agents_scope: Number of environments for each agent to train on (default: []) :type agents_scope: tuple or list of integers + :param cfg: Configuration dictionary + :type cfg: dict, optional """ - super().__init__(cfg, env, agents, agents_scope) + _cfg = copy.deepcopy(CUSTOM_DEFAULT_CONFIG) + _cfg.update(cfg) + super().__init__(env=env, agents=agents, agents_scope=agents_scope, cfg=_cfg) # ================================ # - init agents @@ -66,7 +76,7 @@ def eval(self) -> None: # create a sequential trainer cfg = {"timesteps": 50000, "headless": False} -trainer = SequentialTrainer(cfg=cfg, env=env, agents=agents) +trainer = SequentialTrainer(env=env, agents=agents, cfg=cfg) # train the agent(s) trainer.train() @@ -85,7 +95,7 @@ def eval(self) -> None: # create a sequential trainer cfg = {"timesteps": 50000, "headless": False} -trainer = ParallelTrainer(cfg=cfg, env=env, agents=agents) +trainer = ParallelTrainer(env=env, agents=agents, cfg=cfg) # train the agent(s) trainer.train() @@ -93,3 +103,24 @@ def eval(self) -> None: # evaluate the agent(s) trainer.eval() # [end-parallel] + +# ============================================================================= + +# [start-manual] +from skrl.trainers.torch import ManualTrainer + +# asuming there is an environment called 'env' +# and an agent or a list of agents called 'agents' + +# create a sequential trainer +cfg = {"timesteps": 50000, "headless": False} +trainer = ManualTrainer(env=env, agents=agents, cfg=cfg) + +# train the agent(s) +for timestep in range(cfg["timesteps"]): + trainer.train(timestep=timestep) + +# evaluate the agent(s) +for timestep in range(cfg["timesteps"]): + trainer.eval(timestep=timestep) +# [end-manual] \ No newline at end of file From 3cf64d83e72703c80aeda06fadc483cea5fea4cc 
Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Sun, 31 Jul 2022 23:13:15 +0200 Subject: [PATCH 031/108] Update images --- docs/source/_static/imgs/manual_trainer.svg | 1 + docs/source/_static/imgs/rl_schema.svg | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) create mode 100755 docs/source/_static/imgs/manual_trainer.svg diff --git a/docs/source/_static/imgs/manual_trainer.svg b/docs/source/_static/imgs/manual_trainer.svg new file mode 100755 index 00000000..460497a3 --- /dev/null +++ b/docs/source/_static/imgs/manual_trainer.svg @@ -0,0 +1 @@ +step environmentsenv.step(…)a0ana0ana0an.........a0an...scope0scope1scope2scope3scopemrs0sns0sns0sns0snrrrddddrenderenv.render(…)agent.record_transitions(…)agent.act(…)record transitionscompute actions. . .. . ...............................Execute each agent method sequentially (one agent after the other) in the same processscope0scope1scope2scope3scopem𝒂𝒕𝒅𝒕+𝟏𝒓𝒕+𝟏𝒔𝒕+𝟏A1A2A3. . .Amagentenvagent.post_interaction(…)agent.pre_interaction(…)post-interactionpre-interaction𝒔𝒕resetenv.reset(…)training / evaluationiteration \ No newline at end of file diff --git a/docs/source/_static/imgs/rl_schema.svg b/docs/source/_static/imgs/rl_schema.svg index ca892130..aac7fe7a 100755 --- a/docs/source/_static/imgs/rl_schema.svg +++ b/docs/source/_static/imgs/rl_schema.svg @@ -1 +1 @@ -action (𝒂𝒕)state (𝒔𝒕+𝟏)reward (𝒓𝒕+𝟏)state (𝒔𝒕)reward (𝒓𝒕)agentenv1623457 \ No newline at end of file +action (𝒂𝒕)state (𝒔𝒕+𝟏)reward (𝒓𝒕+𝟏)state (𝒔𝒕)reward (𝒓𝒕)agentenv17234586 \ No newline at end of file From d7e10a2d6f779ba73fe842b0e32a49e950c9b911 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Mon, 1 Aug 2022 00:11:45 +0200 Subject: [PATCH 032/108] Update Tensorboard log table in docs --- docs/source/intro/data.rst | 94 +++++++++++++++++++------------------- 1 file changed, 48 insertions(+), 46 deletions(-) diff --git a/docs/source/intro/data.rst b/docs/source/intro/data.rst index f5e358c4..9deef70b 100644 --- a/docs/source/intro/data.rst +++ b/docs/source/intro/data.rst @@ -51,51 +51,53 @@ To visualize the tracked metrics/scales, during or after the training, TensorBoa The following table shows the metrics/scales tracked by each agent ([**+**] all the time, [**-**] only when such a function is enabled in the agent's configuration): -+-----------+--------------------+------------------+------------------+------------------+------------------+------------------+-------------------------+------------------+--------------------+------------------+------------------+ -|Tag |Metric / Scalar |.. centered:: CEM |.. centered:: DDPG|.. centered:: DQN |.. centered:: DDQN|.. centered:: PPO |.. centered:: Q-learning |.. centered:: SAC |.. centered:: SARSA |.. centered:: TD3 |.. centered:: TRPO| -+===========+====================+==================+==================+==================+==================+==================+=========================+==================+====================+==================+==================+ -|Coefficient|Entropy coefficient | | | | | | |.. centered:: + | | | | -+ +--------------------+------------------+------------------+------------------+------------------+------------------+-------------------------+------------------+--------------------+------------------+------------------+ -| |Return threshold |.. 
centered:: + | | | | | | | | | | -+ +--------------------+------------------+------------------+------------------+------------------+------------------+-------------------------+------------------+--------------------+------------------+------------------+ -| |Mean disc. returns |.. centered:: + | | | | | | | | | | -+-----------+--------------------+------------------+------------------+------------------+------------------+------------------+-------------------------+------------------+--------------------+------------------+------------------+ -|Episode |Total timesteps |.. centered:: + |.. centered:: + |.. centered:: + |.. centered:: + |.. centered:: + |.. centered:: + |.. centered:: + |.. centered:: + |.. centered:: + |.. centered:: + | -+-----------+--------------------+------------------+------------------+------------------+------------------+------------------+-------------------------+------------------+--------------------+------------------+------------------+ -|Exploration|Exploration noise | |.. centered:: + | | | | | | |.. centered:: + | | -+ +--------------------+------------------+------------------+------------------+------------------+------------------+-------------------------+------------------+--------------------+------------------+------------------+ -| |Exploration epsilon | | |.. centered:: + |.. centered:: + | | | | | | | -+-----------+--------------------+------------------+------------------+------------------+------------------+------------------+-------------------------+------------------+--------------------+------------------+------------------+ -|Learning |Learning rate |.. centered:: -- | |.. centered:: -- |.. centered:: -- |.. centered:: -- | | | | | | -+ +--------------------+------------------+------------------+------------------+------------------+------------------+-------------------------+------------------+--------------------+------------------+------------------+ -| |Policy learning rate| |.. centered:: -- | | | | |.. centered:: -- | |.. centered:: -- | | -+ +--------------------+------------------+------------------+------------------+------------------+------------------+-------------------------+------------------+--------------------+------------------+------------------+ -| |Critic learning rate| |.. centered:: -- | | | | |.. centered:: -- | |.. centered:: -- | | -+ +--------------------+------------------+------------------+------------------+------------------+------------------+-------------------------+------------------+--------------------+------------------+------------------+ -| |Return threshold | | | | | | | | | |.. centered:: -- | -+-----------+--------------------+------------------+------------------+------------------+------------------+------------------+-------------------------+------------------+--------------------+------------------+------------------+ -|Loss |Policy loss |.. centered:: + |.. centered:: + | | |.. centered:: + | |.. centered:: + | |.. centered:: + |.. centered:: + | -+ +--------------------+------------------+------------------+------------------+------------------+------------------+-------------------------+------------------+--------------------+------------------+------------------+ -| |Critic loss | |.. centered:: + | | | | |.. centered:: + | |.. 
centered:: + | | -+ +--------------------+------------------+------------------+------------------+------------------+------------------+-------------------------+------------------+--------------------+------------------+------------------+ -| |Value loss | | | | |.. centered:: + | | | | |.. centered:: + | -+ +--------------------+------------------+------------------+------------------+------------------+------------------+-------------------------+------------------+--------------------+------------------+------------------+ -| |Entropy loss | | | | |.. centered:: -- | |.. centered:: -- | | | | -+ +--------------------+------------------+------------------+------------------+------------------+------------------+-------------------------+------------------+--------------------+------------------+------------------+ -| |Q-network loss | | |.. centered:: + |.. centered:: + | | | | | | | -+-----------+--------------------+------------------+------------------+------------------+------------------+------------------+-------------------------+------------------+--------------------+------------------+------------------+ -|Policy |Standard deviation | | | | |.. centered:: + | | | | |.. centered:: + | -+-----------+--------------------+------------------+------------------+------------------+------------------+------------------+-------------------------+------------------+--------------------+------------------+------------------+ -|Q-network |Q1 | |.. centered:: + | | | | |.. centered:: + | |.. centered:: + | | -+ +--------------------+------------------+------------------+------------------+------------------+------------------+-------------------------+------------------+--------------------+------------------+------------------+ -| |Q2 | | | | | | |.. centered:: + | |.. centered:: + | | -+-----------+--------------------+------------------+------------------+------------------+------------------+------------------+-------------------------+------------------+--------------------+------------------+------------------+ -|Reward |Instantaneous reward|.. centered:: + |.. centered:: + |.. centered:: + |.. centered:: + |.. centered:: + |.. centered:: + |.. centered:: + |.. centered:: + |.. centered:: + |.. centered:: + | -+ +--------------------+------------------+------------------+------------------+------------------+------------------+-------------------------+------------------+--------------------+------------------+------------------+ -| |Total reward |.. centered:: + |.. centered:: + |.. centered:: + |.. centered:: + |.. centered:: + |.. centered:: + |.. centered:: + |.. centered:: + |.. centered:: + |.. centered:: + | -+-----------+--------------------+------------------+------------------+------------------+------------------+------------------+-------------------------+------------------+--------------------+------------------+------------------+ -|Target |Target | |.. centered:: + |.. centered:: + |.. centered:: + | | |.. centered:: + | |.. centered:: + | | -+-----------+--------------------+------------------+------------------+------------------+------------------+------------------+-------------------------+------------------+--------------------+------------------+------------------+ ++-----------+--------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------------+------------------+--------------------+------------------+------------------+ +|Tag |Metric / Scalar |.. 
centered:: A2C |.. centered:: AMP |.. centered:: CEM |.. centered:: DDPG|.. centered:: DDQN|.. centered:: DQN |.. centered:: PPO |.. centered:: Q-learning |.. centered:: SAC |.. centered:: SARSA |.. centered:: TD3 |.. centered:: TRPO| ++===========+====================+==================+==================+==================+==================+==================+==================+==================+=========================+==================+====================+==================+==================+ +|Coefficient|Entropy coefficient | | | | | | | | |.. centered:: + | | | | ++ +--------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------------+------------------+--------------------+------------------+------------------+ +| |Return threshold | | |.. centered:: + | | | | | | | | | | ++ +--------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------------+------------------+--------------------+------------------+------------------+ +| |Mean disc. returns | | |.. centered:: + | | | | | | | | | | ++-----------+--------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------------+------------------+--------------------+------------------+------------------+ +|Episode |Total timesteps |.. centered:: + |.. centered:: + |.. centered:: + |.. centered:: + |.. centered:: + |.. centered:: + |.. centered:: + |.. centered:: + |.. centered:: + |.. centered:: + |.. centered:: + |.. centered:: + | ++-----------+--------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------------+------------------+--------------------+------------------+------------------+ +|Exploration|Exploration noise | | | |.. centered:: + | | | | | | |.. centered:: + | | ++ +--------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------------+------------------+--------------------+------------------+------------------+ +| |Exploration epsilon | | | | |.. centered:: + |.. centered:: + | | | | | | | ++-----------+--------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------------+------------------+--------------------+------------------+------------------+ +|Learning |Learning rate |.. centered:: + |.. centered:: + |.. centered:: -- | |.. centered:: -- |.. centered:: -- |.. centered:: -- | | | | | | ++ +--------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------------+------------------+--------------------+------------------+------------------+ +| |Policy learning rate| | | |.. centered:: -- | | | | |.. centered:: -- | |.. centered:: -- | | ++ +--------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------------+------------------+--------------------+------------------+------------------+ +| |Critic learning rate| | | |.. centered:: -- | | | | |.. 
centered:: -- | |.. centered:: -- | | ++ +--------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------------+------------------+--------------------+------------------+------------------+ +| |Return threshold | | | | | | | | | | | |.. centered:: -- | ++-----------+--------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------------+------------------+--------------------+------------------+------------------+ +|Loss |Critic loss | | | |.. centered:: + | | | | |.. centered:: + | |.. centered:: + | | ++ +--------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------------+------------------+--------------------+------------------+------------------+ +| |Entropy loss |.. centered:: -- |.. centered:: -- | | | | |.. centered:: -- | |.. centered:: -- | | | | ++ +--------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------------+------------------+--------------------+------------------+------------------+ +| |Discriminator loss | |.. centered:: + | | | | | | | | | | | ++ +--------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------------+------------------+--------------------+------------------+------------------+ +| |Policy loss |.. centered:: + |.. centered:: + |.. centered:: + |.. centered:: + | | |.. centered:: + | |.. centered:: + | |.. centered:: + |.. centered:: + | ++ +--------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------------+------------------+--------------------+------------------+------------------+ +| |Q-network loss | | | | |.. centered:: + |.. centered:: + | | | | | | | ++ +--------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------------+------------------+--------------------+------------------+------------------+ +| |Value loss |.. centered:: + |.. centered:: + | | | | |.. centered:: + | | | | |.. centered:: + | ++-----------+--------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------------+------------------+--------------------+------------------+------------------+ +|Policy |Standard deviation |.. centered:: + |.. centered:: + | | | | |.. centered:: + | | | | |.. centered:: + | ++-----------+--------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------------+------------------+--------------------+------------------+------------------+ +|Q-network |Q1 | | | |.. centered:: + | | | | |.. centered:: + | |.. 
centered:: + | | ++ +--------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------------+------------------+--------------------+------------------+------------------+ +| |Q2 | | | | | | | | |.. centered:: + | |.. centered:: + | | ++-----------+--------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------------+------------------+--------------------+------------------+------------------+ +|Reward |Instantaneous reward|.. centered:: + |.. centered:: + |.. centered:: + |.. centered:: + |.. centered:: + |.. centered:: + |.. centered:: + |.. centered:: + |.. centered:: + |.. centered:: + |.. centered:: + |.. centered:: + | ++ +--------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------------+------------------+--------------------+------------------+------------------+ +| |Total reward |.. centered:: + |.. centered:: + |.. centered:: + |.. centered:: + |.. centered:: + |.. centered:: + |.. centered:: + |.. centered:: + |.. centered:: + |.. centered:: + |.. centered:: + |.. centered:: + | ++-----------+--------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------------+------------------+--------------------+------------------+------------------+ +|Target |Target | | | |.. centered:: + |.. centered:: + |.. centered:: + | | |.. centered:: + | |.. centered:: + | | ++-----------+--------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------------+------------------+--------------------+------------------+------------------+ Tracking custom metrics/scales ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -163,7 +165,7 @@ The best models are updated internally on each TensorBoard writing interval :lit Loading checkpoints ^^^^^^^^^^^^^^^^^^^ -Checkpoints can be loaded for each of the instantiated models independently via the :literal:`.load(...)` method (`Model.load <../modules/skrl.models.base_class.html#skrl.models.torch.base.Model.load>`_). It accepts the path (relative or absolute) of the checkpoint to load as the only argument +Checkpoints can be loaded for each of the instantiated models independently via the :literal:`.load(...)` method (`Model.load <../modules/skrl.models.base_class.html#skrl.models.torch.base.Model.load>`_). It accepts the path (relative or absolute) of the checkpoint to load as the only argument. The checkpoint will be dynamically mapped to the device specified as argument in the class constructor (internally the torch load's :literal:`map_location` method is used during loading) .. note:: From 971c486d175798f95cc45c59d2aa33855bb3e690 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Mon, 1 Aug 2022 00:14:08 +0200 Subject: [PATCH 033/108] Update CHANGELOG --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index cd35cd6e..b48c1b53 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,9 +6,11 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). 
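The checkpoint-loading behaviour described above (remapping the stored tensors to the device passed to the model's constructor) can be reproduced with plain PyTorch. A minimal sketch, assuming a hypothetical checkpoint path::

    import torch
    import torch.nn as nn

    # toy model saved on one machine and restored on another
    model = nn.Linear(4, 2)
    torch.save(model.state_dict(), "/tmp/checkpoint.pt")

    # torch.load's map_location remaps the stored tensors to the target device,
    # which is what Model.load does internally with the constructor's device
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model.load_state_dict(torch.load("/tmp/checkpoint.pt", map_location=device))
    model.eval()  # loaded models are set to evaluation mode, as Model.load does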
### Added - AMP agent for physics-based character animation - Gaussian model +- Manual trainer ### Changed - Multivariate Gaussian model (`GaussianModel` until 0.7.0) to `MultivariateGaussianModel` +- Trainer's `cfg` parameter position and default values ## [0.7.0] - 2022-07-11 ### Added From ae81fd25ef659497b7a6db4ab5a91d0c9bc36c7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Mon, 1 Aug 2022 00:26:50 +0200 Subject: [PATCH 034/108] Add tqdm to sphinx requirements --- docs/requirements.txt | 1 + docs/source/intro/installation.rst | 1 + 2 files changed, 2 insertions(+) diff --git a/docs/requirements.txt b/docs/requirements.txt index 89459e2b..65315ffa 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -5,3 +5,4 @@ sphinx-tabs==3.2.0 gym torch tensorboard +tqdm diff --git a/docs/source/intro/installation.rst b/docs/source/intro/installation.rst index 1663cccb..3c96b006 100644 --- a/docs/source/intro/installation.rst +++ b/docs/source/intro/installation.rst @@ -11,6 +11,7 @@ Prerequisites **skrl** requires Python 3.6 or higher and the following libraries (they will be installed automatically): * `gym `_ + * `tqdm `_ * `torch `_ 1.8.0 or higher * `tensorboard `_ From 00647f1c08357ffb4a341d233ef14ef1860763d3 Mon Sep 17 00:00:00 2001 From: Johann Christensen Date: Mon, 15 Aug 2022 09:46:44 +0200 Subject: [PATCH 035/108] Support new step api from gym 0.25.0 --- skrl/envs/torch/wrappers.py | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/skrl/envs/torch/wrappers.py b/skrl/envs/torch/wrappers.py index 62110e02..c4dfb43c 100644 --- a/skrl/envs/torch/wrappers.py +++ b/skrl/envs/torch/wrappers.py @@ -89,7 +89,7 @@ def num_envs(self) -> int: def state_space(self) -> gym.Space: """State space - If the wrapped environment does not have the ``state_space`` property, + If the wrapped environment does not have the ``state_space`` property, the value of the ``observation_space`` property will be used """ return self._env.state_space if hasattr(self._env, "state_space") else self._env.observation_space @@ -115,7 +115,7 @@ def __init__(self, env: Any) -> None: :type env: Any supported Isaac Gym environment (preview 2) environment """ super().__init__(env) - + self._reset_once = True self._obs_buf = None @@ -192,7 +192,7 @@ def render(self, *args, **kwargs) -> None: """Render the environment """ pass - + def close(self) -> None: """Close the environment """ @@ -248,7 +248,7 @@ def render(self, *args, **kwargs) -> None: """Render the environment """ pass - + def close(self) -> None: """Close the environment """ @@ -271,6 +271,11 @@ def __init__(self, env: Any) -> None: except Exception as e: print("[WARNING] Failed to check for a vectorized environment: {}".format(e)) + if hasattr(self, "new_step_api"): + self._new_step_api = self._env.new_step_api + else: + self._new_step_api = False + @property def state_space(self) -> gym.Space: """State space @@ -359,13 +364,17 @@ def step(self, actions: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch :return: The state, the reward, the done flag, and the info :rtype: tuple of torch.Tensor and any other info """ - observation, reward, done, info = self._env.step(self._tensor_to_action(actions)) + if self._new_step_api: + observation, reward, termination, truncation, info = self._env.step(self._tensor_to_action(actions)) + done = termination or truncation + else: + observation, reward, done, info = self._env.step(self._tensor_to_action(actions)) # convert response to 
torch return self._observation_to_tensor(observation), \ torch.tensor(reward, device=self.device, dtype=torch.float32).view(self.num_envs, -1), \ torch.tensor(done, device=self.device, dtype=torch.bool).view(self.num_envs, -1), \ info - + def reset(self) -> torch.Tensor: """Reset the environment @@ -508,7 +517,7 @@ def step(self, actions: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch reward = timestep.reward if timestep.reward is not None else 0 done = timestep.last() info = {} - + # convert response to torch return self._observation_to_tensor(observation), \ torch.tensor(reward, device=self.device, dtype=torch.float32).view(self.num_envs, -1), \ @@ -562,7 +571,7 @@ def wrap_env(env: Any, wrapper: str = "auto", verbose: bool = True) -> Wrapper: .. raw:: html
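The gym 0.25.0 support added in the wrapper patch above boils down to collapsing the new 5-tuple step return into the old 4-tuple one. A standalone sketch of that idea (the helper name is illustrative, not part of skrl)::

    import gym
    import torch

    def normalize_step_output(result, device="cpu"):
        # gym >= 0.25.0 with new_step_api=True returns
        # (observation, reward, termination, truncation, info);
        # older versions return (observation, reward, done, info)
        if len(result) == 5:
            observation, reward, termination, truncation, info = result
            done = termination or truncation
        else:
            observation, reward, done, info = result
        # convert the scalars to torch tensors with a batch dimension, as the wrapper does
        return observation, \
               torch.tensor(reward, device=device, dtype=torch.float32).view(1, -1), \
               torch.tensor(done, device=device, dtype=torch.bool).view(1, -1), \
               info

    env = gym.make("CartPole-v1")
    env.reset()
    observation, reward, done, info = normalize_step_output(env.step(env.action_space.sample()))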
- + +--------------------+-------------------------+ |Environment |Wrapper tag | +====================+=========================+ @@ -581,9 +590,9 @@ def wrap_env(env: Any, wrapper: str = "auto", verbose: bool = True) -> Wrapper: :type wrapper: str, optional :param verbose: Whether to print the wrapper type (default: True) :type verbose: bool, optional - + :raises ValueError: Unknow wrapper type - + :return: Wrapped environment :rtype: Wrapper """ From a0b8300dddaa612ceee1c5fe16e16102cfeb3723 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Wed, 17 Aug 2022 23:12:16 +0200 Subject: [PATCH 036/108] Migrate external model's state dict to current model --- skrl/models/torch/base.py | 91 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 90 insertions(+), 1 deletion(-) diff --git a/skrl/models/torch/base.py b/skrl/models/torch/base.py index ad810ce0..9d36f45b 100644 --- a/skrl/models/torch/base.py +++ b/skrl/models/torch/base.py @@ -1,6 +1,7 @@ -from typing import Union, Tuple +from typing import Optional, Union, Mapping, Tuple import gym +import collections import numpy as np import torch @@ -304,6 +305,94 @@ def load(self, path: str) -> None: """ self.load_state_dict(torch.load(path, map_location=self.device)) self.eval() + + def migrate(self, + state_dict: Mapping[str, torch.Tensor], + name_map: Optional[Mapping[str, str]] = {}, + auto_mapping: Optional[bool] = True, + show_names: Optional[bool] = False) -> bool: + """Migrate the specified extrernal model's state dict to the current model + + :param state_dict: External model's state dict to migrate from + :type state_dict: Mapping[str, torch.Tensor] + :param name_map: Name map to use for the migration (default: {}). + Keys are the current parameter names and values are the external parameter names + :type name_map: Mapping[str, str], optional + :param auto_mapping: Automatically map the external state dict to the current state dict (default: True) + :type auto_mapping: bool, optional + :param show_names: Show the names of both, current and external state dicts parameters (default: False) + :type show_names: bool, optional + + :return: True if the migration was successful, False otherwise. 
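To make the auto-mapping behaviour concrete, here is a reduced, plain-PyTorch sketch of matching parameters by shape and name suffix between two state dicts. It is not the library implementation, which additionally honours an explicit name map and reports ambiguous, missing and duplicated matches::

    import torch.nn as nn

    # two structurally identical networks whose parameter names differ
    current = nn.Sequential(nn.Linear(4, 32), nn.ELU(), nn.Linear(32, 2))
    external = nn.Sequential()
    external.add_module("fc1", nn.Linear(4, 32))
    external.add_module("fc2", nn.Linear(32, 2))

    new_state_dict = {}
    for name, tensor in current.state_dict().items():
        for external_name, external_tensor in external.state_dict().items():
            # match by shape and by parameter suffix (.weight / .bias)
            if tensor.shape == external_tensor.shape and \
               name.split(".")[-1] == external_name.split(".")[-1]:
                new_state_dict[name] = external_tensor
                break

    current.load_state_dict(new_state_dict, strict=False)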
+ Migration is successful if all parameters of the current model are found in the external model + :rtype: bool + """ + # Show state_dict + if show_names: + print("Model migration") + print("Current state_dict:") + for name, tensor in self.state_dict().items(): + print(" |-- {} : {}".format(name, tensor.shape)) + print("Source state_dict:") + for name, tensor in state_dict.items(): + print(" |-- {} : {}".format(name, tensor.shape)) + + # migrate the state dict to current model + new_state_dict = collections.OrderedDict() + match_counter = collections.defaultdict(list) + used_counter = collections.defaultdict(list) + for name, tensor in self.state_dict().items(): + for external_name, external_tensor in state_dict.items(): + # mapped names + if name_map.get(name, "") == external_name: + if tensor.shape == external_tensor.shape: + new_state_dict[name] = external_tensor + match_counter[name].append(external_name) + used_counter[external_name].append(name) + break + else: + print("Shape mismatch for {} <- {} : {} != {}".format(name, external_name, tensor.shape, external_tensor.shape)) + # auto-mapped names + if auto_mapping: + if tensor.shape == external_tensor.shape: + if name.endswith(".weight"): + if external_name.endswith(".weight"): + new_state_dict[name] = external_tensor + match_counter[name].append(external_name) + used_counter[external_name].append(name) + elif name.endswith(".bias"): + if external_name.endswith(".bias"): + new_state_dict[name] = external_tensor + match_counter[name].append(external_name) + used_counter[external_name].append(name) + else: + if not external_name.endswith(".weight") and not external_name.endswith(".bias"): + new_state_dict[name] = external_tensor + match_counter[name].append(external_name) + used_counter[external_name].append(name) + + # show ambiguous matches + status = True + for name, tensor in self.state_dict().items(): + if len(match_counter.get(name, [])) > 1: + print("Ambiguous match for {} <- {}".format(name, match_counter.get(name, []))) + status = False + # show missing matches + for name, tensor in self.state_dict().items(): + if not match_counter.get(name, []): + print("Missing match for {}".format(name)) + status = False + # show duplicated uses + for name, tensor in state_dict.items(): + if len(used_counter.get(name, [])) > 1: + print("Duplicated use of {} -> {}".format(name, used_counter.get(name, []))) + status = False + + # load new state dict + self.load_state_dict(new_state_dict, strict=False) + self.eval() + + return status def freeze_parameters(self, freeze: bool = True) -> None: """Freeze or unfreeze internal parameters From 1a4d14aa00908c10957e16aefe790b225d84e330 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Sat, 20 Aug 2022 23:11:35 +0200 Subject: [PATCH 037/108] Add role parameter to models' act and compute methods --- skrl/models/torch/base.py | 10 ++++++++-- skrl/models/torch/categorical.py | 8 ++++++-- skrl/models/torch/deterministic.py | 8 ++++++-- skrl/models/torch/gaussian.py | 8 ++++++-- skrl/models/torch/multivariate_gaussian.py | 8 ++++++-- 5 files changed, 32 insertions(+), 10 deletions(-) diff --git a/skrl/models/torch/base.py b/skrl/models/torch/base.py index 9d36f45b..87d3b282 100644 --- a/skrl/models/torch/base.py +++ b/skrl/models/torch/base.py @@ -227,7 +227,8 @@ def forward(self): def compute(self, states: torch.Tensor, - taken_actions: Union[torch.Tensor, None] = None) -> Union[torch.Tensor, Tuple[torch.Tensor]]: + taken_actions: Union[torch.Tensor, None] = None, + role: str = 
"") -> Union[torch.Tensor, Tuple[torch.Tensor]]: """Define the computation performed (to be implemented by the inheriting classes) by the models :param states: Observation/state of the environment used to make the decision @@ -235,6 +236,8 @@ def compute(self, :param taken_actions: Actions taken by a policy to the given states (default: None). The use of these actions only makes sense in critical models, e.g. :type taken_actions: torch.Tensor or None, optional + :param role: Role of the agent (default: "") + :type role: str, optional :raises NotImplementedError: Child class must implement this method @@ -246,7 +249,8 @@ def compute(self, def act(self, states: torch.Tensor, taken_actions: Union[torch.Tensor, None] = None, - inference=False) -> Tuple[torch.Tensor]: + inference=False, + role: str = "") -> Tuple[torch.Tensor]: """Act according to the specified behavior (to be implemented by the inheriting classes) Agents will call this method to obtain the decision to be taken given the state of the environment. @@ -260,6 +264,8 @@ def act(self, :type taken_actions: torch.Tensor or None, optional :param inference: Flag to indicate whether the model is making inference (default: False) :type inference: bool, optional + :param role: Role of the agent (default: "") + :type role: str, optional :raises NotImplementedError: Child class must implement this method diff --git a/skrl/models/torch/categorical.py b/skrl/models/torch/categorical.py index 2e099187..19aa2594 100644 --- a/skrl/models/torch/categorical.py +++ b/skrl/models/torch/categorical.py @@ -39,7 +39,8 @@ def __init__(self, def act(self, states: torch.Tensor, taken_actions: Union[torch.Tensor, None] = None, - inference=False) -> Tuple[torch.Tensor]: + inference=False, + role: str = "") -> Tuple[torch.Tensor]: """Act stochastically in response to the state of the environment :param states: Observation/state of the environment used to make the decision @@ -50,6 +51,8 @@ def act(self, :param inference: Flag to indicate whether the model is making inference (default: False). If True, the returned tensors will be detached from the current graph :type inference: bool, optional + :param role: Role of the agent (default: "") + :type role: str, optional :return: Action to be taken by the agent given the state of the environment. 
The tuple's components are the actions, the log of the probability density function and the model's output @@ -58,7 +61,8 @@ def act(self, # map from states/observations to normalized probabilities or unnormalized log probabilities if self._instantiator_net is None: output = self.compute(states.to(self.device), - taken_actions.to(self.device) if taken_actions is not None else taken_actions) + taken_actions.to(self.device) if taken_actions is not None else taken_actions, + role) else: output = self._get_instantiator_output(states.to(self.device), \ taken_actions.to(self.device) if taken_actions is not None else taken_actions) diff --git a/skrl/models/torch/deterministic.py b/skrl/models/torch/deterministic.py index d96e6f64..7ee8c63b 100644 --- a/skrl/models/torch/deterministic.py +++ b/skrl/models/torch/deterministic.py @@ -40,7 +40,8 @@ def __init__(self, def act(self, states: torch.Tensor, taken_actions: Union[torch.Tensor, None] = None, - inference=False) -> Tuple[torch.Tensor]: + inference=False, + role: str = "") -> Tuple[torch.Tensor]: """Act deterministically in response to the state of the environment :param states: Observation/state of the environment used to make the decision @@ -51,6 +52,8 @@ def act(self, :param inference: Flag to indicate whether the model is making inference (default: False). If True, the returned tensors will be detached from the current graph :type inference: bool, optional + :param role: Role of the agent (default: "") + :type role: str, optional :return: Action to be taken by the agent given the state of the environment. The tuple's components are the computed actions and None for the last two components @@ -59,7 +62,8 @@ def act(self, # map from observations/states to actions if self._instantiator_net is None: actions = self.compute(states.to(self.device), - taken_actions.to(self.device) if taken_actions is not None else taken_actions) + taken_actions.to(self.device) if taken_actions is not None else taken_actions, + role) else: actions = self._get_instantiator_output(states.to(self.device), \ taken_actions.to(self.device) if taken_actions is not None else taken_actions) diff --git a/skrl/models/torch/gaussian.py b/skrl/models/torch/gaussian.py index 24b117f0..dfcbc705 100644 --- a/skrl/models/torch/gaussian.py +++ b/skrl/models/torch/gaussian.py @@ -70,7 +70,8 @@ def __init__(self, def act(self, states: torch.Tensor, taken_actions: Union[torch.Tensor, None] = None, - inference=False) -> Tuple[torch.Tensor]: + inference=False, + role: str = "") -> Tuple[torch.Tensor]: """Act stochastically in response to the state of the environment :param states: Observation/state of the environment used to make the decision @@ -81,6 +82,8 @@ def act(self, :param inference: Flag to indicate whether the model is making inference (default: False). If True, the returned tensors will be detached from the current graph :type inference: bool, optional + :param role: Role of the agent (default: "") + :type role: str, optional :return: Action to be taken by the agent given the state of the environment. 
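As a reference for the Gaussian models touched above, this is roughly what happens between the network output and the returned actions: the log standard deviation is optionally clipped, a Normal distribution is built, and the log probability is reduced over the action dimensions. A plain-PyTorch sketch with the shapes used in the docstring examples (the sum over action dimensions matches the library's default reduction)::

    import torch
    from torch.distributions import Normal

    mean_actions = torch.randn(4096, 8)   # network output for a batch of states
    log_std = torch.zeros(8)              # learnable log standard deviation

    # clip the log standard deviation (what clip_log_std=True does)
    log_std = torch.clamp(log_std, min=-20, max=2)

    distribution = Normal(mean_actions, log_std.exp())
    actions = distribution.sample()                                       # (4096, 8)
    log_prob = distribution.log_prob(actions).sum(dim=-1, keepdim=True)   # (4096, 1)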
The tuple's components are the actions, the log of the probability density function and mean actions @@ -89,7 +92,8 @@ def act(self, # map from states/observations to mean actions and log standard deviations if self._instantiator_net is None: actions_mean, log_std = self.compute(states.to(self.device), - taken_actions.to(self.device) if taken_actions is not None else taken_actions) + taken_actions.to(self.device) if taken_actions is not None else taken_actions, + role) else: actions_mean, log_std = self._get_instantiator_output(states.to(self.device), \ taken_actions.to(self.device) if taken_actions is not None else taken_actions) diff --git a/skrl/models/torch/multivariate_gaussian.py b/skrl/models/torch/multivariate_gaussian.py index 1f61f275..1b06f813 100644 --- a/skrl/models/torch/multivariate_gaussian.py +++ b/skrl/models/torch/multivariate_gaussian.py @@ -58,7 +58,8 @@ def __init__(self, def act(self, states: torch.Tensor, taken_actions: Union[torch.Tensor, None] = None, - inference=False) -> Tuple[torch.Tensor]: + inference=False, + role: str = "") -> Tuple[torch.Tensor]: """Act stochastically in response to the state of the environment :param states: Observation/state of the environment used to make the decision @@ -69,6 +70,8 @@ def act(self, :param inference: Flag to indicate whether the model is making inference (default: False). If True, the returned tensors will be detached from the current graph :type inference: bool, optional + :param role: Role of the agent (default: "") + :type role: str, optional :return: Action to be taken by the agent given the state of the environment. The tuple's components are the actions, the log of the probability density function and mean actions @@ -77,7 +80,8 @@ def act(self, # map from states/observations to mean actions and log standard deviations if self._instantiator_net is None: actions_mean, log_std = self.compute(states.to(self.device), - taken_actions.to(self.device) if taken_actions is not None else taken_actions) + taken_actions.to(self.device) if taken_actions is not None else taken_actions, + role) else: actions_mean, log_std = self._get_instantiator_output(states.to(self.device), \ taken_actions.to(self.device) if taken_actions is not None else taken_actions) From bb78edb11b397c6e43a47b2ecf6d09e41b7e6409 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Sun, 21 Aug 2022 23:33:15 +0200 Subject: [PATCH 038/108] Improve model annotations and docstrings --- skrl/models/torch/base.py | 246 +++++++++++++++------ skrl/models/torch/categorical.py | 89 ++++++-- skrl/models/torch/deterministic.py | 83 +++++-- skrl/models/torch/gaussian.py | 116 +++++++--- skrl/models/torch/multivariate_gaussian.py | 110 +++++++-- 5 files changed, 487 insertions(+), 157 deletions(-) diff --git a/skrl/models/torch/base.py b/skrl/models/torch/base.py index 87d3b282..a53bff52 100644 --- a/skrl/models/torch/base.py +++ b/skrl/models/torch/base.py @@ -1,4 +1,4 @@ -from typing import Optional, Union, Mapping, Tuple +from typing import Optional, Union, Mapping, Sequence import gym import collections @@ -9,29 +9,45 @@ class Model(torch.nn.Module): def __init__(self, - observation_space: Union[int, Tuple[int], gym.Space, None] = None, - action_space: Union[int, Tuple[int], gym.Space, None] = None, + observation_space: Union[int, Sequence[int], gym.Space], + action_space: Union[int, Sequence[int], gym.Space], device: Union[str, torch.device] = "cuda:0") -> None: """Base class representing a function approximator The following properties are 
defined: - ``device`` (torch.device): Device to be used for the computations - - ``observation_space`` (int, tuple or list of integers, gym.Space or None): Observation/state space - - ``action_space`` (int, tuple or list of integers, gym.Space or None): Action space - - ``num_observations`` (int or None): Number of elements in the observation/state space - - ``num_actions`` (int or None): Number of elements in the action space + - ``observation_space`` (int, sequence of int, gym.Space): Observation/state space + - ``action_space`` (int, sequence of int, gym.Space): Action space + - ``num_observations`` (int): Number of elements in the observation/state space + - ``num_actions`` (int): Number of elements in the action space - :param observation_space: Observation/state space or shape (default: None). - If it is not None, the num_observations property will contain the size of that space - :type observation_space: int, tuple or list of integers, gym.Space or None, optional - :param action_space: Action space or shape (default: None). - If it is not None, the num_actions property will contain the size of that space - :type action_space: int, tuple or list of integers, gym.Space or None, optional - :param device: Device on which a torch tensor is or will be allocated (default: "cuda:0") + :param observation_space: Observation/state space or shape. + The ``num_observations`` property will contain the size of that space + :type observation_space: int, sequence of int, gym.Space + :param action_space: Action space or shape. + The ``num_actions`` property will contain the size of that space + :type action_space: int, sequence of int, gym.Space + :param device: Device on which a torch tensor is or will be allocated (default: ``"cuda:0"``) :type device: str or torch.device, optional + + Custom models should override the ``act`` method:: + + import torch + from skrl.models.torch import Model + + class CustomModel(Model): + def __init__(self, observation_space, action_space, device="cuda:0"): + super().__init__(observation_space, action_space, device) + + self.layer_1 = nn.Linear(self.num_observations, 64) + self.layer_2 = nn.Linear(64, self.num_actions) + + def act(self, states, taken_actions=None, inference=False, role=""): + x = F.relu(self.layer_1(states)) + x = F.relu(self.layer_2(x)) + return x """ - # TODO: export to onnx (https://pytorch.org/tutorials/advanced/super_resolution_with_onnxruntime.html) super(Model, self).__init__() self.device = torch.device(device) @@ -51,7 +67,7 @@ def __init__(self, def _get_instantiator_output(self, states: torch.Tensor, - taken_actions: Union[torch.Tensor, None] = None) -> Tuple[torch.Tensor]: + taken_actions: Optional[torch.Tensor] = None) -> Sequence[torch.Tensor]: """Get the output of the instantiator model Input shape depends on the instantiator (see skrl.utils.model_instantiator.Shape) as follows: @@ -62,11 +78,11 @@ def _get_instantiator_output(self, :param states: Observation/state of the environment used to make the decision :type states: torch.Tensor - :param taken_actions: Actions taken by a policy to the given states (default: None) + :param taken_actions: Actions taken by a policy to the given states (default: ``None``) :type taken_actions: torch.Tensor, optional :return: Output of the instantiator model - :rtype: tuple of torch.Tensor + :rtype: sequence of torch.Tensor """ if self._instantiator_input_type == 0: output = self._instantiator_net(states) @@ -82,16 +98,51 @@ def _get_instantiator_output(self, else: return output * 
self._instantiator_output_scale, self._instantiator_parameter - def _get_space_size(self, space: Union[int, Tuple[int], gym.Space]) -> int: + def _get_space_size(self, + space: Union[int, Sequence[int], gym.Space], + number_of_elements: bool = True) -> int: """Get the size (number of elements) of a space :param space: Space or shape from which to obtain the number of elements - :type space: int, tuple or list of integers, or gym.Space + :type space: int, sequence of int, or gym.Space + :param number_of_elements: Whether the number of elements occupied by the space is returned (default: ``True``). + If ``False``, the shape of the space is returned. It only affects Discrete spaces + :type number_of_elements: bool, optional :raises ValueError: If the space is not supported :return: Size of the space (number of elements) :rtype: int + + Example:: + + # from int + >>> model._get_space_size(2) + 2 + + # from sequence of int + >>> model._get_space_size([2, 3]) + 6 + + # Box space + >>> space = gym.spaces.Box(low=-1, high=1, shape=(2, 3)) + >>> model._get_space_size(space) + 6 + + # Discrete space + >>> space = gym.spaces.Discrete(4) + >>> model._get_space_size(space) + 4 + >>> model._get_space_size(space, number_of_elements=False) + 1 + + # Dict space + >>> space = gym.spaces.Dict({'a': gym.spaces.Box(low=-1, high=1, shape=(2, 3)), + ... 'b': gym.spaces.Discrete(4)}) + >>> model._get_space_size(space) + 10 + >>> model._get_space_size(space, number_of_elements=False) + 7 """ size = None if type(space) in [int, float]: @@ -100,16 +151,22 @@ def _get_space_size(self, space: Union[int, Tuple[int], gym.Space]) -> int: size = np.prod(space) elif issubclass(type(space), gym.Space): if issubclass(type(space), gym.spaces.Discrete): - size = space.n + if number_of_elements: + size = space.n + else: + size = 1 elif issubclass(type(space), gym.spaces.Box): size = np.prod(space.shape) elif issubclass(type(space), gym.spaces.Dict): - size = sum([self._get_space_size(space.spaces[key]) for key in space.spaces]) + size = sum([self._get_space_size(space.spaces[key], number_of_elements) for key in space.spaces]) if size is None: raise ValueError("Space type {} not supported".format(type(space))) return int(size) - def tensor_to_space(self, tensor: torch.Tensor, space: gym.Space, start: int = 0) -> Union[torch.Tensor, dict]: + def tensor_to_space(self, + tensor: torch.Tensor, + space: gym.Space, + start: int = 0) -> Union[torch.Tensor, dict]: """Map a flat tensor to a Gym space The mapping is done in the following way: @@ -119,17 +176,28 @@ def tensor_to_space(self, tensor: torch.Tensor, space: gym.Space, start: int = 0 keeping the first dimension (number of samples) as they are - Tensors belonging to Dict spaces are mapped into a dictionary with the same keys as the original space - :param tensor: Tensor to map + :param tensor: Tensor to map from :type tensor: torch.Tensor :param space: Space to map the tensor to :type space: gym.Space - :param start: Index of the first element of the tensor to map (default: 0) + :param start: Index of the first element of the tensor to map (default: ``0``) :type start: int, optional :raises ValueError: If the space is not supported :return: Mapped tensor or dictionary :rtype: torch.Tensor or dict + + Example:: + + >>> space = gym.spaces.Dict({'a': gym.spaces.Box(low=-1, high=1, shape=(2, 3)), + ... 
'b': gym.spaces.Discrete(4)}) + >>> tensor = torch.tensor([[-0.3, -0.2, -0.1, 0.1, 0.2, 0.3, 2]]) + >>> + >>> model.tensor_to_space(tensor, space) + {'a': tensor([[[-0.3000, -0.2000, -0.1000], + [ 0.1000, 0.2000, 0.3000]]]), + 'b': tensor([[2.]])} """ if issubclass(type(space), gym.spaces.Discrete): return tensor @@ -138,7 +206,7 @@ def tensor_to_space(self, tensor: torch.Tensor, space: gym.Space, start: int = 0 elif issubclass(type(space), gym.spaces.Dict): output = {} for k in sorted(space.keys()): - end = start + self._get_space_size(space[k]) + end = start + self._get_space_size(space[k], number_of_elements=False) output[k] = self.tensor_to_space(tensor[:, start:end], space[k], end) start = end return output @@ -146,22 +214,25 @@ def tensor_to_space(self, tensor: torch.Tensor, space: gym.Space, start: int = 0 def random_act(self, states: torch.Tensor, - taken_actions: Union[torch.Tensor, None] = None, - inference=False) -> Tuple[torch.Tensor]: + taken_actions: Optional[torch.Tensor] = None, + inference: bool = False, + role: str = "") -> Sequence[torch.Tensor]: """Act randomly according to the action space :param states: Observation/state of the environment used to get the shape of the action space :type states: torch.Tensor - :param taken_actions: Actions taken by a policy to the given states (default: None). + :param taken_actions: Actions taken by a policy to the given states (default: ``None``). The use of these actions only makes sense in critical models, e.g. - :type taken_actions: torch.Tensor or None, optional - :param inference: Flag to indicate whether the model is making inference (default: False) + :type taken_actions: torch.Tensor, optional + :param inference: Flag to indicate whether the model is making inference (default: ``False``) :type inference: bool, optional + :param role: Role of the model (default: ``""``) + :type role: str, optional :raises NotImplementedError: Unsupported action space :return: Random actions to be taken by the agent - :rtype: tuple of torch.Tensor + :rtype: sequence of torch.Tensor """ # discrete action space (Discrete) if issubclass(type(self.action_space), gym.spaces.Discrete): @@ -183,12 +254,20 @@ def init_parameters(self, method_name: str = "normal_", *args, **kwargs) -> None Method names are from the `torch.nn.init `_ module. Allowed method names are *uniform_*, *normal_*, *constant_*, etc. 
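The name-based dispatch used by ``init_parameters`` (and ``init_weights``) can also be expressed with ``getattr`` instead of ``exec``; a small sketch of the same idea, not the library code::

    import torch
    import torch.nn as nn

    model = nn.Sequential(nn.Linear(4, 32), nn.ELU(), nn.Linear(32, 2))

    def init_parameters_by_name(module, method_name="normal_", *args, **kwargs):
        # look up the initializer in torch.nn.init by name and apply it to every parameter
        method = getattr(torch.nn.init, method_name)
        for parameters in module.parameters():
            method(parameters, *args, **kwargs)

    init_parameters_by_name(model, "normal_", mean=0.0, std=0.1)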
- :param method_name: `torch.nn.init `_ method name (default: "normal\_") + :param method_name: `torch.nn.init `_ method name (default: ``"normal_"``) :type method_name: str, optional :param args: Positional arguments of the method to be called :type args: tuple, optional :param kwargs: Key-value arguments of the method to be called :type kwargs: dict, optional + + Example:: + + # initialize all parameters with an orthogonal distribution with a gain of 0.5 + >>> model.init_parameters("orthogonal_", gain=0.5) + + # initialize all parameters as a sparse matrix with a sparsity of 0.1 + >>> model.init_parameters("sparse_", sparsity=0.1) """ for parameters in self.parameters(): exec("torch.nn.init.{}(parameters, *args, **kwargs)".format(method_name)) @@ -202,12 +281,20 @@ def init_weights(self, method_name: str = "orthogonal_", *args, **kwargs) -> Non The following layers will be initialized: - torch.nn.Linear - :param method_name: `torch.nn.init `_ method name (default: "orthogonal\_") + :param method_name: `torch.nn.init `_ method name (default: ``"orthogonal_"``) :type method_name: str, optional :param args: Positional arguments of the method to be called :type args: tuple, optional :param kwargs: Key-value arguments of the method to be called :type kwargs: dict, optional + + Example:: + + # initialize all weights with uniform distribution in range [-0.1, 0.1] + >>> model.init_weights(method_name="uniform_", a=-0.1, b=0.1) + + # initialize all weights with normal distribution with mean 0 and standard deviation 0.25 + >>> model.init_weights(method_name="normal_", mean=0.0, std=0.25) """ def _update_weights(module, method_name, args, kwargs): for layer in module: @@ -227,30 +314,30 @@ def forward(self): def compute(self, states: torch.Tensor, - taken_actions: Union[torch.Tensor, None] = None, - role: str = "") -> Union[torch.Tensor, Tuple[torch.Tensor]]: + taken_actions: Optional[torch.Tensor] = None, + role: str = "") -> Union[torch.Tensor, Sequence[torch.Tensor]]: """Define the computation performed (to be implemented by the inheriting classes) by the models :param states: Observation/state of the environment used to make the decision :type states: torch.Tensor - :param taken_actions: Actions taken by a policy to the given states (default: None). + :param taken_actions: Actions taken by a policy to the given states (default: ``None``). The use of these actions only makes sense in critical models, e.g. - :type taken_actions: torch.Tensor or None, optional - :param role: Role of the agent (default: "") + :type taken_actions: torch.Tensor, optional + :param role: Role of the model (default: ``""``) :type role: str, optional :raises NotImplementedError: Child class must implement this method :return: Computation performed by the models - :rtype: torch.Tensor or tuple of torch.Tensor + :rtype: torch.Tensor or sequence of torch.Tensor """ raise NotImplementedError("The computation performed by the models (.compute()) is not implemented") def act(self, states: torch.Tensor, - taken_actions: Union[torch.Tensor, None] = None, - inference=False, - role: str = "") -> Tuple[torch.Tensor]: + taken_actions: Optional[torch.Tensor] = None, + inference: bool = False, + role: str = "") -> Sequence[torch.Tensor]: """Act according to the specified behavior (to be implemented by the inheriting classes) Agents will call this method to obtain the decision to be taken given the state of the environment. 
@@ -259,31 +346,31 @@ def act(self, :param states: Observation/state of the environment used to make the decision :type states: torch.Tensor - :param taken_actions: Actions taken by a policy to the given states (default: None). + :param taken_actions: Actions taken by a policy to the given states (default: ``None``). The use of these actions only makes sense in critical models, e.g. - :type taken_actions: torch.Tensor or None, optional - :param inference: Flag to indicate whether the model is making inference (default: False) + :type taken_actions: torch.Tensor, optional + :param inference: Flag to indicate whether the model is making inference (default: ``False``) :type inference: bool, optional - :param role: Role of the agent (default: "") + :param role: Role of the model (default: ``""``) :type role: str, optional :raises NotImplementedError: Child class must implement this method :return: Action to be taken by the agent given the state of the environment. - The typical tuple's components are the actions, the log of the probability density function and mean actions. + The typical sequence's components are the actions, the log of the probability density function and mean actions. Deterministic agents must ignore the last two components and return empty tensors or None for them - :rtype: tuple of torch.Tensor + :rtype: sequence of torch.Tensor """ raise NotImplementedError("The action to be taken by the agent (.act()) is not implemented") def set_mode(self, mode: str) -> None: """Set the model mode (training or evaluation) - :param mode: Mode: "train" for training or "eval" for evaluation. + :param mode: Mode: ``"train"`` for training or ``"eval"`` for evaluation. See `torch.nn.Module.train `_ :type mode: str - :raises ValueError: Mode must be ``"train"`` or ``"eval"`` + :raises ValueError: If the mode is not ``"train"`` or ``"eval"`` """ if mode == "train": self.train(True) @@ -292,41 +379,64 @@ def set_mode(self, mode: str) -> None: else: raise ValueError("Invalid mode. Use 'train' for training or 'eval' for evaluation") - def save(self, path: str, state_dict: Union[dict, None] = None) -> None: + def save(self, path: str, state_dict: Optional[dict] = None) -> None: """Save the model to the specified path :param path: Path to save the model to :type path: str - :param state_dict: State dictionary to save (default: None). + :param state_dict: State dictionary to save (default: ``None``). If None, the model's state_dict will be saved :type state_dict: dict, optional + + Example:: + + # save the current model to the specified path + >>> model.save("/tmp/model.pt") + + # save an older version of the model to the specified path + >>> old_state_dict = copy.deepcopy(model.state_dict()) + >>> # ... 
+ >>> model.save("/tmp/model.pt", old_state_dict) + """ torch.save(self.state_dict() if state_dict is None else state_dict, path) def load(self, path: str) -> None: """Load the model from the specified path - + + The final storage device is determined by the constructor of the model + :param path: Path to load the model from :type path: str + + Example:: + + # load the model onto the CPU + >>> model = Model(observation_space, action_space, device="cpu") + >>> model.load("model.pt") + + # load the model onto the GPU 1 + >>> model = Model(observation_space, action_space, device="cuda:1") + >>> model.load("model.pt") """ self.load_state_dict(torch.load(path, map_location=self.device)) self.eval() def migrate(self, state_dict: Mapping[str, torch.Tensor], - name_map: Optional[Mapping[str, str]] = {}, - auto_mapping: Optional[bool] = True, - show_names: Optional[bool] = False) -> bool: + name_map: Mapping[str, str] = {}, + auto_mapping: bool = True, + show_names: bool = False) -> bool: """Migrate the specified extrernal model's state dict to the current model :param state_dict: External model's state dict to migrate from :type state_dict: Mapping[str, torch.Tensor] - :param name_map: Name map to use for the migration (default: {}). + :param name_map: Name map to use for the migration (default: ``{}``). Keys are the current parameter names and values are the external parameter names :type name_map: Mapping[str, str], optional - :param auto_mapping: Automatically map the external state dict to the current state dict (default: True) + :param auto_mapping: Automatically map the external state dict to the current state dict (default: ``True``) :type auto_mapping: bool, optional - :param show_names: Show the names of both, current and external state dicts parameters (default: False) + :param show_names: Show the names of both, current and external state dicts parameters (default: ``False``) :type show_names: bool, optional :return: True if the migration was successful, False otherwise. @@ -406,7 +516,7 @@ def freeze_parameters(self, freeze: bool = True) -> None: - Freeze: disable gradient computation (``parameters.requires_grad = False``) - Unfreeze: enable gradient computation (``parameters.requires_grad = True``) - :param freeze: Freeze the internal parameters if True, otherwise unfreeze them + :param freeze: Freeze the internal parameters if True, otherwise unfreeze them (default: ``True``) :type freeze: bool, optional """ for parameters in self.parameters(): @@ -417,12 +527,20 @@ def update_parameters(self, model: torch.nn.Module, polyak: float = 1) -> None: - Hard update: :math:`\\theta = \\theta_{net}` - Soft (polyak averaging) update: :math:`\\theta = (1 - \\rho) \\theta + \\rho \\theta_{net}` - + :param model: Model used to update the internal parameters :type model: torch.nn.Module (skrl.models.torch.Model) - :param polyak: Polyak hyperparameter between 0 and 1 (usually close to 0). - A hard update is performed when its value is 1 (default) + :param polyak: Polyak hyperparameter between 0 and 1 (default: ``1``). 
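The soft (polyak averaging) update formula above corresponds to the following plain-PyTorch sketch; it is illustrative only, and the library method also covers the hard-update case::

    import torch
    import torch.nn as nn

    target, source = nn.Linear(4, 2), nn.Linear(4, 2)

    polyak = 0.005
    with torch.no_grad():
        for target_param, source_param in zip(target.parameters(), source.parameters()):
            # theta = (1 - rho) * theta + rho * theta_net
            target_param.data.mul_(1 - polyak)
            target_param.data.add_(polyak * source_param.data)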
+ A hard update is performed when its value is 1 :type polyak: float, optional + + Example:: + + # hard update (from source model) + >>> model.update_parameters(source_model) + + # soft update (from source model) + >>> model.update_parameters(source_model, polyak=0.005) """ with torch.no_grad(): # hard update diff --git a/skrl/models/torch/categorical.py b/skrl/models/torch/categorical.py index 19aa2594..ab74a029 100644 --- a/skrl/models/torch/categorical.py +++ b/skrl/models/torch/categorical.py @@ -1,4 +1,4 @@ -from typing import Union, Tuple +from typing import Optional, Union, Sequence import gym @@ -10,25 +10,60 @@ class CategoricalModel(Model): def __init__(self, - observation_space: Union[int, Tuple[int], gym.Space, None] = None, - action_space: Union[int, Tuple[int], gym.Space, None] = None, + observation_space: Union[int, Sequence[int], gym.Space], + action_space: Union[int, Sequence[int], gym.Space], device: Union[str, torch.device] = "cuda:0", unnormalized_log_prob: bool = True) -> None: """Categorical model (stochastic model) - :param observation_space: Observation/state space or shape (default: None). - If it is not None, the num_observations property will contain the size of that space - :type observation_space: int, tuple or list of integers, gym.Space or None, optional - :param action_space: Action space or shape (default: None). - If it is not None, the num_actions property will contain the size of that space - :type action_space: int, tuple or list of integers, gym.Space or None, optional - :param device: Device on which a torch tensor is or will be allocated (default: "cuda:0") + :param observation_space: Observation/state space or shape. + The ``num_observations`` property will contain the size of that space + :type observation_space: int, sequence of int, gym.Space + :param action_space: Action space or shape. + The ``num_actions`` property will contain the size of that space + :type action_space: int, sequence of int, gym.Space + :param device: Device on which a torch tensor is or will be allocated (default: ``"cuda:0"``) :type device: str or torch.device, optional - :param unnormalized_log_prob: Flag to indicate how to be interpreted the model's output (default: True). + :param unnormalized_log_prob: Flag to indicate how to be interpreted the model's output (default: ``True``). If True, the model's output is interpreted as unnormalized log probabilities (it can be any real number), otherwise as normalized probabilities (the output must be non-negative, finite and have a non-zero sum) :type unnormalized_log_prob: bool, optional + + Example:: + + # define the model + >>> import torch + >>> import torch.nn as nn + >>> from skrl.models.torch import CategoricalModel + >>> + >>> class Policy(CategoricalModel): + ... def __init__(self, observation_space, action_space, device, unnormalized_log_prob=True): + ... super().__init__(observation_space, action_space, device, unnormalized_log_prob) + ... + ... self.net = nn.Sequential(nn.Linear(self.num_observations, 32), + ... nn.ELU(), + ... nn.Linear(32, 32), + ... nn.ELU(), + ... nn.Linear(32, self.num_actions)) + ... + ... def compute(self, states, taken_actions, role): + ... return self.net(states) + ... 
+ >>> # given an observation_space: gym.spaces.Box with shape (4,) + >>> # and an action_space: gym.spaces.Discrete with n = 2 + >>> model = Policy(observation_space, action_space) + >>> + >>> print(model) + Policy( + (net): Sequential( + (0): Linear(in_features=4, out_features=32, bias=True) + (1): ELU(alpha=1.0) + (2): Linear(in_features=32, out_features=32, bias=True) + (3): ELU(alpha=1.0) + (4): Linear(in_features=32, out_features=2, bias=True) + ) + ) """ super(CategoricalModel, self).__init__(observation_space, action_space, device) @@ -38,25 +73,31 @@ def __init__(self, def act(self, states: torch.Tensor, - taken_actions: Union[torch.Tensor, None] = None, - inference=False, - role: str = "") -> Tuple[torch.Tensor]: + taken_actions: Optional[torch.Tensor] = None, + inference: bool = False, + role: str = "") -> Sequence[torch.Tensor]: """Act stochastically in response to the state of the environment :param states: Observation/state of the environment used to make the decision :type states: torch.Tensor - :param taken_actions: Actions taken by a policy to the given states (default: None). + :param taken_actions: Actions taken by a policy to the given states (default: ``None``). The use of these actions only makes sense in critical models, e.g. - :type taken_actions: torch.Tensor or None, optional - :param inference: Flag to indicate whether the model is making inference (default: False). - If True, the returned tensors will be detached from the current graph + :type taken_actions: torch.Tensor, optional + :param inference: Flag to indicate whether the model is making inference (default: ``False``) :type inference: bool, optional - :param role: Role of the agent (default: "") + :param role: Role of the model (default: ``""``) :type role: str, optional :return: Action to be taken by the agent given the state of the environment. 
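The ``unnormalized_log_prob`` flag documented above maps directly onto how ``torch.distributions.Categorical`` is constructed; a minimal sketch independent of skrl::

    import torch
    from torch.distributions import Categorical

    net_output = torch.randn(4096, 2)  # raw network output for 2 discrete actions

    # unnormalized_log_prob=True: the output is treated as logits (any real number)
    distribution = Categorical(logits=net_output)

    # unnormalized_log_prob=False: the output must be non-negative, finite probabilities
    distribution = Categorical(probs=torch.softmax(net_output, dim=-1))

    actions = distribution.sample()            # (4096,)
    log_prob = distribution.log_prob(actions)  # (4096,)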
- The tuple's components are the actions, the log of the probability density function and the model's output - :rtype: tuple of torch.Tensor + The sequence's components are the actions, the log of the probability density function and the model's output + :rtype: sequence of torch.Tensor + + Example:: + + >>> # given a batch of sample states with shape (4096, 4) + >>> action, log_prob, net_output = model.act(states) + >>> print(action.shape, log_prob.shape, net_output.shape) + torch.Size([4096, 1]) torch.Size([4096, 1]) torch.Size([4096, 2]) """ # map from states/observations to normalized probabilities or unnormalized log probabilities if self._instantiator_net is None: @@ -87,5 +128,11 @@ def distribution(self) -> torch.distributions.Categorical: :return: Distribution of the model :rtype: torch.distributions.Categorical + + Example:: + + >>> distribution = model.distribution() + >>> print(distribution) + Categorical(probs: torch.Size([4096, 2]), logits: torch.Size([4096, 2])) """ return self._distribution \ No newline at end of file diff --git a/skrl/models/torch/deterministic.py b/skrl/models/torch/deterministic.py index 7ee8c63b..634ef791 100644 --- a/skrl/models/torch/deterministic.py +++ b/skrl/models/torch/deterministic.py @@ -1,4 +1,4 @@ -from typing import Union, Tuple +from typing import Optional, Union, Sequence import gym @@ -9,22 +9,57 @@ class DeterministicModel(Model): def __init__(self, - observation_space: Union[int, Tuple[int], gym.Space, None] = None, - action_space: Union[int, Tuple[int], gym.Space, None] = None, + observation_space: Union[int, Sequence[int], gym.Space], + action_space: Union[int, Sequence[int], gym.Space], device: Union[str, torch.device] = "cuda:0", clip_actions: bool = False) -> None: """Deterministic model (deterministic model) - :param observation_space: Observation/state space or shape (default: None). - If it is not None, the num_observations property will contain the size of that space - :type observation_space: int, tuple or list of integers, gym.Space or None, optional - :param action_space: Action space or shape (default: None). - If it is not None, the num_actions property will contain the size of that space - :type action_space: int, tuple or list of integers, gym.Space or None, optional - :param device: Device on which a torch tensor is or will be allocated (default: "cuda:0") + :param observation_space: Observation/state space or shape. + The ``num_observations`` property will contain the size of that space + :type observation_space: int, sequence of int, gym.Space + :param action_space: Action space or shape. + The ``num_actions`` property will contain the size of that space + :type action_space: int, sequence of int, gym.Space + :param device: Device on which a torch tensor is or will be allocated (default: ``"cuda:0"``) :type device: str or torch.device, optional - :param clip_actions: Flag to indicate whether the actions should be clipped to the action space (default: False) + :param clip_actions: Flag to indicate whether the actions should be clipped to the action space (default: ``False``) :type clip_actions: bool, optional + + Example:: + + # define the model + >>> import torch + >>> import torch.nn as nn + >>> from skrl.models.torch import DeterministicModel + >>> + >>> class Value(DeterministicModel): + ... def __init__(self, observation_space, action_space, device, clip_actions=False): + ... super().__init__(observation_space, action_space, device, clip_actions) + ... + ... 
self.net = nn.Sequential(nn.Linear(self.num_observations, 32), + ... nn.ELU(), + ... nn.Linear(32, 32), + ... nn.ELU(), + ... nn.Linear(32, 1)) + ... + ... def compute(self, states, taken_actions, role): + ... return self.net(states) + ... + >>> # given an observation_space: gym.spaces.Box with shape (60,) + >>> # and an action_space: gym.spaces.Box with shape (8,) + >>> model = Value(observation_space, action_space) + >>> + >>> print(model) + Value( + (net): Sequential( + (0): Linear(in_features=60, out_features=32, bias=True) + (1): ELU(alpha=1.0) + (2): Linear(in_features=32, out_features=32, bias=True) + (3): ELU(alpha=1.0) + (4): Linear(in_features=32, out_features=1, bias=True) + ) + ) """ super(DeterministicModel, self).__init__(observation_space, action_space, device) @@ -39,25 +74,31 @@ def __init__(self, def act(self, states: torch.Tensor, - taken_actions: Union[torch.Tensor, None] = None, - inference=False, - role: str = "") -> Tuple[torch.Tensor]: + taken_actions: Optional[torch.Tensor] = None, + inference: bool = False, + role: str = "") -> Sequence[torch.Tensor]: """Act deterministically in response to the state of the environment :param states: Observation/state of the environment used to make the decision :type states: torch.Tensor - :param taken_actions: Actions taken by a policy to the given states (default: None). + :param taken_actions: Actions taken by a policy to the given states (default: ``None``). The use of these actions only makes sense in critical models, e.g. - :type taken_actions: torch.Tensor or None, optional - :param inference: Flag to indicate whether the model is making inference (default: False). - If True, the returned tensors will be detached from the current graph + :type taken_actions: torch.Tensor, optional + :param inference: Flag to indicate whether the model is making inference (default: ``False``) :type inference: bool, optional - :param role: Role of the agent (default: "") + :param role: Role of the model (default: ``""``) :type role: str, optional :return: Action to be taken by the agent given the state of the environment. - The tuple's components are the computed actions and None for the last two components - :rtype: tuple of torch.Tensor + The sequence's components are the computed actions and None for the last two components + :rtype: sequence of torch.Tensor + + Example:: + + >>> # given a batch of sample states with shape (4096, 60) + >>> output = model.act(states) + >>> print(output[0].shape, output[1], output[2]) + torch.Size([4096, 1]) None None """ # map from observations/states to actions if self._instantiator_net is None: diff --git a/skrl/models/torch/gaussian.py b/skrl/models/torch/gaussian.py index dfcbc705..f8cb3203 100644 --- a/skrl/models/torch/gaussian.py +++ b/skrl/models/torch/gaussian.py @@ -1,4 +1,4 @@ -from typing import Union, Tuple +from typing import Optional, Union, Sequence import gym @@ -10,8 +10,8 @@ class GaussianModel(Model): def __init__(self, - observation_space: Union[int, Tuple[int], gym.Space, None] = None, - action_space: Union[int, Tuple[int], gym.Space, None] = None, + observation_space: Union[int, Sequence[int], gym.Space], + action_space: Union[int, Sequence[int], gym.Space], device: Union[str, torch.device] = "cuda:0", clip_actions: bool = False, clip_log_std: bool = True, @@ -20,28 +20,66 @@ def __init__(self, reduction: str = "sum") -> None: """Gaussian model (stochastic model) - :param observation_space: Observation/state space or shape (default: None). 
- If it is not None, the num_observations property will contain the size of that space - :type observation_space: int, tuple or list of integers, gym.Space or None, optional - :param action_space: Action space or shape (default: None). - If it is not None, the num_actions property will contain the size of that space - :type action_space: int, tuple or list of integers, gym.Space or None, optional - :param device: Device on which a torch tensor is or will be allocated (default: "cuda:0") + :param observation_space: Observation/state space or shape. + The ``num_observations`` property will contain the size of that space + :type observation_space: int, sequence of int, gym.Space + :param action_space: Action space or shape. + The ``num_actions`` property will contain the size of that space + :type action_space: int, sequence of int, gym.Space + :param device: Device on which a torch tensor is or will be allocated (default: ``"cuda:0"``) :type device: str or torch.device, optional - :param clip_actions: Flag to indicate whether the actions should be clipped to the action space (default: False) + :param clip_actions: Flag to indicate whether the actions should be clipped to the action space (default: ``False``) :type clip_actions: bool, optional - :param clip_log_std: Flag to indicate whether the log standard deviations should be clipped (default: True) + :param clip_log_std: Flag to indicate whether the log standard deviations should be clipped (default: ``True``) :type clip_log_std: bool, optional - :param min_log_std: Minimum value of the log standard deviation if clip_log_std is True (default: -20) + :param min_log_std: Minimum value of the log standard deviation if ``clip_log_std`` is True (default: ``-20``) :type min_log_std: float, optional - :param max_log_std: Maximum value of the log standard deviation if clip_log_std is True (default: 2) + :param max_log_std: Maximum value of the log standard deviation if ``clip_log_std`` is True (default: ``2``) :type max_log_std: float, optional - :param reduction: Reduction method for returning the log probability density function: (default: "sum"). - Supported values are "mean", "sum", "prod" and "none". If "none", the log probability density - function is returned as a tensor of shape (num_samples, num_actions) instead of (num_samples, 1) + :param reduction: Reduction method for returning the log probability density function: (default: ``"sum"``). + Supported values are ``"mean"``, ``"sum"``, ``"prod"`` and ``"none"``. If "``none"``, the log probability density + function is returned as a tensor of shape ``(num_samples, num_actions)`` instead of ``(num_samples, 1)`` :type reduction: str, optional :raises ValueError: If the reduction method is not valid + + Example:: + + # define the model + >>> import torch + >>> import torch.nn as nn + >>> from skrl.models.torch import GaussianModel + >>> + >>> class Policy(GaussianModel): + ... def __init__(self, observation_space, action_space, device, clip_actions=False, + ... clip_log_std=True, min_log_std=-20, max_log_std=2): + ... super().__init__(observation_space, action_space, device, clip_actions, + ... clip_log_std, min_log_std, max_log_std) + ... + ... self.net = nn.Sequential(nn.Linear(self.num_observations, 32), + ... nn.ELU(), + ... nn.Linear(32, 32), + ... nn.ELU(), + ... nn.Linear(32, self.num_actions)) + ... self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + ... + ... def compute(self, states, taken_actions, role): + ... return self.net(states), self.log_std_parameter + ... 
+ >>> # given an observation_space: gym.spaces.Box with shape (60,) + >>> # and an action_space: gym.spaces.Box with shape (8,) + >>> model = Policy(observation_space, action_space) + >>> + >>> print(model) + Policy( + (net): Sequential( + (0): Linear(in_features=60, out_features=32, bias=True) + (1): ELU(alpha=1.0) + (2): Linear(in_features=32, out_features=32, bias=True) + (3): ELU(alpha=1.0) + (4): Linear(in_features=32, out_features=8, bias=True) + ) + ) """ super(GaussianModel, self).__init__(observation_space, action_space, device) @@ -69,25 +107,31 @@ def __init__(self, def act(self, states: torch.Tensor, - taken_actions: Union[torch.Tensor, None] = None, - inference=False, - role: str = "") -> Tuple[torch.Tensor]: + taken_actions: Optional[torch.Tensor] = None, + inference: bool = False, + role: str = "") -> Sequence[torch.Tensor]: """Act stochastically in response to the state of the environment :param states: Observation/state of the environment used to make the decision :type states: torch.Tensor - :param taken_actions: Actions taken by a policy to the given states (default: None). + :param taken_actions: Actions taken by a policy to the given states (default: ``None``). The use of these actions only makes sense in critical models, e.g. - :type taken_actions: torch.Tensor or None, optional - :param inference: Flag to indicate whether the model is making inference (default: False). - If True, the returned tensors will be detached from the current graph + :type taken_actions: torch.Tensor, optional + :param inference: Flag to indicate whether the model is making inference (default: ``False``) :type inference: bool, optional - :param role: Role of the agent (default: "") + :param role: Role of the model (default: ``""``) :type role: str, optional :return: Action to be taken by the agent given the state of the environment. 
- The tuple's components are the actions, the log of the probability density function and mean actions - :rtype: tuple of torch.Tensor + The sequence's components are the actions, the log of the probability density function and mean actions + :rtype: sequence of torch.Tensor + + Example:: + + >>> # given a batch of sample states with shape (4096, 60) + >>> action, log_prob, mean_action = model.act(states) + >>> print(action.shape, log_prob.shape, mean_action.shape) + torch.Size([4096, 8]) torch.Size([4096, 1]) torch.Size([4096, 8]) """ # map from states/observations to mean actions and log standard deviations if self._instantiator_net is None: @@ -134,6 +178,12 @@ def get_entropy(self) -> torch.Tensor: :return: Entropy of the model :rtype: torch.Tensor + + Example:: + + >>> entropy = model.get_entropy() + >>> print(entropy.shape) + torch.Size([4096, 8]) """ if self._distribution is None: return torch.tensor(0.0, device=self.device) @@ -144,6 +194,12 @@ def get_log_std(self) -> torch.Tensor: :return: Log standard deviation of the model :rtype: torch.Tensor + + Example:: + + >>> log_std = model.get_log_std() + >>> print(log_std.shape) + torch.Size([4096, 8]) """ return self._log_std.repeat(self._num_samples, 1) @@ -152,5 +208,11 @@ def distribution(self) -> torch.distributions.Normal: :return: Distribution of the model :rtype: torch.distributions.Normal + + Example:: + + >>> distribution = model.distribution() + >>> print(distribution) + Normal(loc: torch.Size([4096, 8]), scale: torch.Size([4096, 8])) """ return self._distribution diff --git a/skrl/models/torch/multivariate_gaussian.py b/skrl/models/torch/multivariate_gaussian.py index 1b06f813..18d2d963 100644 --- a/skrl/models/torch/multivariate_gaussian.py +++ b/skrl/models/torch/multivariate_gaussian.py @@ -1,4 +1,4 @@ -from typing import Union, Tuple +from typing import Optional, Union, Sequence import gym @@ -10,8 +10,8 @@ class MultivariateGaussianModel(Model): def __init__(self, - observation_space: Union[int, Tuple[int], gym.Space, None] = None, - action_space: Union[int, Tuple[int], gym.Space, None] = None, + observation_space: Union[int, Sequence[int], gym.Space], + action_space: Union[int, Sequence[int], gym.Space], device: Union[str, torch.device] = "cuda:0", clip_actions: bool = False, clip_log_std: bool = True, @@ -19,22 +19,60 @@ def __init__(self, max_log_std: float = 2) -> None: """Multivariate Gaussian model (stochastic model) - :param observation_space: Observation/state space or shape (default: None). - If it is not None, the num_observations property will contain the size of that space - :type observation_space: int, tuple or list of integers, gym.Space or None, optional - :param action_space: Action space or shape (default: None). - If it is not None, the num_actions property will contain the size of that space - :type action_space: int, tuple or list of integers, gym.Space or None, optional - :param device: Device on which a torch tensor is or will be allocated (default: "cuda:0") + :param observation_space: Observation/state space or shape. + The ``num_observations`` property will contain the size of that space + :type observation_space: int, sequence of int, gym.Space + :param action_space: Action space or shape. 
+ The ``num_actions`` property will contain the size of that space + :type action_space: int, sequence of int, gym.Space + :param device: Device on which a torch tensor is or will be allocated (default: ``"cuda:0"``) :type device: str or torch.device, optional - :param clip_actions: Flag to indicate whether the actions should be clipped to the action space (default: False) + :param clip_actions: Flag to indicate whether the actions should be clipped to the action space (default: ``False``) :type clip_actions: bool, optional - :param clip_log_std: Flag to indicate whether the log standard deviations should be clipped (default: True) + :param clip_log_std: Flag to indicate whether the log standard deviations should be clipped (default: ``True``) :type clip_log_std: bool, optional - :param min_log_std: Minimum value of the log standard deviation if clip_log_std is True (default: -20) + :param min_log_std: Minimum value of the log standard deviation if ``clip_log_std`` is True (default: ``-20``) :type min_log_std: float, optional - :param max_log_std: Maximum value of the log standard deviation if clip_log_std is True (default: 2) + :param max_log_std: Maximum value of the log standard deviation if ``clip_log_std`` is True (default: ``2``) :type max_log_std: float, optional + + Example:: + + # define the model + >>> import torch + >>> import torch.nn as nn + >>> from skrl.models.torch import MultivariateGaussianModel + >>> + >>> class Policy(MultivariateGaussianModel): + ... def __init__(self, observation_space, action_space, device, clip_actions=False, + ... clip_log_std=True, min_log_std=-20, max_log_std=2): + ... super().__init__(observation_space, action_space, device, clip_actions, + ... clip_log_std, min_log_std, max_log_std) + ... + ... self.net = nn.Sequential(nn.Linear(self.num_observations, 32), + ... nn.ELU(), + ... nn.Linear(32, 32), + ... nn.ELU(), + ... nn.Linear(32, self.num_actions)) + ... self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + ... + ... def compute(self, states, taken_actions, role): + ... return self.net(states), self.log_std_parameter + ... + >>> # given an observation_space: gym.spaces.Box with shape (60,) + >>> # and an action_space: gym.spaces.Box with shape (8,) + >>> model = Policy(observation_space, action_space) + >>> + >>> print(model) + Policy( + (net): Sequential( + (0): Linear(in_features=60, out_features=32, bias=True) + (1): ELU(alpha=1.0) + (2): Linear(in_features=32, out_features=32, bias=True) + (3): ELU(alpha=1.0) + (4): Linear(in_features=32, out_features=8, bias=True) + ) + ) """ super(MultivariateGaussianModel, self).__init__(observation_space, action_space, device) @@ -57,25 +95,31 @@ def __init__(self, def act(self, states: torch.Tensor, - taken_actions: Union[torch.Tensor, None] = None, - inference=False, - role: str = "") -> Tuple[torch.Tensor]: + taken_actions: Optional[torch.Tensor] = None, + inference: bool = False, + role: str = "") -> Sequence[torch.Tensor]: """Act stochastically in response to the state of the environment :param states: Observation/state of the environment used to make the decision :type states: torch.Tensor - :param taken_actions: Actions taken by a policy to the given states (default: None). + :param taken_actions: Actions taken by a policy to the given states (default: ``None``). The use of these actions only makes sense in critical models, e.g. - :type taken_actions: torch.Tensor or None, optional - :param inference: Flag to indicate whether the model is making inference (default: False). 
- If True, the returned tensors will be detached from the current graph + :type taken_actions: torch.Tensor, optional + :param inference: Flag to indicate whether the model is making inference (default: ``False``) :type inference: bool, optional - :param role: Role of the agent (default: "") + :param role: Role of the model (default: ``""``) :type role: str, optional :return: Action to be taken by the agent given the state of the environment. - The tuple's components are the actions, the log of the probability density function and mean actions - :rtype: tuple of torch.Tensor + The sequence's components are the actions, the log of the probability density function and mean actions + :rtype: sequence of torch.Tensor + + Example:: + + >>> # given a batch of sample states with shape (4096, 60) + >>> action, log_prob, mean_action = model.act(states) + >>> print(action.shape, log_prob.shape, mean_action.shape) + torch.Size([4096, 8]) torch.Size([4096, 1]) torch.Size([4096, 8]) """ # map from states/observations to mean actions and log standard deviations if self._instantiator_net is None: @@ -121,6 +165,12 @@ def get_entropy(self) -> torch.Tensor: :return: Entropy of the model :rtype: torch.Tensor + + Example:: + + >>> entropy = model.get_entropy() + >>> print(entropy.shape) + torch.Size([4096]) """ if self._distribution is None: return torch.tensor(0.0, device=self.device) @@ -131,6 +181,12 @@ def get_log_std(self) -> torch.Tensor: :return: Log standard deviation of the model :rtype: torch.Tensor + + Example:: + + >>> log_std = model.get_log_std() + >>> print(log_std.shape) + torch.Size([4096, 8]) """ return self._log_std.repeat(self._num_samples, 1) @@ -139,5 +195,11 @@ def distribution(self) -> torch.distributions.MultivariateNormal: :return: Distribution of the model :rtype: torch.distributions.MultivariateNormal + + Example:: + + >>> distribution = model.distribution() + >>> print(distribution) + MultivariateNormal(loc: torch.Size([4096, 8]), scale_tril: torch.Size([4096, 8, 8])) """ return self._distribution From 30e17b64c06290bbd5de9fe2bc0d74cbe240efdb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Sun, 21 Aug 2022 23:47:12 +0200 Subject: [PATCH 039/108] Show base class properties in docs --- .../source/modules/skrl.models.base_class.rst | 30 +++++++++++++++---- .../modules/skrl.resources.preprocessors.rst | 1 - 2 files changed, 25 insertions(+), 6 deletions(-) diff --git a/docs/source/modules/skrl.models.base_class.rst b/docs/source/modules/skrl.models.base_class.rst index 50b6ec50..5ed11e5b 100644 --- a/docs/source/modules/skrl.models.base_class.rst +++ b/docs/source/modules/skrl.models.base_class.rst @@ -24,9 +24,29 @@ API ^^^ .. autoclass:: skrl.models.torch.base.Model - :undoc-members: - :show-inheritance: - :private-members: _get_space_size - :members: + :undoc-members: + :show-inheritance: + :private-members: _get_space_size, _get_instantiator_output + :members: - .. automethod:: __init__ + .. automethod:: __init__ + + .. py:property:: device + + Device to be used for the computations + + .. py:property:: observation_space + + Observation/state space. It is a replica of the class constructor parameter of the same name + + .. py:property:: action_space + + Action space. It is a replica of the class constructor parameter of the same name + + .. py:property:: num_observations + + Number of elements in the observation/state space + + .. 
py:property:: num_actions + + Number of elements in the action space diff --git a/docs/source/modules/skrl.resources.preprocessors.rst b/docs/source/modules/skrl.resources.preprocessors.rst index 5dedb188..d44aa631 100644 --- a/docs/source/modules/skrl.resources.preprocessors.rst +++ b/docs/source/modules/skrl.resources.preprocessors.rst @@ -28,7 +28,6 @@ The preprocessor class is set under the :literal:`"_preprocessor"` key cfg["value_preprocessor"] = RunningStandardScaler cfg["value_preprocessor_kwargs"] = {"size": 1, "device": device} - .. raw:: html
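The base class properties documented above (``device``, ``observation_space``, ``action_space``, ``num_observations`` and ``num_actions``) are what a custom model typically queries when sizing its layers and allocating tensors. A minimal sketch of that usage, assuming the pre-mixin ``DeterministicModel`` API as it stands at this point in the series (the ``Value`` class name and the ``role`` default are illustrative only; a later patch in the series converts these classes to mixins)::

    import torch
    import torch.nn as nn

    from skrl.models.torch import DeterministicModel

    class Value(DeterministicModel):
        def __init__(self, observation_space, action_space, device="cuda:0", clip_actions=False):
            super().__init__(observation_space, action_space, device, clip_actions)

            # num_observations / num_actions are resolved by the base class from the given spaces
            self.net = nn.Sequential(nn.Linear(self.num_observations + self.num_actions, 32),
                                     nn.ELU(),
                                     nn.Linear(32, 1))

        # role keeps a default value so the sketch works whether or not .act() forwards it
        def compute(self, states, taken_actions, role=""):
            return self.net(torch.cat([states, taken_actions], dim=1))

Instantiation only needs the spaces and device exposed by a wrapped environment, e.g. ``Value(env.observation_space, env.action_space, env.device)``.
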
From 091d46bdde2db4184729257f0421085f73b40804 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Thu, 25 Aug 2022 19:00:24 +0200 Subject: [PATCH 040/108] Skip forwarding of role parameter to models' compute method --- skrl/models/torch/base.py | 5 +---- skrl/models/torch/categorical.py | 3 +-- skrl/models/torch/deterministic.py | 3 +-- skrl/models/torch/gaussian.py | 3 +-- skrl/models/torch/multivariate_gaussian.py | 3 +-- 5 files changed, 5 insertions(+), 12 deletions(-) diff --git a/skrl/models/torch/base.py b/skrl/models/torch/base.py index a53bff52..40b6b517 100644 --- a/skrl/models/torch/base.py +++ b/skrl/models/torch/base.py @@ -314,8 +314,7 @@ def forward(self): def compute(self, states: torch.Tensor, - taken_actions: Optional[torch.Tensor] = None, - role: str = "") -> Union[torch.Tensor, Sequence[torch.Tensor]]: + taken_actions: Optional[torch.Tensor] = None) -> Union[torch.Tensor, Sequence[torch.Tensor]]: """Define the computation performed (to be implemented by the inheriting classes) by the models :param states: Observation/state of the environment used to make the decision @@ -323,8 +322,6 @@ def compute(self, :param taken_actions: Actions taken by a policy to the given states (default: ``None``). The use of these actions only makes sense in critical models, e.g. :type taken_actions: torch.Tensor, optional - :param role: Role of the model (default: ``""``) - :type role: str, optional :raises NotImplementedError: Child class must implement this method diff --git a/skrl/models/torch/categorical.py b/skrl/models/torch/categorical.py index ab74a029..295c4957 100644 --- a/skrl/models/torch/categorical.py +++ b/skrl/models/torch/categorical.py @@ -102,8 +102,7 @@ def act(self, # map from states/observations to normalized probabilities or unnormalized log probabilities if self._instantiator_net is None: output = self.compute(states.to(self.device), - taken_actions.to(self.device) if taken_actions is not None else taken_actions, - role) + taken_actions.to(self.device) if taken_actions is not None else taken_actions) else: output = self._get_instantiator_output(states.to(self.device), \ taken_actions.to(self.device) if taken_actions is not None else taken_actions) diff --git a/skrl/models/torch/deterministic.py b/skrl/models/torch/deterministic.py index 634ef791..0e5f5dd4 100644 --- a/skrl/models/torch/deterministic.py +++ b/skrl/models/torch/deterministic.py @@ -103,8 +103,7 @@ def act(self, # map from observations/states to actions if self._instantiator_net is None: actions = self.compute(states.to(self.device), - taken_actions.to(self.device) if taken_actions is not None else taken_actions, - role) + taken_actions.to(self.device) if taken_actions is not None else taken_actions) else: actions = self._get_instantiator_output(states.to(self.device), \ taken_actions.to(self.device) if taken_actions is not None else taken_actions) diff --git a/skrl/models/torch/gaussian.py b/skrl/models/torch/gaussian.py index f8cb3203..8950230e 100644 --- a/skrl/models/torch/gaussian.py +++ b/skrl/models/torch/gaussian.py @@ -136,8 +136,7 @@ def act(self, # map from states/observations to mean actions and log standard deviations if self._instantiator_net is None: actions_mean, log_std = self.compute(states.to(self.device), - taken_actions.to(self.device) if taken_actions is not None else taken_actions, - role) + taken_actions.to(self.device) if taken_actions is not None else taken_actions) else: actions_mean, log_std = self._get_instantiator_output(states.to(self.device), \ 
taken_actions.to(self.device) if taken_actions is not None else taken_actions) diff --git a/skrl/models/torch/multivariate_gaussian.py b/skrl/models/torch/multivariate_gaussian.py index 18d2d963..b7118b53 100644 --- a/skrl/models/torch/multivariate_gaussian.py +++ b/skrl/models/torch/multivariate_gaussian.py @@ -124,8 +124,7 @@ def act(self, # map from states/observations to mean actions and log standard deviations if self._instantiator_net is None: actions_mean, log_std = self.compute(states.to(self.device), - taken_actions.to(self.device) if taken_actions is not None else taken_actions, - role) + taken_actions.to(self.device) if taken_actions is not None else taken_actions) else: actions_mean, log_std = self._get_instantiator_output(states.to(self.device), \ taken_actions.to(self.device) if taken_actions is not None else taken_actions) From 0d42800780377ff912e63aca9a2181613abd132d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Sat, 27 Aug 2022 16:22:21 +0200 Subject: [PATCH 041/108] Create logger with colored formatting --- skrl/__init__.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/skrl/__init__.py b/skrl/__init__.py index 1d0e5ee7..37fdd2aa 100644 --- a/skrl/__init__.py +++ b/skrl/__init__.py @@ -1,6 +1,31 @@ import os +import logging + +__all__ = ["__version__", "logger"] + # read library version from file path = os.path.join(os.path.dirname(__file__), "version.txt") with open(path, "r") as file: __version__ = file.read().strip() + + +# logger with format +class _Formatter(logging.Formatter): + _format = "%(name)s:%(levelname)s - %(message)s (%(module)s:%(funcName)s:%(lineno)d)" + _formats = {logging.DEBUG: f"\x1b[38;20m{_format}\x1b[0m", + logging.INFO: f"\x1b[38;20m{_format}\x1b[0m", + logging.WARNING: f"\x1b[33;20m{_format}\x1b[0m", + logging.ERROR: f"\x1b[31;20m{_format}\x1b[0m", + logging.CRITICAL: f"\x1b[31;1m{_format}\x1b[0m"} + + def format(self, record): + return logging.Formatter(self._formats.get(record.levelno)).format(record) + +_handler = logging.StreamHandler() +_handler.setLevel(logging.DEBUG) +_handler.setFormatter(_Formatter()) + +logger = logging.getLogger("skrl") +logger.setLevel(logging.DEBUG) +logger.addHandler(_handler) From 850c6a913d3e7661341e87b1c911bf8c0fb804a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Sat, 27 Aug 2022 19:10:52 +0200 Subject: [PATCH 042/108] Convert model classes to mixins --- skrl/models/torch/__init__.py | 8 +- skrl/models/torch/base.py | 9 +- skrl/models/torch/categorical.py | 67 +++++------ skrl/models/torch/deterministic.py | 45 +++----- skrl/models/torch/gaussian.py | 123 ++++++++++++--------- skrl/models/torch/multivariate_gaussian.py | 114 ++++++++++--------- 6 files changed, 188 insertions(+), 178 deletions(-) diff --git a/skrl/models/torch/__init__.py b/skrl/models/torch/__init__.py index de02edaa..ed7b6389 100644 --- a/skrl/models/torch/__init__.py +++ b/skrl/models/torch/__init__.py @@ -1,7 +1,7 @@ from .base import Model from .tabular import TabularModel -from .gaussian import GaussianModel -from .categorical import CategoricalModel -from .deterministic import DeterministicModel -from .multivariate_gaussian import MultivariateGaussianModel +from .gaussian import GaussianMixin +from .categorical import CategoricalMixin +from .deterministic import DeterministicMixin +from .multivariate_gaussian import MultivariateGaussianMixin diff --git a/skrl/models/torch/base.py b/skrl/models/torch/base.py index 40b6b517..c51e98da 100644 --- 
a/skrl/models/torch/base.py +++ b/skrl/models/torch/base.py @@ -6,6 +6,8 @@ import torch +from skrl import logger + class Model(torch.nn.Module): def __init__(self, @@ -38,7 +40,7 @@ def __init__(self, class CustomModel(Model): def __init__(self, observation_space, action_space, device="cuda:0"): - super().__init__(observation_space, action_space, device) + Model.__init__(self, observation_space, action_space, device) self.layer_1 = nn.Linear(self.num_observations, 64) self.layer_2 = nn.Linear(64, self.num_actions) @@ -226,7 +228,7 @@ def random_act(self, :type taken_actions: torch.Tensor, optional :param inference: Flag to indicate whether the model is making inference (default: ``False``) :type inference: bool, optional - :param role: Role of the model (default: ``""``) + :param role: Role play by the model (default: ``""``) :type role: str, optional :raises NotImplementedError: Unsupported action space @@ -348,7 +350,7 @@ def act(self, :type taken_actions: torch.Tensor, optional :param inference: Flag to indicate whether the model is making inference (default: ``False``) :type inference: bool, optional - :param role: Role of the model (default: ``""``) + :param role: Role play by the model (default: ``""``) :type role: str, optional :raises NotImplementedError: Child class must implement this method @@ -358,6 +360,7 @@ def act(self, Deterministic agents must ignore the last two components and return empty tensors or None for them :rtype: sequence of torch.Tensor """ + logger.warn("Make sure to place Mixins before Model during model definition") raise NotImplementedError("The action to be taken by the agent (.act()) is not implemented") def set_mode(self, mode: str) -> None: diff --git a/skrl/models/torch/categorical.py b/skrl/models/torch/categorical.py index 295c4957..dc9e06ec 100644 --- a/skrl/models/torch/categorical.py +++ b/skrl/models/torch/categorical.py @@ -1,45 +1,32 @@ -from typing import Optional, Union, Sequence - -import gym +from typing import Optional, Sequence import torch from torch.distributions import Categorical -from . import Model - - -class CategoricalModel(Model): - def __init__(self, - observation_space: Union[int, Sequence[int], gym.Space], - action_space: Union[int, Sequence[int], gym.Space], - device: Union[str, torch.device] = "cuda:0", - unnormalized_log_prob: bool = True) -> None: - """Categorical model (stochastic model) - - :param observation_space: Observation/state space or shape. - The ``num_observations`` property will contain the size of that space - :type observation_space: int, sequence of int, gym.Space - :param action_space: Action space or shape. - The ``num_actions`` property will contain the size of that space - :type action_space: int, sequence of int, gym.Space - :param device: Device on which a torch tensor is or will be allocated (default: ``"cuda:0"``) - :type device: str or torch.device, optional + +class CategoricalMixin: + def __init__(self, unnormalized_log_prob: bool = True, role: str = "") -> None: + """Categorical mixin model (stochastic model) + :param unnormalized_log_prob: Flag to indicate how to be interpreted the model's output (default: ``True``). 
If True, the model's output is interpreted as unnormalized log probabilities (it can be any real number), otherwise as normalized probabilities (the output must be non-negative, finite and have a non-zero sum) :type unnormalized_log_prob: bool, optional + :param role: Role play by the model (default: ``""``) + :type role: str, optional Example:: # define the model >>> import torch >>> import torch.nn as nn - >>> from skrl.models.torch import CategoricalModel + >>> from skrl.models.torch import Model, CategoricalMixin >>> - >>> class Policy(CategoricalModel): - ... def __init__(self, observation_space, action_space, device, unnormalized_log_prob=True): - ... super().__init__(observation_space, action_space, device, unnormalized_log_prob) + >>> class Policy(CategoricalMixin, Model): + ... def __init__(self, observation_space, action_space, device="cuda:0", unnormalized_log_prob=True): + ... Model.__init__(self, observation_space, action_space, device) + ... CategoricalMixin.__init__(self, unnormalized_log_prob) ... ... self.net = nn.Sequential(nn.Linear(self.num_observations, 32), ... nn.ELU(), @@ -65,11 +52,13 @@ def __init__(self, ) ) """ - super(CategoricalModel, self).__init__(observation_space, action_space, device) - - self._unnormalized_log_prob = unnormalized_log_prob + if not hasattr(self, "_c_unnormalized_log_prob"): + self._c_unnormalized_log_prob = {} + self._c_unnormalized_log_prob[role] = unnormalized_log_prob - self._distribution = None + if not hasattr(self, "_c_distribution"): + self._c_distribution = {} + self._c_distribution[role] = None def act(self, states: torch.Tensor, @@ -85,7 +74,7 @@ def act(self, :type taken_actions: torch.Tensor, optional :param inference: Flag to indicate whether the model is making inference (default: ``False``) :type inference: bool, optional - :param role: Role of the model (default: ``""``) + :param role: Role play by the model (default: ``""``) :type role: str, optional :return: Action to be taken by the agent given the state of the environment. 
@@ -108,25 +97,27 @@ def act(self, taken_actions.to(self.device) if taken_actions is not None else taken_actions) # unnormalized log probabilities - if self._unnormalized_log_prob: - self._distribution = Categorical(logits=output) + if self._c_unnormalized_log_prob[role] if role in self._c_unnormalized_log_prob else self._c_unnormalized_log_prob[""]: + self._c_distribution[role] = Categorical(logits=output) # normalized probabilities else: - self._distribution = Categorical(probs=output) + self._c_distribution[role] = Categorical(probs=output) # actions and log of the probability density function - actions = self._distribution.sample() - log_prob = self._distribution.log_prob(actions if taken_actions is None else taken_actions.view(-1)) + actions = self._c_distribution[role].sample() + log_prob = self._c_distribution[role].log_prob(actions if taken_actions is None else taken_actions.view(-1)) if inference: return actions.unsqueeze(-1).detach(), log_prob.unsqueeze(-1).detach(), output.detach() return actions.unsqueeze(-1), log_prob.unsqueeze(-1), output - def distribution(self) -> torch.distributions.Categorical: + def distribution(self, role: str = "") -> torch.distributions.Categorical: """Get the current distribution of the model :return: Distribution of the model :rtype: torch.distributions.Categorical + :param role: Role play by the model (default: ``""``) + :type role: str, optional Example:: @@ -134,4 +125,4 @@ def distribution(self) -> torch.distributions.Categorical: >>> print(distribution) Categorical(probs: torch.Size([4096, 2]), logits: torch.Size([4096, 2])) """ - return self._distribution \ No newline at end of file + return self._c_distribution if role in self._c_distribution else self._c_distribution[""] diff --git a/skrl/models/torch/deterministic.py b/skrl/models/torch/deterministic.py index 0e5f5dd4..bde36ec4 100644 --- a/skrl/models/torch/deterministic.py +++ b/skrl/models/torch/deterministic.py @@ -1,41 +1,30 @@ -from typing import Optional, Union, Sequence +from typing import Optional, Sequence import gym import torch -from . import Model +class DeterministicMixin: + def __init__(self, clip_actions: bool = False, role: str = "") -> None: + """Deterministic mixin model (deterministic model) -class DeterministicModel(Model): - def __init__(self, - observation_space: Union[int, Sequence[int], gym.Space], - action_space: Union[int, Sequence[int], gym.Space], - device: Union[str, torch.device] = "cuda:0", - clip_actions: bool = False) -> None: - """Deterministic model (deterministic model) - - :param observation_space: Observation/state space or shape. - The ``num_observations`` property will contain the size of that space - :type observation_space: int, sequence of int, gym.Space - :param action_space: Action space or shape. - The ``num_actions`` property will contain the size of that space - :type action_space: int, sequence of int, gym.Space - :param device: Device on which a torch tensor is or will be allocated (default: ``"cuda:0"``) - :type device: str or torch.device, optional :param clip_actions: Flag to indicate whether the actions should be clipped to the action space (default: ``False``) :type clip_actions: bool, optional + :param role: Role play by the model (default: ``""``) + :type role: str, optional Example:: # define the model >>> import torch >>> import torch.nn as nn - >>> from skrl.models.torch import DeterministicModel + >>> from skrl.models.torch import Model, DeterministicMixin >>> - >>> class Value(DeterministicModel): - ... 
def __init__(self, observation_space, action_space, device, clip_actions=False): - ... super().__init__(observation_space, action_space, device, clip_actions) + >>> class Value(DeterministicMixin, Model): + ... def __init__(self, observation_space, action_space, device="cuda:0", clip_actions=False): + ... Model.__init__(self, observation_space, action_space, device) + ... DeterministicMixin.__init__(self, clip_actions) ... ... self.net = nn.Sequential(nn.Linear(self.num_observations, 32), ... nn.ELU(), @@ -61,11 +50,11 @@ def __init__(self, ) ) """ - super(DeterministicModel, self).__init__(observation_space, action_space, device) - - self.clip_actions = clip_actions and issubclass(type(self.action_space), gym.Space) + if not hasattr(self, "_d_clip_actions"): + self._d_clip_actions = {} + self._d_clip_actions[role] = clip_actions and issubclass(type(self.action_space), gym.Space) - if self.clip_actions: + if self._d_clip_actions[role]: self.clip_actions_min = torch.tensor(self.action_space.low, device=self.device) self.clip_actions_max = torch.tensor(self.action_space.high, device=self.device) @@ -86,7 +75,7 @@ def act(self, :type taken_actions: torch.Tensor, optional :param inference: Flag to indicate whether the model is making inference (default: ``False``) :type inference: bool, optional - :param role: Role of the model (default: ``""``) + :param role: Role play by the model (default: ``""``) :type role: str, optional :return: Action to be taken by the agent given the state of the environment. @@ -109,7 +98,7 @@ def act(self, taken_actions.to(self.device) if taken_actions is not None else taken_actions) # clip actions - if self.clip_actions: + if self._d_clip_actions[role] if role in self._d_clip_actions else self._d_clip_actions[""]: if self._backward_compatibility: actions = torch.max(torch.min(actions, self.clip_actions_max), self.clip_actions_min) else: diff --git a/skrl/models/torch/gaussian.py b/skrl/models/torch/gaussian.py index 8950230e..8cad2667 100644 --- a/skrl/models/torch/gaussian.py +++ b/skrl/models/torch/gaussian.py @@ -1,33 +1,21 @@ -from typing import Optional, Union, Sequence +from typing import Optional, Sequence import gym import torch from torch.distributions import Normal -from . import Model - -class GaussianModel(Model): +class GaussianMixin: def __init__(self, - observation_space: Union[int, Sequence[int], gym.Space], - action_space: Union[int, Sequence[int], gym.Space], - device: Union[str, torch.device] = "cuda:0", clip_actions: bool = False, clip_log_std: bool = True, min_log_std: float = -20, max_log_std: float = 2, - reduction: str = "sum") -> None: - """Gaussian model (stochastic model) - - :param observation_space: Observation/state space or shape. - The ``num_observations`` property will contain the size of that space - :type observation_space: int, sequence of int, gym.Space - :param action_space: Action space or shape. 
- The ``num_actions`` property will contain the size of that space - :type action_space: int, sequence of int, gym.Space - :param device: Device on which a torch tensor is or will be allocated (default: ``"cuda:0"``) - :type device: str or torch.device, optional + reduction: str = "sum", + role: str = "") -> None: + """Gaussian mixin model (stochastic model) + :param clip_actions: Flag to indicate whether the actions should be clipped to the action space (default: ``False``) :type clip_actions: bool, optional :param clip_log_std: Flag to indicate whether the log standard deviations should be clipped (default: ``True``) @@ -40,6 +28,8 @@ def __init__(self, Supported values are ``"mean"``, ``"sum"``, ``"prod"`` and ``"none"``. If "``none"``, the log probability density function is returned as a tensor of shape ``(num_samples, num_actions)`` instead of ``(num_samples, 1)`` :type reduction: str, optional + :param role: Role play by the model (default: ``""``) + :type role: str, optional :raises ValueError: If the reduction method is not valid @@ -48,13 +38,13 @@ def __init__(self, # define the model >>> import torch >>> import torch.nn as nn - >>> from skrl.models.torch import GaussianModel + >>> from skrl.models.torch import Model, GaussianMixin >>> - >>> class Policy(GaussianModel): - ... def __init__(self, observation_space, action_space, device, clip_actions=False, - ... clip_log_std=True, min_log_std=-20, max_log_std=2): - ... super().__init__(observation_space, action_space, device, clip_actions, - ... clip_log_std, min_log_std, max_log_std) + >>> class Policy(GaussianMixin, Model): + ... def __init__(self, observation_space, action_space, device="cuda:0", + ... clip_actions=False, clip_log_std=True, min_log_std=-20, max_log_std=2): + ... Model.__init__(self, observation_space, action_space, device) + ... GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) ... ... self.net = nn.Sequential(nn.Linear(self.num_observations, 32), ... 
nn.ELU(), @@ -81,28 +71,42 @@ def __init__(self, ) ) """ - super(GaussianModel, self).__init__(observation_space, action_space, device) - - self.clip_actions = clip_actions and issubclass(type(self.action_space), gym.Space) + if not hasattr(self, "_g_clip_actions"): + self._g_clip_actions = {} + self._g_clip_actions[role] = clip_actions and issubclass(type(self.action_space), gym.Space) - if self.clip_actions: + if self._g_clip_actions[role]: self.clip_actions_min = torch.tensor(self.action_space.low, device=self.device) self.clip_actions_max = torch.tensor(self.action_space.high, device=self.device) # backward compatibility: torch < 1.9 clamp method does not support tensors self._backward_compatibility = tuple(map(int, (torch.__version__.split(".")[:2]))) < (1, 9) - self.clip_log_std = clip_log_std - self.log_std_min = min_log_std - self.log_std_max = max_log_std - - self._log_std = None - self._num_samples = None - self._distribution = None + if not hasattr(self, "_g_clip_log_std"): + self._g_clip_log_std = {} + self._g_clip_log_std[role] = clip_log_std + if not hasattr(self, "_g_log_std_min"): + self._g_log_std_min = {} + self._g_log_std_min[role] = min_log_std + if not hasattr(self, "_g_log_std_max"): + self._g_log_std_max = {} + self._g_log_std_max[role] = max_log_std + + if not hasattr(self, "_g_log_std"): + self._g_log_std = {} + self._g_log_std[role] = None + if not hasattr(self, "_g_num_samples"): + self._g_num_samples = {} + self._g_num_samples[role] = None + if not hasattr(self, "_g_distribution"): + self._g_distribution = {} + self._g_distribution[role] = None if reduction not in ["mean", "sum", "prod", "none"]: raise ValueError("reduction must be one of 'mean', 'sum', 'prod' or 'none'") - self._reduction = torch.mean if reduction == "mean" else torch.sum if reduction == "sum" \ + if not hasattr(self, "_g_reduction"): + self._g_reduction = {} + self._g_reduction[role] = torch.mean if reduction == "mean" else torch.sum if reduction == "sum" \ else torch.prod if reduction == "prod" else None def act(self, @@ -119,7 +123,7 @@ def act(self, :type taken_actions: torch.Tensor, optional :param inference: Flag to indicate whether the model is making inference (default: ``False``) :type inference: bool, optional - :param role: Role of the model (default: ``""``) + :param role: Role play by the model (default: ``""``) :type role: str, optional :return: Action to be taken by the agent given the state of the environment. 
@@ -142,29 +146,32 @@ def act(self, taken_actions.to(self.device) if taken_actions is not None else taken_actions) # clamp log standard deviations - if self.clip_log_std: - log_std = torch.clamp(log_std, self.log_std_min, self.log_std_max) + if self._g_clip_log_std[role] if role in self._g_clip_log_std else self._g_clip_log_std[""]: + log_std = torch.clamp(log_std, + self._g_log_std_min[role] if role in self._g_log_std_min else self._g_log_std_min[""], + self._g_log_std_max[role] if role in self._g_log_std_max else self._g_log_std_max[""]) - self._log_std = log_std - self._num_samples = actions_mean.shape[0] + self._g_log_std[role] = log_std + self._g_num_samples[role] = actions_mean.shape[0] # distribution - self._distribution = Normal(actions_mean, log_std.exp()) + self._g_distribution[role] = Normal(actions_mean, log_std.exp()) # sample using the reparameterization trick - actions = self._distribution.rsample() + actions = self._g_distribution[role].rsample() # clip actions - if self.clip_actions: + if self._g_clip_actions[role] if role in self._g_clip_actions else self._g_clip_actions[""]: if self._backward_compatibility: actions = torch.max(torch.min(actions, self.clip_actions_max), self.clip_actions_min) else: actions = torch.clamp(actions, min=self.clip_actions_min, max=self.clip_actions_max) # log of the probability density function - log_prob = self._distribution.log_prob(actions if taken_actions is None else taken_actions) - if self._reduction is not None: - log_prob = self._reduction(log_prob, dim=-1) + log_prob = self._g_distribution[role].log_prob(actions if taken_actions is None else taken_actions) + reduction = self._g_reduction[role] if role in self._g_reduction else self._g_reduction[""] + if reduction is not None: + log_prob = reduction(log_prob, dim=-1) if log_prob.dim() != actions.dim(): log_prob = log_prob.unsqueeze(-1) @@ -172,11 +179,13 @@ def act(self, return actions.detach(), log_prob.detach(), actions_mean.detach() return actions, log_prob, actions_mean - def get_entropy(self) -> torch.Tensor: + def get_entropy(self, role: str = "") -> torch.Tensor: """Compute and return the entropy of the model :return: Entropy of the model :rtype: torch.Tensor + :param role: Role play by the model (default: ``""``) + :type role: str, optional Example:: @@ -184,15 +193,18 @@ def get_entropy(self) -> torch.Tensor: >>> print(entropy.shape) torch.Size([4096, 8]) """ - if self._distribution is None: + distribution = self._g_distribution[role] if role in self._g_distribution else self._g_distribution[""] + if distribution is None: return torch.tensor(0.0, device=self.device) - return self._distribution.entropy().to(self.device) + return distribution.entropy().to(self.device) - def get_log_std(self) -> torch.Tensor: + def get_log_std(self, role: str = "") -> torch.Tensor: """Return the log standard deviation of the model :return: Log standard deviation of the model :rtype: torch.Tensor + :param role: Role play by the model (default: ``""``) + :type role: str, optional Example:: @@ -200,13 +212,16 @@ def get_log_std(self) -> torch.Tensor: >>> print(log_std.shape) torch.Size([4096, 8]) """ - return self._log_std.repeat(self._num_samples, 1) + return (self._g_log_std[role] if role in self._g_log_std else self._g_log_std[""]) \ + .repeat(self._g_num_samples[role] if role in self._g_num_samples else self._g_num_samples[""], 1) - def distribution(self) -> torch.distributions.Normal: + def distribution(self, role: str = "") -> torch.distributions.Normal: """Get the current distribution of the 
model :return: Distribution of the model :rtype: torch.distributions.Normal + :param role: Role play by the model (default: ``""``) + :type role: str, optional Example:: @@ -214,4 +229,4 @@ def distribution(self) -> torch.distributions.Normal: >>> print(distribution) Normal(loc: torch.Size([4096, 8]), scale: torch.Size([4096, 8])) """ - return self._distribution + return self._g_distribution[role] if role in self._g_distribution else self._g_distribution[""] diff --git a/skrl/models/torch/multivariate_gaussian.py b/skrl/models/torch/multivariate_gaussian.py index b7118b53..684b41ae 100644 --- a/skrl/models/torch/multivariate_gaussian.py +++ b/skrl/models/torch/multivariate_gaussian.py @@ -1,32 +1,20 @@ -from typing import Optional, Union, Sequence +from typing import Optional, Sequence import gym import torch from torch.distributions import MultivariateNormal -from . import Model - -class MultivariateGaussianModel(Model): +class MultivariateGaussianMixin: def __init__(self, - observation_space: Union[int, Sequence[int], gym.Space], - action_space: Union[int, Sequence[int], gym.Space], - device: Union[str, torch.device] = "cuda:0", clip_actions: bool = False, clip_log_std: bool = True, min_log_std: float = -20, - max_log_std: float = 2) -> None: - """Multivariate Gaussian model (stochastic model) - - :param observation_space: Observation/state space or shape. - The ``num_observations`` property will contain the size of that space - :type observation_space: int, sequence of int, gym.Space - :param action_space: Action space or shape. - The ``num_actions`` property will contain the size of that space - :type action_space: int, sequence of int, gym.Space - :param device: Device on which a torch tensor is or will be allocated (default: ``"cuda:0"``) - :type device: str or torch.device, optional + max_log_std: float = 2, + role: str = "") -> None: + """Multivariate Gaussian mixin model (stochastic model) + :param clip_actions: Flag to indicate whether the actions should be clipped to the action space (default: ``False``) :type clip_actions: bool, optional :param clip_log_std: Flag to indicate whether the log standard deviations should be clipped (default: ``True``) @@ -35,19 +23,21 @@ def __init__(self, :type min_log_std: float, optional :param max_log_std: Maximum value of the log standard deviation if ``clip_log_std`` is True (default: ``2``) :type max_log_std: float, optional + :param role: Role play by the model (default: ``""``) + :type role: str, optional Example:: # define the model >>> import torch >>> import torch.nn as nn - >>> from skrl.models.torch import MultivariateGaussianModel + >>> from skrl.models.torch import Model, MultivariateGaussianMixin >>> - >>> class Policy(MultivariateGaussianModel): - ... def __init__(self, observation_space, action_space, device, clip_actions=False, + >>> class Policy(MultivariateGaussianMixin, Model): + ... def __init__(self, observation_space, action_space, device="cuda:0", clip_actions=False, ... clip_log_std=True, min_log_std=-20, max_log_std=2): - ... super().__init__(observation_space, action_space, device, clip_actions, - ... clip_log_std, min_log_std, max_log_std) + ... Model.__init__(self, observation_space, action_space, device) + ... MultivariateGaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) ... ... self.net = nn.Sequential(nn.Linear(self.num_observations, 32), ... 
nn.ELU(), @@ -74,24 +64,36 @@ def __init__(self, ) ) """ - super(MultivariateGaussianModel, self).__init__(observation_space, action_space, device) - - self.clip_actions = clip_actions and issubclass(type(self.action_space), gym.Space) + if not hasattr(self, "_mg_clip_actions"): + self._mg_clip_actions = {} + self._mg_clip_actions[role] = clip_actions and issubclass(type(self.action_space), gym.Space) - if self.clip_actions: + if self._mg_clip_actions[role]: self.clip_actions_min = torch.tensor(self.action_space.low, device=self.device) self.clip_actions_max = torch.tensor(self.action_space.high, device=self.device) # backward compatibility: torch < 1.9 clamp method does not support tensors self._backward_compatibility = tuple(map(int, (torch.__version__.split(".")[:2]))) < (1, 9) - self.clip_log_std = clip_log_std - self.log_std_min = min_log_std - self.log_std_max = max_log_std - - self._log_std = None - self._num_samples = None - self._distribution = None + if not hasattr(self, "_mg_clip_log_std"): + self._mg_clip_log_std = {} + self._mg_clip_log_std[role] = clip_log_std + if not hasattr(self, "_mg_log_std_min"): + self._mg_log_std_min = {} + self._mg_log_std_min[role] = min_log_std + if not hasattr(self, "_mg_log_std_max"): + self._mg_log_std_max = {} + self._mg_log_std_max[role] = max_log_std + + if not hasattr(self, "_mg_log_std"): + self._mg_log_std = {} + self._mg_log_std[role] = None + if not hasattr(self, "_mg_num_samples"): + self._mg_num_samples = {} + self._mg_num_samples[role] = None + if not hasattr(self, "_mg_distribution"): + self._mg_distribution = {} + self._mg_distribution[role] = None def act(self, states: torch.Tensor, @@ -107,7 +109,7 @@ def act(self, :type taken_actions: torch.Tensor, optional :param inference: Flag to indicate whether the model is making inference (default: ``False``) :type inference: bool, optional - :param role: Role of the model (default: ``""``) + :param role: Role play by the model (default: ``""``) :type role: str, optional :return: Action to be taken by the agent given the state of the environment. 
@@ -130,28 +132,30 @@ def act(self, taken_actions.to(self.device) if taken_actions is not None else taken_actions) # clamp log standard deviations - if self.clip_log_std: - log_std = torch.clamp(log_std, self.log_std_min, self.log_std_max) + if self._mg_clip_log_std[role] if role in self._mg_clip_log_std else self._mg_clip_log_std[""]: + log_std = torch.clamp(log_std, + self._mg_log_std_min[role] if role in self._mg_log_std_min else self._mg_log_std_min[""], + self._mg_log_std_max[role] if role in self._mg_log_std_max else self._mg_log_std_max[""]) - self._log_std = log_std - self._num_samples = actions_mean.shape[0] + self._mg_log_std[role] = log_std + self._mg_num_samples[role] = actions_mean.shape[0] # distribution covariance = torch.diag(log_std.exp() * log_std.exp()) - self._distribution = MultivariateNormal(actions_mean, scale_tril=covariance) + self._mg_distribution[role] = MultivariateNormal(actions_mean, scale_tril=covariance) # sample using the reparameterization trick - actions = self._distribution.rsample() + actions = self._mg_distribution[role].rsample() # clip actions - if self.clip_actions: + if self._mg_clip_actions[role] if role in self._mg_clip_actions else self._mg_clip_actions[""]: if self._backward_compatibility: actions = torch.max(torch.min(actions, self.clip_actions_max), self.clip_actions_min) else: actions = torch.clamp(actions, min=self.clip_actions_min, max=self.clip_actions_max) # log of the probability density function - log_prob = self._distribution.log_prob(actions if taken_actions is None else taken_actions) + log_prob = self._mg_distribution[role].log_prob(actions if taken_actions is None else taken_actions) if log_prob.dim() != actions.dim(): log_prob = log_prob.unsqueeze(-1) @@ -159,11 +163,13 @@ def act(self, return actions.detach(), log_prob.detach(), actions_mean.detach() return actions, log_prob, actions_mean - def get_entropy(self) -> torch.Tensor: + def get_entropy(self, role: str = "") -> torch.Tensor: """Compute and return the entropy of the model :return: Entropy of the model :rtype: torch.Tensor + :param role: Role play by the model (default: ``""``) + :type role: str, optional Example:: @@ -171,15 +177,18 @@ def get_entropy(self) -> torch.Tensor: >>> print(entropy.shape) torch.Size([4096]) """ - if self._distribution is None: + distribution = self._mg_distribution[role] if role in self._mg_distribution else self._mg_distribution[""] + if distribution is None: return torch.tensor(0.0, device=self.device) - return self._distribution.entropy().to(self.device) + return distribution.entropy().to(self.device) - def get_log_std(self) -> torch.Tensor: + def get_log_std(self, role: str = "") -> torch.Tensor: """Return the log standard deviation of the model :return: Log standard deviation of the model :rtype: torch.Tensor + :param role: Role play by the model (default: ``""``) + :type role: str, optional Example:: @@ -187,13 +196,16 @@ def get_log_std(self) -> torch.Tensor: >>> print(log_std.shape) torch.Size([4096, 8]) """ - return self._log_std.repeat(self._num_samples, 1) - - def distribution(self) -> torch.distributions.MultivariateNormal: + return (self._mg_log_std[role] if role in self._mg_log_std else self._mg_log_std[""]) \ + .repeat(self._mg_num_samples[role] if role in self._mg_num_samples else self._mg_num_samples[""], 1) + + def distribution(self, role: str = "") -> torch.distributions.MultivariateNormal: """Get the current distribution of the model :return: Distribution of the model :rtype: torch.distributions.MultivariateNormal + :param 
role: Role play by the model (default: ``""``) + :type role: str, optional Example:: @@ -201,4 +213,4 @@ def distribution(self) -> torch.distributions.MultivariateNormal: >>> print(distribution) MultivariateNormal(loc: torch.Size([4096, 8]), scale_tril: torch.Size([4096, 8, 8])) """ - return self._distribution + return self._mg_distribution[role] if role in self._mg_distribution else self._mg_distribution[""] From bdebddb69884ff47e9bf5c5cc7a9dbcad1b4e14f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Sat, 27 Aug 2022 19:41:19 +0200 Subject: [PATCH 043/108] Update models docs to reflect class changes --- .../modules/skrl.models.categorical.rst | 25 ++++----- .../modules/skrl.models.deterministic.rst | 25 ++++----- docs/source/modules/skrl.models.gaussian.rst | 25 ++++----- .../skrl.models.multivariate_gaussian.rst | 19 +++---- docs/source/snippets/categorical_model.py | 46 ++++----------- docs/source/snippets/deterministic_model.py | 46 ++++----------- docs/source/snippets/gaussian_model.py | 56 +++++-------------- .../snippets/multivariate_gaussian_model.py | 47 +++++----------- 8 files changed, 93 insertions(+), 196 deletions(-) diff --git a/docs/source/modules/skrl.models.categorical.rst b/docs/source/modules/skrl.models.categorical.rst index 8e9c540a..66c075c9 100644 --- a/docs/source/modules/skrl.models.categorical.rst +++ b/docs/source/modules/skrl.models.categorical.rst @@ -4,15 +4,15 @@ Categorical model ================= Concept -^^^^^^^ +------- .. image:: ../_static/imgs/model_categorical.svg - :width: 100% - :align: center - :alt: Categorical model + :width: 100% + :align: center + :alt: Categorical model Basic usage -^^^^^^^^^^^ +----------- .. tabs:: @@ -33,11 +33,10 @@ Basic usage :end-before: [end-cnn] API -^^^ - -.. autoclass:: skrl.models.torch.categorical.CategoricalModel - :show-inheritance: - :members: - - .. automethod:: __init__ - .. automethod:: compute +--- + +.. autoclass:: skrl.models.torch.categorical.CategoricalMixin + :show-inheritance: + :members: + + .. automethod:: __init__ diff --git a/docs/source/modules/skrl.models.deterministic.rst b/docs/source/modules/skrl.models.deterministic.rst index 733800a1..04266a88 100644 --- a/docs/source/modules/skrl.models.deterministic.rst +++ b/docs/source/modules/skrl.models.deterministic.rst @@ -4,15 +4,15 @@ Deterministic model =================== Concept -^^^^^^^ +------- .. image:: ../_static/imgs/model_deterministic.svg - :width: 65% - :align: center - :alt: Deterministic model + :width: 65% + :align: center + :alt: Deterministic model Basic usage -^^^^^^^^^^^ +----------- .. tabs:: @@ -33,11 +33,10 @@ Basic usage :end-before: [end-cnn] API -^^^ - -.. autoclass:: skrl.models.torch.deterministic.DeterministicModel - :show-inheritance: - :members: - - .. automethod:: __init__ - .. automethod:: compute +--- + +.. autoclass:: skrl.models.torch.deterministic.DeterministicMixin + :show-inheritance: + :members: + + .. automethod:: __init__ diff --git a/docs/source/modules/skrl.models.gaussian.rst b/docs/source/modules/skrl.models.gaussian.rst index 9739bf25..7a8b788b 100644 --- a/docs/source/modules/skrl.models.gaussian.rst +++ b/docs/source/modules/skrl.models.gaussian.rst @@ -4,15 +4,15 @@ Gaussian model ============== Concept -^^^^^^^ +------- .. image:: ../_static/imgs/model_gaussian.svg - :width: 100% - :align: center - :alt: Gaussian model + :width: 100% + :align: center + :alt: Gaussian model Basic usage -^^^^^^^^^^^ +----------- .. 
tabs:: @@ -33,11 +33,10 @@ Basic usage :end-before: [end-cnn] API -^^^ - -.. autoclass:: skrl.models.torch.gaussian.GaussianModel - :show-inheritance: - :members: - - .. automethod:: __init__ - .. automethod:: compute +--- + +.. autoclass:: skrl.models.torch.gaussian.GaussianMixin + :show-inheritance: + :members: + + .. automethod:: __init__ diff --git a/docs/source/modules/skrl.models.multivariate_gaussian.rst b/docs/source/modules/skrl.models.multivariate_gaussian.rst index 3ad21398..c4a20066 100644 --- a/docs/source/modules/skrl.models.multivariate_gaussian.rst +++ b/docs/source/modules/skrl.models.multivariate_gaussian.rst @@ -4,7 +4,7 @@ Multivariate Gaussian model =========================== Concept -^^^^^^^ +------- .. image:: ../_static/imgs/model_multivariate_gaussian.svg :width: 100% @@ -12,7 +12,7 @@ Concept :alt: Multivariate Gaussian model Basic usage -^^^^^^^^^^^ +----------- .. tabs:: @@ -33,11 +33,10 @@ Basic usage :end-before: [end-cnn] API -^^^ - -.. autoclass:: skrl.models.torch.multivariate_gaussian.MultivariateGaussianModel - :show-inheritance: - :members: - - .. automethod:: __init__ - .. automethod:: compute +--- + +.. autoclass:: skrl.models.torch.multivariate_gaussian.MultivariateGaussianMixin + :show-inheritance: + :members: + + .. automethod:: __init__ diff --git a/docs/source/snippets/categorical_model.py b/docs/source/snippets/categorical_model.py index 799bd6c6..66fcc47b 100644 --- a/docs/source/snippets/categorical_model.py +++ b/docs/source/snippets/categorical_model.py @@ -1,29 +1,21 @@ -import gym - -class DummyEnv: - observation_space = gym.spaces.Box(low=-1, high=1, shape=(4,)) - action_space = gym.spaces.Discrete(2) - device = "cuda:0" - -env = DummyEnv() - # [start-mlp] import torch.nn as nn import torch.nn.functional as F -from skrl.models.torch import CategoricalModel +from skrl.models.torch import Model, CategoricalMixin # define the model -class MLP(CategoricalModel): +class MLP(CategoricalMixin, Model): def __init__(self, observation_space, action_space, device, unnormalized_log_prob=True): - super().__init__(observation_space, action_space, device, unnormalized_log_prob) + Model.__init__(self, observation_space, action_space, device) + CategoricalMixin.__init__(self, unnormalized_log_prob) self.linear_layer_1 = nn.Linear(self.num_observations, 64) self.linear_layer_2 = nn.Linear(64, 32) self.output_layer = nn.Linear(32, self.num_actions) - def compute(self, states, taken_actions): + def compute(self, states, taken_actions, role): x = F.relu(self.linear_layer_1(states)) x = F.relu(self.linear_layer_2(x)) return self.output_layer(x) @@ -36,32 +28,19 @@ def compute(self, states, taken_actions): unnormalized_log_prob=True) # [end-mlp] -import torch -policy.to(env.device) -actions = policy.act(torch.randn(10, 4, device=env.device)) -assert actions[0].shape == torch.Size([10, 1]) - # ============================================================================= -import gym - -class DummyEnv: - observation_space = gym.spaces.Box(low=0, high=255, shape=(128, 128, 3)) - action_space = gym.spaces.Discrete(3) - device = "cuda:0" - -env = DummyEnv() - # [start-cnn] import torch.nn as nn -from skrl.models.torch import CategoricalModel +from skrl.models.torch import Model, CategoricalMixin # define the model -class CNN(CategoricalModel): +class CNN(CategoricalMixin, Model): def __init__(self, observation_space, action_space, device, unnormalized_log_prob=True): - super().__init__(observation_space, action_space, device, unnormalized_log_prob) + 
Model.__init__(self, observation_space, action_space, device) + CategoricalMixin.__init__(self, unnormalized_log_prob) self.net = nn.Sequential(nn.Conv2d(3, 32, kernel_size=8, stride=4), nn.ReLU(), @@ -80,7 +59,7 @@ def __init__(self, observation_space, action_space, device, unnormalized_log_pro nn.Tanh(), nn.Linear(32, self.num_actions)) - def compute(self, states, taken_actions): + def compute(self, states, taken_actions, role): # permute (samples, width, height, channels) -> (samples, channels, width, height) return self.net(states.permute(0, 3, 1, 2)) @@ -91,8 +70,3 @@ def compute(self, states, taken_actions): device=env.device, unnormalized_log_prob=True) # [end-cnn] - -import torch -policy.to(env.device) -actions = policy.act(torch.randn(10, 128, 128, 3, device=env.device)) -assert actions[0].shape == torch.Size([10, 1]) diff --git a/docs/source/snippets/deterministic_model.py b/docs/source/snippets/deterministic_model.py index 797c4045..d735ef63 100644 --- a/docs/source/snippets/deterministic_model.py +++ b/docs/source/snippets/deterministic_model.py @@ -1,23 +1,15 @@ -import gym - -class DummyEnv: - observation_space = gym.spaces.Box(low=-1, high=1, shape=(4,)) - action_space = gym.spaces.Box(low=-1, high=1, shape=(3,)) - device = "cuda:0" - -env = DummyEnv() - # [start-mlp] import torch import torch.nn as nn -from skrl.models.torch import DeterministicModel +from skrl.models.torch import Model, DeterministicMixin # define the model -class MLP(DeterministicModel): +class MLP(DeterministicMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False): - super().__init__(observation_space, action_space, device, clip_actions) + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) self.net = nn.Sequential(nn.Linear(self.num_observations + self.num_actions, 64), nn.ReLU(), @@ -25,7 +17,7 @@ def __init__(self, observation_space, action_space, device, clip_actions=False): nn.ReLU(), nn.Linear(32, 1)) - def compute(self, states, taken_actions): + def compute(self, states, taken_actions, role): return self.net(torch.cat([states, taken_actions], dim=1)) @@ -36,33 +28,20 @@ def compute(self, states, taken_actions): clip_actions=False) # [end-mlp] -import torch -policy.to(env.device) -actions = policy.act(torch.randn(10, 4, device=env.device), torch.randn(10, 3, device=env.device)) -assert actions[0].shape == torch.Size([10, 1]) - # ============================================================================= -import gym - -class DummyEnv: - observation_space = gym.spaces.Box(low=0, high=255, shape=(64, 64, 3)) - action_space = gym.spaces.Box(low=-1, high=1, shape=(3,)) - device = "cuda:0" - -env = DummyEnv() - # [start-cnn] import torch import torch.nn as nn -from skrl.models.torch import DeterministicModel +from skrl.models.torch import Model, DeterministicMixin # define the model -class CNN(DeterministicModel): +class CNN(DeterministicMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False): - super().__init__(observation_space, action_space, device, clip_actions) + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) self.features_extractor = nn.Sequential(nn.Conv2d(3, 32, kernel_size=8, stride=3), nn.ReLU(), @@ -81,7 +60,7 @@ def __init__(self, observation_space, action_space, device, clip_actions=False): nn.Tanh(), nn.Linear(32, 1)) - def compute(self, states, taken_actions): + def compute(self, states, 
taken_actions, role): # permute (samples, width, height, channels) -> (samples, channels, width, height) x = self.features_extractor(states.permute(0, 3, 1, 2)) return self.net(torch.cat([x, taken_actions], dim=1)) @@ -93,8 +72,3 @@ def compute(self, states, taken_actions): device=env.device, clip_actions=False) # [end-cnn] - -import torch -policy.to(env.device) -actions = policy.act(torch.randn(10, 64, 64, 3, device=env.device), torch.randn(10, 3, device=env.device)) -assert actions[0].shape == torch.Size([10, 1]) diff --git a/docs/source/snippets/gaussian_model.py b/docs/source/snippets/gaussian_model.py index 0b1d5f85..50ef6afe 100644 --- a/docs/source/snippets/gaussian_model.py +++ b/docs/source/snippets/gaussian_model.py @@ -1,26 +1,17 @@ -import gym - -class DummyEnv: - observation_space = gym.spaces.Box(low=-1, high=1, shape=(5,)) - action_space = gym.spaces.Box(low=-1, high=1, shape=(3,)) - device = "cuda:0" - -env = DummyEnv() - # [start-mlp] import torch import torch.nn as nn import torch.nn.functional as F -from skrl.models.torch import GaussianModel +from skrl.models.torch import Model, GaussianMixin # define the model -class MLP(GaussianModel): - def __init__(self, observation_space, action_space, device, clip_actions=False, - clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"): - super().__init__(observation_space, action_space, device, clip_actions, - clip_log_std, min_log_std, max_log_std, reduction) +class MLP(GaussianMixin, Model): + def __init__(self, observation_space, action_space, device, + clip_actions=False, clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"): + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction) self.linear_layer_1 = nn.Linear(self.num_observations, 128) self.linear_layer_2 = nn.Linear(128, 64) @@ -29,7 +20,7 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - def compute(self, states, taken_actions): + def compute(self, states, taken_actions, role): x = F.relu(self.linear_layer_1(states)) x = F.relu(self.linear_layer_2(x)) x = F.relu(self.linear_layer_3(x)) @@ -46,36 +37,22 @@ def compute(self, states, taken_actions): reduction="sum") # [end-mlp] -import torch -policy.to(env.device) -actions = policy.act(torch.randn(10, 5, device=env.device), torch.randn(10, 3, device=env.device)) -assert actions[0].shape == torch.Size([10, env.action_space.shape[0]]) - # ============================================================================= -import gym - -class DummyEnv: - observation_space = gym.spaces.Box(low=0, high=255, shape=(256, 256, 1)) - action_space = gym.spaces.Box(low=-1, high=1, shape=(2,)) - device = "cuda:0" - -env = DummyEnv() - # [start-cnn] import torch import torch.nn as nn import torch.nn.functional as F -from skrl.models.torch import GaussianModel +from skrl.models.torch import Model, GaussianMixin # define the model -class CNN(GaussianModel): - def __init__(self, observation_space, action_space, device, clip_actions=False, - clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"): - super().__init__(observation_space, action_space, device, clip_actions, - clip_log_std, min_log_std, max_log_std, reduction) +class CNN(GaussianMixin, Model): + def __init__(self, observation_space, action_space, device, + clip_actions=False, clip_log_std=True, min_log_std=-20, max_log_std=2, 
reduction="sum"): + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction) self.net = nn.Sequential(nn.Conv2d(1, 64, kernel_size=4, stride=2), nn.ReLU(), @@ -96,7 +73,7 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - def compute(self, states, taken_actions): + def compute(self, states, taken_actions, role): # permute (samples, width, height, channels) -> (samples, channels, width, height) return self.net(states.permute(0, 3, 1, 2)), self.log_std_parameter @@ -111,8 +88,3 @@ def compute(self, states, taken_actions): max_log_std=2, reduction="sum") # [end-cnn] - -import torch -policy.to(env.device) -actions = policy.act(torch.randn(10, 256, 256, 1, device=env.device), torch.randn(10, 2, device=env.device)) -assert actions[0].shape == torch.Size([10, env.action_space.shape[0]]) diff --git a/docs/source/snippets/multivariate_gaussian_model.py b/docs/source/snippets/multivariate_gaussian_model.py index 69fbad38..d384271c 100644 --- a/docs/source/snippets/multivariate_gaussian_model.py +++ b/docs/source/snippets/multivariate_gaussian_model.py @@ -12,15 +12,15 @@ class DummyEnv: import torch.nn as nn import torch.nn.functional as F -from skrl.models.torch import MultivariateGaussianModel +from skrl.models.torch import Model, MultivariateGaussianMixin # define the model -class MLP(MultivariateGaussianModel): - def __init__(self, observation_space, action_space, device, clip_actions=False, - clip_log_std=True, min_log_std=-20, max_log_std=2): - super().__init__(observation_space, action_space, device, clip_actions, - clip_log_std, min_log_std, max_log_std) +class MLP(MultivariateGaussianMixin, Model): + def __init__(self, observation_space, action_space, device, + clip_actions=False, clip_log_std=True, min_log_std=-20, max_log_std=2): + Model.__init__(self, observation_space, action_space, device) + MultivariateGaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) self.linear_layer_1 = nn.Linear(self.num_observations, 128) self.linear_layer_2 = nn.Linear(128, 64) @@ -29,7 +29,7 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - def compute(self, states, taken_actions): + def compute(self, states, taken_actions, role): x = F.relu(self.linear_layer_1(states)) x = F.relu(self.linear_layer_2(x)) x = F.relu(self.linear_layer_3(x)) @@ -45,36 +45,22 @@ def compute(self, states, taken_actions): max_log_std=2) # [end-mlp] -import torch -policy.to(env.device) -actions = policy.act(torch.randn(10, 5, device=env.device), torch.randn(10, 3, device=env.device)) -assert actions[0].shape == torch.Size([10, env.action_space.shape[0]]) - # ============================================================================= -import gym - -class DummyEnv: - observation_space = gym.spaces.Box(low=0, high=255, shape=(256, 256, 1)) - action_space = gym.spaces.Box(low=-1, high=1, shape=(2,)) - device = "cuda:0" - -env = DummyEnv() - # [start-cnn] import torch import torch.nn as nn import torch.nn.functional as F -from skrl.models.torch import MultivariateGaussianModel +from skrl.models.torch import Model, MultivariateGaussianMixin # define the model -class CNN(MultivariateGaussianModel): - def __init__(self, observation_space, action_space, device, clip_actions=False, - clip_log_std=True, 
min_log_std=-20, max_log_std=2): - super().__init__(observation_space, action_space, device, clip_actions, - clip_log_std, min_log_std, max_log_std) +class CNN(MultivariateGaussianMixin, Model): + def __init__(self, observation_space, action_space, device, + clip_actions=False, clip_log_std=True, min_log_std=-20, max_log_std=2): + Model.__init__(self, observation_space, action_space, device) + MultivariateGaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) self.net = nn.Sequential(nn.Conv2d(1, 64, kernel_size=4, stride=2), nn.ReLU(), @@ -95,7 +81,7 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - def compute(self, states, taken_actions): + def compute(self, states, taken_actions, role): # permute (samples, width, height, channels) -> (samples, channels, width, height) return self.net(states.permute(0, 3, 1, 2)), self.log_std_parameter @@ -109,8 +95,3 @@ def compute(self, states, taken_actions): min_log_std=-20, max_log_std=2) # [end-cnn] - -import torch -policy.to(env.device) -actions = policy.act(torch.randn(10, 256, 256, 1, device=env.device), torch.randn(10, 2, device=env.device)) -assert actions[0].shape == torch.Size([10, env.action_space.shape[0]]) From ffe3850a4a750eb2cc938c602e7ad074f18f215b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Sun, 28 Aug 2022 00:02:23 +0200 Subject: [PATCH 044/108] Fix model weights initialization for sequential containers --- skrl/models/torch/base.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/skrl/models/torch/base.py b/skrl/models/torch/base.py index c51e98da..1d4b915a 100644 --- a/skrl/models/torch/base.py +++ b/skrl/models/torch/base.py @@ -301,7 +301,7 @@ def init_weights(self, method_name: str = "orthogonal_", *args, **kwargs) -> Non def _update_weights(module, method_name, args, kwargs): for layer in module: if isinstance(layer, torch.nn.Sequential): - _update_weights(layer) + _update_weights(layer, method_name, args, kwargs) elif isinstance(layer, torch.nn.Linear): exec("torch.nn.init.{}(layer.weight, *args, **kwargs)".format(method_name)) @@ -316,7 +316,8 @@ def forward(self): def compute(self, states: torch.Tensor, - taken_actions: Optional[torch.Tensor] = None) -> Union[torch.Tensor, Sequence[torch.Tensor]]: + taken_actions: Optional[torch.Tensor] = None, + role: str = "") -> Union[torch.Tensor, Sequence[torch.Tensor]]: """Define the computation performed (to be implemented by the inheriting classes) by the models :param states: Observation/state of the environment used to make the decision @@ -324,6 +325,8 @@ def compute(self, :param taken_actions: Actions taken by a policy to the given states (default: ``None``). The use of these actions only makes sense in critical models, e.g. 
:type taken_actions: torch.Tensor, optional + :param role: Role play by the model (default: ``""``) + :type role: str, optional :raises NotImplementedError: Child class must implement this method From e23d25f3fc714fd8fb67d24bc4e3fd8a4fdc6031 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Sun, 28 Aug 2022 00:10:55 +0200 Subject: [PATCH 045/108] Forward role argument to compute method --- skrl/models/torch/categorical.py | 2 +- skrl/models/torch/deterministic.py | 2 +- skrl/models/torch/gaussian.py | 3 ++- skrl/models/torch/multivariate_gaussian.py | 3 ++- 4 files changed, 6 insertions(+), 4 deletions(-) diff --git a/skrl/models/torch/categorical.py b/skrl/models/torch/categorical.py index dc9e06ec..fba74763 100644 --- a/skrl/models/torch/categorical.py +++ b/skrl/models/torch/categorical.py @@ -91,7 +91,7 @@ def act(self, # map from states/observations to normalized probabilities or unnormalized log probabilities if self._instantiator_net is None: output = self.compute(states.to(self.device), - taken_actions.to(self.device) if taken_actions is not None else taken_actions) + taken_actions.to(self.device) if taken_actions is not None else taken_actions, role) else: output = self._get_instantiator_output(states.to(self.device), \ taken_actions.to(self.device) if taken_actions is not None else taken_actions) diff --git a/skrl/models/torch/deterministic.py b/skrl/models/torch/deterministic.py index bde36ec4..8551d00a 100644 --- a/skrl/models/torch/deterministic.py +++ b/skrl/models/torch/deterministic.py @@ -92,7 +92,7 @@ def act(self, # map from observations/states to actions if self._instantiator_net is None: actions = self.compute(states.to(self.device), - taken_actions.to(self.device) if taken_actions is not None else taken_actions) + taken_actions.to(self.device) if taken_actions is not None else taken_actions, role) else: actions = self._get_instantiator_output(states.to(self.device), \ taken_actions.to(self.device) if taken_actions is not None else taken_actions) diff --git a/skrl/models/torch/gaussian.py b/skrl/models/torch/gaussian.py index 8cad2667..b80895aa 100644 --- a/skrl/models/torch/gaussian.py +++ b/skrl/models/torch/gaussian.py @@ -140,7 +140,8 @@ def act(self, # map from states/observations to mean actions and log standard deviations if self._instantiator_net is None: actions_mean, log_std = self.compute(states.to(self.device), - taken_actions.to(self.device) if taken_actions is not None else taken_actions) + taken_actions.to(self.device) if taken_actions is not None else taken_actions, + role) else: actions_mean, log_std = self._get_instantiator_output(states.to(self.device), \ taken_actions.to(self.device) if taken_actions is not None else taken_actions) diff --git a/skrl/models/torch/multivariate_gaussian.py b/skrl/models/torch/multivariate_gaussian.py index 684b41ae..b408923c 100644 --- a/skrl/models/torch/multivariate_gaussian.py +++ b/skrl/models/torch/multivariate_gaussian.py @@ -126,7 +126,8 @@ def act(self, # map from states/observations to mean actions and log standard deviations if self._instantiator_net is None: actions_mean, log_std = self.compute(states.to(self.device), - taken_actions.to(self.device) if taken_actions is not None else taken_actions) + taken_actions.to(self.device) if taken_actions is not None else taken_actions, + role) else: actions_mean, log_std = self._get_instantiator_output(states.to(self.device), \ taken_actions.to(self.device) if taken_actions is not None else taken_actions) From 
a0868fdd760aa4a15759fd8b0222c4dc9fc4de1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Mon, 5 Sep 2022 17:47:21 +0200 Subject: [PATCH 046/108] Remove models' inference parameter --- skrl/models/torch/base.py | 8 +------- skrl/models/torch/categorical.py | 5 ----- skrl/models/torch/deterministic.py | 5 ----- skrl/models/torch/gaussian.py | 5 ----- skrl/models/torch/multivariate_gaussian.py | 5 ----- 5 files changed, 1 insertion(+), 27 deletions(-) diff --git a/skrl/models/torch/base.py b/skrl/models/torch/base.py index 1d4b915a..93b98074 100644 --- a/skrl/models/torch/base.py +++ b/skrl/models/torch/base.py @@ -45,7 +45,7 @@ def __init__(self, observation_space, action_space, device="cuda:0"): self.layer_1 = nn.Linear(self.num_observations, 64) self.layer_2 = nn.Linear(64, self.num_actions) - def act(self, states, taken_actions=None, inference=False, role=""): + def act(self, states, taken_actions=None, role=""): x = F.relu(self.layer_1(states)) x = F.relu(self.layer_2(x)) return x @@ -217,7 +217,6 @@ def tensor_to_space(self, def random_act(self, states: torch.Tensor, taken_actions: Optional[torch.Tensor] = None, - inference: bool = False, role: str = "") -> Sequence[torch.Tensor]: """Act randomly according to the action space @@ -226,8 +225,6 @@ def random_act(self, :param taken_actions: Actions taken by a policy to the given states (default: ``None``). The use of these actions only makes sense in critical models, e.g. :type taken_actions: torch.Tensor, optional - :param inference: Flag to indicate whether the model is making inference (default: ``False``) - :type inference: bool, optional :param role: Role play by the model (default: ``""``) :type role: str, optional @@ -338,7 +335,6 @@ def compute(self, def act(self, states: torch.Tensor, taken_actions: Optional[torch.Tensor] = None, - inference: bool = False, role: str = "") -> Sequence[torch.Tensor]: """Act according to the specified behavior (to be implemented by the inheriting classes) @@ -351,8 +347,6 @@ def act(self, :param taken_actions: Actions taken by a policy to the given states (default: ``None``). The use of these actions only makes sense in critical models, e.g. :type taken_actions: torch.Tensor, optional - :param inference: Flag to indicate whether the model is making inference (default: ``False``) - :type inference: bool, optional :param role: Role play by the model (default: ``""``) :type role: str, optional diff --git a/skrl/models/torch/categorical.py b/skrl/models/torch/categorical.py index fba74763..5c779510 100644 --- a/skrl/models/torch/categorical.py +++ b/skrl/models/torch/categorical.py @@ -63,7 +63,6 @@ def __init__(self, unnormalized_log_prob: bool = True, role: str = "") -> None: def act(self, states: torch.Tensor, taken_actions: Optional[torch.Tensor] = None, - inference: bool = False, role: str = "") -> Sequence[torch.Tensor]: """Act stochastically in response to the state of the environment @@ -72,8 +71,6 @@ def act(self, :param taken_actions: Actions taken by a policy to the given states (default: ``None``). The use of these actions only makes sense in critical models, e.g. 
:type taken_actions: torch.Tensor, optional - :param inference: Flag to indicate whether the model is making inference (default: ``False``) - :type inference: bool, optional :param role: Role play by the model (default: ``""``) :type role: str, optional @@ -107,8 +104,6 @@ def act(self, actions = self._c_distribution[role].sample() log_prob = self._c_distribution[role].log_prob(actions if taken_actions is None else taken_actions.view(-1)) - if inference: - return actions.unsqueeze(-1).detach(), log_prob.unsqueeze(-1).detach(), output.detach() return actions.unsqueeze(-1), log_prob.unsqueeze(-1), output def distribution(self, role: str = "") -> torch.distributions.Categorical: diff --git a/skrl/models/torch/deterministic.py b/skrl/models/torch/deterministic.py index 8551d00a..bc863320 100644 --- a/skrl/models/torch/deterministic.py +++ b/skrl/models/torch/deterministic.py @@ -64,7 +64,6 @@ def __init__(self, clip_actions: bool = False, role: str = "") -> None: def act(self, states: torch.Tensor, taken_actions: Optional[torch.Tensor] = None, - inference: bool = False, role: str = "") -> Sequence[torch.Tensor]: """Act deterministically in response to the state of the environment @@ -73,8 +72,6 @@ def act(self, :param taken_actions: Actions taken by a policy to the given states (default: ``None``). The use of these actions only makes sense in critical models, e.g. :type taken_actions: torch.Tensor, optional - :param inference: Flag to indicate whether the model is making inference (default: ``False``) - :type inference: bool, optional :param role: Role play by the model (default: ``""``) :type role: str, optional @@ -104,7 +101,5 @@ def act(self, else: actions = torch.clamp(actions, min=self.clip_actions_min, max=self.clip_actions_max) - if inference: - return actions.detach(), None, None return actions, None, None \ No newline at end of file diff --git a/skrl/models/torch/gaussian.py b/skrl/models/torch/gaussian.py index b80895aa..c06fe570 100644 --- a/skrl/models/torch/gaussian.py +++ b/skrl/models/torch/gaussian.py @@ -112,7 +112,6 @@ def __init__(self, def act(self, states: torch.Tensor, taken_actions: Optional[torch.Tensor] = None, - inference: bool = False, role: str = "") -> Sequence[torch.Tensor]: """Act stochastically in response to the state of the environment @@ -121,8 +120,6 @@ def act(self, :param taken_actions: Actions taken by a policy to the given states (default: ``None``). The use of these actions only makes sense in critical models, e.g. 
:type taken_actions: torch.Tensor, optional - :param inference: Flag to indicate whether the model is making inference (default: ``False``) - :type inference: bool, optional :param role: Role play by the model (default: ``""``) :type role: str, optional @@ -176,8 +173,6 @@ def act(self, if log_prob.dim() != actions.dim(): log_prob = log_prob.unsqueeze(-1) - if inference: - return actions.detach(), log_prob.detach(), actions_mean.detach() return actions, log_prob, actions_mean def get_entropy(self, role: str = "") -> torch.Tensor: diff --git a/skrl/models/torch/multivariate_gaussian.py b/skrl/models/torch/multivariate_gaussian.py index b408923c..cc65b7c9 100644 --- a/skrl/models/torch/multivariate_gaussian.py +++ b/skrl/models/torch/multivariate_gaussian.py @@ -98,7 +98,6 @@ def __init__(self, def act(self, states: torch.Tensor, taken_actions: Optional[torch.Tensor] = None, - inference: bool = False, role: str = "") -> Sequence[torch.Tensor]: """Act stochastically in response to the state of the environment @@ -107,8 +106,6 @@ def act(self, :param taken_actions: Actions taken by a policy to the given states (default: ``None``). The use of these actions only makes sense in critical models, e.g. :type taken_actions: torch.Tensor, optional - :param inference: Flag to indicate whether the model is making inference (default: ``False``) - :type inference: bool, optional :param role: Role play by the model (default: ``""``) :type role: str, optional @@ -160,8 +157,6 @@ def act(self, if log_prob.dim() != actions.dim(): log_prob = log_prob.unsqueeze(-1) - if inference: - return actions.detach(), log_prob.detach(), actions_mean.detach() return actions, log_prob, actions_mean def get_entropy(self, role: str = "") -> torch.Tensor: From 14e87646cf55425574210da097172e2bb52fd9e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Mon, 5 Sep 2022 17:54:40 +0200 Subject: [PATCH 047/108] Update A2C implementation --- skrl/agents/torch/a2c/a2c.py | 69 ++++++++++++++++-------------------- 1 file changed, 31 insertions(+), 38 deletions(-) diff --git a/skrl/agents/torch/a2c/a2c.py b/skrl/agents/torch/a2c/a2c.py index 8c779bdf..b51d2479 100644 --- a/skrl/agents/torch/a2c/a2c.py +++ b/skrl/agents/torch/a2c/a2c.py @@ -2,6 +2,7 @@ import gym import copy +import itertools import torch import torch.nn as nn @@ -20,8 +21,7 @@ "discount_factor": 0.99, # discount factor (gamma) "lambda": 0.95, # TD(lambda) coefficient (lam) for computing returns and advantages - "policy_learning_rate": 1e-3, # policy learning rate - "value_learning_rate": 1e-3, # value learning rate + "learning_rate": 1e-3, # learning rate "learning_rate_scheduler": None, # learning rate scheduler class (see torch.optim.lr_scheduler) "learning_rate_scheduler_kwargs": {}, # learning rate scheduler's kwargs (e.g. 
{"step_size": 1e-3}) @@ -104,8 +104,7 @@ def __init__(self, self._entropy_loss_scale = self.cfg["entropy_loss_scale"] - self._policy_learning_rate = self.cfg["policy_learning_rate"] - self._value_learning_rate = self.cfg["value_learning_rate"] + self._learning_rate = self.cfg["learning_rate"] self._learning_rate_scheduler = self.cfg["learning_rate_scheduler"] self._state_preprocessor = self.cfg["state_preprocessor"] @@ -119,13 +118,15 @@ def __init__(self, self._rewards_shaper = self.cfg["rewards_shaper"] - # set up optimizers and learning rate schedulers + # set up optimizer and learning rate scheduler if self.policy is not None and self.value is not None: - self.policy_optimizer = torch.optim.Adam(self.policy.parameters(), lr=self._policy_learning_rate) - self.value_optimizer = torch.optim.Adam(self.value.parameters(), lr=self._value_learning_rate) + if self.policy is self.value: + self.optimizer = torch.optim.Adam(self.policy.parameters(), lr=self._learning_rate) + else: + self.optimizer = torch.optim.Adam(itertools.chain(self.policy.parameters(), self.value.parameters()), + lr=self._learning_rate) if self._learning_rate_scheduler is not None: - self.policy_scheduler = self._learning_rate_scheduler(self.policy_optimizer, **self.cfg["learning_rate_scheduler_kwargs"]) - self.value_scheduler = self._learning_rate_scheduler(self.value_optimizer, **self.cfg["learning_rate_scheduler_kwargs"]) + self.scheduler = self._learning_rate_scheduler(self.optimizer, **self.cfg["learning_rate_scheduler_kwargs"]) # set up preprocessors self._state_preprocessor = self._state_preprocessor(**self.cfg["state_preprocessor_kwargs"]) if self._state_preprocessor \ @@ -157,8 +158,7 @@ def init(self) -> None: def act(self, states: torch.Tensor, timestep: int, - timesteps: int, - inference: bool = False) -> torch.Tensor: + timesteps: int) -> torch.Tensor: """Process the environment's states to make a decision (actions) using the main policy :param states: Environment's states @@ -167,8 +167,6 @@ def act(self, :type timestep: int :param timesteps: Number of timesteps :type timesteps: int - :param inference: Flag to indicate whether the model is making inference - :type inference: bool :return: Actions :rtype: torch.Tensor @@ -178,10 +176,10 @@ def act(self, # sample random actions # TODO, check for stochasticity if timestep < self._random_timesteps: - return self.policy.random_act(states) + return self.policy.random_act(states, taken_actions=None, role="policy") # sample stochastic actions - return self.policy.act(states, inference=inference) + return self.policy.act(states, taken_actions=None, role="policy") def record_transition(self, states: torch.Tensor, @@ -220,7 +218,8 @@ def record_transition(self, self._current_next_states = next_states if self.memory is not None: - values, _, _ = self.value.act(states=self._state_preprocessor(states), inference=True) + with torch.no_grad(): + values, _, _ = self.value.act(self._state_preprocessor(states), taken_actions=None, role="value") values = self._value_preprocessor(values, inverse=True) self.memory.add_samples(states=states, actions=actions, rewards=rewards, next_states=next_states, dones=dones, @@ -306,8 +305,8 @@ def compute_gae(rewards: torch.Tensor, return returns, advantages # compute returns and advantages - last_values, _, _ = self.value.act(states=self._state_preprocessor(self._current_next_states.float() \ - if not torch.is_floating_point(self._current_next_states) else self._current_next_states), inference=True) + with torch.no_grad(): + last_values, _, _ = 
self.value.act(self._state_preprocessor(self._current_next_states.float()), taken_actions=None, role="value") last_values = self._value_preprocessor(last_values, inverse=True) values = self.memory.get_tensor_by_name("values") @@ -334,35 +333,31 @@ def compute_gae(rewards: torch.Tensor, sampled_states = self._state_preprocessor(sampled_states, train=True) - _, next_log_prob, _ = self.policy.act(states=sampled_states, taken_actions=sampled_actions) + _, next_log_prob, _ = self.policy.act(states=sampled_states, taken_actions=sampled_actions, role="policy") # compute entropy loss if self._entropy_loss_scale: - entropy_loss = -self._entropy_loss_scale * self.policy.get_entropy().mean() + entropy_loss = -self._entropy_loss_scale * self.policy.get_entropy(role="policy").mean() else: entropy_loss = 0 # compute policy loss policy_loss = -(sampled_advantages * next_log_prob).mean() - # optimization step (policy) - self.policy_optimizer.zero_grad() - (policy_loss + entropy_loss).backward() - if self._grad_norm_clip > 0: - nn.utils.clip_grad_norm_(self.policy.parameters(), self._grad_norm_clip) - self.policy_optimizer.step() - # compute value loss - predicted_values, _, _ = self.value.act(states=sampled_states) + predicted_values, _, _ = self.value.act(states=sampled_states, taken_actions=None, role="value") value_loss = F.mse_loss(sampled_returns, predicted_values) - # optimization step (value) - self.value_optimizer.zero_grad() - value_loss.backward() + # optimization step + self.optimizer.zero_grad() + (policy_loss + entropy_loss + value_loss).backward() if self._grad_norm_clip > 0: - nn.utils.clip_grad_norm_(self.value.parameters(), self._grad_norm_clip) - self.value_optimizer.step() + if self.policy is self.value: + nn.utils.clip_grad_norm_(self.policy.parameters(), max_norm=self._grad_norm_clip) + else: + nn.utils.clip_grad_norm_(itertools.chain(self.policy.parameters(), self.value.parameters()), self._grad_norm_clip) + self.optimizer.step() # update cumulative losses cumulative_policy_loss += policy_loss.item() @@ -372,8 +367,7 @@ def compute_gae(rewards: torch.Tensor, # update learning rate if self._learning_rate_scheduler: - self.policy_scheduler.step() - self.value_scheduler.step() + self.scheduler.step() # record data self.track_data("Loss / Policy loss", cumulative_policy_loss / len(sampled_batches)) @@ -382,8 +376,7 @@ def compute_gae(rewards: torch.Tensor, if self._entropy_loss_scale: self.track_data("Loss / Entropy loss", cumulative_entropy_loss / len(sampled_batches)) - self.track_data("Policy / Standard deviation", self.policy.distribution().stddev.mean().item()) + self.track_data("Policy / Standard deviation", self.policy.distribution(role="policy").stddev.mean().item()) if self._learning_rate_scheduler: - self.track_data("Learning / Policy learning rate", self.policy_scheduler.get_last_lr()[0]) - self.track_data("Learning / Value learning rate", self.value_scheduler.get_last_lr()[0]) + self.track_data("Learning / Learning rate", self.scheduler.get_last_lr()[0]) From c5bef1fea92fb7c3c56ea4c18828466930f48733 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Mon, 5 Sep 2022 18:14:27 +0200 Subject: [PATCH 048/108] Update A2C agent in docs --- docs/source/modules/skrl.agents.a2c.rst | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/docs/source/modules/skrl.agents.a2c.rst b/docs/source/modules/skrl.agents.a2c.rst index da63d444..7413838e 100644 --- a/docs/source/modules/skrl.agents.a2c.rst +++ 
b/docs/source/modules/skrl.agents.a2c.rst @@ -8,6 +8,10 @@ Paper: `Asynchronous Methods for Deep Reinforcement Learning Date: Mon, 5 Sep 2022 20:04:56 +0200 Subject: [PATCH 049/108] Pass taken_action and role arguments to each model --- skrl/agents/torch/a2c/a2c.py | 7 ++-- skrl/agents/torch/amp/amp.py | 37 +++++++++----------- skrl/agents/torch/cem/cem.py | 14 +++----- skrl/agents/torch/ddpg/ddpg.py | 22 +++++------- skrl/agents/torch/dqn/ddqn.py | 23 +++++-------- skrl/agents/torch/dqn/dqn.py | 19 ++++------ skrl/agents/torch/ppo/ppo.py | 40 +++++++++++----------- skrl/agents/torch/q_learning/q_learning.py | 12 ++----- skrl/agents/torch/sac/sac.py | 28 ++++++--------- skrl/agents/torch/sarsa/sarsa.py | 14 +++----- skrl/agents/torch/td3/td3.py | 26 ++++++-------- skrl/agents/torch/trpo/trpo.py | 29 +++++++--------- 12 files changed, 106 insertions(+), 165 deletions(-) diff --git a/skrl/agents/torch/a2c/a2c.py b/skrl/agents/torch/a2c/a2c.py index b51d2479..02dec6da 100644 --- a/skrl/agents/torch/a2c/a2c.py +++ b/skrl/agents/torch/a2c/a2c.py @@ -155,10 +155,7 @@ def init(self) -> None: # create temporary variables needed for storage and computation self._current_next_states = None - def act(self, - states: torch.Tensor, - timestep: int, - timesteps: int) -> torch.Tensor: + def act(self, states: torch.Tensor, timestep: int, timesteps: int) -> torch.Tensor: """Process the environment's states to make a decision (actions) using the main policy :param states: Environment's states @@ -354,7 +351,7 @@ def compute_gae(rewards: torch.Tensor, (policy_loss + entropy_loss + value_loss).backward() if self._grad_norm_clip > 0: if self.policy is self.value: - nn.utils.clip_grad_norm_(self.policy.parameters(), max_norm=self._grad_norm_clip) + nn.utils.clip_grad_norm_(self.policy.parameters(), self._grad_norm_clip) else: nn.utils.clip_grad_norm_(itertools.chain(self.policy.parameters(), self.value.parameters()), self._grad_norm_clip) self.optimizer.step() diff --git a/skrl/agents/torch/amp/amp.py b/skrl/agents/torch/amp/amp.py index 1b1df03d..4664fcbb 100644 --- a/skrl/agents/torch/amp/amp.py +++ b/skrl/agents/torch/amp/amp.py @@ -231,11 +231,7 @@ def init(self) -> None: self._current_log_prob = None self._current_states = None - def act(self, - states: torch.Tensor, - timestep: int, - timesteps: int, - inference: bool = False) -> torch.Tensor: + def act(self, states: torch.Tensor, timestep: int, timesteps: int) -> torch.Tensor: """Process the environment's states to make a decision (actions) using the main policy :param states: Environment's states @@ -244,8 +240,6 @@ def act(self, :type timestep: int :param timesteps: Number of timesteps :type timesteps: int - :param inference: Flag to indicate whether the model is making inference - :type inference: bool :return: Actions :rtype: torch.Tensor @@ -259,10 +253,10 @@ def act(self, # sample random actions # TODO, check for stochasticity if timestep < self._random_timesteps: - return self.policy.random_act(states) + return self.policy.random_act(states, taken_actions=None, role="policy") # sample stochastic actions - actions, log_prob, actions_mean = self.policy.act(states, inference=inference) + actions, log_prob, actions_mean = self.policy.act(states, taken_actions=None, role="policy") self._current_log_prob = log_prob return actions, log_prob, actions_mean @@ -308,10 +302,12 @@ def record_transition(self, amp_states = infos["amp_obs"] if self.memory is not None: - values, _, _ = self.value.act(states=self._state_preprocessor(states), inference=True) + with 
torch.no_grad(): + values, _, _ = self.value.act(states=self._state_preprocessor(states), taken_actions=None, role="value") values = self._value_preprocessor(values, inverse=True) - next_values, _, _ = self.value.act(states=self._state_preprocessor(next_states), inference=True) + with torch.no_grad(): + next_values, _, _ = self.value.act(states=self._state_preprocessor(next_states), taken_actions=None, role="value") next_values = self._value_preprocessor(next_values, inverse=True) next_values *= infos['terminate'].view(-1, 1).logical_not() @@ -405,7 +401,7 @@ def compute_gae(rewards: torch.Tensor, amp_states = self.memory.get_tensor_by_name("amp_states") with torch.no_grad(): - amp_logits, _, _ = self.discriminator.act(self._amp_state_preprocessor(amp_states)) + amp_logits, _, _ = self.discriminator.act(self._amp_state_preprocessor(amp_states), taken_actions=None, role="discriminator") style_reward = -torch.log(torch.maximum(1 - 1 / (1 + torch.exp(-amp_logits)), torch.tensor(0.0001, device=self.device))) style_reward *= self._discriminator_reward_scale @@ -452,11 +448,11 @@ def compute_gae(rewards: torch.Tensor, sampled_states = self._state_preprocessor(sampled_states, train=True) - _, next_log_prob, _ = self.policy.act(states=sampled_states, taken_actions=sampled_actions) + _, next_log_prob, _ = self.policy.act(states=sampled_states, taken_actions=sampled_actions, role="policy") # compute entropy loss if self._entropy_loss_scale: - entropy_loss = -self._entropy_loss_scale * self.policy.get_entropy().mean() + entropy_loss = -self._entropy_loss_scale * self.policy.get_entropy(role="policy").mean() else: entropy_loss = 0 @@ -468,7 +464,7 @@ def compute_gae(rewards: torch.Tensor, policy_loss = -torch.min(surrogate, surrogate_clipped).mean() # compute value loss - predicted_values, _, _ = self.value.act(states=sampled_states) + predicted_values, _, _ = self.value.act(states=sampled_states, taken_actions=None, role="value") if self._clip_predicted_values: predicted_values = sampled_values + torch.clip(predicted_values - sampled_values, @@ -489,9 +485,9 @@ def compute_gae(rewards: torch.Tensor, sampled_amp_motion_states = self._amp_state_preprocessor(sampled_motion_batches[batch_index][0], train=True) sampled_amp_motion_states.requires_grad_(True) - amp_logits, _, _ = self.discriminator.act(states=sampled_amp_states) - amp_replay_logits, _, _ = self.discriminator.act(states=sampled_amp_replay_states) - amp_motion_logits, _, _ = self.discriminator.act(states=sampled_amp_motion_states) + amp_logits, _, _ = self.discriminator.act(states=sampled_amp_states, taken_actions=None, role="discriminator") + amp_replay_logits, _, _ = self.discriminator.act(states=sampled_amp_replay_states, taken_actions=None, role="discriminator") + amp_motion_logits, _, _ = self.discriminator.act(states=sampled_amp_motion_states, taken_actions=None, role="discriminator") amp_cat_logits = torch.cat([amp_logits, amp_replay_logits], dim=0) @@ -530,8 +526,7 @@ def compute_gae(rewards: torch.Tensor, if self._grad_norm_clip > 0: nn.utils.clip_grad_norm_(itertools.chain(self.policy.parameters(), self.value.parameters(), - self.discriminator.parameters()), - max_norm=self._grad_norm_clip) + self.discriminator.parameters()), self._grad_norm_clip) self.optimizer.step() # update cumulative losses @@ -555,7 +550,7 @@ def compute_gae(rewards: torch.Tensor, self.track_data("Loss / Entropy loss", cumulative_entropy_loss / (self._learning_epochs * self._mini_batches)) self.track_data("Loss / Discriminator loss", 
cumulative_discriminator_loss / (self._learning_epochs * self._mini_batches)) - self.track_data("Policy / Standard deviation", self.policy.distribution().stddev.mean().item()) + self.track_data("Policy / Standard deviation", self.policy.distribution(role="policy").stddev.mean().item()) if self._learning_rate_scheduler: self.track_data("Learning / Learning rate", self.scheduler.get_last_lr()[0]) diff --git a/skrl/agents/torch/cem/cem.py b/skrl/agents/torch/cem/cem.py index b2b831d8..35a40da2 100644 --- a/skrl/agents/torch/cem/cem.py +++ b/skrl/agents/torch/cem/cem.py @@ -129,11 +129,7 @@ def init(self) -> None: self.tensors_names = ["states", "actions", "rewards", "next_states", "dones"] - def act(self, - states: torch.Tensor, - timestep: int, - timesteps: int, - inference: bool = False) -> torch.Tensor: + def act(self, states: torch.Tensor, timestep: int, timesteps: int) -> torch.Tensor: """Process the environment's states to make a decision (actions) using the main policy :param states: Environment's states @@ -142,8 +138,6 @@ def act(self, :type timestep: int :param timesteps: Number of timesteps :type timesteps: int - :param inference: Flag to indicate whether the model is making inference - :type inference: bool :return: Actions :rtype: torch.Tensor @@ -153,10 +147,10 @@ def act(self, # sample random actions # TODO, check for stochasticity if timestep < self._random_timesteps: - return self.policy.random_act(states) + return self.policy.random_act(states, taken_actions=None, role="policy") # sample stochastic actions - return self.policy.act(states, inference=inference) + return self.policy.act(states, taken_actions=None, role="policy") def record_transition(self, states: torch.Tensor, @@ -269,7 +263,7 @@ def _update(self, timestep: int, timesteps: int) -> None: elite_actions = torch.cat([sampled_actions[limits[i][0]:limits[i][1]] for i in indexes[:, 0]], dim=0) # compute scores for the elite states - scores = self.policy.act(elite_states)[2] + scores = self.policy.act(elite_states, taken_actions=None, role="policy")[2] # compute policy loss policy_loss = F.cross_entropy(scores, elite_actions.view(-1)) diff --git a/skrl/agents/torch/ddpg/ddpg.py b/skrl/agents/torch/ddpg/ddpg.py index d5e9e5b8..e829aef8 100644 --- a/skrl/agents/torch/ddpg/ddpg.py +++ b/skrl/agents/torch/ddpg/ddpg.py @@ -163,11 +163,7 @@ def init(self) -> None: # backward compatibility: torch < 1.9 clamp method does not support tensors self._backward_compatibility = tuple(map(int, (torch.__version__.split(".")[:2]))) < (1, 9) - def act(self, - states: torch.Tensor, - timestep: int, - timesteps: int, - inference: bool = False) -> torch.Tensor: + def act(self, states: torch.Tensor, timestep: int, timesteps: int) -> torch.Tensor: """Process the environment's states to make a decision (actions) using the main policy :param states: Environment's states @@ -176,8 +172,6 @@ def act(self, :type timestep: int :param timesteps: Number of timesteps :type timesteps: int - :param inference: Flag to indicate whether the model is making inference - :type inference: bool :return: Actions :rtype: torch.Tensor @@ -186,10 +180,10 @@ def act(self, # sample random actions if timestep < self._random_timesteps: - return self.policy.random_act(states) + return self.policy.random_act(states, taken_actions=None, role="policy") # sample deterministic actions - actions = self.policy.act(states, inference=inference) + actions = self.policy.act(states, taken_actions=None, role="policy") # add exloration noise if self._exploration_noise is not None: 
@@ -313,13 +307,13 @@ def _update(self, timestep: int, timesteps: int) -> None: # compute target values with torch.no_grad(): - next_actions, _, _ = self.target_policy.act(states=sampled_next_states) + next_actions, _, _ = self.target_policy.act(states=sampled_next_states, taken_actions=None, role="target_policy") - target_q_values, _, _ = self.target_critic.act(states=sampled_next_states, taken_actions=next_actions) + target_q_values, _, _ = self.target_critic.act(states=sampled_next_states, taken_actions=next_actions, role="target_critic") target_values = sampled_rewards + self._discount_factor * sampled_dones.logical_not() * target_q_values # compute critic loss - critic_values, _, _ = self.critic.act(states=sampled_states, taken_actions=sampled_actions) + critic_values, _, _ = self.critic.act(states=sampled_states, taken_actions=sampled_actions, role="critic") critic_loss = F.mse_loss(critic_values, target_values) @@ -329,8 +323,8 @@ def _update(self, timestep: int, timesteps: int) -> None: self.critic_optimizer.step() # compute policy (actor) loss - actions, _, _ = self.policy.act(states=sampled_states) - critic_values, _, _ = self.critic.act(states=sampled_states, taken_actions=actions) + actions, _, _ = self.policy.act(states=sampled_states, taken_actions=None, role="policy") + critic_values, _, _ = self.critic.act(states=sampled_states, taken_actions=actions, role="critic") policy_loss = -critic_values.mean() diff --git a/skrl/agents/torch/dqn/ddqn.py b/skrl/agents/torch/dqn/ddqn.py index 94a44bc0..4655531f 100644 --- a/skrl/agents/torch/dqn/ddqn.py +++ b/skrl/agents/torch/dqn/ddqn.py @@ -153,11 +153,7 @@ def init(self) -> None: self.tensors_names = ["states", "actions", "rewards", "next_states", "dones"] - def act(self, - states: torch.Tensor, - timestep: int, - timesteps: int, - inference: bool = False) -> torch.Tensor: + def act(self, states: torch.Tensor, timestep: int, timesteps: int) -> torch.Tensor: """Process the environment's states to make a decision (actions) using the main policy :param states: Environment's states @@ -166,8 +162,6 @@ def act(self, :type timestep: int :param timesteps: Number of timesteps :type timesteps: int - :param inference: Flag to indicate whether the model is making inference - :type inference: bool :return: Actions :rtype: torch.Tensor @@ -175,10 +169,10 @@ def act(self, states = self._state_preprocessor(states) if not self._exploration_timesteps: - return torch.argmax(self.q_network.act(states, inference=inference)[0], dim=1, keepdim=True), None, None + return torch.argmax(self.q_network.act(states, taken_actions=None, role="q_network")[0], dim=1, keepdim=True), None, None # sample random actions - actions = self.q_network.random_act(states)[0] + actions = self.q_network.random_act(states, taken_actions=None, role="q_network")[0] if timestep < self._random_timesteps: return actions, None, None @@ -188,7 +182,7 @@ def act(self, indexes = (torch.rand(states.shape[0], device=self.device) >= epsilon).nonzero().view(-1) if indexes.numel(): - actions[indexes] = torch.argmax(self.q_network.act(states[indexes], inference=inference)[0], dim=1, keepdim=True) + actions[indexes] = torch.argmax(self.q_network.act(states[indexes], taken_actions=None, role="q_network")[0], dim=1, keepdim=True) # record epsilon self.track_data("Exploration / Exploration epsilon", epsilon) @@ -278,14 +272,15 @@ def _update(self, timestep: int, timesteps: int) -> None: # compute target values with torch.no_grad(): - next_q_values, _, _ = 
self.target_q_network.act(states=sampled_next_states) + next_q_values, _, _ = self.target_q_network.act(states=sampled_next_states, taken_actions=None, role="target_q_network") - target_q_values = torch.gather(next_q_values, dim=1, \ - index=torch.argmax(self.q_network.act(states=sampled_next_states)[0], dim=1, keepdim=True)) + target_q_values = torch.gather(next_q_values, dim=1, index=torch.argmax(self.q_network.act(states=sampled_next_states, \ + taken_actions=None, role="q_network")[0], dim=1, keepdim=True)) target_values = sampled_rewards + self._discount_factor * sampled_dones.logical_not() * target_q_values # compute Q-network loss - q_values = torch.gather(self.q_network.act(states=sampled_states)[0], dim=1, index=sampled_actions.long()) + q_values = torch.gather(self.q_network.act(states=sampled_states, taken_actions=None, role="q_network")[0], + dim=1, index=sampled_actions.long()) q_network_loss = F.mse_loss(q_values, target_values) diff --git a/skrl/agents/torch/dqn/dqn.py b/skrl/agents/torch/dqn/dqn.py index ba6c544f..9c77d947 100644 --- a/skrl/agents/torch/dqn/dqn.py +++ b/skrl/agents/torch/dqn/dqn.py @@ -153,11 +153,7 @@ def init(self) -> None: self.tensors_names = ["states", "actions", "rewards", "next_states", "dones"] - def act(self, - states: torch.Tensor, - timestep: int, - timesteps: int, - inference: bool = False) -> torch.Tensor: + def act(self, states: torch.Tensor, timestep: int, timesteps: int) -> torch.Tensor: """Process the environment's states to make a decision (actions) using the main policy :param states: Environment's states @@ -166,8 +162,6 @@ def act(self, :type timestep: int :param timesteps: Number of timesteps :type timesteps: int - :param inference: Flag to indicate whether the model is making inference - :type inference: bool :return: Actions :rtype: torch.Tensor @@ -175,10 +169,10 @@ def act(self, states = self._state_preprocessor(states) if not self._exploration_timesteps: - return torch.argmax(self.q_network.act(states, inference=inference)[0], dim=1, keepdim=True), None, None + return torch.argmax(self.q_network.act(states, taken_actions=None, role="q_network")[0], dim=1, keepdim=True), None, None # sample random actions - actions = self.q_network.random_act(states)[0] + actions = self.q_network.random_act(states, taken_actions=None, role="q_network")[0] if timestep < self._random_timesteps: return actions, None, None @@ -188,7 +182,7 @@ def act(self, indexes = (torch.rand(states.shape[0], device=self.device) >= epsilon).nonzero().view(-1) if indexes.numel(): - actions[indexes] = torch.argmax(self.q_network.act(states[indexes], inference=inference)[0], dim=1, keepdim=True) + actions[indexes] = torch.argmax(self.q_network.act(states[indexes], taken_actions=None, role="q_network")[0], dim=1, keepdim=True) # record epsilon self.track_data("Exploration / Exploration epsilon", epsilon) @@ -278,13 +272,14 @@ def _update(self, timestep: int, timesteps: int) -> None: # compute target values with torch.no_grad(): - next_q_values, _, _ = self.target_q_network.act(states=sampled_next_states) + next_q_values, _, _ = self.target_q_network.act(states=sampled_next_states, taken_actions=None, role="target_q_network") target_q_values = torch.max(next_q_values, dim=-1, keepdim=True)[0] target_values = sampled_rewards + self._discount_factor * sampled_dones.logical_not() * target_q_values # compute Q-network loss - q_values = torch.gather(self.q_network.act(states=sampled_states)[0], dim=1, index=sampled_actions.long()) + q_values = 
torch.gather(self.q_network.act(states=sampled_states, taken_actions=None, role="q_network")[0], + dim=1, index=sampled_actions.long()) q_network_loss = F.mse_loss(q_values, target_values) diff --git a/skrl/agents/torch/ppo/ppo.py b/skrl/agents/torch/ppo/ppo.py index 560efdfd..a04c06b9 100644 --- a/skrl/agents/torch/ppo/ppo.py +++ b/skrl/agents/torch/ppo/ppo.py @@ -135,8 +135,11 @@ def __init__(self, # set up optimizer and learning rate scheduler if self.policy is not None and self.value is not None: - self.optimizer = torch.optim.Adam(itertools.chain(self.policy.parameters(), self.value.parameters()), - lr=self._learning_rate) + if self.policy is self.value: + self.optimizer = torch.optim.Adam(self.policy.parameters(), lr=self._learning_rate) + else: + self.optimizer = torch.optim.Adam(itertools.chain(self.policy.parameters(), self.value.parameters()), + lr=self._learning_rate) if self._learning_rate_scheduler is not None: self.scheduler = self._learning_rate_scheduler(self.optimizer, **self.cfg["learning_rate_scheduler_kwargs"]) @@ -169,11 +172,7 @@ def init(self) -> None: self._current_log_prob = None self._current_next_states = None - def act(self, - states: torch.Tensor, - timestep: int, - timesteps: int, - inference: bool = False) -> torch.Tensor: + def act(self, states: torch.Tensor, timestep: int, timesteps: int) -> torch.Tensor: """Process the environment's states to make a decision (actions) using the main policy :param states: Environment's states @@ -182,8 +181,6 @@ def act(self, :type timestep: int :param timesteps: Number of timesteps :type timesteps: int - :param inference: Flag to indicate whether the model is making inference - :type inference: bool :return: Actions :rtype: torch.Tensor @@ -193,10 +190,10 @@ def act(self, # sample random actions # TODO, check for stochasticity if timestep < self._random_timesteps: - return self.policy.random_act(states) + return self.policy.random_act(states, taken_actions=None, role="policy") # sample stochastic actions - actions, log_prob, actions_mean = self.policy.act(states, inference=inference) + actions, log_prob, actions_mean = self.policy.act(states, taken_actions=None, role="policy") self._current_log_prob = log_prob return actions, log_prob, actions_mean @@ -238,7 +235,8 @@ def record_transition(self, self._current_next_states = next_states if self.memory is not None: - values, _, _ = self.value.act(states=self._state_preprocessor(states), inference=True) + with torch.no_grad(): + values, _, _ = self.value.act(states=self._state_preprocessor(states), taken_actions=None, role="value") values = self._value_preprocessor(values, inverse=True) self.memory.add_samples(states=states, actions=actions, rewards=rewards, next_states=next_states, dones=dones, @@ -324,8 +322,8 @@ def compute_gae(rewards: torch.Tensor, return returns, advantages # compute returns and advantages - last_values, _, _ = self.value.act(states=self._state_preprocessor(self._current_next_states.float() \ - if not torch.is_floating_point(self._current_next_states) else self._current_next_states), inference=True) + with torch.no_grad(): + last_values, _, _ = self.value.act(self._state_preprocessor(self._current_next_states.float()), taken_actions=None, role="value") last_values = self._value_preprocessor(last_values, inverse=True) values = self.memory.get_tensor_by_name("values") @@ -357,7 +355,7 @@ def compute_gae(rewards: torch.Tensor, sampled_states = self._state_preprocessor(sampled_states, train=not epoch) - _, next_log_prob, _ = 
self.policy.act(states=sampled_states, taken_actions=sampled_actions) + _, next_log_prob, _ = self.policy.act(states=sampled_states, taken_actions=sampled_actions, role="policy") # compute aproximate KL divergence with torch.no_grad(): @@ -371,7 +369,7 @@ def compute_gae(rewards: torch.Tensor, # compute entropy loss if self._entropy_loss_scale: - entropy_loss = -self._entropy_loss_scale * self.policy.get_entropy().mean() + entropy_loss = -self._entropy_loss_scale * self.policy.get_entropy(role="policy").mean() else: entropy_loss = 0 @@ -383,7 +381,7 @@ def compute_gae(rewards: torch.Tensor, policy_loss = -torch.min(surrogate, surrogate_clipped).mean() # compute value loss - predicted_values, _, _ = self.value.act(states=sampled_states) + predicted_values, _, _ = self.value.act(states=sampled_states, taken_actions=None, role="value") if self._clip_predicted_values: predicted_values = sampled_values + torch.clip(predicted_values - sampled_values, @@ -395,8 +393,10 @@ def compute_gae(rewards: torch.Tensor, self.optimizer.zero_grad() (policy_loss + entropy_loss + value_loss).backward() if self._grad_norm_clip > 0: - nn.utils.clip_grad_norm_(itertools.chain(self.policy.parameters(), self.value.parameters()), - max_norm=self._grad_norm_clip) + if self.policy is self.value: + nn.utils.clip_grad_norm_(self.policy.parameters(), self._grad_norm_clip) + else: + nn.utils.clip_grad_norm_(itertools.chain(self.policy.parameters(), self.value.parameters()), self._grad_norm_clip) self.optimizer.step() # update cumulative losses @@ -418,7 +418,7 @@ def compute_gae(rewards: torch.Tensor, if self._entropy_loss_scale: self.track_data("Loss / Entropy loss", cumulative_entropy_loss / (self._learning_epochs * self._mini_batches)) - self.track_data("Policy / Standard deviation", self.policy.distribution().stddev.mean().item()) + self.track_data("Policy / Standard deviation", self.policy.distribution(role="policy").stddev.mean().item()) if self._learning_rate_scheduler: self.track_data("Learning / Learning rate", self.scheduler.get_last_lr()[0]) diff --git a/skrl/agents/torch/q_learning/q_learning.py b/skrl/agents/torch/q_learning/q_learning.py index 91fb3bbb..67b74029 100644 --- a/skrl/agents/torch/q_learning/q_learning.py +++ b/skrl/agents/torch/q_learning/q_learning.py @@ -98,11 +98,7 @@ def init(self) -> None: """ super().init() - def act(self, - states: torch.Tensor, - timestep: int, - timesteps: int, - inference: bool = False) -> torch.Tensor: + def act(self, states: torch.Tensor, timestep: int, timesteps: int) -> torch.Tensor: """Process the environment's states to make a decision (actions) using the main policy :param states: Environment's states @@ -111,18 +107,16 @@ def act(self, :type timestep: int :param timesteps: Number of timesteps :type timesteps: int - :param inference: Flag to indicate whether the model is making inference - :type inference: bool :return: Actions :rtype: torch.Tensor """ # sample random actions if timestep < self._random_timesteps: - return self.policy.random_act(states) + return self.policy.random_act(states, taken_actions=None, role="policy") # sample actions from policy - return self.policy.act(states, inference=inference) + return self.policy.act(states, taken_actions=None, role="policy") def record_transition(self, states: torch.Tensor, diff --git a/skrl/agents/torch/sac/sac.py b/skrl/agents/torch/sac/sac.py index 4f3a9d57..b4ca788a 100644 --- a/skrl/agents/torch/sac/sac.py +++ b/skrl/agents/torch/sac/sac.py @@ -166,11 +166,7 @@ def init(self) -> None: self.tensors_names = 
["states", "actions", "rewards", "next_states", "dones"] - def act(self, - states: torch.Tensor, - timestep: int, - timesteps: int, - inference: bool = False) -> torch.Tensor: + def act(self, states: torch.Tensor, timestep: int, timesteps: int) -> torch.Tensor: """Process the environment's states to make a decision (actions) using the main policy :param states: Environment's states @@ -179,8 +175,6 @@ def act(self, :type timestep: int :param timesteps: Number of timesteps :type timesteps: int - :param inference: Flag to indicate whether the model is making inference - :type inference: bool :return: Actions :rtype: torch.Tensor @@ -190,10 +184,10 @@ def act(self, # sample random actions # TODO, check for stochasticity if timestep < self._random_timesteps: - return self.policy.random_act(states) + return self.policy.random_act(states, taken_actions=None, role="policy") # sample stochastic actions - return self.policy.act(states, inference=inference) + return self.policy.act(states, taken_actions=None, role="policy") def record_transition(self, states: torch.Tensor, @@ -278,16 +272,16 @@ def _update(self, timestep: int, timesteps: int) -> None: # compute target values with torch.no_grad(): - next_actions, next_log_prob, _ = self.policy.act(states=sampled_next_states) + next_actions, next_log_prob, _ = self.policy.act(states=sampled_next_states, taken_actions=None, role="policy") - target_q1_values, _, _ = self.target_critic_1.act(states=sampled_next_states, taken_actions=next_actions) - target_q2_values, _, _ = self.target_critic_2.act(states=sampled_next_states, taken_actions=next_actions) + target_q1_values, _, _ = self.target_critic_1.act(states=sampled_next_states, taken_actions=next_actions, role="target_critic_1") + target_q2_values, _, _ = self.target_critic_2.act(states=sampled_next_states, taken_actions=next_actions, role="target_critic_2") target_q_values = torch.min(target_q1_values, target_q2_values) - self._entropy_coefficient * next_log_prob target_values = sampled_rewards + self._discount_factor * sampled_dones.logical_not() * target_q_values # compute critic loss - critic_1_values, _, _ = self.critic_1.act(states=sampled_states, taken_actions=sampled_actions) - critic_2_values, _, _ = self.critic_2.act(states=sampled_states, taken_actions=sampled_actions) + critic_1_values, _, _ = self.critic_1.act(states=sampled_states, taken_actions=sampled_actions, role="critic_1") + critic_2_values, _, _ = self.critic_2.act(states=sampled_states, taken_actions=sampled_actions, role="critic_2") critic_loss = (F.mse_loss(critic_1_values, target_values) + F.mse_loss(critic_2_values, target_values)) / 2 @@ -297,9 +291,9 @@ def _update(self, timestep: int, timesteps: int) -> None: self.critic_optimizer.step() # compute policy (actor) loss - actions, log_prob, _ = self.policy.act(states=sampled_states) - critic_1_values, _, _ = self.critic_1.act(states=sampled_states, taken_actions=actions) - critic_2_values, _, _ = self.critic_2.act(states=sampled_states, taken_actions=actions) + actions, log_prob, _ = self.policy.act(states=sampled_states, taken_actions=None, role="policy") + critic_1_values, _, _ = self.critic_1.act(states=sampled_states, taken_actions=actions, role="critic_1") + critic_2_values, _, _ = self.critic_2.act(states=sampled_states, taken_actions=actions, role="critic_2") policy_loss = (self._entropy_coefficient * log_prob - torch.min(critic_1_values, critic_2_values)).mean() diff --git a/skrl/agents/torch/sarsa/sarsa.py b/skrl/agents/torch/sarsa/sarsa.py index d0710084..4c9f9abb 
100644 --- a/skrl/agents/torch/sarsa/sarsa.py +++ b/skrl/agents/torch/sarsa/sarsa.py @@ -98,11 +98,7 @@ def init(self) -> None: """ super().init() - def act(self, - states: torch.Tensor, - timestep: int, - timesteps: int, - inference: bool = False) -> torch.Tensor: + def act(self, states: torch.Tensor, timestep: int, timesteps: int) -> torch.Tensor: """Process the environment's states to make a decision (actions) using the main policy :param states: Environment's states @@ -111,18 +107,16 @@ def act(self, :type timestep: int :param timesteps: Number of timesteps :type timesteps: int - :param inference: Flag to indicate whether the model is making inference - :type inference: bool :return: Actions :rtype: torch.Tensor """ # sample random actions if timestep < self._random_timesteps: - return self.policy.random_act(states) + return self.policy.random_act(states, taken_actions=None, role="policy") # sample actions from policy - return self.policy.act(states, inference=inference) + return self.policy.act(states, taken_actions=None, role="policy") def record_transition(self, states: torch.Tensor, @@ -205,7 +199,7 @@ def _update(self, timestep: int, timesteps: int) -> None: env_ids = torch.arange(self._current_rewards.shape[0]).view(-1, 1) # compute next actions - next_actions = self.policy.act(self._current_next_states)[0] + next_actions = self.policy.act(self._current_next_states, taken_actions=None, role="policy")[0] # update Q-table q_table[env_ids, self._current_states, self._current_actions] += self._learning_rate \ diff --git a/skrl/agents/torch/td3/td3.py b/skrl/agents/torch/td3/td3.py index 572605f0..73eeacdb 100644 --- a/skrl/agents/torch/td3/td3.py +++ b/skrl/agents/torch/td3/td3.py @@ -179,11 +179,7 @@ def init(self) -> None: # backward compatibility: torch < 1.9 clamp method does not support tensors self._backward_compatibility = tuple(map(int, (torch.__version__.split(".")[:2]))) < (1, 9) - def act(self, - states: torch.Tensor, - timestep: int, - timesteps: int, - inference: bool = False) -> torch.Tensor: + def act(self, states: torch.Tensor, timestep: int, timesteps: int) -> torch.Tensor: """Process the environment's states to make a decision (actions) using the main policy :param states: Environment's states @@ -192,8 +188,6 @@ def act(self, :type timestep: int :param timesteps: Number of timesteps :type timesteps: int - :param inference: Flag to indicate whether the model is making inference - :type inference: bool :return: Actions :rtype: torch.Tensor @@ -202,10 +196,10 @@ def act(self, # sample random actions if timestep < self._random_timesteps: - return self.policy.random_act(states) + return self.policy.random_act(states, taken_actions=None, role="policy") # sample deterministic actions - actions = self.policy.act(states, inference=inference) + actions = self.policy.act(states, taken_actions=None, role="policy") # add noise if self._exploration_noise is not None: @@ -330,7 +324,7 @@ def _update(self, timestep: int, timesteps: int) -> None: with torch.no_grad(): # target policy smoothing - next_actions, _, _ = self.target_policy.act(states=sampled_next_states) + next_actions, _, _ = self.target_policy.act(states=sampled_next_states, taken_actions=None, role="target_policy") noises = torch.clamp(self._smooth_regularization_noise.sample(next_actions.shape), min=-self._smooth_regularization_clip, max=self._smooth_regularization_clip) @@ -342,14 +336,14 @@ def _update(self, timestep: int, timesteps: int) -> None: next_actions.clamp_(min=self.clip_actions_min, 
max=self.clip_actions_max) # compute target values - target_q1_values, _, _ = self.target_critic_1.act(states=sampled_next_states, taken_actions=next_actions) - target_q2_values, _, _ = self.target_critic_2.act(states=sampled_next_states, taken_actions=next_actions) + target_q1_values, _, _ = self.target_critic_1.act(states=sampled_next_states, taken_actions=next_actions, role="target_critic_1") + target_q2_values, _, _ = self.target_critic_2.act(states=sampled_next_states, taken_actions=next_actions, role="target_critic_2") target_q_values = torch.min(target_q1_values, target_q2_values) target_values = sampled_rewards + self._discount_factor * sampled_dones.logical_not() * target_q_values # compute critic loss - critic_1_values, _, _ = self.critic_1.act(states=sampled_states, taken_actions=sampled_actions) - critic_2_values, _, _ = self.critic_2.act(states=sampled_states, taken_actions=sampled_actions) + critic_1_values, _, _ = self.critic_1.act(states=sampled_states, taken_actions=sampled_actions, role="critic_1") + critic_2_values, _, _ = self.critic_2.act(states=sampled_states, taken_actions=sampled_actions, role="critic_2") critic_loss = F.mse_loss(critic_1_values, target_values) + F.mse_loss(critic_2_values, target_values) @@ -363,8 +357,8 @@ def _update(self, timestep: int, timesteps: int) -> None: if not self._critic_update_counter % self._policy_delay: # compute policy (actor) loss - actions, _, _ = self.policy.act(states=sampled_states) - critic_values, _, _ = self.critic_1.act(states=sampled_states, taken_actions=actions) + actions, _, _ = self.policy.act(states=sampled_states, taken_actions=None, role="policy") + critic_values, _, _ = self.critic_1.act(states=sampled_states, taken_actions=actions, role="critic_1") policy_loss = -critic_values.mean() diff --git a/skrl/agents/torch/trpo/trpo.py b/skrl/agents/torch/trpo/trpo.py index ce78c89f..aead3d07 100644 --- a/skrl/agents/torch/trpo/trpo.py +++ b/skrl/agents/torch/trpo/trpo.py @@ -169,11 +169,7 @@ def init(self) -> None: self._current_log_prob = None self._current_next_states = None - def act(self, - states: torch.Tensor, - timestep: int, - timesteps: int, - inference: bool = False) -> torch.Tensor: + def act(self, states: torch.Tensor, timestep: int, timesteps: int) -> torch.Tensor: """Process the environment's states to make a decision (actions) using the main policy :param states: Environment's states @@ -182,8 +178,6 @@ def act(self, :type timestep: int :param timesteps: Number of timesteps :type timesteps: int - :param inference: Flag to indicate whether the model is making inference - :type inference: bool :return: Actions :rtype: torch.Tensor @@ -193,10 +187,10 @@ def act(self, # sample random actions # TODO, check for stochasticity if timestep < self._random_timesteps: - return self.policy.random_act(states) + return self.policy.random_act(states, taken_actions=None, role="policy") # sample stochastic actions - actions, log_prob, actions_mean = self.policy.act(states, inference=inference) + actions, log_prob, actions_mean = self.policy.act(states, taken_actions=None, role="policy") self._current_log_prob = log_prob return actions, log_prob, actions_mean @@ -238,7 +232,8 @@ def record_transition(self, self._current_next_states = next_states if self.memory is not None: - values, _, _ = self.value.act(states=self._state_preprocessor(states), inference=True) + with torch.no_grad(): + values, _, _ = self.value.act(states=self._state_preprocessor(states), taken_actions=None, role="value") values = 
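A schematic illustration of the role-tagged act() convention used by the calls above, where the same instance can be queried as "policy", "critic_1", "target_critic_1", and so on. RoleAwareModel, its sizes, and its return values are invented stand-ins for illustration only; this is not the library's Model base class.

import torch
import torch.nn as nn

class RoleAwareModel(nn.Module):
    """Toy model whose behavior can depend on the role it is queried with."""
    def __init__(self, num_observations=4, num_actions=2):
        super().__init__()
        self.net = nn.Linear(num_observations, num_actions)

    def act(self, states, taken_actions=None, role=""):
        output = self.net(states)
        if role.startswith("target"):
            # e.g. target networks are queried without tracking gradients
            output = output.detach()
        return output, None, {}

model = RoleAwareModel()
states = torch.randn(8, 4)
actions, _, _ = model.act(states, taken_actions=None, role="policy")
frozen, _, _ = model.act(states, taken_actions=None, role="target_critic_1")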
self._value_preprocessor(values, inverse=True) self.memory.add_samples(states=states, actions=actions, rewards=rewards, next_states=next_states, dones=dones, @@ -342,7 +337,7 @@ def surrogate_loss(policy: Model, :return: Surrogate loss :rtype: torch.Tensor """ - _, new_log_prob, _ = policy.act(states, actions) + _, new_log_prob, _ = policy.act(states, taken_actions=actions, role="policy") return (advantages * torch.exp(new_log_prob - log_prob.detach())).mean() def conjugate_gradient(policy: Model, @@ -426,11 +421,11 @@ def kl_divergence(policy_1: Model, policy_2: Model, states: torch.Tensor) -> tor :return: KL divergence :rtype: torch.Tensor """ - _, _, mu_1 = policy_1.act(states) + _, _, mu_1 = policy_1.act(states, taken_actions=None, role="policy") logstd_1 = policy_1.get_log_std() mu_1, logstd_1 = mu_1.detach(), logstd_1.detach() - _, _, mu_2 = policy_2.act(states) + _, _, mu_2 = policy_2.act(states, taken_actions=None, role="policy") logstd_2 = policy_2.get_log_std() kl = logstd_1 - logstd_2 + 0.5 * (torch.square(logstd_1.exp()) + torch.square(mu_1 - mu_2)) \ @@ -438,8 +433,8 @@ def kl_divergence(policy_1: Model, policy_2: Model, states: torch.Tensor) -> tor return torch.sum(kl, dim=-1).mean() # compute returns and advantages - last_values, _, _ = self.value.act(states=self._state_preprocessor(self._current_next_states.float() \ - if not torch.is_floating_point(self._current_next_states) else self._current_next_states), inference=True) + with torch.no_grad(): + last_values, _, _ = self.value.act(self._state_preprocessor(self._current_next_states.float()), taken_actions=None, role="value") last_values = self._value_preprocessor(last_values, inverse=True) values = self.memory.get_tensor_by_name("values") @@ -506,7 +501,7 @@ def kl_divergence(policy_1: Model, policy_2: Model, states: torch.Tensor) -> tor self.policy.update_parameters(self.backup_policy) # compute value loss - predicted_values, _, _ = self.value.act(sampled_states) + predicted_values, _, _ = self.value.act(sampled_states, taken_actions=None, role="value") value_loss = self._value_loss_scale * F.mse_loss(sampled_returns, predicted_values) @@ -529,7 +524,7 @@ def kl_divergence(policy_1: Model, policy_2: Model, states: torch.Tensor) -> tor self.track_data("Loss / Policy loss", cumulative_policy_loss / (self._learning_epochs * self._mini_batches)) self.track_data("Loss / Value loss", cumulative_value_loss / (self._learning_epochs * self._mini_batches)) - self.track_data("Policy / Standard deviation", self.policy.distribution().stddev.mean().item()) + self.track_data("Policy / Standard deviation", self.policy.distribution(role="policy").stddev.mean().item()) if self._learning_rate_scheduler: self.track_data("Learning / Value learning rate", self.value_scheduler.get_last_lr()[0]) From 0de03ec102b8bffd685c8d8ed1c454ecd68d8d2c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Tue, 6 Sep 2022 11:12:24 +0200 Subject: [PATCH 050/108] Update checkpoint mechanism to storage optimizers and preprocessors --- skrl/agents/torch/a2c/a2c.py | 22 ++++++--- skrl/agents/torch/amp/amp.py | 31 +++++++++---- skrl/agents/torch/base.py | 54 +++++++++++++--------- skrl/agents/torch/cem/cem.py | 13 ++++-- skrl/agents/torch/ddpg/ddpg.py | 17 +++++-- skrl/agents/torch/dqn/ddqn.py | 16 +++++-- skrl/agents/torch/dqn/dqn.py | 14 ++++-- skrl/agents/torch/ppo/ppo.py | 22 ++++++--- skrl/agents/torch/q_learning/q_learning.py | 4 +- skrl/agents/torch/sac/sac.py | 20 ++++++-- skrl/agents/torch/sarsa/sarsa.py | 4 +- 
skrl/agents/torch/td3/td3.py | 19 ++++++-- skrl/agents/torch/trpo/trpo.py | 22 ++++++--- 13 files changed, 182 insertions(+), 76 deletions(-) diff --git a/skrl/agents/torch/a2c/a2c.py b/skrl/agents/torch/a2c/a2c.py index 02dec6da..64d407fa 100644 --- a/skrl/agents/torch/a2c/a2c.py +++ b/skrl/agents/torch/a2c/a2c.py @@ -45,7 +45,7 @@ "write_interval": 250, # TensorBoard writing interval (timesteps) "checkpoint_interval": 1000, # interval for checkpoints (timesteps) - "checkpoint_policy_only": True, # checkpoint for policy only + "store_separately": True, # whether to store checkpoints separately } } @@ -93,7 +93,8 @@ def __init__(self, self.value = self.models.get("value", None) # checkpoint models - self.checkpoint_models = {"policy": self.policy} if self.checkpoint_policy_only else self.models + self.checkpoint_modules["policy"] = self.policy + self.checkpoint_modules["value"] = self.value # configuration self._mini_batches = self.cfg["mini_batches"] @@ -128,11 +129,20 @@ def __init__(self, if self._learning_rate_scheduler is not None: self.scheduler = self._learning_rate_scheduler(self.optimizer, **self.cfg["learning_rate_scheduler_kwargs"]) + self.checkpoint_modules["optimizer"] = self.optimizer + # set up preprocessors - self._state_preprocessor = self._state_preprocessor(**self.cfg["state_preprocessor_kwargs"]) if self._state_preprocessor \ - else self._empty_preprocessor - self._value_preprocessor = self._value_preprocessor(**self.cfg["value_preprocessor_kwargs"]) if self._value_preprocessor \ - else self._empty_preprocessor + if self._state_preprocessor: + self._state_preprocessor = self._state_preprocessor(**self.cfg["state_preprocessor_kwargs"]) + self.checkpoint_modules["state_preprocessor"] = self._state_preprocessor + else: + self._state_preprocessor = self._empty_preprocessor + + if self._value_preprocessor: + self._value_preprocessor = self._value_preprocessor(**self.cfg["value_preprocessor_kwargs"]) + self.checkpoint_modules["value_preprocessor"] = self._value_preprocessor + else: + self._value_preprocessor = self._empty_preprocessor def init(self) -> None: """Initialize the agent diff --git a/skrl/agents/torch/amp/amp.py b/skrl/agents/torch/amp/amp.py index 4664fcbb..e9b2264f 100644 --- a/skrl/agents/torch/amp/amp.py +++ b/skrl/agents/torch/amp/amp.py @@ -63,7 +63,7 @@ "write_interval": 250, # TensorBoard writing interval (timesteps) "checkpoint_interval": 1000, # interval for checkpoints (timesteps) - "checkpoint_policy_only": True, # checkpoint for policy only + "store_separately": True, # whether to store checkpoints separately } } @@ -136,7 +136,9 @@ def __init__(self, self.discriminator = self.models.get("discriminator", None) # checkpoint models - self.checkpoint_models = {"policy": self.policy} if self.checkpoint_policy_only else self.models + self.checkpoint_modules["policy"] = self.policy + self.checkpoint_modules["value"] = self.value + self.checkpoint_modules["discriminator"] = self.discriminator # configuration self._learning_epochs = self.cfg["learning_epochs"] @@ -187,13 +189,26 @@ def __init__(self, if self._learning_rate_scheduler is not None: self.scheduler = self._learning_rate_scheduler(self.optimizer, **self.cfg["learning_rate_scheduler_kwargs"]) + self.checkpoint_modules["optimizer"] = self.optimizer + # set up preprocessors - self._state_preprocessor = self._state_preprocessor(**self.cfg["state_preprocessor_kwargs"]) if self._state_preprocessor \ - else self._empty_preprocessor - self._value_preprocessor = 
self._value_preprocessor(**self.cfg["value_preprocessor_kwargs"]) if self._value_preprocessor \ - else self._empty_preprocessor - self._amp_state_preprocessor = self._amp_state_preprocessor(**self.cfg["amp_state_preprocessor_kwargs"]) \ - if self._amp_state_preprocessor else self._empty_preprocessor + if self._state_preprocessor: + self._state_preprocessor = self._state_preprocessor(**self.cfg["state_preprocessor_kwargs"]) + self.checkpoint_modules["state_preprocessor"] = self._state_preprocessor + else: + self._state_preprocessor = self._empty_preprocessor + + if self._value_preprocessor: + self._value_preprocessor = self._value_preprocessor(**self.cfg["value_preprocessor_kwargs"]) + self.checkpoint_modules["value_preprocessor"] = self._value_preprocessor + else: + self._value_preprocessor = self._empty_preprocessor + + if self._amp_state_preprocessor: + self._amp_state_preprocessor = self._amp_state_preprocessor(**self.cfg["amp_state_preprocessor_kwargs"]) + self.checkpoint_modules["amp_state_preprocessor"] = self._amp_state_preprocessor + else: + self._amp_state_preprocessor = self._empty_preprocessor def init(self) -> None: """Initialize the agent diff --git a/skrl/agents/torch/base.py b/skrl/agents/torch/base.py index 87a4e8d2..f7c9ea07 100644 --- a/skrl/agents/torch/base.py +++ b/skrl/agents/torch/base.py @@ -66,10 +66,10 @@ def __init__(self, self._cumulative_timesteps = None # checkpoint - self.checkpoint_models = {} + self.checkpoint_modules = {} self.checkpoint_interval = self.cfg.get("experiment", {}).get("checkpoint_interval", 1000) - self.checkpoint_policy_only = self.cfg.get("experiment", {}).get("checkpoint_policy_only", True) - self.checkpoint_best_models = {"timestep": 0, "reward": -2 ** 31, "saved": False, "models": {}} + self.checkpoint_store_separately = self.cfg.get("experiment", {}).get("store_separately", False) + self.checkpoint_best_modules = {"timestep": 0, "reward": -2 ** 31, "saved": False, "modules": {}} def __str__(self) -> str: """Generate a representation of the agent as string @@ -164,23 +164,37 @@ def write_checkpoint(self, timestep: int, timesteps: int) -> None: :param timesteps: Number of timesteps :type timesteps: int """ - # current models - for k, model in self.checkpoint_models.items(): - name = "{}_{}".format(timestep if timestep is not None else datetime.datetime.now().strftime("%y-%m-%d_%H-%M-%S-%f"), k) - model.save(os.path.join(self.experiment_dir, "checkpoints", "{}.pt".format(name))) + tag = str(timestep if timestep is not None else datetime.datetime.now().strftime("%y-%m-%d_%H-%M-%S-%f")) + # separated modules + if self.checkpoint_store_separately: + for name, module in self.checkpoint_modules.items(): + torch.save(module.state_dict(), os.path.join(self.experiment_dir, "checkpoints", "{}_{}.pt".format(name, tag))) + # whole agent + else: + modules = {} + for name, module in self.checkpoint_modules.items(): + modules[name] = module.state_dict() + torch.save(modules, os.path.join(self.experiment_dir, "checkpoints", "{}_{}.pt".format("agent", tag))) # best models - if self.checkpoint_best_models["models"] and not self.checkpoint_best_models["saved"]: - for k, model in self.checkpoint_models.items(): - model.save(os.path.join(self.experiment_dir, "checkpoints", "best_{}.pt".format(k)), - state_dict=self.checkpoint_best_models["models"][k]) - self.checkpoint_best_models["saved"] = True + if self.checkpoint_best_modules["modules"] and not self.checkpoint_best_modules["saved"]: + # separated modules + if self.checkpoint_store_separately: + for name, 
module in self.checkpoint_modules.items(): + torch.save(self.checkpoint_best_modules["modules"][name], + os.path.join(self.experiment_dir, "checkpoints", "best_{}.pt".format(name))) + # whole agent + else: + modules = {} + for name, module in self.checkpoint_modules.items(): + modules[name] = self.checkpoint_best_modules["modules"][name] + torch.save(modules, os.path.join(self.experiment_dir, "checkpoints", "best_{}.pt".format("agent"))) + self.checkpoint_best_modules["saved"] = True def act(self, states: torch.Tensor, timestep: int, - timesteps: int, - inference: bool = False) -> torch.Tensor: + timesteps: int) -> torch.Tensor: """Process the environment's states to make a decision (actions) using the main policy :param states: Environment's states @@ -189,8 +203,6 @@ def act(self, :type timestep: int :param timesteps: Number of timesteps :type timesteps: int - :param inference: Flag to indicate whether the model is making inference - :type inference: bool :raises NotImplementedError: The method is not implemented by the inheriting classes @@ -302,11 +314,11 @@ def post_interaction(self, timestep: int, timesteps: int) -> None: if timestep > 1 and self.write_interval > 0 and not timestep % self.write_interval: # update best models reward = np.mean(self.tracking_data.get("Reward / Total reward (mean)", -2 ** 31)) - if reward > self.checkpoint_best_models["reward"]: - self.checkpoint_best_models["timestep"] = timestep - self.checkpoint_best_models["reward"] = reward - self.checkpoint_best_models["saved"] = False - self.checkpoint_best_models["models"] = {k: copy.deepcopy(model.state_dict()) for k, model in self.checkpoint_models.items()} + if reward > self.checkpoint_best_modules["reward"]: + self.checkpoint_best_modules["timestep"] = timestep + self.checkpoint_best_modules["reward"] = reward + self.checkpoint_best_modules["saved"] = False + self.checkpoint_best_modules["modules"] = {k: copy.deepcopy(v.state_dict()) for k, v in self.checkpoint_modules.items()} # write to tensorboard self.write_tracking_data(timestep, timesteps) diff --git a/skrl/agents/torch/cem/cem.py b/skrl/agents/torch/cem/cem.py index 35a40da2..fb655e65 100644 --- a/skrl/agents/torch/cem/cem.py +++ b/skrl/agents/torch/cem/cem.py @@ -36,7 +36,7 @@ "write_interval": 250, # TensorBoard writing interval (timesteps) "checkpoint_interval": 1000, # interval for checkpoints (timesteps) - "checkpoint_policy_only": True, # checkpoint for policy only + "store_separately": True, # whether to store checkpoints separately } } @@ -83,7 +83,7 @@ def __init__(self, self.policy = self.models.get("policy", None) # checkpoint models - self.checkpoint_models = self.models + self.checkpoint_modules["policy"] = self.policy # configuration: self._rollouts = self.cfg["rollouts"] @@ -110,9 +110,14 @@ def __init__(self, if self._learning_rate_scheduler is not None: self.scheduler = self._learning_rate_scheduler(self.optimizer, **self.cfg["learning_rate_scheduler_kwargs"]) + self.checkpoint_modules["optimizer"] = self.optimizer + # set up preprocessors - self._state_preprocessor = self._state_preprocessor(**self.cfg["state_preprocessor_kwargs"]) if self._state_preprocessor \ - else self._empty_preprocessor + if self._state_preprocessor: + self._state_preprocessor = self._state_preprocessor(**self.cfg["state_preprocessor_kwargs"]) + self.checkpoint_modules["state_preprocessor"] = self._state_preprocessor + else: + self._state_preprocessor = self._empty_preprocessor def init(self) -> None: """Initialize the agent diff --git 
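A minimal sketch of the best-checkpoint bookkeeping visible in the base-agent diff above: deep-copy the current state dicts whenever the tracked mean reward improves, and write that snapshot to disk at most once. The track_best/write_best helpers, the reward values, and the /tmp path are assumptions made for this example.

import copy
import torch
import torch.nn as nn

policy = nn.Linear(4, 2)
best = {"reward": float("-inf"), "modules": {}, "saved": False}

def track_best(mean_reward):
    # keep a deep copy of the current weights whenever the reward improves
    if mean_reward > best["reward"]:
        best["reward"] = mean_reward
        best["modules"] = {"policy": copy.deepcopy(policy.state_dict())}
        best["saved"] = False

def write_best(path="/tmp/best_agent.pt"):
    # the snapshot is written once per improvement, at checkpoint time
    if best["modules"] and not best["saved"]:
        torch.save(best["modules"], path)
        best["saved"] = True

for mean_reward in (0.1, 0.7, 0.4):
    track_best(mean_reward)
write_best()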
a/skrl/agents/torch/ddpg/ddpg.py b/skrl/agents/torch/ddpg/ddpg.py index e829aef8..703dcd10 100644 --- a/skrl/agents/torch/ddpg/ddpg.py +++ b/skrl/agents/torch/ddpg/ddpg.py @@ -45,7 +45,7 @@ "write_interval": 250, # TensorBoard writing interval (timesteps) "checkpoint_interval": 1000, # interval for checkpoints (timesteps) - "checkpoint_policy_only": True, # checkpoint for policy only + "store_separately": True, # whether to store checkpoints separately } } @@ -95,7 +95,10 @@ def __init__(self, self.target_critic = self.models.get("target_critic", None) # checkpoint models - self.checkpoint_models = {"policy": self.policy} if self.checkpoint_policy_only else self.models + self.checkpoint_modules["policy"] = self.policy + self.checkpoint_modules["target_policy"] = self.target_policy + self.checkpoint_modules["critic"] = self.critic + self.checkpoint_modules["target_critic"] = self.target_critic if self.target_policy is not None and self.target_critic is not None: # freeze target networks with respect to optimizers (update via .update_parameters()) @@ -137,9 +140,15 @@ def __init__(self, self.policy_scheduler = self._learning_rate_scheduler(self.policy_optimizer, **self.cfg["learning_rate_scheduler_kwargs"]) self.critic_scheduler = self._learning_rate_scheduler(self.critic_optimizer, **self.cfg["learning_rate_scheduler_kwargs"]) + self.checkpoint_modules["policy_optimizer"] = self.policy_optimizer + self.checkpoint_modules["critic_optimizer"] = self.critic_optimizer + # set up preprocessors - self._state_preprocessor = self._state_preprocessor(**self.cfg["state_preprocessor_kwargs"]) if self._state_preprocessor \ - else self._empty_preprocessor + if self._state_preprocessor: + self._state_preprocessor = self._state_preprocessor(**self.cfg["state_preprocessor_kwargs"]) + self.checkpoint_modules["state_preprocessor"] = self._state_preprocessor + else: + self._state_preprocessor = self._empty_preprocessor def init(self) -> None: """Initialize the agent diff --git a/skrl/agents/torch/dqn/ddqn.py b/skrl/agents/torch/dqn/ddqn.py index 4655531f..be0affb9 100644 --- a/skrl/agents/torch/dqn/ddqn.py +++ b/skrl/agents/torch/dqn/ddqn.py @@ -47,7 +47,7 @@ "write_interval": 250, # TensorBoard writing interval (timesteps) "checkpoint_interval": 1000, # interval for checkpoints (timesteps) - "checkpoint_policy_only": True, # checkpoint for policy only + "store_separately": True, # whether to store checkpoints separately } } @@ -95,8 +95,9 @@ def __init__(self, self.target_q_network = self.models.get("target_q_network", None) # checkpoint models - self.checkpoint_models = {"q_network": self.q_network} if self.checkpoint_policy_only else self.models - + self.checkpoint_modules["q_network"] = self.q_network + self.checkpoint_modules["target_q_network"] = self.target_q_network + if self.target_q_network is not None: # freeze target networks with respect to optimizers (update via .update_parameters()) self.target_q_network.freeze_parameters(True) @@ -134,9 +135,14 @@ def __init__(self, if self._learning_rate_scheduler is not None: self.scheduler = self._learning_rate_scheduler(self.optimizer, **self.cfg["learning_rate_scheduler_kwargs"]) + self.checkpoint_modules["optimizer"] = self.optimizer + # set up preprocessors - self._state_preprocessor = self._state_preprocessor(**self.cfg["state_preprocessor_kwargs"]) if self._state_preprocessor \ - else self._empty_preprocessor + if self._state_preprocessor: + self._state_preprocessor = self._state_preprocessor(**self.cfg["state_preprocessor_kwargs"]) + 
self.checkpoint_modules["state_preprocessor"] = self._state_preprocessor + else: + self._state_preprocessor = self._empty_preprocessor def init(self) -> None: """Initialize the agent diff --git a/skrl/agents/torch/dqn/dqn.py b/skrl/agents/torch/dqn/dqn.py index 9c77d947..a3cb03b2 100644 --- a/skrl/agents/torch/dqn/dqn.py +++ b/skrl/agents/torch/dqn/dqn.py @@ -47,7 +47,7 @@ "write_interval": 250, # TensorBoard writing interval (timesteps) "checkpoint_interval": 1000, # interval for checkpoints (timesteps) - "checkpoint_policy_only": True, # checkpoint for policy only + "store_separately": True, # whether to store checkpoints separately } } @@ -95,7 +95,8 @@ def __init__(self, self.target_q_network = self.models.get("target_q_network", None) # checkpoint models - self.checkpoint_models = {"q_network": self.q_network} if self.checkpoint_policy_only else self.models + self.checkpoint_modules["q_network"] = self.q_network + self.checkpoint_modules["target_q_network"] = self.target_q_network if self.target_q_network is not None: # freeze target networks with respect to optimizers (update via .update_parameters()) @@ -134,9 +135,14 @@ def __init__(self, if self._learning_rate_scheduler is not None: self.scheduler = self._learning_rate_scheduler(self.optimizer, **self.cfg["learning_rate_scheduler_kwargs"]) + self.checkpoint_modules["optimizer"] = self.optimizer + # set up preprocessors - self._state_preprocessor = self._state_preprocessor(**self.cfg["state_preprocessor_kwargs"]) if self._state_preprocessor \ - else self._empty_preprocessor + if self._state_preprocessor: + self._state_preprocessor = self._state_preprocessor(**self.cfg["state_preprocessor_kwargs"]) + self.checkpoint_modules["state_preprocessor"] = self._state_preprocessor + else: + self._state_preprocessor = self._empty_preprocessor def init(self) -> None: """Initialize the agent diff --git a/skrl/agents/torch/ppo/ppo.py b/skrl/agents/torch/ppo/ppo.py index a04c06b9..9581c362 100644 --- a/skrl/agents/torch/ppo/ppo.py +++ b/skrl/agents/torch/ppo/ppo.py @@ -53,7 +53,7 @@ "write_interval": 250, # TensorBoard writing interval (timesteps) "checkpoint_interval": 1000, # interval for checkpoints (timesteps) - "checkpoint_policy_only": True, # checkpoint for policy only + "store_separately": True, # whether to store checkpoints separately } } @@ -101,7 +101,8 @@ def __init__(self, self.value = self.models.get("value", None) # checkpoint models - self.checkpoint_models = {"policy": self.policy} if self.checkpoint_policy_only else self.models + self.checkpoint_modules["policy"] = self.policy + self.checkpoint_modules["value"] = self.value # configuration self._learning_epochs = self.cfg["learning_epochs"] @@ -143,11 +144,20 @@ def __init__(self, if self._learning_rate_scheduler is not None: self.scheduler = self._learning_rate_scheduler(self.optimizer, **self.cfg["learning_rate_scheduler_kwargs"]) + self.checkpoint_modules["optimizer"] = self.optimizer + # set up preprocessors - self._state_preprocessor = self._state_preprocessor(**self.cfg["state_preprocessor_kwargs"]) if self._state_preprocessor \ - else self._empty_preprocessor - self._value_preprocessor = self._value_preprocessor(**self.cfg["value_preprocessor_kwargs"]) if self._value_preprocessor \ - else self._empty_preprocessor + if self._state_preprocessor: + self._state_preprocessor = self._state_preprocessor(**self.cfg["state_preprocessor_kwargs"]) + self.checkpoint_modules["state_preprocessor"] = self._state_preprocessor + else: + self._state_preprocessor = self._empty_preprocessor 
+ + if self._value_preprocessor: + self._value_preprocessor = self._value_preprocessor(**self.cfg["value_preprocessor_kwargs"]) + self.checkpoint_modules["value_preprocessor"] = self._value_preprocessor + else: + self._value_preprocessor = self._empty_preprocessor def init(self) -> None: """Initialize the agent diff --git a/skrl/agents/torch/q_learning/q_learning.py b/skrl/agents/torch/q_learning/q_learning.py index 67b74029..95f4a9a5 100644 --- a/skrl/agents/torch/q_learning/q_learning.py +++ b/skrl/agents/torch/q_learning/q_learning.py @@ -27,7 +27,7 @@ "write_interval": 250, # TensorBoard writing interval (timesteps) "checkpoint_interval": 1000, # interval for checkpoints (timesteps) - "checkpoint_policy_only": True, # checkpoint for policy only + "store_separately": True, # whether to store checkpoints separately } } @@ -74,7 +74,7 @@ def __init__(self, self.policy = self.models.get("policy", None) # checkpoint models - self.checkpoint_models = {"policy": self.policy} if self.checkpoint_policy_only else self.models + self.checkpoint_modules["policy"] = self.policy # configuration self._discount_factor = self.cfg["discount_factor"] diff --git a/skrl/agents/torch/sac/sac.py b/skrl/agents/torch/sac/sac.py index b4ca788a..886c6f25 100644 --- a/skrl/agents/torch/sac/sac.py +++ b/skrl/agents/torch/sac/sac.py @@ -45,7 +45,7 @@ "write_interval": 250, # TensorBoard writing interval (timesteps) "checkpoint_interval": 1000, # interval for checkpoints (timesteps) - "checkpoint_policy_only": True, # checkpoint for policy only + "store_separately": True, # whether to store checkpoints separately } } @@ -96,7 +96,11 @@ def __init__(self, self.target_critic_2 = self.models.get("target_critic_2", None) # checkpoint models - self.checkpoint_models = {"policy": self.policy} if self.checkpoint_policy_only else self.models + self.checkpoint_modules["policy"] = self.policy + self.checkpoint_modules["critic_1"] = self.critic_1 + self.checkpoint_modules["critic_2"] = self.critic_2 + self.checkpoint_modules["target_critic_1"] = self.target_critic_1 + self.checkpoint_modules["target_critic_2"] = self.target_critic_2 if self.target_critic_1 is not None and self.target_critic_2 is not None: # freeze target networks with respect to optimizers (update via .update_parameters()) @@ -138,6 +142,8 @@ def __init__(self, self.log_entropy_coefficient = torch.log(torch.ones(1, device=self.device) * self._entropy_coefficient).requires_grad_(True) self.entropy_optimizer = torch.optim.Adam([self.log_entropy_coefficient], lr=self._entropy_learning_rate) + self.checkpoint_modules["entropy_optimizer"] = self.entropy_optimizer + # set up optimizers and learning rate schedulers if self.policy is not None and self.critic_1 is not None and self.critic_2 is not None: self.policy_optimizer = torch.optim.Adam(self.policy.parameters(), lr=self._actor_learning_rate) @@ -147,9 +153,15 @@ def __init__(self, self.policy_scheduler = self._learning_rate_scheduler(self.policy_optimizer, **self.cfg["learning_rate_scheduler_kwargs"]) self.critic_scheduler = self._learning_rate_scheduler(self.critic_optimizer, **self.cfg["learning_rate_scheduler_kwargs"]) + self.checkpoint_modules["policy_optimizer"] = self.policy_optimizer + self.checkpoint_modules["critic_optimizer"] = self.critic_optimizer + # set up preprocessors - self._state_preprocessor = self._state_preprocessor(**self.cfg["state_preprocessor_kwargs"]) if self._state_preprocessor \ - else self._empty_preprocessor + if self._state_preprocessor: + self._state_preprocessor = 
self._state_preprocessor(**self.cfg["state_preprocessor_kwargs"]) + self.checkpoint_modules["state_preprocessor"] = self._state_preprocessor + else: + self._state_preprocessor = self._empty_preprocessor def init(self) -> None: """Initialize the agent diff --git a/skrl/agents/torch/sarsa/sarsa.py b/skrl/agents/torch/sarsa/sarsa.py index 4c9f9abb..9871fe8c 100644 --- a/skrl/agents/torch/sarsa/sarsa.py +++ b/skrl/agents/torch/sarsa/sarsa.py @@ -27,7 +27,7 @@ "write_interval": 250, # TensorBoard writing interval (timesteps) "checkpoint_interval": 1000, # interval for checkpoints (timesteps) - "checkpoint_policy_only": True, # checkpoint for policy only + "store_separately": True, # whether to store checkpoints separately } } @@ -74,7 +74,7 @@ def __init__(self, self.policy = self.models.get("policy", None) # checkpoint models - self.checkpoint_models = {"policy": self.policy} if self.checkpoint_policy_only else self.models + self.checkpoint_modules["policy"] = self.policy # configuration self._discount_factor = self.cfg["discount_factor"] diff --git a/skrl/agents/torch/td3/td3.py b/skrl/agents/torch/td3/td3.py index 73eeacdb..c3c66442 100644 --- a/skrl/agents/torch/td3/td3.py +++ b/skrl/agents/torch/td3/td3.py @@ -50,7 +50,7 @@ "write_interval": 250, # TensorBoard writing interval (timesteps) "checkpoint_interval": 1000, # interval for checkpoints (timesteps) - "checkpoint_policy_only": True, # checkpoint for policy only + "store_separately": True, # whether to store checkpoints separately } } @@ -102,7 +102,12 @@ def __init__(self, self.target_critic_2 = self.models.get("target_critic_2", None) # checkpoint models - self.checkpoint_models = {"policy": self.policy} if self.checkpoint_policy_only else self.models + self.checkpoint_modules["policy"] = self.policy + self.checkpoint_modules["target_policy"] = self.target_policy + self.checkpoint_modules["critic_1"] = self.critic_1 + self.checkpoint_modules["critic_2"] = self.critic_2 + self.checkpoint_modules["target_critic_1"] = self.target_critic_1 + self.checkpoint_modules["target_critic_2"] = self.target_critic_2 if self.target_policy is not None and self.target_critic_1 is not None and self.target_critic_2 is not None: # freeze target networks with respect to optimizers (update via .update_parameters()) @@ -153,9 +158,15 @@ def __init__(self, self.policy_scheduler = self._learning_rate_scheduler(self.policy_optimizer, **self.cfg["learning_rate_scheduler_kwargs"]) self.critic_scheduler = self._learning_rate_scheduler(self.critic_optimizer, **self.cfg["learning_rate_scheduler_kwargs"]) + self.checkpoint_modules["policy_optimizer"] = self.policy_optimizer + self.checkpoint_modules["critic_optimizer"] = self.critic_optimizer + # set up preprocessors - self._state_preprocessor = self._state_preprocessor(**self.cfg["state_preprocessor_kwargs"]) if self._state_preprocessor \ - else self._empty_preprocessor + if self._state_preprocessor: + self._state_preprocessor = self._state_preprocessor(**self.cfg["state_preprocessor_kwargs"]) + self.checkpoint_modules["state_preprocessor"] = self._state_preprocessor + else: + self._state_preprocessor = self._empty_preprocessor def init(self) -> None: """Initialize the agent diff --git a/skrl/agents/torch/trpo/trpo.py b/skrl/agents/torch/trpo/trpo.py index aead3d07..b06c84fe 100644 --- a/skrl/agents/torch/trpo/trpo.py +++ b/skrl/agents/torch/trpo/trpo.py @@ -53,7 +53,7 @@ "write_interval": 250, # TensorBoard writing interval (timesteps) "checkpoint_interval": 1000, # interval for checkpoints (timesteps) - 
"checkpoint_policy_only": True, # checkpoint for policy only + "store_separately": True, # whether to store checkpoints separately } } @@ -103,7 +103,8 @@ def __init__(self, self.backup_policy = copy.deepcopy(self.policy) # checkpoint models - self.checkpoint_models = {"policy": self.policy} if self.checkpoint_policy_only else self.models + self.checkpoint_modules["policy"] = self.policy + self.checkpoint_modules["value"] = self.value # configuration self._learning_epochs = self.cfg["learning_epochs"] @@ -141,11 +142,20 @@ def __init__(self, if self._learning_rate_scheduler is not None: self.value_scheduler = self._learning_rate_scheduler(self.value_optimizer, **self.cfg["learning_rate_scheduler_kwargs"]) + self.checkpoint_modules["value_optimizer"] = self.value_optimizer + # set up preprocessors - self._state_preprocessor = self._state_preprocessor(**self.cfg["state_preprocessor_kwargs"]) if self._state_preprocessor \ - else self._empty_preprocessor - self._value_preprocessor = self._value_preprocessor(**self.cfg["value_preprocessor_kwargs"]) if self._value_preprocessor \ - else self._empty_preprocessor + if self._state_preprocessor: + self._state_preprocessor = self._state_preprocessor(**self.cfg["state_preprocessor_kwargs"]) + self.checkpoint_modules["state_preprocessor"] = self._state_preprocessor + else: + self._state_preprocessor = self._empty_preprocessor + + if self._value_preprocessor: + self._value_preprocessor = self._value_preprocessor(**self.cfg["value_preprocessor_kwargs"]) + self.checkpoint_modules["value_preprocessor"] = self._value_preprocessor + else: + self._value_preprocessor = self._empty_preprocessor def init(self) -> None: """Initialize the agent From 22f2a395b4ed179b364604fe2dd4e84cd228c0a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Tue, 6 Sep 2022 11:51:26 +0200 Subject: [PATCH 051/108] Remove inference argument for model acting --- skrl/trainers/torch/base.py | 4 ++-- skrl/trainers/torch/manual.py | 14 ++++---------- skrl/trainers/torch/parallel.py | 5 +---- skrl/trainers/torch/sequential.py | 10 ++-------- 4 files changed, 9 insertions(+), 24 deletions(-) diff --git a/skrl/trainers/torch/base.py b/skrl/trainers/torch/base.py index 538e08a1..7565b090 100644 --- a/skrl/trainers/torch/base.py +++ b/skrl/trainers/torch/base.py @@ -168,7 +168,7 @@ def single_agent_train(self) -> None: # compute actions with torch.no_grad(): - actions, _, _ = self.agents.act(states, inference=True, timestep=timestep, timesteps=self.timesteps) + actions, _, _ = self.agents.act(states, timestep=timestep, timesteps=self.timesteps) # step the environments next_states, rewards, dones, infos = self.env.step(actions) @@ -220,7 +220,7 @@ def single_agent_eval(self) -> None: # compute actions with torch.no_grad(): - actions, _, _ = self.agents.act(states, inference=True, timestep=timestep, timesteps=self.timesteps) + actions, _, _ = self.agents.act(states, timestep=timestep, timesteps=self.timesteps) # step the environments next_states, rewards, dones, infos = self.env.step(actions) diff --git a/skrl/trainers/torch/manual.py b/skrl/trainers/torch/manual.py index 938fdc80..d82fb26e 100644 --- a/skrl/trainers/torch/manual.py +++ b/skrl/trainers/torch/manual.py @@ -87,7 +87,7 @@ def train(self, timestep: int, timesteps: Optional[int] = None) -> None: # compute actions with torch.no_grad(): - actions, _, _ = self.agents.act(self.states, inference=True, timestep=timestep, timesteps=timesteps) + actions, _, _ = self.agents.act(self.states, timestep=timestep, 
timesteps=timesteps) else: # pre-interaction @@ -96,10 +96,7 @@ def train(self, timestep: int, timesteps: Optional[int] = None) -> None: # compute actions with torch.no_grad(): - actions = torch.vstack([agent.act(self.states[scope[0]:scope[1]], - inference=True, - timestep=timestep, - timesteps=timesteps)[0] \ + actions = torch.vstack([agent.act(self.states[scope[0]:scope[1]], timestep=timestep, timesteps=timesteps)[0] \ for agent, scope in zip(self.agents, self.agents_scope)]) # step the environments @@ -178,14 +175,11 @@ def eval(self, timestep: int, timesteps: Optional[int] = None) -> None: with torch.no_grad(): if self.num_agents == 1: # compute actions - actions, _, _ = self.agents.act(self.states, inference=True, timestep=timestep, timesteps=timesteps) + actions, _, _ = self.agents.act(self.states, timestep=timestep, timesteps=timesteps) else: # compute actions - actions = torch.vstack([agent.act(self.states[scope[0]:scope[1]], - inference=True, - timestep=timestep, - timesteps=timesteps)[0] \ + actions = torch.vstack([agent.act(self.states[scope[0]:scope[1]], timestep=timestep, timesteps=timesteps)[0] \ for agent, scope in zip(self.agents, self.agents_scope)]) # step the environments diff --git a/skrl/trainers/torch/parallel.py b/skrl/trainers/torch/parallel.py index c5526223..c02957a7 100644 --- a/skrl/trainers/torch/parallel.py +++ b/skrl/trainers/torch/parallel.py @@ -57,10 +57,7 @@ def fn_processor(process_index, *args): elif task == "act": _states = queue.get()[scope[0]:scope[1]] with torch.no_grad(): - _actions = agent.act(_states, - inference=True, - timestep=msg['timestep'], - timesteps=msg['timesteps'])[0] + _actions = agent.act(_states, timestep=msg['timestep'], timesteps=msg['timesteps'])[0] if not _actions.is_cuda: _actions.share_memory_() queue.put(_actions) diff --git a/skrl/trainers/torch/sequential.py b/skrl/trainers/torch/sequential.py index 42114ad0..910b6079 100644 --- a/skrl/trainers/torch/sequential.py +++ b/skrl/trainers/torch/sequential.py @@ -77,10 +77,7 @@ def train(self) -> None: # compute actions with torch.no_grad(): - actions = torch.vstack([agent.act(states[scope[0]:scope[1]], - inference=True, - timestep=timestep, - timesteps=self.timesteps)[0] \ + actions = torch.vstack([agent.act(states[scope[0]:scope[1]], timestep=timestep, timesteps=self.timesteps)[0] \ for agent, scope in zip(self.agents, self.agents_scope)]) # step the environments @@ -138,10 +135,7 @@ def eval(self) -> None: # compute actions with torch.no_grad(): - actions = torch.vstack([agent.act(states[scope[0]:scope[1]], - inference=True, - timestep=timestep, - timesteps=self.timesteps)[0] \ + actions = torch.vstack([agent.act(states[scope[0]:scope[1]], timestep=timestep, timesteps=self.timesteps)[0] \ for agent, scope in zip(self.agents, self.agents_scope)]) # step the environments From daee3e9eba81c628cb4c719055f73093c0377365 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Tue, 6 Sep 2022 13:58:48 +0200 Subject: [PATCH 052/108] Set models' store_separately configuration to False by default --- skrl/agents/torch/a2c/a2c.py | 2 +- skrl/agents/torch/amp/amp.py | 2 +- skrl/agents/torch/cem/cem.py | 2 +- skrl/agents/torch/ddpg/ddpg.py | 2 +- skrl/agents/torch/dqn/ddqn.py | 2 +- skrl/agents/torch/dqn/dqn.py | 2 +- skrl/agents/torch/ppo/ppo.py | 2 +- skrl/agents/torch/q_learning/q_learning.py | 2 +- skrl/agents/torch/sac/sac.py | 2 +- skrl/agents/torch/sarsa/sarsa.py | 2 +- skrl/agents/torch/td3/td3.py | 2 +- skrl/agents/torch/trpo/trpo.py | 2 +- 12 files changed, 
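A reduced sketch of the multi-agent action computation in the trainer code above: actions are computed under torch.no_grad() and stacked per agent scope. The fake_act helper, the tensor sizes, and the scope values are placeholders for real agents and environments, not library code.

import torch

states = torch.randn(6, 4)              # observations from 6 parallel environments
agents_scope = [(0, 2), (2, 6)]         # environment index ranges handled by each agent

def fake_act(s):
    # stand-in for agent.act(states, timestep=..., timesteps=...)[0]
    return torch.zeros(s.shape[0], 2)

with torch.no_grad():
    actions = torch.vstack([fake_act(states[start:stop]) for start, stop in agents_scope])
assert actions.shape == (6, 2)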
12 insertions(+), 12 deletions(-) diff --git a/skrl/agents/torch/a2c/a2c.py b/skrl/agents/torch/a2c/a2c.py index 64d407fa..0693e0d5 100644 --- a/skrl/agents/torch/a2c/a2c.py +++ b/skrl/agents/torch/a2c/a2c.py @@ -45,7 +45,7 @@ "write_interval": 250, # TensorBoard writing interval (timesteps) "checkpoint_interval": 1000, # interval for checkpoints (timesteps) - "store_separately": True, # whether to store checkpoints separately + "store_separately": False, # whether to store checkpoints separately } } diff --git a/skrl/agents/torch/amp/amp.py b/skrl/agents/torch/amp/amp.py index e9b2264f..4af3af05 100644 --- a/skrl/agents/torch/amp/amp.py +++ b/skrl/agents/torch/amp/amp.py @@ -63,7 +63,7 @@ "write_interval": 250, # TensorBoard writing interval (timesteps) "checkpoint_interval": 1000, # interval for checkpoints (timesteps) - "store_separately": True, # whether to store checkpoints separately + "store_separately": False, # whether to store checkpoints separately } } diff --git a/skrl/agents/torch/cem/cem.py b/skrl/agents/torch/cem/cem.py index fb655e65..6168ed02 100644 --- a/skrl/agents/torch/cem/cem.py +++ b/skrl/agents/torch/cem/cem.py @@ -36,7 +36,7 @@ "write_interval": 250, # TensorBoard writing interval (timesteps) "checkpoint_interval": 1000, # interval for checkpoints (timesteps) - "store_separately": True, # whether to store checkpoints separately + "store_separately": False, # whether to store checkpoints separately } } diff --git a/skrl/agents/torch/ddpg/ddpg.py b/skrl/agents/torch/ddpg/ddpg.py index 703dcd10..dc7235ab 100644 --- a/skrl/agents/torch/ddpg/ddpg.py +++ b/skrl/agents/torch/ddpg/ddpg.py @@ -45,7 +45,7 @@ "write_interval": 250, # TensorBoard writing interval (timesteps) "checkpoint_interval": 1000, # interval for checkpoints (timesteps) - "store_separately": True, # whether to store checkpoints separately + "store_separately": False, # whether to store checkpoints separately } } diff --git a/skrl/agents/torch/dqn/ddqn.py b/skrl/agents/torch/dqn/ddqn.py index be0affb9..6ad0901e 100644 --- a/skrl/agents/torch/dqn/ddqn.py +++ b/skrl/agents/torch/dqn/ddqn.py @@ -47,7 +47,7 @@ "write_interval": 250, # TensorBoard writing interval (timesteps) "checkpoint_interval": 1000, # interval for checkpoints (timesteps) - "store_separately": True, # whether to store checkpoints separately + "store_separately": False, # whether to store checkpoints separately } } diff --git a/skrl/agents/torch/dqn/dqn.py b/skrl/agents/torch/dqn/dqn.py index a3cb03b2..0a18ece2 100644 --- a/skrl/agents/torch/dqn/dqn.py +++ b/skrl/agents/torch/dqn/dqn.py @@ -47,7 +47,7 @@ "write_interval": 250, # TensorBoard writing interval (timesteps) "checkpoint_interval": 1000, # interval for checkpoints (timesteps) - "store_separately": True, # whether to store checkpoints separately + "store_separately": False, # whether to store checkpoints separately } } diff --git a/skrl/agents/torch/ppo/ppo.py b/skrl/agents/torch/ppo/ppo.py index 9581c362..2fa6081d 100644 --- a/skrl/agents/torch/ppo/ppo.py +++ b/skrl/agents/torch/ppo/ppo.py @@ -53,7 +53,7 @@ "write_interval": 250, # TensorBoard writing interval (timesteps) "checkpoint_interval": 1000, # interval for checkpoints (timesteps) - "store_separately": True, # whether to store checkpoints separately + "store_separately": False, # whether to store checkpoints separately } } diff --git a/skrl/agents/torch/q_learning/q_learning.py b/skrl/agents/torch/q_learning/q_learning.py index 95f4a9a5..f13a0c4a 100644 --- a/skrl/agents/torch/q_learning/q_learning.py +++ 
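A toy illustration of the two checkpoint layouts governed by the store_separately flag: one file per registered module, or a single agent file keyed by module name. The modules and the /tmp directory are arbitrary examples; the name_tag / agent_tag naming mirrors the pattern in the diff above.

import torch
import torch.nn as nn

policy = nn.Linear(4, 2)
value = nn.Linear(4, 1)
optimizer = torch.optim.Adam(list(policy.parameters()) + list(value.parameters()), lr=1e-3)
checkpoint_modules = {"policy": policy, "value": value, "optimizer": optimizer}

store_separately = False        # the default set by the patch above
tag = "1000"                    # e.g. the current timestep

if store_separately:
    # one file per module: policy_1000.pt, value_1000.pt, optimizer_1000.pt
    for name, module in checkpoint_modules.items():
        torch.save(module.state_dict(), "/tmp/{}_{}.pt".format(name, tag))
else:
    # a single file holding every module's state, keyed by name
    torch.save({name: module.state_dict() for name, module in checkpoint_modules.items()},
               "/tmp/agent_{}.pt".format(tag))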
b/skrl/agents/torch/q_learning/q_learning.py @@ -27,7 +27,7 @@ "write_interval": 250, # TensorBoard writing interval (timesteps) "checkpoint_interval": 1000, # interval for checkpoints (timesteps) - "store_separately": True, # whether to store checkpoints separately + "store_separately": False, # whether to store checkpoints separately } } diff --git a/skrl/agents/torch/sac/sac.py b/skrl/agents/torch/sac/sac.py index 886c6f25..26145f59 100644 --- a/skrl/agents/torch/sac/sac.py +++ b/skrl/agents/torch/sac/sac.py @@ -45,7 +45,7 @@ "write_interval": 250, # TensorBoard writing interval (timesteps) "checkpoint_interval": 1000, # interval for checkpoints (timesteps) - "store_separately": True, # whether to store checkpoints separately + "store_separately": False, # whether to store checkpoints separately } } diff --git a/skrl/agents/torch/sarsa/sarsa.py b/skrl/agents/torch/sarsa/sarsa.py index 9871fe8c..fdad6030 100644 --- a/skrl/agents/torch/sarsa/sarsa.py +++ b/skrl/agents/torch/sarsa/sarsa.py @@ -27,7 +27,7 @@ "write_interval": 250, # TensorBoard writing interval (timesteps) "checkpoint_interval": 1000, # interval for checkpoints (timesteps) - "store_separately": True, # whether to store checkpoints separately + "store_separately": False, # whether to store checkpoints separately } } diff --git a/skrl/agents/torch/td3/td3.py b/skrl/agents/torch/td3/td3.py index c3c66442..ae62cbf9 100644 --- a/skrl/agents/torch/td3/td3.py +++ b/skrl/agents/torch/td3/td3.py @@ -50,7 +50,7 @@ "write_interval": 250, # TensorBoard writing interval (timesteps) "checkpoint_interval": 1000, # interval for checkpoints (timesteps) - "store_separately": True, # whether to store checkpoints separately + "store_separately": False, # whether to store checkpoints separately } } diff --git a/skrl/agents/torch/trpo/trpo.py b/skrl/agents/torch/trpo/trpo.py index b06c84fe..e9766f8c 100644 --- a/skrl/agents/torch/trpo/trpo.py +++ b/skrl/agents/torch/trpo/trpo.py @@ -53,7 +53,7 @@ "write_interval": 250, # TensorBoard writing interval (timesteps) "checkpoint_interval": 1000, # interval for checkpoints (timesteps) - "store_separately": True, # whether to store checkpoints separately + "store_separately": False, # whether to store checkpoints separately } } From f8fe773fab70d4aa059fa2d890526ee29a310fd6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Tue, 6 Sep 2022 14:06:53 +0200 Subject: [PATCH 053/108] Save and load agents' modules --- skrl/agents/torch/base.py | 57 ++++++++++++++++++++++++++++++++++----- 1 file changed, 51 insertions(+), 6 deletions(-) diff --git a/skrl/agents/torch/base.py b/skrl/agents/torch/base.py index f7c9ea07..616a05b5 100644 --- a/skrl/agents/torch/base.py +++ b/skrl/agents/torch/base.py @@ -10,6 +10,7 @@ import torch from torch.utils.tensorboard import SummaryWriter +from skrl import logger from ...memories.torch import Memory from ...models.torch import Model @@ -87,7 +88,7 @@ def __str__(self) -> str: string += "\n |-- {}: {}".format(k, v) return string - def _empty_preprocessor(self, _input, *args, **kwargs) -> Any: + def _empty_preprocessor(self, _input: Any, *args, **kwargs) -> Any: """Empty preprocess method This method is defined because PyTorch multiprocessing can't pickle lambdas @@ -100,6 +101,17 @@ def _empty_preprocessor(self, _input, *args, **kwargs) -> Any: """ return _input + def _get_internal_value(self, _module: Any) -> Any: + """Get internal module/variable state/value + + :param _input: Module or variable + :type _input: Any + + :return: Module/variable 
state/value + :rtype: Any + """ + return _module.state_dict() if hasattr(_module, "state_dict") else _module + def init(self) -> None: """Initialize the agent @@ -154,7 +166,7 @@ def write_tracking_data(self, timestep: int, timesteps: int) -> None: self.tracking_data.clear() def write_checkpoint(self, timestep: int, timesteps: int) -> None: - """Write checkpoint (models) to disk + """Write checkpoint (modules) to disk The checkpoints are saved in the directory 'checkpoints' in the experiment directory. The name of the checkpoint is the current timestep if timestep is not None, otherwise it is the current time. @@ -168,15 +180,15 @@ def write_checkpoint(self, timestep: int, timesteps: int) -> None: # separated modules if self.checkpoint_store_separately: for name, module in self.checkpoint_modules.items(): - torch.save(module.state_dict(), os.path.join(self.experiment_dir, "checkpoints", "{}_{}.pt".format(name, tag))) + torch.save(self._get_internal_value(module), os.path.join(self.experiment_dir, "checkpoints", "{}_{}.pt".format(name, tag))) # whole agent else: modules = {} for name, module in self.checkpoint_modules.items(): - modules[name] = module.state_dict() + modules[name] = self._get_internal_value(module) torch.save(modules, os.path.join(self.experiment_dir, "checkpoints", "{}_{}.pt".format("agent", tag))) - # best models + # best modules if self.checkpoint_best_modules["modules"] and not self.checkpoint_best_modules["saved"]: # separated modules if self.checkpoint_store_separately: @@ -290,6 +302,39 @@ def set_mode(self, mode: str) -> None: if model is not None: model.set_mode(mode) + def save(self, path: str) -> None: + """Save the agent to the specified path + + :param path: Path to save the model to + :type path: str + """ + modules = {} + for name, module in self.checkpoint_modules.items(): + modules[name] = self._get_internal_value(module) + torch.save(modules, path) + + def load(self, path: str) -> None: + """Load the model from the specified path + + The final storage device is determined by the constructor of the model + + :param path: Path to load the model from + :type path: str + """ + modules = torch.load(path, map_location=self.device) + if type(modules) is dict: + for name, data in modules.items(): + module = self.checkpoint_modules.get(name, None) + if module is not None: + if hasattr(module, "load_state_dict"): + module.load_state_dict(data) + if hasattr(module, "eval"): + module.eval() + else: + raise NotImplementedError + else: + logger.warning("Cannot load the {} module. 
The agent doesn't have such an instance".format(name)) + def pre_interaction(self, timestep: int, timesteps: int) -> None: """Callback called before the interaction with the environment @@ -318,7 +363,7 @@ def post_interaction(self, timestep: int, timesteps: int) -> None: self.checkpoint_best_modules["timestep"] = timestep self.checkpoint_best_modules["reward"] = reward self.checkpoint_best_modules["saved"] = False - self.checkpoint_best_modules["modules"] = {k: copy.deepcopy(v.state_dict()) for k, v in self.checkpoint_modules.items()} + self.checkpoint_best_modules["modules"] = {k: copy.deepcopy(self._get_internal_value(v)) for k, v in self.checkpoint_modules.items()} # write to tensorboard self.write_tracking_data(timestep, timesteps) From fb25540635aa9214a66c5ede344d271c0be53e50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Tue, 6 Sep 2022 14:32:37 +0200 Subject: [PATCH 054/108] Update OpenAI Gym documentation domain link --- README.md | 2 +- docs/source/index.rst | 2 +- docs/source/intro/installation.rst | 2 +- docs/source/modules/skrl.agents.a2c.rst | 2 +- docs/source/modules/skrl.agents.amp.rst | 2 +- docs/source/modules/skrl.agents.cem.rst | 2 +- docs/source/modules/skrl.agents.ddpg.rst | 2 +- docs/source/modules/skrl.agents.ddqn.rst | 2 +- docs/source/modules/skrl.agents.dqn.rst | 2 +- docs/source/modules/skrl.agents.ppo.rst | 2 +- docs/source/modules/skrl.agents.q_learning.rst | 2 +- docs/source/modules/skrl.agents.sac.rst | 2 +- docs/source/modules/skrl.agents.sarsa.rst | 2 +- docs/source/modules/skrl.agents.td3.rst | 2 +- docs/source/modules/skrl.agents.trpo.rst | 2 +- docs/source/modules/skrl.envs.wrapping.rst | 4 ++-- 16 files changed, 17 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 2f3b3ecd..f9fa13a8 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@
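A small end-to-end sketch of the agent-level save/load introduced above: each registered entry is stored through its state_dict() when available (or kept as a raw value otherwise) and fed back on load, switching restored modules to evaluation mode. The toy module, the "scale" entry, and the temporary file path are assumptions for illustration only.

import torch
import torch.nn as nn

policy = nn.Linear(4, 2)
checkpoint_modules = {"policy": policy, "scale": 1.0}   # "scale" is a hypothetical plain value

def get_internal_value(module):
    # modules and optimizers expose state_dict(); plain values are stored as-is
    return module.state_dict() if hasattr(module, "state_dict") else module

# save: one dict with every registered entry
torch.save({k: get_internal_value(v) for k, v in checkpoint_modules.items()}, "/tmp/agent.pt")

# load: feed each entry back into its counterpart
data = torch.load("/tmp/agent.pt")
for name, value in data.items():
    module = checkpoint_modules.get(name)
    if hasattr(module, "load_state_dict"):
        module.load_state_dict(value)
        if hasattr(module, "eval"):
            module.eval()
    else:
        checkpoint_modules[name] = value    # plain values are simply replaced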

SKRL - Reinforcement Learning library


-**skrl** is an open-source modular library for Reinforcement Learning written in Python (using [PyTorch](https://pytorch.org/)) and designed with a focus on readability, simplicity, and transparency of algorithm implementation. In addition to supporting the [OpenAI Gym](https://www.gymlibrary.ml) and [DeepMind](https://github.com/deepmind/dm_env) environment interfaces, it allows loading and configuring [NVIDIA Isaac Gym](https://developer.nvidia.com/isaac-gym/) and [NVIDIA Omniverse Isaac Gym](https://docs.omniverse.nvidia.com/app_isaacsim/app_isaacsim/tutorial_gym_isaac_gym.html) environments, enabling agents' simultaneous training by scopes (subsets of environments among all available environments), which may or may not share resources, in the same run +**skrl** is an open-source modular library for Reinforcement Learning written in Python (using [PyTorch](https://pytorch.org/)) and designed with a focus on readability, simplicity, and transparency of algorithm implementation. In addition to supporting the [OpenAI Gym](https://www.gymlibrary.dev) and [DeepMind](https://github.com/deepmind/dm_env) environment interfaces, it allows loading and configuring [NVIDIA Isaac Gym](https://developer.nvidia.com/isaac-gym/) and [NVIDIA Omniverse Isaac Gym](https://docs.omniverse.nvidia.com/app_isaacsim/app_isaacsim/tutorial_gym_isaac_gym.html) environments, enabling agents' simultaneous training by scopes (subsets of environments among all available environments), which may or may not share resources, in the same run
diff --git a/docs/source/index.rst b/docs/source/index.rst index f9c96941..0569fb7c 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -1,7 +1,7 @@ SKRL - Reinforcement Learning library (|version|) ================================================= -**skrl** is an open-source modular library for Reinforcement Learning written in Python (using `PyTorch `_) and designed with a focus on readability, simplicity, and transparency of algorithm implementation. In addition to supporting the `OpenAI Gym `_ and `DeepMind `_ environment interfaces, it allows loading and configuring `NVIDIA Isaac Gym `_ and `NVIDIA Omniverse Isaac Gym `_ environments, enabling agents' simultaneous training by scopes (subsets of environments among all available environments), which may or may not share resources, in the same run +**skrl** is an open-source modular library for Reinforcement Learning written in Python (using `PyTorch `_) and designed with a focus on readability, simplicity, and transparency of algorithm implementation. In addition to supporting the `OpenAI Gym `_ and `DeepMind `_ environment interfaces, it allows loading and configuring `NVIDIA Isaac Gym `_ and `NVIDIA Omniverse Isaac Gym `_ environments, enabling agents' simultaneous training by scopes (subsets of environments among all available environments), which may or may not share resources, in the same run **Main features:** * Clean code diff --git a/docs/source/intro/installation.rst b/docs/source/intro/installation.rst index 3c96b006..eaf11187 100644 --- a/docs/source/intro/installation.rst +++ b/docs/source/intro/installation.rst @@ -10,7 +10,7 @@ Prerequisites **skrl** requires Python 3.6 or higher and the following libraries (they will be installed automatically): - * `gym `_ + * `gym `_ * `tqdm `_ * `torch `_ 1.8.0 or higher * `tensorboard `_ diff --git a/docs/source/modules/skrl.agents.a2c.rst b/docs/source/modules/skrl.agents.a2c.rst index 7413838e..f2d52e46 100644 --- a/docs/source/modules/skrl.agents.a2c.rst +++ b/docs/source/modules/skrl.agents.a2c.rst @@ -82,7 +82,7 @@ Configuration and hyperparameters Spaces and models ^^^^^^^^^^^^^^^^^ -The implementation supports the following `Gym spaces `_ +The implementation supports the following `Gym spaces `_ .. list-table:: :header-rows: 1 diff --git a/docs/source/modules/skrl.agents.amp.rst b/docs/source/modules/skrl.agents.amp.rst index 314ecf6a..4ee3db5b 100644 --- a/docs/source/modules/skrl.agents.amp.rst +++ b/docs/source/modules/skrl.agents.amp.rst @@ -103,7 +103,7 @@ Configuration and hyperparameters Spaces and models ^^^^^^^^^^^^^^^^^ -The implementation supports the following `Gym spaces `_ +The implementation supports the following `Gym spaces `_ .. list-table:: :header-rows: 1 diff --git a/docs/source/modules/skrl.agents.cem.rst b/docs/source/modules/skrl.agents.cem.rst index 0001678b..6ef63e4c 100644 --- a/docs/source/modules/skrl.agents.cem.rst +++ b/docs/source/modules/skrl.agents.cem.rst @@ -38,7 +38,7 @@ Configuration and hyperparameters Spaces and models ^^^^^^^^^^^^^^^^^ -The implementation supports the following `Gym spaces `_ +The implementation supports the following `Gym spaces `_ .. 
list-table:: :header-rows: 1 diff --git a/docs/source/modules/skrl.agents.ddpg.rst b/docs/source/modules/skrl.agents.ddpg.rst index 51dfc657..ae57fc80 100644 --- a/docs/source/modules/skrl.agents.ddpg.rst +++ b/docs/source/modules/skrl.agents.ddpg.rst @@ -66,7 +66,7 @@ Configuration and hyperparameters Spaces and models ^^^^^^^^^^^^^^^^^ -The implementation supports the following `Gym spaces `_ +The implementation supports the following `Gym spaces `_ .. list-table:: :header-rows: 1 diff --git a/docs/source/modules/skrl.agents.ddqn.rst b/docs/source/modules/skrl.agents.ddqn.rst index afeddc73..f12ac7c1 100644 --- a/docs/source/modules/skrl.agents.ddqn.rst +++ b/docs/source/modules/skrl.agents.ddqn.rst @@ -45,7 +45,7 @@ Configuration and hyperparameters Spaces and models ^^^^^^^^^^^^^^^^^ -The implementation supports the following `Gym spaces `_ +The implementation supports the following `Gym spaces `_ .. list-table:: :header-rows: 1 diff --git a/docs/source/modules/skrl.agents.dqn.rst b/docs/source/modules/skrl.agents.dqn.rst index 77eb0e89..0ed111f8 100644 --- a/docs/source/modules/skrl.agents.dqn.rst +++ b/docs/source/modules/skrl.agents.dqn.rst @@ -45,7 +45,7 @@ Configuration and hyperparameters Spaces and models ^^^^^^^^^^^^^^^^^ -The implementation supports the following `Gym spaces `_ +The implementation supports the following `Gym spaces `_ .. list-table:: :header-rows: 1 diff --git a/docs/source/modules/skrl.agents.ppo.rst b/docs/source/modules/skrl.agents.ppo.rst index 5260bddc..8be4f1f1 100644 --- a/docs/source/modules/skrl.agents.ppo.rst +++ b/docs/source/modules/skrl.agents.ppo.rst @@ -98,7 +98,7 @@ Configuration and hyperparameters Spaces and models ^^^^^^^^^^^^^^^^^ -The implementation supports the following `Gym spaces `_ +The implementation supports the following `Gym spaces `_ .. list-table:: :header-rows: 1 diff --git a/docs/source/modules/skrl.agents.q_learning.rst b/docs/source/modules/skrl.agents.q_learning.rst index b5dee107..9b270188 100644 --- a/docs/source/modules/skrl.agents.q_learning.rst +++ b/docs/source/modules/skrl.agents.q_learning.rst @@ -36,7 +36,7 @@ Configuration and hyperparameters Spaces and models ^^^^^^^^^^^^^^^^^ -The implementation supports the following `Gym spaces `_ +The implementation supports the following `Gym spaces `_ .. list-table:: :header-rows: 1 diff --git a/docs/source/modules/skrl.agents.sac.rst b/docs/source/modules/skrl.agents.sac.rst index 599c9ff0..20dbe252 100644 --- a/docs/source/modules/skrl.agents.sac.rst +++ b/docs/source/modules/skrl.agents.sac.rst @@ -73,7 +73,7 @@ Configuration and hyperparameters Spaces and models ^^^^^^^^^^^^^^^^^ -The implementation supports the following `Gym spaces `_ +The implementation supports the following `Gym spaces `_ .. list-table:: :header-rows: 1 diff --git a/docs/source/modules/skrl.agents.sarsa.rst b/docs/source/modules/skrl.agents.sarsa.rst index 1c5fba3f..dacfb320 100644 --- a/docs/source/modules/skrl.agents.sarsa.rst +++ b/docs/source/modules/skrl.agents.sarsa.rst @@ -35,7 +35,7 @@ Configuration and hyperparameters Spaces and models ^^^^^^^^^^^^^^^^^ -The implementation supports the following `Gym spaces `_ +The implementation supports the following `Gym spaces `_ .. 
list-table:: :header-rows: 1 diff --git a/docs/source/modules/skrl.agents.td3.rst b/docs/source/modules/skrl.agents.td3.rst index 2b8108fb..74bb068e 100644 --- a/docs/source/modules/skrl.agents.td3.rst +++ b/docs/source/modules/skrl.agents.td3.rst @@ -76,7 +76,7 @@ Configuration and hyperparameters Spaces and models ^^^^^^^^^^^^^^^^^ -The implementation supports the following `Gym spaces `_ +The implementation supports the following `Gym spaces `_ .. list-table:: :header-rows: 1 diff --git a/docs/source/modules/skrl.agents.trpo.rst b/docs/source/modules/skrl.agents.trpo.rst index d4e87b78..52da1555 100644 --- a/docs/source/modules/skrl.agents.trpo.rst +++ b/docs/source/modules/skrl.agents.trpo.rst @@ -136,7 +136,7 @@ Configuration and hyperparameters Spaces and models ^^^^^^^^^^^^^^^^^ -The implementation supports the following `Gym spaces `_ +The implementation supports the following `Gym spaces `_ .. list-table:: :header-rows: 1 diff --git a/docs/source/modules/skrl.envs.wrapping.rst b/docs/source/modules/skrl.envs.wrapping.rst index f7489a94..3f0a7be5 100644 --- a/docs/source/modules/skrl.envs.wrapping.rst +++ b/docs/source/modules/skrl.envs.wrapping.rst @@ -3,7 +3,7 @@ Wrapping This library works with a common API to interact with the following RL environments: -* `OpenAI Gym `_ (single and vectorized environments) +* `OpenAI Gym `_ (single and vectorized environments) * `DeepMind `_ * `NVIDIA Isaac Gym `_ (preview 2, 3 and 4) * `NVIDIA Omniverse Isaac Gym `_ @@ -150,7 +150,7 @@ Basic usage .. tab:: Vectorized environment - Visit the OpenAI Gym documentation (`Vector API `_) for more information about the creation and usage of vectorized environments + Visit the OpenAI Gym documentation (`Vector API `_) for more information about the creation and usage of vectorized environments .. 
code-block:: python :linenos: From cf92e72121d6714841c70cb0989f72d7166988c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Tue, 6 Sep 2022 23:32:46 +0200 Subject: [PATCH 055/108] Modify logging format --- skrl/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skrl/__init__.py b/skrl/__init__.py index 37fdd2aa..d6928d79 100644 --- a/skrl/__init__.py +++ b/skrl/__init__.py @@ -12,7 +12,7 @@ # logger with format class _Formatter(logging.Formatter): - _format = "%(name)s:%(levelname)s - %(message)s (%(module)s:%(funcName)s:%(lineno)d)" + _format = "[%(name)s:%(levelname)s] %(message)s" _formats = {logging.DEBUG: f"\x1b[38;20m{_format}\x1b[0m", logging.INFO: f"\x1b[38;20m{_format}\x1b[0m", logging.WARNING: f"\x1b[33;20m{_format}\x1b[0m", From 496fe7784ba7029fa82df664fe8779ea4b02475a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Tue, 6 Sep 2022 23:36:25 +0200 Subject: [PATCH 056/108] Migrate from external models by specifying its path --- skrl/models/torch/base.py | 165 +++++++++++++++++++++++++++++++++----- 1 file changed, 143 insertions(+), 22 deletions(-) diff --git a/skrl/models/torch/base.py b/skrl/models/torch/base.py index 93b98074..9973cc4d 100644 --- a/skrl/models/torch/base.py +++ b/skrl/models/torch/base.py @@ -357,7 +357,7 @@ def act(self, Deterministic agents must ignore the last two components and return empty tensors or None for them :rtype: sequence of torch.Tensor """ - logger.warn("Make sure to place Mixins before Model during model definition") + logger.warning("Make sure to place Mixins before Model during model definition") raise NotImplementedError("The action to be taken by the agent (.act()) is not implemented") def set_mode(self, mode: str) -> None: @@ -394,7 +394,6 @@ def save(self, path: str, state_dict: Optional[dict] = None) -> None: >>> old_state_dict = copy.deepcopy(model.state_dict()) >>> # ... >>> model.save("/tmp/model.pt", old_state_dict) - """ torch.save(self.state_dict() if state_dict is None else state_dict, path) @@ -420,37 +419,143 @@ def load(self, path: str) -> None: self.eval() def migrate(self, - state_dict: Mapping[str, torch.Tensor], + state_dict: Optional[Mapping[str, torch.Tensor]] = None, + path: Optional[str] = None, name_map: Mapping[str, str] = {}, auto_mapping: bool = True, - show_names: bool = False) -> bool: + verbose: bool = False) -> bool: """Migrate the specified external model's state dict to the current model - :param state_dict: External model's state dict to migrate from - :type state_dict: Mapping[str, torch.Tensor] + The final storage device is determined by the constructor of the model + + Only one of ``state_dict`` or ``path`` can be specified. + The ``path`` parameter allows automatically loading the ``state_dict`` only from files generated + by the *rl_games* and *stable-baselines3* libraries at the moment + + For ambiguous models (where 2 or more parameters, for source or current model, have equal shape) + it is necessary to define the ``name_map``, at least for those parameters, to perform the migration successfully + + :param state_dict: External model's state dict to migrate from (default: ``None``) + :type state_dict: Mapping[str, torch.Tensor], optional + :param path: Path to the external checkpoint to migrate from (default: ``None``) + :type path: str, optional :param name_map: Name map to use for the migration (default: ``{}``). 
Keys are the current parameter names and values are the external parameter names :type name_map: Mapping[str, str], optional :param auto_mapping: Automatically map the external state dict to the current state dict (default: ``True``) :type auto_mapping: bool, optional - :param show_names: Show the names of both, current and external state dicts parameters (default: ``False``) - :type show_names: bool, optional + :param verbose: Show model names and migration (default: ``False``) + :type verbose: bool, optional + + :raises ValueError: If neither or both of ``state_dict`` and ``path`` parameters have been set + :raises ValueError: If the correct file type cannot be identified from the ``path`` parameter :return: True if the migration was successful, False otherwise. Migration is successful if all parameters of the current model are found in the external model :rtype: bool + + Example:: + + # migrate a rl_games checkpoint with unambiguous state_dict + >>> model.migrate(path="./runs/Ant/nn/Ant.pth") + True + + # migrate a rl_games checkpoint with ambiguous state_dict + >>> model.migrate(path="./runs/Cartpole/nn/Cartpole.pth", verbose=False) + [skrl:WARNING] Ambiguous match for log_std_parameter <- [value_mean_std.running_mean, value_mean_std.running_var, a2c_network.sigma] + [skrl:WARNING] Ambiguous match for net.0.bias <- [a2c_network.actor_mlp.0.bias, a2c_network.actor_mlp.2.bias] + [skrl:WARNING] Ambiguous match for net.2.bias <- [a2c_network.actor_mlp.0.bias, a2c_network.actor_mlp.2.bias] + [skrl:WARNING] Ambiguous match for net.4.weight <- [a2c_network.value.weight, a2c_network.mu.weight] + [skrl:WARNING] Ambiguous match for net.4.bias <- [a2c_network.value.bias, a2c_network.mu.bias] + [skrl:WARNING] Multiple use of a2c_network.actor_mlp.0.bias -> [net.0.bias, net.2.bias] + [skrl:WARNING] Multiple use of a2c_network.actor_mlp.2.bias -> [net.0.bias, net.2.bias] + False + >>> name_map = {"log_std_parameter": "a2c_network.sigma", + ... "net.0.bias": "a2c_network.actor_mlp.0.bias", + ... "net.2.bias": "a2c_network.actor_mlp.2.bias", + ... "net.4.weight": "a2c_network.mu.weight", + ... 
"net.4.bias": "a2c_network.mu.bias"} + >>> model.migrate(path="./runs/Cartpole/nn/Cartpole.pth", name_map=name_map, verbose=True) + [skrl:INFO] Models + [skrl:INFO] |-- current: 7 items + [skrl:INFO] | |-- log_std_parameter : torch.Size([1]) + [skrl:INFO] | |-- net.0.weight : torch.Size([32, 4]) + [skrl:INFO] | |-- net.0.bias : torch.Size([32]) + [skrl:INFO] | |-- net.2.weight : torch.Size([32, 32]) + [skrl:INFO] | |-- net.2.bias : torch.Size([32]) + [skrl:INFO] | |-- net.4.weight : torch.Size([1, 32]) + [skrl:INFO] | |-- net.4.bias : torch.Size([1]) + [skrl:INFO] |-- source: 15 items + [skrl:INFO] | |-- value_mean_std.running_mean : torch.Size([1]) + [skrl:INFO] | |-- value_mean_std.running_var : torch.Size([1]) + [skrl:INFO] | |-- value_mean_std.count : torch.Size([]) + [skrl:INFO] | |-- running_mean_std.running_mean : torch.Size([4]) + [skrl:INFO] | |-- running_mean_std.running_var : torch.Size([4]) + [skrl:INFO] | |-- running_mean_std.count : torch.Size([]) + [skrl:INFO] | |-- a2c_network.sigma : torch.Size([1]) + [skrl:INFO] | |-- a2c_network.actor_mlp.0.weight : torch.Size([32, 4]) + [skrl:INFO] | |-- a2c_network.actor_mlp.0.bias : torch.Size([32]) + [skrl:INFO] | |-- a2c_network.actor_mlp.2.weight : torch.Size([32, 32]) + [skrl:INFO] | |-- a2c_network.actor_mlp.2.bias : torch.Size([32]) + [skrl:INFO] | |-- a2c_network.value.weight : torch.Size([1, 32]) + [skrl:INFO] | |-- a2c_network.value.bias : torch.Size([1]) + [skrl:INFO] | |-- a2c_network.mu.weight : torch.Size([1, 32]) + [skrl:INFO] | |-- a2c_network.mu.bias : torch.Size([1]) + [skrl:INFO] Migration + [skrl:INFO] |-- map: log_std_parameter <- a2c_network.sigma + [skrl:INFO] |-- auto: net.0.weight <- a2c_network.actor_mlp.0.weight + [skrl:INFO] |-- map: net.0.bias <- a2c_network.actor_mlp.0.bias + [skrl:INFO] |-- auto: net.2.weight <- a2c_network.actor_mlp.2.weight + [skrl:INFO] |-- map: net.2.bias <- a2c_network.actor_mlp.2.bias + [skrl:INFO] |-- map: net.4.weight <- a2c_network.mu.weight + [skrl:INFO] |-- map: net.4.bias <- a2c_network.mu.bias + False + + # migrate a stable-baselines3 checkpoint with unambiguous state_dict + >>> model.migrate(path="./ddpg_pendulum.zip") + True + + # migrate from any exported model by loading its state_dict (unambiguous state_dict) + >>> state_dict = torch.load("./external_model.pt") + >>> model.migrate(state_dict=state_dict) + True """ - # Show state_dict - if show_names: - print("Model migration") - print("Current state_dict:") + if (state_dict is not None) + (path is not None) != 1: + raise ValueError("Exactly one of state_dict or path may be specified") + + # load state_dict from path + if path is not None: + state_dict = {} + # rl_games checkpoint + if path.endswith(".pt") or path.endswith(".pth"): + checkpoint = torch.load(path, map_location=self.device) + if type(checkpoint) is dict: + state_dict = checkpoint.get("model", {}) + # stable-baselines3 + elif path.endswith(".zip"): + import zipfile + try: + archive = zipfile.ZipFile(path, 'r') + with archive.open('policy.pth', mode="r") as file: + state_dict = torch.load(file, map_location=self.device) + except KeyError as e: + logger.warning(str(e)) + state_dict = {} + else: + raise ValueError("Cannot identify file type") + + # show state_dict + if verbose: + logger.info("Models") + logger.info(" |-- current: {} items".format(len(self.state_dict().keys()))) for name, tensor in self.state_dict().items(): - print(" |-- {} : {}".format(name, tensor.shape)) - print("Source state_dict:") + logger.info(" | |-- {} : {}".format(name, tensor.shape)) 
+ logger.info(" |-- source: {} items".format(len(state_dict.keys()))) for name, tensor in state_dict.items(): - print(" |-- {} : {}".format(name, tensor.shape)) + logger.info(" | |-- {} : {}".format(name, tensor.shape)) + logger.info("Migration") - # migrate the state dict to current model + # migrate the state_dict to current model new_state_dict = collections.OrderedDict() match_counter = collections.defaultdict(list) used_counter = collections.defaultdict(list) @@ -462,43 +567,51 @@ def migrate(self, new_state_dict[name] = external_tensor match_counter[name].append(external_name) used_counter[external_name].append(name) + if verbose: + logger.info(" |-- map: {} <- {}".format(name, external_name)) break else: - print("Shape mismatch for {} <- {} : {} != {}".format(name, external_name, tensor.shape, external_tensor.shape)) + logger.warning("Shape mismatch for {} <- {} : {} != {}".format(name, external_name, tensor.shape, external_tensor.shape)) # auto-mapped names - if auto_mapping: + if auto_mapping and name not in name_map: if tensor.shape == external_tensor.shape: if name.endswith(".weight"): if external_name.endswith(".weight"): new_state_dict[name] = external_tensor match_counter[name].append(external_name) used_counter[external_name].append(name) + if verbose: + logger.info(" |-- auto: {} <- {}".format(name, external_name)) elif name.endswith(".bias"): if external_name.endswith(".bias"): new_state_dict[name] = external_tensor match_counter[name].append(external_name) used_counter[external_name].append(name) + if verbose: + logger.info(" |-- auto: {} <- {}".format(name, external_name)) else: if not external_name.endswith(".weight") and not external_name.endswith(".bias"): new_state_dict[name] = external_tensor match_counter[name].append(external_name) used_counter[external_name].append(name) + if verbose: + logger.info(" |-- auto: {} <- {}".format(name, external_name)) # show ambiguous matches status = True for name, tensor in self.state_dict().items(): if len(match_counter.get(name, [])) > 1: - print("Ambiguous match for {} <- {}".format(name, match_counter.get(name, []))) + logger.warning("Ambiguous match for {} <- [{}]".format(name, ", ".join(match_counter.get(name, [])))) status = False # show missing matches for name, tensor in self.state_dict().items(): if not match_counter.get(name, []): - print("Missing match for {}".format(name)) + logger.warning("Missing match for {}".format(name)) status = False - # show duplicated uses + # show multiple uses for name, tensor in state_dict.items(): if len(used_counter.get(name, [])) > 1: - print("Duplicated use of {} -> {}".format(name, used_counter.get(name, []))) + logger.warning("Multiple use of {} -> [{}]".format(name, ", ".join(used_counter.get(name, [])))) status = False # load new state dict @@ -515,6 +628,14 @@ def freeze_parameters(self, freeze: bool = True) -> None: :param freeze: Freeze the internal parameters if True, otherwise unfreeze them (default: ``True``) :type freeze: bool, optional + + Example:: + + # freeze model parameters + >>> model.freeze_parameters(True) + + # unfreeze model parameters + >>> model.freeze_parameters(False) """ for parameters in self.parameters(): parameters.requires_grad = not freeze From b89036a7491af904ceaf2e8bf7aaf9e14fae615f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Wed, 7 Sep 2022 18:38:06 +0200 Subject: [PATCH 057/108] Allow migrating extrenal state_dict/checkpoint --- skrl/agents/torch/base.py | 240 +++++++++++++++++++++++++++++++++++++- 
 skrl/models/torch/base.py | 4 +- 2 files changed, 241 insertions(+), 3 deletions(-) diff --git a/skrl/agents/torch/base.py b/skrl/agents/torch/base.py index 616a05b5..3483a994 100644 --- a/skrl/agents/torch/base.py +++ b/skrl/agents/torch/base.py @@ -1,4 +1,4 @@ -from typing import Union, Tuple, Dict, Any +from typing import Union, Mapping, Tuple, Dict, Any import os import gym @@ -335,6 +335,244 @@ def load(self, path: str) -> None: else: logger.warning("Cannot load the {} module. The agent doesn't have such an instance".format(name)) + def migrate(self, + path: str, + name_map: Mapping[str, Mapping[str, str]] = {}, + auto_mapping: bool = True, + verbose: bool = False) -> bool: + """Migrate the specified external checkpoint to the current agent + + The final storage device is determined by the constructor of the agent. + Only files generated by the *rl_games* library are supported at the moment + + For ambiguous models (where 2 or more parameters, for source or current model, have equal shape) + it is necessary to define the ``name_map``, at least for those parameters, to perform the migration successfully + + :param path: Path to the external checkpoint to migrate from + :type path: str + :param name_map: Name map to use for the migration (default: ``{}``). + Keys are the current parameter names and values are the external parameter names + :type name_map: Mapping[str, Mapping[str, str]], optional + :param auto_mapping: Automatically map the external state dict to the current state dict (default: ``True``) + :type auto_mapping: bool, optional + :param verbose: Show model names and migration (default: ``False``) + :type verbose: bool, optional + + :raises ValueError: If the correct file type cannot be identified from the ``path`` parameter + + :return: True if the migration was successful, False otherwise. + Migration is successful if all parameters of the current model are found in the external model + :rtype: bool + + Example:: + + # migrate a rl_games checkpoint with ambiguous state_dict + >>> agent.migrate(path="./runs/Cartpole/nn/Cartpole.pth", verbose=False) + [skrl:WARNING] Ambiguous match for net.0.bias <- [a2c_network.actor_mlp.0.bias, a2c_network.actor_mlp.2.bias] + [skrl:WARNING] Ambiguous match for net.2.bias <- [a2c_network.actor_mlp.0.bias, a2c_network.actor_mlp.2.bias] + [skrl:WARNING] Ambiguous match for net.4.weight <- [a2c_network.value.weight, a2c_network.mu.weight] + [skrl:WARNING] Ambiguous match for net.4.bias <- [a2c_network.value.bias, a2c_network.mu.bias] + [skrl:WARNING] Multiple use of a2c_network.actor_mlp.0.bias -> [net.0.bias, net.2.bias] + [skrl:WARNING] Multiple use of a2c_network.actor_mlp.2.bias -> [net.0.bias, net.2.bias] + [skrl:WARNING] Ambiguous match for net.0.bias <- [a2c_network.actor_mlp.0.bias, a2c_network.actor_mlp.2.bias] + [skrl:WARNING] Ambiguous match for net.2.bias <- [a2c_network.actor_mlp.0.bias, a2c_network.actor_mlp.2.bias] + [skrl:WARNING] Ambiguous match for net.4.weight <- [a2c_network.value.weight, a2c_network.mu.weight] + [skrl:WARNING] Ambiguous match for net.4.bias <- [a2c_network.value.bias, a2c_network.mu.bias] + [skrl:WARNING] Multiple use of a2c_network.actor_mlp.0.bias -> [net.0.bias, net.2.bias] + [skrl:WARNING] Multiple use of a2c_network.actor_mlp.2.bias -> [net.0.bias, net.2.bias] + False + >>> name_map = {"policy": {"net.0.bias": "a2c_network.actor_mlp.0.bias", + ... "net.2.bias": "a2c_network.actor_mlp.2.bias", + ... "net.4.weight": "a2c_network.mu.weight", + ... "net.4.bias": "a2c_network.mu.bias"}, + ... 
"value": {"net.0.bias": "a2c_network.actor_mlp.0.bias", + ... "net.2.bias": "a2c_network.actor_mlp.2.bias", + ... "net.4.weight": "a2c_network.value.weight", + ... "net.4.bias": "a2c_network.value.bias"}} + >>> model.migrate(path="./runs/Cartpole/nn/Cartpole.pth", name_map=name_map, verbose=True) + [skrl:INFO] Modules + [skrl:INFO] |-- current + [skrl:INFO] | |-- policy (Policy) + [skrl:INFO] | | |-- log_std_parameter : [1] + [skrl:INFO] | | |-- net.0.weight : [32, 4] + [skrl:INFO] | | |-- net.0.bias : [32] + [skrl:INFO] | | |-- net.2.weight : [32, 32] + [skrl:INFO] | | |-- net.2.bias : [32] + [skrl:INFO] | | |-- net.4.weight : [1, 32] + [skrl:INFO] | | |-- net.4.bias : [1] + [skrl:INFO] | |-- value (Value) + [skrl:INFO] | | |-- net.0.weight : [32, 4] + [skrl:INFO] | | |-- net.0.bias : [32] + [skrl:INFO] | | |-- net.2.weight : [32, 32] + [skrl:INFO] | | |-- net.2.bias : [32] + [skrl:INFO] | | |-- net.4.weight : [1, 32] + [skrl:INFO] | | |-- net.4.bias : [1] + [skrl:INFO] | |-- optimizer (Adam) + [skrl:INFO] | | |-- state (dict) + [skrl:INFO] | | |-- param_groups (list) + [skrl:INFO] | |-- state_preprocessor (RunningStandardScaler) + [skrl:INFO] | | |-- running_mean : [4] + [skrl:INFO] | | |-- running_variance : [4] + [skrl:INFO] | | |-- current_count : [] + [skrl:INFO] | |-- value_preprocessor (RunningStandardScaler) + [skrl:INFO] | | |-- running_mean : [1] + [skrl:INFO] | | |-- running_variance : [1] + [skrl:INFO] | | |-- current_count : [] + [skrl:INFO] |-- source + [skrl:INFO] | |-- model (OrderedDict) + [skrl:INFO] | | |-- value_mean_std.running_mean : [1] + [skrl:INFO] | | |-- value_mean_std.running_var : [1] + [skrl:INFO] | | |-- value_mean_std.count : [] + [skrl:INFO] | | |-- running_mean_std.running_mean : [4] + [skrl:INFO] | | |-- running_mean_std.running_var : [4] + [skrl:INFO] | | |-- running_mean_std.count : [] + [skrl:INFO] | | |-- a2c_network.sigma : [1] + [skrl:INFO] | | |-- a2c_network.actor_mlp.0.weight : [32, 4] + [skrl:INFO] | | |-- a2c_network.actor_mlp.0.bias : [32] + [skrl:INFO] | | |-- a2c_network.actor_mlp.2.weight : [32, 32] + [skrl:INFO] | | |-- a2c_network.actor_mlp.2.bias : [32] + [skrl:INFO] | | |-- a2c_network.value.weight : [1, 32] + [skrl:INFO] | | |-- a2c_network.value.bias : [1] + [skrl:INFO] | | |-- a2c_network.mu.weight : [1, 32] + [skrl:INFO] | | |-- a2c_network.mu.bias : [1] + [skrl:INFO] | |-- epoch (int) + [skrl:INFO] | |-- optimizer (dict) + [skrl:INFO] | |-- frame (int) + [skrl:INFO] | |-- last_mean_rewards (float32) + [skrl:INFO] | |-- env_state (NoneType) + [skrl:INFO] Migration + [skrl:INFO] Model: policy (Policy) + [skrl:INFO] Models + [skrl:INFO] |-- current: 7 items + [skrl:INFO] | |-- log_std_parameter : [1] + [skrl:INFO] | |-- net.0.weight : [32, 4] + [skrl:INFO] | |-- net.0.bias : [32] + [skrl:INFO] | |-- net.2.weight : [32, 32] + [skrl:INFO] | |-- net.2.bias : [32] + [skrl:INFO] | |-- net.4.weight : [1, 32] + [skrl:INFO] | |-- net.4.bias : [1] + [skrl:INFO] |-- source: 9 items + [skrl:INFO] | |-- a2c_network.sigma : [1] + [skrl:INFO] | |-- a2c_network.actor_mlp.0.weight : [32, 4] + [skrl:INFO] | |-- a2c_network.actor_mlp.0.bias : [32] + [skrl:INFO] | |-- a2c_network.actor_mlp.2.weight : [32, 32] + [skrl:INFO] | |-- a2c_network.actor_mlp.2.bias : [32] + [skrl:INFO] | |-- a2c_network.value.weight : [1, 32] + [skrl:INFO] | |-- a2c_network.value.bias : [1] + [skrl:INFO] | |-- a2c_network.mu.weight : [1, 32] + [skrl:INFO] | |-- a2c_network.mu.bias : [1] + [skrl:INFO] Migration + [skrl:INFO] |-- auto: log_std_parameter <- a2c_network.sigma + 
[skrl:INFO] |-- auto: net.0.weight <- a2c_network.actor_mlp.0.weight + [skrl:INFO] |-- map: net.0.bias <- a2c_network.actor_mlp.0.bias + [skrl:INFO] |-- auto: net.2.weight <- a2c_network.actor_mlp.2.weight + [skrl:INFO] |-- map: net.2.bias <- a2c_network.actor_mlp.2.bias + [skrl:INFO] |-- map: net.4.weight <- a2c_network.mu.weight + [skrl:INFO] |-- map: net.4.bias <- a2c_network.mu.bias + [skrl:INFO] Model: value (Value) + [skrl:INFO] Models + [skrl:INFO] |-- current: 6 items + [skrl:INFO] | |-- net.0.weight : [32, 4] + [skrl:INFO] | |-- net.0.bias : [32] + [skrl:INFO] | |-- net.2.weight : [32, 32] + [skrl:INFO] | |-- net.2.bias : [32] + [skrl:INFO] | |-- net.4.weight : [1, 32] + [skrl:INFO] | |-- net.4.bias : [1] + [skrl:INFO] |-- source: 9 items + [skrl:INFO] | |-- a2c_network.sigma : [1] + [skrl:INFO] | |-- a2c_network.actor_mlp.0.weight : [32, 4] + [skrl:INFO] | |-- a2c_network.actor_mlp.0.bias : [32] + [skrl:INFO] | |-- a2c_network.actor_mlp.2.weight : [32, 32] + [skrl:INFO] | |-- a2c_network.actor_mlp.2.bias : [32] + [skrl:INFO] | |-- a2c_network.value.weight : [1, 32] + [skrl:INFO] | |-- a2c_network.value.bias : [1] + [skrl:INFO] | |-- a2c_network.mu.weight : [1, 32] + [skrl:INFO] | |-- a2c_network.mu.bias : [1] + [skrl:INFO] Migration + [skrl:INFO] |-- auto: net.0.weight <- a2c_network.actor_mlp.0.weight + [skrl:INFO] |-- map: net.0.bias <- a2c_network.actor_mlp.0.bias + [skrl:INFO] |-- auto: net.2.weight <- a2c_network.actor_mlp.2.weight + [skrl:INFO] |-- map: net.2.bias <- a2c_network.actor_mlp.2.bias + [skrl:INFO] |-- map: net.4.weight <- a2c_network.value.weight + [skrl:INFO] |-- map: net.4.bias <- a2c_network.value.bias + True + """ + # load state_dict from path + if path is not None: + # rl_games checkpoint + if path.endswith(".pt") or path.endswith(".pth"): + checkpoint = torch.load(path, map_location=self.device) + else: + raise ValueError("Cannot identify file type") + + # show modules + if verbose: + logger.info("Modules") + logger.info(" |-- current") + for name, module in self.checkpoint_modules.items(): + logger.info(" | |-- {} ({})".format(name, type(module).__name__)) + if hasattr(module, "state_dict"): + for k, v in module.state_dict().items(): + if hasattr(v, "shape"): + logger.info(" | | |-- {} : {}".format(k, list(v.shape))) + else: + logger.info(" | | |-- {} ({})".format(k, type(v).__name__)) + logger.info(" |-- source") + for name, module in checkpoint.items(): + logger.info(" | |-- {} ({})".format(name, type(module).__name__)) + if name == "model": + for k, v in module.items(): + logger.info(" | | |-- {} : {}".format(k, list(v.shape))) + else: + if hasattr(module, "state_dict"): + for k, v in module.state_dict().items(): + if hasattr(v, "shape"): + logger.info(" | | |-- {} : {}".format(k, list(v.shape))) + else: + logger.info(" | | |-- {} ({})".format(k, type(v).__name__)) + logger.info("Migration") + + if "optimizer" in self.checkpoint_modules: + # loaded state dict contains a parameter group that doesn't match the size of optimizer's group + # self.checkpoint_modules["optimizer"].load_state_dict(checkpoint["optimizer"]) + pass + # state_preprocessor + if "state_preprocessor" in self.checkpoint_modules: + if "running_mean_std.running_mean" in checkpoint["model"]: + state_dict = copy.deepcopy(self.checkpoint_modules["state_preprocessor"].state_dict()) + state_dict["running_mean"] = checkpoint["model"]["running_mean_std.running_mean"] + state_dict["running_variance"] = checkpoint["model"]["running_mean_std.running_var"] + state_dict["current_count"] = 
checkpoint["model"]["running_mean_std.count"] + self.checkpoint_modules["state_preprocessor"].load_state_dict(state_dict) + del checkpoint["model"]["running_mean_std.running_mean"] + del checkpoint["model"]["running_mean_std.running_var"] + del checkpoint["model"]["running_mean_std.count"] + # value_preprocessor + if "value_preprocessor" in self.checkpoint_modules: + if "value_mean_std.running_mean" in checkpoint["model"]: + state_dict = copy.deepcopy(self.checkpoint_modules["value_preprocessor"].state_dict()) + state_dict["running_mean"] = checkpoint["model"]["value_mean_std.running_mean"] + state_dict["running_variance"] = checkpoint["model"]["value_mean_std.running_var"] + state_dict["current_count"] = checkpoint["model"]["value_mean_std.count"] + self.checkpoint_modules["value_preprocessor"].load_state_dict(state_dict) + del checkpoint["model"]["value_mean_std.running_mean"] + del checkpoint["model"]["value_mean_std.running_var"] + del checkpoint["model"]["value_mean_std.count"] + # TODO: AMP state preprocessor + # model + status = True + for name, module in self.checkpoint_modules.items(): + if module not in ["state_preprocessor", "value_preprocessor", "optimizer"] and hasattr(module, "migrate"): + if verbose: + logger.info("Model: {} ({})".format(name, type(module).__name__)) + status *= module.migrate(state_dict=checkpoint["model"], + name_map=name_map.get(name, {}), + auto_mapping=auto_mapping, + verbose=verbose) + + self.set_mode("eval") + return bool(status) + def pre_interaction(self, timestep: int, timesteps: int) -> None: """Callback called before the interaction with the environment diff --git a/skrl/models/torch/base.py b/skrl/models/torch/base.py index 9973cc4d..73478f36 100644 --- a/skrl/models/torch/base.py +++ b/skrl/models/torch/base.py @@ -549,10 +549,10 @@ def migrate(self, logger.info("Models") logger.info(" |-- current: {} items".format(len(self.state_dict().keys()))) for name, tensor in self.state_dict().items(): - logger.info(" | |-- {} : {}".format(name, tensor.shape)) + logger.info(" | |-- {} : {}".format(name, list(tensor.shape))) logger.info(" |-- source: {} items".format(len(state_dict.keys()))) for name, tensor in state_dict.items(): - logger.info(" | |-- {} : {}".format(name, tensor.shape)) + logger.info(" | |-- {} : {}".format(name, list(tensor.shape))) logger.info("Migration") # migrate the state_dict to current model From ec243b255b3c3f1248658d4fc5a012de377d89e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Wed, 7 Sep 2022 22:31:00 +0200 Subject: [PATCH 058/108] Remove model instantiator properties and methods from models --- skrl/models/torch/base.py | 39 ---------------------- skrl/models/torch/categorical.py | 8 ++--- skrl/models/torch/deterministic.py | 8 ++--- skrl/models/torch/gaussian.py | 11 ++---- skrl/models/torch/multivariate_gaussian.py | 11 ++---- 5 files changed, 10 insertions(+), 67 deletions(-) diff --git a/skrl/models/torch/base.py b/skrl/models/torch/base.py index 73478f36..06cfadf6 100644 --- a/skrl/models/torch/base.py +++ b/skrl/models/torch/base.py @@ -61,45 +61,6 @@ def act(self, states, taken_actions=None, role=""): self._random_distribution = None - # internal variables to be used by the model instantiators - self._instantiator_net = None - self._instantiator_input_type = 0 - self._instantiator_parameter = None - self._instantiator_output_scale = 1.0 - - def _get_instantiator_output(self, - states: torch.Tensor, - taken_actions: Optional[torch.Tensor] = None) -> Sequence[torch.Tensor]: - """Get 
the output of the instantiator model - - Input shape depends on the instantiator (see skrl.utils.model_instantiator.Shape) as follows: - - - STATES / OBSERVATIONS = 0 - - ACTIONS = -1 - - STATES_ACTIONS = -2 - - :param states: Observation/state of the environment used to make the decision - :type states: torch.Tensor - :param taken_actions: Actions taken by a policy to the given states (default: ``None``) - :type taken_actions: torch.Tensor, optional - - :return: Output of the instantiator model - :rtype: sequence of torch.Tensor - """ - if self._instantiator_input_type == 0: - output = self._instantiator_net(states) - elif self._instantiator_input_type == -1: - output = self._instantiator_net(taken_actions) - elif self._instantiator_input_type == -2: - output = self._instantiator_net(torch.cat((states, taken_actions), dim=1)) - - # deterministic and categorical output - if self._instantiator_parameter is None: - return output * self._instantiator_output_scale - # gaussian output - else: - return output * self._instantiator_output_scale, self._instantiator_parameter - def _get_space_size(self, space: Union[int, Sequence[int], gym.Space], number_of_elements: bool = True) -> int: diff --git a/skrl/models/torch/categorical.py b/skrl/models/torch/categorical.py index 5c779510..19f94f10 100644 --- a/skrl/models/torch/categorical.py +++ b/skrl/models/torch/categorical.py @@ -86,12 +86,8 @@ def act(self, torch.Size([4096, 1]) torch.Size([4096, 1]) torch.Size([4096, 2]) """ # map from states/observations to normalized probabilities or unnormalized log probabilities - if self._instantiator_net is None: - output = self.compute(states.to(self.device), - taken_actions.to(self.device) if taken_actions is not None else taken_actions, role) - else: - output = self._get_instantiator_output(states.to(self.device), \ - taken_actions.to(self.device) if taken_actions is not None else taken_actions) + output = self.compute(states.to(self.device), + taken_actions.to(self.device) if taken_actions is not None else taken_actions, role) # unnormalized log probabilities if self._c_unnormalized_log_prob[role] if role in self._c_unnormalized_log_prob else self._c_unnormalized_log_prob[""]: diff --git a/skrl/models/torch/deterministic.py b/skrl/models/torch/deterministic.py index bc863320..72996d96 100644 --- a/skrl/models/torch/deterministic.py +++ b/skrl/models/torch/deterministic.py @@ -87,12 +87,8 @@ def act(self, torch.Size([4096, 1]) None None """ # map from observations/states to actions - if self._instantiator_net is None: - actions = self.compute(states.to(self.device), - taken_actions.to(self.device) if taken_actions is not None else taken_actions, role) - else: - actions = self._get_instantiator_output(states.to(self.device), \ - taken_actions.to(self.device) if taken_actions is not None else taken_actions) + actions = self.compute(states.to(self.device), + taken_actions.to(self.device) if taken_actions is not None else taken_actions, role) # clip actions if self._d_clip_actions[role] if role in self._d_clip_actions else self._d_clip_actions[""]: diff --git a/skrl/models/torch/gaussian.py b/skrl/models/torch/gaussian.py index c06fe570..8e69a343 100644 --- a/skrl/models/torch/gaussian.py +++ b/skrl/models/torch/gaussian.py @@ -135,14 +135,9 @@ def act(self, torch.Size([4096, 8]) torch.Size([4096, 1]) torch.Size([4096, 8]) """ # map from states/observations to mean actions and log standard deviations - if self._instantiator_net is None: - actions_mean, log_std = self.compute(states.to(self.device), - 
taken_actions.to(self.device) if taken_actions is not None else taken_actions, - role) - else: - actions_mean, log_std = self._get_instantiator_output(states.to(self.device), \ - taken_actions.to(self.device) if taken_actions is not None else taken_actions) - + actions_mean, log_std = self.compute(states.to(self.device), + taken_actions.to(self.device) if taken_actions is not None else taken_actions, role) + # clamp log standard deviations if self._g_clip_log_std[role] if role in self._g_clip_log_std else self._g_clip_log_std[""]: log_std = torch.clamp(log_std, diff --git a/skrl/models/torch/multivariate_gaussian.py b/skrl/models/torch/multivariate_gaussian.py index cc65b7c9..b4d6e7b3 100644 --- a/skrl/models/torch/multivariate_gaussian.py +++ b/skrl/models/torch/multivariate_gaussian.py @@ -121,14 +121,9 @@ def act(self, torch.Size([4096, 8]) torch.Size([4096, 1]) torch.Size([4096, 8]) """ # map from states/observations to mean actions and log standard deviations - if self._instantiator_net is None: - actions_mean, log_std = self.compute(states.to(self.device), - taken_actions.to(self.device) if taken_actions is not None else taken_actions, - role) - else: - actions_mean, log_std = self._get_instantiator_output(states.to(self.device), \ - taken_actions.to(self.device) if taken_actions is not None else taken_actions) - + actions_mean, log_std = self.compute(states.to(self.device), + taken_actions.to(self.device) if taken_actions is not None else taken_actions, role) + # clamp log standard deviations if self._mg_clip_log_std[role] if role in self._mg_clip_log_std else self._mg_clip_log_std[""]: log_std = torch.clamp(log_std, From 52f9e708a42af674366ce039afbe97462a88f31c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Wed, 7 Sep 2022 22:37:09 +0200 Subject: [PATCH 059/108] Update and autocontain model instances --- skrl/utils/model_instantiators.py | 294 +++++++++++++++++++++++------- 1 file changed, 226 insertions(+), 68 deletions(-) diff --git a/skrl/utils/model_instantiators.py b/skrl/utils/model_instantiators.py index d6b2da53..a4626bf6 100644 --- a/skrl/utils/model_instantiators.py +++ b/skrl/utils/model_instantiators.py @@ -7,9 +7,10 @@ import torch.nn as nn from ..models.torch import Model -from ..models.torch import GaussianModel -from ..models.torch import CategoricalModel -from ..models.torch import DeterministicModel +from ..models.torch import GaussianMixin +from ..models.torch import CategoricalMixin +from ..models.torch import DeterministicMixin +from ..models.torch import MultivariateGaussianMixin __all__ = ["categorical_model", "deterministic_model", "gaussian_model", "Shape"] @@ -92,7 +93,7 @@ def _generate_sequential(model: Model, hidden_activation: list = ["relu", "relu"], output_shape: Shape = Shape.ACTIONS, output_activation: Union[str, None] = "tanh", - output_scale: int = None,) -> nn.Sequential: + output_scale: int = None) -> nn.Sequential: """Generate a sequential model :param model: model to generate sequential model for @@ -141,8 +142,8 @@ def gaussian_model(observation_space: Union[int, Tuple[int], gym.Space, None] = hidden_activation: list = ["relu", "relu"], output_shape: Shape = Shape.ACTIONS, output_activation: Union[str, None] = "tanh", - output_scale: float = 1.0) -> GaussianModel: - """Instantiate a GaussianModel model + output_scale: float = 1.0) -> Model: + """Instantiate a Gaussian model :param observation_space: Observation/state space or shape (default: None). 
If it is not None, the num_observations property will contain the size of that space @@ -174,30 +175,145 @@ def gaussian_model(observation_space: Union[int, Tuple[int], gym.Space, None] = If None, the output layer will not be scaled :type output_scale: float, optional - :return: GaussianModel instance - :rtype: GaussianModel + :return: Gaussian model instance + :rtype: Model """ - model = GaussianModel(observation_space=observation_space, - action_space=action_space, - device=device, - clip_actions=clip_actions, - clip_log_std=clip_log_std, - min_log_std=min_log_std, - max_log_std=max_log_std) - - model._instantiator_net = _generate_sequential(model=model, - input_shape=input_shape, - hiddens=hiddens, - hidden_activation=hidden_activation, - output_shape=output_shape, - output_activation=output_activation, - output_scale=output_scale) - model._instantiator_output_scale = output_scale - model._instantiator_input_type = input_shape.value - model._instantiator_parameter = nn.Parameter(torch.zeros(_get_num_units_by_shape(model, output_shape))) - - return model + class GaussianModel(GaussianMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions, + clip_log_std, min_log_std, max_log_std, metadata): + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) + + self.instantiator_output_scale = metadata["output_scale"] + self.instantiator_input_type = metadata["input_shape"].value + + self.net = _generate_sequential(model=self, + input_shape=metadata["input_shape"], + hiddens=metadata["hiddens"], + hidden_activation=metadata["hidden_activation"], + output_shape=metadata["output_shape"], + output_activation=metadata["output_activation"], + output_scale=metadata["output_scale"]) + self.log_std_parameter = nn.Parameter(torch.zeros(_get_num_units_by_shape(self, metadata["output_shape"]))) + + def compute(self, states, taken_actions=None, role=""): + if self.instantiator_input_type == 0: + output = self.net(states) + elif self.instantiator_input_type == -1: + output = self.net(taken_actions) + elif self.instantiator_input_type == -2: + output = self.net(torch.cat((states, taken_actions), dim=1)) + + return output * self.instantiator_output_scale, self.log_std_parameter + + metadata = {"input_shape": input_shape, + "hiddens": hiddens, + "hidden_activation": hidden_activation, + "output_shape": output_shape, + "output_activation": output_activation, + "output_scale": output_scale} + + return GaussianModel(observation_space=observation_space, + action_space=action_space, + device=device, + clip_actions=clip_actions, + clip_log_std=clip_log_std, + min_log_std=min_log_std, + max_log_std=max_log_std, + metadata=metadata) +def multivariate_gaussian_model(observation_space: Union[int, Tuple[int], gym.Space, None] = None, + action_space: Union[int, Tuple[int], gym.Space, None] = None, + device: Union[str, torch.device] = "cuda:0", + clip_actions: bool = False, + clip_log_std: bool = True, + min_log_std: float = -20, + max_log_std: float = 2, + input_shape: Shape = Shape.STATES, + hiddens: list = [256, 256], + hidden_activation: list = ["relu", "relu"], + output_shape: Shape = Shape.ACTIONS, + output_activation: Union[str, None] = "tanh", + output_scale: float = 1.0) -> Model: + """Instantiate a multivariate Gaussian model + + :param observation_space: Observation/state space or shape (default: None). 
+ If it is not None, the num_observations property will contain the size of that space + :type observation_space: int, tuple or list of integers, gym.Space or None, optional + :param action_space: Action space or shape (default: None). + If it is not None, the num_actions property will contain the size of that space + :type action_space: int, tuple or list of integers, gym.Space or None, optional + :param device: Device on which the model will be trained (default: "cuda:0") + :type device: str or torch.device, optional + :param clip_actions: Flag to indicate whether the actions should be clipped (default: False) + :type clip_actions: bool, optional + :param clip_log_std: Flag to indicate whether the log standard deviations should be clipped (default: True) + :type clip_log_std: bool, optional + :param min_log_std: Minimum value of the log standard deviation (default: -20) + :type min_log_std: float, optional + :param max_log_std: Maximum value of the log standard deviation (default: 2) + :type max_log_std: float, optional + :param input_shape: Shape of the input (default: Shape.STATES) + :type input_shape: Shape, optional + :param hiddens: Number of hidden units in each hidden layer + :type hiddens: int or list of ints + :param hidden_activation: Activation function for each hidden layer (default: "relu"). + :type hidden_activation: list of strings + :param output_shape: Shape of the output (default: Shape.ACTIONS) + :type output_shape: Shape, optional + :param output_activation: Activation function for the output layer (default: "tanh") + :type output_activation: str or None, optional + :param output_scale: Scale of the output layer (default: 1.0). + If None, the output layer will not be scaled + :type output_scale: float, optional + + :return: Multivariate Gaussian model instance + :rtype: Model + """ + class MultivariateGaussianModel(MultivariateGaussianMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions, + clip_log_std, min_log_std, max_log_std, metadata): + Model.__init__(self, observation_space, action_space, device) + MultivariateGaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) + + self.instantiator_output_scale = metadata["output_scale"] + self.instantiator_input_type = metadata["input_shape"].value + + self.net = _generate_sequential(model=self, + input_shape=metadata["input_shape"], + hiddens=metadata["hiddens"], + hidden_activation=metadata["hidden_activation"], + output_shape=metadata["output_shape"], + output_activation=metadata["output_activation"], + output_scale=metadata["output_scale"]) + self.log_std_parameter = nn.Parameter(torch.zeros(_get_num_units_by_shape(self, metadata["output_shape"]))) + + def compute(self, states, taken_actions=None, role=""): + if self.instantiator_input_type == 0: + output = self.net(states) + elif self.instantiator_input_type == -1: + output = self.net(taken_actions) + elif self.instantiator_input_type == -2: + output = self.net(torch.cat((states, taken_actions), dim=1)) + + return output * self.instantiator_output_scale, self.log_std_parameter + + metadata = {"input_shape": input_shape, + "hiddens": hiddens, + "hidden_activation": hidden_activation, + "output_shape": output_shape, + "output_activation": output_activation, + "output_scale": output_scale} + + return MultivariateGaussianModel(observation_space=observation_space, + action_space=action_space, + device=device, + clip_actions=clip_actions, + clip_log_std=clip_log_std, + min_log_std=min_log_std, + 
max_log_std=max_log_std, + metadata=metadata) + def deterministic_model(observation_space: Union[int, Tuple[int], gym.Space, None] = None, action_space: Union[int, Tuple[int], gym.Space, None] = None, device: Union[str, torch.device] = "cuda:0", @@ -207,8 +323,8 @@ def deterministic_model(observation_space: Union[int, Tuple[int], gym.Space, Non hidden_activation: list = ["relu", "relu"], output_shape: Shape = Shape.ACTIONS, output_activation: Union[str, None] = "tanh", - output_scale: float = 1.0) -> DeterministicModel: - """Instantiate a DeterministicModel model + output_scale: float = 1.0) -> Model: + """Instantiate a deterministic model :param observation_space: Observation/state space or shape (default: None). If it is not None, the num_observations property will contain the size of that space @@ -234,26 +350,48 @@ def deterministic_model(observation_space: Union[int, Tuple[int], gym.Space, Non If None, the output layer will not be scaled :type output_scale: float, optional - :return: DeterministicModel instance - :rtype: DeterministicModel + :return: Deterministic model instance + :rtype: Model """ - model = DeterministicModel(observation_space=observation_space, - action_space=action_space, - device=device, - clip_actions=clip_actions) - - model._instantiator_net = _generate_sequential(model=model, - input_shape=input_shape, - hiddens=hiddens, - hidden_activation=hidden_activation, - output_shape=output_shape, - output_activation=output_activation, - output_scale=output_scale) - model._instantiator_output_scale = output_scale - model._instantiator_input_type = input_shape.value - - return model - + class DeterministicModel(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions, metadata): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) + + self.instantiator_output_scale = metadata["output_scale"] + self.instantiator_input_type = metadata["input_shape"].value + + self.net = _generate_sequential(model=self, + input_shape=metadata["input_shape"], + hiddens=metadata["hiddens"], + hidden_activation=metadata["hidden_activation"], + output_shape=metadata["output_shape"], + output_activation=metadata["output_activation"], + output_scale=metadata["output_scale"]) + + def compute(self, states, taken_actions=None, role=""): + if self.instantiator_input_type == 0: + output = self.net(states) + elif self.instantiator_input_type == -1: + output = self.net(taken_actions) + elif self.instantiator_input_type == -2: + output = self.net(torch.cat((states, taken_actions), dim=1)) + + return output * self.instantiator_output_scale + + metadata = {"input_shape": input_shape, + "hiddens": hiddens, + "hidden_activation": hidden_activation, + "output_shape": output_shape, + "output_activation": output_activation, + "output_scale": output_scale} + + return DeterministicModel(observation_space=observation_space, + action_space=action_space, + device=device, + clip_actions=clip_actions, + metadata=metadata) + def categorical_model(observation_space: Union[int, Tuple[int], gym.Space, None] = None, action_space: Union[int, Tuple[int], gym.Space, None] = None, device: Union[str, torch.device] = "cuda:0", @@ -262,8 +400,8 @@ def categorical_model(observation_space: Union[int, Tuple[int], gym.Space, None] hiddens: list = [256, 256], hidden_activation: list = ["relu", "relu"], output_shape: Shape = Shape.ACTIONS, - output_activation: Union[str, None] = None) -> CategoricalModel: - """Instantiate a 
CategoricalModel model + output_activation: Union[str, None] = None) -> Model: + """Instantiate a categorical model :param observation_space: Observation/state space or shape (default: None). If it is not None, the num_observations property will contain the size of that space @@ -289,21 +427,41 @@ def categorical_model(observation_space: Union[int, Tuple[int], gym.Space, None] :param output_activation: Activation function for the output layer (default: None) :type output_activation: str or None, optional - :return: CategoricalModel instance - :rtype: CategoricalModel + :return: Categorical model instance + :rtype: Model """ - model = CategoricalModel(observation_space=observation_space, - action_space=action_space, - device=device, - unnormalized_log_prob=unnormalized_log_prob) - - model._instantiator_net = _generate_sequential(model=model, - input_shape=input_shape, - hiddens=hiddens, - hidden_activation=hidden_activation, - output_shape=output_shape, - output_activation=output_activation) - model._instantiator_input_type = input_shape.value - - return model - \ No newline at end of file + class CategoricalModel(CategoricalMixin, Model): + def __init__(self, observation_space, action_space, device, unnormalized_log_prob, metadata): + Model.__init__(self, observation_space, action_space, device) + CategoricalMixin.__init__(self, unnormalized_log_prob) + + self.instantiator_input_type = metadata["input_shape"].value + + self.net = _generate_sequential(model=self, + input_shape=metadata["input_shape"], + hiddens=metadata["hiddens"], + hidden_activation=metadata["hidden_activation"], + output_shape=metadata["output_shape"], + output_activation=metadata["output_activation"]) + + def compute(self, states, taken_actions=None, role=""): + if self.instantiator_input_type == 0: + output = self.net(states) + elif self.instantiator_input_type == -1: + output = self.net(taken_actions) + elif self.instantiator_input_type == -2: + output = self.net(torch.cat((states, taken_actions), dim=1)) + + return output + + metadata = {"input_shape": input_shape, + "hiddens": hiddens, + "hidden_activation": hidden_activation, + "output_shape": output_shape, + "output_activation": output_activation} + + return CategoricalModel(observation_space=observation_space, + action_space=action_space, + device=device, + unnormalized_log_prob=unnormalized_log_prob, + metadata=metadata) From acd81ae0f1c46a3568570e3edc9f1807e4cfd592 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Wed, 7 Sep 2022 22:49:15 +0200 Subject: [PATCH 060/108] Expose multivariate Gaussian instantiator and add it to docs --- docs/source/modules/skrl.utils.model_instantiators.rst | 4 +++- skrl/utils/model_instantiators.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/source/modules/skrl.utils.model_instantiators.rst b/docs/source/modules/skrl.utils.model_instantiators.rst index 005dfc94..536f4d04 100644 --- a/docs/source/modules/skrl.utils.model_instantiators.rst +++ b/docs/source/modules/skrl.utils.model_instantiators.rst @@ -36,6 +36,8 @@ API .. autofunction:: skrl.utils.model_instantiators.categorical_model +.. autofunction:: skrl.utils.model_instantiators.deterministic_model + .. autofunction:: skrl.utils.model_instantiators.gaussian_model -.. autofunction:: skrl.utils.model_instantiators.deterministic_model +.. 
autofunction:: skrl.utils.model_instantiators.multivariate_gaussian_model diff --git a/skrl/utils/model_instantiators.py b/skrl/utils/model_instantiators.py index a4626bf6..86b64294 100644 --- a/skrl/utils/model_instantiators.py +++ b/skrl/utils/model_instantiators.py @@ -12,7 +12,7 @@ from ..models.torch import DeterministicMixin from ..models.torch import MultivariateGaussianMixin -__all__ = ["categorical_model", "deterministic_model", "gaussian_model", "Shape"] +__all__ = ["categorical_model", "deterministic_model", "gaussian_model", "multivariate_gaussian_model", "Shape"] class Shape(Enum): From 7d8c8f96831a49e3886359a9fbd714be655a53e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Sat, 10 Sep 2022 12:11:52 +0200 Subject: [PATCH 061/108] Add packaging to library dependencies --- docs/requirements.txt | 1 + docs/source/intro/installation.rst | 1 + setup.py | 1 + 3 files changed, 3 insertions(+) diff --git a/docs/requirements.txt b/docs/requirements.txt index 65315ffa..dc80cf40 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -6,3 +6,4 @@ gym torch tensorboard tqdm +packaging diff --git a/docs/source/intro/installation.rst b/docs/source/intro/installation.rst index eaf11187..cd5e6bdb 100644 --- a/docs/source/intro/installation.rst +++ b/docs/source/intro/installation.rst @@ -12,6 +12,7 @@ Prerequisites * `gym `_ * `tqdm `_ + * `packaging `_ * `torch `_ 1.8.0 or higher * `tensorboard `_ diff --git a/setup.py b/setup.py index d9bf4aa1..bbf87912 100644 --- a/setup.py +++ b/setup.py @@ -13,6 +13,7 @@ "torch", "tensorboard", "tqdm", + "packaging", ] # installation From a75d6d2533d1831ad8058cce0843d6facf90235a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Sat, 10 Sep 2022 12:37:44 +0200 Subject: [PATCH 062/108] Fix Gym's environment wrapper for the new API --- skrl/envs/torch/wrappers.py | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/skrl/envs/torch/wrappers.py b/skrl/envs/torch/wrappers.py index c4dfb43c..2ab26175 100644 --- a/skrl/envs/torch/wrappers.py +++ b/skrl/envs/torch/wrappers.py @@ -3,9 +3,12 @@ import gym import collections import numpy as np +from packaging import version import torch +from skrl import logger + __all__ = ["wrap_env"] @@ -271,10 +274,9 @@ def __init__(self, env: Any) -> None: except Exception as e: print("[WARNING] Failed to check for a vectorized environment: {}".format(e)) - if hasattr(self, "new_step_api"): - self._new_step_api = self._env.new_step_api - else: - self._new_step_api = False + self._drepecated_api = version.parse(gym.__version__) < version.parse(" 0.25.0") + if self._drepecated_api: + logger.warning("Using a deprecated version of OpenAI Gym's API: {}".format(gym.__version__)) @property def state_space(self) -> gym.Space: @@ -346,8 +348,11 @@ def _tensor_to_action(self, actions: torch.Tensor) -> Any: """ space = self._env.action_space if self._vectorized else self.action_space - if self._vectorized and isinstance(space, gym.spaces.MultiDiscrete): - return np.array(actions.cpu().numpy(), dtype=space.dtype).reshape(space.shape) + if self._vectorized: + if isinstance(space, gym.spaces.MultiDiscrete): + return np.array(actions.cpu().numpy(), dtype=space.dtype).reshape(space.shape) + elif isinstance(space, gym.spaces.Tuple): + return np.array(actions.cpu().numpy(), dtype=space[0].dtype).reshape(space.shape) elif isinstance(space, gym.spaces.Discrete): return actions.item() elif isinstance(space, gym.spaces.Box): @@ 
-364,11 +369,14 @@ def step(self, actions: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch :return: The state, the reward, the done flag, and the info :rtype: tuple of torch.Tensor and any other info """ - if self._new_step_api: - observation, reward, termination, truncation, info = self._env.step(self._tensor_to_action(actions)) - done = termination or truncation - else: + if self._drepecated_api: observation, reward, done, info = self._env.step(self._tensor_to_action(actions)) + else: + observation, reward, termination, truncation, info = self._env.step(self._tensor_to_action(actions)) + if type(termination) is bool: + done = termination or truncation + else: + done = np.logical_or(termination, truncation) # convert response to torch return self._observation_to_tensor(observation), \ torch.tensor(reward, device=self.device, dtype=torch.float32).view(self.num_envs, -1), \ @@ -381,7 +389,10 @@ def reset(self) -> torch.Tensor: :return: The state of the environment :rtype: torch.Tensor """ - observation = self._env.reset() + if self._drepecated_api: + observation = self._env.reset() + else: + observation, info = self._env.reset() return self._observation_to_tensor(observation) def render(self, *args, **kwargs) -> None: From 6f8443e8fa6630e6508ba647dd2e5958cb963536 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Sat, 10 Sep 2022 17:40:26 +0200 Subject: [PATCH 063/108] Convert tabular model class to mixin --- skrl/models/torch/__init__.py | 2 +- skrl/models/torch/tabular.py | 167 ++++++++++++++++++++++++++-------- 2 files changed, 130 insertions(+), 39 deletions(-) diff --git a/skrl/models/torch/__init__.py b/skrl/models/torch/__init__.py index ed7b6389..9a4e879c 100644 --- a/skrl/models/torch/__init__.py +++ b/skrl/models/torch/__init__.py @@ -1,6 +1,6 @@ from .base import Model -from .tabular import TabularModel +from .tabular import TabularMixin from .gaussian import GaussianMixin from .categorical import CategoricalMixin from .deterministic import DeterministicMixin diff --git a/skrl/models/torch/tabular.py b/skrl/models/torch/tabular.py index b2206f83..8fe88279 100644 --- a/skrl/models/torch/tabular.py +++ b/skrl/models/torch/tabular.py @@ -1,40 +1,67 @@ -from typing import Union, Tuple - -import gym +from typing import Optional, Mapping, Sequence import torch from . import Model -class TabularModel(Model): - def __init__(self, - observation_space: Union[int, Tuple[int], gym.Space, None] = None, - action_space: Union[int, Tuple[int], gym.Space, None] = None, - device: Union[str, torch.device] = "cuda:0", - num_envs: int = 1) -> None: - """Tabular model - - :param observation_space: Observation/state space or shape (default: None). - If it is not None, the num_observations property will contain the size of that space - :type observation_space: int, tuple or list of integers, gym.Space or None, optional - :param action_space: Action space or shape (default: None). 
- If it is not None, the num_actions property will contain the size of that space - :type action_space: int, tuple or list of integers, gym.Space or None, optional - :param device: Device on which a torch tensor is or will be allocated (default: "cuda:0") - :type device: str or torch.device, optional +class TabularMixin: + def __init__(self, num_envs: int = 1, role: str = "") -> None: + """Tabular mixin model + :param num_envs: Number of environments (default: 1) :type num_envs: int, optional + :param role: Role played by the model (default: ``""``) + :type role: str, optional + + Example:: + + # define the model + >>> import torch + >>> from skrl.models.torch import Model, TabularMixin + >>> + >>> class GreedyPolicy(TabularMixin, Model): + ... def __init__(self, observation_space, action_space, device="cuda:0", num_envs=1): + ... Model.__init__(self, observation_space, action_space, device) + ... TabularMixin.__init__(self, num_envs) + ... + ... self.table = torch.ones((num_envs, self.num_observations, self.num_actions), + ... dtype=torch.float32, device=self.device) + ... + ... def compute(self, states, taken_actions, role): + ... actions = torch.argmax(self.table[torch.arange(self.num_envs).view(-1, 1), states], + ... dim=-1, keepdim=True).view(-1,1) + ... + >>> # given an observation_space: gym.spaces.Discrete with n=100 + >>> # and an action_space: gym.spaces.Discrete with n=5 + >>> model = GreedyPolicy(observation_space, action_space, num_envs=1) + >>> + >>> print(model) + GreedyPolicy( + (table): Tensor(shape=[1, 100, 5]) + ) """ - super(TabularModel, self).__init__(observation_space, action_space, device) - self.num_envs = num_envs - def _get_tensor_names(self) -> Tuple[str]: + def __repr__(self) -> str: + """String representation of an object as torch.nn.Module + """ + lines = [] + for name in self._get_tensor_names(): + tensor = getattr(self, name) + lines.append("({}): {}(shape={})".format(name, tensor.__class__.__name__, list(tensor.shape))) + + main_str = self.__class__.__name__ + '(' + if lines: + main_str += "\n {}\n".format("\n ".join(lines)) + main_str += ')' + return main_str + + def _get_tensor_names(self) -> Sequence[str]: """Get the names of the tensors that the model is using :return: Tensor names - :rtype: tuple of str + :rtype: sequence of str """ tensors = [] for attr in dir(self): @@ -44,24 +71,31 @@ def _get_tensor_names(self) -> Tuple[str]: def act(self, states: torch.Tensor, - taken_actions: Union[torch.Tensor, None] = None, - inference=False) -> Tuple[torch.Tensor]: + taken_actions: Optional[torch.Tensor] = None, + role: str = "") -> Sequence[torch.Tensor]: """Act in response to the state of the environment :param states: Observation/state of the environment used to make the decision :type states: torch.Tensor - :param taken_actions: Actions taken by a policy to the given states (default: None) - :type taken_actions: torch.Tensor or None, optional - :param inference: Flag to indicate whether the model is making inference (default: False). - If True, the returned tensors will be detached from the current graph - :type inference: bool, optional + :param taken_actions: Actions taken by a policy to the given states (default: ``None``). + The use of these actions only makes sense in critic models + :type taken_actions: torch.Tensor, optional + :param role: Role played by the model (default: ``""``) + :type role: str, optional :return: Action to be taken by the agent given the state of the environment. 
- The tuple's components are the computed actions and None for the last two components - :rtype: tuple of torch.Tensor + The sequence's components are the computed actions and None for the last two components + :rtype: sequence of torch.Tensor + + Example:: + + >>> # given a batch of sample states with shape (1, 100) + >>> output = model.act(states) + >>> print(output[0], output[1], output[2]) + tensor([[3]], device='cuda:0') None None """ actions = self.compute(states.to(self.device), - taken_actions.to(self.device) if taken_actions is not None else taken_actions) + taken_actions.to(self.device) if taken_actions is not None else taken_actions, role) return actions, None, None def table(self) -> torch.Tensor: @@ -69,6 +103,12 @@ def table(self) -> torch.Tensor: :return: Q-table :rtype: torch.Tensor + + Example:: + + >>> output = model.table() + >>> print(output.shape) + torch.Size([1, 100, 5]) """ return self.q_table @@ -83,29 +123,80 @@ def to(self, *args, **kwargs) -> Model: :return: Model moved to the specified device :rtype: Model """ - super(TabularModel, self).to(*args, **kwargs) + Model.to(self, *args, **kwargs) for name in self._get_tensor_names(): setattr(self, name, getattr(self, name).to(*args, **kwargs)) return self - def save(self, path: str, state_dict: Union[dict, None] = None) -> None: + def state_dict(self, *args, **kwargs) -> Mapping: + """Returns a dictionary containing a whole state of the module + + :return: A dictionary containing a whole state of the module + :rtype: dict + """ + _state_dict = {name: getattr(self, name) for name in self._get_tensor_names()} + Model.state_dict(self, destination=_state_dict) + return _state_dict + + def load_state_dict(self, state_dict: Mapping, strict: bool = True) -> None: + """Copies parameters and buffers from state_dict into this module and its descendants + + :param state_dict: A dict containing parameters and persistent buffers + :type state_dict: dict + :param strict: Whether to strictly enforce that the keys in state_dict match the keys + returned by this module's state_dict() function (default: ``True``) + :type strict: bool, optional + """ + Model.load_state_dict(self, state_dict, strict=False) + + for name, tensor in state_dict.items(): + if hasattr(self, name) and isinstance(getattr(self, name), torch.Tensor): + _tensor = getattr(self, name) + if isinstance(_tensor, torch.Tensor): + if _tensor.shape == tensor.shape and _tensor.dtype == tensor.dtype: + setattr(self, name, tensor) + else: + raise ValueError("Tensor shape ({} vs {}) or dtype ({} vs {}) mismatch"\ + .format(_tensor.shape, tensor.shape, _tensor.dtype, tensor.dtype)) + else: + raise ValueError("{} is not a tensor of {}".format(name, self.__class__.__name__)) + + def save(self, path: str, state_dict: Optional[dict] = None) -> None: """Save the model to the specified path :param path: Path to save the model to :type path: str - :param state_dict: State dictionary to save (default: None). + :param state_dict: State dictionary to save (default: ``None``). 
If None, the model's state_dict will be saved :type state_dict: dict, optional + + Example:: + + # save the current model to the specified path + >>> model.save("/tmp/model.pt") """ + # TODO: save state_dict torch.save({name: getattr(self, name) for name in self._get_tensor_names()}, path) def load(self, path: str) -> None: """Load the model from the specified path - - :raises ValueError: If the models are not compatible + + The final storage device is determined by the constructor of the model :param path: Path to load the model from :type path: str + + :raises ValueError: If the models are not compatible + + Example:: + + # load the model onto the CPU + >>> model = Model(observation_space, action_space, device="cpu") + >>> model.load("model.pt") + + # load the model onto the GPU 1 + >>> model = Model(observation_space, action_space, device="cuda:1") + >>> model.load("model.pt") """ tensors = torch.load(path) for name, tensor in tensors.items(): From 349f699c38c835062b72afa6657afdb399939ed3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Sat, 10 Sep 2022 17:41:42 +0200 Subject: [PATCH 064/108] Update tabular model in docs --- docs/source/modules/skrl.models.tabular.rst | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/source/modules/skrl.models.tabular.rst b/docs/source/modules/skrl.models.tabular.rst index 25b27bba..774b338a 100644 --- a/docs/source/modules/skrl.models.tabular.rst +++ b/docs/source/modules/skrl.models.tabular.rst @@ -4,7 +4,7 @@ Tabular model ============= Basic usage -^^^^^^^^^^^ +----------- .. tabs:: @@ -17,11 +17,11 @@ Basic usage :end-before: [end-epsilon-greedy] API -^^^ +--- -.. autoclass:: skrl.models.torch.tabular.TabularModel - :show-inheritance: - :members: - - .. automethod:: __init__ - .. automethod:: compute +.. autoclass:: skrl.models.torch.tabular.TabularMixin + :show-inheritance: + :exclude-members: to, state_dict, load_state_dict, load, save + :members: + + .. 
automethod:: __init__ From 4506c6018768e045993b8ce83748587dcfa22a2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Sat, 10 Sep 2022 23:36:52 +0200 Subject: [PATCH 065/108] Update OpenAI Gym examples --- docs/source/examples/gym/gym_cartpole_cem.py | 82 +++++++++++++++++++ .../examples/gym/gym_cartpole_cem_eval.py | 75 +++++++++++++++++ docs/source/examples/gym/gym_cartpole_dqn.py | 44 +++++----- .../examples/gym/gym_cartpole_dqn_eval.py | 28 +++---- .../gym/gym_frozen_lake_q_learning.py | 26 +++--- .../gym/gym_frozen_lake_q_learning_eval.py | 28 ++++--- docs/source/examples/gym/gym_pendulum_ddpg.py | 50 +++++------ .../examples/gym/gym_pendulum_ddpg_eval.py | 24 +++--- docs/source/examples/gym/gym_taxi_sarsa.py | 26 +++--- .../examples/gym/gym_taxi_sarsa_eval.py | 26 +++--- .../examples/gym/gym_vector_cartpole_dqn.py | 44 +++++----- .../gym/gym_vector_frozen_lake_q_learning.py | 36 ++++---- .../examples/gym/gym_vector_pendulum_ddpg.py | 46 ++++++----- .../examples/gym/gym_vector_taxi_sarsa.py | 26 +++--- 14 files changed, 361 insertions(+), 200 deletions(-) create mode 100644 docs/source/examples/gym/gym_cartpole_cem.py create mode 100644 docs/source/examples/gym/gym_cartpole_cem_eval.py diff --git a/docs/source/examples/gym/gym_cartpole_cem.py b/docs/source/examples/gym/gym_cartpole_cem.py new file mode 100644 index 00000000..ac78940b --- /dev/null +++ b/docs/source/examples/gym/gym_cartpole_cem.py @@ -0,0 +1,82 @@ +import gym + +import torch.nn as nn +import torch.nn.functional as F + +# Import the skrl components to build the RL system +from skrl.models.torch import Model, CategoricalMixin +from skrl.memories.torch import RandomMemory +from skrl.agents.torch.cem import CEM, CEM_DEFAULT_CONFIG +from skrl.trainers.torch import SequentialTrainer +from skrl.envs.torch import wrap_env + + +# Define the model (categorical model) for the CEM agent using mixin +# - Policy: takes as input the environment's observation/state and returns an action +class Policy(CategoricalMixin, Model): + def __init__(self, observation_space, action_space, device, unnormalized_log_prob=True): + Model.__init__(self, observation_space, action_space, device) + CategoricalMixin.__init__(self, unnormalized_log_prob) + + self.linear_layer_1 = nn.Linear(self.num_observations, 64) + self.linear_layer_2 = nn.Linear(64, 64) + self.output_layer = nn.Linear(64, self.num_actions) + + def compute(self, states, taken_actions, role): + x = F.relu(self.linear_layer_1(states)) + x = F.relu(self.linear_layer_2(x)) + return self.output_layer(x) + + +# Load and wrap the Gym environment. +# Note: the environment version may change depending on the gym version +try: + env = gym.make("CartPole-v0") +except gym.error.DeprecatedEnv as e: + env_id = [spec.id for spec in gym.envs.registry.all() if spec.id.startswith("CartPole-v")][0] + print("CartPole-v0 not found. Trying {}".format(env_id)) + env = gym.make(env_id) +env = wrap_env(env) + +device = env.device + + +# Instantiate a RandomMemory (without replacement) as experience replay memory +memory = RandomMemory(memory_size=1000, num_envs=env.num_envs, device=device, replacement=False) + + +# Instantiate the agent's model (function approximator). 
+# CEM requires 1 model, visit its documentation for more details +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.cem.html#spaces-and-models +models_cem = {} +models_cem["policy"] = Policy(env.observation_space, env.action_space, device) + +# Initialize the models' parameters (weights and biases) using a Gaussian distribution +for model in models_cem.values(): + model.init_parameters(method_name="normal_", mean=0.0, std=0.1) + + +# Configure and instantiate the agent. +# Only modify some of the default configuration, visit its documentation to see all the options +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.cem.html#configuration-and-hyperparameters +cfg_cem = CEM_DEFAULT_CONFIG.copy() +cfg_cem["rollouts"] = 1000 +cfg_cem["learning_starts"] = 100 +# logging to TensorBoard and write checkpoints each 1000 and 5000 timesteps respectively +cfg_cem["experiment"]["write_interval"] = 1000 +cfg_cem["experiment"]["checkpoint_interval"] = 5000 + +agent_cem = CEM(models=models_cem, + memory=memory, + cfg=cfg_cem, + observation_space=env.observation_space, + action_space=env.action_space, + device=device) + + +# Configure and instantiate the RL trainer +cfg_trainer = {"timesteps": 100000, "headless": True} +trainer = SequentialTrainer(env=env, agents=[agent_cem], cfg=cfg_trainer) + +# start training +trainer.train() diff --git a/docs/source/examples/gym/gym_cartpole_cem_eval.py b/docs/source/examples/gym/gym_cartpole_cem_eval.py new file mode 100644 index 00000000..72a3633e --- /dev/null +++ b/docs/source/examples/gym/gym_cartpole_cem_eval.py @@ -0,0 +1,75 @@ +import gym + +import torch.nn as nn +import torch.nn.functional as F + +# Import the skrl components to build the RL system +from skrl.models.torch import Model, CategoricalMixin +from skrl.agents.torch.cem import CEM, CEM_DEFAULT_CONFIG +from skrl.trainers.torch import SequentialTrainer +from skrl.envs.torch import wrap_env + + +# Define the model (categorical model) for the CEM agent using mixin +# - Policy: takes as input the environment's observation/state and returns an action +class Policy(CategoricalMixin, Model): + def __init__(self, observation_space, action_space, device, unnormalized_log_prob=True): + Model.__init__(self, observation_space, action_space, device) + CategoricalMixin.__init__(self, unnormalized_log_prob) + + self.linear_layer_1 = nn.Linear(self.num_observations, 64) + self.linear_layer_2 = nn.Linear(64, 64) + self.output_layer = nn.Linear(64, self.num_actions) + + def compute(self, states, taken_actions, role): + x = F.relu(self.linear_layer_1(states)) + x = F.relu(self.linear_layer_2(x)) + return self.output_layer(x) + + +# Load and wrap the Gym environment. +# Note: the environment version may change depending on the gym version +try: + env = gym.make("CartPole-v0") +except gym.error.DeprecatedEnv as e: + env_id = [spec.id for spec in gym.envs.registry.all() if spec.id.startswith("CartPole-v")][0] + print("CartPole-v0 not found. Trying {}".format(env_id)) + env = gym.make(env_id) +env = wrap_env(env) + +device = env.device + + +# Instantiate the agent's model (function approximators). +# CEM requires 1 model, visit its documentation for more details +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.cem.html#spaces-and-models +models_cem = {} +models_cem["policy"] = Policy(env.observation_space, env.action_space, device) + + +# Configure and instantiate the agent. 
+# Only modify some of the default configuration, visit its documentation to see all the options +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.cem.html#configuration-and-hyperparameters +cfg_cem = CEM_DEFAULT_CONFIG.copy() +cfg_cem["rollouts"] = 1000 +cfg_cem["learning_starts"] = 100 +# logging to TensorBoard each 1000 timesteps and ignore checkpoints +cfg_cem["experiment"]["write_interval"] = 1000 +cfg_cem["experiment"]["checkpoint_interval"] = 0 + +agent_cem = CEM(models=models_cem, + memory=None, + cfg=cfg_cem, + observation_space=env.observation_space, + action_space=env.action_space, + device=device) + +# load checkpoint +agent_cem.load("./runs/22-09-07_21-41-05-854385_CEM/checkpoints/best_agent.pt") + +# Configure and instantiate the RL trainer +cfg_trainer = {"timesteps": 100000, "headless": True} +trainer = SequentialTrainer(env=env, agents=[agent_cem], cfg=cfg_trainer) + +# evaluate the agent +trainer.eval() diff --git a/docs/source/examples/gym/gym_cartpole_dqn.py b/docs/source/examples/gym/gym_cartpole_dqn.py index cb08ae85..98c7f6fe 100644 --- a/docs/source/examples/gym/gym_cartpole_dqn.py +++ b/docs/source/examples/gym/gym_cartpole_dqn.py @@ -22,32 +22,33 @@ # Instantiate a RandomMemory (without replacement) as experience replay memory -memory = RandomMemory(memory_size=100000, num_envs=env.num_envs, device=device, replacement=False) +memory = RandomMemory(memory_size=50000, num_envs=env.num_envs, device=device, replacement=False) # Instantiate the agent's models (function approximators) using the model instantiator utility # DQN requires 2 models, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.dqn.html#spaces-and-models -models_dqn = {"q_network": deterministic_model(observation_space=env.observation_space, - action_space=env.action_space, - device=device, - clip_actions=False, - input_shape=Shape.OBSERVATIONS, - hiddens=[64, 64], - hidden_activation=["relu", "relu"], - output_shape=Shape.ACTIONS, - output_activation=None, - output_scale=1.0), - "target_q_network": deterministic_model(observation_space=env.observation_space, - action_space=env.action_space, - device=device, - clip_actions=False, - input_shape=Shape.OBSERVATIONS, - hiddens=[64, 64], - hidden_activation=["relu", "relu"], - output_shape=Shape.ACTIONS, - output_activation=None, - output_scale=1.0)} +models_dqn = {} +models_dqn["q_network"] = deterministic_model(observation_space=env.observation_space, + action_space=env.action_space, + device=device, + clip_actions=False, + input_shape=Shape.OBSERVATIONS, + hiddens=[64, 64], + hidden_activation=["relu", "relu"], + output_shape=Shape.ACTIONS, + output_activation=None, + output_scale=1.0) +models_dqn["target_q_network"] = deterministic_model(observation_space=env.observation_space, + action_space=env.action_space, + device=device, + clip_actions=False, + input_shape=Shape.OBSERVATIONS, + hiddens=[64, 64], + hidden_activation=["relu", "relu"], + output_shape=Shape.ACTIONS, + output_activation=None, + output_scale=1.0) # Initialize the models' parameters (weights and biases) using a Gaussian distribution for model in models_dqn.values(): @@ -58,7 +59,6 @@ # Only modify some of the default configuration, visit its documentation to see all the options # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.dqn.html#configuration-and-hyperparameters cfg_dqn = DQN_DEFAULT_CONFIG.copy() -cfg_dqn["random_timesteps"] = 0 cfg_dqn["learning_starts"] = 100 cfg_dqn["exploration"]["final_epsilon"] = 0.04 
cfg_dqn["exploration"]["timesteps"] = 1500 diff --git a/docs/source/examples/gym/gym_cartpole_dqn_eval.py b/docs/source/examples/gym/gym_cartpole_dqn_eval.py index 6360b1c7..1cd9c90a 100644 --- a/docs/source/examples/gym/gym_cartpole_dqn_eval.py +++ b/docs/source/examples/gym/gym_cartpole_dqn_eval.py @@ -23,26 +23,23 @@ # Instantiate only the policy for evaluation. # DQN requires 2 models, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.dqn.html#spaces-and-models -models_dqn = {"q_network": deterministic_model(observation_space=env.observation_space, - action_space=env.action_space, - device=device, - clip_actions=False, - input_shape=Shape.OBSERVATIONS, - hiddens=[64, 64], - hidden_activation=["relu", "relu"], - output_shape=Shape.ACTIONS, - output_activation=None, - output_scale=1.0)} - -# load checkpoint -models_dqn["q_network"].load("./runs/22-02-06_19-19-56-857355_DQN/checkpoints/15000_q_network.pt") +models_dqn = {} +models_dqn["q_network"] = deterministic_model(observation_space=env.observation_space, + action_space=env.action_space, + device=device, + clip_actions=False, + input_shape=Shape.OBSERVATIONS, + hiddens=[64, 64], + hidden_activation=["relu", "relu"], + output_shape=Shape.ACTIONS, + output_activation=None, + output_scale=1.0) # Configure and instantiate the agent. # Only modify some of the default configuration, visit its documentation to see all the options # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.dqn.html#configuration-and-hyperparameters cfg_dqn = DQN_DEFAULT_CONFIG.copy() -cfg_dqn["random_timesteps"] = 0 cfg_dqn["exploration"]["timesteps"] = 0 # # logging to TensorBoard each 1000 timesteps and ignore checkpoints cfg_dqn["experiment"]["write_interval"] = 1000 @@ -55,6 +52,9 @@ action_space=env.action_space, device=device) +# load checkpoint +agent_dqn.load("./runs/22-09-10_10-48-10-551426_DQN/checkpoints/best_agent.pt") + # Configure and instantiate the RL trainer cfg_trainer = {"timesteps": 50000, "headless": True} diff --git a/docs/source/examples/gym/gym_frozen_lake_q_learning.py b/docs/source/examples/gym/gym_frozen_lake_q_learning.py index 822cdbda..c3141f38 100644 --- a/docs/source/examples/gym/gym_frozen_lake_q_learning.py +++ b/docs/source/examples/gym/gym_frozen_lake_q_learning.py @@ -3,21 +3,23 @@ import torch # Import the skrl components to build the RL system -from skrl.models.torch import TabularModel +from skrl.models.torch import Model, TabularMixin from skrl.agents.torch.q_learning import Q_LEARNING, Q_LEARNING_DEFAULT_CONFIG from skrl.trainers.torch import SequentialTrainer from skrl.envs.torch import wrap_env -# Define the model (tabular models) for the Q-learning agent using a helper class -class EpilonGreedyPolicy(TabularModel): +# Define the model (tabular model) for the SARSA agent using mixin +class EpilonGreedyPolicy(TabularMixin, Model): def __init__(self, observation_space, action_space, device, num_envs=1, epsilon=0.1): - super().__init__(observation_space, action_space, device, num_envs) + Model.__init__(self, observation_space, action_space, device) + TabularMixin.__init__(self, num_envs) self.epsilon = epsilon - self.q_table = torch.ones((num_envs, self.num_observations, self.num_actions), dtype=torch.float32, device=self.device) + self.q_table = torch.ones((num_envs, self.num_observations, self.num_actions), + dtype=torch.float32, device=self.device) - def compute(self, states, taken_actions): + def compute(self, states, taken_actions, role): actions = 
torch.argmax(self.q_table[torch.arange(self.num_envs).view(-1, 1), states], dim=-1, keepdim=True).view(-1,1) @@ -44,21 +46,19 @@ def compute(self, states, taken_actions): # Instantiate the agent's models (table) # Q-learning requires 1 model, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.q_learning.html#spaces-and-models -models_q_learning = {"policy": EpilonGreedyPolicy(env.observation_space, env.action_space, device, \ - num_envs=env.num_envs, epsilon=0.1)} +models_q_learning = {} +models_q_learning["policy"] = EpilonGreedyPolicy(env.observation_space, env.action_space, device, num_envs=env.num_envs, epsilon=0.1) # Configure and instantiate the agent. # Only modify some of the default configuration, visit its documentation to see all the options # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.q_learning.html#configuration-and-hyperparameters cfg_q_learning = Q_LEARNING_DEFAULT_CONFIG.copy() -cfg_q_learning["random_timesteps"] = 0 -cfg_q_learning["learning_starts"] = 0 cfg_q_learning["discount_factor"] = 0.999 cfg_q_learning["alpha"] = 0.4 -# logging to TensorBoard and write checkpoints each 1000 and 5000 timesteps respectively -cfg_q_learning["experiment"]["write_interval"] = 1000 -cfg_q_learning["experiment"]["checkpoint_interval"] = 5000 +# logging to TensorBoard and write checkpoints each 1600 and 8000 timesteps respectively +cfg_q_learning["experiment"]["write_interval"] = 1600 +cfg_q_learning["experiment"]["checkpoint_interval"] = 8000 agent_q_learning = Q_LEARNING(models=models_q_learning, memory=None, diff --git a/docs/source/examples/gym/gym_frozen_lake_q_learning_eval.py index c600516f..bfdc9452 100644 --- a/docs/source/examples/gym/gym_frozen_lake_q_learning_eval.py +++ b/docs/source/examples/gym/gym_frozen_lake_q_learning_eval.py @@ -3,21 +3,23 @@ import torch # Import the skrl components to build the RL system -from skrl.models.torch import TabularModel +from skrl.models.torch import Model, TabularMixin from skrl.agents.torch.q_learning import Q_LEARNING, Q_LEARNING_DEFAULT_CONFIG from skrl.trainers.torch import SequentialTrainer from skrl.envs.torch import wrap_env -# Define the model (tabular models) for the Q-learning agent using a helper class -class EpilonGreedyPolicy(TabularModel): +# Define the model (tabular model) for the Q-learning agent using mixin +class EpilonGreedyPolicy(TabularMixin, Model): def __init__(self, observation_space, action_space, device, num_envs=1, epsilon=0.1): - super().__init__(observation_space, action_space, device, num_envs) + Model.__init__(self, observation_space, action_space, device) + TabularMixin.__init__(self, num_envs) self.epsilon = epsilon - self.q_table = torch.ones((num_envs, self.num_observations, self.num_actions), dtype=torch.float32, device=self.device) + self.q_table = torch.ones((num_envs, self.num_observations, self.num_actions), + dtype=torch.float32, device=self.device) - def compute(self, states, taken_actions): + def compute(self, states, taken_actions, role): actions = torch.argmax(self.q_table[torch.arange(self.num_envs).view(-1, 1), states], dim=-1, keepdim=True).view(-1,1) @@ -44,11 +46,8 @@ def compute(self, states, taken_actions): # Instantiate the agent's models (table) # Q-learning requires 1 model, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.q_learning.html#spaces-and-models -models_q_learning = {"policy": 
EpilonGreedyPolicy(env.observation_space, env.action_space, device, \ - num_envs=env.num_envs, epsilon=0.1)} - -# load checkpoint -models_q_learning["policy"].load("./runs/22-03-09_12-09-36-143036_Q_LEARNING/checkpoints/60000_policy.pt") +models_q_learning = {} +models_q_learning["policy"] = EpilonGreedyPolicy(env.observation_space, env.action_space, device, num_envs=env.num_envs, epsilon=0.1) # Configure and instantiate the agent. @@ -56,8 +55,8 @@ def compute(self, states, taken_actions): # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.q_learning.html#configuration-and-hyperparameters cfg_q_learning = Q_LEARNING_DEFAULT_CONFIG.copy() cfg_q_learning["random_timesteps"] = 0 -# logging to TensorBoard and write checkpoints each 1000 and ignore checkpoints -cfg_q_learning["experiment"]["write_interval"] = 1000 +# logging to TensorBoard and write checkpoints each 1600 and ignore checkpoints +cfg_q_learning["experiment"]["write_interval"] = 1600 cfg_q_learning["experiment"]["checkpoint_interval"] = 0 agent_q_learning = Q_LEARNING(models=models_q_learning, @@ -67,6 +66,9 @@ def compute(self, states, taken_actions): action_space=env.action_space, device=device) +# load checkpoint +agent_q_learning.load("./runs/22-09-10_17-54-20-381109_Q_LEARNING/checkpoints/best_agent.pt") + # Configure and instantiate the RL trainer cfg_trainer = {"timesteps": 80000, "headless": True} diff --git a/docs/source/examples/gym/gym_pendulum_ddpg.py b/docs/source/examples/gym/gym_pendulum_ddpg.py index 391472ec..7ac71623 100644 --- a/docs/source/examples/gym/gym_pendulum_ddpg.py +++ b/docs/source/examples/gym/gym_pendulum_ddpg.py @@ -5,7 +5,7 @@ import torch.nn.functional as F # Import the skrl components to build the RL system -from skrl.models.torch import DeterministicModel +from skrl.models.torch import Model, DeterministicMixin from skrl.memories.torch import RandomMemory from skrl.agents.torch.ddpg import DDPG, DDPG_DEFAULT_CONFIG from skrl.resources.noises.torch import OrnsteinUhlenbeckNoise @@ -13,35 +13,36 @@ from skrl.envs.torch import wrap_env -# Define the models (deterministic models) for the DDPG agent using a helper class -# and programming with two approaches (layer by layer and torch.nn.Sequential class). 
+# Define the models (deterministic models) for the DDPG agent using mixin # - Actor (policy): takes as input the environment's observation/state and returns an action # - Critic: takes the state and action as input and provides a value to guide the policy -class DeterministicActor(DeterministicModel): - def __init__(self, observation_space, action_space, device, clip_actions = False): - super().__init__(observation_space, action_space, device, clip_actions) +class DeterministicActor(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) self.linear_layer_1 = nn.Linear(self.num_observations, 400) self.linear_layer_2 = nn.Linear(400, 300) self.action_layer = nn.Linear(300, self.num_actions) - def compute(self, states, taken_actions): + def compute(self, states, taken_actions, role): x = F.relu(self.linear_layer_1(states)) x = F.relu(self.linear_layer_2(x)) return 2 * torch.tanh(self.action_layer(x)) # Pendulum-v1 action_space is -2 to 2 -class DeterministicCritic(DeterministicModel): - def __init__(self, observation_space, action_space, device, clip_actions = False): - super().__init__(observation_space, action_space, device, clip_actions) +class DeterministicCritic(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) - self.net = nn.Sequential(nn.Linear(self.num_observations + self.num_actions, 400), - nn.ReLU(), - nn.Linear(400, 300), - nn.ReLU(), - nn.Linear(300, 1)) + self.linear_layer_1 = nn.Linear(self.num_observations + self.num_actions, 400) + self.linear_layer_2 = nn.Linear(400, 300) + self.linear_layer_3 = nn.Linear(300, 1) - def compute(self, states, taken_actions): - return self.net(torch.cat([states, taken_actions], dim=1)) + def compute(self, states, taken_actions, role): + x = F.relu(self.linear_layer_1(torch.cat([states, taken_actions], dim=1))) + x = F.relu(self.linear_layer_2(x)) + return self.linear_layer_3(x) # Load and wrap the Gym environment. @@ -64,10 +65,11 @@ def compute(self, states, taken_actions): # Instantiate the agent's models (function approximators). 
# DDPG requires 4 models, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ddpg.html#spaces-and-models -models_ddpg = {"policy": DeterministicActor(env.observation_space, env.action_space, device, clip_actions=True), - "target_policy": DeterministicActor(env.observation_space, env.action_space, device, clip_actions=True), - "critic": DeterministicCritic(env.observation_space, env.action_space, device), - "target_critic": DeterministicCritic(env.observation_space, env.action_space, device)} +models_ddpg = {} +models_ddpg["policy"] = DeterministicActor(env.observation_space, env.action_space, device) +models_ddpg["target_policy"] = DeterministicActor(env.observation_space, env.action_space, device) +models_ddpg["critic"] = DeterministicCritic(env.observation_space, env.action_space, device) +models_ddpg["target_critic"] = DeterministicCritic(env.observation_space, env.action_space, device) # Initialize the models' parameters (weights and biases) using a Gaussian distribution for model in models_ddpg.values(): @@ -82,9 +84,9 @@ def compute(self, states, taken_actions): cfg_ddpg["batch_size"] = 100 cfg_ddpg["random_timesteps"] = 100 cfg_ddpg["learning_starts"] = 100 -# logging to TensorBoard and write checkpoints each 1000 and 1000 timesteps respectively -cfg_ddpg["experiment"]["write_interval"] = 1000 -cfg_ddpg["experiment"]["checkpoint_interval"] = 1000 +# logging to TensorBoard and write checkpoints each 300 and 1500 timesteps respectively +cfg_ddpg["experiment"]["write_interval"] = 300 +cfg_ddpg["experiment"]["checkpoint_interval"] = 1500 agent_ddpg = DDPG(models=models_ddpg, memory=memory, diff --git a/docs/source/examples/gym/gym_pendulum_ddpg_eval.py b/docs/source/examples/gym/gym_pendulum_ddpg_eval.py index 7b2628ea..350cc85c 100644 --- a/docs/source/examples/gym/gym_pendulum_ddpg_eval.py +++ b/docs/source/examples/gym/gym_pendulum_ddpg_eval.py @@ -5,22 +5,23 @@ import torch.nn.functional as F # Import the skrl components to build the RL system -from skrl.models.torch import DeterministicModel +from skrl.models.torch import Model, DeterministicMixin from skrl.agents.torch.ddpg import DDPG, DDPG_DEFAULT_CONFIG from skrl.trainers.torch import SequentialTrainer from skrl.envs.torch import wrap_env # Define only the policy for evaluation -class DeterministicActor(DeterministicModel): - def __init__(self, observation_space, action_space, device, clip_actions = False): - super().__init__(observation_space, action_space, device, clip_actions) +class DeterministicActor(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) self.linear_layer_1 = nn.Linear(self.num_observations, 400) self.linear_layer_2 = nn.Linear(400, 300) self.action_layer = nn.Linear(300, self.num_actions) - def compute(self, states, taken_actions): + def compute(self, states, taken_actions, role): x = F.relu(self.linear_layer_1(states)) x = F.relu(self.linear_layer_2(x)) return 2 * torch.tanh(self.action_layer(x)) # Pendulum-v1 action_space is -2 to 2 @@ -42,10 +43,8 @@ def compute(self, states, taken_actions): # Instantiate the agent's policy. 
# DDPG requires 4 models, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ddpg.html#spaces-and-models -models_ddpg = {"policy": DeterministicActor(env.observation_space, env.action_space, device, clip_actions=True)} - -# load checkpoint -models_ddpg["policy"].load("./runs/22-02-06_19-23-31-556859_DDPG/checkpoints/36000_policy.pt") +models_ddpg = {} +models_ddpg["policy"] = DeterministicActor(env.observation_space, env.action_space, device) # Configure and instantiate the agent. @@ -53,8 +52,8 @@ def compute(self, states, taken_actions): # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ddpg.html#configuration-and-hyperparameters cfg_ddpg = DDPG_DEFAULT_CONFIG.copy() cfg_ddpg["random_timesteps"] = 0 -# logging to TensorBoard each 1000 timesteps and ignore checkpoints -cfg_ddpg["experiment"]["write_interval"] = 1000 +# logging to TensorBoard each 300 timesteps and ignore checkpoints +cfg_ddpg["experiment"]["write_interval"] = 300 cfg_ddpg["experiment"]["checkpoint_interval"] = 0 agent_ddpg = DDPG(models=models_ddpg, @@ -64,6 +63,9 @@ def compute(self, states, taken_actions): action_space=env.action_space, device=device) +# load checkpoint +agent_ddpg.load("./runs/22-09-10_11-02-46-773796_DDPG/checkpoints/agent_15000.pt") + # Configure and instantiate the RL trainer cfg_trainer = {"timesteps": 15000, "headless": True} diff --git a/docs/source/examples/gym/gym_taxi_sarsa.py b/docs/source/examples/gym/gym_taxi_sarsa.py index 99f43f12..5f7ba8dd 100644 --- a/docs/source/examples/gym/gym_taxi_sarsa.py +++ b/docs/source/examples/gym/gym_taxi_sarsa.py @@ -3,21 +3,23 @@ import torch # Import the skrl components to build the RL system -from skrl.models.torch import TabularModel +from skrl.models.torch import Model, TabularMixin from skrl.agents.torch.sarsa import SARSA, SARSA_DEFAULT_CONFIG from skrl.trainers.torch import SequentialTrainer from skrl.envs.torch import wrap_env -# Define the model (tabular models) for the SARSA agent using a helper class -class EpilonGreedyPolicy(TabularModel): +# Define the model (tabular model) for the SARSA agent using mixin +class EpilonGreedyPolicy(TabularMixin, Model): def __init__(self, observation_space, action_space, device, num_envs=1, epsilon=0.1): - super().__init__(observation_space, action_space, device, num_envs) + Model.__init__(self, observation_space, action_space, device) + TabularMixin.__init__(self, num_envs) self.epsilon = epsilon - self.q_table = torch.ones((num_envs, self.num_observations, self.num_actions), dtype=torch.float32, device=self.device) + self.q_table = torch.ones((num_envs, self.num_observations, self.num_actions), + dtype=torch.float32, device=self.device) - def compute(self, states, taken_actions): + def compute(self, states, taken_actions, role): actions = torch.argmax(self.q_table[torch.arange(self.num_envs).view(-1, 1), states], dim=-1, keepdim=True).view(-1,1) @@ -44,21 +46,19 @@ def compute(self, states, taken_actions): # Instantiate the agent's models (table) # SARSA requires 1 model, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.sarsa.html#spaces-and-models -models_sarsa = {"policy": EpilonGreedyPolicy(env.observation_space, env.action_space, device, \ - num_envs=env.num_envs, epsilon=0.1)} +models_sarsa = {} +models_sarsa["policy"] = EpilonGreedyPolicy(env.observation_space, env.action_space, device, num_envs=env.num_envs, epsilon=0.1) # Configure and instantiate the agent. 
# Only modify some of the default configuration, visit its documentation to see all the options # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.sarsa.html#configuration-and-hyperparameters cfg_sarsa = SARSA_DEFAULT_CONFIG.copy() -cfg_sarsa["random_timesteps"] = 0 -cfg_sarsa["learning_starts"] = 0 cfg_sarsa["discount_factor"] = 0.999 cfg_sarsa["alpha"] = 0.4 -# logging to TensorBoard and write checkpoints each 1000 and 5000 timesteps respectively -cfg_sarsa["experiment"]["write_interval"] = 1000 -cfg_sarsa["experiment"]["checkpoint_interval"] = 5000 +# logging to TensorBoard and write checkpoints each 1600 and 8000 timesteps respectively +cfg_sarsa["experiment"]["write_interval"] = 1600 +cfg_sarsa["experiment"]["checkpoint_interval"] = 8000 agent_sarsa = SARSA(models=models_sarsa, memory=None, diff --git a/docs/source/examples/gym/gym_taxi_sarsa_eval.py b/docs/source/examples/gym/gym_taxi_sarsa_eval.py index 6cc8232d..4cd64ac3 100644 --- a/docs/source/examples/gym/gym_taxi_sarsa_eval.py +++ b/docs/source/examples/gym/gym_taxi_sarsa_eval.py @@ -3,21 +3,23 @@ import torch # Import the skrl components to build the RL system -from skrl.models.torch import TabularModel +from skrl.models.torch import Model, TabularMixin from skrl.agents.torch.sarsa import SARSA, SARSA_DEFAULT_CONFIG from skrl.trainers.torch import SequentialTrainer from skrl.envs.torch import wrap_env # Define the model (tabular models) for the SARSA agent using a helper class -class EpilonGreedyPolicy(TabularModel): +class EpilonGreedyPolicy(TabularMixin, Model): def __init__(self, observation_space, action_space, device, num_envs=1, epsilon=0.1): - super().__init__(observation_space, action_space, device, num_envs) + Model.__init__(self, observation_space, action_space, device) + TabularMixin.__init__(self, num_envs) self.epsilon = epsilon - self.q_table = torch.ones((num_envs, self.num_observations, self.num_actions), dtype=torch.float32, device=self.device) + self.q_table = torch.ones((num_envs, self.num_observations, self.num_actions), + dtype=torch.float32, device=self.device) - def compute(self, states, taken_actions): + def compute(self, states, taken_actions, role): actions = torch.argmax(self.q_table[torch.arange(self.num_envs).view(-1, 1), states], dim=-1, keepdim=True).view(-1,1) @@ -44,11 +46,8 @@ def compute(self, states, taken_actions): # Instantiate the agent's models (table) # SARSA requires 1 model, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.sarsa.html#spaces-and-models -models_sarsa = {"policy": EpilonGreedyPolicy(env.observation_space, env.action_space, device, \ - num_envs=env.num_envs, epsilon=0.1)} - -# load checkpoint -models_sarsa["policy"].load("./runs/22-03-09_12-20-00-224006_SARSA/checkpoints/80000_policy.pt") +models_sarsa = {} +models_sarsa["policy"] = EpilonGreedyPolicy(env.observation_space, env.action_space, device, num_envs=env.num_envs, epsilon=0.1) # Configure and instantiate the agent. 
@@ -56,8 +55,8 @@ def compute(self, states, taken_actions): # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.sarsa.html#configuration-and-hyperparameters cfg_sarsa = SARSA_DEFAULT_CONFIG.copy() cfg_sarsa["random_timesteps"] = 0 -# logging to TensorBoard and write checkpoints each 1000 and ignore checkpoints -cfg_sarsa["experiment"]["write_interval"] = 1000 +# logging to TensorBoard and write checkpoints each 1600 and ignore checkpoints +cfg_sarsa["experiment"]["write_interval"] = 1600 cfg_sarsa["experiment"]["checkpoint_interval"] = 0 agent_sarsa = SARSA(models=models_sarsa, @@ -67,6 +66,9 @@ def compute(self, states, taken_actions): action_space=env.action_space, device=device) +# load checkpoint +agent_sarsa.load("./runs/22-09-10_13-13-41-011999_SARSA/checkpoints/agent_80000.pt") + # Configure and instantiate the RL trainer cfg_trainer = {"timesteps": 80000, "headless": True} diff --git a/docs/source/examples/gym/gym_vector_cartpole_dqn.py b/docs/source/examples/gym/gym_vector_cartpole_dqn.py index 1d737299..8e1ea656 100644 --- a/docs/source/examples/gym/gym_vector_cartpole_dqn.py +++ b/docs/source/examples/gym/gym_vector_cartpole_dqn.py @@ -22,32 +22,33 @@ # Instantiate a RandomMemory (without replacement) as experience replay memory -memory = RandomMemory(memory_size=100000, num_envs=env.num_envs, device=device, replacement=False) +memory = RandomMemory(memory_size=200000, num_envs=env.num_envs, device=device, replacement=False) # Instantiate the agent's models (function approximators) using the model instantiator utility # DQN requires 2 models, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.dqn.html#spaces-and-models -models_dqn = {"q_network": deterministic_model(observation_space=env.observation_space, - action_space=env.action_space, - device=device, - clip_actions=False, - input_shape=Shape.OBSERVATIONS, - hiddens=[64, 64], - hidden_activation=["relu", "relu"], - output_shape=Shape.ACTIONS, - output_activation=None, - output_scale=1.0), - "target_q_network": deterministic_model(observation_space=env.observation_space, - action_space=env.action_space, - device=device, - clip_actions=False, - input_shape=Shape.OBSERVATIONS, - hiddens=[64, 64], - hidden_activation=["relu", "relu"], - output_shape=Shape.ACTIONS, - output_activation=None, - output_scale=1.0)} +models_dqn = {} +models_dqn["q_network"] = deterministic_model(observation_space=env.observation_space, + action_space=env.action_space, + device=device, + clip_actions=False, + input_shape=Shape.OBSERVATIONS, + hiddens=[64, 64], + hidden_activation=["relu", "relu"], + output_shape=Shape.ACTIONS, + output_activation=None, + output_scale=1.0) +models_dqn["target_q_network"] = deterministic_model(observation_space=env.observation_space, + action_space=env.action_space, + device=device, + clip_actions=False, + input_shape=Shape.OBSERVATIONS, + hiddens=[64, 64], + hidden_activation=["relu", "relu"], + output_shape=Shape.ACTIONS, + output_activation=None, + output_scale=1.0) # Initialize the models' parameters (weights and biases) using a Gaussian distribution for model in models_dqn.values(): @@ -58,7 +59,6 @@ # Only modify some of the default configuration, visit its documentation to see all the options # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.dqn.html#configuration-and-hyperparameters cfg_dqn = DQN_DEFAULT_CONFIG.copy() -cfg_dqn["random_timesteps"] = 0 cfg_dqn["learning_starts"] = 100 cfg_dqn["exploration"]["final_epsilon"] = 0.04 
cfg_dqn["exploration"]["timesteps"] = 1500 diff --git a/docs/source/examples/gym/gym_vector_frozen_lake_q_learning.py b/docs/source/examples/gym/gym_vector_frozen_lake_q_learning.py index 04b6d827..efc912be 100644 --- a/docs/source/examples/gym/gym_vector_frozen_lake_q_learning.py +++ b/docs/source/examples/gym/gym_vector_frozen_lake_q_learning.py @@ -3,21 +3,23 @@ import torch # Import the skrl components to build the RL system -from skrl.models.torch import TabularModel +from skrl.models.torch import Model, TabularMixin from skrl.agents.torch.q_learning import Q_LEARNING, Q_LEARNING_DEFAULT_CONFIG from skrl.trainers.torch import SequentialTrainer from skrl.envs.torch import wrap_env -# Define the model (tabular models) for the Q-learning agent using a helper class -class EpilonGreedyPolicy(TabularModel): +# Define the model (tabular model) for the SARSA agent using mixin +class EpilonGreedyPolicy(TabularMixin, Model): def __init__(self, observation_space, action_space, device, num_envs=1, epsilon=0.1): - super().__init__(observation_space, action_space, device, num_envs) + Model.__init__(self, observation_space, action_space, device) + TabularMixin.__init__(self, num_envs) self.epsilon = epsilon - self.q_table = torch.ones((num_envs, self.num_observations, self.num_actions), dtype=torch.float32, device=self.device) + self.q_table = torch.ones((num_envs, self.num_observations, self.num_actions), + dtype=torch.float32, device=self.device) - def compute(self, states, taken_actions): + def compute(self, states, taken_actions, role): actions = torch.argmax(self.q_table[torch.arange(self.num_envs).view(-1, 1), states], dim=-1, keepdim=True).view(-1,1) @@ -31,40 +33,32 @@ def compute(self, states, taken_actions): # Load and wrap the Gym environment. # Note: the environment version may change depending on the gym version try: - env = gym.vector.make("FrozenLake-v0", num_envs=5, asynchronous=False) + env = gym.vector.make("FrozenLake-v0", num_envs=10, asynchronous=False) except gym.error.DeprecatedEnv as e: env_id = [spec.id for spec in gym.envs.registry.all() if spec.id.startswith("FrozenLake-v")][0] print("FrozenLake-v0 not found. Trying {}".format(env_id)) - env = gym.vector.make(env_id, num_envs=5, asynchronous=False) + env = gym.vector.make(env_id, num_envs=10, asynchronous=False) env = wrap_env(env) -print(env.num_envs) -print(env.action_space) -print(env.observation_space) -print(env._env.action_space) -print(env._env.observation_space) -# exit() device = env.device # Instantiate the agent's models (table) # Q-learning requires 1 model, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.q_learning.html#spaces-and-models -models_q_learning = {"policy": EpilonGreedyPolicy(env.observation_space, env.action_space, device, \ - num_envs=env.num_envs, epsilon=0.1)} +models_q_learning = {} +models_q_learning["policy"] = EpilonGreedyPolicy(env.observation_space, env.action_space, device, num_envs=env.num_envs, epsilon=0.1) # Configure and instantiate the agent. 
# Only modify some of the default configuration, visit its documentation to see all the options # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.q_learning.html#configuration-and-hyperparameters cfg_q_learning = Q_LEARNING_DEFAULT_CONFIG.copy() -cfg_q_learning["random_timesteps"] = 0 -cfg_q_learning["learning_starts"] = 0 cfg_q_learning["discount_factor"] = 0.999 cfg_q_learning["alpha"] = 0.4 -# logging to TensorBoard and write checkpoints each 1000 and 5000 timesteps respectively -cfg_q_learning["experiment"]["write_interval"] = 1000 -cfg_q_learning["experiment"]["checkpoint_interval"] = 5000 +# logging to TensorBoard and write checkpoints each 1600 and 8000 timesteps respectively +cfg_q_learning["experiment"]["write_interval"] = 1600 +cfg_q_learning["experiment"]["checkpoint_interval"] = 8000 agent_q_learning = Q_LEARNING(models=models_q_learning, memory=None, diff --git a/docs/source/examples/gym/gym_vector_pendulum_ddpg.py b/docs/source/examples/gym/gym_vector_pendulum_ddpg.py index 9580340a..c6bf729f 100644 --- a/docs/source/examples/gym/gym_vector_pendulum_ddpg.py +++ b/docs/source/examples/gym/gym_vector_pendulum_ddpg.py @@ -5,7 +5,7 @@ import torch.nn.functional as F # Import the skrl components to build the RL system -from skrl.models.torch import DeterministicModel +from skrl.models.torch import Model, DeterministicMixin from skrl.memories.torch import RandomMemory from skrl.agents.torch.ddpg import DDPG, DDPG_DEFAULT_CONFIG from skrl.resources.noises.torch import OrnsteinUhlenbeckNoise @@ -13,35 +13,36 @@ from skrl.envs.torch import wrap_env -# Define the models (deterministic models) for the DDPG agent using a helper class -# and programming with two approaches (layer by layer and torch.nn.Sequential class). +# Define the models (deterministic models) for the DDPG agent using mixin # - Actor (policy): takes as input the environment's observation/state and returns an action # - Critic: takes the state and action as input and provides a value to guide the policy -class DeterministicActor(DeterministicModel): - def __init__(self, observation_space, action_space, device, clip_actions = False): - super().__init__(observation_space, action_space, device, clip_actions) +class DeterministicActor(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) self.linear_layer_1 = nn.Linear(self.num_observations, 400) self.linear_layer_2 = nn.Linear(400, 300) self.action_layer = nn.Linear(300, self.num_actions) - def compute(self, states, taken_actions): + def compute(self, states, taken_actions, role): x = F.relu(self.linear_layer_1(states)) x = F.relu(self.linear_layer_2(x)) return 2 * torch.tanh(self.action_layer(x)) # Pendulum-v1 action_space is -2 to 2 -class DeterministicCritic(DeterministicModel): - def __init__(self, observation_space, action_space, device, clip_actions = False): - super().__init__(observation_space, action_space, device, clip_actions) +class DeterministicCritic(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) - self.net = nn.Sequential(nn.Linear(self.num_observations + self.num_actions, 400), - nn.ReLU(), - nn.Linear(400, 300), - nn.ReLU(), - nn.Linear(300, 1)) + self.linear_layer_1 = nn.Linear(self.num_observations + 
self.num_actions, 400) + self.linear_layer_2 = nn.Linear(400, 300) + self.linear_layer_3 = nn.Linear(300, 1) - def compute(self, states, taken_actions): - return self.net(torch.cat([states, taken_actions], dim=1)) + def compute(self, states, taken_actions, role): + x = F.relu(self.linear_layer_1(torch.cat([states, taken_actions], dim=1))) + x = F.relu(self.linear_layer_2(x)) + return self.linear_layer_3(x) # Load and wrap the Gym environment. @@ -58,16 +59,17 @@ def compute(self, states, taken_actions): # Instantiate a RandomMemory (without replacement) as experience replay memory -memory = RandomMemory(memory_size=15000, num_envs=env.num_envs, device=device, replacement=False) +memory = RandomMemory(memory_size=100000, num_envs=env.num_envs, device=device, replacement=False) # Instantiate the agent's models (function approximators). # DDPG requires 4 models, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ddpg.html#spaces-and-models -models_ddpg = {"policy": DeterministicActor(env.observation_space, env.action_space, device, clip_actions=True), - "target_policy": DeterministicActor(env.observation_space, env.action_space, device, clip_actions=True), - "critic": DeterministicCritic(env.observation_space, env.action_space, device), - "target_critic": DeterministicCritic(env.observation_space, env.action_space, device)} +models_ddpg = {} +models_ddpg["policy"] = DeterministicActor(env.observation_space, env.action_space, device) +models_ddpg["target_policy"] = DeterministicActor(env.observation_space, env.action_space, device) +models_ddpg["critic"] = DeterministicCritic(env.observation_space, env.action_space, device) +models_ddpg["target_critic"] = DeterministicCritic(env.observation_space, env.action_space, device) # Initialize the models' parameters (weights and biases) using a Gaussian distribution for model in models_ddpg.values(): diff --git a/docs/source/examples/gym/gym_vector_taxi_sarsa.py b/docs/source/examples/gym/gym_vector_taxi_sarsa.py index e12ce3eb..581489b2 100644 --- a/docs/source/examples/gym/gym_vector_taxi_sarsa.py +++ b/docs/source/examples/gym/gym_vector_taxi_sarsa.py @@ -3,21 +3,23 @@ import torch # Import the skrl components to build the RL system -from skrl.models.torch import TabularModel +from skrl.models.torch import Model, TabularMixin from skrl.agents.torch.sarsa import SARSA, SARSA_DEFAULT_CONFIG from skrl.trainers.torch import SequentialTrainer from skrl.envs.torch import wrap_env -# Define the model (tabular models) for the SARSA agent using a helper class -class EpilonGreedyPolicy(TabularModel): +# Define the model (tabular model) for the SARSA agent using mixin +class EpilonGreedyPolicy(TabularMixin, Model): def __init__(self, observation_space, action_space, device, num_envs=1, epsilon=0.1): - super().__init__(observation_space, action_space, device, num_envs) + Model.__init__(self, observation_space, action_space, device) + TabularMixin.__init__(self, num_envs) self.epsilon = epsilon - self.q_table = torch.ones((num_envs, self.num_observations, self.num_actions), dtype=torch.float32, device=self.device) + self.q_table = torch.ones((num_envs, self.num_observations, self.num_actions), + dtype=torch.float32, device=self.device) - def compute(self, states, taken_actions): + def compute(self, states, taken_actions, role): actions = torch.argmax(self.q_table[torch.arange(self.num_envs).view(-1, 1), states], dim=-1, keepdim=True).view(-1,1) @@ -44,21 +46,19 @@ def compute(self, states, taken_actions): # 
Instantiate the agent's models (table) # SARSA requires 1 model, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.sarsa.html#spaces-and-models -models_sarsa = {"policy": EpilonGreedyPolicy(env.observation_space, env.action_space, device, \ - num_envs=env.num_envs, epsilon=0.1)} +models_sarsa = {} +models_sarsa["policy"] = EpilonGreedyPolicy(env.observation_space, env.action_space, device, num_envs=env.num_envs, epsilon=0.1) # Configure and instantiate the agent. # Only modify some of the default configuration, visit its documentation to see all the options # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.sarsa.html#configuration-and-hyperparameters cfg_sarsa = SARSA_DEFAULT_CONFIG.copy() -cfg_sarsa["random_timesteps"] = 0 -cfg_sarsa["learning_starts"] = 0 cfg_sarsa["discount_factor"] = 0.999 cfg_sarsa["alpha"] = 0.4 -# logging to TensorBoard and write checkpoints each 1000 and 5000 timesteps respectively -cfg_sarsa["experiment"]["write_interval"] = 1000 -cfg_sarsa["experiment"]["checkpoint_interval"] = 5000 +# logging to TensorBoard and write checkpoints each 1600 and 8000 timesteps respectively +cfg_sarsa["experiment"]["write_interval"] = 1600 +cfg_sarsa["experiment"]["checkpoint_interval"] = 8000 agent_sarsa = SARSA(models=models_sarsa, memory=None, From a2c66f7f3baf2d815bfb5b8076cfe005d835c606 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Sat, 10 Sep 2022 23:50:16 +0200 Subject: [PATCH 066/108] Update OpenAI Gym examples in docs --- docs/source/intro/examples.rst | 144 ++++++++++++++++++++------------- 1 file changed, 87 insertions(+), 57 deletions(-) diff --git a/docs/source/intro/examples.rst b/docs/source/intro/examples.rst index 02adee87..ee9de042 100644 --- a/docs/source/intro/examples.rst +++ b/docs/source/intro/examples.rst @@ -12,10 +12,10 @@ Examples
-Learning in a Gym environment (one agent, one environment) ----------------------------------------------------------- +Learning in an OpenAI Gym environment +------------------------------------- -This example performs the training of one agent in an OpenAI Gym environment +These examples perform the training of one agent in an OpenAI Gym environment (**one agent, one environment**) .. image:: ../_static/imgs/example_gym.png :width: 100% @@ -28,10 +28,10 @@ This example performs the training of one agent in an OpenAI Gym environment The following components or practices are exemplified (highlighted): - - Load and wrap an OpenAI Gym environment: **Pendulum (DDPG)** + - Load and wrap an OpenAI Gym environment: **Pendulum (DDPG)**, **CartPole (CEM)** - Instantiate models using the model instantiation utility: **CartPole (DQN)** - Create a tabular model (:math:`\epsilon`-greedy policy): **Taxi (SARSA)**, **FrozenLake (Q-Learning)** - - Load a checkpoint during evaluation: **Pendulum (DDPG)**, **CartPole (DQN)**, **Taxi (SARSA)**, **FrozenLake (Q-Learning)** + - Load a checkpoint during evaluation: **Pendulum (DDPG)**, **CartPole (CEM)**, **CartPole (DQN)**, **Taxi (SARSA)**, **FrozenLake (Q-Learning)** .. tabs:: @@ -39,106 +39,132 @@ The following components or practices are exemplified (highlighted): .. tabs:: - .. tab:: Training + .. group-tab:: Training - View the raw code: `gym_pendulum_ddpg.py `_ + :download:`gym_pendulum_ddpg.py <../examples/gym/gym_pendulum_ddpg.py>` .. literalinclude:: ../examples/gym/gym_pendulum_ddpg.py :language: python - :linenos: - :emphasize-lines: 1, 13, 49-55, 99 + :emphasize-lines: 1, 13, 50-56 - .. tab:: Evaluation + .. group-tab:: Evaluation - View the raw code: `gym_pendulum_ddpg_eval.py `_ + :download:`gym_pendulum_ddpg_eval.py <../examples/gym/gym_pendulum_ddpg_eval.py>` **Note:** It is necessary to adjust the checkpoint path according to the directories generated by the new experiments + **Note:** Warnings such as :literal:`[skrl:WARNING] Cannot load the module. The agent doesn't have such an instance` can be ignored without problems. The reason for this is that during the evaluation, not all components such as optimizers or other models apart from the policy are defined + .. literalinclude:: ../examples/gym/gym_pendulum_ddpg_eval.py :language: python - :linenos: - :emphasize-lines: 45, 48, 73 + :emphasize-lines: 67 + + .. tab:: CartPole (CEM) + + .. tabs:: + + .. group-tab:: Training + + :download:`gym_cartpole_cem.py <../examples/gym/gym_cartpole_cem.py>` + + .. literalinclude:: ../examples/gym/gym_cartpole_cem.py + :language: python + :emphasize-lines: 1, 11, 33-39 + + .. group-tab:: Evaluation + + :download:`gym_cartpole_cem_eval.py <../examples/gym/gym_cartpole_cem_eval.py>` + + **Note:** It is necessary to adjust the checkpoint path according to the directories generated by the new experiments + + **Note:** Warnings such as :literal:`[skrl:WARNING] Cannot load the module. The agent doesn't have such an instance` can be ignored without problems. The reason for this is that during the evaluation, not all components such as optimizers or other models apart from the policy are defined + + .. literalinclude:: ../examples/gym/gym_cartpole_cem_eval.py + :language: python + :emphasize-lines: 68 .. tab:: CartPole (DQN) .. tabs:: - .. tab:: Training + .. group-tab:: Training - View the raw code: `gym_cartpole_dqn.py `_ + :download:`gym_cartpole_dqn.py <../examples/gym/gym_cartpole_dqn.py>` .. 
literalinclude:: ../examples/gym/gym_cartpole_dqn.py :language: python - :linenos: - :emphasize-lines: 4, 31-50, 69 + :emphasize-lines: 4, 31-51 - .. tab:: Evaluation + .. group-tab:: Evaluation - View the raw code: `gym_cartpole_dqn_eval.py `_ + :download:`gym_cartpole_dqn_eval.py <../examples/gym/gym_cartpole_dqn_eval.py>` **Note:** It is necessary to adjust the checkpoint path according to the directories generated by the new experiments + **Note:** Warnings such as :literal:`[skrl:WARNING] Cannot load the module. The agent doesn't have such an instance` can be ignored without problems. The reason for this is that during the evaluation, not all components such as optimizers or other models apart from the policy are defined + .. literalinclude:: ../examples/gym/gym_cartpole_dqn_eval.py :language: python - :linenos: - :emphasize-lines: 26-35, 38, 64 + :emphasize-lines: 56 .. tab:: Taxi (SARSA) .. tabs:: - .. tab:: Training + .. group-tab:: Training - View the raw code: `gym_taxi_sarsa.py `_ + :download:`gym_taxi_sarsa.py <../examples/gym/gym_taxi_sarsa.py>` .. literalinclude:: ../examples/gym/gym_taxi_sarsa.py :language: python - :linenos: - :emphasize-lines: 6, 13-28 + :emphasize-lines: 6, 13-30 - .. tab:: Evaluation + .. group-tab:: Evaluation - View the raw code: `gym_taxi_sarsa_eval.py `_ + :download:`gym_taxi_sarsa_eval.py <../examples/gym/gym_taxi_sarsa_eval.py>` **Note:** It is necessary to adjust the checkpoint path according to the directories generated by the new experiments + **Note:** Warnings such as :literal:`[skrl:WARNING] Cannot load the module. The agent doesn't have such an instance` can be ignored without problems. The reason for this is that during the evaluation, not all components such as optimizers or other models apart from the policy are defined + .. literalinclude:: ../examples/gym/gym_taxi_sarsa_eval.py :language: python - :linenos: - :emphasize-lines: 47-48, 51, 76 + :emphasize-lines: 70 .. tab:: FrozenLake (Q-learning) .. tabs:: - .. tab:: Training + .. group-tab:: Training - View the raw code: `gym_frozen_lake_q_learning.py `_ + :download:`gym_frozen_lake_q_learning.py <../examples/gym/gym_frozen_lake_q_learning.py>` .. literalinclude:: ../examples/gym/gym_frozen_lake_q_learning.py :language: python - :linenos: - :emphasize-lines: 6, 13-28 + :emphasize-lines: 6, 13-30 - .. tab:: Evaluation + .. group-tab:: Evaluation - View the raw code: `gym_frozen_lake_q_learning_eval.py `_ + :download:`gym_frozen_lake_q_learning_eval.py <../examples/gym/gym_frozen_lake_q_learning_eval.py>` **Note:** It is necessary to adjust the checkpoint path according to the directories generated by the new experiments + **Note:** Warnings such as :literal:`[skrl:WARNING] Cannot load the module. The agent doesn't have such an instance` can be ignored without problems. The reason for this is that during the evaluation, not all components such as optimizers or other models apart from the policy are defined + .. literalinclude:: ../examples/gym/gym_frozen_lake_q_learning_eval.py :language: python - :linenos: - :emphasize-lines: 47-48, 51, 76 + :emphasize-lines: 70 .. raw:: html
-Learning in a Gym vectorized environment (one agent, multiple environments) ---------------------------------------------------------------------------- +Learning in an OpenAI Gym vectorized environment +------------------------------------------------ -This example performs the training of one agent in an OpenAI Gym vectorized environment (multiple independent copies of the same environment in parallel). The following components or practices are exemplified (highlighted): +These examples perform the training of one agent in an OpenAI Gym vectorized environment (**one agent, multiple independent copies of the same environment in parallel**) + +The following components or practices are exemplified (highlighted): - Load and wrap an OpenAI Gym vectorized environment: **Pendulum (DDPG)**, **CartPole (DQN)**, **Taxi (SARSA)**, **FrozenLake (Q-Learning)** @@ -148,53 +174,49 @@ This example performs the training of one agent in an OpenAI Gym vectorized envi .. tabs:: - .. tab:: Training + .. group-tab:: Training - View the raw code: `gym_vector_pendulum_ddpg.py `_ + :download:`gym_vector_pendulum_ddpg.py <../examples/gym/gym_vector_pendulum_ddpg.py>` .. literalinclude:: ../examples/gym/gym_vector_pendulum_ddpg.py :language: python - :linenos: - :emphasize-lines: 1, 13, 49-55 + :emphasize-lines: 1, 13, 50-56 .. tab:: CartPole (DQN) .. tabs:: - .. tab:: Training + .. group-tab:: Training - View the raw code: `gym_vector_cartpole_dqn.py `_ + :download:`gym_vector_cartpole_dqn.py <../examples/gym/gym_vector_cartpole_dqn.py>` .. literalinclude:: ../examples/gym/gym_vector_cartpole_dqn.py :language: python - :linenos: :emphasize-lines: 1, 8, 13-19 .. tab:: Taxi (SARSA) .. tabs:: - .. tab:: Training + .. group-tab:: Training - View the raw code: `gym_vector_taxi_sarsa.py `_ + :download:`gym_vector_taxi_sarsa.py <../examples/gym/gym_vector_taxi_sarsa.py>` .. literalinclude:: ../examples/gym/gym_vector_taxi_sarsa.py :language: python - :linenos: - :emphasize-lines: 1, 9, 33-39 + :emphasize-lines: 1, 9, 35-41 .. tab:: FrozenLake (Q-learning) .. tabs:: - .. tab:: Training + .. group-tab:: Training - View the raw code: `gym_vector_frozen_lake_q_learning.py `_ + :download:`gym_vector_frozen_lake_q_learning.py <../examples/gym/gym_vector_frozen_lake_q_learning.py>` .. literalinclude:: ../examples/gym/gym_vector_frozen_lake_q_learning.py :language: python - :linenos: - :emphasize-lines: 1, 9, 33-39 + :emphasize-lines: 1, 9, 35-41 .. raw:: html @@ -225,7 +247,7 @@ The following components or practices are exemplified (highlighted): .. tabs:: - .. tab:: Training + .. group-tab:: Training View the raw code: `dm_suite_cartpole_swingup_ddpg.py `_ @@ -238,7 +260,7 @@ The following components or practices are exemplified (highlighted): .. tabs:: - .. tab:: Training + .. group-tab:: Training View the raw code: `dm_manipulation_stack_sac.py `_ @@ -445,6 +467,8 @@ The PPO agent configuration is mapped, as far as possible, from the rl_games' A2 **Note:** It is necessary to adjust the checkpoint path according to the directories generated by the new experiments + **Note:** Warnings such as :literal:`[skrl:WARNING] Cannot load the module. The agent doesn't have such an instance` can be ignored without problems. The reason for this is that during the evaluation, not all components such as optimizers or other models apart from the policy are defined + .. 
literalinclude:: ../examples/isaacgym/ppo_cartpole_eval.py :language: python :linenos: @@ -506,6 +530,8 @@ The following components or practices are exemplified (highlighted): **Note:** It is necessary to adjust the checkpoint path according to the directories generated by the new experiments + **Note:** Warnings such as :literal:`[skrl:WARNING] Cannot load the module. The agent doesn't have such an instance` can be ignored without problems. The reason for this is that during the evaluation, not all components such as optimizers or other models apart from the policy are defined + .. literalinclude:: ../examples/isaacgym/isaacgym_sequential_shared_memory_eval.py :language: python :linenos: @@ -539,6 +565,8 @@ The following components or practices are exemplified (highlighted): **Note:** It is necessary to adjust the checkpoint path according to the directories generated by the new experiments + **Note:** Warnings such as :literal:`[skrl:WARNING] Cannot load the module. The agent doesn't have such an instance` can be ignored without problems. The reason for this is that during the evaluation, not all components such as optimizers or other models apart from the policy are defined + .. literalinclude:: ../examples/isaacgym/isaacgym_sequential_no_shared_memory_eval.py :language: python :linenos: @@ -550,6 +578,8 @@ The following components or practices are exemplified (highlighted): **Note:** It is necessary to adjust the checkpoint path according to the directories generated by the new experiments + **Note:** Warnings such as :literal:`[skrl:WARNING] Cannot load the module. The agent doesn't have such an instance` can be ignored without problems. The reason for this is that during the evaluation, not all components such as optimizers or other models apart from the policy are defined + .. literalinclude:: ../examples/isaacgym/isaacgym_parallel_no_shared_memory_eval.py :language: python :linenos: From 92a8fdeeb435bb6ab1d01891b315373eca47008b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Sun, 11 Sep 2022 00:11:21 +0200 Subject: [PATCH 067/108] Fix vectorized Discrete action space conversion --- skrl/envs/torch/wrappers.py | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/skrl/envs/torch/wrappers.py b/skrl/envs/torch/wrappers.py index 2ab26175..5f903e58 100644 --- a/skrl/envs/torch/wrappers.py +++ b/skrl/envs/torch/wrappers.py @@ -352,13 +352,15 @@ def _tensor_to_action(self, actions: torch.Tensor) -> Any: if isinstance(space, gym.spaces.MultiDiscrete): return np.array(actions.cpu().numpy(), dtype=space.dtype).reshape(space.shape) elif isinstance(space, gym.spaces.Tuple): - return np.array(actions.cpu().numpy(), dtype=space[0].dtype).reshape(space.shape) + if isinstance(space[0], gym.spaces.Box): + return np.array(actions.cpu().numpy(), dtype=space[0].dtype).reshape(space.shape) + elif isinstance(space[0], gym.spaces.Discrete): + return np.array(actions.cpu().numpy(), dtype=space[0].dtype).reshape(-1) elif isinstance(space, gym.spaces.Discrete): return actions.item() elif isinstance(space, gym.spaces.Box): return np.array(actions.cpu().numpy(), dtype=space.dtype).reshape(space.shape) - else: - raise ValueError("Action space type {} not supported. Please report this issue".format(type(space))) + raise ValueError("Action space type {} not supported. 
Please report this issue".format(type(space))) def step(self, actions: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, Any]: """Perform a step in the environment @@ -608,53 +610,53 @@ def wrap_env(env: Any, wrapper: str = "auto", verbose: bool = True) -> Wrapper: :rtype: Wrapper """ if verbose: - print("[INFO] Environment:", [str(base).replace("", "") \ - for base in env.__class__.__bases__]) + logger.info("Environment class: {}".format(", ".join([str(base).replace("", "") \ + for base in env.__class__.__bases__]))) if wrapper == "auto": base_classes = [str(base) for base in env.__class__.__bases__] if "" in base_classes or \ "" in base_classes: if verbose: - print("[INFO] Wrapper: Omniverse Isaac Gym") + logger.info("Environment wrapper: Omniverse Isaac Gym") return OmniverseIsaacGymWrapper(env) elif isinstance(env, gym.core.Env) or isinstance(env, gym.core.Wrapper): if verbose: - print("[INFO] Wrapper: Gym") + logger.info("Environment wrapper: Gym") return GymWrapper(env) elif "" in base_classes: if verbose: - print("[INFO] Wrapper: DeepMind") + logger.info("Environment wrapper: DeepMind") return DeepMindWrapper(env) elif "" in base_classes: if verbose: - print("[INFO] Wrapper: Isaac Gym (preview 2)") + logger.info("Environment wrapper: Isaac Gym (preview 2)") return IsaacGymPreview2Wrapper(env) if verbose: - print("[INFO] Wrapper: Isaac Gym (preview 3/4)") + logger.info("Environment wrapper: Isaac Gym (preview 3/4)") return IsaacGymPreview3Wrapper(env) # preview 4 is the same as 3 elif wrapper == "gym": if verbose: - print("[INFO] Wrapper: Gym") + logger.info("Environment wrapper: Gym") return GymWrapper(env) elif wrapper == "dm": if verbose: - print("[INFO] Wrapper: DeepMind") + logger.info("Environment wrapper: DeepMind") return DeepMindWrapper(env) elif wrapper == "isaacgym-preview2": if verbose: - print("[INFO] Wrapper: Isaac Gym (preview 2)") + logger.info("Environment wrapper: Isaac Gym (preview 2)") return IsaacGymPreview2Wrapper(env) elif wrapper == "isaacgym-preview3": if verbose: - print("[INFO] Wrapper: Isaac Gym (preview 3)") + logger.info("Environment wrapper: Isaac Gym (preview 3)") return IsaacGymPreview3Wrapper(env) elif wrapper == "isaacgym-preview4": if verbose: - print("[INFO] Wrapper: Isaac Gym (preview 4)") + logger.info("Environment wrapper: Isaac Gym (preview 4)") return IsaacGymPreview3Wrapper(env) # preview 4 is the same as 3 elif wrapper == "omniverse-isaacgym": if verbose: - print("[INFO] Wrapper: Omniverse Isaac Gym") + logger.info("Environment wrapper: Omniverse Isaac Gym") return OmniverseIsaacGymWrapper(env) else: raise ValueError("Unknown {} wrapper type".format(wrapper)) From 37c08cd4e3f097d5bbc0167bf05611218241d784 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Sun, 11 Sep 2022 16:49:13 +0200 Subject: [PATCH 068/108] Update Omniverse Isaac Gym examples --- .../examples/omniisaacgym/ppo_allegro_hand.py | 26 ++++++++++--------- docs/source/examples/omniisaacgym/ppo_ant.py | 26 ++++++++++--------- .../examples/omniisaacgym/ppo_ant_mt.py | 26 ++++++++++--------- .../examples/omniisaacgym/ppo_cartpole.py | 26 ++++++++++--------- .../examples/omniisaacgym/ppo_cartpole_mt.py | 26 ++++++++++--------- .../examples/omniisaacgym/ppo_humanoid.py | 26 ++++++++++--------- .../examples/omniisaacgym/ppo_shadow_hand.py | 26 ++++++++++--------- 7 files changed, 98 insertions(+), 84 deletions(-) diff --git a/docs/source/examples/omniisaacgym/ppo_allegro_hand.py b/docs/source/examples/omniisaacgym/ppo_allegro_hand.py index 
02e06628..8b3f589f 100644 --- a/docs/source/examples/omniisaacgym/ppo_allegro_hand.py +++ b/docs/source/examples/omniisaacgym/ppo_allegro_hand.py @@ -2,7 +2,7 @@ import torch.nn as nn # Import the skrl components to build the RL system -from skrl.models.torch import GaussianModel, DeterministicModel +from skrl.models.torch import Model, GaussianMixin, DeterministicMixin from skrl.memories.torch import RandomMemory from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG from skrl.resources.schedulers.torch import KLAdaptiveRL @@ -17,14 +17,14 @@ set_seed(42) -# Define the models (stochastic and deterministic models) for the agent using helper classes. +# Define the models (stochastic and deterministic models) for the agent using mixins. # - Policy: takes as input the environment's observation/state and returns an action # - Value: takes the state as input and provides a value to guide the policy -class Policy(GaussianModel): +class Policy(GaussianMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False, clip_log_std=True, min_log_std=-20, max_log_std=2): - super().__init__(observation_space, action_space, device, clip_actions, - clip_log_std, min_log_std, max_log_std) + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) self.net = nn.Sequential(nn.Linear(self.num_observations, 512), nn.ELU(), @@ -35,12 +35,13 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, nn.Linear(128, self.num_actions)) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - def compute(self, states, taken_actions): + def compute(self, states, taken_actions, role): return self.net(states), self.log_std_parameter -class Value(DeterministicModel): +class Value(DeterministicMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False): - super().__init__(observation_space, action_space, device, clip_actions) + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) self.net = nn.Sequential(nn.Linear(self.num_observations, 512), nn.ELU(), @@ -50,7 +51,7 @@ def __init__(self, observation_space, action_space, device, clip_actions=False): nn.ELU(), nn.Linear(128, 1)) - def compute(self, states, taken_actions): + def compute(self, states, taken_actions, role): return self.net(states) @@ -68,8 +69,9 @@ def compute(self, states, taken_actions): # Instantiate the agent's models (function approximators). 
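Editorial note: throughout these updated examples, compute() gains a trailing role argument. The examples keep separate Policy and Value classes, but the argument also enables a single network to serve both roles. A hedged sketch of that use, assuming the mixins accept a role keyword as documented for later skrl versions (class and layer names are illustrative):

# Hypothetical shared policy/value model dispatching on the new "role" argument
import torch
import torch.nn as nn
from skrl.models.torch import Model, GaussianMixin, DeterministicMixin

class SharedModel(GaussianMixin, DeterministicMixin, Model):
    def __init__(self, observation_space, action_space, device):
        Model.__init__(self, observation_space, action_space, device)
        GaussianMixin.__init__(self, clip_actions=False, role="policy")      # assumed role keyword
        DeterministicMixin.__init__(self, clip_actions=False, role="value")  # assumed role keyword

        self.net = nn.Sequential(nn.Linear(self.num_observations, 256), nn.ELU())
        self.mean_layer = nn.Linear(256, self.num_actions)
        self.value_layer = nn.Linear(256, 1)
        self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions))

    def compute(self, states, taken_actions, role):
        if role == "policy":
            return self.mean_layer(self.net(states)), self.log_std_parameter
        return self.value_layer(self.net(states))  # role == "value"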
# PPO requires 2 models, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models -models_ppo = {"policy": Policy(env.observation_space, env.action_space, device), - "value": Value(env.observation_space, env.action_space, device)} +models_ppo = {} +models_ppo["policy"] = Policy(env.observation_space, env.action_space, device) +models_ppo["value"] = Value(env.observation_space, env.action_space, device) # Initialize the models' parameters (weights and biases) using a Gaussian distribution for model in models_ppo.values(): @@ -80,7 +82,7 @@ def compute(self, states, taken_actions): # Only modify some of the default configuration, visit its documentation to see all the options # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters cfg_ppo = PPO_DEFAULT_CONFIG.copy() -cfg_ppo["rollouts"] = 8 +cfg_ppo["rollouts"] = 8 # memory_size cfg_ppo["learning_epochs"] = 5 cfg_ppo["mini_batches"] = 4 # 8 * 16384 / 32768 cfg_ppo["discount_factor"] = 0.99 diff --git a/docs/source/examples/omniisaacgym/ppo_ant.py b/docs/source/examples/omniisaacgym/ppo_ant.py index dc446032..e6482534 100644 --- a/docs/source/examples/omniisaacgym/ppo_ant.py +++ b/docs/source/examples/omniisaacgym/ppo_ant.py @@ -2,7 +2,7 @@ import torch.nn as nn # Import the skrl components to build the RL system -from skrl.models.torch import GaussianModel, DeterministicModel +from skrl.models.torch import Model, GaussianMixin, DeterministicMixin from skrl.memories.torch import RandomMemory from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG from skrl.resources.schedulers.torch import KLAdaptiveRL @@ -17,14 +17,14 @@ set_seed(42) -# Define the models (stochastic and deterministic models) for the agent using helper classes. +# Define the models (stochastic and deterministic models) for the agent using mixins. 
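Editorial note: the mini_batches settings in these configurations carry comments such as "8 * 16384 / 32768". They encode the relation mini_batches = rollouts * num_envs / desired_batch_size; a tiny helper (not part of skrl, names are hypothetical) makes the arithmetic explicit:

# Illustrative only: how the "rollouts * num_envs / batch_size" comments map to mini_batches
def compute_mini_batches(rollouts: int, num_envs: int, batch_size: int) -> int:
    total_transitions = rollouts * num_envs  # transitions gathered per PPO update
    assert total_transitions % batch_size == 0, "batch size must evenly divide the rollout"
    return total_transitions // batch_size

# e.g. compute_mini_batches(8, 16384, 32768) == 4, matching "# 8 * 16384 / 32768"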
# - Policy: takes as input the environment's observation/state and returns an action # - Value: takes the state as input and provides a value to guide the policy -class Policy(GaussianModel): +class Policy(GaussianMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False, clip_log_std=True, min_log_std=-20, max_log_std=2): - super().__init__(observation_space, action_space, device, clip_actions, - clip_log_std, min_log_std, max_log_std) + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) self.net = nn.Sequential(nn.Linear(self.num_observations, 256), nn.ELU(), @@ -35,12 +35,13 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, nn.Linear(64, self.num_actions)) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - def compute(self, states, taken_actions): + def compute(self, states, taken_actions, role): return self.net(states), self.log_std_parameter -class Value(DeterministicModel): +class Value(DeterministicMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False): - super().__init__(observation_space, action_space, device, clip_actions) + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) self.net = nn.Sequential(nn.Linear(self.num_observations, 256), nn.ELU(), @@ -50,7 +51,7 @@ def __init__(self, observation_space, action_space, device, clip_actions=False): nn.ELU(), nn.Linear(64, 1)) - def compute(self, states, taken_actions): + def compute(self, states, taken_actions, role): return self.net(states) @@ -68,8 +69,9 @@ def compute(self, states, taken_actions): # Instantiate the agent's models (function approximators). 
# PPO requires 2 models, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models -models_ppo = {"policy": Policy(env.observation_space, env.action_space, device), - "value": Value(env.observation_space, env.action_space, device)} +models_ppo = {} +models_ppo["policy"] = Policy(env.observation_space, env.action_space, device) +models_ppo["value"] = Value(env.observation_space, env.action_space, device) # Initialize the models' parameters (weights and biases) using a Gaussian distribution for model in models_ppo.values(): @@ -80,7 +82,7 @@ def compute(self, states, taken_actions): # Only modify some of the default configuration, visit its documentation to see all the options # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters cfg_ppo = PPO_DEFAULT_CONFIG.copy() -cfg_ppo["rollouts"] = 16 +cfg_ppo["rollouts"] = 16 # memory_size cfg_ppo["learning_epochs"] = 4 cfg_ppo["mini_batches"] = 2 # 16 * 4096 / 32768 cfg_ppo["discount_factor"] = 0.99 diff --git a/docs/source/examples/omniisaacgym/ppo_ant_mt.py b/docs/source/examples/omniisaacgym/ppo_ant_mt.py index e576af3b..6072cfc0 100644 --- a/docs/source/examples/omniisaacgym/ppo_ant_mt.py +++ b/docs/source/examples/omniisaacgym/ppo_ant_mt.py @@ -4,7 +4,7 @@ import torch.nn as nn # Import the skrl components to build the RL system -from skrl.models.torch import GaussianModel, DeterministicModel +from skrl.models.torch import Model, GaussianMixin, DeterministicMixin from skrl.memories.torch import RandomMemory from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG from skrl.resources.schedulers.torch import KLAdaptiveRL @@ -19,14 +19,14 @@ set_seed(42) -# Define the models (stochastic and deterministic models) for the agent using helper classes. +# Define the models (stochastic and deterministic models) for the agent using mixins. 
# - Policy: takes as input the environment's observation/state and returns an action # - Value: takes the state as input and provides a value to guide the policy -class Policy(GaussianModel): +class Policy(GaussianMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False, clip_log_std=True, min_log_std=-20, max_log_std=2): - super().__init__(observation_space, action_space, device, clip_actions, - clip_log_std, min_log_std, max_log_std) + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) self.net = nn.Sequential(nn.Linear(self.num_observations, 256), nn.ELU(), @@ -37,12 +37,13 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, nn.Linear(64, self.num_actions)) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - def compute(self, states, taken_actions): + def compute(self, states, taken_actions, role): return self.net(states), self.log_std_parameter -class Value(DeterministicModel): +class Value(DeterministicMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False): - super().__init__(observation_space, action_space, device, clip_actions) + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) self.net = nn.Sequential(nn.Linear(self.num_observations, 256), nn.ELU(), @@ -52,7 +53,7 @@ def __init__(self, observation_space, action_space, device, clip_actions=False): nn.ELU(), nn.Linear(64, 1)) - def compute(self, states, taken_actions): + def compute(self, states, taken_actions, role): return self.net(states) @@ -70,8 +71,9 @@ def compute(self, states, taken_actions): # Instantiate the agent's models (function approximators). 
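Editorial note: these scripts keep the KLAdaptiveRL learning-rate scheduler among their imports. As a rough sketch of the underlying idea (the rule below is the common rl_games-style heuristic with placeholder thresholds, not skrl's exact implementation), the learning rate is reduced when the measured KL divergence overshoots a target and raised when it undershoots:

# Hedged sketch of a KL-adaptive learning-rate rule (illustrative, not skrl's code)
def kl_adaptive_lr(lr: float, kl: float, kl_threshold: float = 0.008,
                   factor: float = 1.5, lr_min: float = 1e-6, lr_max: float = 1e-2) -> float:
    if kl > kl_threshold * 2.0:    # policy moved too far: slow down
        lr = max(lr / factor, lr_min)
    elif kl < kl_threshold * 0.5:  # policy barely moved: speed up
        lr = min(lr * factor, lr_max)
    return lr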
# PPO requires 2 models, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models -models_ppo = {"policy": Policy(env.observation_space, env.action_space, device), - "value": Value(env.observation_space, env.action_space, device)} +models_ppo = {} +models_ppo["policy"] = Policy(env.observation_space, env.action_space, device) +models_ppo["value"] = Value(env.observation_space, env.action_space, device) # Initialize the models' parameters (weights and biases) using a Gaussian distribution for model in models_ppo.values(): @@ -82,7 +84,7 @@ def compute(self, states, taken_actions): # Only modify some of the default configuration, visit its documentation to see all the options # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters cfg_ppo = PPO_DEFAULT_CONFIG.copy() -cfg_ppo["rollouts"] = 16 +cfg_ppo["rollouts"] = 16 # memory_size cfg_ppo["learning_epochs"] = 4 cfg_ppo["mini_batches"] = 2 # 16 * 4096 / 32768 cfg_ppo["discount_factor"] = 0.99 diff --git a/docs/source/examples/omniisaacgym/ppo_cartpole.py b/docs/source/examples/omniisaacgym/ppo_cartpole.py index e14aadd5..248ffd0e 100644 --- a/docs/source/examples/omniisaacgym/ppo_cartpole.py +++ b/docs/source/examples/omniisaacgym/ppo_cartpole.py @@ -2,7 +2,7 @@ import torch.nn as nn # Import the skrl components to build the RL system -from skrl.models.torch import GaussianModel, DeterministicModel +from skrl.models.torch import Model, GaussianMixin, DeterministicMixin from skrl.memories.torch import RandomMemory from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG from skrl.resources.schedulers.torch import KLAdaptiveRL @@ -17,14 +17,14 @@ set_seed(42) -# Define the models (stochastic and deterministic models) for the agent using helper classes. +# Define the models (stochastic and deterministic models) for the agent using mixins. 
# - Policy: takes as input the environment's observation/state and returns an action # - Value: takes the state as input and provides a value to guide the policy -class Policy(GaussianModel): +class Policy(GaussianMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False, clip_log_std=True, min_log_std=-20, max_log_std=2): - super().__init__(observation_space, action_space, device, clip_actions, - clip_log_std, min_log_std, max_log_std) + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) self.net = nn.Sequential(nn.Linear(self.num_observations, 32), nn.ELU(), @@ -33,12 +33,13 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, nn.Linear(32, self.num_actions)) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - def compute(self, states, taken_actions): + def compute(self, states, taken_actions, role): return self.net(states), self.log_std_parameter -class Value(DeterministicModel): +class Value(DeterministicMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False): - super().__init__(observation_space, action_space, device, clip_actions) + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) self.net = nn.Sequential(nn.Linear(self.num_observations, 32), nn.ELU(), @@ -46,7 +47,7 @@ def __init__(self, observation_space, action_space, device, clip_actions=False): nn.ELU(), nn.Linear(32, 1)) - def compute(self, states, taken_actions): + def compute(self, states, taken_actions, role): return self.net(states) @@ -64,8 +65,9 @@ def compute(self, states, taken_actions): # Instantiate the agent's models (function approximators). 
# PPO requires 2 models, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models -models_ppo = {"policy": Policy(env.observation_space, env.action_space, device), - "value": Value(env.observation_space, env.action_space, device)} +models_ppo = {} +models_ppo["policy"] = Policy(env.observation_space, env.action_space, device) +models_ppo["value"] = Value(env.observation_space, env.action_space, device) # Initialize the models' parameters (weights and biases) using a Gaussian distribution for model in models_ppo.values(): @@ -76,7 +78,7 @@ def compute(self, states, taken_actions): # Only modify some of the default configuration, visit its documentation to see all the options # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters cfg_ppo = PPO_DEFAULT_CONFIG.copy() -cfg_ppo["rollouts"] = 16 +cfg_ppo["rollouts"] = 16 # memory_size cfg_ppo["learning_epochs"] = 8 cfg_ppo["mini_batches"] = 1 # 16 * 512 / 8192 cfg_ppo["discount_factor"] = 0.99 diff --git a/docs/source/examples/omniisaacgym/ppo_cartpole_mt.py b/docs/source/examples/omniisaacgym/ppo_cartpole_mt.py index 3672d791..8cbb23cd 100644 --- a/docs/source/examples/omniisaacgym/ppo_cartpole_mt.py +++ b/docs/source/examples/omniisaacgym/ppo_cartpole_mt.py @@ -4,7 +4,7 @@ import torch.nn as nn # Import the skrl components to build the RL system -from skrl.models.torch import GaussianModel, DeterministicModel +from skrl.models.torch import Model, GaussianMixin, DeterministicMixin from skrl.memories.torch import RandomMemory from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG from skrl.resources.schedulers.torch import KLAdaptiveRL @@ -19,14 +19,14 @@ set_seed(42) -# Define the models (stochastic and deterministic models) for the agent using helper classes. +# Define the models (stochastic and deterministic models) for the agent using mixins. 
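Editorial note: the "# memory_size" comments added next to cfg_ppo["rollouts"] point out that the rollouts value has to match the capacity of the rollout memory created for the agent. A minimal sketch of that pairing, following the usual skrl pattern (variable values and the env/device references are placeholders; exact arguments may differ from the full example files):

# Illustrative pairing of rollouts and memory size (placeholder values)
from skrl.memories.torch import RandomMemory
from skrl.agents.torch.ppo import PPO_DEFAULT_CONFIG

rollouts = 16
memory = RandomMemory(memory_size=rollouts, num_envs=env.num_envs, device=device)

cfg_ppo = PPO_DEFAULT_CONFIG.copy()
cfg_ppo["rollouts"] = rollouts  # must equal the memory_size used above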
# - Policy: takes as input the environment's observation/state and returns an action # - Value: takes the state as input and provides a value to guide the policy -class Policy(GaussianModel): +class Policy(GaussianMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False, clip_log_std=True, min_log_std=-20, max_log_std=2): - super().__init__(observation_space, action_space, device, clip_actions, - clip_log_std, min_log_std, max_log_std) + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) self.net = nn.Sequential(nn.Linear(self.num_observations, 32), nn.ELU(), @@ -35,12 +35,13 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, nn.Linear(32, self.num_actions)) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - def compute(self, states, taken_actions): + def compute(self, states, taken_actions, role): return self.net(states), self.log_std_parameter -class Value(DeterministicModel): +class Value(DeterministicMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False): - super().__init__(observation_space, action_space, device, clip_actions) + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) self.net = nn.Sequential(nn.Linear(self.num_observations, 32), nn.ELU(), @@ -48,7 +49,7 @@ def __init__(self, observation_space, action_space, device, clip_actions=False): nn.ELU(), nn.Linear(32, 1)) - def compute(self, states, taken_actions): + def compute(self, states, taken_actions, role): return self.net(states) @@ -66,8 +67,9 @@ def compute(self, states, taken_actions): # Instantiate the agent's models (function approximators). 
# PPO requires 2 models, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models -models_ppo = {"policy": Policy(env.observation_space, env.action_space, device), - "value": Value(env.observation_space, env.action_space, device)} +models_ppo = {} +models_ppo["policy"] = Policy(env.observation_space, env.action_space, device) +models_ppo["value"] = Value(env.observation_space, env.action_space, device) # Initialize the models' parameters (weights and biases) using a Gaussian distribution for model in models_ppo.values(): @@ -78,7 +80,7 @@ def compute(self, states, taken_actions): # Only modify some of the default configuration, visit its documentation to see all the options # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters cfg_ppo = PPO_DEFAULT_CONFIG.copy() -cfg_ppo["rollouts"] = 16 +cfg_ppo["rollouts"] = 16 # memory_size cfg_ppo["learning_epochs"] = 8 cfg_ppo["mini_batches"] = 1 # 16 * 512 / 8192 cfg_ppo["discount_factor"] = 0.99 diff --git a/docs/source/examples/omniisaacgym/ppo_humanoid.py b/docs/source/examples/omniisaacgym/ppo_humanoid.py index 9fd60330..29d57b6d 100644 --- a/docs/source/examples/omniisaacgym/ppo_humanoid.py +++ b/docs/source/examples/omniisaacgym/ppo_humanoid.py @@ -2,7 +2,7 @@ import torch.nn as nn # Import the skrl components to build the RL system -from skrl.models.torch import GaussianModel, DeterministicModel +from skrl.models.torch import Model, GaussianMixin, DeterministicMixin from skrl.memories.torch import RandomMemory from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG from skrl.resources.schedulers.torch import KLAdaptiveRL @@ -17,14 +17,14 @@ set_seed(42) -# Define the models (stochastic and deterministic models) for the agent using helper classes. +# Define the models (stochastic and deterministic models) for the agent using mixins. 
# - Policy: takes as input the environment's observation/state and returns an action # - Value: takes the state as input and provides a value to guide the policy -class Policy(GaussianModel): +class Policy(GaussianMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False, clip_log_std=True, min_log_std=-20, max_log_std=2): - super().__init__(observation_space, action_space, device, clip_actions, - clip_log_std, min_log_std, max_log_std) + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) self.net = nn.Sequential(nn.Linear(self.num_observations, 400), nn.ELU(), @@ -35,12 +35,13 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, nn.Linear(100, self.num_actions)) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - def compute(self, states, taken_actions): + def compute(self, states, taken_actions, role): return self.net(states), self.log_std_parameter -class Value(DeterministicModel): +class Value(DeterministicMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False): - super().__init__(observation_space, action_space, device, clip_actions) + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) self.net = nn.Sequential(nn.Linear(self.num_observations, 400), nn.ELU(), @@ -50,7 +51,7 @@ def __init__(self, observation_space, action_space, device, clip_actions=False): nn.ELU(), nn.Linear(100, 1)) - def compute(self, states, taken_actions): + def compute(self, states, taken_actions, role): return self.net(states) @@ -68,8 +69,9 @@ def compute(self, states, taken_actions): # Instantiate the agent's models (function approximators). 
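Editorial note: right after building the model dictionary, each script initializes the parameters "using a Gaussian distribution". For readers who want equivalent behaviour without skrl's helpers, a plain-PyTorch sketch (the mean/std values are assumptions, not the examples' exact settings):

# Illustrative Gaussian initialization of linear layers (plain PyTorch, not skrl's helper)
import torch.nn as nn

def gaussian_init_(module: nn.Module, mean: float = 0.0, std: float = 0.1) -> None:
    if isinstance(module, nn.Linear):
        nn.init.normal_(module.weight, mean=mean, std=std)
        nn.init.zeros_(module.bias)

# e.g. for model in models_ppo.values(): model.apply(gaussian_init_)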
# PPO requires 2 models, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models -models_ppo = {"policy": Policy(env.observation_space, env.action_space, device), - "value": Value(env.observation_space, env.action_space, device)} +models_ppo = {} +models_ppo["policy"] = Policy(env.observation_space, env.action_space, device) +models_ppo["value"] = Value(env.observation_space, env.action_space, device) # Initialize the models' parameters (weights and biases) using a Gaussian distribution for model in models_ppo.values(): @@ -80,7 +82,7 @@ def compute(self, states, taken_actions): # Only modify some of the default configuration, visit its documentation to see all the options # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters cfg_ppo = PPO_DEFAULT_CONFIG.copy() -cfg_ppo["rollouts"] = 32 +cfg_ppo["rollouts"] = 32 # memory_size cfg_ppo["learning_epochs"] = 5 cfg_ppo["mini_batches"] = 4 # 32 * 4096 / 32768 cfg_ppo["discount_factor"] = 0.99 diff --git a/docs/source/examples/omniisaacgym/ppo_shadow_hand.py b/docs/source/examples/omniisaacgym/ppo_shadow_hand.py index e7b66041..1d1e9e11 100644 --- a/docs/source/examples/omniisaacgym/ppo_shadow_hand.py +++ b/docs/source/examples/omniisaacgym/ppo_shadow_hand.py @@ -2,7 +2,7 @@ import torch.nn as nn # Import the skrl components to build the RL system -from skrl.models.torch import GaussianModel, DeterministicModel +from skrl.models.torch import Model, GaussianMixin, DeterministicMixin from skrl.memories.torch import RandomMemory from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG from skrl.resources.schedulers.torch import KLAdaptiveRL @@ -17,14 +17,14 @@ set_seed(42) -# Define the models (stochastic and deterministic models) for the agent using helper classes. +# Define the models (stochastic and deterministic models) for the agent using mixins. 
# - Policy: takes as input the environment's observation/state and returns an action # - Value: takes the state as input and provides a value to guide the policy -class Policy(GaussianModel): +class Policy(GaussianMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False, clip_log_std=True, min_log_std=-20, max_log_std=2): - super().__init__(observation_space, action_space, device, clip_actions, - clip_log_std, min_log_std, max_log_std) + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) self.net = nn.Sequential(nn.Linear(self.num_observations, 512), nn.ELU(), @@ -37,12 +37,13 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, nn.Linear(128, self.num_actions)) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - def compute(self, states, taken_actions): + def compute(self, states, taken_actions, role): return self.net(states), self.log_std_parameter -class Value(DeterministicModel): +class Value(DeterministicMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False): - super().__init__(observation_space, action_space, device, clip_actions) + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) self.net = nn.Sequential(nn.Linear(self.num_observations, 512), nn.ELU(), @@ -54,7 +55,7 @@ def __init__(self, observation_space, action_space, device, clip_actions=False): nn.ELU(), nn.Linear(128, 1)) - def compute(self, states, taken_actions): + def compute(self, states, taken_actions, role): return self.net(states) @@ -72,8 +73,9 @@ def compute(self, states, taken_actions): # Instantiate the agent's models (function approximators). 
# PPO requires 2 models, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models -models_ppo = {"policy": Policy(env.observation_space, env.action_space, device), - "value": Value(env.observation_space, env.action_space, device)} +models_ppo = {} +models_ppo["policy"] = Policy(env.observation_space, env.action_space, device) +models_ppo["value"] = Value(env.observation_space, env.action_space, device) # Initialize the models' parameters (weights and biases) using a Gaussian distribution for model in models_ppo.values(): @@ -84,7 +86,7 @@ def compute(self, states, taken_actions): # Only modify some of the default configuration, visit its documentation to see all the options # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters cfg_ppo = PPO_DEFAULT_CONFIG.copy() -cfg_ppo["rollouts"] = 8 +cfg_ppo["rollouts"] = 8 # memory_size cfg_ppo["learning_epochs"] = 5 cfg_ppo["mini_batches"] = 4 # 8 * 16384 / 32768 cfg_ppo["discount_factor"] = 0.99 From a52a698f9f7b50ae4743f2f240520f583b4ea563 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Mon, 12 Sep 2022 19:04:46 +0200 Subject: [PATCH 069/108] Fix tensor dimension when computing parallel variance --- .../resources/preprocessors/torch/running_standard_scaler.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/skrl/resources/preprocessors/torch/running_standard_scaler.py b/skrl/resources/preprocessors/torch/running_standard_scaler.py index 709bc05f..12c8fe47 100644 --- a/skrl/resources/preprocessors/torch/running_standard_scaler.py +++ b/skrl/resources/preprocessors/torch/running_standard_scaler.py @@ -101,7 +101,10 @@ def _compute(self, x: torch.Tensor, train: bool = False, inverse: bool = False) :type inverse: bool, optional """ if train: - self._parallel_variance(torch.mean(x, dim=0), torch.var(x, dim=0), x.shape[0]) + if x.dim() == 3: + self._parallel_variance(torch.mean(x, dim=(0,1)), torch.var(x, dim=(0,1)), x.shape[0] * x.shape[1]) + else: + self._parallel_variance(torch.mean(x, dim=0), torch.var(x, dim=0), x.shape[0]) # scale back the data to the original representation if inverse: From a447e86c43ce1dca2a5d906f43b8b1c50664a43d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Mon, 12 Sep 2022 23:03:11 +0200 Subject: [PATCH 070/108] Update Isaac Gym examples --- docs/source/examples/isaacgym/amp_humanoid.py | 37 ++++--- .../isaacgym_parallel_no_shared_memory.py | 101 +++++++++--------- ...isaacgym_parallel_no_shared_memory_eval.py | 90 ++++++---------- .../isaacgym_sequential_no_shared_memory.py | 101 +++++++++--------- ...aacgym_sequential_no_shared_memory_eval.py | 89 ++++++++------- .../isaacgym_sequential_shared_memory.py | 101 +++++++++--------- .../isaacgym_sequential_shared_memory_eval.py | 89 ++++++++------- .../examples/isaacgym/ppo_allegro_hand.py | 26 ++--- docs/source/examples/isaacgym/ppo_ant.py | 38 +++---- docs/source/examples/isaacgym/ppo_anymal.py | 38 +++---- .../examples/isaacgym/ppo_anymal_terrain.py | 38 +++---- .../examples/isaacgym/ppo_ball_balance.py | 38 +++---- docs/source/examples/isaacgym/ppo_cartpole.py | 38 +++---- .../examples/isaacgym/ppo_cartpole_eval.py | 51 +++++---- .../examples/isaacgym/ppo_franka_cabinet.py | 38 +++---- docs/source/examples/isaacgym/ppo_humanoid.py | 38 +++---- .../source/examples/isaacgym/ppo_ingenuity.py | 26 ++--- .../examples/isaacgym/ppo_quadcopter.py | 38 +++---- 
.../examples/isaacgym/ppo_shadow_hand.py | 38 +++---- .../source/examples/isaacgym/ppo_trifinger.py | 38 +++---- .../source/examples/isaacgym/trpo_cartpole.py | 40 ++++--- 21 files changed, 527 insertions(+), 604 deletions(-) diff --git a/docs/source/examples/isaacgym/amp_humanoid.py b/docs/source/examples/isaacgym/amp_humanoid.py index 8b3c4f05..11a3e580 100644 --- a/docs/source/examples/isaacgym/amp_humanoid.py +++ b/docs/source/examples/isaacgym/amp_humanoid.py @@ -4,7 +4,7 @@ import torch.nn as nn # Import the skrl components to build the RL system -from skrl.models.torch import GaussianModel, DeterministicModel +from skrl.models.torch import Model, GaussianMixin, DeterministicMixin from skrl.memories.torch import RandomMemory from skrl.agents.torch.amp import AMP, AMP_DEFAULT_CONFIG from skrl.resources.preprocessors.torch import RunningStandardScaler @@ -18,15 +18,15 @@ set_seed(42) -# Define the models (stochastic and deterministic models) for the agent using helper classes. +# Define the models (stochastic and deterministic models) for the agent using mixins. # - Policy: takes as input the environment's observation/state and returns an action # - Value: takes the state as input and provides a value to guide the policy # - Discriminator: differentiate between police-generated behaviors and behaviors from the motion dataset -class Policy(GaussianModel): +class Policy(GaussianMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False, - clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"): - super().__init__(observation_space, action_space, device, clip_actions, - clip_log_std, min_log_std, max_log_std, reduction) + clip_log_std=True, min_log_std=-20, max_log_std=2): + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) self.net = nn.Sequential(nn.Linear(self.num_observations, 1024), nn.ReLU(), @@ -37,12 +37,13 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, # set a fixed log standard deviation for the policy self.log_std_parameter = nn.Parameter(torch.full((self.num_actions,), fill_value=-2.9), requires_grad=False) - def compute(self, states, taken_actions): + def compute(self, states, taken_actions, role): return torch.tanh(self.net(states)), self.log_std_parameter -class Value(DeterministicModel): +class Value(DeterministicMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False): - super().__init__(observation_space, action_space, device, clip_actions) + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) self.net = nn.Sequential(nn.Linear(self.num_observations, 1024), nn.ReLU(), @@ -50,12 +51,13 @@ def __init__(self, observation_space, action_space, device, clip_actions=False): nn.ReLU(), nn.Linear(512, 1)) - def compute(self, states, taken_actions): + def compute(self, states, taken_actions, role): return self.net(states) -class Discriminator(DeterministicModel): +class Discriminator(DeterministicMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False): - super().__init__(observation_space, action_space, device, clip_actions) + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) self.net = nn.Sequential(nn.Linear(self.num_observations, 1024), nn.ReLU(), @@ -63,7 +65,7 @@ def __init__(self, observation_space, 
action_space, device, clip_actions=False): nn.ReLU(), nn.Linear(512, 1)) - def compute(self, states, taken_actions): + def compute(self, states, taken_actions, role): return self.net(states) @@ -81,16 +83,17 @@ def compute(self, states, taken_actions): # Instantiate the agent's models (function approximators). # AMP requires 3 models, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.amp.html#spaces-and-models -models_amp = {"policy": Policy(env.observation_space, env.action_space, device), - "value": Value(env.observation_space, env.action_space, device), - "discriminator": Discriminator(env.amp_observation_space, env.action_space, device)} +models_amp = {} +models_amp["policy"] = Policy(env.observation_space, env.action_space, device) +models_amp["value"] = Value(env.observation_space, env.action_space, device) +models_amp["discriminator"] = Discriminator(env.amp_observation_space, env.action_space, device) # Configure and instantiate the agent. # Only modify some of the default configuration, visit its documentation to see all the options # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.amp.html#configuration-and-hyperparameters cfg_amp = AMP_DEFAULT_CONFIG.copy() -cfg_amp["rollouts"] = 16 +cfg_amp["rollouts"] = 16 # memory_size cfg_amp["learning_epochs"] = 6 cfg_amp["mini_batches"] = 2 # 16 * 4096 / 32768 cfg_amp["discount_factor"] = 0.99 diff --git a/docs/source/examples/isaacgym/isaacgym_parallel_no_shared_memory.py b/docs/source/examples/isaacgym/isaacgym_parallel_no_shared_memory.py index 4f852873..6a1d8cf9 100644 --- a/docs/source/examples/isaacgym/isaacgym_parallel_no_shared_memory.py +++ b/docs/source/examples/isaacgym/isaacgym_parallel_no_shared_memory.py @@ -2,10 +2,9 @@ import torch import torch.nn as nn -import torch.nn.functional as F # Import the skrl components to build the RL system -from skrl.models.torch import GaussianModel, DeterministicModel +from skrl.models.torch import Model, GaussianMixin, DeterministicMixin from skrl.memories.torch import RandomMemory from skrl.agents.torch.ddpg import DDPG, DDPG_DEFAULT_CONFIG from skrl.agents.torch.td3 import TD3, TD3_DEFAULT_CONFIG @@ -13,46 +12,47 @@ from skrl.resources.noises.torch import GaussianNoise, OrnsteinUhlenbeckNoise from skrl.trainers.torch import ParallelTrainer from skrl.envs.torch import wrap_env -from skrl.envs.torch import load_isaacgym_env_preview2, load_isaacgym_env_preview4 +from skrl.envs.torch import load_isaacgym_env_preview4 -# Define the models (stochastic and deterministic models) for the agents using helper classes -# and programming with two approaches (layer by layer and torch.nn.Sequential class). +# Define the models (stochastic and deterministic models) for the agents using mixins. 
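Editorial note: patch 069 above extends RunningStandardScaler so that 3D inputs (e.g. [rollout, environments, features]) contribute their per-batch mean and variance over the first two dimensions. Those per-batch statistics are then merged into the running ones with the standard parallel (Chan et al.) update; below is a hedged sketch of that merge with assumed variable names, not skrl's internals:

# Hedged sketch of the parallel mean/variance merge fed by the fix in patch 069
import torch

def merge_statistics(mean_a, var_a, count_a, mean_b, var_b, count_b):
    """Combine running (mean, variance, count) with a new batch's statistics."""
    delta = mean_b - mean_a
    total = count_a + count_b
    m2 = var_a * count_a + var_b * count_b + delta ** 2 * count_a * count_b / total
    return mean_a + delta * count_b / total, m2 / total, total

# a 3D batch x of shape (rollout, num_envs, features) would contribute
# torch.mean(x, dim=(0, 1)), torch.var(x, dim=(0, 1)) and x.shape[0] * x.shape[1] samples
x = torch.randn(16, 4096, 60)
batch_mean, batch_var, batch_count = torch.mean(x, dim=(0, 1)), torch.var(x, dim=(0, 1)), x.shape[0] * x.shape[1]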
# - StochasticActor: takes as input the environment's observation/state and returns an action # - DeterministicActor: takes as input the environment's observation/state and returns an action # - Critic: takes the state and action as input and provides a value to guide the policy -class StochasticActor(GaussianModel): +class StochasticActor(GaussianMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False, clip_log_std=True, min_log_std=-20, max_log_std=2): - super().__init__(observation_space, action_space, device, clip_actions, - clip_log_std, min_log_std, max_log_std) + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) - self.linear_layer_1 = nn.Linear(self.num_observations, 32) - self.linear_layer_2 = nn.Linear(32, 32) - self.mean_action_layer = nn.Linear(32, self.num_actions) + self.net = nn.Sequential(nn.Linear(self.num_observations, 32), + nn.ELU(), + nn.Linear(32, 32), + nn.ELU(), + nn.Linear(32, self.num_actions)) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - def compute(self, states, taken_actions): - x = F.elu(self.linear_layer_1(states)) - x = F.elu(self.linear_layer_2(x)) - return torch.tanh(self.mean_action_layer(x)), self.log_std_parameter + def compute(self, states, taken_actions, role): + return self.net(states), self.log_std_parameter -class DeterministicActor(DeterministicModel): - def __init__(self, observation_space, action_space, device, clip_actions = False): - super().__init__(observation_space, action_space, device, clip_actions) +class DeterministicActor(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) - self.linear_layer_1 = nn.Linear(self.num_observations, 32) - self.linear_layer_2 = nn.Linear(32, 32) - self.action_layer = nn.Linear(32, self.num_actions) + self.net = nn.Sequential(nn.Linear(self.num_observations, 32), + nn.ELU(), + nn.Linear(32, 32), + nn.ELU(), + nn.Linear(32, self.num_actions)) - def compute(self, states, taken_actions): - x = F.elu(self.linear_layer_1(states)) - x = F.elu(self.linear_layer_2(x)) - return torch.tanh(self.action_layer(x)) + def compute(self, states, taken_actions, role): + return self.net(states) -class Critic(DeterministicModel): - def __init__(self, observation_space, action_space, device, clip_actions = False): - super().__init__(observation_space, action_space, device, clip_actions) +class Critic(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) self.net = nn.Sequential(nn.Linear(self.num_observations + self.num_actions, 32), nn.ELU(), @@ -60,20 +60,14 @@ def __init__(self, observation_space, action_space, device, clip_actions = False nn.ELU(), nn.Linear(32, 1)) - def compute(self, states, taken_actions): + def compute(self, states, taken_actions, role): return self.net(torch.cat([states, taken_actions], dim=1)) if __name__ == '__main__': - # Load and wrap the Isaac Gym environment. - # The following lines are intended to support all versions (preview 2, 3 and 4). 
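Editorial note: this DDPG/TD3/SAC example keeps GaussianNoise and OrnsteinUhlenbeckNoise among its imports for exploration. As a rough, framework-agnostic sketch of what an Ornstein-Uhlenbeck process produces (a mean-reverting random walk; this is not skrl's OrnsteinUhlenbeckNoise implementation, and the parameter values are placeholders):

# Illustrative Ornstein-Uhlenbeck exploration noise (not skrl's implementation)
import torch

class OUNoiseSketch:
    def __init__(self, size, theta=0.15, sigma=0.2, dt=1e-2, device="cpu"):
        self.theta, self.sigma, self.dt = theta, sigma, dt
        self.state = torch.zeros(size, device=device)

    def sample(self):
        # dx = -theta * x * dt + sigma * sqrt(dt) * N(0, 1)
        dx = -self.theta * self.state * self.dt \
             + self.sigma * (self.dt ** 0.5) * torch.randn_like(self.state)
        self.state = self.state + dx
        return self.state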
- # It tries to load from preview 3/4, but if it fails, it will try to load from preview 2 - try: - env = load_isaacgym_env_preview4(task_name="Cartpole") # preview 3 and 4 use the same loader - except Exception as e: - print("Isaac Gym (preview 3/4) failed: {}\nTrying preview 2...".format(e)) - env = load_isaacgym_env_preview2("Cartpole") + # Load and wrap the Isaac Gym environment + env = load_isaacgym_env_preview4(task_name="Cartpole") # preview 3 and 4 use the same loader env = wrap_env(env) device = env.device @@ -88,25 +82,28 @@ def compute(self, states, taken_actions): # Instantiate the agent's models (function approximators). # DDPG requires 4 models, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ddpg.html#spaces-and-models - models_ddpg = {"policy": DeterministicActor(env.observation_space, env.action_space, device, clip_actions=True), - "target_policy": DeterministicActor(env.observation_space, env.action_space, device, clip_actions=True), - "critic": Critic(env.observation_space, env.action_space, device), - "target_critic": Critic(env.observation_space, env.action_space, device)} + models_ddpg = {} + models_ddpg["policy"] = DeterministicActor(env.observation_space, env.action_space, device, clip_actions=True) + models_ddpg["target_policy"] = DeterministicActor(env.observation_space, env.action_space, device, clip_actions=True) + models_ddpg["critic"] = Critic(env.observation_space, env.action_space, device) + models_ddpg["target_critic"] = Critic(env.observation_space, env.action_space, device) # TD3 requires 6 models, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.td3.html#spaces-and-models - models_td3 = {"policy": DeterministicActor(env.observation_space, env.action_space, device, clip_actions=True), - "target_policy": DeterministicActor(env.observation_space, env.action_space, device, clip_actions=True), - "critic_1": Critic(env.observation_space, env.action_space, device), - "critic_2": Critic(env.observation_space, env.action_space, device), - "target_critic_1": Critic(env.observation_space, env.action_space, device), - "target_critic_2": Critic(env.observation_space, env.action_space, device)} + models_td3 = {} + models_td3["policy"] = DeterministicActor(env.observation_space, env.action_space, device, clip_actions=True) + models_td3["target_policy"] = DeterministicActor(env.observation_space, env.action_space, device, clip_actions=True) + models_td3["critic_1"] = Critic(env.observation_space, env.action_space, device) + models_td3["critic_2"] = Critic(env.observation_space, env.action_space, device) + models_td3["target_critic_1"] = Critic(env.observation_space, env.action_space, device) + models_td3["target_critic_2"] = Critic(env.observation_space, env.action_space, device) # SAC requires 5 models, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.sac.html#spaces-and-models - models_sac = {"policy": StochasticActor(env.observation_space, env.action_space, device, clip_actions=True), - "critic_1": Critic(env.observation_space, env.action_space, device), - "critic_2": Critic(env.observation_space, env.action_space, device), - "target_critic_1": Critic(env.observation_space, env.action_space, device), - "target_critic_2": Critic(env.observation_space, env.action_space, device)} + models_sac = {} + models_sac["policy"] = StochasticActor(env.observation_space, env.action_space, device, clip_actions=True) + 
models_sac["critic_1"] = Critic(env.observation_space, env.action_space, device) + models_sac["critic_2"] = Critic(env.observation_space, env.action_space, device) + models_sac["target_critic_1"] = Critic(env.observation_space, env.action_space, device) + models_sac["target_critic_2"] = Critic(env.observation_space, env.action_space, device) # Initialize the models' parameters (weights and biases) using a Gaussian distribution for model in models_ddpg.values(): diff --git a/docs/source/examples/isaacgym/isaacgym_parallel_no_shared_memory_eval.py b/docs/source/examples/isaacgym/isaacgym_parallel_no_shared_memory_eval.py index 6a0b6bbb..a935e9c9 100644 --- a/docs/source/examples/isaacgym/isaacgym_parallel_no_shared_memory_eval.py +++ b/docs/source/examples/isaacgym/isaacgym_parallel_no_shared_memory_eval.py @@ -2,78 +2,53 @@ import torch import torch.nn as nn -import torch.nn.functional as F # Import the skrl components to build the RL system -from skrl.models.torch import GaussianModel, DeterministicModel -from skrl.memories.torch import RandomMemory +from skrl.models.torch import Model, GaussianMixin, DeterministicMixin from skrl.agents.torch.ddpg import DDPG, DDPG_DEFAULT_CONFIG from skrl.agents.torch.td3 import TD3, TD3_DEFAULT_CONFIG from skrl.agents.torch.sac import SAC, SAC_DEFAULT_CONFIG -from skrl.resources.noises.torch import GaussianNoise, OrnsteinUhlenbeckNoise from skrl.trainers.torch import ParallelTrainer from skrl.envs.torch import wrap_env -from skrl.envs.torch import load_isaacgym_env_preview2, load_isaacgym_env_preview4 +from skrl.envs.torch import load_isaacgym_env_preview4 -# Define the models (stochastic and deterministic models) for the agents using helper classes -# and programming with two approaches (layer by layer and torch.nn.Sequential class). 
-# - StochasticActor: takes as input the environment's observation/state and returns an action -# - DeterministicActor: takes as input the environment's observation/state and returns an action -# - Critic: takes the state and action as input and provides a value to guide the policy -class StochasticActor(GaussianModel): +# Define only the policies for evaluation +class StochasticActor(GaussianMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False, clip_log_std=True, min_log_std=-20, max_log_std=2): - super().__init__(observation_space, action_space, device, clip_actions, - clip_log_std, min_log_std, max_log_std) + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) - self.linear_layer_1 = nn.Linear(self.num_observations, 32) - self.linear_layer_2 = nn.Linear(32, 32) - self.mean_action_layer = nn.Linear(32, self.num_actions) + self.net = nn.Sequential(nn.Linear(self.num_observations, 32), + nn.ELU(), + nn.Linear(32, 32), + nn.ELU(), + nn.Linear(32, self.num_actions)) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - def compute(self, states, taken_actions): - x = F.elu(self.linear_layer_1(states)) - x = F.elu(self.linear_layer_2(x)) - return torch.tanh(self.mean_action_layer(x)), self.log_std_parameter - -class DeterministicActor(DeterministicModel): - def __init__(self, observation_space, action_space, device, clip_actions = False): - super().__init__(observation_space, action_space, device, clip_actions) + def compute(self, states, taken_actions, role): + return self.net(states), self.log_std_parameter - self.linear_layer_1 = nn.Linear(self.num_observations, 32) - self.linear_layer_2 = nn.Linear(32, 32) - self.action_layer = nn.Linear(32, self.num_actions) +class DeterministicActor(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) - def compute(self, states, taken_actions): - x = F.elu(self.linear_layer_1(states)) - x = F.elu(self.linear_layer_2(x)) - return torch.tanh(self.action_layer(x)) - -class Critic(DeterministicModel): - def __init__(self, observation_space, action_space, device, clip_actions = False): - super().__init__(observation_space, action_space, device, clip_actions) - - self.net = nn.Sequential(nn.Linear(self.num_observations + self.num_actions, 32), + self.net = nn.Sequential(nn.Linear(self.num_observations, 32), nn.ELU(), nn.Linear(32, 32), nn.ELU(), - nn.Linear(32, 1)) + nn.Linear(32, self.num_actions)) - def compute(self, states, taken_actions): - return self.net(torch.cat([states, taken_actions], dim=1)) + def compute(self, states, taken_actions, role): + return self.net(states) if __name__ == '__main__': - # Load and wrap the Isaac Gym environment. - # The following lines are intended to support all versions (preview 2, 3 and 4). 
- # It tries to load from preview 3/4, but if it fails, it will try to load from preview 2 - try: - env = load_isaacgym_env_preview4(task_name="Cartpole") # preview 3 and 4 use the same loader - except Exception as e: - print("Isaac Gym (preview 3/4) failed: {}\nTrying preview 2...".format(e)) - env = load_isaacgym_env_preview2("Cartpole") + # Load and wrap the Isaac Gym environment + env = load_isaacgym_env_preview4(task_name="Cartpole") # preview 3 and 4 use the same loader env = wrap_env(env) device = env.device @@ -82,18 +57,16 @@ def compute(self, states, taken_actions): # Instantiate the agent's models (function approximators). # DDPG requires 4 models, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ddpg.html#spaces-and-models - models_ddpg = {"policy": DeterministicActor(env.observation_space, env.action_space, device, clip_actions=True)} + models_ddpg = {} + models_ddpg["policy"] = DeterministicActor(env.observation_space, env.action_space, device, clip_actions=True) # TD3 requires 6 models, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.td3.html#spaces-and-models - models_td3 = {"policy": DeterministicActor(env.observation_space, env.action_space, device, clip_actions=True)} + models_td3 = {} + models_td3["policy"] = DeterministicActor(env.observation_space, env.action_space, device, clip_actions=True) # SAC requires 5 models, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.sac.html#spaces-and-models - models_sac = {"policy": StochasticActor(env.observation_space, env.action_space, device, clip_actions=True)} - - # load checkpoints - models_ddpg["policy"].load("./runs/22-03-15_21-30-05-578065_DDPG/checkpoints/2000_policy.pt") - models_td3["policy"].load("./runs/22-03-15_21-30-05-401434_TD3/checkpoints/2000_policy.pt") - models_sac["policy"].load("./runs/22-03-15_21-30-05-596393_SAC/checkpoints/2000_policy.pt") + models_sac = {} + models_sac["policy"] = StochasticActor(env.observation_space, env.action_space, device, clip_actions=True) # Configure and instantiate the agents. 
@@ -138,6 +111,11 @@ def compute(self, states, taken_actions): action_space=env.action_space, device=device) + # load checkpoint (agent) + agent_ddpg.load("./runs/22-09-12_22-30-58-982355_DDPG/checkpoints/agent_8000.pt") + agent_td3.load("./runs/22-09-12_22-30-58-986295_TD3/checkpoints/agent_8000.pt") + agent_sac.load("./runs/22-09-12_22-30-58-987142_SAC/checkpoints/agent_8000.pt") + # Configure and instantiate the RL trainer and define the agent scopes cfg = {"timesteps": 8000, "headless": True} diff --git a/docs/source/examples/isaacgym/isaacgym_sequential_no_shared_memory.py b/docs/source/examples/isaacgym/isaacgym_sequential_no_shared_memory.py index 38486e99..3cb0dc54 100644 --- a/docs/source/examples/isaacgym/isaacgym_sequential_no_shared_memory.py +++ b/docs/source/examples/isaacgym/isaacgym_sequential_no_shared_memory.py @@ -2,10 +2,9 @@ import torch import torch.nn as nn -import torch.nn.functional as F # Import the skrl components to build the RL system -from skrl.models.torch import GaussianModel, DeterministicModel +from skrl.models.torch import Model, GaussianMixin, DeterministicMixin from skrl.memories.torch import RandomMemory from skrl.agents.torch.ddpg import DDPG, DDPG_DEFAULT_CONFIG from skrl.agents.torch.td3 import TD3, TD3_DEFAULT_CONFIG @@ -13,46 +12,47 @@ from skrl.resources.noises.torch import GaussianNoise, OrnsteinUhlenbeckNoise from skrl.trainers.torch import SequentialTrainer from skrl.envs.torch import wrap_env -from skrl.envs.torch import load_isaacgym_env_preview2, load_isaacgym_env_preview4 +from skrl.envs.torch import load_isaacgym_env_preview4 -# Define the models (stochastic and deterministic models) for the agents using helper classes -# and programming with two approaches (layer by layer and torch.nn.Sequential class). +# Define the models (stochastic and deterministic models) for the agents using mixins. 
# - StochasticActor: takes as input the environment's observation/state and returns an action # - DeterministicActor: takes as input the environment's observation/state and returns an action # - Critic: takes the state and action as input and provides a value to guide the policy -class StochasticActor(GaussianModel): +class StochasticActor(GaussianMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False, clip_log_std=True, min_log_std=-20, max_log_std=2): - super().__init__(observation_space, action_space, device, clip_actions, - clip_log_std, min_log_std, max_log_std) + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) - self.linear_layer_1 = nn.Linear(self.num_observations, 32) - self.linear_layer_2 = nn.Linear(32, 32) - self.mean_action_layer = nn.Linear(32, self.num_actions) + self.net = nn.Sequential(nn.Linear(self.num_observations, 32), + nn.ELU(), + nn.Linear(32, 32), + nn.ELU(), + nn.Linear(32, self.num_actions)) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - def compute(self, states, taken_actions): - x = F.elu(self.linear_layer_1(states)) - x = F.elu(self.linear_layer_2(x)) - return torch.tanh(self.mean_action_layer(x)), self.log_std_parameter + def compute(self, states, taken_actions, role): + return self.net(states), self.log_std_parameter -class DeterministicActor(DeterministicModel): - def __init__(self, observation_space, action_space, device, clip_actions = False): - super().__init__(observation_space, action_space, device, clip_actions) +class DeterministicActor(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) - self.linear_layer_1 = nn.Linear(self.num_observations, 32) - self.linear_layer_2 = nn.Linear(32, 32) - self.action_layer = nn.Linear(32, self.num_actions) + self.net = nn.Sequential(nn.Linear(self.num_observations, 32), + nn.ELU(), + nn.Linear(32, 32), + nn.ELU(), + nn.Linear(32, self.num_actions)) - def compute(self, states, taken_actions): - x = F.elu(self.linear_layer_1(states)) - x = F.elu(self.linear_layer_2(x)) - return torch.tanh(self.action_layer(x)) + def compute(self, states, taken_actions, role): + return self.net(states) -class Critic(DeterministicModel): - def __init__(self, observation_space, action_space, device, clip_actions = False): - super().__init__(observation_space, action_space, device, clip_actions) +class Critic(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) self.net = nn.Sequential(nn.Linear(self.num_observations + self.num_actions, 32), nn.ELU(), @@ -60,18 +60,12 @@ def __init__(self, observation_space, action_space, device, clip_actions = False nn.ELU(), nn.Linear(32, 1)) - def compute(self, states, taken_actions): + def compute(self, states, taken_actions, role): return self.net(torch.cat([states, taken_actions], dim=1)) -# Load and wrap the Isaac Gym environment. -# The following lines are intended to support all versions (preview 2, 3 and 4). 
-# It tries to load from preview 3/4, but if it fails, it will try to load from preview 2 -try: - env = load_isaacgym_env_preview4(task_name="Cartpole") # preview 3 and 4 use the same loader -except Exception as e: - print("Isaac Gym (preview 3/4) failed: {}\nTrying preview 2...".format(e)) - env = load_isaacgym_env_preview2("Cartpole") +# Load and wrap the Isaac Gym environment +env = load_isaacgym_env_preview4(task_name="Cartpole") # preview 3 and 4 use the same loader env = wrap_env(env) device = env.device @@ -86,25 +80,28 @@ def compute(self, states, taken_actions): # Instantiate the agent's models (function approximators). # DDPG requires 4 models, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ddpg.html#spaces-and-models -models_ddpg = {"policy": DeterministicActor(env.observation_space, env.action_space, device, clip_actions=True), - "target_policy": DeterministicActor(env.observation_space, env.action_space, device, clip_actions=True), - "critic": Critic(env.observation_space, env.action_space, device), - "target_critic": Critic(env.observation_space, env.action_space, device)} +models_ddpg = {} +models_ddpg["policy"] = DeterministicActor(env.observation_space, env.action_space, device, clip_actions=True) +models_ddpg["target_policy"] = DeterministicActor(env.observation_space, env.action_space, device, clip_actions=True) +models_ddpg["critic"] = Critic(env.observation_space, env.action_space, device) +models_ddpg["target_critic"] = Critic(env.observation_space, env.action_space, device) # TD3 requires 6 models, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.td3.html#spaces-and-models -models_td3 = {"policy": DeterministicActor(env.observation_space, env.action_space, device, clip_actions=True), - "target_policy": DeterministicActor(env.observation_space, env.action_space, device, clip_actions=True), - "critic_1": Critic(env.observation_space, env.action_space, device), - "critic_2": Critic(env.observation_space, env.action_space, device), - "target_critic_1": Critic(env.observation_space, env.action_space, device), - "target_critic_2": Critic(env.observation_space, env.action_space, device)} +models_td3 = {} +models_td3["policy"] = DeterministicActor(env.observation_space, env.action_space, device, clip_actions=True) +models_td3["target_policy"] = DeterministicActor(env.observation_space, env.action_space, device, clip_actions=True) +models_td3["critic_1"] = Critic(env.observation_space, env.action_space, device) +models_td3["critic_2"] = Critic(env.observation_space, env.action_space, device) +models_td3["target_critic_1"] = Critic(env.observation_space, env.action_space, device) +models_td3["target_critic_2"] = Critic(env.observation_space, env.action_space, device) # SAC requires 5 models, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.sac.html#spaces-and-models -models_sac = {"policy": StochasticActor(env.observation_space, env.action_space, device, clip_actions=True), - "critic_1": Critic(env.observation_space, env.action_space, device), - "critic_2": Critic(env.observation_space, env.action_space, device), - "target_critic_1": Critic(env.observation_space, env.action_space, device), - "target_critic_2": Critic(env.observation_space, env.action_space, device)} +models_sac = {} +models_sac["policy"] = StochasticActor(env.observation_space, env.action_space, device, clip_actions=True) +models_sac["critic_1"] = 
Critic(env.observation_space, env.action_space, device) +models_sac["critic_2"] = Critic(env.observation_space, env.action_space, device) +models_sac["target_critic_1"] = Critic(env.observation_space, env.action_space, device) +models_sac["target_critic_2"] = Critic(env.observation_space, env.action_space, device) # Initialize the models' parameters (weights and biases) using a Gaussian distribution for model in models_ddpg.values(): diff --git a/docs/source/examples/isaacgym/isaacgym_sequential_no_shared_memory_eval.py b/docs/source/examples/isaacgym/isaacgym_sequential_no_shared_memory_eval.py index f612710d..e775c3c7 100644 --- a/docs/source/examples/isaacgym/isaacgym_sequential_no_shared_memory_eval.py +++ b/docs/source/examples/isaacgym/isaacgym_sequential_no_shared_memory_eval.py @@ -2,57 +2,51 @@ import torch import torch.nn as nn -import torch.nn.functional as F # Import the skrl components to build the RL system -from skrl.models.torch import GaussianModel, DeterministicModel +from skrl.models.torch import Model, GaussianMixin, DeterministicMixin from skrl.agents.torch.ddpg import DDPG, DDPG_DEFAULT_CONFIG from skrl.agents.torch.td3 import TD3, TD3_DEFAULT_CONFIG from skrl.agents.torch.sac import SAC, SAC_DEFAULT_CONFIG from skrl.trainers.torch import SequentialTrainer from skrl.envs.torch import wrap_env -from skrl.envs.torch import load_isaacgym_env_preview2, load_isaacgym_env_preview4 +from skrl.envs.torch import load_isaacgym_env_preview4 -# Define only the policies for evaluation -class StochasticActor(GaussianModel): +# Define only the policies for evaluation +class StochasticActor(GaussianMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False, clip_log_std=True, min_log_std=-20, max_log_std=2): - super().__init__(observation_space, action_space, device, clip_actions, - clip_log_std, min_log_std, max_log_std) - - self.linear_layer_1 = nn.Linear(self.num_observations, 32) - self.linear_layer_2 = nn.Linear(32, 32) - self.mean_action_layer = nn.Linear(32, self.num_actions) + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) + + self.net = nn.Sequential(nn.Linear(self.num_observations, 32), + nn.ELU(), + nn.Linear(32, 32), + nn.ELU(), + nn.Linear(32, self.num_actions)) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - def compute(self, states, taken_actions): - x = F.elu(self.linear_layer_1(states)) - x = F.elu(self.linear_layer_2(x)) - return torch.tanh(self.mean_action_layer(x)), self.log_std_parameter - -class DeterministicActor(DeterministicModel): - def __init__(self, observation_space, action_space, device, clip_actions = False): - super().__init__(observation_space, action_space, device, clip_actions) - - self.linear_layer_1 = nn.Linear(self.num_observations, 32) - self.linear_layer_2 = nn.Linear(32, 32) - self.action_layer = nn.Linear(32, self.num_actions) - - def compute(self, states, taken_actions): - x = F.elu(self.linear_layer_1(states)) - x = F.elu(self.linear_layer_2(x)) - return torch.tanh(self.action_layer(x)) - - -# Load and wrap the Isaac Gym environment. -# The following lines are intended to support all versions (preview 2, 3 and 4). 
-# It tries to load from preview 3/4, but if it fails, it will try to load from preview 2 -try: - env = load_isaacgym_env_preview4(task_name="Cartpole") # preview 3 and 4 use the same loader -except Exception as e: - print("Isaac Gym (preview 3/4) failed: {}\nTrying preview 2...".format(e)) - env = load_isaacgym_env_preview2("Cartpole") + def compute(self, states, taken_actions, role): + return self.net(states), self.log_std_parameter + +class DeterministicActor(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) + + self.net = nn.Sequential(nn.Linear(self.num_observations, 32), + nn.ELU(), + nn.Linear(32, 32), + nn.ELU(), + nn.Linear(32, self.num_actions)) + + def compute(self, states, taken_actions, role): + return self.net(states) + + +# Load and wrap the Isaac Gym environment +env = load_isaacgym_env_preview4(task_name="Cartpole") # preview 3 and 4 use the same loader env = wrap_env(env) device = env.device @@ -61,18 +55,16 @@ def compute(self, states, taken_actions): # Instantiate the agent's policies. # DDPG requires 4 models, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ddpg.html#spaces-and-models -models_ddpg = {"policy": DeterministicActor(env.observation_space, env.action_space, device, clip_actions=True)} +models_ddpg = {} +models_ddpg["policy"] = DeterministicActor(env.observation_space, env.action_space, device, clip_actions=True) # TD3 requires 6 models, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.td3.html#spaces-and-models -models_td3 = {"policy": DeterministicActor(env.observation_space, env.action_space, device, clip_actions=True)} +models_td3 = {} +models_td3["policy"] = DeterministicActor(env.observation_space, env.action_space, device, clip_actions=True) # SAC requires 5 models, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.sac.html#spaces-and-models -models_sac = {"policy": StochasticActor(env.observation_space, env.action_space, device, clip_actions=True)} - -# load checkpoints -models_ddpg["policy"].load("./runs/22-02-06_19-37-44-874837_DDPG/checkpoints/8000_policy.pt") -models_td3["policy"].load("./runs/22-02-06_19-28-48-436345_TD3/checkpoints/5000_policy.pt") -models_sac["policy"].load("./runs/22-02-06_19-28-48-441161_SAC/checkpoints/3000_policy.pt") +models_sac = {} +models_sac["policy"] = StochasticActor(env.observation_space, env.action_space, device, clip_actions=True) # Configure and instantiate the agents. 
@@ -117,6 +109,11 @@ def compute(self, states, taken_actions): action_space=env.action_space, device=device) +# load checkpoint (agent) +agent_ddpg.load("./runs/22-09-12_22-30-58-982355_DDPG/checkpoints/agent_8000.pt") +agent_td3.load("./runs/22-09-12_22-30-58-986295_TD3/checkpoints/agent_8000.pt") +agent_sac.load("./runs/22-09-12_22-30-58-987142_SAC/checkpoints/agent_8000.pt") + # Configure and instantiate the RL trainer cfg = {"timesteps": 8000, "headless": True} diff --git a/docs/source/examples/isaacgym/isaacgym_sequential_shared_memory.py b/docs/source/examples/isaacgym/isaacgym_sequential_shared_memory.py index 18914141..ef93756e 100644 --- a/docs/source/examples/isaacgym/isaacgym_sequential_shared_memory.py +++ b/docs/source/examples/isaacgym/isaacgym_sequential_shared_memory.py @@ -2,10 +2,9 @@ import torch import torch.nn as nn -import torch.nn.functional as F # Import the skrl components to build the RL system -from skrl.models.torch import GaussianModel, DeterministicModel +from skrl.models.torch import Model, GaussianMixin, DeterministicMixin from skrl.memories.torch import RandomMemory from skrl.agents.torch.ddpg import DDPG, DDPG_DEFAULT_CONFIG from skrl.agents.torch.td3 import TD3, TD3_DEFAULT_CONFIG @@ -13,46 +12,47 @@ from skrl.resources.noises.torch import GaussianNoise, OrnsteinUhlenbeckNoise from skrl.trainers.torch import SequentialTrainer from skrl.envs.torch import wrap_env -from skrl.envs.torch import load_isaacgym_env_preview2, load_isaacgym_env_preview4 +from skrl.envs.torch import load_isaacgym_env_preview4 -# Define the models (stochastic and deterministic models) for the agents using helper classes -# and programming with two approaches (layer by layer and torch.nn.Sequential class). +# Define the models (stochastic and deterministic models) for the agents using mixins. 
# - StochasticActor: takes as input the environment's observation/state and returns an action # - DeterministicActor: takes as input the environment's observation/state and returns an action # - Critic: takes the state and action as input and provides a value to guide the policy -class StochasticActor(GaussianModel): +class StochasticActor(GaussianMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False, clip_log_std=True, min_log_std=-20, max_log_std=2): - super().__init__(observation_space, action_space, device, clip_actions, - clip_log_std, min_log_std, max_log_std) + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) - self.linear_layer_1 = nn.Linear(self.num_observations, 32) - self.linear_layer_2 = nn.Linear(32, 32) - self.mean_action_layer = nn.Linear(32, self.num_actions) + self.net = nn.Sequential(nn.Linear(self.num_observations, 32), + nn.ELU(), + nn.Linear(32, 32), + nn.ELU(), + nn.Linear(32, self.num_actions)) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - def compute(self, states, taken_actions): - x = F.elu(self.linear_layer_1(states)) - x = F.elu(self.linear_layer_2(x)) - return torch.tanh(self.mean_action_layer(x)), self.log_std_parameter + def compute(self, states, taken_actions, role): + return self.net(states), self.log_std_parameter -class DeterministicActor(DeterministicModel): - def __init__(self, observation_space, action_space, device, clip_actions = False): - super().__init__(observation_space, action_space, device, clip_actions) +class DeterministicActor(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) - self.linear_layer_1 = nn.Linear(self.num_observations, 32) - self.linear_layer_2 = nn.Linear(32, 32) - self.action_layer = nn.Linear(32, self.num_actions) + self.net = nn.Sequential(nn.Linear(self.num_observations, 32), + nn.ELU(), + nn.Linear(32, 32), + nn.ELU(), + nn.Linear(32, self.num_actions)) - def compute(self, states, taken_actions): - x = F.elu(self.linear_layer_1(states)) - x = F.elu(self.linear_layer_2(x)) - return torch.tanh(self.action_layer(x)) + def compute(self, states, taken_actions, role): + return self.net(states) -class Critic(DeterministicModel): - def __init__(self, observation_space, action_space, device, clip_actions = False): - super().__init__(observation_space, action_space, device, clip_actions) +class Critic(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) self.net = nn.Sequential(nn.Linear(self.num_observations + self.num_actions, 32), nn.ELU(), @@ -60,18 +60,12 @@ def __init__(self, observation_space, action_space, device, clip_actions = False nn.ELU(), nn.Linear(32, 1)) - def compute(self, states, taken_actions): + def compute(self, states, taken_actions, role): return self.net(torch.cat([states, taken_actions], dim=1)) -# Load and wrap the Isaac Gym environment. -# The following lines are intended to support all versions (preview 2, 3 and 4). 
-# It tries to load from preview 3/4, but if it fails, it will try to load from preview 2 -try: - env = load_isaacgym_env_preview4(task_name="Cartpole") # preview 3 and 4 use the same loader -except Exception as e: - print("Isaac Gym (preview 3/4) failed: {}\nTrying preview 2...".format(e)) - env = load_isaacgym_env_preview2("Cartpole") +# Load and wrap the Isaac Gym environment +env = load_isaacgym_env_preview4(task_name="Cartpole") # preview 3 and 4 use the same loader env = wrap_env(env) device = env.device @@ -84,25 +78,28 @@ def compute(self, states, taken_actions): # Instantiate the agent's models (function approximators). # DDPG requires 4 models, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ddpg.html#spaces-and-models -models_ddpg = {"policy": DeterministicActor(env.observation_space, env.action_space, device, clip_actions=True), - "target_policy": DeterministicActor(env.observation_space, env.action_space, device, clip_actions=True), - "critic": Critic(env.observation_space, env.action_space, device), - "target_critic": Critic(env.observation_space, env.action_space, device)} +models_ddpg = {} +models_ddpg["policy"] = DeterministicActor(env.observation_space, env.action_space, device, clip_actions=True) +models_ddpg["target_policy"] = DeterministicActor(env.observation_space, env.action_space, device, clip_actions=True) +models_ddpg["critic"] = Critic(env.observation_space, env.action_space, device) +models_ddpg["target_critic"] = Critic(env.observation_space, env.action_space, device) # TD3 requires 6 models, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.td3.html#spaces-and-models -models_td3 = {"policy": DeterministicActor(env.observation_space, env.action_space, device, clip_actions=True), - "target_policy": DeterministicActor(env.observation_space, env.action_space, device, clip_actions=True), - "critic_1": Critic(env.observation_space, env.action_space, device), - "critic_2": Critic(env.observation_space, env.action_space, device), - "target_critic_1": Critic(env.observation_space, env.action_space, device), - "target_critic_2": Critic(env.observation_space, env.action_space, device)} +models_td3 = {} +models_td3["policy"] = DeterministicActor(env.observation_space, env.action_space, device, clip_actions=True) +models_td3["target_policy"] = DeterministicActor(env.observation_space, env.action_space, device, clip_actions=True) +models_td3["critic_1"] = Critic(env.observation_space, env.action_space, device) +models_td3["critic_2"] = Critic(env.observation_space, env.action_space, device) +models_td3["target_critic_1"] = Critic(env.observation_space, env.action_space, device) +models_td3["target_critic_2"] = Critic(env.observation_space, env.action_space, device) # SAC requires 5 models, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.sac.html#spaces-and-models -models_sac = {"policy": StochasticActor(env.observation_space, env.action_space, device, clip_actions=True), - "critic_1": Critic(env.observation_space, env.action_space, device), - "critic_2": Critic(env.observation_space, env.action_space, device), - "target_critic_1": Critic(env.observation_space, env.action_space, device), - "target_critic_2": Critic(env.observation_space, env.action_space, device)} +models_sac = {} +models_sac["policy"] = StochasticActor(env.observation_space, env.action_space, device, clip_actions=True) +models_sac["critic_1"] = 
Critic(env.observation_space, env.action_space, device) +models_sac["critic_2"] = Critic(env.observation_space, env.action_space, device) +models_sac["target_critic_1"] = Critic(env.observation_space, env.action_space, device) +models_sac["target_critic_2"] = Critic(env.observation_space, env.action_space, device) # Initialize the models' parameters (weights and biases) using a Gaussian distribution for model in models_ddpg.values(): diff --git a/docs/source/examples/isaacgym/isaacgym_sequential_shared_memory_eval.py b/docs/source/examples/isaacgym/isaacgym_sequential_shared_memory_eval.py index 6fc4d8fe..209a5d1c 100644 --- a/docs/source/examples/isaacgym/isaacgym_sequential_shared_memory_eval.py +++ b/docs/source/examples/isaacgym/isaacgym_sequential_shared_memory_eval.py @@ -2,57 +2,51 @@ import torch import torch.nn as nn -import torch.nn.functional as F # Import the skrl components to build the RL system -from skrl.models.torch import GaussianModel, DeterministicModel +from skrl.models.torch import Model, GaussianMixin, DeterministicMixin from skrl.agents.torch.ddpg import DDPG, DDPG_DEFAULT_CONFIG from skrl.agents.torch.td3 import TD3, TD3_DEFAULT_CONFIG from skrl.agents.torch.sac import SAC, SAC_DEFAULT_CONFIG from skrl.trainers.torch import SequentialTrainer from skrl.envs.torch import wrap_env -from skrl.envs.torch import load_isaacgym_env_preview2, load_isaacgym_env_preview4 +from skrl.envs.torch import load_isaacgym_env_preview4 -# Define only the policies for evaluation -class StochasticActor(GaussianModel): +# Define only the policies for evaluation +class StochasticActor(GaussianMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False, clip_log_std=True, min_log_std=-20, max_log_std=2): - super().__init__(observation_space, action_space, device, clip_actions, - clip_log_std, min_log_std, max_log_std) - - self.linear_layer_1 = nn.Linear(self.num_observations, 32) - self.linear_layer_2 = nn.Linear(32, 32) - self.mean_action_layer = nn.Linear(32, self.num_actions) + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) + + self.net = nn.Sequential(nn.Linear(self.num_observations, 32), + nn.ELU(), + nn.Linear(32, 32), + nn.ELU(), + nn.Linear(32, self.num_actions)) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - def compute(self, states, taken_actions): - x = F.elu(self.linear_layer_1(states)) - x = F.elu(self.linear_layer_2(x)) - return torch.tanh(self.mean_action_layer(x)), self.log_std_parameter - -class DeterministicActor(DeterministicModel): - def __init__(self, observation_space, action_space, device, clip_actions = False): - super().__init__(observation_space, action_space, device, clip_actions) - - self.linear_layer_1 = nn.Linear(self.num_observations, 32) - self.linear_layer_2 = nn.Linear(32, 32) - self.action_layer = nn.Linear(32, self.num_actions) - - def compute(self, states, taken_actions): - x = F.elu(self.linear_layer_1(states)) - x = F.elu(self.linear_layer_2(x)) - return torch.tanh(self.action_layer(x)) - - -# Load and wrap the Isaac Gym environment. -# The following lines are intended to support all versions (preview 2, 3 and 4). 
-# It tries to load from preview 3/4, but if it fails, it will try to load from preview 2 -try: - env = load_isaacgym_env_preview4(task_name="Cartpole") # preview 3 and 4 use the same loader -except Exception as e: - print("Isaac Gym (preview 3/4) failed: {}\nTrying preview 2...".format(e)) - env = load_isaacgym_env_preview2("Cartpole") + def compute(self, states, taken_actions, role): + return self.net(states), self.log_std_parameter + +class DeterministicActor(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) + + self.net = nn.Sequential(nn.Linear(self.num_observations, 32), + nn.ELU(), + nn.Linear(32, 32), + nn.ELU(), + nn.Linear(32, self.num_actions)) + + def compute(self, states, taken_actions, role): + return self.net(states) + + +# Load and wrap the Isaac Gym environment +env = load_isaacgym_env_preview4(task_name="Cartpole") # preview 3 and 4 use the same loader env = wrap_env(env) device = env.device @@ -61,18 +55,16 @@ def compute(self, states, taken_actions): # Instantiate the agent's policies. # DDPG requires 4 models, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ddpg.html#spaces-and-models -models_ddpg = {"policy": DeterministicActor(env.observation_space, env.action_space, device, clip_actions=True)} +models_ddpg = {} +models_ddpg["policy"] = DeterministicActor(env.observation_space, env.action_space, device, clip_actions=True) # TD3 requires 6 models, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.td3.html#spaces-and-models -models_td3 = {"policy": DeterministicActor(env.observation_space, env.action_space, device, clip_actions=True)} +models_td3 = {} +models_td3["policy"] = DeterministicActor(env.observation_space, env.action_space, device, clip_actions=True) # SAC requires 5 models, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.sac.html#spaces-and-models -models_sac = {"policy": StochasticActor(env.observation_space, env.action_space, device, clip_actions=True)} - -# load checkpoints -models_ddpg["policy"].load("./runs/22-02-06_19-37-44-874837_DDPG/checkpoints/8000_policy.pt") -models_td3["policy"].load("./runs/22-02-06_19-28-48-436345_TD3/checkpoints/5000_policy.pt") -models_sac["policy"].load("./runs/22-02-06_19-28-48-441161_SAC/checkpoints/3000_policy.pt") +models_sac = {} +models_sac["policy"] = StochasticActor(env.observation_space, env.action_space, device, clip_actions=True) # Configure and instantiate the agents. 
@@ -117,6 +109,11 @@ def compute(self, states, taken_actions): action_space=env.action_space, device=device) +# load checkpoint (agent) +agent_ddpg.load("./runs/22-09-12_22-30-58-982355_DDPG/checkpoints/agent_8000.pt") +agent_td3.load("./runs/22-09-12_22-30-58-986295_TD3/checkpoints/agent_8000.pt") +agent_sac.load("./runs/22-09-12_22-30-58-987142_SAC/checkpoints/agent_8000.pt") + # Configure and instantiate the RL trainer cfg = {"timesteps": 8000, "headless": True} diff --git a/docs/source/examples/isaacgym/ppo_allegro_hand.py b/docs/source/examples/isaacgym/ppo_allegro_hand.py index 52f793a8..927b8f5d 100644 --- a/docs/source/examples/isaacgym/ppo_allegro_hand.py +++ b/docs/source/examples/isaacgym/ppo_allegro_hand.py @@ -5,7 +5,7 @@ import torch.nn as nn # Import the skrl components to build the RL system -from skrl.models.torch import GaussianModel, DeterministicModel +from skrl.models.torch import Model, GaussianMixin, DeterministicMixin from skrl.memories.torch import RandomMemory from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG from skrl.resources.schedulers.torch import KLAdaptiveRL @@ -19,14 +19,14 @@ set_seed(42) -# Define the models (stochastic and deterministic models) for the agent using helper classes. +# Define the models (stochastic and deterministic models) for the agent using mixins. # - Policy: takes as input the environment's observation/state and returns an action # - Value: takes the state as input and provides a value to guide the policy -class Policy(GaussianModel): +class Policy(GaussianMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False, clip_log_std=True, min_log_std=-20, max_log_std=2): - super().__init__(observation_space, action_space, device, clip_actions, - clip_log_std, min_log_std, max_log_std) + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) self.net = nn.Sequential(nn.Linear(self.num_observations, 512), nn.ELU(), @@ -37,12 +37,13 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, nn.Linear(128, self.num_actions)) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - def compute(self, states, taken_actions): + def compute(self, states, taken_actions, role): return self.net(states), self.log_std_parameter -class Value(DeterministicModel): +class Value(DeterministicMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False): - super().__init__(observation_space, action_space, device, clip_actions) + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) self.net = nn.Sequential(nn.Linear(self.num_observations, 512), nn.ELU(), @@ -52,7 +53,7 @@ def __init__(self, observation_space, action_space, device, clip_actions=False): nn.ELU(), nn.Linear(128, 1)) - def compute(self, states, taken_actions): + def compute(self, states, taken_actions, role): return self.net(states) @@ -76,8 +77,9 @@ def compute(self, states, taken_actions): # Instantiate the agent's models (function approximators). 
# PPO requires 2 models, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models -models_ppo = {"policy": Policy(env.observation_space, env.action_space, device), - "value": Value(env.observation_space, env.action_space, device)} +models_ppo = {} +models_ppo["policy"] = Policy(env.observation_space, env.action_space, device) +models_ppo["value"] = Value(env.observation_space, env.action_space, device) # Initialize the models' parameters (weights and biases) using a Gaussian distribution for model in models_ppo.values(): @@ -88,7 +90,7 @@ def compute(self, states, taken_actions): # Only modify some of the default configuration, visit its documentation to see all the options # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters cfg_ppo = PPO_DEFAULT_CONFIG.copy() -cfg_ppo["rollouts"] = 8 +cfg_ppo["rollouts"] = 8 # memory_size cfg_ppo["learning_epochs"] = 5 cfg_ppo["mini_batches"] = 4 # 8 * 16384 / 32768 cfg_ppo["discount_factor"] = 0.99 diff --git a/docs/source/examples/isaacgym/ppo_ant.py b/docs/source/examples/isaacgym/ppo_ant.py index 0f8bc4cc..ad2ce0bf 100644 --- a/docs/source/examples/isaacgym/ppo_ant.py +++ b/docs/source/examples/isaacgym/ppo_ant.py @@ -4,14 +4,14 @@ import torch.nn as nn # Import the skrl components to build the RL system -from skrl.models.torch import GaussianModel, DeterministicModel +from skrl.models.torch import Model, GaussianMixin, DeterministicMixin from skrl.memories.torch import RandomMemory from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG from skrl.resources.schedulers.torch import KLAdaptiveRL from skrl.resources.preprocessors.torch import RunningStandardScaler from skrl.trainers.torch import SequentialTrainer from skrl.envs.torch import wrap_env -from skrl.envs.torch import load_isaacgym_env_preview2, load_isaacgym_env_preview4 +from skrl.envs.torch import load_isaacgym_env_preview4 from skrl.utils import set_seed @@ -19,14 +19,14 @@ set_seed(42) -# Define the models (stochastic and deterministic models) for the agent using helper classes. +# Define the models (stochastic and deterministic models) for the agent using mixins. 
# - Policy: takes as input the environment's observation/state and returns an action # - Value: takes the state as input and provides a value to guide the policy -class Policy(GaussianModel): +class Policy(GaussianMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False, clip_log_std=True, min_log_std=-20, max_log_std=2): - super().__init__(observation_space, action_space, device, clip_actions, - clip_log_std, min_log_std, max_log_std) + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) self.net = nn.Sequential(nn.Linear(self.num_observations, 256), nn.ELU(), @@ -37,12 +37,13 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, nn.Linear(64, self.num_actions)) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - def compute(self, states, taken_actions): + def compute(self, states, taken_actions, role): return self.net(states), self.log_std_parameter -class Value(DeterministicModel): +class Value(DeterministicMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False): - super().__init__(observation_space, action_space, device, clip_actions) + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) self.net = nn.Sequential(nn.Linear(self.num_observations, 256), nn.ELU(), @@ -52,18 +53,12 @@ def __init__(self, observation_space, action_space, device, clip_actions=False): nn.ELU(), nn.Linear(64, 1)) - def compute(self, states, taken_actions): + def compute(self, states, taken_actions, role): return self.net(states) -# Load and wrap the Isaac Gym environment. -# The following lines are intended to support all versions (preview 2, 3 and 4). -# It tries to load from preview 3/4, but if it fails, it will try to load from preview 2 -try: - env = load_isaacgym_env_preview4(task_name="Ant") # preview 3 and 4 use the same loader -except Exception as e: - print("Isaac Gym (preview 3/4) failed: {}\nTrying preview 2...".format(e)) - env = load_isaacgym_env_preview2("Ant") +# Load and wrap the Isaac Gym environment +env = load_isaacgym_env_preview4(task_name="Ant") # preview 3 and 4 use the same loader env = wrap_env(env) device = env.device @@ -76,8 +71,9 @@ def compute(self, states, taken_actions): # Instantiate the agent's models (function approximators). 
# PPO requires 2 models, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models -models_ppo = {"policy": Policy(env.observation_space, env.action_space, device), - "value": Value(env.observation_space, env.action_space, device)} +models_ppo = {} +models_ppo["policy"] = Policy(env.observation_space, env.action_space, device) +models_ppo["value"] = Value(env.observation_space, env.action_space, device) # Initialize the models' parameters (weights and biases) using a Gaussian distribution for model in models_ppo.values(): @@ -88,7 +84,7 @@ def compute(self, states, taken_actions): # Only modify some of the default configuration, visit its documentation to see all the options # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters cfg_ppo = PPO_DEFAULT_CONFIG.copy() -cfg_ppo["rollouts"] = 16 +cfg_ppo["rollouts"] = 16 # memory_size cfg_ppo["learning_epochs"] = 4 cfg_ppo["mini_batches"] = 2 # 16 * 4096 / 32768 cfg_ppo["discount_factor"] = 0.99 diff --git a/docs/source/examples/isaacgym/ppo_anymal.py b/docs/source/examples/isaacgym/ppo_anymal.py index c076aee1..656cc3fe 100644 --- a/docs/source/examples/isaacgym/ppo_anymal.py +++ b/docs/source/examples/isaacgym/ppo_anymal.py @@ -4,14 +4,14 @@ import torch.nn as nn # Import the skrl components to build the RL system -from skrl.models.torch import GaussianModel, DeterministicModel +from skrl.models.torch import Model, GaussianMixin, DeterministicMixin from skrl.memories.torch import RandomMemory from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG from skrl.resources.schedulers.torch import KLAdaptiveRL from skrl.resources.preprocessors.torch import RunningStandardScaler from skrl.trainers.torch import SequentialTrainer from skrl.envs.torch import wrap_env -from skrl.envs.torch import load_isaacgym_env_preview2, load_isaacgym_env_preview4 +from skrl.envs.torch import load_isaacgym_env_preview4 from skrl.utils import set_seed @@ -19,14 +19,14 @@ set_seed(42) -# Define the models (stochastic and deterministic models) for the agent using helper classes. +# Define the models (stochastic and deterministic models) for the agent using mixins. 
# - Policy: takes as input the environment's observation/state and returns an action # - Value: takes the state as input and provides a value to guide the policy -class Policy(GaussianModel): +class Policy(GaussianMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False, clip_log_std=True, min_log_std=-20, max_log_std=2): - super().__init__(observation_space, action_space, device, clip_actions, - clip_log_std, min_log_std, max_log_std) + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) self.net = nn.Sequential(nn.Linear(self.num_observations, 256), nn.ELU(), @@ -37,12 +37,13 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, nn.Linear(64, self.num_actions)) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - def compute(self, states, taken_actions): + def compute(self, states, taken_actions, role): return self.net(states), self.log_std_parameter -class Value(DeterministicModel): +class Value(DeterministicMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False): - super().__init__(observation_space, action_space, device, clip_actions) + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) self.net = nn.Sequential(nn.Linear(self.num_observations, 256), nn.ELU(), @@ -52,18 +53,12 @@ def __init__(self, observation_space, action_space, device, clip_actions=False): nn.ELU(), nn.Linear(64, 1)) - def compute(self, states, taken_actions): + def compute(self, states, taken_actions, role): return self.net(states) -# Load and wrap the Isaac Gym environment. -# The following lines are intended to support all versions (preview 2, 3 and 4). -# It tries to load from preview 3/4, but if it fails, it will try to load from preview 2 -try: - env = load_isaacgym_env_preview4(task_name="Anymal") # preview 3 and 4 use the same loader -except Exception as e: - print("Isaac Gym (preview 3/4) failed: {}\nTrying preview 2...".format(e)) - env = load_isaacgym_env_preview2("Anymal") +# Load and wrap the Isaac Gym environment +env = load_isaacgym_env_preview4(task_name="Anymal") # preview 3 and 4 use the same loader env = wrap_env(env) device = env.device @@ -76,8 +71,9 @@ def compute(self, states, taken_actions): # Instantiate the agent's models (function approximators). 
# PPO requires 2 models, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models -models_ppo = {"policy": Policy(env.observation_space, env.action_space, device), - "value": Value(env.observation_space, env.action_space, device)} +models_ppo = {} +models_ppo["policy"] = Policy(env.observation_space, env.action_space, device) +models_ppo["value"] = Value(env.observation_space, env.action_space, device) # Initialize the models' parameters (weights and biases) using a Gaussian distribution for model in models_ppo.values(): @@ -88,7 +84,7 @@ def compute(self, states, taken_actions): # Only modify some of the default configuration, visit its documentation to see all the options # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters cfg_ppo = PPO_DEFAULT_CONFIG.copy() -cfg_ppo["rollouts"] = 24 +cfg_ppo["rollouts"] = 24 # memory_size cfg_ppo["learning_epochs"] = 5 cfg_ppo["mini_batches"] = 3 # 24 * 4096 / 32768 cfg_ppo["discount_factor"] = 0.99 diff --git a/docs/source/examples/isaacgym/ppo_anymal_terrain.py b/docs/source/examples/isaacgym/ppo_anymal_terrain.py index 0b36e0bf..d64118f2 100644 --- a/docs/source/examples/isaacgym/ppo_anymal_terrain.py +++ b/docs/source/examples/isaacgym/ppo_anymal_terrain.py @@ -4,14 +4,14 @@ import torch.nn as nn # Import the skrl components to build the RL system -from skrl.models.torch import GaussianModel, DeterministicModel +from skrl.models.torch import Model, GaussianMixin, DeterministicMixin from skrl.memories.torch import RandomMemory from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG from skrl.resources.schedulers.torch import KLAdaptiveRL from skrl.resources.preprocessors.torch import RunningStandardScaler from skrl.trainers.torch import SequentialTrainer from skrl.envs.torch import wrap_env -from skrl.envs.torch import load_isaacgym_env_preview2, load_isaacgym_env_preview4 +from skrl.envs.torch import load_isaacgym_env_preview4 from skrl.utils import set_seed @@ -19,14 +19,14 @@ set_seed(42) -# Define the models (stochastic and deterministic models) for the agent using helper classes. +# Define the models (stochastic and deterministic models) for the agent using mixins. 
# - Policy: takes as input the environment's observation/state and returns an action # - Value: takes the state as input and provides a value to guide the policy -class Policy(GaussianModel): +class Policy(GaussianMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False, clip_log_std=True, min_log_std=-20, max_log_std=2): - super().__init__(observation_space, action_space, device, clip_actions, - clip_log_std, min_log_std, max_log_std) + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) self.net = nn.Sequential(nn.Linear(self.num_observations, 512), nn.ELU(), @@ -37,12 +37,13 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, nn.Linear(128, self.num_actions)) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - def compute(self, states, taken_actions): + def compute(self, states, taken_actions, role): return self.net(states), self.log_std_parameter -class Value(DeterministicModel): +class Value(DeterministicMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False): - super().__init__(observation_space, action_space, device, clip_actions) + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) self.net = nn.Sequential(nn.Linear(self.num_observations, 512), nn.ELU(), @@ -52,18 +53,12 @@ def __init__(self, observation_space, action_space, device, clip_actions=False): nn.ELU(), nn.Linear(128, 1)) - def compute(self, states, taken_actions): + def compute(self, states, taken_actions, role): return self.net(states) -# Load and wrap the Isaac Gym environment. -# The following lines are intended to support all versions (preview 2, 3 and 4). -# It tries to load from preview 3/4, but if it fails, it will try to load from preview 2 -try: - env = load_isaacgym_env_preview4(task_name="AnymalTerrain") # preview 3 and 4 use the same loader -except Exception as e: - print("Isaac Gym (preview 3/4) failed: {}\nTrying preview 2...".format(e)) - env = load_isaacgym_env_preview2("AnymalTerrain") +# Load and wrap the Isaac Gym environment +env = load_isaacgym_env_preview4(task_name="AnymalTerrain") # preview 3 and 4 use the same loader env = wrap_env(env) device = env.device @@ -76,8 +71,9 @@ def compute(self, states, taken_actions): # Instantiate the agent's models (function approximators). 
# PPO requires 2 models, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models -models_ppo = {"policy": Policy(env.observation_space, env.action_space, device), - "value": Value(env.observation_space, env.action_space, device)} +models_ppo = {} +models_ppo["policy"] = Policy(env.observation_space, env.action_space, device) +models_ppo["value"] = Value(env.observation_space, env.action_space, device) # Initialize the models' parameters (weights and biases) using a Gaussian distribution for model in models_ppo.values(): @@ -88,7 +84,7 @@ def compute(self, states, taken_actions): # Only modify some of the default configuration, visit its documentation to see all the options # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters cfg_ppo = PPO_DEFAULT_CONFIG.copy() -cfg_ppo["rollouts"] = 24 +cfg_ppo["rollouts"] = 24 # memory_size cfg_ppo["learning_epochs"] = 5 cfg_ppo["mini_batches"] = 6 # 24 * 4096 / 16384 cfg_ppo["discount_factor"] = 0.99 diff --git a/docs/source/examples/isaacgym/ppo_ball_balance.py b/docs/source/examples/isaacgym/ppo_ball_balance.py index 84e9999a..620af8ce 100644 --- a/docs/source/examples/isaacgym/ppo_ball_balance.py +++ b/docs/source/examples/isaacgym/ppo_ball_balance.py @@ -4,14 +4,14 @@ import torch.nn as nn # Import the skrl components to build the RL system -from skrl.models.torch import GaussianModel, DeterministicModel +from skrl.models.torch import Model, GaussianMixin, DeterministicMixin from skrl.memories.torch import RandomMemory from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG from skrl.resources.schedulers.torch import KLAdaptiveRL from skrl.resources.preprocessors.torch import RunningStandardScaler from skrl.trainers.torch import SequentialTrainer from skrl.envs.torch import wrap_env -from skrl.envs.torch import load_isaacgym_env_preview2, load_isaacgym_env_preview4 +from skrl.envs.torch import load_isaacgym_env_preview4 from skrl.utils import set_seed @@ -19,14 +19,14 @@ set_seed(42) -# Define the models (stochastic and deterministic models) for the agent using helper classes. +# Define the models (stochastic and deterministic models) for the agent using mixins. 
# - Policy: takes as input the environment's observation/state and returns an action # - Value: takes the state as input and provides a value to guide the policy -class Policy(GaussianModel): +class Policy(GaussianMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False, clip_log_std=True, min_log_std=-20, max_log_std=2): - super().__init__(observation_space, action_space, device, clip_actions, - clip_log_std, min_log_std, max_log_std) + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) self.net = nn.Sequential(nn.Linear(self.num_observations, 128), nn.ELU(), @@ -37,12 +37,13 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, nn.Linear(32, self.num_actions)) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - def compute(self, states, taken_actions): + def compute(self, states, taken_actions, role): return self.net(states), self.log_std_parameter -class Value(DeterministicModel): +class Value(DeterministicMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False): - super().__init__(observation_space, action_space, device, clip_actions) + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) self.net = nn.Sequential(nn.Linear(self.num_observations, 128), nn.ELU(), @@ -52,18 +53,12 @@ def __init__(self, observation_space, action_space, device, clip_actions=False): nn.ELU(), nn.Linear(32, 1)) - def compute(self, states, taken_actions): + def compute(self, states, taken_actions, role): return self.net(states) -# Load and wrap the Isaac Gym environment. -# The following lines are intended to support all versions (preview 2, 3 and 4). -# It tries to load from preview 3/4, but if it fails, it will try to load from preview 2 -try: - env = load_isaacgym_env_preview4(task_name="BallBalance") # preview 3 and 4 use the same loader -except Exception as e: - print("Isaac Gym (preview 3/4) failed: {}\nTrying preview 2...".format(e)) - env = load_isaacgym_env_preview2("BallBalance") +# Load and wrap the Isaac Gym environment +env = load_isaacgym_env_preview4(task_name="BallBalance") # preview 3 and 4 use the same loader env = wrap_env(env) device = env.device @@ -80,15 +75,16 @@ def compute(self, states, taken_actions): "value": Value(env.observation_space, env.action_space, device)} # Initialize the models' parameters (weights and biases) using a Gaussian distribution -for model in models_ppo.values(): - model.init_parameters(method_name="normal_", mean=0.0, std=0.1) +models_ppo = {} +models_ppo["policy"] = Policy(env.observation_space, env.action_space, device) +models_ppo["value"] = Value(env.observation_space, env.action_space, device) # Configure and instantiate the agent. 
# Only modify some of the default configuration, visit its documentation to see all the options # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters cfg_ppo = PPO_DEFAULT_CONFIG.copy() -cfg_ppo["rollouts"] = 16 +cfg_ppo["rollouts"] = 16 # memory_size cfg_ppo["learning_epochs"] = 8 cfg_ppo["mini_batches"] = 8 # 16 * 4096 / 8192 cfg_ppo["discount_factor"] = 0.99 diff --git a/docs/source/examples/isaacgym/ppo_cartpole.py b/docs/source/examples/isaacgym/ppo_cartpole.py index 260a298e..9525453c 100644 --- a/docs/source/examples/isaacgym/ppo_cartpole.py +++ b/docs/source/examples/isaacgym/ppo_cartpole.py @@ -4,14 +4,14 @@ import torch.nn as nn # Import the skrl components to build the RL system -from skrl.models.torch import GaussianModel, DeterministicModel +from skrl.models.torch import Model, GaussianMixin, DeterministicMixin from skrl.memories.torch import RandomMemory from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG from skrl.resources.schedulers.torch import KLAdaptiveRL from skrl.resources.preprocessors.torch import RunningStandardScaler from skrl.trainers.torch import SequentialTrainer from skrl.envs.torch import wrap_env -from skrl.envs.torch import load_isaacgym_env_preview2, load_isaacgym_env_preview4 +from skrl.envs.torch import load_isaacgym_env_preview4 from skrl.utils import set_seed @@ -19,14 +19,14 @@ set_seed(42) -# Define the models (stochastic and deterministic models) for the agent using helper classes. +# Define the models (stochastic and deterministic models) for the agent using mixins. # - Policy: takes as input the environment's observation/state and returns an action # - Value: takes the state as input and provides a value to guide the policy -class Policy(GaussianModel): +class Policy(GaussianMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False, clip_log_std=True, min_log_std=-20, max_log_std=2): - super().__init__(observation_space, action_space, device, clip_actions, - clip_log_std, min_log_std, max_log_std) + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) self.net = nn.Sequential(nn.Linear(self.num_observations, 32), nn.ELU(), @@ -35,12 +35,13 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, nn.Linear(32, self.num_actions)) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - def compute(self, states, taken_actions): + def compute(self, states, taken_actions, role): return self.net(states), self.log_std_parameter -class Value(DeterministicModel): +class Value(DeterministicMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False): - super().__init__(observation_space, action_space, device, clip_actions) + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) self.net = nn.Sequential(nn.Linear(self.num_observations, 32), nn.ELU(), @@ -48,18 +49,12 @@ def __init__(self, observation_space, action_space, device, clip_actions=False): nn.ELU(), nn.Linear(32, 1)) - def compute(self, states, taken_actions): + def compute(self, states, taken_actions, role): return self.net(states) -# Load and wrap the Isaac Gym environment. -# The following lines are intended to support all versions (preview 2, 3 and 4). 
-# It tries to load from preview 3/4, but if it fails, it will try to load from preview 2 -try: - env = load_isaacgym_env_preview4(task_name="Cartpole") # preview 3 and 4 use the same loader -except Exception as e: - print("Isaac Gym (preview 3/4) failed: {}\nTrying preview 2...".format(e)) - env = load_isaacgym_env_preview2("Cartpole") +# Load and wrap the Isaac Gym environment +env = load_isaacgym_env_preview4(task_name="Cartpole") # preview 3 and 4 use the same loader env = wrap_env(env) device = env.device @@ -72,8 +67,9 @@ def compute(self, states, taken_actions): # Instantiate the agent's models (function approximators). # PPO requires 2 models, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models -models_ppo = {"policy": Policy(env.observation_space, env.action_space, device), - "value": Value(env.observation_space, env.action_space, device)} +models_ppo = {} +models_ppo["policy"] = Policy(env.observation_space, env.action_space, device) +models_ppo["value"] = Value(env.observation_space, env.action_space, device) # Initialize the models' parameters (weights and biases) using a Gaussian distribution for model in models_ppo.values(): @@ -84,7 +80,7 @@ def compute(self, states, taken_actions): # Only modify some of the default configuration, visit its documentation to see all the options # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters cfg_ppo = PPO_DEFAULT_CONFIG.copy() -cfg_ppo["rollouts"] = 16 +cfg_ppo["rollouts"] = 16 # memory_size cfg_ppo["learning_epochs"] = 8 cfg_ppo["mini_batches"] = 1 # 16 * 512 / 8192 cfg_ppo["discount_factor"] = 0.99 diff --git a/docs/source/examples/isaacgym/ppo_cartpole_eval.py b/docs/source/examples/isaacgym/ppo_cartpole_eval.py index 2c1c4a02..3ed1ff63 100644 --- a/docs/source/examples/isaacgym/ppo_cartpole_eval.py +++ b/docs/source/examples/isaacgym/ppo_cartpole_eval.py @@ -2,42 +2,36 @@ import torch import torch.nn as nn -import torch.nn.functional as F # Import the skrl components to build the RL system -from skrl.models.torch import GaussianModel +from skrl.models.torch import Model, GaussianMixin from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG +from skrl.resources.preprocessors.torch import RunningStandardScaler from skrl.trainers.torch import SequentialTrainer from skrl.envs.torch import wrap_env -from skrl.envs.torch import load_isaacgym_env_preview2, load_isaacgym_env_preview4 +from skrl.envs.torch import load_isaacgym_env_preview4 # Define only the policy for evaluation -class Policy(GaussianModel): +class Policy(GaussianMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False, clip_log_std=True, min_log_std=-20, max_log_std=2): - super().__init__(observation_space, action_space, device, clip_actions, - clip_log_std, min_log_std, max_log_std) - - self.linear_layer_1 = nn.Linear(self.num_observations, 32) - self.linear_layer_2 = nn.Linear(32, 32) - self.mean_action_layer = nn.Linear(32, self.num_actions) + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) + + self.net = nn.Sequential(nn.Linear(self.num_observations, 32), + nn.ELU(), + nn.Linear(32, 32), + nn.ELU(), + nn.Linear(32, self.num_actions)) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - def compute(self, states, taken_actions): - x = F.elu(self.linear_layer_1(states)) - x = 
F.elu(self.linear_layer_2(x)) - return torch.tanh(self.mean_action_layer(x)), self.log_std_parameter + def compute(self, states, taken_actions, role): + return self.net(states), self.log_std_parameter -# Load and wrap the Isaac Gym environment. -# The following lines are intended to support all versions (preview 2, 3 and 4). -# It tries to load from preview 3/4, but if it fails, it will try to load from preview 2 -try: - env = load_isaacgym_env_preview4(task_name="Cartpole") # preview 3 and 4 use the same loader -except Exception as e: - print("Isaac Gym (preview 3/4) failed: {}\nTrying preview 2...".format(e)) - env = load_isaacgym_env_preview2("Cartpole") +# Load and wrap the Isaac Gym environment +env = load_isaacgym_env_preview4(task_name="Cartpole") # preview 3 and 4 use the same loader env = wrap_env(env) device = env.device @@ -46,16 +40,16 @@ def compute(self, states, taken_actions): # Instantiate the agent's policy. # PPO requires 2 models, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models -models_ppo = {"policy": Policy(env.observation_space, env.action_space, device, clip_actions=True)} - -# load checkpoint -models_ppo["policy"].load("./runs/22-02-06_19-42-39-313520_PPO/checkpoints/8000_policy.pt") +models_ppo = {} +models_ppo["policy"] = Policy(env.observation_space, env.action_space, device) # Configure and instantiate the agent. # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters cfg_ppo = PPO_DEFAULT_CONFIG.copy() cfg_ppo["random_timesteps"] = 0 +cfg_ppo["state_preprocessor"] = RunningStandardScaler +cfg_ppo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device} # logging to TensorBoard each 16 timesteps and ignore checkpoints cfg_ppo["experiment"]["write_interval"] = 16 cfg_ppo["experiment"]["checkpoint_interval"] = 0 @@ -67,9 +61,12 @@ def compute(self, states, taken_actions): action_space=env.action_space, device=device) +# load checkpoint (agent) +agent.load("./runs/22-09-12_18-56-10-110956_PPO/checkpoints/agent_1600.pt") + # Configure and instantiate the RL trainer -cfg_trainer = {"timesteps": 8000, "headless": True} +cfg_trainer = {"timesteps": 1600, "headless": True} trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent) # evaluate the agent diff --git a/docs/source/examples/isaacgym/ppo_franka_cabinet.py b/docs/source/examples/isaacgym/ppo_franka_cabinet.py index f2104a67..690ad036 100644 --- a/docs/source/examples/isaacgym/ppo_franka_cabinet.py +++ b/docs/source/examples/isaacgym/ppo_franka_cabinet.py @@ -4,14 +4,14 @@ import torch.nn as nn # Import the skrl components to build the RL system -from skrl.models.torch import GaussianModel, DeterministicModel +from skrl.models.torch import Model, GaussianMixin, DeterministicMixin from skrl.memories.torch import RandomMemory from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG from skrl.resources.schedulers.torch import KLAdaptiveRL from skrl.resources.preprocessors.torch import RunningStandardScaler from skrl.trainers.torch import SequentialTrainer from skrl.envs.torch import wrap_env -from skrl.envs.torch import load_isaacgym_env_preview2, load_isaacgym_env_preview4 +from skrl.envs.torch import load_isaacgym_env_preview4 from skrl.utils import set_seed @@ -19,14 +19,14 @@ set_seed(42) -# Define the models (stochastic and deterministic models) for the agent using helper classes. 
+# Define the models (stochastic and deterministic models) for the agent using mixins. # - Policy: takes as input the environment's observation/state and returns an action # - Value: takes the state as input and provides a value to guide the policy -class Policy(GaussianModel): +class Policy(GaussianMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False, clip_log_std=True, min_log_std=-20, max_log_std=2): - super().__init__(observation_space, action_space, device, clip_actions, - clip_log_std, min_log_std, max_log_std) + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) self.net = nn.Sequential(nn.Linear(self.num_observations, 256), nn.ELU(), @@ -37,12 +37,13 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, nn.Linear(64, self.num_actions)) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - def compute(self, states, taken_actions): + def compute(self, states, taken_actions, role): return self.net(states), self.log_std_parameter -class Value(DeterministicModel): +class Value(DeterministicMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False): - super().__init__(observation_space, action_space, device, clip_actions) + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) self.net = nn.Sequential(nn.Linear(self.num_observations, 256), nn.ELU(), @@ -52,18 +53,12 @@ def __init__(self, observation_space, action_space, device, clip_actions=False): nn.ELU(), nn.Linear(64, 1)) - def compute(self, states, taken_actions): + def compute(self, states, taken_actions, role): return self.net(states) -# Load and wrap the Isaac Gym environment. -# The following lines are intended to support all versions (preview 2, 3 and 4). -# It tries to load from preview 3/4, but if it fails, it will try to load from preview 2 -try: - env = load_isaacgym_env_preview4(task_name="FrankaCabinet") # preview 3 and 4 use the same loader -except Exception as e: - print("Isaac Gym (preview 3/4) failed: {}\nTrying preview 2...".format(e)) - env = load_isaacgym_env_preview2("FrankaCabinet") +# Load and wrap the Isaac Gym environment +env = load_isaacgym_env_preview4(task_name="FrankaCabinet") # preview 3 and 4 use the same loader env = wrap_env(env) device = env.device @@ -76,8 +71,9 @@ def compute(self, states, taken_actions): # Instantiate the agent's models (function approximators). 
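# [editor's note] These example updates assume Isaac Gym preview 3/4 only and drop the preview 2
# fallback removed in the hunk above. If you still rely on preview 2, the previously shipped
# guard can be kept locally (sketch below, adapted for FrankaCabinet; it assumes your installed
# skrl version still exports `load_isaacgym_env_preview2`):
from skrl.envs.torch import wrap_env
from skrl.envs.torch import load_isaacgym_env_preview2, load_isaacgym_env_preview4

try:
    env = load_isaacgym_env_preview4(task_name="FrankaCabinet")   # preview 3 and 4 use the same loader
except Exception as e:
    print("Isaac Gym (preview 3/4) failed: {}\nTrying preview 2...".format(e))
    env = load_isaacgym_env_preview2("FrankaCabinet")
env = wrap_env(env)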
# PPO requires 2 models, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models -models_ppo = {"policy": Policy(env.observation_space, env.action_space, device), - "value": Value(env.observation_space, env.action_space, device)} +models_ppo = {} +models_ppo["policy"] = Policy(env.observation_space, env.action_space, device) +models_ppo["value"] = Value(env.observation_space, env.action_space, device) # Initialize the models' parameters (weights and biases) using a Gaussian distribution for model in models_ppo.values(): @@ -88,7 +84,7 @@ def compute(self, states, taken_actions): # Only modify some of the default configuration, visit its documentation to see all the options # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters cfg_ppo = PPO_DEFAULT_CONFIG.copy() -cfg_ppo["rollouts"] = 16 +cfg_ppo["rollouts"] = 16 # memory_size cfg_ppo["learning_epochs"] = 8 cfg_ppo["mini_batches"] = 8 # 16 * 4096 / 8192 cfg_ppo["discount_factor"] = 0.99 diff --git a/docs/source/examples/isaacgym/ppo_humanoid.py b/docs/source/examples/isaacgym/ppo_humanoid.py index 4fa4d43d..1a1272f9 100644 --- a/docs/source/examples/isaacgym/ppo_humanoid.py +++ b/docs/source/examples/isaacgym/ppo_humanoid.py @@ -4,14 +4,14 @@ import torch.nn as nn # Import the skrl components to build the RL system -from skrl.models.torch import GaussianModel, DeterministicModel +from skrl.models.torch import Model, GaussianMixin, DeterministicMixin from skrl.memories.torch import RandomMemory from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG from skrl.resources.schedulers.torch import KLAdaptiveRL from skrl.resources.preprocessors.torch import RunningStandardScaler from skrl.trainers.torch import SequentialTrainer from skrl.envs.torch import wrap_env -from skrl.envs.torch import load_isaacgym_env_preview2, load_isaacgym_env_preview4 +from skrl.envs.torch import load_isaacgym_env_preview4 from skrl.utils import set_seed @@ -19,14 +19,14 @@ set_seed(42) -# Define the models (stochastic and deterministic models) for the agent using helper classes. +# Define the models (stochastic and deterministic models) for the agent using mixins. 
# - Policy: takes as input the environment's observation/state and returns an action # - Value: takes the state as input and provides a value to guide the policy -class Policy(GaussianModel): +class Policy(GaussianMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False, clip_log_std=True, min_log_std=-20, max_log_std=2): - super().__init__(observation_space, action_space, device, clip_actions, - clip_log_std, min_log_std, max_log_std) + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) self.net = nn.Sequential(nn.Linear(self.num_observations, 400), nn.ELU(), @@ -37,12 +37,13 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, nn.Linear(100, self.num_actions)) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - def compute(self, states, taken_actions): + def compute(self, states, taken_actions, role): return self.net(states), self.log_std_parameter -class Value(DeterministicModel): +class Value(DeterministicMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False): - super().__init__(observation_space, action_space, device, clip_actions) + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) self.net = nn.Sequential(nn.Linear(self.num_observations, 400), nn.ELU(), @@ -52,18 +53,12 @@ def __init__(self, observation_space, action_space, device, clip_actions=False): nn.ELU(), nn.Linear(100, 1)) - def compute(self, states, taken_actions): + def compute(self, states, taken_actions, role): return self.net(states) -# Load and wrap the Isaac Gym environment. -# The following lines are intended to support all versions (preview 2, 3 and 4). -# It tries to load from preview 3/4, but if it fails, it will try to load from preview 2 -try: - env = load_isaacgym_env_preview4(task_name="Humanoid") # preview 3 and 4 use the same loader -except Exception as e: - print("Isaac Gym (preview 3/4) failed: {}\nTrying preview 2...".format(e)) - env = load_isaacgym_env_preview2("Humanoid") +# Load and wrap the Isaac Gym environment +env = load_isaacgym_env_preview4(task_name="Humanoid") # preview 3 and 4 use the same loader env = wrap_env(env) device = env.device @@ -76,8 +71,9 @@ def compute(self, states, taken_actions): # Instantiate the agent's models (function approximators). 
# PPO requires 2 models, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models -models_ppo = {"policy": Policy(env.observation_space, env.action_space, device), - "value": Value(env.observation_space, env.action_space, device)} +models_ppo = {} +models_ppo["policy"] = Policy(env.observation_space, env.action_space, device) +models_ppo["value"] = Value(env.observation_space, env.action_space, device) # Initialize the models' parameters (weights and biases) using a Gaussian distribution for model in models_ppo.values(): @@ -88,7 +84,7 @@ def compute(self, states, taken_actions): # Only modify some of the default configuration, visit its documentation to see all the options # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters cfg_ppo = PPO_DEFAULT_CONFIG.copy() -cfg_ppo["rollouts"] = 32 +cfg_ppo["rollouts"] = 32 # memory_size cfg_ppo["learning_epochs"] = 5 cfg_ppo["mini_batches"] = 4 # 32 * 4096 / 32768 cfg_ppo["discount_factor"] = 0.99 diff --git a/docs/source/examples/isaacgym/ppo_ingenuity.py b/docs/source/examples/isaacgym/ppo_ingenuity.py index b21af817..84c7570b 100644 --- a/docs/source/examples/isaacgym/ppo_ingenuity.py +++ b/docs/source/examples/isaacgym/ppo_ingenuity.py @@ -5,7 +5,7 @@ import torch.nn as nn # Import the skrl components to build the RL system -from skrl.models.torch import GaussianModel, DeterministicModel +from skrl.models.torch import Model, GaussianMixin, DeterministicMixin from skrl.memories.torch import RandomMemory from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG from skrl.resources.schedulers.torch import KLAdaptiveRL @@ -19,14 +19,14 @@ set_seed(42) -# Define the models (stochastic and deterministic models) for the agent using helper classes. +# Define the models (stochastic and deterministic models) for the agent using mixins. 
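# [editor's note] The "# memory_size" and "# 32 * 4096 / 32768" comments added around the PPO
# configuration above document how the rl_games hyperparameters are mapped: `rollouts` is the
# per-environment memory size, and `mini_batches` is rollouts * num_envs / minibatch_size.
# A small sanity check for the Humanoid values (4096 parallel environments is an assumption
# taken from the comment; the actual count comes from the task configuration):
rollouts = 32           # transitions collected per environment before each update
num_envs = 4096         # parallel environments (assumed from the comment)
minibatch_size = 32768  # rl_games-style minibatch size being matched
mini_batches = rollouts * num_envs // minibatch_size
assert mini_batches == 4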
# - Policy: takes as input the environment's observation/state and returns an action # - Value: takes the state as input and provides a value to guide the policy -class Policy(GaussianModel): +class Policy(GaussianMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False, clip_log_std=True, min_log_std=-20, max_log_std=2): - super().__init__(observation_space, action_space, device, clip_actions, - clip_log_std, min_log_std, max_log_std) + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) self.net = nn.Sequential(nn.Linear(self.num_observations, 256), nn.ELU(), @@ -37,12 +37,13 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, nn.Linear(128, self.num_actions)) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - def compute(self, states, taken_actions): + def compute(self, states, taken_actions, role): return self.net(states), self.log_std_parameter -class Value(DeterministicModel): +class Value(DeterministicMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False): - super().__init__(observation_space, action_space, device, clip_actions) + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) self.net = nn.Sequential(nn.Linear(self.num_observations, 256), nn.ELU(), @@ -52,7 +53,7 @@ def __init__(self, observation_space, action_space, device, clip_actions=False): nn.ELU(), nn.Linear(128, 1)) - def compute(self, states, taken_actions): + def compute(self, states, taken_actions, role): return self.net(states) @@ -76,8 +77,9 @@ def compute(self, states, taken_actions): # Instantiate the agent's models (function approximators). 
# PPO requires 2 models, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models -models_ppo = {"policy": Policy(env.observation_space, env.action_space, device), - "value": Value(env.observation_space, env.action_space, device)} +models_ppo = {} +models_ppo["policy"] = Policy(env.observation_space, env.action_space, device) +models_ppo["value"] = Value(env.observation_space, env.action_space, device) # Initialize the models' parameters (weights and biases) using a Gaussian distribution for model in models_ppo.values(): @@ -88,7 +90,7 @@ def compute(self, states, taken_actions): # Only modify some of the default configuration, visit its documentation to see all the options # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters cfg_ppo = PPO_DEFAULT_CONFIG.copy() -cfg_ppo["rollouts"] = 16 +cfg_ppo["rollouts"] = 16 # memory_size cfg_ppo["learning_epochs"] = 8 cfg_ppo["mini_batches"] = 4 # 16 * 4096 / 16384 cfg_ppo["discount_factor"] = 0.99 diff --git a/docs/source/examples/isaacgym/ppo_quadcopter.py b/docs/source/examples/isaacgym/ppo_quadcopter.py index 883e31f4..06289885 100644 --- a/docs/source/examples/isaacgym/ppo_quadcopter.py +++ b/docs/source/examples/isaacgym/ppo_quadcopter.py @@ -4,14 +4,14 @@ import torch.nn as nn # Import the skrl components to build the RL system -from skrl.models.torch import GaussianModel, DeterministicModel +from skrl.models.torch import Model, GaussianMixin, DeterministicMixin from skrl.memories.torch import RandomMemory from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG from skrl.resources.schedulers.torch import KLAdaptiveRL from skrl.resources.preprocessors.torch import RunningStandardScaler from skrl.trainers.torch import SequentialTrainer from skrl.envs.torch import wrap_env -from skrl.envs.torch import load_isaacgym_env_preview2, load_isaacgym_env_preview4 +from skrl.envs.torch import load_isaacgym_env_preview4 from skrl.utils import set_seed @@ -19,14 +19,14 @@ set_seed(42) -# Define the models (stochastic and deterministic models) for the agent using helper classes. +# Define the models (stochastic and deterministic models) for the agent using mixins. 
# - Policy: takes as input the environment's observation/state and returns an action # - Value: takes the state as input and provides a value to guide the policy -class Policy(GaussianModel): +class Policy(GaussianMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False, clip_log_std=True, min_log_std=-20, max_log_std=2): - super().__init__(observation_space, action_space, device, clip_actions, - clip_log_std, min_log_std, max_log_std) + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) self.net = nn.Sequential(nn.Linear(self.num_observations, 256), nn.ELU(), @@ -37,12 +37,13 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, nn.Linear(128, self.num_actions)) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - def compute(self, states, taken_actions): + def compute(self, states, taken_actions, role): return self.net(states), self.log_std_parameter -class Value(DeterministicModel): +class Value(DeterministicMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False): - super().__init__(observation_space, action_space, device, clip_actions) + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) self.net = nn.Sequential(nn.Linear(self.num_observations, 256), nn.ELU(), @@ -52,18 +53,12 @@ def __init__(self, observation_space, action_space, device, clip_actions=False): nn.ELU(), nn.Linear(128, 1)) - def compute(self, states, taken_actions): + def compute(self, states, taken_actions, role): return self.net(states) -# Load and wrap the Isaac Gym environment. -# The following lines are intended to support all versions (preview 2, 3 and 4). -# It tries to load from preview 3/4, but if it fails, it will try to load from preview 2 -try: - env = load_isaacgym_env_preview4(task_name="Quadcopter") # preview 3 and 4 use the same loader -except Exception as e: - print("Isaac Gym (preview 3/4) failed: {}\nTrying preview 2...".format(e)) - env = load_isaacgym_env_preview2("Quadcopter") +# Load and wrap the Isaac Gym environment +env = load_isaacgym_env_preview4(task_name="Quadcopter") # preview 3 and 4 use the same loader env = wrap_env(env) device = env.device @@ -76,8 +71,9 @@ def compute(self, states, taken_actions): # Instantiate the agent's models (function approximators). 
# PPO requires 2 models, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models -models_ppo = {"policy": Policy(env.observation_space, env.action_space, device), - "value": Value(env.observation_space, env.action_space, device)} +models_ppo = {} +models_ppo["policy"] = Policy(env.observation_space, env.action_space, device) +models_ppo["value"] = Value(env.observation_space, env.action_space, device) # Initialize the models' parameters (weights and biases) using a Gaussian distribution for model in models_ppo.values(): @@ -88,7 +84,7 @@ def compute(self, states, taken_actions): # Only modify some of the default configuration, visit its documentation to see all the options # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters cfg_ppo = PPO_DEFAULT_CONFIG.copy() -cfg_ppo["rollouts"] = 8 +cfg_ppo["rollouts"] = 8 # memory_size cfg_ppo["learning_epochs"] = 8 cfg_ppo["mini_batches"] = 4 # 8 * 8192 / 16384 cfg_ppo["discount_factor"] = 0.99 diff --git a/docs/source/examples/isaacgym/ppo_shadow_hand.py b/docs/source/examples/isaacgym/ppo_shadow_hand.py index 24f29a7f..2f184912 100644 --- a/docs/source/examples/isaacgym/ppo_shadow_hand.py +++ b/docs/source/examples/isaacgym/ppo_shadow_hand.py @@ -4,14 +4,14 @@ import torch.nn as nn # Import the skrl components to build the RL system -from skrl.models.torch import GaussianModel, DeterministicModel +from skrl.models.torch import Model, GaussianMixin, DeterministicMixin from skrl.memories.torch import RandomMemory from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG from skrl.resources.schedulers.torch import KLAdaptiveRL from skrl.resources.preprocessors.torch import RunningStandardScaler from skrl.trainers.torch import SequentialTrainer from skrl.envs.torch import wrap_env -from skrl.envs.torch import load_isaacgym_env_preview2, load_isaacgym_env_preview4 +from skrl.envs.torch import load_isaacgym_env_preview4 from skrl.utils import set_seed @@ -19,14 +19,14 @@ set_seed(42) -# Define the models (stochastic and deterministic models) for the agent using helper classes. +# Define the models (stochastic and deterministic models) for the agent using mixins. 
# - Policy: takes as input the environment's observation/state and returns an action # - Value: takes the state as input and provides a value to guide the policy -class Policy(GaussianModel): +class Policy(GaussianMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False, clip_log_std=True, min_log_std=-20, max_log_std=2): - super().__init__(observation_space, action_space, device, clip_actions, - clip_log_std, min_log_std, max_log_std) + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) self.net = nn.Sequential(nn.Linear(self.num_observations, 512), nn.ELU(), @@ -39,12 +39,13 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, nn.Linear(128, self.num_actions)) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - def compute(self, states, taken_actions): + def compute(self, states, taken_actions, role): return self.net(states), self.log_std_parameter -class Value(DeterministicModel): +class Value(DeterministicMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False): - super().__init__(observation_space, action_space, device, clip_actions) + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) self.net = nn.Sequential(nn.Linear(self.num_observations, 512), nn.ELU(), @@ -56,18 +57,12 @@ def __init__(self, observation_space, action_space, device, clip_actions=False): nn.ELU(), nn.Linear(128, 1)) - def compute(self, states, taken_actions): + def compute(self, states, taken_actions, role): return self.net(states) -# Load and wrap the Isaac Gym environment. -# The following lines are intended to support all versions (preview 2, 3 and 4). -# It tries to load from preview 3/4, but if it fails, it will try to load from preview 2 -try: - env = load_isaacgym_env_preview4(task_name="ShadowHand") # preview 3 and 4 use the same loader -except Exception as e: - print("Isaac Gym (preview 3/4) failed: {}\nTrying preview 2...".format(e)) - env = load_isaacgym_env_preview2("ShadowHand") +# Load and wrap the Isaac Gym environment +env = load_isaacgym_env_preview4(task_name="ShadowHand") # preview 3 and 4 use the same loader env = wrap_env(env) device = env.device @@ -80,8 +75,9 @@ def compute(self, states, taken_actions): # Instantiate the agent's models (function approximators). 
# PPO requires 2 models, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models -models_ppo = {"policy": Policy(env.observation_space, env.action_space, device), - "value": Value(env.observation_space, env.action_space, device)} +models_ppo = {} +models_ppo["policy"] = Policy(env.observation_space, env.action_space, device) +models_ppo["value"] = Value(env.observation_space, env.action_space, device) # Initialize the models' parameters (weights and biases) using a Gaussian distribution for model in models_ppo.values(): @@ -92,7 +88,7 @@ def compute(self, states, taken_actions): # Only modify some of the default configuration, visit its documentation to see all the options # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters cfg_ppo = PPO_DEFAULT_CONFIG.copy() -cfg_ppo["rollouts"] = 8 +cfg_ppo["rollouts"] = 8 # memory_size cfg_ppo["learning_epochs"] = 5 cfg_ppo["mini_batches"] = 4 # 8 * 16384 / 32768 cfg_ppo["discount_factor"] = 0.99 diff --git a/docs/source/examples/isaacgym/ppo_trifinger.py b/docs/source/examples/isaacgym/ppo_trifinger.py index 404f4a03..898d5bb6 100644 --- a/docs/source/examples/isaacgym/ppo_trifinger.py +++ b/docs/source/examples/isaacgym/ppo_trifinger.py @@ -4,14 +4,14 @@ import torch.nn as nn # Import the skrl components to build the RL system -from skrl.models.torch import GaussianModel, DeterministicModel +from skrl.models.torch import Model, GaussianMixin, DeterministicMixin from skrl.memories.torch import RandomMemory from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG from skrl.resources.schedulers.torch import KLAdaptiveRL from skrl.resources.preprocessors.torch import RunningStandardScaler from skrl.trainers.torch import SequentialTrainer from skrl.envs.torch import wrap_env -from skrl.envs.torch import load_isaacgym_env_preview2, load_isaacgym_env_preview4 +from skrl.envs.torch import load_isaacgym_env_preview4 from skrl.utils import set_seed @@ -19,14 +19,14 @@ set_seed(42) -# Define the models (stochastic and deterministic models) for the agent using helper classes. +# Define the models (stochastic and deterministic models) for the agent using mixins. 
# - Policy: takes as input the environment's observation/state and returns an action # - Value: takes the state as input and provides a value to guide the policy -class Policy(GaussianModel): +class Policy(GaussianMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False, clip_log_std=True, min_log_std=-20, max_log_std=2): - super().__init__(observation_space, action_space, device, clip_actions, - clip_log_std, min_log_std, max_log_std) + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) self.net = nn.Sequential(nn.Linear(self.num_observations, 256), nn.ELU(), @@ -39,12 +39,13 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, nn.Linear(128, self.num_actions)) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - def compute(self, states, taken_actions): + def compute(self, states, taken_actions, role): return self.net(states), self.log_std_parameter -class Value(DeterministicModel): +class Value(DeterministicMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False): - super().__init__(observation_space, action_space, device, clip_actions) + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) self.net = nn.Sequential(nn.Linear(self.num_observations, 256), nn.ELU(), @@ -56,18 +57,12 @@ def __init__(self, observation_space, action_space, device, clip_actions=False): nn.ELU(), nn.Linear(128, 1)) - def compute(self, states, taken_actions): + def compute(self, states, taken_actions, role): return self.net(states) -# Load and wrap the Isaac Gym environment. -# The following lines are intended to support all versions (preview 2, 3 and 4). -# It tries to load from preview 3/4, but if it fails, it will try to load from preview 2 -try: - env = load_isaacgym_env_preview4(task_name="Trifinger") # preview 3 and 4 use the same loader -except Exception as e: - print("Isaac Gym (preview 3/4) failed: {}\nTrying preview 2...".format(e)) - env = load_isaacgym_env_preview2("Trifinger") +# Load and wrap the Isaac Gym environment +env = load_isaacgym_env_preview4(task_name="Trifinger") # preview 3 and 4 use the same loader env = wrap_env(env) device = env.device @@ -80,8 +75,9 @@ def compute(self, states, taken_actions): # Instantiate the agent's models (function approximators). 
# PPO requires 2 models, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models -models_ppo = {"policy": Policy(env.observation_space, env.action_space, device), - "value": Value(env.observation_space, env.action_space, device)} +models_ppo = {} +models_ppo["policy"] = Policy(env.observation_space, env.action_space, device) +models_ppo["value"] = Value(env.observation_space, env.action_space, device) # Initialize the models' parameters (weights and biases) using a Gaussian distribution for model in models_ppo.values(): @@ -92,7 +88,7 @@ def compute(self, states, taken_actions): # Only modify some of the default configuration, visit its documentation to see all the options # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters cfg_ppo = PPO_DEFAULT_CONFIG.copy() -cfg_ppo["rollouts"] = 8 +cfg_ppo["rollouts"] = 8 # memory_size cfg_ppo["learning_epochs"] = 4 cfg_ppo["mini_batches"] = 8 # 8 * 16384 / 16384 cfg_ppo["discount_factor"] = 0.99 diff --git a/docs/source/examples/isaacgym/trpo_cartpole.py b/docs/source/examples/isaacgym/trpo_cartpole.py index 31266a55..8b0a76e7 100644 --- a/docs/source/examples/isaacgym/trpo_cartpole.py +++ b/docs/source/examples/isaacgym/trpo_cartpole.py @@ -4,13 +4,13 @@ import torch.nn as nn # Import the skrl components to build the RL system -from skrl.models.torch import GaussianModel, DeterministicModel +from skrl.models.torch import Model, GaussianMixin, DeterministicMixin from skrl.memories.torch import RandomMemory from skrl.agents.torch.trpo import TRPO, TRPO_DEFAULT_CONFIG from skrl.resources.preprocessors.torch import RunningStandardScaler from skrl.trainers.torch import SequentialTrainer from skrl.envs.torch import wrap_env -from skrl.envs.torch import load_isaacgym_env_preview2, load_isaacgym_env_preview4 +from skrl.envs.torch import load_isaacgym_env_preview4 from skrl.utils import set_seed @@ -18,14 +18,14 @@ set_seed(42) -# Define the models (stochastic and deterministic models) for the agent using helper classes. +# Define the models (stochastic and deterministic models) for the agent using mixins. 
# - Policy: takes as input the environment's observation/state and returns an action # - Value: takes the state as input and provides a value to guide the policy -class Policy(GaussianModel): +class Policy(GaussianMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False, clip_log_std=True, min_log_std=-20, max_log_std=2): - super().__init__(observation_space, action_space, device, clip_actions, - clip_log_std, min_log_std, max_log_std) + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) self.net = nn.Sequential(nn.Linear(self.num_observations, 32), nn.ELU(), @@ -34,12 +34,13 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, nn.Linear(32, self.num_actions)) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - def compute(self, states, taken_actions): + def compute(self, states, taken_actions, role): return self.net(states), self.log_std_parameter -class Value(DeterministicModel): +class Value(DeterministicMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False): - super().__init__(observation_space, action_space, device, clip_actions) + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) self.net = nn.Sequential(nn.Linear(self.num_observations, 32), nn.ELU(), @@ -47,18 +48,12 @@ def __init__(self, observation_space, action_space, device, clip_actions=False): nn.ELU(), nn.Linear(32, 1)) - def compute(self, states, taken_actions): + def compute(self, states, taken_actions, role): return self.net(states) -# Load and wrap the Isaac Gym environment. -# The following lines are intended to support all versions (preview 2, 3 and 4). -# It tries to load from preview 3/4, but if it fails, it will try to load from preview 2 -try: - env = load_isaacgym_env_preview4(task_name="Cartpole") # preview 3 and 4 use the same loader -except Exception as e: - print("Isaac Gym (preview 3/4) failed: {}\nTrying preview 2...".format(e)) - env = load_isaacgym_env_preview2("Cartpole") +# Load and wrap the Isaac Gym environment +env = load_isaacgym_env_preview4(task_name="Cartpole") # preview 3 and 4 use the same loader env = wrap_env(env) device = env.device @@ -71,8 +66,9 @@ def compute(self, states, taken_actions): # Instantiate the agent's models (function approximators). 
# TRPO requires 2 models, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.trpo.html#spaces-and-models -models_trpo = {"policy": Policy(env.observation_space, env.action_space, device), - "value": Value(env.observation_space, env.action_space, device)} +models_trpo = {} +models_trpo["policy"] = Policy(env.observation_space, env.action_space, device) +models_trpo["value"] = Value(env.observation_space, env.action_space, device) # Initialize the models' parameters (weights and biases) using a Gaussian distribution for model in models_trpo.values(): @@ -83,7 +79,7 @@ def compute(self, states, taken_actions): # Only modify some of the default configuration, visit its documentation to see all the options # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.trpo.html#configuration-and-hyperparameters cfg_trpo = TRPO_DEFAULT_CONFIG.copy() -cfg_trpo["rollouts"] = 16 +cfg_trpo["rollouts"] = 16 # memory_size cfg_trpo["learning_epochs"] = 6 cfg_trpo["mini_batches"] = 2 cfg_trpo["grad_norm_clip"] = 0.5 @@ -106,7 +102,7 @@ def compute(self, states, taken_actions): # Configure and instantiate the RL trainer -cfg_trainer = {"timesteps": 2500, "headless": True} +cfg_trainer = {"timesteps": 1600, "headless": True} trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent) # start training From d92c403e40bc79265ef502d8ab9f21a57b0aa8b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Mon, 12 Sep 2022 23:05:58 +0200 Subject: [PATCH 071/108] Set role argument when calling policy get_log_std method --- skrl/agents/torch/trpo/trpo.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/skrl/agents/torch/trpo/trpo.py b/skrl/agents/torch/trpo/trpo.py index e9766f8c..fa9d65d0 100644 --- a/skrl/agents/torch/trpo/trpo.py +++ b/skrl/agents/torch/trpo/trpo.py @@ -432,11 +432,11 @@ def kl_divergence(policy_1: Model, policy_2: Model, states: torch.Tensor) -> tor :rtype: torch.Tensor """ _, _, mu_1 = policy_1.act(states, taken_actions=None, role="policy") - logstd_1 = policy_1.get_log_std() + logstd_1 = policy_1.get_log_std(role="policy") mu_1, logstd_1 = mu_1.detach(), logstd_1.detach() _, _, mu_2 = policy_2.act(states, taken_actions=None, role="policy") - logstd_2 = policy_2.get_log_std() + logstd_2 = policy_2.get_log_std(role="policy") kl = logstd_1 - logstd_2 + 0.5 * (torch.square(logstd_1.exp()) + torch.square(mu_1 - mu_2)) \ / torch.square(logstd_2.exp()) - 0.5 From f9608d3e80061a6f756033a92026b767e533ba55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Mon, 12 Sep 2022 23:30:04 +0200 Subject: [PATCH 072/108] Update CHANGELOG --- CHANGELOG.md | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b48c1b53..bc1a765f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,12 +5,29 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). 
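[Editor's note on the `kl_divergence` hunk in PATCH 071 above] For reference, the textbook closed form for the KL divergence between two diagonal Gaussian policies is

$$\mathrm{KL}\big(\mathcal{N}(\mu_1,\sigma_1^2)\,\|\,\mathcal{N}(\mu_2,\sigma_2^2)\big) = \sum_i \left[\log\frac{\sigma_{2,i}}{\sigma_{1,i}} + \frac{\sigma_{1,i}^2 + (\mu_{1,i}-\mu_{2,i})^2}{2\,\sigma_{2,i}^2} - \frac{1}{2}\right]$$

The patch only adds the `role="policy"` argument to `get_log_std`; the expression computed in that hunk is otherwise unchanged (note that its first term is written as `logstd_1 - logstd_2`, i.e. the opposite orientation of the log-ratio above).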
## [0.8.0] - Unreleased ### Added - AMP agent for physics-based character animation -- Gaussian model - Manual trainer +- Gaussian model mixin +- Support for creating shared models +- Parameter `role` to model methods +- Wrapper compatibility with the new OpenAI Gym environment API (by @JohannLange) +- Internal library colored logger +- Migrate checkpoints/models from other RL libraries to skrl models/agents +- Configuration parameter `store_separately` to agent configuration dict +- Save/load agent modules (models, optimizers, preprocessors) ### Changed -- Multivariate Gaussian model (`GaussianModel` until 0.7.0) to `MultivariateGaussianModel` +- Models implementation as Python mixin [**breaking change**] +- Multivariate Gaussian model (`GaussianModel` until 0.7.0) to `MultivariateGaussianMixin` - Trainer's `cfg` parameter position and default values +- Show training/evaluadion display progress using `tqdm` (by @JohannLange) + +### Fixed +- Missing recursive arguments during model weights initialization +- Tensor dimension when computing preprocessor parallel variance + +### Removed +- Parameter `inference` from model methods +- Configuration parameter `checkpoint_policy_only` from agent configuration dict ## [0.7.0] - 2022-07-11 ### Added From bbcbbbd8f321f99ecbbed03be152316fa367d1e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Mon, 12 Sep 2022 23:33:20 +0200 Subject: [PATCH 073/108] Update Isaac Gym and Omniverse Isaac Gym examples in docs --- .../examples/isaacgym/ppo_ball_balance.py | 7 +- docs/source/intro/examples.rst | 118 +++++++----------- 2 files changed, 51 insertions(+), 74 deletions(-) diff --git a/docs/source/examples/isaacgym/ppo_ball_balance.py b/docs/source/examples/isaacgym/ppo_ball_balance.py index 620af8ce..bacde4d7 100644 --- a/docs/source/examples/isaacgym/ppo_ball_balance.py +++ b/docs/source/examples/isaacgym/ppo_ball_balance.py @@ -71,14 +71,13 @@ def compute(self, states, taken_actions, role): # Instantiate the agent's models (function approximators). # PPO requires 2 models, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models -models_ppo = {"policy": Policy(env.observation_space, env.action_space, device), - "value": Value(env.observation_space, env.action_space, device)} - -# Initialize the models' parameters (weights and biases) using a Gaussian distribution models_ppo = {} models_ppo["policy"] = Policy(env.observation_space, env.action_space, device) models_ppo["value"] = Value(env.observation_space, env.action_space, device) +# Initialize the models' parameters (weights and biases) using a Gaussian distribution +for model in models_ppo.values(): + model.init_parameters(method_name="normal_", mean=0.0, std=0.1) # Configure and instantiate the agent. # Only modify some of the default configuration, visit its documentation to see all the options diff --git a/docs/source/intro/examples.rst b/docs/source/intro/examples.rst index ee9de042..a9abc232 100644 --- a/docs/source/intro/examples.rst +++ b/docs/source/intro/examples.rst @@ -273,10 +273,10 @@ The following components or practices are exemplified (highlighted):
-Learning in an Isaac Gym environment (one agent, multiple environments) ------------------------------------------------------------------------ +Learning in an Isaac Gym environment +------------------------------------ -These examples perform the training of an agent in the `Isaac Gym environments `_. Some scripts try to load the environment from preview 4 (or preview 3), but if they fail, they will try to load the environment from preview 2 +These examples perform the training of an agent in the `Isaac Gym environments `_ (**one agent, multiple environments**) .. image:: ../_static/imgs/example_isaacgym.png :width: 100% @@ -333,129 +333,115 @@ The PPO agent configuration is mapped, as far as possible, from the rl_games' A2 .. tab:: AllegroHand - View the raw code: `ppo_allegro_hand.py `_ + :download:`ppo_allegro_hand.py <../examples/isaacgym/ppo_allegro_hand.py>` .. literalinclude:: ../examples/isaacgym/ppo_allegro_hand.py :language: python - :linenos: - :emphasize-lines: 2, 60-66 + :emphasize-lines: 2, 61-67 .. tab:: Ant - View the raw code: `ppo_ant.py `_ + :download:`ppo_ant.py <../examples/isaacgym/ppo_ant.py>` .. literalinclude:: ../examples/isaacgym/ppo_ant.py :language: python - :linenos: - :emphasize-lines: 13-14, 62-67 + :emphasize-lines: 13-14, 61-62 .. tab:: Anymal - View the raw code: `ppo_anymal.py `_ + :download:`ppo_anymal.py <../examples/isaacgym/ppo_anymal.py>` .. literalinclude:: ../examples/isaacgym/ppo_anymal.py :language: python - :linenos: - :emphasize-lines: 13-14, 62-67 + :emphasize-lines: 13-14, 61-62 .. tab:: AnymalTerrain - View the raw code: `ppo_anymal_terrain.py `_ + :download:`ppo_anymal_terrain.py <../examples/isaacgym/ppo_anymal_terrain.py>` .. literalinclude:: ../examples/isaacgym/ppo_anymal_terrain.py :language: python - :linenos: - :emphasize-lines: 11, 109-112 + :emphasize-lines: 11, 105-108 .. tab:: BallBalance - View the raw code: `ppo_ball_balance.py `_ + :download:`ppo_ball_balance.py <../examples/isaacgym/ppo_ball_balance.py>` .. literalinclude:: ../examples/isaacgym/ppo_ball_balance.py :language: python - :linenos: - :emphasize-lines: 11, 108-111 + :emphasize-lines: 11, 104-107 .. tab:: Cartpole - View the raw code: `ppo_cartpole.py `_ + :download:`ppo_cartpole.py <../examples/isaacgym/ppo_cartpole.py>` .. literalinclude:: ../examples/isaacgym/ppo_cartpole.py :language: python - :linenos: :emphasize-lines: 15, 19 .. tab:: Cartpole (TRPO) - View the raw code: `trpo_cartpole.py `_ + :download:`trpo_cartpole.py <../examples/isaacgym/trpo_cartpole.py>` .. literalinclude:: ../examples/isaacgym/trpo_cartpole.py :language: python - :linenos: :emphasize-lines: 14, 18 .. tab:: FrankaCabinet - View the raw code: `ppo_franka_cabinet.py `_ + :download:`ppo_franka_cabinet.py <../examples/isaacgym/ppo_franka_cabinet.py>` .. literalinclude:: ../examples/isaacgym/ppo_franka_cabinet.py :language: python - :linenos: - :emphasize-lines: 10, 97-98 + :emphasize-lines: 10, 93-94 .. tab:: Humanoid - View the raw code: `ppo_humanoid.py `_ + :download:`ppo_humanoid.py <../examples/isaacgym/ppo_humanoid.py>` .. literalinclude:: ../examples/isaacgym/ppo_humanoid.py :language: python - :linenos: - :emphasize-lines: 10, 97-98 + :emphasize-lines: 10, 93-94 .. tab:: Humanoid (AMP) - View the raw code: `amp_humanoid.py `_ + :download:`amp_humanoid.py <../examples/isaacgym/amp_humanoid.py>` .. literalinclude:: ../examples/isaacgym/amp_humanoid.py :language: python - :linenos: - :emphasize-lines: 86, 120, 131, 134-135 + :emphasize-lines: 89, 124, 135, 138-139 .. 
tab:: Ingenuity - View the raw code: `ppo_ingenuity.py `_ + :download:`ppo_ingenuity.py <../examples/isaacgym/ppo_ingenuity.py>` .. literalinclude:: ../examples/isaacgym/ppo_ingenuity.py :language: python - :linenos: - :emphasize-lines: 2, 60-66 + :emphasize-lines: 2, 61-67 .. tab:: Quadcopter - View the raw code: `ppo_quadcopter.py `_ + :download:`ppo_quadcopter.py <../examples/isaacgym/ppo_quadcopter.py>` .. literalinclude:: ../examples/isaacgym/ppo_quadcopter.py :language: python - :linenos: - :emphasize-lines: 108 + :emphasize-lines: 104 .. tab:: ShadowHand - View the raw code: `ppo_shadow_hand.py `_ + :download:`ppo_shadow_hand.py <../examples/isaacgym/ppo_shadow_hand.py>` .. literalinclude:: ../examples/isaacgym/ppo_shadow_hand.py :language: python - :linenos: - :emphasize-lines: 112 + :emphasize-lines: 108 .. tab:: Trifinger - View the raw code: `ppo_trifinger.py `_ + :download:`ppo_trifinger.py <../examples/isaacgym/ppo_trifinger.py>` .. literalinclude:: ../examples/isaacgym/ppo_trifinger.py :language: python - :linenos: - :emphasize-lines: 112 + :emphasize-lines: 108 .. tab:: Isaac Gym environments (evaluation) @@ -463,7 +449,7 @@ The PPO agent configuration is mapped, as far as possible, from the rl_games' A2 .. tab:: Cartpole - View the raw code: `ppo_cartpole_eval.py `_ + :download:`ppo_cartpole_eval.py <../examples/isaacgym/ppo_cartpole_eval.py>` **Note:** It is necessary to adjust the checkpoint path according to the directories generated by the new experiments @@ -471,8 +457,7 @@ The PPO agent configuration is mapped, as far as possible, from the rl_games' A2 .. literalinclude:: ../examples/isaacgym/ppo_cartpole_eval.py :language: python - :linenos: - :emphasize-lines: 49, 52, 76 + :emphasize-lines: 65 .. raw:: html @@ -589,10 +574,10 @@ The following components or practices are exemplified (highlighted):
-Learning in an Omniverse Isaac Gym environment (one agent, multiple environments) ---------------------------------------------------------------------------------- +Learning in an Omniverse Isaac Gym environment +---------------------------------------------- -These examples perform the training of an agent in the `Omniverse Isaac Gym environments `_ +These examples perform the training of an agent in the `Omniverse Isaac Gym environments `_ (**one agent, multiple environments**) .. image:: ../_static/imgs/example_omniverse_isaacgym.png :width: 100% @@ -643,66 +628,59 @@ The PPO agent configuration is mapped, as far as possible, from the rl_games' A2 .. tab:: AllegroHand - View the raw code: `omniverse\: ppo_allegro_hand.py `_ + :download:`ppo_allegro_hand.py <../examples/omniisaacgym/ppo_allegro_hand.py>` .. literalinclude:: ../examples/omniisaacgym/ppo_allegro_hand.py :language: python - :linenos: - :emphasize-lines: 11-12, 58-59 + :emphasize-lines: 11-12, 59-60 .. tab:: Ant - View the raw code: `omniverse\: ppo_ant.py `_ + :download:`ppo_ant.py <../examples/omniisaacgym/ppo_ant.py>` .. literalinclude:: ../examples/omniisaacgym/ppo_ant.py :language: python - :linenos: - :emphasize-lines: 11-12, 58-59 + :emphasize-lines: 11-12, 59-60 .. tab:: Ant (multi-threaded) - View the raw code: `omniverse\: ppo_ant_mt.py `_ + :download:`ppo_ant_mt.py <../examples/omniisaacgym/ppo_ant_mt.py>` .. literalinclude:: ../examples/omniisaacgym/ppo_ant_mt.py :language: python - :linenos: - :emphasize-lines: 1, 13-14, 60-61, 124, 128 + :emphasize-lines: 1, 13-14, 61-62, 126, 130 .. tab:: Cartpole - View the raw code: `omniverse\: ppo_cartpole.py `_ + :download:`ppo_cartpole.py <../examples/omniisaacgym/ppo_cartpole.py>` .. literalinclude:: ../examples/omniisaacgym/ppo_cartpole.py :language: python - :linenos: - :emphasize-lines: 11-12, 54-55 + :emphasize-lines: 11-12, 55-56 .. tab:: Cartpole (multi-threaded) - View the raw code: `omniverse\: ppo_cartpole_mt.py `_ + :download:`ppo_cartpole_mt.py <../examples/omniisaacgym/ppo_cartpole_mt.py>` .. literalinclude:: ../examples/omniisaacgym/ppo_cartpole_mt.py :language: python - :linenos: - :emphasize-lines: 1, 13-14, 56-57, 120, 124 + :emphasize-lines: 1, 13-14, 57-58, 122, 126 .. tab:: Humanoid - View the raw code: `omniverse\: ppo_humanoid.py `_ + :download:`ppo_humanoid.py <../examples/omniisaacgym/ppo_humanoid.py>` .. literalinclude:: ../examples/omniisaacgym/ppo_humanoid.py :language: python - :linenos: - :emphasize-lines: 11-12, 58-59 + :emphasize-lines: 11-12, 59-60 .. tab:: ShadowHand - View the raw code: `omniverse\: ppo_shadow_hand.py `_ + :download:`ppo_shadow_hand.py <../examples/omniisaacgym/ppo_shadow_hand.py>` .. literalinclude:: ../examples/omniisaacgym/ppo_shadow_hand.py :language: python - :linenos: - :emphasize-lines: 11-12, 62-63 + :emphasize-lines: 11-12, 63-64 .. 
raw:: html From c0fffe33daf1294c904b6320b0dc5ac53e35c11c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Mon, 12 Sep 2022 23:34:54 +0200 Subject: [PATCH 074/108] Update checkpoint section in docs --- docs/source/intro/data.rst | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/docs/source/intro/data.rst b/docs/source/intro/data.rst index 9deef70b..fa018b8f 100644 --- a/docs/source/intro/data.rst +++ b/docs/source/intro/data.rst @@ -23,7 +23,7 @@ Each agent offers the following parameters under the :literal:`"experiment"` key "write_interval": 250, # TensorBoard writing interval (timesteps) "checkpoint_interval": 1000, # interval for checkpoints (timesteps) - "checkpoint_policy_only": True, # checkpoint for policy only + "store_separately": False, # whether to store checkpoints separately } } @@ -126,13 +126,13 @@ Tracking custom metrics/scales ---------------- -Model checkpoint ----------------- +Checkpoints +----------- Saving checkpoints ^^^^^^^^^^^^^^^^^^ -The checkpoints are saved in the :literal:`checkpoints` subdirectory of the experiment's directory (its path can be customized using the options described in the previous subsection). The checkpoint name is the current timestep and the key referring to the model (e.g. :literal:`runs/22-01-09_22-48-49-816281_DDPG/checkpoints/2500_policy.pt`) +The checkpoints are saved in the :literal:`checkpoints` subdirectory of the experiment's directory (its path can be customized using the options described in the previous subsection). The checkpoint name is the key referring to the agent (or models, optimizers and preprocessors) and the current timestep (e.g. :literal:`runs/22-01-09_22-48-49-816281_DDPG/checkpoints/agent_2500.pt`) The checkpoint management, as in the previous case, is the responsibility of the agents (**can be customized independently for each agent using its configuration dictionary**) @@ -148,30 +148,30 @@ The checkpoint management, as in the previous case, is the responsibility of the "write_interval": 250, # TensorBoard writing interval (timesteps) "checkpoint_interval": 1000, # interval for checkpoints (timesteps) - "checkpoint_policy_only": True, # checkpoint for policy only + "store_separately": False, # whether to store checkpoints separately } } * **checkpoint_interval**: interval for checkpoints (default is 1000 timesteps). A value equal to or less than 0 disables the checkpoint creation -* **checkpoint_policy_only**: if set to :literal:`True`, only the policy will be saved (default behaviour), otherwise all the agent's models (policy, value function, critic, .etc) will be checkpointed +* **store_separately**: if set to :literal:`True`, all the modules that an agent contains (models, optimizers, preprocessors, etc.) will be saved each one in a separate file. By default (:literal:`False`) the modules are grouped in a dictionary and stored in the same file **Checkpointing the best models** -The best models, attending the mean total reward, will be saved in the :literal:`checkpoints` subdirectory of the experiment's directory. The checkpoint name is the word :literal:`best` and the key referring to the model (e.g. :literal:`runs/22-01-09_22-48-49-816281_DDPG/checkpoints/best_policy.pt`) +The best models, attending the mean total reward, will be saved in the :literal:`checkpoints` subdirectory of the experiment's directory. The checkpoint name is the word :literal:`best` and the key referring to the model (e.g. 
:literal:`runs/22-01-09_22-48-49-816281_DDPG/checkpoints/best_agent.pt`) -The best models are updated internally on each TensorBoard writing interval :literal:`"write_interval"` and they are saved on each checkpoint interval :literal:`"checkpoint_interval"`. The :literal:`"checkpoint_policy_only"` key specifies whether the best policy or the best models (policy, value function, critic, .etc) will be checkpointed +The best models are updated internally on each TensorBoard writing interval :literal:`"write_interval"` and they are saved on each checkpoint interval :literal:`"checkpoint_interval"`. The :literal:`"store_separately"` key specifies whether the best modules are grouped and stored together or separately Loading checkpoints ^^^^^^^^^^^^^^^^^^^ -Checkpoints can be loaded for each of the instantiated models independently via the :literal:`.load(...)` method (`Model.load <../modules/skrl.models.base_class.html#skrl.models.torch.base.Model.load>`_). It accepts the path (relative or absolute) of the checkpoint to load as the only argument. The checkpoint will be dynamically mapped to the device specified as argument in the class constructor (internally the torch load's :literal:`map_location` method is used during loading) +Checkpoints can be loaded for each of the instantiated agents (or models) independently via the :literal:`.load(...)` method (`Agent.load <../modules/skrl.agents.base_class.html#skrl.agents.torch.base.Agent.load>`_ or `Model.load <../modules/skrl.models.base_class.html#skrl.models.torch.base.Model.load>`_). It accepts the path (relative or absolute) of the checkpoint to load as the only argument. The checkpoint will be dynamically mapped to the device specified as argument in the class constructor (internally the torch load's :literal:`map_location` method is used during loading) .. note:: - The model instance must have the same architecture/structure as the one used to save the checkpoint. The current implementation load the model's `state_dict `_ directly + The agent or model instances must have the same architecture/structure as the ones used to save the checkpoint. The current implementation loads the model's `state_dict `_ directly -The following code shows how to load the checkpoint (e.g. :literal:`runs/22-01-09_22-48-49-816281_DDPG/checkpoints/2500_policy.pt`) of an instantiated policy from a specific definition. See the section :ref:`Examples ` for details about how to load control points and use them to evaluate experiments +The following code shows how to load the checkpoint (e.g. :literal:`runs/22-01-09_22-48-49-816281_DDPG/checkpoints/2500_policy.pt`) of an instantiated policy from a specific definition. See the :ref:`Examples ` section for details on how to load checkpoints and use them to continue training or evaluate experiments .. code-block:: python :emphasize-lines: 21 @@ -198,6 +198,11 @@ The following code shows how to load the checkpoint (e.g. :literal:`runs/22-01-0 # Load the checkpoint policy.load("./runs/22-01-09_22-48-49-816281_DDPG/checkpoints/2500_policy.pt") +Migrating external checkpoints +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +It is possible to load checkpoints generated with external reinforcement learning libraries into skrl agents (or models) via the :literal:`.migrate(...)` method (`Agent.migrate <../modules/skrl.agents.base_class.html#skrl.agents.torch.base.Agent.migrate>`_ or `Model.migrate <../modules/skrl.models.base_class.html#skrl.models.torch.base.Model.migrate>`_). 
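+
+The following snippet is a minimal sketch of this use case, continuing from the previous code block (the checkpoint file name is a placeholder and the :literal:`path` keyword argument is shown only for illustration; refer to the API reference linked above for the exact signature and supported options)
+
+.. code-block:: python
+
+    # continuing from the previous code block: "policy" is an instantiated skrl model
+    # migrate a checkpoint saved by an external library (placeholder file name)
+    policy.migrate(path="./external_checkpoint.pt")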
+ -------------------- Memory export/import From 9fb7cb494ba940debf9026b9a539862dda5b3068 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Tue, 13 Sep 2022 18:25:42 +0200 Subject: [PATCH 075/108] Update DeepMind examples --- .../deepmind/dm_manipulation_stack_sac.py | 32 +++++++++--------- .../dm_suite_cartpole_swingup_ddpg.py | 33 ++++++++++--------- 2 files changed, 35 insertions(+), 30 deletions(-) diff --git a/docs/source/examples/deepmind/dm_manipulation_stack_sac.py b/docs/source/examples/deepmind/dm_manipulation_stack_sac.py index 7890429c..55cc071f 100644 --- a/docs/source/examples/deepmind/dm_manipulation_stack_sac.py +++ b/docs/source/examples/deepmind/dm_manipulation_stack_sac.py @@ -4,21 +4,21 @@ import torch.nn as nn # Import the skrl components to build the RL system -from skrl.models.torch import GaussianModel, DeterministicModel +from skrl.models.torch import Model, GaussianMixin, DeterministicMixin from skrl.memories.torch import RandomMemory from skrl.agents.torch.sac import SAC, SAC_DEFAULT_CONFIG from skrl.trainers.torch import SequentialTrainer from skrl.envs.torch import wrap_env -# Define the models (stochastic and deterministic models) for the SAC agent using the helper classes +# Define the models (stochastic and deterministic models) for the SAC agent using the mixins. # - StochasticActor (policy): takes as input the environment's observation/state and returns an action # - Critic: takes the state and action as input and provides a value to guide the policy -class StochasticActor(GaussianModel): +class StochasticActor(GaussianMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False, clip_log_std=True, min_log_std=-20, max_log_std=2): - super().__init__(observation_space, action_space, device, clip_actions, - clip_log_std, min_log_std, max_log_std) + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) self.features_extractor = nn.Sequential(nn.Conv2d(3, 32, kernel_size=8, stride=3), nn.ReLU(), @@ -40,7 +40,7 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - def compute(self, states, taken_actions): + def compute(self, states, taken_actions, role): # The dm_control.manipulation tasks have as observation/state spec a `collections.OrderedDict` object as follows: # OrderedDict([('front_close', BoundedArray(shape=(1, 84, 84, 3), dtype=dtype('uint8'), name='front_close', minimum=0, maximum=255)), # ('jaco_arm/joints_pos', Array(shape=(1, 6, 2), dtype=dtype('float64'), name='jaco_arm/joints_pos')), @@ -83,9 +83,10 @@ def compute(self, states, taken_actions): input["jaco_arm/joints_pos"].view(states.shape[0], -1), input["jaco_arm/joints_vel"].view(states.shape[0], -1)], dim=-1))), self.log_std_parameter -class Critic(DeterministicModel): - def __init__(self, observation_space, action_space, device, clip_actions = False): - super().__init__(observation_space, action_space, device, clip_actions) +class Critic(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) self.features_extractor = nn.Sequential(nn.Conv2d(3, 32, kernel_size=8, stride=3), nn.ReLU(), @@ -105,7 +106,7 @@ def __init__(self, observation_space, action_space, device, clip_actions 
= False nn.ReLU(), nn.Linear(32, 1)) - def compute(self, states, taken_actions): + def compute(self, states, taken_actions, role): # map the observations/states to the original space. # See the explanation above (StochasticActor.compute) input = self.tensor_to_space(states, self.observation_space) @@ -133,11 +134,12 @@ def compute(self, states, taken_actions): # Instantiate the agent's models (function approximators). # SAC requires 5 models, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.sac.html#spaces-and-models -models_sac = {"policy": StochasticActor(env.observation_space, env.action_space, device, clip_actions=True), - "critic_1": Critic(env.observation_space, env.action_space, device), - "critic_2": Critic(env.observation_space, env.action_space, device), - "target_critic_1": Critic(env.observation_space, env.action_space, device), - "target_critic_2": Critic(env.observation_space, env.action_space, device)} +models_sac = {} +models_sac["policy"] = StochasticActor(env.observation_space, env.action_space, device, clip_actions=True) +models_sac["critic_1"] = Critic(env.observation_space, env.action_space, device) +models_sac["critic_2"] = Critic(env.observation_space, env.action_space, device) +models_sac["target_critic_1"] = Critic(env.observation_space, env.action_space, device) +models_sac["target_critic_2"] = Critic(env.observation_space, env.action_space, device) # Initialize the models' parameters (weights and biases) using a Gaussian distribution for model in models_sac.values(): diff --git a/docs/source/examples/deepmind/dm_suite_cartpole_swingup_ddpg.py b/docs/source/examples/deepmind/dm_suite_cartpole_swingup_ddpg.py index bc3c63db..f0469fb3 100644 --- a/docs/source/examples/deepmind/dm_suite_cartpole_swingup_ddpg.py +++ b/docs/source/examples/deepmind/dm_suite_cartpole_swingup_ddpg.py @@ -5,7 +5,7 @@ import torch.nn.functional as F # Import the skrl components to build the RL system -from skrl.models.torch import DeterministicModel +from skrl.models.torch import Model, DeterministicMixin from skrl.memories.torch import RandomMemory from skrl.agents.torch.ddpg import DDPG, DDPG_DEFAULT_CONFIG from skrl.resources.noises.torch import OrnsteinUhlenbeckNoise @@ -13,26 +13,28 @@ from skrl.envs.torch import wrap_env -# Define the models (deterministic models) for the DDPG agent using a helper class -# and programming with two approaches (layer by layer and torch.nn.Sequential class). +# Define the models (deterministic models) for the DDPG agent using mixins +# and programming with two approaches (torch functional and torch.nn.Sequential class). 
# - Actor (policy): takes as input the environment's observation/state and returns an action # - Critic: takes the state and action as input and provides a value to guide the policy -class DeterministicActor(DeterministicModel): - def __init__(self, observation_space, action_space, device, clip_actions = False): - super().__init__(observation_space, action_space, device, clip_actions) +class DeterministicActor(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) self.linear_layer_1 = nn.Linear(self.num_observations, 400) self.linear_layer_2 = nn.Linear(400, 300) self.action_layer = nn.Linear(300, self.num_actions) - def compute(self, states, taken_actions): + def compute(self, states, taken_actions, role): x = F.relu(self.linear_layer_1(states)) x = F.relu(self.linear_layer_2(x)) return torch.tanh(self.action_layer(x)) -class DeterministicCritic(DeterministicModel): - def __init__(self, observation_space, action_space, device, clip_actions = False): - super().__init__(observation_space, action_space, device, clip_actions) +class DeterministicCritic(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) self.net = nn.Sequential(nn.Linear(self.num_observations + self.num_actions, 400), nn.ReLU(), @@ -40,7 +42,7 @@ def __init__(self, observation_space, action_space, device, clip_actions = False nn.ReLU(), nn.Linear(300, 1)) - def compute(self, states, taken_actions): + def compute(self, states, taken_actions, role): return self.net(torch.cat([states, taken_actions], dim=1)) @@ -58,10 +60,11 @@ def compute(self, states, taken_actions): # Instantiate the agent's models (function approximators). 
# DDPG requires 4 models, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ddpg.html#spaces-and-models -models_ddpg = {"policy": DeterministicActor(env.observation_space, env.action_space, device, clip_actions=True), - "target_policy": DeterministicActor(env.observation_space, env.action_space, device, clip_actions=True), - "critic": DeterministicCritic(env.observation_space, env.action_space, device), - "target_critic": DeterministicCritic(env.observation_space, env.action_space, device)} +models_ddpg = {} +models_ddpg["policy"] = DeterministicActor(env.observation_space, env.action_space, device, clip_actions=True) +models_ddpg["target_policy"] = DeterministicActor(env.observation_space, env.action_space, device, clip_actions=True) +models_ddpg["critic"] = DeterministicCritic(env.observation_space, env.action_space, device) +models_ddpg["target_critic"] = DeterministicCritic(env.observation_space, env.action_space, device) # Initialize the models' parameters (weights and biases) using a Gaussian distribution for model in models_ddpg.values(): From a1fceb43b95367640ded766239290c2c6935ab66 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Tue, 13 Sep 2022 18:52:15 +0200 Subject: [PATCH 076/108] Update Isaac Gym and DeepMind examples in docs --- docs/source/intro/examples.rst | 55 ++++++++++++++-------------------- 1 file changed, 23 insertions(+), 32 deletions(-) diff --git a/docs/source/intro/examples.rst b/docs/source/intro/examples.rst index a9abc232..ca0756ce 100644 --- a/docs/source/intro/examples.rst +++ b/docs/source/intro/examples.rst @@ -222,10 +222,10 @@ The following components or practices are exemplified (highlighted):
-Learning in a DeepMind environment (one agent, one environment) --------------------------------------------------------------- +Learning in a DeepMind environment +---------------------------------- -This example performs the training of one agent in an DeepMind environment +These examples perform the training of one agent in a DeepMind environment (**one agent, one environment**) .. image:: ../_static/imgs/example_deepmind.png :width: 100% @@ -249,12 +249,11 @@ The following components or practices are exemplified (highlighted): .. group-tab:: Training - View the raw code: `dm_suite_cartpole_swingup_ddpg.py `_ + :download:`dm_suite_cartpole_swingup_ddpg.py <../examples/deepmind/dm_suite_cartpole_swingup_ddpg.py>` .. literalinclude:: ../examples/deepmind/dm_suite_cartpole_swingup_ddpg.py :language: python - :linenos: - :emphasize-lines: 1, 13, 48-49, 93 + :emphasize-lines: 1, 13, 50-51 .. tab:: manipulation:reach_site_vision (SAC) .. group-tab:: Training - View the raw code: `dm_manipulation_stack_sac.py `_ + :download:`dm_manipulation_stack_sac.py <../examples/deepmind/dm_manipulation_stack_sac.py>` .. literalinclude:: ../examples/deepmind/dm_manipulation_stack_sac.py :language: python - :linenos: - :emphasize-lines: 67, 80, 83-84, 111, 114, 117-118 + :emphasize-lines: 67, 80, 83-84, 112, 115, 118-119 .. raw:: html @@ -463,10 +461,10 @@ The PPO agent configuration is mapped, as far as possible, from the rl_games' A2
-Learning by scopes in an Isaac Gym environment (multiple agents and environments) ---------------------------------------------------------------------------------- +Learning by scopes in an Isaac Gym environment +---------------------------------------------- -This example performs the training of 3 agents by scopes in Isaac Gym's Cartpole environment in the same run. It tries to load the environment from preview 4 (or preview 3), but if it fails, it will try to load the environment from preview 2 +These examples perform the training of 3 agents by scopes in Isaac Gym's Cartpole environment in the same run (**multiple agents and environments**) .. image:: ../_static/imgs/example_parallel.jpg :width: 100% @@ -502,16 +500,15 @@ The following components or practices are exemplified (highlighted): .. tab:: Sequential training - View the raw code: `isaacgym_sequential_shared_memory.py `_ + :download:`isaacgym_sequential_shared_memory.py <../examples/isaacgym/isaacgym_sequential_shared_memory.py>` .. literalinclude:: ../examples/isaacgym/isaacgym_sequential_shared_memory.py :language: python - :linenos: - :emphasize-lines: 81, 152, 159, 166, 177-178 + :emphasize-lines: 75, 149, 156, 163, 174-175 .. tab:: Sequential evaluation - View the raw code: `isaacgym_sequential_shared_memory_eval.py `_ + :download:`isaacgym_sequential_shared_memory_eval.py <../examples/isaacgym/isaacgym_sequential_shared_memory_eval.py>` **Note:** It is necessary to adjust the checkpoint path according to the directories generated by the new experiments @@ -519,8 +516,7 @@ The following components or practices are exemplified (highlighted): .. literalinclude:: ../examples/isaacgym/isaacgym_sequential_shared_memory_eval.py :language: python - :linenos: - :emphasize-lines: 64, 67, 70, 73-75, 129 + :emphasize-lines: 113-115, 126 .. tab:: No shared memory @@ -528,25 +524,23 @@ The following components or practices are exemplified (highlighted): .. tab:: Sequential training - View the raw code: `isaacgym_sequential_no_shared_memory.py `_ + :download:`isaacgym_sequential_no_shared_memory.py <../examples/isaacgym/isaacgym_sequential_no_shared_memory.py>` .. literalinclude:: ../examples/isaacgym/isaacgym_sequential_no_shared_memory.py :language: python - :linenos: - :emphasize-lines: 81-83, 154, 161, 168, 179-180 + :emphasize-lines: 75-77, 151, 158, 165, 176-177 .. tab:: Parallel training - View the raw code: `isaacgym_parallel_no_shared_memory.py `_ + :download:`isaacgym_parallel_no_shared_memory.py <../examples/isaacgym/isaacgym_parallel_no_shared_memory.py>` .. literalinclude:: ../examples/isaacgym/isaacgym_parallel_no_shared_memory.py :language: python - :linenos: - :emphasize-lines: 14, 67, 179-182 + :emphasize-lines: 13, 67, 176-179 .. tab:: Sequential eval... - View the raw code: `isaacgym_sequential_no_shared_memory_eval.py `_ + :download:`isaacgym_sequential_no_shared_memory_eval.py <../examples/isaacgym/isaacgym_sequential_no_shared_memory_eval.py>` **Note:** It is necessary to adjust the checkpoint path according to the directories generated by the new experiments @@ -554,12 +548,11 @@ The following components or practices are exemplified (highlighted): .. literalinclude:: ../examples/isaacgym/isaacgym_sequential_no_shared_memory_eval.py :language: python - :linenos: - :emphasize-lines: 64, 67, 70, 73-75, 129 + :emphasize-lines: 113-115, 126 .. tab:: Parallel eval... 
- View the raw code: `isaacgym_parallel_no_shared_memory_eval.py `_ + :download:`isaacgym_parallel_no_shared_memory_eval.py <../examples/isaacgym/isaacgym_parallel_no_shared_memory_eval.py>` **Note:** It is necessary to adjust the checkpoint path according to the directories generated by the new experiments @@ -567,8 +560,7 @@ The following components or practices are exemplified (highlighted): .. literalinclude:: ../examples/isaacgym/isaacgym_parallel_no_shared_memory_eval.py :language: python - :linenos: - :emphasize-lines: 85, 88, 91, 94-96, 150 + :emphasize-lines: 115-117, 128 .. raw:: html @@ -799,11 +791,10 @@ This example shows how to use the library utilities to carry out the post-proces Example of a figure, generated by the code, showing the total reward (left) and the mean and standard deviation (right) of all experiments located in the runs folder - View the raw code: `tensorboard_file_iterator.py `_ + :download:`tensorboard_file_iterator.py <../examples/utils/tensorboard_file_iterator.py>` **Note:** The code will load all the Tensorboard files of the experiments located in the :literal:`runs` folder. It is necessary to adjust the iterator's parameters for other paths .. literalinclude:: ../examples/utils/tensorboard_file_iterator.py :language: python - :linenos: :emphasize-lines: 4, 11-13 From 3f4bb9415029d7ec1e640ba60f3b6627e0f9c9b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Tue, 13 Sep 2022 18:56:58 +0200 Subject: [PATCH 077/108] Set max and min clip values dtypes to float32 --- skrl/models/torch/deterministic.py | 4 ++-- skrl/models/torch/gaussian.py | 4 ++-- skrl/models/torch/multivariate_gaussian.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/skrl/models/torch/deterministic.py b/skrl/models/torch/deterministic.py index 72996d96..a90cfaff 100644 --- a/skrl/models/torch/deterministic.py +++ b/skrl/models/torch/deterministic.py @@ -55,8 +55,8 @@ def __init__(self, clip_actions: bool = False, role: str = "") -> None: self._d_clip_actions[role] = clip_actions and issubclass(type(self.action_space), gym.Space) if self._d_clip_actions[role]: - self.clip_actions_min = torch.tensor(self.action_space.low, device=self.device) - self.clip_actions_max = torch.tensor(self.action_space.high, device=self.device) + self.clip_actions_min = torch.tensor(self.action_space.low, device=self.device, dtype=torch.float32) + self.clip_actions_max = torch.tensor(self.action_space.high, device=self.device, dtype=torch.float32) # backward compatibility: torch < 1.9 clamp method does not support tensors self._backward_compatibility = tuple(map(int, (torch.__version__.split(".")[:2]))) < (1, 9) diff --git a/skrl/models/torch/gaussian.py b/skrl/models/torch/gaussian.py index 8e69a343..c56fce15 100644 --- a/skrl/models/torch/gaussian.py +++ b/skrl/models/torch/gaussian.py @@ -76,8 +76,8 @@ def __init__(self, self._g_clip_actions[role] = clip_actions and issubclass(type(self.action_space), gym.Space) if self._g_clip_actions[role]: - self.clip_actions_min = torch.tensor(self.action_space.low, device=self.device) - self.clip_actions_max = torch.tensor(self.action_space.high, device=self.device) + self.clip_actions_min = torch.tensor(self.action_space.low, device=self.device, dtype=torch.float32) + self.clip_actions_max = torch.tensor(self.action_space.high, device=self.device, dtype=torch.float32) # backward compatibility: torch < 1.9 clamp method does not support tensors self._backward_compatibility = tuple(map(int, 
(torch.__version__.split(".")[:2]))) < (1, 9) diff --git a/skrl/models/torch/multivariate_gaussian.py b/skrl/models/torch/multivariate_gaussian.py index b4d6e7b3..b2b85449 100644 --- a/skrl/models/torch/multivariate_gaussian.py +++ b/skrl/models/torch/multivariate_gaussian.py @@ -69,8 +69,8 @@ def __init__(self, self._mg_clip_actions[role] = clip_actions and issubclass(type(self.action_space), gym.Space) if self._mg_clip_actions[role]: - self.clip_actions_min = torch.tensor(self.action_space.low, device=self.device) - self.clip_actions_max = torch.tensor(self.action_space.high, device=self.device) + self.clip_actions_min = torch.tensor(self.action_space.low, device=self.device, dtype=torch.float32) + self.clip_actions_max = torch.tensor(self.action_space.high, device=self.device, dtype=torch.float32) # backward compatibility: torch < 1.9 clamp method does not support tensors self._backward_compatibility = tuple(map(int, (torch.__version__.split(".")[:2]))) < (1, 9) From 5c9f476f799a861ee1b887d5d95af2f9c15387de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Tue, 13 Sep 2022 21:59:16 +0200 Subject: [PATCH 078/108] Update Isaac Sim examples --- .../isaacsim/cartpole_example_skrl.py | 24 +++++++++-------- .../examples/isaacsim/isaacsim_jetbot_ppo.py | 27 ++++++++++--------- 2 files changed, 27 insertions(+), 24 deletions(-) diff --git a/docs/source/examples/isaacsim/cartpole_example_skrl.py b/docs/source/examples/isaacsim/cartpole_example_skrl.py index 5c4a16d0..d31c56e2 100644 --- a/docs/source/examples/isaacsim/cartpole_example_skrl.py +++ b/docs/source/examples/isaacsim/cartpole_example_skrl.py @@ -14,7 +14,7 @@ import torch.nn as nn # Import the skrl components to build the RL system -from skrl.models.torch import GaussianModel, DeterministicModel +from skrl.models.torch import Model, GaussianMixin, DeterministicMixin from skrl.memories.torch import RandomMemory from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG from skrl.resources.schedulers.torch import KLAdaptiveRL @@ -22,14 +22,14 @@ from skrl.envs.torch import wrap_env -# Define the models (stochastic and deterministic models) for the agent using helper classes. +# Define the models (stochastic and deterministic models) for the agent using mixins. 
# - Policy: takes as input the environment's observation/state and returns an action # - Value: takes the state as input and provides a value to guide the policy -class Policy(GaussianModel): +class Policy(GaussianMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False, clip_log_std=True, min_log_std=-20, max_log_std=2): - super().__init__(observation_space, action_space, device, clip_actions, - clip_log_std, min_log_std, max_log_std) + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) self.net = nn.Sequential(nn.Linear(self.num_observations, 64), nn.Tanh(), @@ -38,12 +38,13 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, nn.Linear(64, self.num_actions)) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - def compute(self, states, taken_actions): + def compute(self, states, taken_actions, role): return torch.tanh(self.net(states)), self.log_std_parameter -class Value(DeterministicModel): +class Value(DeterministicMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False): - super().__init__(observation_space, action_space, device, clip_actions) + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) self.net = nn.Sequential(nn.Linear(self.num_observations, 64), nn.Tanh(), @@ -51,7 +52,7 @@ def __init__(self, observation_space, action_space, device, clip_actions=False): nn.Tanh(), nn.Linear(64, 1)) - def compute(self, states, taken_actions): + def compute(self, states, taken_actions, role): return self.net(states) @@ -68,8 +69,9 @@ def compute(self, states, taken_actions): # Instantiate the agent's models (function approximators). # PPO requires 2 models, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models -models_ppo = {"policy": Policy(env.observation_space, env.action_space, device, clip_actions=True), - "value": Value(env.observation_space, env.action_space, device)} +models_ppo = {} +models_ppo["policy"] = Policy(env.observation_space, env.action_space, device) +models_ppo["value"] = Value(env.observation_space, env.action_space, device) # Initialize the models' parameters (weights and biases) using a Gaussian distribution for model in models_ppo.values(): diff --git a/docs/source/examples/isaacsim/isaacsim_jetbot_ppo.py b/docs/source/examples/isaacsim/isaacsim_jetbot_ppo.py index 539d69b9..84790767 100644 --- a/docs/source/examples/isaacsim/isaacsim_jetbot_ppo.py +++ b/docs/source/examples/isaacsim/isaacsim_jetbot_ppo.py @@ -5,22 +5,21 @@ import torch.nn as nn # Import the skrl components to build the RL system -from skrl.models.torch import DeterministicModel, GaussianModel +from skrl.models.torch import Model, GaussianMixin, DeterministicMixin from skrl.memories.torch import RandomMemory from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG from skrl.trainers.torch import SequentialTrainer from skrl.envs.torch import wrap_env -# Define the models (stochastic and deterministic models) for the agent using helper classes -# and programming with two approaches (layer by layer and torch.nn.Sequential class). +# Define the models (stochastic and deterministic models) for the agent using mixins. 
# - Policy: takes as input the environment's observation/state and returns an action # - Value: takes the state as input and provides a value to guide the policy -class Policy(GaussianModel): +class Policy(GaussianMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False, clip_log_std=True, min_log_std=-20, max_log_std=2): - super().__init__(observation_space, action_space, device, clip_actions, - clip_log_std, min_log_std, max_log_std) + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) self.net = nn.Sequential(nn.Conv2d(3, 32, kernel_size=8, stride=4), nn.ReLU(), @@ -40,15 +39,16 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, nn.Linear(32, self.num_actions)) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - def compute(self, states, taken_actions): + def compute(self, states, taken_actions, role): # view (samples, width * height * channels) -> (samples, width, height, channels) # permute (samples, width, height, channels) -> (samples, channels, width, height) x = self.net(states.view(-1, *self.observation_space.shape).permute(0, 3, 1, 2)) return 10 * torch.tanh(x), self.log_std_parameter # JetBotEnv action_space is -10 to 10 -class Value(DeterministicModel): - def __init__(self, observation_space, action_space, device, clip_actions = False): - super().__init__(observation_space, action_space, device, clip_actions) +class Value(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) self.net = nn.Sequential(nn.Conv2d(3, 32, kernel_size=8, stride=4), nn.ReLU(), @@ -67,7 +67,7 @@ def __init__(self, observation_space, action_space, device, clip_actions = False nn.Tanh(), nn.Linear(32, 1)) - def compute(self, states, taken_actions): + def compute(self, states, taken_actions, role): # view (samples, width * height * channels) -> (samples, width, height, channels) # permute (samples, width, height, channels) -> (samples, channels, width, height) return self.net(states.view(-1, *self.observation_space.shape).permute(0, 3, 1, 2)) @@ -87,8 +87,9 @@ def compute(self, states, taken_actions): # Instantiate the agent's models (function approximators). 
# PPO requires 2 models, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models -models_ppo = {"policy": Policy(env.observation_space, env.action_space, device, clip_actions=True), - "value": Value(env.observation_space, env.action_space, device)} +models_ppo = {} +models_ppo["policy"] = Policy(env.observation_space, env.action_space, device) +models_ppo["value"] = Value(env.observation_space, env.action_space, device) # Initialize the models' parameters (weights and biases) using a Gaussian distribution for model in models_ppo.values(): From d1d825522267e0df6ab843c427b8fd3a4c1d8506 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Tue, 13 Sep 2022 22:00:32 +0200 Subject: [PATCH 079/108] Update Isaac Sim examples in docs --- docs/source/intro/examples.rst | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/docs/source/intro/examples.rst b/docs/source/intro/examples.rst index ca0756ce..ea41d7e2 100644 --- a/docs/source/intro/examples.rst +++ b/docs/source/intro/examples.rst @@ -678,14 +678,14 @@ The PPO agent configuration is mapped, as far as possible, from the rl_games' A2
-Learning in an Omniverse Isaac Sim environment (one agent, one environment) ---------------------------------------------------------------------------- +Learning in an Omniverse Isaac Sim environment +---------------------------------------------- -These examples show how to train an agent in an Omniverse Isaac Sim environment that is implemented using the OpenAI Gym interface (one environment) +These examples show how to train an agent in an Omniverse Isaac Sim environment that is implemented using the OpenAI Gym interface (**one agent, one environment**) .. tabs:: - .. tab:: Isaac Sim 2022.1.0 (Cartpole) + .. tab:: Isaac Sim 2022.1.X (Cartpole) This example performs the training of an agent in the Isaac Sim's Cartpole environment described in the `Creating New RL Environment `_ tutorial @@ -703,11 +703,10 @@ These examples show how to train an agent in an Omniverse Isaac Sim environment
- View the raw code: `cartpole_example_skrl.py `_ + :download:`cartpole_example_skrl.py <../examples/isaacsim/cartpole_example_skrl.py>` .. literalinclude:: ../examples/isaacsim/cartpole_example_skrl.py :language: python - :linenos: .. tab:: Isaac Sim 2021.2.1 (JetBot) @@ -763,12 +762,11 @@ These examples show how to train an agent in an Omniverse Isaac Sim environment
- View the raw code: `isaacsim_jetbot_ppo.py `_ + :download:`isaacsim_jetbot_ppo.py <../examples/isaacsim/isaacsim_jetbot_ppo.py>` .. literalinclude:: ../examples/isaacsim/isaacsim_jetbot_ppo.py :language: python - :linenos: - :emphasize-lines: 19-47, 49-73 + :emphasize-lines: 24-39, 45, 53-68, 73 .. _library_utilities: From 1402290300164b4f89632ee7e67a0e6ee4379dd8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Sat, 17 Sep 2022 10:10:42 +0200 Subject: [PATCH 080/108] Update snippets --- docs/source/snippets/agent.py | 55 +++++---- docs/source/snippets/model.py | 43 -------- docs/source/snippets/model_mixin.py | 104 ++++++++++++++++++ .../snippets/multivariate_gaussian_model.py | 9 -- docs/source/snippets/tabular_model.py | 26 +---- 5 files changed, 141 insertions(+), 96 deletions(-) delete mode 100644 docs/source/snippets/model.py create mode 100644 docs/source/snippets/model_mixin.py diff --git a/docs/source/snippets/agent.py b/docs/source/snippets/agent.py index 7b23b079..1f27f789 100644 --- a/docs/source/snippets/agent.py +++ b/docs/source/snippets/agent.py @@ -1,4 +1,4 @@ -from typing import Union, Tuple, Dict +from typing import Union, Tuple, Dict, Any import gym @@ -17,7 +17,7 @@ "write_interval": 250, # TensorBoard writing interval (timesteps) "checkpoint_interval": 1000, # interval for checkpoints (timesteps) - "checkpoint_policy_only": True, # checkpoint for policy only + "store_separately": False, # whether to store checkpoints separately } } @@ -51,19 +51,25 @@ def __init__(self, action_space=action_space, device=device, cfg=CUSTOM_DEFAULT_CONFIG) - # ================================ + # ===================================================================== # - get and process models from self.models - # - create self.checkpoint_models dictionary for storing checkpoints + # - populate self.checkpoint_modules dictionary for storing checkpoints # - parse configurations from self.cfg - # - setup optimizers + # - setup optimizers and learning rate scheduler + # - set up preprocessors + # ===================================================================== + + def init(self) -> None: + """Initialize the agent + """ + super().init() + self.set_mode("eval") + # ================================================================= # - create tensors in memory if required - # ================================ + # - # create temporary variables needed for storage and computation + # ================================================================= - def act(self, - states: torch.Tensor, - timestep: int, - timesteps: int, - inference: bool = False) -> torch.Tensor: + def act(self, states: torch.Tensor, timestep: int, timesteps: int) -> torch.Tensor: """Process the environment's states to make a decision (actions) using the main policy :param states: Environment's states @@ -78,17 +84,18 @@ def act(self, :return: Actions :rtype: torch.Tensor """ - # ================================ + # ====================================== # - sample random actions if required or # sample and return agent's actions - # ================================ + # ====================================== def record_transition(self, states: torch.Tensor, actions: torch.Tensor, rewards: torch.Tensor, next_states: torch.Tensor, - dones: torch.Tensor, + dones: torch.Tensor, + infos: Any, timestep: int, timesteps: int) -> None: """Record an environment transition in memory @@ -103,15 +110,17 @@ def record_transition(self, :type next_states: torch.Tensor :param dones: Signals to indicate that 
episodes have ended :type dones: torch.Tensor + :param infos: Additional information about the environment + :type infos: Any type supported by the environment :param timestep: Current timestep :type timestep: int :param timesteps: Number of timesteps :type timesteps: int """ - super().record_transition(states, actions, rewards, next_states, dones, timestep, timesteps) - # ================================ + super().record_transition(states, actions, rewards, next_states, dones, infos, timestep, timesteps) + # ======================================== # - record agent's specific data in memory - # ================================ + # ======================================== def pre_interaction(self, timestep: int, timesteps: int) -> None: """Callback called before the interaction with the environment @@ -121,9 +130,9 @@ def pre_interaction(self, timestep: int, timesteps: int) -> None: :param timesteps: Number of timesteps :type timesteps: int """ - # ================================ + # =================================== # - call self.update(...) if required - # ================================ + # =================================== def post_interaction(self, timestep: int, timesteps: int) -> None: """Callback called after the interaction with the environment @@ -133,9 +142,9 @@ def post_interaction(self, timestep: int, timesteps: int) -> None: :param timesteps: Number of timesteps :type timesteps: int """ - # ================================ + # =================================== # - call self.update(...) if required - # ================================ + # =================================== # call parent's method for checkpointing and TensorBoard writing super().post_interaction(timestep, timesteps) @@ -147,7 +156,7 @@ def _update(self, timestep: int, timesteps: int) -> None: :param timesteps: Number of timesteps :type timesteps: int """ - # ================================ + # ================================================= # - implement algorithm's update step # - record tracking data using self.track_data(...) - # ================================ + # ================================================= diff --git a/docs/source/snippets/model.py b/docs/source/snippets/model.py deleted file mode 100644 index e17e8b71..00000000 --- a/docs/source/snippets/model.py +++ /dev/null @@ -1,43 +0,0 @@ -from typing import Union, Tuple - -import gym - -import torch - -from skrl.models.torch import Model # from . import Model - - -class CustomModel(Model): - def __init__(self, observation_space: Union[int, Tuple[int], gym.Space, None] = None, action_space: Union[int, Tuple[int], gym.Space, None] = None, device: Union[str, torch.device] = "cuda:0") -> None: - """ - :param observation_space: Observation/state space or shape (default: None). - If it is not None, the num_observations property will contain the size of that space (number of elements) - :type observation_space: int, tuple or list of integers, gym.Space or None, optional - :param action_space: Action space or shape (default: None). 
- If it is not None, the num_actions property will contain the size of that space (number of elements) - :type action_space: int, tuple or list of integers, gym.Space or None, optional - :param device: Device on which a torch tensor is or will be allocated (default: "cuda:0") - :type device: str or torch.device, optional - """ - super().__init__(observation_space, action_space, device) - - def act(self, states: torch.Tensor, taken_actions: Union[torch.Tensor, None] = None, inference=False) -> Tuple[torch.Tensor]: - """Act in response to the state of the environment - - :param states: Observation/state of the environment used to make the decision - :type states: torch.Tensor - :param taken_actions: Actions taken by a policy to the given states (default: None). - The use of these actions only makes sense in critical models, e.g. - :type taken_actions: torch.Tensor or None, optional - :param inference: Flag to indicate whether the model is making inference (default: False). - If True, the returned tensors will be detached from the current graph - :type inference: bool, optional - - :return: Action to be taken by the agent given the state of the environment. - The tuple's components are the actions, the log of the probability density function and mean actions - :rtype: tuple of torch.Tensor - """ - # ================================ - # - act in response to the state - # ================================ - \ No newline at end of file diff --git a/docs/source/snippets/model_mixin.py b/docs/source/snippets/model_mixin.py new file mode 100644 index 00000000..85c0af9e --- /dev/null +++ b/docs/source/snippets/model_mixin.py @@ -0,0 +1,104 @@ +# [start-model] +from typing import Optional, Union, Sequence + +import gym + +import torch + +from skrl.models.torch import Model # from . import Model + + +class CustomModel(Model): + def __init__(self, + observation_space: Union[int, Sequence[int], gym.Space], + action_space: Union[int, Sequence[int], gym.Space], + device: Union[str, torch.device] = "cuda:0") -> None: + """ + :param observation_space: Observation/state space or shape. + The ``num_observations`` property will contain the size of that space + :type observation_space: int, sequence of int, gym.Space + :param action_space: Action space or shape. + The ``num_actions`` property will contain the size of that space + :type action_space: int, sequence of int, gym.Space + :param device: Device on which a torch tensor is or will be allocated (default: ``"cuda:0"``) + :type device: str or torch.device, optional + """ + super().__init__(observation_space, action_space, device) + + def act(self, + states: torch.Tensor, + taken_actions: Optional[torch.Tensor] = None, + role: str = "") -> Sequence[torch.Tensor]: + """Act according to the specified behavior + + :param states: Observation/state of the environment used to make the decision + :type states: torch.Tensor + :param taken_actions: Actions taken by a policy to the given states (default: ``None``). + The use of these actions only makes sense in critical models, e.g. + :type taken_actions: torch.Tensor, optional + :param role: Role play by the model (default: ``""``) + :type role: str, optional + + :raises NotImplementedError: Child class must implement this method + + :return: Action to be taken by the agent given the state of the environment. + The typical sequence's components are the actions, the log of the probability density function and mean actions. 
+ Deterministic agents must ignore the last two components and return empty tensors or None for them + :rtype: sequence of torch.Tensor + """ + # ============================== + # - act in response to the state + # ============================== +# [end-model] + +# ============================================================================= + +# [start-mixin] +from typing import Optional, Sequence + +import gym + +import torch + + +class CustomMixin: + def __init__(self, clip_actions: bool = False, role: str = "") -> None: + """ + :param clip_actions: Flag to indicate whether the actions should be clipped to the action space (default: ``False``) + :type clip_actions: bool, optional + :param role: Role play by the model (default: ``""``) + :type role: str, optional + """ + # e.g. storage custom parameter + if not hasattr(self, "_custom_clip_actions"): + self._custom_clip_actions = {} + self._custom_clip_actions[role] + + def act(self, + states: torch.Tensor, + taken_actions: Optional[torch.Tensor] = None, + role: str = "") -> Sequence[torch.Tensor]: + """Act according to the specified behavior + + :param states: Observation/state of the environment used to make the decision + :type states: torch.Tensor + :param taken_actions: Actions taken by a policy to the given states (default: ``None``). + The use of these actions only makes sense in critical models, e.g. + :type taken_actions: torch.Tensor, optional + :param role: Role play by the model (default: ``""``) + :type role: str, optional + + :raises NotImplementedError: Child class must implement this method + + :return: Action to be taken by the agent given the state of the environment. + The typical sequence's components are the actions, the log of the probability density function and mean actions. + Deterministic agents must ignore the last two components and return empty tensors or None for them + :rtype: sequence of torch.Tensor + """ + # ============================== + # - act in response to the state + # ============================== + + # e.g. 
retrieve clip actions according to role + clip_actions = self._custom_clip_actions[role] if role in self._custom_clip_actions else self._custom_clip_actions[""] +# [end-mixin] \ No newline at end of file diff --git a/docs/source/snippets/multivariate_gaussian_model.py b/docs/source/snippets/multivariate_gaussian_model.py index d384271c..e53f7fe1 100644 --- a/docs/source/snippets/multivariate_gaussian_model.py +++ b/docs/source/snippets/multivariate_gaussian_model.py @@ -1,12 +1,3 @@ -import gym - -class DummyEnv: - observation_space = gym.spaces.Box(low=-1, high=1, shape=(5,)) - action_space = gym.spaces.Box(low=-1, high=1, shape=(3,)) - device = "cuda:0" - -env = DummyEnv() - # [start-mlp] import torch import torch.nn as nn diff --git a/docs/source/snippets/tabular_model.py b/docs/source/snippets/tabular_model.py index f3355daa..4b94c69c 100644 --- a/docs/source/snippets/tabular_model.py +++ b/docs/source/snippets/tabular_model.py @@ -1,30 +1,19 @@ -import gym - -class DummyEnv: - observation_space = gym.spaces.Discrete(4) - action_space = gym.spaces.Discrete(3) - device = "cuda:0" - num_envs = 2 - -env = DummyEnv() - # [start-epsilon-greedy] import torch -from skrl.models.torch import TabularModel +from skrl.models.torch import Model, TabularMixin # define the model -class EpilonGreedyPolicy(TabularModel): +class EpilonGreedyPolicy(TabularMixin, Model): def __init__(self, observation_space, action_space, device, num_envs=1, epsilon=0.1): - super().__init__(observation_space, action_space, device, num_envs) + Model.__init__(self, observation_space, action_space, device) + TabularMixin.__init__(self, num_envs) self.epsilon = epsilon self.q_table = torch.ones((num_envs, self.num_observations, self.num_actions), dtype=torch.float32) - - self.tables["q_table"] = self.q_table - def compute(self, states, taken_actions): + def compute(self, states, taken_actions, role): actions = torch.argmax(self.q_table[torch.arange(self.num_envs).view(-1, 1), states], dim=-1, keepdim=True).view(-1,1) @@ -41,8 +30,3 @@ def compute(self, states, taken_actions): num_envs=env.num_envs, epsilon=0.15) # [end-epsilon-greedy] - -import torch -policy.to(env.device) -actions = policy.act(torch.tensor([[0, 1, 2, 3]], device=env.device)) -assert actions[0].shape == torch.Size([10, env.action_space.shape[0]]) From ee9f8ca5931f74b962e84bc05e49a956f9942695 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Sat, 17 Sep 2022 10:14:20 +0200 Subject: [PATCH 081/108] Purge some docs --- .../source/modules/skrl.agents.base_class.rst | 5 +-- .../modules/skrl.memories.base_class.rst | 2 -- .../source/modules/skrl.models.base_class.rst | 22 ++++++++----- docs/source/modules/skrl.resources.noises.rst | 2 -- .../skrl.utils.omniverse_isaacgym_utils.rst | 33 +++++++++++++++++++ 5 files changed, 48 insertions(+), 16 deletions(-) create mode 100644 docs/source/modules/skrl.utils.omniverse_isaacgym_utils.rst diff --git a/docs/source/modules/skrl.agents.base_class.rst b/docs/source/modules/skrl.agents.base_class.rst index e7ad5637..4fea0caf 100644 --- a/docs/source/modules/skrl.agents.base_class.rst +++ b/docs/source/modules/skrl.agents.base_class.rst @@ -14,11 +14,8 @@ Basic inheritance usage .. tab:: Inheritance - View the raw code `here `_ - .. literalinclude:: ../snippets/agent.py :language: python - :linenos: API ^^^ @@ -27,7 +24,7 @@ API :undoc-members: :show-inheritance: :inherited-members: - :private-members: _update + :private-members: _update, _empty_preprocessor, _get_internal_value :members: .. 
automethod:: __init__ diff --git a/docs/source/modules/skrl.memories.base_class.rst b/docs/source/modules/skrl.memories.base_class.rst index a40f9afb..101e98dc 100644 --- a/docs/source/modules/skrl.memories.base_class.rst +++ b/docs/source/modules/skrl.memories.base_class.rst @@ -14,8 +14,6 @@ Basic inheritance usage .. tab:: Inheritance - View the raw code `here `_ - .. literalinclude:: ../snippets/memory.py :language: python :linenos: diff --git a/docs/source/modules/skrl.models.base_class.rst b/docs/source/modules/skrl.models.base_class.rst index 5ed11e5b..221221f3 100644 --- a/docs/source/modules/skrl.models.base_class.rst +++ b/docs/source/modules/skrl.models.base_class.rst @@ -7,18 +7,24 @@ Base class It provides the basic functionality for the other classes. **It is not intended to be used directly**. -Basic inheritance usage -^^^^^^^^^^^^^^^^^^^^^^^ +Mixin and inheritance +^^^^^^^^^^^^^^^^^^^^^ .. tabs:: - - .. tab:: Inheritance - View the raw code `here `_ + .. tab:: Mixin - .. literalinclude:: ../snippets/model.py + .. literalinclude:: ../snippets/model_mixin.py :language: python - :linenos: + :start-after: [start-mixin] + :end-before: [end-mixin] + + .. tab:: Model inheritance + + .. literalinclude:: ../snippets/model_mixin.py + :language: python + :start-after: [start-model] + :end-before: [end-model] API ^^^ @@ -26,7 +32,7 @@ API .. autoclass:: skrl.models.torch.base.Model :undoc-members: :show-inheritance: - :private-members: _get_space_size, _get_instantiator_output + :private-members: _get_space_size :members: .. automethod:: __init__ diff --git a/docs/source/modules/skrl.resources.noises.rst b/docs/source/modules/skrl.resources.noises.rst index a9450eb5..a988f0fe 100644 --- a/docs/source/modules/skrl.resources.noises.rst +++ b/docs/source/modules/skrl.resources.noises.rst @@ -117,8 +117,6 @@ Basic inheritance usage .. tab:: Inheritance - View the raw code `here `_ - .. literalinclude:: ../snippets/noise.py :language: python :linenos: diff --git a/docs/source/modules/skrl.utils.omniverse_isaacgym_utils.rst b/docs/source/modules/skrl.utils.omniverse_isaacgym_utils.rst new file mode 100644 index 00000000..806ed6d8 --- /dev/null +++ b/docs/source/modules/skrl.utils.omniverse_isaacgym_utils.rst @@ -0,0 +1,33 @@ +Omniverse Isaac Gym utils +========================= + +.. contents:: Table of Contents + :depth: 2 + :local: + :backlinks: none + +.. raw:: html + +
+ +Control of robotic manipulators +------------------------------- + +Inverse kinematics using damped least squares method +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +This implementation attempts to unify under a single and reusable function the whole set of procedures used to calculate the inverse kinematics of a robotic manipulator shown originally in Isaac Gym's example: Franka IK Picking (:literal:`franka_cube_ik_osc.py`) but this time for Omniverse Isaac Gym + +:math:`\Delta\theta = J^T (JJ^T + \lambda^2 I)^{-1} \, \vec{e}` + +where + +| :math:`\qquad \Delta\theta \;` is the change in joint angles +| :math:`\qquad J \;` is the Jacobian +| :math:`\qquad \lambda \;` is a non-zero damping constant +| :math:`\qquad \vec{e} \;` is the Cartesian pose error (position and orientation) + +API +""" + +.. autofunction:: skrl.utils.omniverse_isaacgym_utils.ik From 2c7099cd99df0a1aa7771e83c8c541304d0ff752 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Mon, 19 Sep 2022 14:37:08 +0200 Subject: [PATCH 082/108] Add Franka Emika example files --- .../reaching_franka_real_env.py | 276 ++++++++++++++++++ .../reaching_franka_real_skrl_eval.py | 88 ++++++ .../reaching_franka_sim_env.py | 262 +++++++++++++++++ .../reaching_franka_sim_skrl_eval.py | 92 ++++++ .../reaching_franka_sim_skrl_train.py | 133 +++++++++ 5 files changed, 851 insertions(+) create mode 100644 docs/source/examples/real_world/franka_emika_panda/reaching_franka_real_env.py create mode 100644 docs/source/examples/real_world/franka_emika_panda/reaching_franka_real_skrl_eval.py create mode 100644 docs/source/examples/real_world/franka_emika_panda/reaching_franka_sim_env.py create mode 100644 docs/source/examples/real_world/franka_emika_panda/reaching_franka_sim_skrl_eval.py create mode 100644 docs/source/examples/real_world/franka_emika_panda/reaching_franka_sim_skrl_train.py diff --git a/docs/source/examples/real_world/franka_emika_panda/reaching_franka_real_env.py b/docs/source/examples/real_world/franka_emika_panda/reaching_franka_real_env.py new file mode 100644 index 00000000..d67cfb0b --- /dev/null +++ b/docs/source/examples/real_world/franka_emika_panda/reaching_franka_real_env.py @@ -0,0 +1,276 @@ +import gym +import time +import threading +import numpy as np +from packaging import version + +import frankx + + +class ReachingFranka(gym.Env): + def __init__(self, robot_ip="172.16.0.2", device="cuda:0", control_space="joint", motion_type="waypoint", camera_tracking=False): + # gym API + self._drepecated_api = version.parse(gym.__version__) < version.parse(" 0.25.0") + + self.device = device + self.control_space = control_space # joint or cartesian + self.motion_type = motion_type # waypoint or impedance + + if self.control_space == "cartesian" and self.motion_type == "impedance": + raise ValueError("Unsafe robot operation in cartesian/impedance configuration") + + # camera tracking (disabled by default) + self.camera_tracking = camera_tracking + if self.camera_tracking: + threading.Thread(target=self._update_target_from_camera).start() + + # spaces + self.observation_space = gym.spaces.Box(low=-1000, high=1000, shape=(18,), dtype=np.float32) + if self.control_space == "joint": + self.action_space = gym.spaces.Box(low=-1, high=1, shape=(7,), dtype=np.float32) + elif self.control_space == "cartesian": + self.action_space = gym.spaces.Box(low=-1, high=1, shape=(3,), dtype=np.float32) + else: + raise ValueError("Invalid control space:", self.control_space) + + # init real franka + 
print("Connecting to robot at {}...".format(robot_ip)) + self.robot = frankx.Robot(robot_ip) + self.robot.set_default_behavior() + self.robot.recover_from_errors() + + # the robot's response can be better managed by independently setting the following properties, for example: + # - self.robot.velocity_rel = 0.2 + # - self.robot.acceleration_rel = 0.1 + # - self.robot.jerk_rel = 0.01 + self.robot.set_dynamic_rel(0.25) + + self.gripper = self.robot.get_gripper() + print("Robot connected") + + self.motion = None + self.motion_thread = None + + self.dt = 1 / 120.0 + self.action_scale = 2.5 + self.dof_vel_scale = 0.1 + self.max_episode_length = 100 + self.robot_dof_speed_scales = 1 + self.target_pos = np.array([0.65, 0.2, 0.2]) + self.robot_default_dof_pos = np.radians([0, -45, 0, -135, 0, 90, 45]) + self.robot_dof_lower_limits = np.array([-2.8973, -1.7628, -2.8973, -3.0718, -2.8973, -0.0175, -2.8973]) + self.robot_dof_upper_limits = np.array([ 2.8973, 1.7628, 2.8973, -0.0698, 2.8973, 3.7525, 2.8973]) + + self.progress_buf = 1 + self.obs_buf = np.zeros((18,), dtype=np.float32) + + def _update_target_from_camera(self): + pixel_to_meter = 1.11 / 375 # m/px: adjust for custom cases + + import cv2 + cap = cv2.VideoCapture(0) + while cap.isOpened(): + ret, frame = cap.read() + if not ret: + break + + # convert to HSV and remove noise + hsv = cv2.cvtColor(frame, cv2.COLOR_BGR2HSV) + hsv = cv2.medianBlur(hsv, 15) + + # color matching in HSV + mask = cv2.inRange(hsv, np.array([80, 100, 100]), np.array([100, 255, 255])) + M = cv2.moments(mask) + if M["m00"]: + x = M["m10"] / M["m00"] + y = M["m01"] / M["m00"] + + # real-world position (fixed z to 0.2 meters) + pos = np.array([pixel_to_meter * (y - 185), pixel_to_meter * (x - 320), 0.2]) + if self is not None: + self.target_pos = pos + + # draw target + frame = cv2.circle(frame, (int(x), int(y)), 30, (0,0,255), 2) + frame = cv2.putText(frame, str(np.round(pos, 4).tolist()), (10, 20), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0,0,255), 1, cv2.LINE_AA) + + # show images + cv2.imshow("frame", frame) + cv2.imshow("mask", mask) + k = cv2.waitKey(1) & 0xFF + if k == ord('q'): + cap.release() + + def _get_observation_reward_done(self): + # get robot state + try: + robot_state = self.robot.get_state(read_once=True) + except frankx.InvalidOperationException: + robot_state = self.robot.get_state(read_once=False) + + # observation + robot_dof_pos = np.array(robot_state.q) + robot_dof_vel = np.array(robot_state.dq) + end_effector_pos = np.array(robot_state.O_T_EE[-4:-1]) + + dof_pos_scaled = 2.0 * (robot_dof_pos - self.robot_dof_lower_limits) / (self.robot_dof_upper_limits - self.robot_dof_lower_limits) - 1.0 + dof_vel_scaled = robot_dof_vel * self.dof_vel_scale + + self.obs_buf[0] = self.progress_buf / float(self.max_episode_length) + self.obs_buf[1:8] = dof_pos_scaled + self.obs_buf[8:15] = dof_vel_scaled + self.obs_buf[15:18] = self.target_pos + + # reward + distance = np.linalg.norm(end_effector_pos - self.target_pos) + reward = -distance + + # done + done = self.progress_buf >= self.max_episode_length - 1 + done = done or distance <= 0.075 + + print("Distance:", distance) + if done: + print("Target or Maximum episode length reached") + time.sleep(1) + + return self.obs_buf, reward, done + + def reset(self): + print("Reseting...") + + # end current motion + if self.motion is not None: + self.motion.finish() + self.motion_thread.join() + self.motion = None + self.motion_thread = None + + # open/close gripper + # self.gripper.open() + # self.gripper.clamp() + + # go to 1) 
safe position, 2) random position + self.robot.move(frankx.JointMotion(self.robot_default_dof_pos.tolist())) + dof_pos = self.robot_default_dof_pos + 0.25 * (np.random.rand(7) - 0.5) + self.robot.move(frankx.JointMotion(dof_pos.tolist())) + + # get target position from prompt + if not self.camera_tracking: + while True: + try: + print("Enter target position (X, Y, Z) in meters") + raw = input("or press [Enter] key for a random target position: ") + if raw: + self.target_pos = np.array([float(p) for p in raw.replace(' ', '').split(',')]) + else: + noise = (2 * np.random.rand(3) - 1) * np.array([0.25, 0.25, 0.10]) + self.target_pos = np.array([0.5, 0.0, 0.2]) + noise + print("Target position:", self.target_pos) + break + except ValueError: + print("Invalid input. Try something like: 0.65, 0.0, 0.2") + + # initial pose + affine = frankx.Affine(frankx.Kinematics.forward(dof_pos.tolist())) + affine = affine * frankx.Affine(x=0, y=0, z=-0.10335, a=np.pi/2) + + # motion type + if self.motion_type == "waypoint": + self.motion = frankx.WaypointMotion([frankx.Waypoint(affine)], return_when_finished=False) + elif self.motion_type == "impedance": + self.motion = frankx.ImpedanceMotion(500, 50) + else: + raise ValueError("Invalid motion type:", self.motion_type) + + self.motion_thread = self.robot.move_async(self.motion) + if self.motion_type == "impedance": + self.motion.target = affine + + input("Press [Enter] to continue") + + self.progress_buf = 0 + observation, reward, done = self._get_observation_reward_done() + + if self._drepecated_api: + return observation + else: + return observation, {} + + def step(self, action): + self.progress_buf += 1 + + # control space + # joint + if self.control_space == "joint": + # get robot state + try: + robot_state = self.robot.get_state(read_once=True) + except frankx.InvalidOperationException: + robot_state = self.robot.get_state(read_once=False) + # forward kinematics + dof_pos = np.array(robot_state.q) + (self.robot_dof_speed_scales * self.dt * action * self.action_scale) + affine = frankx.Affine(self.robot.forward_kinematics(dof_pos.flatten().tolist())) + affine = affine * frankx.Affine(x=0, y=0, z=-0.10335, a=np.pi/2) + # cartesian + elif self.control_space == "cartesian": + action /= 100.0 + if self.motion_type == "waypoint": + affine = frankx.Affine(x=action[0], y=action[1], z=action[2]) + elif self.motion_type == "impedance": + # get robot pose + try: + robot_pose = self.robot.current_pose(read_once=True) + except frankx.InvalidOperationException: + robot_pose = self.robot.current_pose(read_once=False) + affine = robot_pose * frankx.Affine(x=action[0], y=action[1], z=action[2]) + + # motion type + # waypoint motion + if self.motion_type == "waypoint": + if self.control_space == "joint": + self.motion.set_next_waypoint(frankx.Waypoint(affine)) + elif self.control_space == "cartesian": + self.motion.set_next_waypoint(frankx.Waypoint(affine, frankx.Waypoint.Relative)) + # impedance motion + elif self.motion_type == "impedance": + self.motion.target = affine + else: + raise ValueError("Invalid motion type:", self.motion_type) + + # the use of time.sleep is for simplicity. 
This does not guarantee control at a specific frequency + time.sleep(0.1) # lower frequency, at 30Hz there are discontinuities + + observation, reward, done = self._get_observation_reward_done() + + if self._drepecated_api: + return observation, reward, done, {} + else: + return observation, reward, done, done, {} + + def render(self, *args, **kwargs): + pass + + def close(self): + pass + + + + +if __name__ == "__main__": + + # test camera capturing + ReachingFranka._update_target_from_camera(None) + exit() + + + env = ReachingFranka() + + observation = env.reset() + for _ in range(100): + observation, reward, done, info = env.step(env.action_space.sample()) + env.render() + if done: + observation = env.reset() + + env.close() diff --git a/docs/source/examples/real_world/franka_emika_panda/reaching_franka_real_skrl_eval.py b/docs/source/examples/real_world/franka_emika_panda/reaching_franka_real_skrl_eval.py new file mode 100644 index 00000000..0d64487e --- /dev/null +++ b/docs/source/examples/real_world/franka_emika_panda/reaching_franka_real_skrl_eval.py @@ -0,0 +1,88 @@ +import torch +import torch.nn as nn + +# Import the skrl components to build the RL system +from skrl.models.torch import Model, GaussianMixin +from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG +from skrl.resources.preprocessors.torch import RunningStandardScaler +from skrl.trainers.torch import SequentialTrainer +from skrl.envs.torch import wrap_env + + +# Define only the policy for evaluation +class Policy(GaussianMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + clip_log_std=True, min_log_std=-20, max_log_std=2): + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) + + self.net = nn.Sequential(nn.Linear(self.num_observations, 256), + nn.ELU(), + nn.Linear(256, 128), + nn.ELU(), + nn.Linear(128, 64), + nn.ELU(), + nn.Linear(64, self.num_actions)) + self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + def compute(self, states, taken_actions, role): + return self.net(states), self.log_std_parameter + + +# Load the environment +from reaching_franka_real_env import ReachingFranka + +control_space = "joint" # joint or cartesian +motion_type = "waypoint" # waypoint or impedance +camera_tracking = False # True for USB-camera tracking + +env = ReachingFranka(robot_ip="172.16.0.2", + device="cpu", + control_space=control_space, + motion_type=motion_type, + camera_tracking=camera_tracking) + +# wrap the environment +env = wrap_env(env) + +device = env.device + + +# Instantiate the agent's policy. +# PPO requires 2 models, visit its documentation for more details +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models +models_ppo = {} +models_ppo["policy"] = Policy(env.observation_space, env.action_space, device) + +# Configure and instantiate the agent. 
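+# Note: for evaluation the agent runs without a rollout memory (no training takes place) and
+# keeps the same RunningStandardScaler state preprocessor used during training, so that the
+# observations fed to the loaded checkpoint are normalized consistently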
+# Only modify some of the default configuration, visit its documentation to see all the options +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters +cfg_ppo = PPO_DEFAULT_CONFIG.copy() +cfg_ppo["random_timesteps"] = 0 +cfg_ppo["learning_starts"] = 0 +cfg_ppo["state_preprocessor"] = RunningStandardScaler +cfg_ppo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device} +# logging to TensorBoard each 32 timesteps an ignore checkpoints +cfg_ppo["experiment"]["write_interval"] = 32 +cfg_ppo["experiment"]["checkpoint_interval"] = 0 + +agent = PPO(models=models_ppo, + memory=None, + cfg=cfg_ppo, + observation_space=env.observation_space, + action_space=env.action_space, + device=device) + +# load checkpoints +if control_space == "joint": + agent.load("./agent_joint.pt") +elif control_space == "cartesian": + agent.load("./agent_cartesian.pt") + + +# Configure and instantiate the RL trainer +cfg_trainer = {"timesteps": 1000, "headless": True} +trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent) + +# start evaluation +trainer.eval() diff --git a/docs/source/examples/real_world/franka_emika_panda/reaching_franka_sim_env.py b/docs/source/examples/real_world/franka_emika_panda/reaching_franka_sim_env.py new file mode 100644 index 00000000..d530abc7 --- /dev/null +++ b/docs/source/examples/real_world/franka_emika_panda/reaching_franka_sim_env.py @@ -0,0 +1,262 @@ +import torch +import numpy as np + +from omniisaacgymenvs.tasks.base.rl_task import RLTask +from omniisaacgymenvs.robots.articulations.franka import Franka as Robot + +from omni.isaac.core.prims import RigidPrimView, XFormPrimView +from omni.isaac.core.articulations import ArticulationView +from omni.isaac.core.objects import VisualSphere +from omni.isaac.core.utils.prims import get_prim_at_path + +from skrl.utils import omniverse_isaacgym_utils + +# post_physics_step calls +# - get_observations() +# - get_states() +# - calculate_metrics() +# - is_done() +# - get_extras() + + +TASK_CFG = {"test": False, + "device_id": 0, + "headless": True, + "sim_device": "gpu", + "task": {"name": "ReachingFranka", + "physics_engine": "physx", + "env": {"numEnvs": 1024, + "envSpacing": 1.5, + "episodeLength": 100, + "enableDebugVis": False, + "clipObservations": 1000.0, + "clipActions": 1.0, + "controlFrequencyInv": 4, + "actionScale": 2.5, + "dofVelocityScale": 0.1, + "controlSpace": "cartesian"}, + "sim": {"dt": 0.0083, # 1 / 120 + "use_gpu_pipeline": True, + "gravity": [0.0, 0.0, -9.81], + "add_ground_plane": True, + "use_flatcache": True, + "enable_scene_query_support": False, + "enable_cameras": False, + "default_physics_material": {"static_friction": 1.0, + "dynamic_friction": 1.0, + "restitution": 0.0}, + "physx": {"worker_thread_count": 4, + "solver_type": 1, + "use_gpu": True, + "solver_position_iteration_count": 4, + "solver_velocity_iteration_count": 1, + "contact_offset": 0.005, + "rest_offset": 0.0, + "bounce_threshold_velocity": 0.2, + "friction_offset_threshold": 0.04, + "friction_correlation_distance": 0.025, + "enable_sleeping": True, + "enable_stabilization": True, + "max_depenetration_velocity": 1000.0, + "gpu_max_rigid_contact_count": 524288, + "gpu_max_rigid_patch_count": 33554432, + "gpu_found_lost_pairs_capacity": 524288, + "gpu_found_lost_aggregate_pairs_capacity": 262144, + "gpu_total_aggregate_pairs_capacity": 1048576, + "gpu_max_soft_body_contacts": 1048576, + "gpu_max_particle_contacts": 1048576, + "gpu_heap_capacity": 33554432, + 
"gpu_temp_buffer_capacity": 16777216, + "gpu_max_num_partitions": 8}, + "robot": {"override_usd_defaults": False, + "fixed_base": False, + "enable_self_collisions": False, + "enable_gyroscopic_forces": True, + "solver_position_iteration_count": 4, + "solver_velocity_iteration_count": 1, + "sleep_threshold": 0.005, + "stabilization_threshold": 0.001, + "density": -1, + "max_depenetration_velocity": 1000.0, + "contact_offset": 0.005, + "rest_offset": 0.0}}}} + + +class RobotView(ArticulationView): + def __init__(self, prim_paths_expr: str, name: str = "robot_view") -> None: + super().__init__(prim_paths_expr=prim_paths_expr, name=name, reset_xform_properties=False) + + +class ReachingFrankaTask(RLTask): + def __init__(self, name, sim_config, env, offset=None) -> None: + self._sim_config = sim_config + self._cfg = sim_config.config + self._task_cfg = sim_config.task_config + + self.dt = 1 / 120.0 + + self._num_envs = self._task_cfg["env"]["numEnvs"] + self._env_spacing = self._task_cfg["env"]["envSpacing"] + self._action_scale = self._task_cfg["env"]["actionScale"] + self._dof_vel_scale = self._task_cfg["env"]["dofVelocityScale"] + self._max_episode_length = self._task_cfg["env"]["episodeLength"] + self._control_space = self._task_cfg["env"]["controlSpace"] + + # observation and action space + self._num_observations = 18 + if self._control_space == "joint": + self._num_actions = 7 + elif self._control_space == "cartesian": + self._num_actions = 3 + else: + raise ValueError("Invalid control space: {}".format(self._control_space)) + + self._end_effector_link = "panda_leftfinger" + + RLTask.__init__(self, name, env) + + def set_up_scene(self, scene) -> None: + self.get_robot() + self.get_target() + + super().set_up_scene(scene) + + # robot view + self._robots = RobotView(prim_paths_expr="/World/envs/.*/robot", name="robot_view") + scene.add(self._robots) + # end-effectors view + self._end_effectors = RigidPrimView(prim_paths_expr="/World/envs/.*/robot/{}".format(self._end_effector_link), name="end_effector_view") + scene.add(self._end_effectors) + # hands view (cartesian) + if self._control_space == "cartesian": + self._hands = RigidPrimView(prim_paths_expr="/World/envs/.*/robot/panda_hand", name="hand_view", reset_xform_properties=False) + scene.add(self._hands) + # target view + self._targets = XFormPrimView(prim_paths_expr="/World/envs/.*/target", name="target_view", reset_xform_properties=False) + scene.add(self._targets) + + self.init_data() + + def get_robot(self): + robot = Robot(prim_path=self.default_zero_env_path + "/robot", + translation=torch.tensor([0.0, 0.0, 0.0]), + orientation=torch.tensor([1.0, 0.0, 0.0, 0.0]), + name="robot") + self._sim_config.apply_articulation_settings("robot", get_prim_at_path(robot.prim_path), self._sim_config.parse_actor_config("robot")) + + def get_target(self): + target = VisualSphere(prim_path=self.default_zero_env_path + "/target", + name="target", + radius=0.025, + color=torch.tensor([1, 0, 0])) + target.set_collision_enabled(False) + + def init_data(self) -> None: + self.robot_default_dof_pos = torch.tensor(np.radians([0, -45, 0, -135, 0, 90, 45, 0, 0]), device=self._device, dtype=torch.float32) + self.actions = torch.zeros((self._num_envs, self.num_actions), device=self._device) + + if self._control_space == "cartesian": + self.jacobians = torch.zeros((self._num_envs, 10, 6, 9), device=self._device) + self.hand_pos, self.hand_rot = torch.zeros((self._num_envs, 3), device=self._device), torch.zeros((self._num_envs, 4), device=self._device) + + def 
get_observations(self) -> dict: + robot_dof_pos = self._robots.get_joint_positions(clone=False) + robot_dof_vel = self._robots.get_joint_velocities(clone=False) + end_effector_pos, end_effector_rot = self._end_effectors.get_local_poses() + target_pos, target_rot = self._targets.get_local_poses() + + dof_pos_scaled = 2.0 * (robot_dof_pos - self.robot_dof_lower_limits) \ + / (self.robot_dof_upper_limits - self.robot_dof_lower_limits) - 1.0 + dof_vel_scaled = robot_dof_vel * self._dof_vel_scale + + generalization_noise = torch.rand((dof_vel_scaled.shape[0], 7), device=self._device) + 0.5 + + self.obs_buf[:, 0] = self.progress_buf / self._max_episode_length + self.obs_buf[:, 1:8] = dof_pos_scaled[:, :7] + self.obs_buf[:, 8:15] = dof_vel_scaled[:, :7] * generalization_noise + self.obs_buf[:, 15:18] = target_pos + + # compute distance for calculate_metrics() and is_done() + self._computed_distance = torch.norm(end_effector_pos - target_pos, dim=-1) + + if self._control_space == "cartesian": + self.jacobians = self._robots.get_jacobians(clone=False) + self.hand_pos, self.hand_rot = self._hands.get_local_poses() + + return {self._robots.name: {"obs_buf": self.obs_buf}} + + def pre_physics_step(self, actions) -> None: + reset_env_ids = self.reset_buf.nonzero(as_tuple=False).squeeze(-1) + if len(reset_env_ids) > 0: + self.reset_idx(reset_env_ids) + + self.actions = actions.clone().to(self._device) + env_ids_int32 = torch.arange(self._robots.count, dtype=torch.int32, device=self._device) + + if self._control_space == "joint": + targets = self.robot_dof_targets[:, :7] + self.robot_dof_speed_scales[:7] * self.dt * self.actions * self._action_scale + + elif self._control_space == "cartesian": + goal_position = self.hand_pos + actions / 100.0 + delta_dof_pos = omniverse_isaacgym_utils.ik(jacobian_end_effector=self.jacobians[:, 8 - 1, :, :7], # franka hand index: 8 + current_position=self.hand_pos, + current_orientation=self.hand_rot, + goal_position=goal_position, + goal_orientation=None) + targets = self.robot_dof_targets[:, :7] + delta_dof_pos + + self.robot_dof_targets[:, :7] = torch.clamp(targets, self.robot_dof_lower_limits[:7], self.robot_dof_upper_limits[:7]) + self.robot_dof_targets[:, 7:] = 0 + self._robots.set_joint_position_targets(self.robot_dof_targets, indices=env_ids_int32) + + def reset_idx(self, env_ids) -> None: + indices = env_ids.to(dtype=torch.int32) + + # reset robot + pos = torch.clamp(self.robot_default_dof_pos.unsqueeze(0) + 0.25 * (torch.rand((len(env_ids), self.num_robot_dofs), device=self._device) - 0.5), + self.robot_dof_lower_limits, self.robot_dof_upper_limits) + dof_pos = torch.zeros((len(indices), self._robots.num_dof), device=self._device) + dof_pos[:, :] = pos + dof_pos[:, 7:] = 0 + dof_vel = torch.zeros((len(indices), self._robots.num_dof), device=self._device) + self.robot_dof_targets[env_ids, :] = pos + self.robot_dof_pos[env_ids, :] = pos + + self._robots.set_joint_position_targets(self.robot_dof_targets[env_ids], indices=indices) + self._robots.set_joint_positions(dof_pos, indices=indices) + self._robots.set_joint_velocities(dof_vel, indices=indices) + + # reset target + pos = (torch.rand((len(env_ids), 3), device=self._device) - 0.5) * 2 + pos[:, 0] = 0.50 + pos[:, 0] * 0.25 + pos[:, 1] = 0.00 + pos[:, 1] * 0.25 + pos[:, 2] = 0.20 + pos[:, 2] * 0.10 + + self._targets.set_local_poses(pos, indices=indices) + + # bookkeeping + self.reset_buf[env_ids] = 0 + self.progress_buf[env_ids] = 0 + + def post_reset(self): + self.num_robot_dofs = self._robots.num_dof + 
self.robot_dof_pos = torch.zeros((self.num_envs, self.num_robot_dofs), device=self._device) + dof_limits = self._robots.get_dof_limits() + self.robot_dof_lower_limits = dof_limits[0, :, 0].to(device=self._device) + self.robot_dof_upper_limits = dof_limits[0, :, 1].to(device=self._device) + self.robot_dof_speed_scales = torch.ones_like(self.robot_dof_lower_limits) + self.robot_dof_targets = torch.zeros((self._num_envs, self.num_robot_dofs), dtype=torch.float, device=self._device) + + # randomize all envs + indices = torch.arange(self._num_envs, dtype=torch.int64, device=self._device) + self.reset_idx(indices) + + def calculate_metrics(self) -> None: + self.rew_buf[:] = -self._computed_distance + + def is_done(self) -> None: + self.reset_buf.fill_(0) + # target reached + self.reset_buf = torch.where(self._computed_distance <= 0.035, torch.ones_like(self.reset_buf), self.reset_buf) + # max episode length + self.reset_buf = torch.where(self.progress_buf >= self._max_episode_length - 1, torch.ones_like(self.reset_buf), self.reset_buf) diff --git a/docs/source/examples/real_world/franka_emika_panda/reaching_franka_sim_skrl_eval.py b/docs/source/examples/real_world/franka_emika_panda/reaching_franka_sim_skrl_eval.py new file mode 100644 index 00000000..f9283779 --- /dev/null +++ b/docs/source/examples/real_world/franka_emika_panda/reaching_franka_sim_skrl_eval.py @@ -0,0 +1,92 @@ +import torch +import torch.nn as nn + +# Import the skrl components to build the RL system +from skrl.models.torch import Model, GaussianMixin +from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG +from skrl.resources.preprocessors.torch import RunningStandardScaler +from skrl.trainers.torch import SequentialTrainer +from skrl.utils.omniverse_isaacgym_utils import get_env_instance +from skrl.envs.torch import wrap_env + + +# Define only the policy for evaluation +class Policy(GaussianMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + clip_log_std=True, min_log_std=-20, max_log_std=2): + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) + + self.net = nn.Sequential(nn.Linear(self.num_observations, 256), + nn.ELU(), + nn.Linear(256, 128), + nn.ELU(), + nn.Linear(128, 64), + nn.ELU(), + nn.Linear(64, self.num_actions)) + self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + def compute(self, states, taken_actions, role): + return self.net(states), self.log_std_parameter + + +# instance VecEnvBase and setup task +headless = True # set headless to False for rendering +env = get_env_instance(headless=headless) + +from omniisaacgymenvs.utils.config_utils.sim_config import SimConfig +from reaching_franka_sim_env import ReachingFrankaTask, TASK_CFG + +TASK_CFG["headless"] = headless +TASK_CFG["task"]["env"]["numEnvs"] = 64 +TASK_CFG["task"]["env"]["controlSpace"] = "joint" # "joint" or "cartesian" + +sim_config = SimConfig(TASK_CFG) +task = ReachingFrankaTask(name="ReachingFranka", sim_config=sim_config, env=env) +env.set_task(task=task, sim_params=sim_config.get_physics_params(), backend="torch", init_sim=True) + +# wrap the environment +env = wrap_env(env, "omniverse-isaacgym") + +device = env.device + + +# Instantiate the agent's policy. 
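+# For evaluation only the policy network is instantiated; the value model and the rollout
+# memory used during training are not required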
+# PPO requires 2 models, visit its documentation for more details +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models +models_ppo = {} +models_ppo["policy"] = Policy(env.observation_space, env.action_space, device) + + +# Configure and instantiate the agent. +# Only modify some of the default configuration, visit its documentation to see all the options +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters +cfg_ppo = PPO_DEFAULT_CONFIG.copy() +cfg_ppo["random_timesteps"] = 0 +cfg_ppo["learning_starts"] = 0 +cfg_ppo["state_preprocessor"] = RunningStandardScaler +cfg_ppo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device} +# logging to TensorBoard each 32 timesteps an ignore checkpoints +cfg_ppo["experiment"]["write_interval"] = 32 +cfg_ppo["experiment"]["checkpoint_interval"] = 0 + +agent = PPO(models=models_ppo, + memory=None, + cfg=cfg_ppo, + observation_space=env.observation_space, + action_space=env.action_space, + device=device) + +# load checkpoints +if TASK_CFG["task"]["env"]["controlSpace"] == "joint": + agent.load("./agent_joint.pt") +elif TASK_CFG["task"]["env"]["controlSpace"] == "cartesian": + agent.load("./agent_cartesian.pt") + + +# Configure and instantiate the RL trainer +cfg_trainer = {"timesteps": 5000, "headless": True} +trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent) + +# start evaluation +trainer.eval() diff --git a/docs/source/examples/real_world/franka_emika_panda/reaching_franka_sim_skrl_train.py b/docs/source/examples/real_world/franka_emika_panda/reaching_franka_sim_skrl_train.py new file mode 100644 index 00000000..b9a31852 --- /dev/null +++ b/docs/source/examples/real_world/franka_emika_panda/reaching_franka_sim_skrl_train.py @@ -0,0 +1,133 @@ +import torch +import torch.nn as nn + +# Import the skrl components to build the RL system +from skrl.models.torch import Model, GaussianMixin, DeterministicMixin +from skrl.memories.torch import RandomMemory +from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG +from skrl.resources.schedulers.torch import KLAdaptiveRL +from skrl.resources.preprocessors.torch import RunningStandardScaler +from skrl.trainers.torch import SequentialTrainer +from skrl.utils.omniverse_isaacgym_utils import get_env_instance +from skrl.envs.torch import wrap_env +from skrl.utils import set_seed + + +# set the seed for reproducibility +set_seed(42) + + +# Define the models (stochastic and deterministic models) for the agent using helper mixin. 
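+# (the two models below use an identical 256-128-64 ELU architecture; the policy additionally
+# learns a state-independent log standard deviation for its Gaussian action distribution)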
+# - Policy: takes as input the environment's observation/state and returns an action +# - Value: takes the state as input and provides a value to guide the policy +class Policy(GaussianMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + clip_log_std=True, min_log_std=-20, max_log_std=2): + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) + + self.net = nn.Sequential(nn.Linear(self.num_observations, 256), + nn.ELU(), + nn.Linear(256, 128), + nn.ELU(), + nn.Linear(128, 64), + nn.ELU(), + nn.Linear(64, self.num_actions)) + self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + def compute(self, states, taken_actions, role): + return self.net(states), self.log_std_parameter + +class Value(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) + + self.net = nn.Sequential(nn.Linear(self.num_observations, 256), + nn.ELU(), + nn.Linear(256, 128), + nn.ELU(), + nn.Linear(128, 64), + nn.ELU(), + nn.Linear(64, 1)) + + def compute(self, states, taken_actions, role): + return self.net(states) + + +# instance VecEnvBase and setup task +headless = True # set headless to False for rendering +env = get_env_instance(headless=headless) + +from omniisaacgymenvs.utils.config_utils.sim_config import SimConfig +from reaching_franka_sim_env import ReachingFrankaTask, TASK_CFG + +TASK_CFG["headless"] = headless +TASK_CFG["task"]["env"]["numEnvs"] = 1024 +TASK_CFG["task"]["env"]["controlSpace"] = "joint" # "joint" or "cartesian" + +sim_config = SimConfig(TASK_CFG) +task = ReachingFrankaTask(name="ReachingFranka", sim_config=sim_config, env=env) +env.set_task(task=task, sim_params=sim_config.get_physics_params(), backend="torch", init_sim=True) + +# wrap the environment +env = wrap_env(env, "omniverse-isaacgym") + +device = env.device + + +# Instantiate a RandomMemory as rollout buffer (any memory can be used for this) +memory = RandomMemory(memory_size=16, num_envs=env.num_envs, device=device) + + +# Instantiate the agent's models (function approximators). +# PPO requires 2 models, visit its documentation for more details +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models +models_ppo = {} +models_ppo["policy"] = Policy(env.observation_space, env.action_space, device) +models_ppo["value"] = Value(env.observation_space, env.action_space, device) + + +# Configure and instantiate the agent. 
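+# Sizing note, derived from the values configured in this script: 1024 environments * 16 rollout
+# timesteps = 16384 transitions per PPO update, split into 8 mini-batches (2048 samples each)
+# and iterated over 8 learning epochs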
+# Only modify some of the default configuration, visit its documentation to see all the options +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters +cfg_ppo = PPO_DEFAULT_CONFIG.copy() +cfg_ppo["rollouts"] = 16 +cfg_ppo["learning_epochs"] = 8 +cfg_ppo["mini_batches"] = 8 +cfg_ppo["discount_factor"] = 0.99 +cfg_ppo["lambda"] = 0.95 +cfg_ppo["learning_rate"] = 5e-4 +cfg_ppo["learning_rate_scheduler"] = KLAdaptiveRL +cfg_ppo["learning_rate_scheduler_kwargs"] = {"kl_threshold": 0.008} +cfg_ppo["random_timesteps"] = 0 +cfg_ppo["learning_starts"] = 0 +cfg_ppo["grad_norm_clip"] = 1.0 +cfg_ppo["ratio_clip"] = 0.2 +cfg_ppo["value_clip"] = 0.2 +cfg_ppo["clip_predicted_values"] = True +cfg_ppo["entropy_loss_scale"] = 0.0 +cfg_ppo["value_loss_scale"] = 2.0 +cfg_ppo["kl_threshold"] = 0 +cfg_ppo["state_preprocessor"] = RunningStandardScaler +cfg_ppo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device} +cfg_ppo["value_preprocessor"] = RunningStandardScaler +cfg_ppo["value_preprocessor_kwargs"] = {"size": 1, "device": device} +# logging to TensorBoard and write checkpoints each 32 and 250 timesteps respectively +cfg_ppo["experiment"]["write_interval"] = 32 +cfg_ppo["experiment"]["checkpoint_interval"] = 250 + +agent = PPO(models=models_ppo, + memory=memory, + cfg=cfg_ppo, + observation_space=env.observation_space, + action_space=env.action_space, + device=device) + + +# Configure and instantiate the RL trainer +cfg_trainer = {"timesteps": 5000, "headless": True} +trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent) + +# start training +trainer.train() From 1779eebfa8a39bfaa30eee4e091ff7906f798cac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Mon, 19 Sep 2022 16:22:49 +0200 Subject: [PATCH 083/108] Add Franka Emika example to docs --- docs/source/intro/examples.rst | 153 +++++++++++++++++++++++++++++++++ 1 file changed, 153 insertions(+) diff --git a/docs/source/intro/examples.rst b/docs/source/intro/examples.rst index ea41d7e2..0a365f44 100644 --- a/docs/source/intro/examples.rst +++ b/docs/source/intro/examples.rst @@ -768,6 +768,159 @@ These examples show how to train an agent in an Omniverse Isaac Sim environment :language: python :emphasize-lines: 24-39, 45, 53-68, 73 +Real-world examples +------------------- + +These examples show basic real-world use cases to guide and support advanced RL implementations + +.. tabs:: + + .. tab:: Franka Emika Panda + + **3D reaching task (Franka's gripper must reach a certain target point in space)**. The training was done in Omniverse Isaac Gym. The real robot control is performed through the Python API of a modified version of frankx (see `frankx's pull request #42 `_), a high-level motion library around libfranka. Training and evaluation is performed for both Cartesian and joint control space + + .. raw:: html + +
+ + **Implementation** (see details in the table below): + + * The observation space is composed of the episode's normalized progress, the robot joints' normalized positions (:math:`q`) in the interval -1 to 1, the robot joints' velocities (:math:`\dot{q}`) affected by a random uniform scale for generalization, and the target's position in space (:math:`target_{_{XYZ}}`) with respect to the robot's base + + * The action space, bounded in the range -1 to 1, consists of the following. For the Cartesian control it's the end-effector's position (:math:`ee_{_{XYZ}}`) scaled change. For the joint control it's robot joints' position scaled change. The end-effector's position corresponds to the gripper fingers, which remain closed all the time + + * The instantaneous reward is the negative value of the Euclidean distance (:math:`\text{d}`) between the robot end-effector and the target point position. The episode terminates when this distance is less than 0.035 meters in simulation (0.075 meters in real-world) or when the defined maximum timestep is reached + + * The target position lies within a rectangular cuboid of dimensions 0.5 x 0.5 x 0.2 meters centered at 0.5, 0.0, 0.2 meters with respect to the robot's base. The robot joints' positions are drawn from an initial configuration [0º, -45º, 0º, -135º, 0º, 90º, 45º] modified with uniform random values between -7º and 7º approximately + + .. list-table:: + :header-rows: 1 + + * - Variable + - Formula / value + - Size + * - Observation space + - :math:`\dfrac{t}{t_{max}},\; 2 \dfrac{q - q_{min}}{q_{max} - q_{min}} - 1,\; 0.1\,\dot{q}\,U(0.5,1.5),\; target_{_{XYZ}}` + - 18 + * - Action space (joint) + - :math:`\dfrac{2.5}{120} \, \Delta q` + - 7 + * - Action space (Cartesian) + - :math:`\dfrac{1}{100} \, \Delta ee_{_{XYZ}}` + - 3 + * - Reward + - :math:`-\text{d}(ee_{_{XYZ}},\; target_{_{XYZ}})` + - + * - Episode termination + - :math:`\text{d}(ee_{_{XYZ}},\; target_{_{XYZ}}) \le 0.035 \quad` or :math:`\quad t \ge t_{max} - 1` + - + * - Maximum timesteps (:math:`t_{max}`) + - 100 + - + + .. raw:: html + +
+ + **Workflows** + + .. tabs:: + + .. tab:: Simulation + + .. raw:: html + + + + .. raw:: html + + + + | + + **Prerequisites:** + + All installation steps described in Omniverse Isaac Gym's `Overview & Getting Started `_ section must be fulfilled (especially the subsection 1.3. Installing Examples Repository) + + **Files** (the implementation is self-contained so no specific location is required): + + * Environment: :download:`reaching_franka_sim_env.py <../examples/real_world/franka_emika_panda/reaching_franka_sim_env.py>` + * Training script: :download:`reaching_franka_sim_skrl_train.py <../examples/real_world/franka_emika_panda/reaching_franka_sim_skrl_train.py>` + * Evaluation script: :download:`reaching_franka_sim_skrl_eval.py <../examples/real_world/franka_emika_panda/reaching_franka_sim_skrl_eval.py>` + + **Training and evaluation:** + + .. code-block:: bash + + # training (local workstation) + ~/.local/share/ov/pkg/isaac_sim-*/python.sh reaching_franka_sim_skrl_train.py + + # training (docker container) + /isaac-sim/python.sh reaching_franka_sim_skrl_train.py + + .. code-block:: bash + + # evaluation (local workstation) + ~/.local/share/ov/pkg/isaac_sim-*/python.sh reaching_franka_sim_skrl_eval.py + + # evaluation (docker container) + /isaac-sim/python.sh reaching_franka_sim_skrl_eval.py + + **Main environment configuration:** + + The control space (Cartesian or joint) can be specified in the task configuration dictionary (from :literal:`reaching_franka_sim_skrl_train.py`) as follow: + + .. code-block:: python + + TASK_CFG["task"]["env"]["controlSpace"] = "joint" # "joint" or "cartesian" + + .. tab:: Real-world + + .. warning:: + + Make sure you have the e-stop on hand in case something goes wrong in the run. **Control via RL can be dangerous and unsafe for both the operator and the robot** + + .. raw:: html + + + Target position entered via the command prompt or generated randomly +

+ + Target position in X and Y obtained with a USB-camera (position in Z fixed at 0.2 m) + + | + + **Prerequisites:** + + A physical Franka robot with `Franka Control Interface (FCI) `_ is required. Additionally, the frankx library must be available in the python environment (see `frankx's pull request #42 `_ for the RL-compatible version installation) + + **Files** + + * Environment: :download:`reaching_franka_real_env.py <../examples/real_world/franka_emika_panda/reaching_franka_real_env.py>` + * Evaluation script: :download:`reaching_franka_real_skrl_eval.py <../examples/real_world/franka_emika_panda/reaching_franka_real_skrl_eval.py>` + + **Evaluation:** + + .. code-block:: bash + + python3 reaching_franka_real_skrl_eval.py + + **Main environment configuration:** + + The control space (Cartesian or joint), the robot motion type (waypoint or impedance) and the target position acquisition (command prompt / automatically generated or USB-camera) can be specified in the environment class constructor (from :literal:`reaching_franka_real_skrl_eval.py`) as follow: + + .. code-block:: python + + control_space = "joint" # joint or cartesian + motion_type = "waypoint" # waypoint or impedance + camera_tracking = False # True for USB-camera tracking + .. _library_utilities: Library utilities (skrl.utils module) From 8ff907c2188935bbea10f694272c53ffa00360a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Mon, 19 Sep 2022 22:24:45 +0200 Subject: [PATCH 084/108] Add Omniverse Isaac Gym utilities --- skrl/utils/omniverse_isaacgym_utils.py | 265 +++++++++++++++++++++++++ 1 file changed, 265 insertions(+) create mode 100644 skrl/utils/omniverse_isaacgym_utils.py diff --git a/skrl/utils/omniverse_isaacgym_utils.py b/skrl/utils/omniverse_isaacgym_utils.py new file mode 100644 index 00000000..42d5da67 --- /dev/null +++ b/skrl/utils/omniverse_isaacgym_utils.py @@ -0,0 +1,265 @@ +from typing import Optional, Union + +import torch +import numpy as np + + +def _np_quat_mul(a, b): + assert a.shape == b.shape + shape = a.shape + a = a.reshape(-1, 4) + b = b.reshape(-1, 4) + + x1, y1, z1, w1 = a[:, 0], a[:, 1], a[:, 2], a[:, 3] + x2, y2, z2, w2 = b[:, 0], b[:, 1], b[:, 2], b[:, 3] + ww = (z1 + x1) * (x2 + y2) + yy = (w1 - y1) * (w2 + z2) + zz = (w1 + y1) * (w2 - z2) + xx = ww + yy + zz + qq = 0.5 * (xx + (z1 - x1) * (x2 - y2)) + w = qq - ww + (z1 - y1) * (y2 - z2) + x = qq - xx + (x1 + w1) * (x2 + w2) + y = qq - yy + (w1 - x1) * (y2 + z2) + z = qq - zz + (z1 + y1) * (w2 - x2) + + return np.stack([x, y, z, w], axis=-1).reshape(shape) + +def _np_quat_conjugate(a): + shape = a.shape + a = a.reshape(-1, 4) + return np.concatenate((-a[:, :3], a[:, -1:]), axis=-1).reshape(shape) + +def _torch_quat_mul(a, b): + assert a.shape == b.shape + shape = a.shape + a = a.reshape(-1, 4) + b = b.reshape(-1, 4) + + w1, x1, y1, z1 = a[:, 0], a[:, 1], a[:, 2], a[:, 3] + w2, x2, y2, z2 = b[:, 0], b[:, 1], b[:, 2], b[:, 3] + ww = (z1 + x1) * (x2 + y2) + yy = (w1 - y1) * (w2 + z2) + zz = (w1 + y1) * (w2 - z2) + xx = ww + yy + zz + qq = 0.5 * (xx + (z1 - x1) * (x2 - y2)) + w = qq - ww + (z1 - y1) * (y2 - z2) + x = qq - xx + (x1 + w1) * (x2 + w2) + y = qq - yy + (w1 - x1) * (y2 + z2) + z = qq - zz + (z1 + y1) * (w2 - x2) + + return torch.stack([w, x, y, z], dim=-1).view(shape) + +def _torch_quat_conjugate(a): # wxyz + shape = a.shape + a = a.reshape(-1, 4) + return torch.cat((a[:, :1], -a[:, 1:]), dim=-1).view(shape) + +def ik(jacobian_end_effector: torch.Tensor, + current_position: torch.Tensor, + 
current_orientation: torch.Tensor, + goal_position: torch.Tensor, + goal_orientation: Optional[torch.Tensor] = None, + damping_factor: float = 0.05, + squeeze_output: bool = True) -> torch.Tensor: + """Inverse kinematics using damped least squares method + + :param jacobian_end_effector: End effector's jacobian + :type jacobian_end_effector: torch.Tensor + :param current_position: End effector's current position + :type current_position: torch.Tensor + :param current_orientation: End effector's current orientation + :type current_orientation: torch.Tensor + :param goal_position: End effector's goal position + :type goal_position: torch.Tensor + :param goal_orientation: End effector's goal orientation (default: ``None``) + :type goal_orientation: torch.Tensor, optional + :param damping_factor: Damping factor (default: ``0.05``) + :type damping_factor: float, optional + :param squeeze_output: Squeeze output (default: ``True``) + :type squeeze_output: bool, optional + + :return: Change in joint angles + :rtype: torch.Tensor + """ + if goal_orientation is None: + goal_orientation = current_orientation + + # torch + if isinstance(jacobian_end_effector, torch.Tensor): + # compute error + + q = _torch_quat_mul(goal_orientation, _torch_quat_conjugate(current_orientation)) + error = torch.cat([goal_position - current_position, # position error + q[:, 1:] * torch.sign(q[:, 0]).unsqueeze(-1)], # orientation error + dim=-1).unsqueeze(-1) + + # solve damped least squares (dO = J.T * V) + transpose = torch.transpose(jacobian_end_effector, 1, 2) + lmbda = torch.eye(6, device=jacobian_end_effector.device) * (damping_factor ** 2) + if squeeze_output: + return (transpose @ torch.inverse(jacobian_end_effector @ transpose + lmbda) @ error).squeeze(dim=2) + else: + return transpose @ torch.inverse(jacobian_end_effector @ transpose + lmbda) @ error + + # numpy + # TODO: test and fix this + else: + # compute error + q = _np_quat_mul(goal_orientation, _np_quat_conjugate(current_orientation)) + error = np.concatenate([goal_position - current_position, # position error + q[:, 0:3] * np.sign(q[:, 3])]) # orientation error + + # solve damped least squares (dO = J.T * V) + transpose = np.transpose(jacobian_end_effector, 1, 2) + lmbda = np.eye(6) * (damping_factor ** 2) + if squeeze_output: + return (transpose @ np.linalg.inv(jacobian_end_effector @ transpose + lmbda) @ error) + else: + return transpose @ np.linalg.inv(jacobian_end_effector @ transpose + lmbda) @ error + +def get_env_instance(headless: bool = True, multi_threaded: bool = False) -> "omni.isaac.gym.vec_env.VecEnvBase": + """ + Instantiate a VecEnvBase-based object compatible with OmniIsaacGymEnvs + + :param headless: Disable UI when running (default: ``True``) + :type headless: bool, optional + :param multi_threaded: Whether to return a multi-threaded environment instance (default: ``False``) + :type multi_threaded: bool, optional + + :return: Environment instance + :rtype: omni.isaac.gym.vec_env.VecEnvBase + + Example:: + + from skrl.envs.torch import wrap_env + from skrl.utils.omniverse_isaacgym_utils import get_env_instance + + # get environment instance + env = get_env_instance(headless=True) + + # parse sim configuration + from omniisaacgymenvs.utils.config_utils.sim_config import SimConfig + sim_config = SimConfig({"test": False, + "device_id": 0, + "headless": True, + "sim_device": "gpu", + "task": {"name": "CustomTask", + "physics_engine": "physx", + "env": {"numEnvs": 512, + "envSpacing": 1.5, + "enableDebugVis": False, + "clipObservations": 
1000.0, + "clipActions": 1.0, + "controlFrequencyInv": 4}, + "sim": {"dt": 0.0083, # 1 / 120 + "use_gpu_pipeline": True, + "gravity": [0.0, 0.0, -9.81], + "add_ground_plane": True, + "use_flatcache": True, + "enable_scene_query_support": False, + "enable_cameras": False, + "default_physics_material": {"static_friction": 1.0, + "dynamic_friction": 1.0, + "restitution": 0.0}, + "physx": {"worker_thread_count": 4, + "solver_type": 1, + "use_gpu": True, + "solver_position_iteration_count": 4, + "solver_velocity_iteration_count": 1, + "contact_offset": 0.005, + "rest_offset": 0.0, + "bounce_threshold_velocity": 0.2, + "friction_offset_threshold": 0.04, + "friction_correlation_distance": 0.025, + "enable_sleeping": True, + "enable_stabilization": True, + "max_depenetration_velocity": 1000.0, + "gpu_max_rigid_contact_count": 524288, + "gpu_max_rigid_patch_count": 33554432, + "gpu_found_lost_pairs_capacity": 524288, + "gpu_found_lost_aggregate_pairs_capacity": 262144, + "gpu_total_aggregate_pairs_capacity": 1048576, + "gpu_max_soft_body_contacts": 1048576, + "gpu_max_particle_contacts": 1048576, + "gpu_heap_capacity": 33554432, + "gpu_temp_buffer_capacity": 16777216, + "gpu_max_num_partitions": 8}}}}) + + # import and setup custom task + from custom_task import CustomTask + task = CustomTask(name="CustomTask", sim_config=sim_config, env=env) + env.set_task(task=task, sim_params=sim_config.get_physics_params(), backend="torch", init_sim=True) + + # wrap the environment + env = wrap_env(env, "omniverse-isaacgym") + """ + from omni.isaac.gym.vec_env import VecEnvBase, VecEnvMT, TaskStopException + from omni.isaac.gym.vec_env.vec_env_mt import TrainerMT + + class _OmniIsaacGymVecEnv(VecEnvBase): + def step(self, actions): + actions = torch.clamp(actions, -self._task.clip_actions, self._task.clip_actions).to(self._task.device).clone() + self._task.pre_physics_step(actions) + + for _ in range(self._task.control_frequency_inv): + self._world.step(render=self._render) + self.sim_frame_count += 1 + + observations, rewards, dones, info = self._task.post_physics_step() + + return {"obs": torch.clamp(observations, -self._task.clip_obs, self._task.clip_obs).to(self._task.rl_device).clone()}, \ + rewards.to(self._task.rl_device).clone(), dones.to(self._task.rl_device).clone(), info.copy() + + def reset(self): + self._task.reset() + actions = torch.zeros((self.num_envs, self._task.num_actions), device=self._task.device) + return self.step(actions)[0] + + class _OmniIsaacGymTrainerMT(TrainerMT): + def run(self): + pass + + def stop(self): + pass + + class _OmniIsaacGymVecEnvMT(VecEnvMT): + def __init__(self, headless): + super().__init__(headless) + + self.action_queue = queue.Queue(1) + self.data_queue = queue.Queue(1) + + def run(self, trainer=None): + super().run(_OmniIsaacGymTrainerMT() if trainer is None else trainer) + + def _parse_data(self, data): + self._observations = torch.clamp(data["obs"], -self._task.clip_obs, self._task.clip_obs).to(self._task.rl_device).clone() + self._rewards = data["rew"].to(self._task.rl_device).clone() + self._dones = data["reset"].to(self._task.rl_device).clone() + self._info = data["extras"].copy() + + def step(self, actions): + if self._stop: + raise TaskStopException() + + actions = torch.clamp(actions, -self._task.clip_actions, self._task.clip_actions).clone() + + self.send_actions(actions) + data = self.get_data() + + return {"obs": self._observations}, self._rewards, self._dones, self._info + + def reset(self): + self._task.reset() + actions = 
torch.zeros((self.num_envs, self._task.num_actions), device=self._task.device) + return self.step(actions)[0] + + def close(self): + # end stop signal to main thread + self.send_actions(None) + self.stop = True + + if multi_threaded: + return _OmniIsaacGymVecEnvMT(headless=headless) + else: + return _OmniIsaacGymVecEnv(headless=headless) From aba1edf47ca4c902f24c6439778aa8923fc8e739 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Mon, 19 Sep 2022 22:26:30 +0200 Subject: [PATCH 085/108] Add Omniverse Isaac Gym utilities to docs --- docs/source/index.rst | 2 ++ .../skrl.utils.omniverse_isaacgym_utils.rst | 20 +++++++++++++++---- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/docs/source/index.rst b/docs/source/index.rst index 0569fb7c..d48d8ce9 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -198,6 +198,7 @@ Utils * :doc:`Model instantiators ` * Memory and Tensorboard :doc:`file post-processing ` * :doc:`Isaac Gym utils ` + * :doc:`Omniverse Isaac Gym utils ` .. toctree:: :maxdepth: 1 @@ -208,3 +209,4 @@ Utils modules/skrl.utils.model_instantiators modules/skrl.utils.postprocessing modules/skrl.utils.isaacgym_utils + modules/skrl.utils.omniverse_isaacgym_utils diff --git a/docs/source/modules/skrl.utils.omniverse_isaacgym_utils.rst b/docs/source/modules/skrl.utils.omniverse_isaacgym_utils.rst index 806ed6d8..8b4102e5 100644 --- a/docs/source/modules/skrl.utils.omniverse_isaacgym_utils.rst +++ b/docs/source/modules/skrl.utils.omniverse_isaacgym_utils.rst @@ -2,13 +2,13 @@ Omniverse Isaac Gym utils ========================= .. contents:: Table of Contents - :depth: 2 - :local: - :backlinks: none + :depth: 2 + :local: + :backlinks: none .. raw:: html -
+
Control of robotic manipulators ------------------------------- @@ -31,3 +31,15 @@ API """ .. autofunction:: skrl.utils.omniverse_isaacgym_utils.ik + +.. raw:: html + +
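+A minimal usage sketch is shown below. The batch size and tensor shapes are assumptions for a
+7-DOF manipulator, and the identity quaternion follows the (w, x, y, z) convention used by this
+module; when ``goal_orientation`` is ``None`` the current orientation is kept as the goal
+
+.. code-block:: python
+
+    import torch
+    from skrl.utils.omniverse_isaacgym_utils import ik
+
+    num_envs = 1024                                        # assumed number of parallel environments
+    jacobian_end_effector = torch.zeros((num_envs, 6, 7))  # end-effector Jacobian (from the simulation)
+    current_position = torch.zeros((num_envs, 3))          # current end-effector position
+    current_orientation = torch.tensor([[1.0, 0.0, 0.0, 0.0]]).repeat(num_envs, 1)  # current orientation (w, x, y, z)
+    goal_position = torch.rand((num_envs, 3))              # target position
+
+    # change in joint angles (shape: [num_envs, 7]) that moves the end-effector towards the goal
+    delta_dof_pos = ik(jacobian_end_effector=jacobian_end_effector,
+                       current_position=current_position,
+                       current_orientation=current_orientation,
+                       goal_position=goal_position,
+                       goal_orientation=None)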
+ +OmniIsaacGymEnvs-like environment instance +------------------------------------------ + +API +""" + +.. autofunction:: skrl.utils.omniverse_isaacgym_utils.get_env_instance \ No newline at end of file From bf99a34ecf020f1ee46b4d8c6c63d5916a8d30b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Mon, 19 Sep 2022 22:31:41 +0200 Subject: [PATCH 086/108] Add a download link to franka emika example trained checkpoints --- docs/source/intro/examples.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/source/intro/examples.rst b/docs/source/intro/examples.rst index 0a365f44..968711fb 100644 --- a/docs/source/intro/examples.rst +++ b/docs/source/intro/examples.rst @@ -849,6 +849,7 @@ These examples show basic real-world use cases to guide and support advanced RL * Environment: :download:`reaching_franka_sim_env.py <../examples/real_world/franka_emika_panda/reaching_franka_sim_env.py>` * Training script: :download:`reaching_franka_sim_skrl_train.py <../examples/real_world/franka_emika_panda/reaching_franka_sim_skrl_train.py>` * Evaluation script: :download:`reaching_franka_sim_skrl_eval.py <../examples/real_world/franka_emika_panda/reaching_franka_sim_skrl_eval.py>` + * Checkpoints (:literal:`agent_joint.pt`, :literal:`agent_cartesian.pt`): :download:`trained_checkpoints.zip ` **Training and evaluation:** @@ -904,6 +905,7 @@ These examples show basic real-world use cases to guide and support advanced RL * Environment: :download:`reaching_franka_real_env.py <../examples/real_world/franka_emika_panda/reaching_franka_real_env.py>` * Evaluation script: :download:`reaching_franka_real_skrl_eval.py <../examples/real_world/franka_emika_panda/reaching_franka_real_skrl_eval.py>` + * Checkpoints (:literal:`agent_joint.pt`, :literal:`agent_cartesian.pt`): :download:`trained_checkpoints.zip ` **Evaluation:** From 55b9ddf5d1f69b8744b46b3cfb5983ffdd582497 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Mon, 19 Sep 2022 23:31:09 +0200 Subject: [PATCH 087/108] Update frankx pull request link --- docs/source/intro/examples.rst | 96 +++++++++++++++++----------------- 1 file changed, 48 insertions(+), 48 deletions(-) diff --git a/docs/source/intro/examples.rst b/docs/source/intro/examples.rst index 968711fb..0b4c5b88 100644 --- a/docs/source/intro/examples.rst +++ b/docs/source/intro/examples.rst @@ -777,7 +777,7 @@ These examples show basic real-world use cases to guide and support advanced RL .. tab:: Franka Emika Panda - **3D reaching task (Franka's gripper must reach a certain target point in space)**. The training was done in Omniverse Isaac Gym. The real robot control is performed through the Python API of a modified version of frankx (see `frankx's pull request #42 `_), a high-level motion library around libfranka. Training and evaluation is performed for both Cartesian and joint control space + **3D reaching task (Franka's gripper must reach a certain target point in space)**. The training was done in Omniverse Isaac Gym. The real robot control is performed through the Python API of a modified version of frankx (see `frankx's pull request #44 `_), a high-level motion library around libfranka. Training and evaluation is performed for both Cartesian and joint control space .. 
raw:: html @@ -787,7 +787,7 @@ These examples show basic real-world use cases to guide and support advanced RL * The observation space is composed of the episode's normalized progress, the robot joints' normalized positions (:math:`q`) in the interval -1 to 1, the robot joints' velocities (:math:`\dot{q}`) affected by a random uniform scale for generalization, and the target's position in space (:math:`target_{_{XYZ}}`) with respect to the robot's base - * The action space, bounded in the range -1 to 1, consists of the following. For the Cartesian control it's the end-effector's position (:math:`ee_{_{XYZ}}`) scaled change. For the joint control it's robot joints' position scaled change. The end-effector's position corresponds to the gripper fingers, which remain closed all the time + * The action space, bounded in the range -1 to 1, consists of the following. For the joint control it's robot joints' position scaled change. For the Cartesian control it's the end-effector's position (:math:`ee_{_{XYZ}}`) scaled change. The end-effector position frame corresponds to the point where the left finger connects to the gripper base in simulation, whereas in the real world it corresponds to the end of the fingers. The gripper fingers remain closed all the time in both cases * The instantaneous reward is the negative value of the Euclidean distance (:math:`\text{d}`) between the robot end-effector and the target point position. The episode terminates when this distance is less than 0.035 meters in simulation (0.075 meters in real-world) or when the defined maximum timestep is reached @@ -826,6 +826,52 @@ These examples show basic real-world use cases to guide and support advanced RL .. tabs:: + .. tab:: Real-world + + .. warning:: + + Make sure you have the e-stop on hand in case something goes wrong in the run. **Control via RL can be dangerous and unsafe for both the operator and the robot** + + .. raw:: html + + + Target position entered via the command prompt or generated randomly +

+ + Target position in X and Y obtained with a USB-camera (position in Z fixed at 0.2 m) + + | + + **Prerequisites:** + + A physical Franka robot with `Franka Control Interface (FCI) `_ is required. Additionally, the frankx library must be available in the python environment (see `frankx's pull request #44 `_ for the RL-compatible version installation) + + **Files** + + * Environment: :download:`reaching_franka_real_env.py <../examples/real_world/franka_emika_panda/reaching_franka_real_env.py>` + * Evaluation script: :download:`reaching_franka_real_skrl_eval.py <../examples/real_world/franka_emika_panda/reaching_franka_real_skrl_eval.py>` + * Checkpoints (:literal:`agent_joint.pt`, :literal:`agent_cartesian.pt`): :download:`trained_checkpoints.zip ` + + **Evaluation:** + + .. code-block:: bash + + python3 reaching_franka_real_skrl_eval.py + + **Main environment configuration:** + + The control space (Cartesian or joint), the robot motion type (waypoint or impedance) and the target position acquisition (command prompt / automatically generated or USB-camera) can be specified in the environment class constructor (from :literal:`reaching_franka_real_skrl_eval.py`) as follow: + + .. code-block:: python + + control_space = "joint" # joint or cartesian + motion_type = "waypoint" # waypoint or impedance + camera_tracking = False # True for USB-camera tracking + .. tab:: Simulation .. raw:: html @@ -877,52 +923,6 @@ These examples show basic real-world use cases to guide and support advanced RL TASK_CFG["task"]["env"]["controlSpace"] = "joint" # "joint" or "cartesian" - .. tab:: Real-world - - .. warning:: - - Make sure you have the e-stop on hand in case something goes wrong in the run. **Control via RL can be dangerous and unsafe for both the operator and the robot** - - .. raw:: html - - - Target position entered via the command prompt or generated randomly -

- - Target position in X and Y obtained with a USB-camera (position in Z fixed at 0.2 m) - - | - - **Prerequisites:** - - A physical Franka robot with `Franka Control Interface (FCI) `_ is required. Additionally, the frankx library must be available in the python environment (see `frankx's pull request #42 `_ for the RL-compatible version installation) - - **Files** - - * Environment: :download:`reaching_franka_real_env.py <../examples/real_world/franka_emika_panda/reaching_franka_real_env.py>` - * Evaluation script: :download:`reaching_franka_real_skrl_eval.py <../examples/real_world/franka_emika_panda/reaching_franka_real_skrl_eval.py>` - * Checkpoints (:literal:`agent_joint.pt`, :literal:`agent_cartesian.pt`): :download:`trained_checkpoints.zip ` - - **Evaluation:** - - .. code-block:: bash - - python3 reaching_franka_real_skrl_eval.py - - **Main environment configuration:** - - The control space (Cartesian or joint), the robot motion type (waypoint or impedance) and the target position acquisition (command prompt / automatically generated or USB-camera) can be specified in the environment class constructor (from :literal:`reaching_franka_real_skrl_eval.py`) as follow: - - .. code-block:: python - - control_space = "joint" # joint or cartesian - motion_type = "waypoint" # waypoint or impedance - camera_tracking = False # True for USB-camera tracking - .. _library_utilities: Library utilities (skrl.utils module) From 4afc188cef54c4cfc265849321960e3793ba0586 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Thu, 22 Sep 2022 00:21:22 +0200 Subject: [PATCH 088/108] Generate random seed and enable deterministic --- skrl/utils/__init__.py | 66 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 61 insertions(+), 5 deletions(-) diff --git a/skrl/utils/__init__.py b/skrl/utils/__init__.py index 077e31a9..778ef728 100644 --- a/skrl/utils/__init__.py +++ b/skrl/utils/__init__.py @@ -1,12 +1,22 @@ +from typing import Optional + +import os +import sys +import time +import torch import random import numpy as np -import torch + +from skrl import logger -def set_seed(seed: int) -> None: +def set_seed(seed: Optional[int] = None, deterministic: bool = False) -> int: """ Set the seed for the random number generators + Due to NumPy's legacy seeding constraint the seed must be between 0 and 2**32 - 1. + Otherwise a NumPy exception (``ValueError: Seed must be between 0 and 2**32 - 1``) will be raised + Modified packages: - random @@ -15,15 +25,61 @@ def set_seed(seed: int) -> None: Example:: + # fixed seed >>> from skrl.utils import set_seed >>> set_seed(42) + [skrl:INFO] Seed: 42 + 42 + + # random seed + >>> set_seed() + >>> from skrl.utils import set_seed + [skrl:INFO] Seed: 1776118066 + 1776118066 + + # enable deterministic. The following environment variables should be established: + # - CUDA 10.1: CUDA_LAUNCH_BLOCKING=1 + # - CUDA 10.2 or later: CUBLAS_WORKSPACE_CONFIG=:16:8 or CUBLAS_WORKSPACE_CONFIG=:4096:2 + >>> from skrl.utils import set_seed + >>> set_seed(42, deterministic=True) + [skrl:INFO] Seed: 42 + [skrl:WARNING] PyTorch/cuDNN deterministic algorithms are enabled. This may affect performance + 42 + + :param seed: The seed to set. Is None, a random seed will be generated (default: ``None``) + :type seed: int, optional + :param deterministic: Whether PyTorch is configured to use deterministic algorithms (default: ``False``). 
+ The following environment variables should be established for CUDA 10.1 (``CUDA_LAUNCH_BLOCKING=1``) + and for CUDA 10.2 or later (``CUBLAS_WORKSPACE_CONFIG=:16:8`` or ``CUBLAS_WORKSPACE_CONFIG=:4096:2``). + See PyTorch `Reproducibility `_ for details + :type deterministic: bool, optional - :param seed: The seed to set - :type seed: int + :return: Seed + :rtype: int """ + # generate a random seed + if seed is None: + try: + seed = int.from_bytes(os.urandom(4), byteorder=sys.byteorder) + except NotImplementedError: + seed = int(time.time() * 1000) + seed %= 2 ** 31 # NumPy's legacy seeding seed must be between 0 and 2**32 - 1 + random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) torch.cuda.manual_seed(seed) torch.cuda.manual_seed_all(seed) - \ No newline at end of file + + logger.info("Seed: {}".format(seed)) + + if deterministic: + torch.backends.cudnn.benchmark = False + torch.backends.cudnn.deterministic = True + + # On CUDA 10.1, set environment variable CUDA_LAUNCH_BLOCKING=1 + # On CUDA 10.2 or later, set environment variable CUBLAS_WORKSPACE_CONFIG=:16:8 or CUBLAS_WORKSPACE_CONFIG=:4096:2 + + logger.warning("PyTorch/cuDNN deterministic algorithms are enabled. This may affect performance") + + return seed From 7f91d4cb203545b871ccc161d61deb07f7a38b3a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Thu, 22 Sep 2022 21:58:27 +0200 Subject: [PATCH 089/108] Add reference link to resources in docs --- docs/source/modules/skrl.resources.noises.rst | 2 ++ docs/source/modules/skrl.resources.preprocessors.rst | 2 ++ docs/source/modules/skrl.resources.schedulers.rst | 2 ++ 3 files changed, 6 insertions(+) diff --git a/docs/source/modules/skrl.resources.noises.rst b/docs/source/modules/skrl.resources.noises.rst index a988f0fe..e25c87c4 100644 --- a/docs/source/modules/skrl.resources.noises.rst +++ b/docs/source/modules/skrl.resources.noises.rst @@ -1,3 +1,5 @@ +.. _resources_noises: + Noises ====== diff --git a/docs/source/modules/skrl.resources.preprocessors.rst b/docs/source/modules/skrl.resources.preprocessors.rst index d44aa631..21c44a10 100644 --- a/docs/source/modules/skrl.resources.preprocessors.rst +++ b/docs/source/modules/skrl.resources.preprocessors.rst @@ -1,3 +1,5 @@ +.. _resources_preprocessors: + Preprocessors ============= diff --git a/docs/source/modules/skrl.resources.schedulers.rst b/docs/source/modules/skrl.resources.schedulers.rst index e360b730..a879dae6 100644 --- a/docs/source/modules/skrl.resources.schedulers.rst +++ b/docs/source/modules/skrl.resources.schedulers.rst @@ -1,3 +1,5 @@ +.. _resources_schedulers: + Learning rate schedulers ======================== From d8a23abebb07d5e1cccf229585e59e581232a8b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Thu, 22 Sep 2022 21:59:30 +0200 Subject: [PATCH 090/108] Update example code in model's constructor docstring --- skrl/models/torch/gaussian.py | 4 ++-- skrl/models/torch/multivariate_gaussian.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/skrl/models/torch/gaussian.py b/skrl/models/torch/gaussian.py index c56fce15..53fa04b9 100644 --- a/skrl/models/torch/gaussian.py +++ b/skrl/models/torch/gaussian.py @@ -42,9 +42,9 @@ def __init__(self, >>> >>> class Policy(GaussianMixin, Model): ... def __init__(self, observation_space, action_space, device="cuda:0", - ... clip_actions=False, clip_log_std=True, min_log_std=-20, max_log_std=2): + ... 
clip_actions=False, clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"): ... Model.__init__(self, observation_space, action_space, device) - ... GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) + ... GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction) ... ... self.net = nn.Sequential(nn.Linear(self.num_observations, 32), ... nn.ELU(), diff --git a/skrl/models/torch/multivariate_gaussian.py b/skrl/models/torch/multivariate_gaussian.py index b2b85449..4ca63e26 100644 --- a/skrl/models/torch/multivariate_gaussian.py +++ b/skrl/models/torch/multivariate_gaussian.py @@ -34,8 +34,8 @@ def __init__(self, >>> from skrl.models.torch import Model, MultivariateGaussianMixin >>> >>> class Policy(MultivariateGaussianMixin, Model): - ... def __init__(self, observation_space, action_space, device="cuda:0", clip_actions=False, - ... clip_log_std=True, min_log_std=-20, max_log_std=2): + ... def __init__(self, observation_space, action_space, device="cuda:0", + ... clip_actions=False, clip_log_std=True, min_log_std=-20, max_log_std=2): ... Model.__init__(self, observation_space, action_space, device) ... MultivariateGaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) ... From 1396f5b8b543e2fd5ba6d8628620bdc6752508b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Fri, 30 Sep 2022 09:53:29 +0200 Subject: [PATCH 091/108] Update Omniverse Isaac Gym examples --- .../examples/omniisaacgym/ppo_allegro_hand.py | 65 ++++------ docs/source/examples/omniisaacgym/ppo_ant.py | 51 +++----- .../examples/omniisaacgym/ppo_ant_mt.py | 51 +++----- .../examples/omniisaacgym/ppo_anymal.py | 115 +++++++++++++++++ .../omniisaacgym/ppo_anymal_terrain.py | 120 ++++++++++++++++++ .../examples/omniisaacgym/ppo_ball_balance.py | 115 +++++++++++++++++ .../examples/omniisaacgym/ppo_cartpole.py | 49 +++---- .../examples/omniisaacgym/ppo_cartpole_mt.py | 49 +++---- .../examples/omniisaacgym/ppo_crazy_flie.py | 115 +++++++++++++++++ .../omniisaacgym/ppo_franka_cabinet.py | 115 +++++++++++++++++ .../examples/omniisaacgym/ppo_humanoid.py | 51 +++----- .../examples/omniisaacgym/ppo_ingenuity.py | 115 +++++++++++++++++ .../examples/omniisaacgym/ppo_quadcopter.py | 115 +++++++++++++++++ .../examples/omniisaacgym/ppo_shadow_hand.py | 67 ++++------ 14 files changed, 971 insertions(+), 222 deletions(-) create mode 100644 docs/source/examples/omniisaacgym/ppo_anymal.py create mode 100644 docs/source/examples/omniisaacgym/ppo_anymal_terrain.py create mode 100644 docs/source/examples/omniisaacgym/ppo_ball_balance.py create mode 100644 docs/source/examples/omniisaacgym/ppo_crazy_flie.py create mode 100644 docs/source/examples/omniisaacgym/ppo_franka_cabinet.py create mode 100644 docs/source/examples/omniisaacgym/ppo_ingenuity.py create mode 100644 docs/source/examples/omniisaacgym/ppo_quadcopter.py diff --git a/docs/source/examples/omniisaacgym/ppo_allegro_hand.py b/docs/source/examples/omniisaacgym/ppo_allegro_hand.py index 8b3f589f..b83b0933 100644 --- a/docs/source/examples/omniisaacgym/ppo_allegro_hand.py +++ b/docs/source/examples/omniisaacgym/ppo_allegro_hand.py @@ -17,42 +17,37 @@ set_seed(42) -# Define the models (stochastic and deterministic models) for the agent using mixins. 
-# - Policy: takes as input the environment's observation/state and returns an action -# - Value: takes the state as input and provides a value to guide the policy -class Policy(GaussianMixin, Model): +# Define the shared model (stochastic and deterministic models) for the agent using mixins. +class Shared(GaussianMixin, DeterministicMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False, - clip_log_std=True, min_log_std=-20, max_log_std=2): + clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"): Model.__init__(self, observation_space, action_space, device) - GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction) + DeterministicMixin.__init__(self, clip_actions) self.net = nn.Sequential(nn.Linear(self.num_observations, 512), nn.ELU(), nn.Linear(512, 256), nn.ELU(), nn.Linear(256, 128), - nn.ELU(), - nn.Linear(128, self.num_actions)) + nn.ELU()) + + self.mean_layer = nn.Linear(128, self.num_actions) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + self.value_layer = nn.Linear(128, 1) - def compute(self, states, taken_actions, role): - return self.net(states), self.log_std_parameter - -class Value(DeterministicMixin, Model): - def __init__(self, observation_space, action_space, device, clip_actions=False): - Model.__init__(self, observation_space, action_space, device) - DeterministicMixin.__init__(self, clip_actions) - - self.net = nn.Sequential(nn.Linear(self.num_observations, 512), - nn.ELU(), - nn.Linear(512, 256), - nn.ELU(), - nn.Linear(256, 128), - nn.ELU(), - nn.Linear(128, 1)) + def act(self, states, taken_actions, role): + if role == "policy": + return GaussianMixin.act(self, states, taken_actions, role) + elif role == "value": + return DeterministicMixin.act(self, states, taken_actions, role) def compute(self, states, taken_actions, role): - return self.net(states) + if role == "policy": + return self.mean_layer(self.net(states)), self.log_std_parameter + elif role == "value": + return self.value_layer(self.net(states)) # Load and wrap the Omniverse Isaac Gym environment @@ -63,28 +58,24 @@ def compute(self, states, taken_actions, role): # Instantiate a RandomMemory as rollout buffer (any memory can be used for this) -memory = RandomMemory(memory_size=8, num_envs=env.num_envs, device=device) +memory = RandomMemory(memory_size=16, num_envs=env.num_envs, device=device) # Instantiate the agent's models (function approximators). # PPO requires 2 models, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models models_ppo = {} -models_ppo["policy"] = Policy(env.observation_space, env.action_space, device) -models_ppo["value"] = Value(env.observation_space, env.action_space, device) - -# Initialize the models' parameters (weights and biases) using a Gaussian distribution -for model in models_ppo.values(): - model.init_parameters(method_name="normal_", mean=0.0, std=0.1) +models_ppo["policy"] = Shared(env.observation_space, env.action_space, device) +models_ppo["value"] = models_ppo["policy"] # same instance: shared model # Configure and instantiate the agent. 
# Only modify some of the default configuration, visit its documentation to see all the options # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters cfg_ppo = PPO_DEFAULT_CONFIG.copy() -cfg_ppo["rollouts"] = 8 # memory_size +cfg_ppo["rollouts"] = 16 # memory_size cfg_ppo["learning_epochs"] = 5 -cfg_ppo["mini_batches"] = 4 # 8 * 16384 / 32768 +cfg_ppo["mini_batches"] = 4 # 16 * 8192 / 32768 cfg_ppo["discount_factor"] = 0.99 cfg_ppo["lambda"] = 0.95 cfg_ppo["learning_rate"] = 5e-3 @@ -104,9 +95,9 @@ def compute(self, states, taken_actions, role): cfg_ppo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device} cfg_ppo["value_preprocessor"] = RunningStandardScaler cfg_ppo["value_preprocessor_kwargs"] = {"size": 1, "device": device} -# logging to TensorBoard and write checkpoints each 200 and 2000 timesteps respectively -cfg_ppo["experiment"]["write_interval"] = 200 -cfg_ppo["experiment"]["checkpoint_interval"] = 2000 +# logging to TensorBoard and write checkpoints each 800 and 8000 timesteps respectively +cfg_ppo["experiment"]["write_interval"] = 800 +cfg_ppo["experiment"]["checkpoint_interval"] = 8000 agent = PPO(models=models_ppo, memory=memory, @@ -117,7 +108,7 @@ def compute(self, states, taken_actions, role): # Configure and instantiate the RL trainer -cfg_trainer = {"timesteps": 40000, "headless": True} +cfg_trainer = {"timesteps": 160000, "headless": True} trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent) # start training diff --git a/docs/source/examples/omniisaacgym/ppo_ant.py b/docs/source/examples/omniisaacgym/ppo_ant.py index e6482534..4282b76d 100644 --- a/docs/source/examples/omniisaacgym/ppo_ant.py +++ b/docs/source/examples/omniisaacgym/ppo_ant.py @@ -17,42 +17,37 @@ set_seed(42) -# Define the models (stochastic and deterministic models) for the agent using mixins. -# - Policy: takes as input the environment's observation/state and returns an action -# - Value: takes the state as input and provides a value to guide the policy -class Policy(GaussianMixin, Model): +# Define the shared model (stochastic and deterministic models) for the agent using mixins. 
+class Shared(GaussianMixin, DeterministicMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False, - clip_log_std=True, min_log_std=-20, max_log_std=2): + clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"): Model.__init__(self, observation_space, action_space, device) - GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction) + DeterministicMixin.__init__(self, clip_actions) self.net = nn.Sequential(nn.Linear(self.num_observations, 256), nn.ELU(), nn.Linear(256, 128), nn.ELU(), nn.Linear(128, 64), - nn.ELU(), - nn.Linear(64, self.num_actions)) + nn.ELU()) + + self.mean_layer = nn.Linear(64, self.num_actions) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + self.value_layer = nn.Linear(64, 1) - def compute(self, states, taken_actions, role): - return self.net(states), self.log_std_parameter - -class Value(DeterministicMixin, Model): - def __init__(self, observation_space, action_space, device, clip_actions=False): - Model.__init__(self, observation_space, action_space, device) - DeterministicMixin.__init__(self, clip_actions) - - self.net = nn.Sequential(nn.Linear(self.num_observations, 256), - nn.ELU(), - nn.Linear(256, 128), - nn.ELU(), - nn.Linear(128, 64), - nn.ELU(), - nn.Linear(64, 1)) + def act(self, states, taken_actions, role): + if role == "policy": + return GaussianMixin.act(self, states, taken_actions, role) + elif role == "value": + return DeterministicMixin.act(self, states, taken_actions, role) def compute(self, states, taken_actions, role): - return self.net(states) + if role == "policy": + return self.mean_layer(self.net(states)), self.log_std_parameter + elif role == "value": + return self.value_layer(self.net(states)) # Load and wrap the Omniverse Isaac Gym environment @@ -70,12 +65,8 @@ def compute(self, states, taken_actions, role): # PPO requires 2 models, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models models_ppo = {} -models_ppo["policy"] = Policy(env.observation_space, env.action_space, device) -models_ppo["value"] = Value(env.observation_space, env.action_space, device) - -# Initialize the models' parameters (weights and biases) using a Gaussian distribution -for model in models_ppo.values(): - model.init_parameters(method_name="normal_", mean=0.0, std=0.1) +models_ppo["policy"] = Shared(env.observation_space, env.action_space, device) +models_ppo["value"] = models_ppo["policy"] # same instance: shared model # Configure and instantiate the agent. diff --git a/docs/source/examples/omniisaacgym/ppo_ant_mt.py b/docs/source/examples/omniisaacgym/ppo_ant_mt.py index 6072cfc0..d3bdbc50 100644 --- a/docs/source/examples/omniisaacgym/ppo_ant_mt.py +++ b/docs/source/examples/omniisaacgym/ppo_ant_mt.py @@ -19,42 +19,37 @@ set_seed(42) -# Define the models (stochastic and deterministic models) for the agent using mixins. -# - Policy: takes as input the environment's observation/state and returns an action -# - Value: takes the state as input and provides a value to guide the policy -class Policy(GaussianMixin, Model): +# Define the shared model (stochastic and deterministic models) for the agent using mixins. 
+class Shared(GaussianMixin, DeterministicMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False, - clip_log_std=True, min_log_std=-20, max_log_std=2): + clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"): Model.__init__(self, observation_space, action_space, device) - GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction) + DeterministicMixin.__init__(self, clip_actions) self.net = nn.Sequential(nn.Linear(self.num_observations, 256), nn.ELU(), nn.Linear(256, 128), nn.ELU(), nn.Linear(128, 64), - nn.ELU(), - nn.Linear(64, self.num_actions)) + nn.ELU()) + + self.mean_layer = nn.Linear(64, self.num_actions) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + self.value_layer = nn.Linear(64, 1) - def compute(self, states, taken_actions, role): - return self.net(states), self.log_std_parameter - -class Value(DeterministicMixin, Model): - def __init__(self, observation_space, action_space, device, clip_actions=False): - Model.__init__(self, observation_space, action_space, device) - DeterministicMixin.__init__(self, clip_actions) - - self.net = nn.Sequential(nn.Linear(self.num_observations, 256), - nn.ELU(), - nn.Linear(256, 128), - nn.ELU(), - nn.Linear(128, 64), - nn.ELU(), - nn.Linear(64, 1)) + def act(self, states, taken_actions, role): + if role == "policy": + return GaussianMixin.act(self, states, taken_actions, role) + elif role == "value": + return DeterministicMixin.act(self, states, taken_actions, role) def compute(self, states, taken_actions, role): - return self.net(states) + if role == "policy": + return self.mean_layer(self.net(states)), self.log_std_parameter + elif role == "value": + return self.value_layer(self.net(states)) # Load and wrap the Omniverse Isaac Gym environment @@ -72,12 +67,8 @@ def compute(self, states, taken_actions, role): # PPO requires 2 models, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models models_ppo = {} -models_ppo["policy"] = Policy(env.observation_space, env.action_space, device) -models_ppo["value"] = Value(env.observation_space, env.action_space, device) - -# Initialize the models' parameters (weights and biases) using a Gaussian distribution -for model in models_ppo.values(): - model.init_parameters(method_name="normal_", mean=0.0, std=0.1) +models_ppo["policy"] = Shared(env.observation_space, env.action_space, device) +models_ppo["value"] = models_ppo["policy"] # same instance: shared model # Configure and instantiate the agent. 
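The shared policy/value model that replaces the separate Policy and Value classes in these examples hinges on the role argument: the same instance is registered under both the "policy" and "value" keys, and each call is dispatched to GaussianMixin or DeterministicMixin accordingly. A minimal usage sketch, assuming env, device and models_ppo are set up exactly as in the scripts above (the dummy observation batch and the variable names are only for illustration; PPO performs these calls itself during rollouts and updates):

import torch

model = models_ppo["policy"]  # same object as models_ppo["value"]
dummy_states = torch.rand((env.num_envs, *env.observation_space.shape), device=device)

policy_output = model.act(dummy_states, None, role="policy")  # dispatched to GaussianMixin.act: sampled actions, log-probabilities, ...
value_output = model.act(dummy_states, None, role="value")    # dispatched to DeterministicMixin.act: state-value estimates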
diff --git a/docs/source/examples/omniisaacgym/ppo_anymal.py b/docs/source/examples/omniisaacgym/ppo_anymal.py new file mode 100644 index 00000000..72505e7e --- /dev/null +++ b/docs/source/examples/omniisaacgym/ppo_anymal.py @@ -0,0 +1,115 @@ +import torch +import torch.nn as nn + +# Import the skrl components to build the RL system +from skrl.models.torch import Model, GaussianMixin, DeterministicMixin +from skrl.memories.torch import RandomMemory +from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG +from skrl.resources.schedulers.torch import KLAdaptiveRL +from skrl.resources.preprocessors.torch import RunningStandardScaler +from skrl.trainers.torch import SequentialTrainer +from skrl.envs.torch import wrap_env +from skrl.envs.torch import load_omniverse_isaacgym_env +from skrl.utils import set_seed + + +# set the seed for reproducibility +set_seed(42) + + +# Define the shared model (stochastic and deterministic models) for the agent using mixins. +class Shared(GaussianMixin, DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"): + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction) + DeterministicMixin.__init__(self, clip_actions) + + self.net = nn.Sequential(nn.Linear(self.num_observations, 256), + nn.ELU(), + nn.Linear(256, 128), + nn.ELU(), + nn.Linear(128, 64), + nn.ELU()) + + self.mean_layer = nn.Linear(64, self.num_actions) + self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + self.value_layer = nn.Linear(64, 1) + + def act(self, states, taken_actions, role): + if role == "policy": + return GaussianMixin.act(self, states, taken_actions, role) + elif role == "value": + return DeterministicMixin.act(self, states, taken_actions, role) + + def compute(self, states, taken_actions, role): + if role == "policy": + return self.mean_layer(self.net(states)), self.log_std_parameter + elif role == "value": + return self.value_layer(self.net(states)) + + +# Load and wrap the Omniverse Isaac Gym environment +env = load_omniverse_isaacgym_env(task_name="Anymal") +env = wrap_env(env) + +device = env.device + + +# Instantiate a RandomMemory as rollout buffer (any memory can be used for this) +memory = RandomMemory(memory_size=24, num_envs=env.num_envs, device=device) + + +# Instantiate the agent's models (function approximators). +# PPO requires 2 models, visit its documentation for more details +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models +models_ppo = {} +models_ppo["policy"] = Shared(env.observation_space, env.action_space, device) +models_ppo["value"] = models_ppo["policy"] # same instance: shared model + + +# Configure and instantiate the agent. 
+# Only modify some of the default configuration, visit its documentation to see all the options +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters +cfg_ppo = PPO_DEFAULT_CONFIG.copy() +cfg_ppo["rollouts"] = 24 # memory_size +cfg_ppo["learning_epochs"] = 5 +cfg_ppo["mini_batches"] = 3 # 24 * 4096 / 32768 +cfg_ppo["discount_factor"] = 0.99 +cfg_ppo["lambda"] = 0.95 +cfg_ppo["learning_rate"] = 3e-4 +cfg_ppo["learning_rate_scheduler"] = KLAdaptiveRL +cfg_ppo["learning_rate_scheduler_kwargs"] = {"kl_threshold": 0.008} +cfg_ppo["random_timesteps"] = 0 +cfg_ppo["learning_starts"] = 0 +cfg_ppo["grad_norm_clip"] = 1.0 +cfg_ppo["ratio_clip"] = 0.2 +cfg_ppo["value_clip"] = 0.2 +cfg_ppo["clip_predicted_values"] = True +cfg_ppo["entropy_loss_scale"] = 0.0 +cfg_ppo["value_loss_scale"] = 1.0 +cfg_ppo["kl_threshold"] = 0 +cfg_ppo["rewards_shaper"] = None +cfg_ppo["state_preprocessor"] = RunningStandardScaler +cfg_ppo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device} +cfg_ppo["value_preprocessor"] = RunningStandardScaler +cfg_ppo["value_preprocessor_kwargs"] = {"size": 1, "device": device} +# logging to TensorBoard and write checkpoints each 120 and 1200 timesteps respectively +cfg_ppo["experiment"]["write_interval"] = 120 +cfg_ppo["experiment"]["checkpoint_interval"] = 1200 + +agent = PPO(models=models_ppo, + memory=memory, + cfg=cfg_ppo, + observation_space=env.observation_space, + action_space=env.action_space, + device=device) + + +# Configure and instantiate the RL trainer +cfg_trainer = {"timesteps": 24000, "headless": True} +trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent) + +# start training +trainer.train() diff --git a/docs/source/examples/omniisaacgym/ppo_anymal_terrain.py b/docs/source/examples/omniisaacgym/ppo_anymal_terrain.py new file mode 100644 index 00000000..5b6aaf80 --- /dev/null +++ b/docs/source/examples/omniisaacgym/ppo_anymal_terrain.py @@ -0,0 +1,120 @@ +import torch +import torch.nn as nn + +# Import the skrl components to build the RL system +from skrl.models.torch import Model, GaussianMixin, DeterministicMixin +from skrl.memories.torch import RandomMemory +from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG +from skrl.resources.schedulers.torch import KLAdaptiveRL +from skrl.resources.preprocessors.torch import RunningStandardScaler +from skrl.trainers.torch import SequentialTrainer +from skrl.envs.torch import wrap_env +from skrl.envs.torch import load_omniverse_isaacgym_env +from skrl.utils import set_seed + + +# set the seed for reproducibility +set_seed(42) + + +# Define the models (stochastic and deterministic models) for the agent using mixins. 
+# - Policy: takes as input the environment's observation/state and returns an action +# - Value: takes the state as input and provides a value to guide the policy +class Policy(GaussianMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"): + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction) + + self.net = nn.Sequential(nn.Linear(self.num_observations, 512), + nn.ELU(), + nn.Linear(512, 256), + nn.ELU(), + nn.Linear(256, 128), + nn.ELU(), + nn.Linear(128, self.num_actions)) + self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + def compute(self, states, taken_actions, role): + return self.net(states), self.log_std_parameter + +class Value(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) + + self.net = nn.Sequential(nn.Linear(self.num_observations, 512), + nn.ELU(), + nn.Linear(512, 256), + nn.ELU(), + nn.Linear(256, 128), + nn.ELU(), + nn.Linear(128, 1)) + + def compute(self, states, taken_actions, role): + return self.net(states) + + +# Load and wrap the Omniverse Isaac Gym environment +env = load_omniverse_isaacgym_env(task_name="AnymalTerrain") +env = wrap_env(env) + +device = env.device + + +# Instantiate a RandomMemory as rollout buffer (any memory can be used for this) +memory = RandomMemory(memory_size=48, num_envs=env.num_envs, device=device) + + +# Instantiate the agent's models (function approximators). +# PPO requires 2 models, visit its documentation for more details +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models +models_ppo = {} +models_ppo["policy"] = Policy(env.observation_space, env.action_space, device) +models_ppo["value"] = Value(env.observation_space, env.action_space, device) + + +# Configure and instantiate the agent. 
+# Only modify some of the default configuration, visit its documentation to see all the options +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters +cfg_ppo = PPO_DEFAULT_CONFIG.copy() +cfg_ppo["rollouts"] = 48 # memory_size +cfg_ppo["learning_epochs"] = 5 +cfg_ppo["mini_batches"] = 6 # 48 * 2048 / 16384 +cfg_ppo["discount_factor"] = 0.99 +cfg_ppo["lambda"] = 0.95 +cfg_ppo["learning_rate"] = 3e-4 +cfg_ppo["learning_rate_scheduler"] = KLAdaptiveRL +cfg_ppo["learning_rate_scheduler_kwargs"] = {"kl_threshold": 0.008} +cfg_ppo["random_timesteps"] = 0 +cfg_ppo["learning_starts"] = 0 +cfg_ppo["grad_norm_clip"] = 1.0 +cfg_ppo["ratio_clip"] = 0.2 +cfg_ppo["value_clip"] = 0.2 +cfg_ppo["clip_predicted_values"] = True +cfg_ppo["entropy_loss_scale"] = 0.001 +cfg_ppo["value_loss_scale"] = 1.0 +cfg_ppo["kl_threshold"] = 0 +cfg_ppo["rewards_shaper"] = None +cfg_ppo["state_preprocessor"] = RunningStandardScaler +cfg_ppo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device} +cfg_ppo["value_preprocessor"] = RunningStandardScaler +cfg_ppo["value_preprocessor_kwargs"] = {"size": 1, "device": device} +# logging to TensorBoard and write checkpoints each 480 and 4800 timesteps respectively +cfg_ppo["experiment"]["write_interval"] = 480 +cfg_ppo["experiment"]["checkpoint_interval"] = 4800 + +agent = PPO(models=models_ppo, + memory=memory, + cfg=cfg_ppo, + observation_space=env.observation_space, + action_space=env.action_space, + device=device) + + +# Configure and instantiate the RL trainer +cfg_trainer = {"timesteps": 96000, "headless": True} +trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent) + +# start training +trainer.train() diff --git a/docs/source/examples/omniisaacgym/ppo_ball_balance.py b/docs/source/examples/omniisaacgym/ppo_ball_balance.py new file mode 100644 index 00000000..f63dbe3a --- /dev/null +++ b/docs/source/examples/omniisaacgym/ppo_ball_balance.py @@ -0,0 +1,115 @@ +import torch +import torch.nn as nn + +# Import the skrl components to build the RL system +from skrl.models.torch import Model, GaussianMixin, DeterministicMixin +from skrl.memories.torch import RandomMemory +from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG +from skrl.resources.schedulers.torch import KLAdaptiveRL +from skrl.resources.preprocessors.torch import RunningStandardScaler +from skrl.trainers.torch import SequentialTrainer +from skrl.envs.torch import wrap_env +from skrl.envs.torch import load_omniverse_isaacgym_env +from skrl.utils import set_seed + + +# set the seed for reproducibility +set_seed(42) + + +# Define the shared model (stochastic and deterministic models) for the agent using mixins. 
+class Shared(GaussianMixin, DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"): + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction) + DeterministicMixin.__init__(self, clip_actions) + + self.net = nn.Sequential(nn.Linear(self.num_observations, 128), + nn.ELU(), + nn.Linear(128, 64), + nn.ELU(), + nn.Linear(64, 32), + nn.ELU()) + + self.mean_layer = nn.Linear(32, self.num_actions) + self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + self.value_layer = nn.Linear(32, 1) + + def act(self, states, taken_actions, role): + if role == "policy": + return GaussianMixin.act(self, states, taken_actions, role) + elif role == "value": + return DeterministicMixin.act(self, states, taken_actions, role) + + def compute(self, states, taken_actions, role): + if role == "policy": + return self.mean_layer(self.net(states)), self.log_std_parameter + elif role == "value": + return self.value_layer(self.net(states)) + + +# Load and wrap the Omniverse Isaac Gym environment +env = load_omniverse_isaacgym_env(task_name="BallBalance") +env = wrap_env(env) + +device = env.device + + +# Instantiate a RandomMemory as rollout buffer (any memory can be used for this) +memory = RandomMemory(memory_size=16, num_envs=env.num_envs, device=device) + + +# Instantiate the agent's models (function approximators). +# PPO requires 2 models, visit its documentation for more details +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models +models_ppo = {} +models_ppo["policy"] = Shared(env.observation_space, env.action_space, device) +models_ppo["value"] = models_ppo["policy"] # same instance: shared model + + +# Configure and instantiate the agent. 
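# A short note on the preprocessor entries in the configuration below: "state_preprocessor" and
# "value_preprocessor" take a class, and the agent instantiates it with the matching *_kwargs.
# Rough standalone equivalent of that construction (illustrative only; PPO builds and updates
# these scalers internally):
state_scaler = RunningStandardScaler(size=env.observation_space, device=device)  # running mean/std over observations
value_scaler = RunningStandardScaler(size=1, device=device)                      # standardizes the 1-dimensional value estimates/returns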
+# Only modify some of the default configuration, visit its documentation to see all the options +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters +cfg_ppo = PPO_DEFAULT_CONFIG.copy() +cfg_ppo["rollouts"] = 16 # memory_size +cfg_ppo["learning_epochs"] = 8 +cfg_ppo["mini_batches"] = 8 # 16 * 4096 / 8192 +cfg_ppo["discount_factor"] = 0.99 +cfg_ppo["lambda"] = 0.95 +cfg_ppo["learning_rate"] = 3e-4 +cfg_ppo["learning_rate_scheduler"] = KLAdaptiveRL +cfg_ppo["learning_rate_scheduler_kwargs"] = {"kl_threshold": 0.008} +cfg_ppo["random_timesteps"] = 0 +cfg_ppo["learning_starts"] = 0 +cfg_ppo["grad_norm_clip"] = 1.0 +cfg_ppo["ratio_clip"] = 0.2 +cfg_ppo["value_clip"] = 0.2 +cfg_ppo["clip_predicted_values"] = True +cfg_ppo["entropy_loss_scale"] = 0.0 +cfg_ppo["value_loss_scale"] = 2.0 +cfg_ppo["kl_threshold"] = 0 +cfg_ppo["rewards_shaper"] = lambda rewards, timestep, timesteps: rewards * 0.1 +cfg_ppo["state_preprocessor"] = RunningStandardScaler +cfg_ppo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device} +cfg_ppo["value_preprocessor"] = RunningStandardScaler +cfg_ppo["value_preprocessor_kwargs"] = {"size": 1, "device": device} +# logging to TensorBoard and write checkpoints each 20 and 200 timesteps respectively +cfg_ppo["experiment"]["write_interval"] = 20 +cfg_ppo["experiment"]["checkpoint_interval"] = 200 + +agent = PPO(models=models_ppo, + memory=memory, + cfg=cfg_ppo, + observation_space=env.observation_space, + action_space=env.action_space, + device=device) + + +# Configure and instantiate the RL trainer +cfg_trainer = {"timesteps": 4000, "headless": True} +trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent) + +# start training +trainer.train() diff --git a/docs/source/examples/omniisaacgym/ppo_cartpole.py b/docs/source/examples/omniisaacgym/ppo_cartpole.py index 248ffd0e..f091fdf9 100644 --- a/docs/source/examples/omniisaacgym/ppo_cartpole.py +++ b/docs/source/examples/omniisaacgym/ppo_cartpole.py @@ -17,38 +17,35 @@ set_seed(42) -# Define the models (stochastic and deterministic models) for the agent using mixins. -# - Policy: takes as input the environment's observation/state and returns an action -# - Value: takes the state as input and provides a value to guide the policy -class Policy(GaussianMixin, Model): +# Define the shared model (stochastic and deterministic models) for the agent using mixins. 
+class Shared(GaussianMixin, DeterministicMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False, - clip_log_std=True, min_log_std=-20, max_log_std=2): + clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"): Model.__init__(self, observation_space, action_space, device) - GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction) + DeterministicMixin.__init__(self, clip_actions) self.net = nn.Sequential(nn.Linear(self.num_observations, 32), nn.ELU(), nn.Linear(32, 32), - nn.ELU(), - nn.Linear(32, self.num_actions)) + nn.ELU()) + + self.mean_layer = nn.Linear(32, self.num_actions) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + self.value_layer = nn.Linear(32, 1) - def compute(self, states, taken_actions, role): - return self.net(states), self.log_std_parameter - -class Value(DeterministicMixin, Model): - def __init__(self, observation_space, action_space, device, clip_actions=False): - Model.__init__(self, observation_space, action_space, device) - DeterministicMixin.__init__(self, clip_actions) - - self.net = nn.Sequential(nn.Linear(self.num_observations, 32), - nn.ELU(), - nn.Linear(32, 32), - nn.ELU(), - nn.Linear(32, 1)) + def act(self, states, taken_actions, role): + if role == "policy": + return GaussianMixin.act(self, states, taken_actions, role) + elif role == "value": + return DeterministicMixin.act(self, states, taken_actions, role) def compute(self, states, taken_actions, role): - return self.net(states) + if role == "policy": + return self.mean_layer(self.net(states)), self.log_std_parameter + elif role == "value": + return self.value_layer(self.net(states)) # Load and wrap the Omniverse Isaac Gym environment @@ -66,12 +63,8 @@ def compute(self, states, taken_actions, role): # PPO requires 2 models, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models models_ppo = {} -models_ppo["policy"] = Policy(env.observation_space, env.action_space, device) -models_ppo["value"] = Value(env.observation_space, env.action_space, device) - -# Initialize the models' parameters (weights and biases) using a Gaussian distribution -for model in models_ppo.values(): - model.init_parameters(method_name="normal_", mean=0.0, std=0.1) +models_ppo["policy"] = Shared(env.observation_space, env.action_space, device) +models_ppo["value"] = models_ppo["policy"] # same instance: shared model # Configure and instantiate the agent. diff --git a/docs/source/examples/omniisaacgym/ppo_cartpole_mt.py b/docs/source/examples/omniisaacgym/ppo_cartpole_mt.py index 8cbb23cd..9838c6ca 100644 --- a/docs/source/examples/omniisaacgym/ppo_cartpole_mt.py +++ b/docs/source/examples/omniisaacgym/ppo_cartpole_mt.py @@ -19,38 +19,35 @@ set_seed(42) -# Define the models (stochastic and deterministic models) for the agent using mixins. -# - Policy: takes as input the environment's observation/state and returns an action -# - Value: takes the state as input and provides a value to guide the policy -class Policy(GaussianMixin, Model): +# Define the shared model (stochastic and deterministic models) for the agent using mixins. 
+class Shared(GaussianMixin, DeterministicMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False, - clip_log_std=True, min_log_std=-20, max_log_std=2): + clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"): Model.__init__(self, observation_space, action_space, device) - GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction) + DeterministicMixin.__init__(self, clip_actions) self.net = nn.Sequential(nn.Linear(self.num_observations, 32), nn.ELU(), nn.Linear(32, 32), - nn.ELU(), - nn.Linear(32, self.num_actions)) + nn.ELU()) + + self.mean_layer = nn.Linear(32, self.num_actions) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + self.value_layer = nn.Linear(32, 1) - def compute(self, states, taken_actions, role): - return self.net(states), self.log_std_parameter - -class Value(DeterministicMixin, Model): - def __init__(self, observation_space, action_space, device, clip_actions=False): - Model.__init__(self, observation_space, action_space, device) - DeterministicMixin.__init__(self, clip_actions) - - self.net = nn.Sequential(nn.Linear(self.num_observations, 32), - nn.ELU(), - nn.Linear(32, 32), - nn.ELU(), - nn.Linear(32, 1)) + def act(self, states, taken_actions, role): + if role == "policy": + return GaussianMixin.act(self, states, taken_actions, role) + elif role == "value": + return DeterministicMixin.act(self, states, taken_actions, role) def compute(self, states, taken_actions, role): - return self.net(states) + if role == "policy": + return self.mean_layer(self.net(states)), self.log_std_parameter + elif role == "value": + return self.value_layer(self.net(states)) # Load and wrap the multi-threaded Omniverse Isaac Gym environment @@ -68,12 +65,8 @@ def compute(self, states, taken_actions, role): # PPO requires 2 models, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models models_ppo = {} -models_ppo["policy"] = Policy(env.observation_space, env.action_space, device) -models_ppo["value"] = Value(env.observation_space, env.action_space, device) - -# Initialize the models' parameters (weights and biases) using a Gaussian distribution -for model in models_ppo.values(): - model.init_parameters(method_name="normal_", mean=0.0, std=0.1) +models_ppo["policy"] = Shared(env.observation_space, env.action_space, device) +models_ppo["value"] = models_ppo["policy"] # same instance: shared model # Configure and instantiate the agent. 
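A small bookkeeping note on the mini_batches values used throughout these scripts: as inline comments such as "4 # 16 * 8192 / 32768" indicate, the number of mini-batches is chosen so that rollouts × parallel environments divides evenly by the desired mini-batch size. A throwaway helper (the function name is made up for illustration) makes the arithmetic explicit:

def mini_batches_for(rollouts: int, num_envs: int, mini_batch_size: int) -> int:
    """Number of PPO mini-batches per epoch for a given rollout / environment / batch-size combination."""
    transitions = rollouts * num_envs           # samples collected before each update
    assert transitions % mini_batch_size == 0   # keep every mini-batch the same size
    return transitions // mini_batch_size

print(mini_batches_for(16, 8192, 32768))  # 4, matching the Ant and AllegroHand configs
print(mini_batches_for(24, 4096, 32768))  # 3, matching the Anymal config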
diff --git a/docs/source/examples/omniisaacgym/ppo_crazy_flie.py b/docs/source/examples/omniisaacgym/ppo_crazy_flie.py new file mode 100644 index 00000000..b8c16fd6 --- /dev/null +++ b/docs/source/examples/omniisaacgym/ppo_crazy_flie.py @@ -0,0 +1,115 @@ +import torch +import torch.nn as nn + +# Import the skrl components to build the RL system +from skrl.models.torch import Model, GaussianMixin, DeterministicMixin +from skrl.memories.torch import RandomMemory +from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG +from skrl.resources.schedulers.torch import KLAdaptiveRL +from skrl.resources.preprocessors.torch import RunningStandardScaler +from skrl.trainers.torch import SequentialTrainer +from skrl.envs.torch import wrap_env +from skrl.envs.torch import load_omniverse_isaacgym_env +from skrl.utils import set_seed + + +# set the seed for reproducibility +set_seed(42) + + +# Define the shared model (stochastic and deterministic models) for the agent using mixins. +class Shared(GaussianMixin, DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"): + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction) + DeterministicMixin.__init__(self, clip_actions) + + self.net = nn.Sequential(nn.Linear(self.num_observations, 256), + nn.Tanh(), + nn.Linear(256, 256), + nn.Tanh(), + nn.Linear(256, 128), + nn.Tanh()) + + self.mean_layer = nn.Linear(128, self.num_actions) + self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + self.value_layer = nn.Linear(128, 1) + + def act(self, states, taken_actions, role): + if role == "policy": + return GaussianMixin.act(self, states, taken_actions, role) + elif role == "value": + return DeterministicMixin.act(self, states, taken_actions, role) + + def compute(self, states, taken_actions, role): + if role == "policy": + return self.mean_layer(self.net(states)), self.log_std_parameter + elif role == "value": + return self.value_layer(self.net(states)) + + +# Load and wrap the Omniverse Isaac Gym environment +env = load_omniverse_isaacgym_env(task_name="Crazyflie") +env = wrap_env(env) + +device = env.device + + +# Instantiate a RandomMemory as rollout buffer (any memory can be used for this) +memory = RandomMemory(memory_size=16, num_envs=env.num_envs, device=device) + + +# Instantiate the agent's models (function approximators). +# PPO requires 2 models, visit its documentation for more details +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models +models_ppo = {} +models_ppo["policy"] = Shared(env.observation_space, env.action_space, device) +models_ppo["value"] = models_ppo["policy"] # same instance: shared model + + +# Configure and instantiate the agent. 
+# Only modify some of the default configuration, visit its documentation to see all the options +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters +cfg_ppo = PPO_DEFAULT_CONFIG.copy() +cfg_ppo["rollouts"] = 16 # memory_size +cfg_ppo["learning_epochs"] = 8 +cfg_ppo["mini_batches"] = 4 # 16 * 4096 / 16384 +cfg_ppo["discount_factor"] = 0.99 +cfg_ppo["lambda"] = 0.95 +cfg_ppo["learning_rate"] = 1e-4 +cfg_ppo["learning_rate_scheduler"] = KLAdaptiveRL +cfg_ppo["learning_rate_scheduler_kwargs"] = {"kl_threshold": 0.016} +cfg_ppo["random_timesteps"] = 0 +cfg_ppo["learning_starts"] = 0 +cfg_ppo["grad_norm_clip"] = 1.0 +cfg_ppo["ratio_clip"] = 0.2 +cfg_ppo["value_clip"] = 0.2 +cfg_ppo["clip_predicted_values"] = True +cfg_ppo["entropy_loss_scale"] = 0.0 +cfg_ppo["value_loss_scale"] = 1.0 +cfg_ppo["kl_threshold"] = 0 +cfg_ppo["rewards_shaper"] = lambda rewards, timestep, timesteps: rewards * 0.01 +cfg_ppo["state_preprocessor"] = RunningStandardScaler +cfg_ppo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device} +cfg_ppo["value_preprocessor"] = RunningStandardScaler +cfg_ppo["value_preprocessor_kwargs"] = {"size": 1, "device": device} +# logging to TensorBoard and write checkpoints each 80 and 800 timesteps respectively +cfg_ppo["experiment"]["write_interval"] = 80 +cfg_ppo["experiment"]["checkpoint_interval"] = 800 + +agent = PPO(models=models_ppo, + memory=memory, + cfg=cfg_ppo, + observation_space=env.observation_space, + action_space=env.action_space, + device=device) + + +# Configure and instantiate the RL trainer +cfg_trainer = {"timesteps": 16000, "headless": True} +trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent) + +# start training +trainer.train() diff --git a/docs/source/examples/omniisaacgym/ppo_franka_cabinet.py b/docs/source/examples/omniisaacgym/ppo_franka_cabinet.py new file mode 100644 index 00000000..310c2e2e --- /dev/null +++ b/docs/source/examples/omniisaacgym/ppo_franka_cabinet.py @@ -0,0 +1,115 @@ +import torch +import torch.nn as nn + +# Import the skrl components to build the RL system +from skrl.models.torch import Model, GaussianMixin, DeterministicMixin +from skrl.memories.torch import RandomMemory +from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG +from skrl.resources.schedulers.torch import KLAdaptiveRL +from skrl.resources.preprocessors.torch import RunningStandardScaler +from skrl.trainers.torch import SequentialTrainer +from skrl.envs.torch import wrap_env +from skrl.envs.torch import load_omniverse_isaacgym_env +from skrl.utils import set_seed + + +# set the seed for reproducibility +set_seed(42) + + +# Define the shared model (stochastic and deterministic models) for the agent using mixins. 
+class Shared(GaussianMixin, DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"): + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction) + DeterministicMixin.__init__(self, clip_actions) + + self.net = nn.Sequential(nn.Linear(self.num_observations, 256), + nn.ELU(), + nn.Linear(256, 128), + nn.ELU(), + nn.Linear(128, 64), + nn.ELU()) + + self.mean_layer = nn.Linear(64, self.num_actions) + self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + self.value_layer = nn.Linear(64, 1) + + def act(self, states, taken_actions, role): + if role == "policy": + return GaussianMixin.act(self, states, taken_actions, role) + elif role == "value": + return DeterministicMixin.act(self, states, taken_actions, role) + + def compute(self, states, taken_actions, role): + if role == "policy": + return self.mean_layer(self.net(states)), self.log_std_parameter + elif role == "value": + return self.value_layer(self.net(states)) + + +# Load and wrap the Omniverse Isaac Gym environment +env = load_omniverse_isaacgym_env(task_name="FrankaCabinet") +env = wrap_env(env) + +device = env.device + + +# Instantiate a RandomMemory as rollout buffer (any memory can be used for this) +memory = RandomMemory(memory_size=16, num_envs=env.num_envs, device=device) + + +# Instantiate the agent's models (function approximators). +# PPO requires 2 models, visit its documentation for more details +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models +models_ppo = {} +models_ppo["policy"] = Shared(env.observation_space, env.action_space, device) +models_ppo["value"] = models_ppo["policy"] # same instance: shared model + + +# Configure and instantiate the agent. 
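# A rough sketch of what the agent wires up from the two learning-rate scheduler entries in the
# configuration below (illustrative only, with made-up variable names; PPO creates its own Adam
# optimizer and steps the scheduler with the mean KL divergence measured during each update,
# adapting the learning rate depending on whether the KL overshoots or undershoots "kl_threshold"):
sketch_optimizer = torch.optim.Adam(models_ppo["policy"].parameters(), lr=5e-4)
sketch_scheduler = KLAdaptiveRL(sketch_optimizer, kl_threshold=0.008)
# sketch_scheduler.step(measured_kl)  # "measured_kl" is hypothetical: the KL value comes from the PPO update loop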
+# Only modify some of the default configuration, visit its documentation to see all the options +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters +cfg_ppo = PPO_DEFAULT_CONFIG.copy() +cfg_ppo["rollouts"] = 16 # memory_size +cfg_ppo["learning_epochs"] = 8 +cfg_ppo["mini_batches"] = 8 # 16 * 4096 / 8192 +cfg_ppo["discount_factor"] = 0.99 +cfg_ppo["lambda"] = 0.95 +cfg_ppo["learning_rate"] = 5e-4 +cfg_ppo["learning_rate_scheduler"] = KLAdaptiveRL +cfg_ppo["learning_rate_scheduler_kwargs"] = {"kl_threshold": 0.008} +cfg_ppo["random_timesteps"] = 0 +cfg_ppo["learning_starts"] = 0 +cfg_ppo["grad_norm_clip"] = 1.0 +cfg_ppo["ratio_clip"] = 0.2 +cfg_ppo["value_clip"] = 0.2 +cfg_ppo["clip_predicted_values"] = True +cfg_ppo["entropy_loss_scale"] = 0.0 +cfg_ppo["value_loss_scale"] = 2.0 +cfg_ppo["kl_threshold"] = 0 +cfg_ppo["rewards_shaper"] = lambda rewards, timestep, timesteps: rewards * 0.01 +cfg_ppo["state_preprocessor"] = RunningStandardScaler +cfg_ppo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device} +cfg_ppo["value_preprocessor"] = RunningStandardScaler +cfg_ppo["value_preprocessor_kwargs"] = {"size": 1, "device": device} +# logging to TensorBoard and write checkpoints each 120 and 1200 timesteps respectively +cfg_ppo["experiment"]["write_interval"] = 120 +cfg_ppo["experiment"]["checkpoint_interval"] = 1200 + +agent = PPO(models=models_ppo, + memory=memory, + cfg=cfg_ppo, + observation_space=env.observation_space, + action_space=env.action_space, + device=device) + + +# Configure and instantiate the RL trainer +cfg_trainer = {"timesteps": 24000, "headless": True} +trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent) + +# start training +trainer.train() diff --git a/docs/source/examples/omniisaacgym/ppo_humanoid.py b/docs/source/examples/omniisaacgym/ppo_humanoid.py index 29d57b6d..cf45a8a1 100644 --- a/docs/source/examples/omniisaacgym/ppo_humanoid.py +++ b/docs/source/examples/omniisaacgym/ppo_humanoid.py @@ -17,42 +17,37 @@ set_seed(42) -# Define the models (stochastic and deterministic models) for the agent using mixins. -# - Policy: takes as input the environment's observation/state and returns an action -# - Value: takes the state as input and provides a value to guide the policy -class Policy(GaussianMixin, Model): +# Define the shared model (stochastic and deterministic models) for the agent using mixins. 
+class Shared(GaussianMixin, DeterministicMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False, - clip_log_std=True, min_log_std=-20, max_log_std=2): + clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"): Model.__init__(self, observation_space, action_space, device) - GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction) + DeterministicMixin.__init__(self, clip_actions) self.net = nn.Sequential(nn.Linear(self.num_observations, 400), nn.ELU(), nn.Linear(400, 200), nn.ELU(), nn.Linear(200, 100), - nn.ELU(), - nn.Linear(100, self.num_actions)) + nn.ELU()) + + self.mean_layer = nn.Linear(100, self.num_actions) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + self.value_layer = nn.Linear(100, 1) - def compute(self, states, taken_actions, role): - return self.net(states), self.log_std_parameter - -class Value(DeterministicMixin, Model): - def __init__(self, observation_space, action_space, device, clip_actions=False): - Model.__init__(self, observation_space, action_space, device) - DeterministicMixin.__init__(self, clip_actions) - - self.net = nn.Sequential(nn.Linear(self.num_observations, 400), - nn.ELU(), - nn.Linear(400, 200), - nn.ELU(), - nn.Linear(200, 100), - nn.ELU(), - nn.Linear(100, 1)) + def act(self, states, taken_actions, role): + if role == "policy": + return GaussianMixin.act(self, states, taken_actions, role) + elif role == "value": + return DeterministicMixin.act(self, states, taken_actions, role) def compute(self, states, taken_actions, role): - return self.net(states) + if role == "policy": + return self.mean_layer(self.net(states)), self.log_std_parameter + elif role == "value": + return self.value_layer(self.net(states)) # Load and wrap the Omniverse Isaac Gym environment @@ -70,12 +65,8 @@ def compute(self, states, taken_actions, role): # PPO requires 2 models, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models models_ppo = {} -models_ppo["policy"] = Policy(env.observation_space, env.action_space, device) -models_ppo["value"] = Value(env.observation_space, env.action_space, device) - -# Initialize the models' parameters (weights and biases) using a Gaussian distribution -for model in models_ppo.values(): - model.init_parameters(method_name="normal_", mean=0.0, std=0.1) +models_ppo["policy"] = Shared(env.observation_space, env.action_space, device) +models_ppo["value"] = models_ppo["policy"] # same instance: shared model # Configure and instantiate the agent. 
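Most of these configs also pass a rewards_shaper that simply scales the raw rewards, for example lambda rewards, timestep, timesteps: rewards * 0.01. Because the callable also receives the current and total timesteps, shaping can depend on training progress. A hedged sketch of a custom shaper with the same signature (the annealed scale below is purely illustrative and not taken from any of these tasks):

def annealed_rewards_shaper(rewards, timestep, timesteps):
    # start from the usual 0.01 scaling and ease towards 0.1 as training progresses (illustrative values)
    scale = 0.01 + 0.09 * (timestep / timesteps)
    return rewards * scale

# drop-in replacement for the lambda used in these examples
cfg_ppo["rewards_shaper"] = annealed_rewards_shaper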
diff --git a/docs/source/examples/omniisaacgym/ppo_ingenuity.py b/docs/source/examples/omniisaacgym/ppo_ingenuity.py new file mode 100644 index 00000000..345ec93c --- /dev/null +++ b/docs/source/examples/omniisaacgym/ppo_ingenuity.py @@ -0,0 +1,115 @@ +import torch +import torch.nn as nn + +# Import the skrl components to build the RL system +from skrl.models.torch import Model, GaussianMixin, DeterministicMixin +from skrl.memories.torch import RandomMemory +from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG +from skrl.resources.schedulers.torch import KLAdaptiveRL +from skrl.resources.preprocessors.torch import RunningStandardScaler +from skrl.trainers.torch import SequentialTrainer +from skrl.envs.torch import wrap_env +from skrl.envs.torch import load_omniverse_isaacgym_env +from skrl.utils import set_seed + + +# set the seed for reproducibility +set_seed(42) + + +# Define the shared model (stochastic and deterministic models) for the agent using mixins. +class Shared(GaussianMixin, DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"): + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction) + DeterministicMixin.__init__(self, clip_actions) + + self.net = nn.Sequential(nn.Linear(self.num_observations, 256), + nn.ELU(), + nn.Linear(256, 256), + nn.ELU(), + nn.Linear(256, 128), + nn.ELU()) + + self.mean_layer = nn.Linear(128, self.num_actions) + self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + self.value_layer = nn.Linear(128, 1) + + def act(self, states, taken_actions, role): + if role == "policy": + return GaussianMixin.act(self, states, taken_actions, role) + elif role == "value": + return DeterministicMixin.act(self, states, taken_actions, role) + + def compute(self, states, taken_actions, role): + if role == "policy": + return self.mean_layer(self.net(states)), self.log_std_parameter + elif role == "value": + return self.value_layer(self.net(states)) + + +# Load and wrap the Omniverse Isaac Gym environment +env = load_omniverse_isaacgym_env(task_name="Ingenuity") +env = wrap_env(env) + +device = env.device + + +# Instantiate a RandomMemory as rollout buffer (any memory can be used for this) +memory = RandomMemory(memory_size=16, num_envs=env.num_envs, device=device) + + +# Instantiate the agent's models (function approximators). +# PPO requires 2 models, visit its documentation for more details +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models +models_ppo = {} +models_ppo["policy"] = Shared(env.observation_space, env.action_space, device) +models_ppo["value"] = models_ppo["policy"] # same instance: shared model + + +# Configure and instantiate the agent. 
+# Only modify some of the default configuration, visit its documentation to see all the options +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters +cfg_ppo = PPO_DEFAULT_CONFIG.copy() +cfg_ppo["rollouts"] = 16 # memory_size +cfg_ppo["learning_epochs"] = 8 +cfg_ppo["mini_batches"] = 4 # 16 * 4096 / 16384 +cfg_ppo["discount_factor"] = 0.99 +cfg_ppo["lambda"] = 0.95 +cfg_ppo["learning_rate"] = 1e-3 +cfg_ppo["learning_rate_scheduler"] = KLAdaptiveRL +cfg_ppo["learning_rate_scheduler_kwargs"] = {"kl_threshold": 0.016} +cfg_ppo["random_timesteps"] = 0 +cfg_ppo["learning_starts"] = 0 +cfg_ppo["grad_norm_clip"] = 1.0 +cfg_ppo["ratio_clip"] = 0.2 +cfg_ppo["value_clip"] = 0.2 +cfg_ppo["clip_predicted_values"] = True +cfg_ppo["entropy_loss_scale"] = 0.0 +cfg_ppo["value_loss_scale"] = 1.0 +cfg_ppo["kl_threshold"] = 0 +cfg_ppo["rewards_shaper"] = lambda rewards, timestep, timesteps: rewards * 0.01 +cfg_ppo["state_preprocessor"] = RunningStandardScaler +cfg_ppo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device} +cfg_ppo["value_preprocessor"] = RunningStandardScaler +cfg_ppo["value_preprocessor_kwargs"] = {"size": 1, "device": device} +# logging to TensorBoard and write checkpoints each 32 and 320 timesteps respectively +cfg_ppo["experiment"]["write_interval"] = 32 +cfg_ppo["experiment"]["checkpoint_interval"] = 320 + +agent = PPO(models=models_ppo, + memory=memory, + cfg=cfg_ppo, + observation_space=env.observation_space, + action_space=env.action_space, + device=device) + + +# Configure and instantiate the RL trainer +cfg_trainer = {"timesteps": 6400, "headless": True} +trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent) + +# start training +trainer.train() diff --git a/docs/source/examples/omniisaacgym/ppo_quadcopter.py b/docs/source/examples/omniisaacgym/ppo_quadcopter.py new file mode 100644 index 00000000..2425c4f2 --- /dev/null +++ b/docs/source/examples/omniisaacgym/ppo_quadcopter.py @@ -0,0 +1,115 @@ +import torch +import torch.nn as nn + +# Import the skrl components to build the RL system +from skrl.models.torch import Model, GaussianMixin, DeterministicMixin +from skrl.memories.torch import RandomMemory +from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG +from skrl.resources.schedulers.torch import KLAdaptiveRL +from skrl.resources.preprocessors.torch import RunningStandardScaler +from skrl.trainers.torch import SequentialTrainer +from skrl.envs.torch import wrap_env +from skrl.envs.torch import load_omniverse_isaacgym_env +from skrl.utils import set_seed + + +# set the seed for reproducibility +set_seed(42) + + +# Define the shared model (stochastic and deterministic models) for the agent using mixins. 
+class Shared(GaussianMixin, DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"): + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction) + DeterministicMixin.__init__(self, clip_actions) + + self.net = nn.Sequential(nn.Linear(self.num_observations, 256), + nn.ELU(), + nn.Linear(256, 256), + nn.ELU(), + nn.Linear(256, 128), + nn.ELU()) + + self.mean_layer = nn.Linear(128, self.num_actions) + self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + self.value_layer = nn.Linear(128, 1) + + def act(self, states, taken_actions, role): + if role == "policy": + return GaussianMixin.act(self, states, taken_actions, role) + elif role == "value": + return DeterministicMixin.act(self, states, taken_actions, role) + + def compute(self, states, taken_actions, role): + if role == "policy": + return self.mean_layer(self.net(states)), self.log_std_parameter + elif role == "value": + return self.value_layer(self.net(states)) + + +# Load and wrap the Omniverse Isaac Gym environment +env = load_omniverse_isaacgym_env(task_name="Quadcopter") +env = wrap_env(env) + +device = env.device + + +# Instantiate a RandomMemory as rollout buffer (any memory can be used for this) +memory = RandomMemory(memory_size=16, num_envs=env.num_envs, device=device) + + +# Instantiate the agent's models (function approximators). +# PPO requires 2 models, visit its documentation for more details +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models +models_ppo = {} +models_ppo["policy"] = Shared(env.observation_space, env.action_space, device) +models_ppo["value"] = models_ppo["policy"] # same instance: shared model + + +# Configure and instantiate the agent. 
+# Only modify some of the default configuration, visit its documentation to see all the options +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters +cfg_ppo = PPO_DEFAULT_CONFIG.copy() +cfg_ppo["rollouts"] = 16 # memory_size +cfg_ppo["learning_epochs"] = 8 +cfg_ppo["mini_batches"] = 4 # 16 * 4096 / 16384 +cfg_ppo["discount_factor"] = 0.99 +cfg_ppo["lambda"] = 0.95 +cfg_ppo["learning_rate"] = 1e-3 +cfg_ppo["learning_rate_scheduler"] = KLAdaptiveRL +cfg_ppo["learning_rate_scheduler_kwargs"] = {"kl_threshold": 0.016} +cfg_ppo["random_timesteps"] = 0 +cfg_ppo["learning_starts"] = 0 +cfg_ppo["grad_norm_clip"] = 1.0 +cfg_ppo["ratio_clip"] = 0.2 +cfg_ppo["value_clip"] = 0.2 +cfg_ppo["clip_predicted_values"] = True +cfg_ppo["entropy_loss_scale"] = 0.0 +cfg_ppo["value_loss_scale"] = 1.0 +cfg_ppo["kl_threshold"] = 0 +cfg_ppo["rewards_shaper"] = lambda rewards, timestep, timesteps: rewards * 0.1 +cfg_ppo["state_preprocessor"] = RunningStandardScaler +cfg_ppo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device} +cfg_ppo["value_preprocessor"] = RunningStandardScaler +cfg_ppo["value_preprocessor_kwargs"] = {"size": 1, "device": device} +# logging to TensorBoard and write checkpoints each 80 and 800 timesteps respectively +cfg_ppo["experiment"]["write_interval"] = 80 +cfg_ppo["experiment"]["checkpoint_interval"] = 800 + +agent = PPO(models=models_ppo, + memory=memory, + cfg=cfg_ppo, + observation_space=env.observation_space, + action_space=env.action_space, + device=device) + + +# Configure and instantiate the RL trainer +cfg_trainer = {"timesteps": 16000, "headless": True} +trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent) + +# start training +trainer.train() diff --git a/docs/source/examples/omniisaacgym/ppo_shadow_hand.py b/docs/source/examples/omniisaacgym/ppo_shadow_hand.py index 1d1e9e11..4908a92c 100644 --- a/docs/source/examples/omniisaacgym/ppo_shadow_hand.py +++ b/docs/source/examples/omniisaacgym/ppo_shadow_hand.py @@ -17,14 +17,13 @@ set_seed(42) -# Define the models (stochastic and deterministic models) for the agent using mixins. -# - Policy: takes as input the environment's observation/state and returns an action -# - Value: takes the state as input and provides a value to guide the policy -class Policy(GaussianMixin, Model): +# Define the shared model (stochastic and deterministic models) for the agent using mixins. 
+class Shared(GaussianMixin, DeterministicMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False, - clip_log_std=True, min_log_std=-20, max_log_std=2): + clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"): Model.__init__(self, observation_space, action_space, device) - GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction) + DeterministicMixin.__init__(self, clip_actions) self.net = nn.Sequential(nn.Linear(self.num_observations, 512), nn.ELU(), @@ -33,30 +32,24 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, nn.Linear(512, 256), nn.ELU(), nn.Linear(256, 128), - nn.ELU(), - nn.Linear(128, self.num_actions)) + nn.ELU()) + + self.mean_layer = nn.Linear(128, self.num_actions) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + self.value_layer = nn.Linear(128, 1) - def compute(self, states, taken_actions, role): - return self.net(states), self.log_std_parameter - -class Value(DeterministicMixin, Model): - def __init__(self, observation_space, action_space, device, clip_actions=False): - Model.__init__(self, observation_space, action_space, device) - DeterministicMixin.__init__(self, clip_actions) - - self.net = nn.Sequential(nn.Linear(self.num_observations, 512), - nn.ELU(), - nn.Linear(512, 512), - nn.ELU(), - nn.Linear(512, 256), - nn.ELU(), - nn.Linear(256, 128), - nn.ELU(), - nn.Linear(128, 1)) + def act(self, states, taken_actions, role): + if role == "policy": + return GaussianMixin.act(self, states, taken_actions, role) + elif role == "value": + return DeterministicMixin.act(self, states, taken_actions, role) def compute(self, states, taken_actions, role): - return self.net(states) + if role == "policy": + return self.mean_layer(self.net(states)), self.log_std_parameter + elif role == "value": + return self.value_layer(self.net(states)) # Load and wrap the Omniverse Isaac Gym environment @@ -67,28 +60,24 @@ def compute(self, states, taken_actions, role): # Instantiate a RandomMemory as rollout buffer (any memory can be used for this) -memory = RandomMemory(memory_size=8, num_envs=env.num_envs, device=device) +memory = RandomMemory(memory_size=16, num_envs=env.num_envs, device=device) # Instantiate the agent's models (function approximators). # PPO requires 2 models, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models models_ppo = {} -models_ppo["policy"] = Policy(env.observation_space, env.action_space, device) -models_ppo["value"] = Value(env.observation_space, env.action_space, device) - -# Initialize the models' parameters (weights and biases) using a Gaussian distribution -for model in models_ppo.values(): - model.init_parameters(method_name="normal_", mean=0.0, std=0.1) +models_ppo["policy"] = Shared(env.observation_space, env.action_space, device) +models_ppo["value"] = models_ppo["policy"] # same instance: shared model # Configure and instantiate the agent. 
# Only modify some of the default configuration, visit its documentation to see all the options # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters cfg_ppo = PPO_DEFAULT_CONFIG.copy() -cfg_ppo["rollouts"] = 8 # memory_size +cfg_ppo["rollouts"] = 16 # memory_size cfg_ppo["learning_epochs"] = 5 -cfg_ppo["mini_batches"] = 4 # 8 * 16384 / 32768 +cfg_ppo["mini_batches"] = 4 # 16 * 8192 / 32768 cfg_ppo["discount_factor"] = 0.99 cfg_ppo["lambda"] = 0.95 cfg_ppo["learning_rate"] = 5e-4 @@ -108,9 +97,9 @@ def compute(self, states, taken_actions, role): cfg_ppo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device} cfg_ppo["value_preprocessor"] = RunningStandardScaler cfg_ppo["value_preprocessor_kwargs"] = {"size": 1, "device": device} -# logging to TensorBoard and write checkpoints each 200 and 2000 timesteps respectively -cfg_ppo["experiment"]["write_interval"] = 200 -cfg_ppo["experiment"]["checkpoint_interval"] = 2000 +# logging to TensorBoard and write checkpoints each 800 and 8000 timesteps respectively +cfg_ppo["experiment"]["write_interval"] = 800 +cfg_ppo["experiment"]["checkpoint_interval"] = 8000 agent = PPO(models=models_ppo, memory=memory, @@ -121,7 +110,7 @@ def compute(self, states, taken_actions, role): # Configure and instantiate the RL trainer -cfg_trainer = {"timesteps": 40000, "headless": True} +cfg_trainer = {"timesteps": 160000, "headless": True} trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent) # start training From 00a317b33344c90fdb75354b56f4076ae05ba162 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Fri, 30 Sep 2022 10:23:30 +0200 Subject: [PATCH 092/108] Update Isaac Gym examples --- .../examples/isaacgym/ppo_allegro_hand.py | 55 ++++++++---------- docs/source/examples/isaacgym/ppo_ant.py | 51 +++++++---------- docs/source/examples/isaacgym/ppo_anymal.py | 51 +++++++---------- .../examples/isaacgym/ppo_anymal_terrain.py | 8 +-- .../examples/isaacgym/ppo_ball_balance.py | 50 +++++++--------- docs/source/examples/isaacgym/ppo_cartpole.py | 49 +++++++--------- .../examples/isaacgym/ppo_cartpole_eval.py | 31 +++++++--- .../examples/isaacgym/ppo_franka_cabinet.py | 51 +++++++---------- docs/source/examples/isaacgym/ppo_humanoid.py | 51 +++++++---------- .../source/examples/isaacgym/ppo_ingenuity.py | 57 ++++++++----------- .../examples/isaacgym/ppo_quadcopter.py | 51 +++++++---------- .../examples/isaacgym/ppo_shadow_hand.py | 53 +++++++---------- .../source/examples/isaacgym/ppo_trifinger.py | 57 +++++++------------ 13 files changed, 261 insertions(+), 354 deletions(-) diff --git a/docs/source/examples/isaacgym/ppo_allegro_hand.py b/docs/source/examples/isaacgym/ppo_allegro_hand.py index 927b8f5d..2214df9b 100644 --- a/docs/source/examples/isaacgym/ppo_allegro_hand.py +++ b/docs/source/examples/isaacgym/ppo_allegro_hand.py @@ -16,49 +16,44 @@ # set the seed for reproducibility -set_seed(42) +seed = set_seed(42) -# Define the models (stochastic and deterministic models) for the agent using mixins. -# - Policy: takes as input the environment's observation/state and returns an action -# - Value: takes the state as input and provides a value to guide the policy -class Policy(GaussianMixin, Model): +# Define the shared model (stochastic and deterministic models) for the agent using mixins. 
+class Shared(GaussianMixin, DeterministicMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False, - clip_log_std=True, min_log_std=-20, max_log_std=2): + clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"): Model.__init__(self, observation_space, action_space, device) - GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction) + DeterministicMixin.__init__(self, clip_actions) self.net = nn.Sequential(nn.Linear(self.num_observations, 512), nn.ELU(), nn.Linear(512, 256), nn.ELU(), nn.Linear(256, 128), - nn.ELU(), - nn.Linear(128, self.num_actions)) + nn.ELU()) + + self.mean_layer = nn.Linear(128, self.num_actions) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + self.value_layer = nn.Linear(128, 1) - def compute(self, states, taken_actions, role): - return self.net(states), self.log_std_parameter - -class Value(DeterministicMixin, Model): - def __init__(self, observation_space, action_space, device, clip_actions=False): - Model.__init__(self, observation_space, action_space, device) - DeterministicMixin.__init__(self, clip_actions) - - self.net = nn.Sequential(nn.Linear(self.num_observations, 512), - nn.ELU(), - nn.Linear(512, 256), - nn.ELU(), - nn.Linear(256, 128), - nn.ELU(), - nn.Linear(128, 1)) + def act(self, states, taken_actions, role): + if role == "policy": + return GaussianMixin.act(self, states, taken_actions, role) + elif role == "value": + return DeterministicMixin.act(self, states, taken_actions, role) def compute(self, states, taken_actions, role): - return self.net(states) + if role == "policy": + return self.mean_layer(self.net(states)), self.log_std_parameter + elif role == "value": + return self.value_layer(self.net(states)) # Load and wrap the Isaac Gym environment using the easy-to-use API from NVIDIA -env = isaacgymenvs.make(seed=42, +env = isaacgymenvs.make(seed=seed, task="AllegroHand", num_envs=16384, sim_device="cuda:0", @@ -78,12 +73,8 @@ def compute(self, states, taken_actions, role): # PPO requires 2 models, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models models_ppo = {} -models_ppo["policy"] = Policy(env.observation_space, env.action_space, device) -models_ppo["value"] = Value(env.observation_space, env.action_space, device) - -# Initialize the models' parameters (weights and biases) using a Gaussian distribution -for model in models_ppo.values(): - model.init_parameters(method_name="normal_", mean=0.0, std=0.1) +models_ppo["policy"] = Shared(env.observation_space, env.action_space, device) +models_ppo["value"] = models_ppo["policy"] # same instance: shared model # Configure and instantiate the agent. diff --git a/docs/source/examples/isaacgym/ppo_ant.py b/docs/source/examples/isaacgym/ppo_ant.py index ad2ce0bf..4bbc11f1 100644 --- a/docs/source/examples/isaacgym/ppo_ant.py +++ b/docs/source/examples/isaacgym/ppo_ant.py @@ -19,42 +19,37 @@ set_seed(42) -# Define the models (stochastic and deterministic models) for the agent using mixins. -# - Policy: takes as input the environment's observation/state and returns an action -# - Value: takes the state as input and provides a value to guide the policy -class Policy(GaussianMixin, Model): +# Define the shared model (stochastic and deterministic models) for the agent using mixins. 
+class Shared(GaussianMixin, DeterministicMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False, - clip_log_std=True, min_log_std=-20, max_log_std=2): + clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"): Model.__init__(self, observation_space, action_space, device) - GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction) + DeterministicMixin.__init__(self, clip_actions) self.net = nn.Sequential(nn.Linear(self.num_observations, 256), nn.ELU(), nn.Linear(256, 128), nn.ELU(), nn.Linear(128, 64), - nn.ELU(), - nn.Linear(64, self.num_actions)) + nn.ELU()) + + self.mean_layer = nn.Linear(64, self.num_actions) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + self.value_layer = nn.Linear(64, 1) - def compute(self, states, taken_actions, role): - return self.net(states), self.log_std_parameter - -class Value(DeterministicMixin, Model): - def __init__(self, observation_space, action_space, device, clip_actions=False): - Model.__init__(self, observation_space, action_space, device) - DeterministicMixin.__init__(self, clip_actions) - - self.net = nn.Sequential(nn.Linear(self.num_observations, 256), - nn.ELU(), - nn.Linear(256, 128), - nn.ELU(), - nn.Linear(128, 64), - nn.ELU(), - nn.Linear(64, 1)) + def act(self, states, taken_actions, role): + if role == "policy": + return GaussianMixin.act(self, states, taken_actions, role) + elif role == "value": + return DeterministicMixin.act(self, states, taken_actions, role) def compute(self, states, taken_actions, role): - return self.net(states) + if role == "policy": + return self.mean_layer(self.net(states)), self.log_std_parameter + elif role == "value": + return self.value_layer(self.net(states)) # Load and wrap the Isaac Gym environment @@ -72,12 +67,8 @@ def compute(self, states, taken_actions, role): # PPO requires 2 models, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models models_ppo = {} -models_ppo["policy"] = Policy(env.observation_space, env.action_space, device) -models_ppo["value"] = Value(env.observation_space, env.action_space, device) - -# Initialize the models' parameters (weights and biases) using a Gaussian distribution -for model in models_ppo.values(): - model.init_parameters(method_name="normal_", mean=0.0, std=0.1) +models_ppo["policy"] = Shared(env.observation_space, env.action_space, device) +models_ppo["value"] = models_ppo["policy"] # same instance: shared model # Configure and instantiate the agent. diff --git a/docs/source/examples/isaacgym/ppo_anymal.py b/docs/source/examples/isaacgym/ppo_anymal.py index 656cc3fe..1de573e8 100644 --- a/docs/source/examples/isaacgym/ppo_anymal.py +++ b/docs/source/examples/isaacgym/ppo_anymal.py @@ -19,42 +19,37 @@ set_seed(42) -# Define the models (stochastic and deterministic models) for the agent using mixins. -# - Policy: takes as input the environment's observation/state and returns an action -# - Value: takes the state as input and provides a value to guide the policy -class Policy(GaussianMixin, Model): +# Define the shared model (stochastic and deterministic models) for the agent using mixins. 
+class Shared(GaussianMixin, DeterministicMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False, - clip_log_std=True, min_log_std=-20, max_log_std=2): + clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"): Model.__init__(self, observation_space, action_space, device) - GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction) + DeterministicMixin.__init__(self, clip_actions) self.net = nn.Sequential(nn.Linear(self.num_observations, 256), nn.ELU(), nn.Linear(256, 128), nn.ELU(), nn.Linear(128, 64), - nn.ELU(), - nn.Linear(64, self.num_actions)) + nn.ELU()) + + self.mean_layer = nn.Linear(64, self.num_actions) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + self.value_layer = nn.Linear(64, 1) - def compute(self, states, taken_actions, role): - return self.net(states), self.log_std_parameter - -class Value(DeterministicMixin, Model): - def __init__(self, observation_space, action_space, device, clip_actions=False): - Model.__init__(self, observation_space, action_space, device) - DeterministicMixin.__init__(self, clip_actions) - - self.net = nn.Sequential(nn.Linear(self.num_observations, 256), - nn.ELU(), - nn.Linear(256, 128), - nn.ELU(), - nn.Linear(128, 64), - nn.ELU(), - nn.Linear(64, 1)) + def act(self, states, taken_actions, role): + if role == "policy": + return GaussianMixin.act(self, states, taken_actions, role) + elif role == "value": + return DeterministicMixin.act(self, states, taken_actions, role) def compute(self, states, taken_actions, role): - return self.net(states) + if role == "policy": + return self.mean_layer(self.net(states)), self.log_std_parameter + elif role == "value": + return self.value_layer(self.net(states)) # Load and wrap the Isaac Gym environment @@ -72,12 +67,8 @@ def compute(self, states, taken_actions, role): # PPO requires 2 models, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models models_ppo = {} -models_ppo["policy"] = Policy(env.observation_space, env.action_space, device) -models_ppo["value"] = Value(env.observation_space, env.action_space, device) - -# Initialize the models' parameters (weights and biases) using a Gaussian distribution -for model in models_ppo.values(): - model.init_parameters(method_name="normal_", mean=0.0, std=0.1) +models_ppo["policy"] = Shared(env.observation_space, env.action_space, device) +models_ppo["value"] = models_ppo["policy"] # same instance: shared model # Configure and instantiate the agent. 
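Most of the example diffs in this patch series apply the same refactoring: the separate Policy and Value classes are replaced by a single Shared model that runs a Gaussian policy head and a deterministic value head on a common trunk, and both entries of the models_ppo dictionary point to the same instance. Condensed outside the diff context, the pattern looks roughly like the sketch below. It is illustrative only: the layer sizes and the placeholder gym spaces are arbitrary stand-ins, and the skrl calls follow the same API already used in the diffs above.

import gym
import torch
import torch.nn as nn

from skrl.models.torch import Model, GaussianMixin, DeterministicMixin


class Shared(GaussianMixin, DeterministicMixin, Model):
    def __init__(self, observation_space, action_space, device, clip_actions=False,
                 clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"):
        Model.__init__(self, observation_space, action_space, device)
        GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction)
        DeterministicMixin.__init__(self, clip_actions)

        # common trunk shared by both roles
        self.net = nn.Sequential(nn.Linear(self.num_observations, 256),
                                 nn.ELU(),
                                 nn.Linear(256, 128),
                                 nn.ELU())

        # role-specific heads: action mean / log-std for the policy, a scalar output for the value
        self.mean_layer = nn.Linear(128, self.num_actions)
        self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions))
        self.value_layer = nn.Linear(128, 1)

    def act(self, states, taken_actions, role):
        # dispatch to the mixin that matches the requested role
        if role == "policy":
            return GaussianMixin.act(self, states, taken_actions, role)
        elif role == "value":
            return DeterministicMixin.act(self, states, taken_actions, role)

    def compute(self, states, taken_actions, role):
        if role == "policy":
            return self.mean_layer(self.net(states)), self.log_std_parameter
        elif role == "value":
            return self.value_layer(self.net(states))


# placeholder spaces stand in for env.observation_space / env.action_space of a wrapped environment
observation_space = gym.spaces.Box(low=-1.0, high=1.0, shape=(60,))
action_space = gym.spaces.Box(low=-1.0, high=1.0, shape=(12,))

shared = Shared(observation_space, action_space, device="cpu")
models_ppo = {"policy": shared,
              "value": shared}  # same instance: parameters are shared between roles

Because both roles run through the same trunk, only the output heads (mean_layer, log_std_parameter and value_layer) are role-specific, which is why the two models_ppo entries can safely point to one instance instead of two separately initialized networks.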
diff --git a/docs/source/examples/isaacgym/ppo_anymal_terrain.py b/docs/source/examples/isaacgym/ppo_anymal_terrain.py index d64118f2..f83e0c9f 100644 --- a/docs/source/examples/isaacgym/ppo_anymal_terrain.py +++ b/docs/source/examples/isaacgym/ppo_anymal_terrain.py @@ -24,9 +24,9 @@ # - Value: takes the state as input and provides a value to guide the policy class Policy(GaussianMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False, - clip_log_std=True, min_log_std=-20, max_log_std=2): + clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"): Model.__init__(self, observation_space, action_space, device) - GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction) self.net = nn.Sequential(nn.Linear(self.num_observations, 512), nn.ELU(), @@ -75,10 +75,6 @@ def compute(self, states, taken_actions, role): models_ppo["policy"] = Policy(env.observation_space, env.action_space, device) models_ppo["value"] = Value(env.observation_space, env.action_space, device) -# Initialize the models' parameters (weights and biases) using a Gaussian distribution -for model in models_ppo.values(): - model.init_parameters(method_name="normal_", mean=0.0, std=0.1) - # Configure and instantiate the agent. # Only modify some of the default configuration, visit its documentation to see all the options diff --git a/docs/source/examples/isaacgym/ppo_ball_balance.py b/docs/source/examples/isaacgym/ppo_ball_balance.py index bacde4d7..e3428f60 100644 --- a/docs/source/examples/isaacgym/ppo_ball_balance.py +++ b/docs/source/examples/isaacgym/ppo_ball_balance.py @@ -19,42 +19,37 @@ set_seed(42) -# Define the models (stochastic and deterministic models) for the agent using mixins. -# - Policy: takes as input the environment's observation/state and returns an action -# - Value: takes the state as input and provides a value to guide the policy -class Policy(GaussianMixin, Model): +# Define the shared model (stochastic and deterministic models) for the agent using mixins. 
+class Shared(GaussianMixin, DeterministicMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False, - clip_log_std=True, min_log_std=-20, max_log_std=2): + clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"): Model.__init__(self, observation_space, action_space, device) - GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction) + DeterministicMixin.__init__(self, clip_actions) self.net = nn.Sequential(nn.Linear(self.num_observations, 128), nn.ELU(), nn.Linear(128, 64), nn.ELU(), nn.Linear(64, 32), - nn.ELU(), - nn.Linear(32, self.num_actions)) + nn.ELU()) + + self.mean_layer = nn.Linear(32, self.num_actions) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + self.value_layer = nn.Linear(32, 1) - def compute(self, states, taken_actions, role): - return self.net(states), self.log_std_parameter - -class Value(DeterministicMixin, Model): - def __init__(self, observation_space, action_space, device, clip_actions=False): - Model.__init__(self, observation_space, action_space, device) - DeterministicMixin.__init__(self, clip_actions) - - self.net = nn.Sequential(nn.Linear(self.num_observations, 128), - nn.ELU(), - nn.Linear(128, 64), - nn.ELU(), - nn.Linear(64, 32), - nn.ELU(), - nn.Linear(32, 1)) + def act(self, states, taken_actions, role): + if role == "policy": + return GaussianMixin.act(self, states, taken_actions, role) + elif role == "value": + return DeterministicMixin.act(self, states, taken_actions, role) def compute(self, states, taken_actions, role): - return self.net(states) + if role == "policy": + return self.mean_layer(self.net(states)), self.log_std_parameter + elif role == "value": + return self.value_layer(self.net(states)) # Load and wrap the Isaac Gym environment @@ -72,12 +67,9 @@ def compute(self, states, taken_actions, role): # PPO requires 2 models, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models models_ppo = {} -models_ppo["policy"] = Policy(env.observation_space, env.action_space, device) -models_ppo["value"] = Value(env.observation_space, env.action_space, device) +models_ppo["policy"] = Shared(env.observation_space, env.action_space, device) +models_ppo["value"] = models_ppo["policy"] # same instance: shared model -# Initialize the models' parameters (weights and biases) using a Gaussian distribution -for model in models_ppo.values(): - model.init_parameters(method_name="normal_", mean=0.0, std=0.1) # Configure and instantiate the agent. # Only modify some of the default configuration, visit its documentation to see all the options diff --git a/docs/source/examples/isaacgym/ppo_cartpole.py b/docs/source/examples/isaacgym/ppo_cartpole.py index 9525453c..e3f8c253 100644 --- a/docs/source/examples/isaacgym/ppo_cartpole.py +++ b/docs/source/examples/isaacgym/ppo_cartpole.py @@ -19,38 +19,35 @@ set_seed(42) -# Define the models (stochastic and deterministic models) for the agent using mixins. -# - Policy: takes as input the environment's observation/state and returns an action -# - Value: takes the state as input and provides a value to guide the policy -class Policy(GaussianMixin, Model): +# Define the shared model (stochastic and deterministic models) for the agent using mixins. 
+class Shared(GaussianMixin, DeterministicMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False, - clip_log_std=True, min_log_std=-20, max_log_std=2): + clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"): Model.__init__(self, observation_space, action_space, device) - GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction) + DeterministicMixin.__init__(self, clip_actions) self.net = nn.Sequential(nn.Linear(self.num_observations, 32), nn.ELU(), nn.Linear(32, 32), - nn.ELU(), - nn.Linear(32, self.num_actions)) + nn.ELU()) + + self.mean_layer = nn.Linear(32, self.num_actions) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + self.value_layer = nn.Linear(32, 1) - def compute(self, states, taken_actions, role): - return self.net(states), self.log_std_parameter - -class Value(DeterministicMixin, Model): - def __init__(self, observation_space, action_space, device, clip_actions=False): - Model.__init__(self, observation_space, action_space, device) - DeterministicMixin.__init__(self, clip_actions) - - self.net = nn.Sequential(nn.Linear(self.num_observations, 32), - nn.ELU(), - nn.Linear(32, 32), - nn.ELU(), - nn.Linear(32, 1)) + def act(self, states, taken_actions, role): + if role == "policy": + return GaussianMixin.act(self, states, taken_actions, role) + elif role == "value": + return DeterministicMixin.act(self, states, taken_actions, role) def compute(self, states, taken_actions, role): - return self.net(states) + if role == "policy": + return self.mean_layer(self.net(states)), self.log_std_parameter + elif role == "value": + return self.value_layer(self.net(states)) # Load and wrap the Isaac Gym environment @@ -68,12 +65,8 @@ def compute(self, states, taken_actions, role): # PPO requires 2 models, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models models_ppo = {} -models_ppo["policy"] = Policy(env.observation_space, env.action_space, device) -models_ppo["value"] = Value(env.observation_space, env.action_space, device) - -# Initialize the models' parameters (weights and biases) using a Gaussian distribution -for model in models_ppo.values(): - model.init_parameters(method_name="normal_", mean=0.0, std=0.1) +models_ppo["policy"] = Shared(env.observation_space, env.action_space, device) +models_ppo["value"] = models_ppo["policy"] # same instance: shared model # Configure and instantiate the agent. diff --git a/docs/source/examples/isaacgym/ppo_cartpole_eval.py b/docs/source/examples/isaacgym/ppo_cartpole_eval.py index 3ed1ff63..dde950f6 100644 --- a/docs/source/examples/isaacgym/ppo_cartpole_eval.py +++ b/docs/source/examples/isaacgym/ppo_cartpole_eval.py @@ -4,7 +4,7 @@ import torch.nn as nn # Import the skrl components to build the RL system -from skrl.models.torch import Model, GaussianMixin +from skrl.models.torch import Model, GaussianMixin, DeterministicMixin from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG from skrl.resources.preprocessors.torch import RunningStandardScaler from skrl.trainers.torch import SequentialTrainer @@ -12,22 +12,35 @@ from skrl.envs.torch import load_isaacgym_env_preview4 -# Define only the policy for evaluation -class Policy(GaussianMixin, Model): +# Define the shared model (stochastic and deterministic models) for the agent using mixins. 
+class Shared(GaussianMixin, DeterministicMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False, - clip_log_std=True, min_log_std=-20, max_log_std=2): + clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"): Model.__init__(self, observation_space, action_space, device) - GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction) + DeterministicMixin.__init__(self, clip_actions) self.net = nn.Sequential(nn.Linear(self.num_observations, 32), nn.ELU(), nn.Linear(32, 32), - nn.ELU(), - nn.Linear(32, self.num_actions)) + nn.ELU()) + + self.mean_layer = nn.Linear(32, self.num_actions) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + self.value_layer = nn.Linear(32, 1) + + def act(self, states, taken_actions, role): + if role == "policy": + return GaussianMixin.act(self, states, taken_actions, role) + elif role == "value": + return DeterministicMixin.act(self, states, taken_actions, role) def compute(self, states, taken_actions, role): - return self.net(states), self.log_std_parameter + if role == "policy": + return self.mean_layer(self.net(states)), self.log_std_parameter + elif role == "value": + return self.value_layer(self.net(states)) # Load and wrap the Isaac Gym environment @@ -41,7 +54,7 @@ def compute(self, states, taken_actions, role): # PPO requires 2 models, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models models_ppo = {} -models_ppo["policy"] = Policy(env.observation_space, env.action_space, device) +models_ppo["policy"] = Shared(env.observation_space, env.action_space, device) # Configure and instantiate the agent. diff --git a/docs/source/examples/isaacgym/ppo_franka_cabinet.py b/docs/source/examples/isaacgym/ppo_franka_cabinet.py index 690ad036..3fd9e834 100644 --- a/docs/source/examples/isaacgym/ppo_franka_cabinet.py +++ b/docs/source/examples/isaacgym/ppo_franka_cabinet.py @@ -19,42 +19,37 @@ set_seed(42) -# Define the models (stochastic and deterministic models) for the agent using mixins. -# - Policy: takes as input the environment's observation/state and returns an action -# - Value: takes the state as input and provides a value to guide the policy -class Policy(GaussianMixin, Model): +# Define the shared model (stochastic and deterministic models) for the agent using mixins. 
+class Shared(GaussianMixin, DeterministicMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False, - clip_log_std=True, min_log_std=-20, max_log_std=2): + clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"): Model.__init__(self, observation_space, action_space, device) - GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction) + DeterministicMixin.__init__(self, clip_actions) self.net = nn.Sequential(nn.Linear(self.num_observations, 256), nn.ELU(), nn.Linear(256, 128), nn.ELU(), nn.Linear(128, 64), - nn.ELU(), - nn.Linear(64, self.num_actions)) + nn.ELU()) + + self.mean_layer = nn.Linear(64, self.num_actions) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + self.value_layer = nn.Linear(64, 1) - def compute(self, states, taken_actions, role): - return self.net(states), self.log_std_parameter - -class Value(DeterministicMixin, Model): - def __init__(self, observation_space, action_space, device, clip_actions=False): - Model.__init__(self, observation_space, action_space, device) - DeterministicMixin.__init__(self, clip_actions) - - self.net = nn.Sequential(nn.Linear(self.num_observations, 256), - nn.ELU(), - nn.Linear(256, 128), - nn.ELU(), - nn.Linear(128, 64), - nn.ELU(), - nn.Linear(64, 1)) + def act(self, states, taken_actions, role): + if role == "policy": + return GaussianMixin.act(self, states, taken_actions, role) + elif role == "value": + return DeterministicMixin.act(self, states, taken_actions, role) def compute(self, states, taken_actions, role): - return self.net(states) + if role == "policy": + return self.mean_layer(self.net(states)), self.log_std_parameter + elif role == "value": + return self.value_layer(self.net(states)) # Load and wrap the Isaac Gym environment @@ -72,12 +67,8 @@ def compute(self, states, taken_actions, role): # PPO requires 2 models, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models models_ppo = {} -models_ppo["policy"] = Policy(env.observation_space, env.action_space, device) -models_ppo["value"] = Value(env.observation_space, env.action_space, device) - -# Initialize the models' parameters (weights and biases) using a Gaussian distribution -for model in models_ppo.values(): - model.init_parameters(method_name="normal_", mean=0.0, std=0.1) +models_ppo["policy"] = Shared(env.observation_space, env.action_space, device) +models_ppo["value"] = models_ppo["policy"] # same instance: shared model # Configure and instantiate the agent. diff --git a/docs/source/examples/isaacgym/ppo_humanoid.py b/docs/source/examples/isaacgym/ppo_humanoid.py index 1a1272f9..19893771 100644 --- a/docs/source/examples/isaacgym/ppo_humanoid.py +++ b/docs/source/examples/isaacgym/ppo_humanoid.py @@ -19,42 +19,37 @@ set_seed(42) -# Define the models (stochastic and deterministic models) for the agent using mixins. -# - Policy: takes as input the environment's observation/state and returns an action -# - Value: takes the state as input and provides a value to guide the policy -class Policy(GaussianMixin, Model): +# Define the shared model (stochastic and deterministic models) for the agent using mixins. 
+class Shared(GaussianMixin, DeterministicMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False, - clip_log_std=True, min_log_std=-20, max_log_std=2): + clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"): Model.__init__(self, observation_space, action_space, device) - GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction) + DeterministicMixin.__init__(self, clip_actions) self.net = nn.Sequential(nn.Linear(self.num_observations, 400), nn.ELU(), nn.Linear(400, 200), nn.ELU(), nn.Linear(200, 100), - nn.ELU(), - nn.Linear(100, self.num_actions)) + nn.ELU()) + + self.mean_layer = nn.Linear(100, self.num_actions) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + self.value_layer = nn.Linear(100, 1) - def compute(self, states, taken_actions, role): - return self.net(states), self.log_std_parameter - -class Value(DeterministicMixin, Model): - def __init__(self, observation_space, action_space, device, clip_actions=False): - Model.__init__(self, observation_space, action_space, device) - DeterministicMixin.__init__(self, clip_actions) - - self.net = nn.Sequential(nn.Linear(self.num_observations, 400), - nn.ELU(), - nn.Linear(400, 200), - nn.ELU(), - nn.Linear(200, 100), - nn.ELU(), - nn.Linear(100, 1)) + def act(self, states, taken_actions, role): + if role == "policy": + return GaussianMixin.act(self, states, taken_actions, role) + elif role == "value": + return DeterministicMixin.act(self, states, taken_actions, role) def compute(self, states, taken_actions, role): - return self.net(states) + if role == "policy": + return self.mean_layer(self.net(states)), self.log_std_parameter + elif role == "value": + return self.value_layer(self.net(states)) # Load and wrap the Isaac Gym environment @@ -72,12 +67,8 @@ def compute(self, states, taken_actions, role): # PPO requires 2 models, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models models_ppo = {} -models_ppo["policy"] = Policy(env.observation_space, env.action_space, device) -models_ppo["value"] = Value(env.observation_space, env.action_space, device) - -# Initialize the models' parameters (weights and biases) using a Gaussian distribution -for model in models_ppo.values(): - model.init_parameters(method_name="normal_", mean=0.0, std=0.1) +models_ppo["policy"] = Shared(env.observation_space, env.action_space, device) +models_ppo["value"] = models_ppo["policy"] # same instance: shared model # Configure and instantiate the agent. diff --git a/docs/source/examples/isaacgym/ppo_ingenuity.py b/docs/source/examples/isaacgym/ppo_ingenuity.py index 84c7570b..f60d46aa 100644 --- a/docs/source/examples/isaacgym/ppo_ingenuity.py +++ b/docs/source/examples/isaacgym/ppo_ingenuity.py @@ -16,55 +16,50 @@ # set the seed for reproducibility -set_seed(42) +seed = set_seed(42) -# Define the models (stochastic and deterministic models) for the agent using mixins. -# - Policy: takes as input the environment's observation/state and returns an action -# - Value: takes the state as input and provides a value to guide the policy -class Policy(GaussianMixin, Model): +# Define the shared model (stochastic and deterministic models) for the agent using mixins. 
+class Shared(GaussianMixin, DeterministicMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False, - clip_log_std=True, min_log_std=-20, max_log_std=2): + clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"): Model.__init__(self, observation_space, action_space, device) - GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction) + DeterministicMixin.__init__(self, clip_actions) self.net = nn.Sequential(nn.Linear(self.num_observations, 256), nn.ELU(), nn.Linear(256, 256), nn.ELU(), nn.Linear(256, 128), - nn.ELU(), - nn.Linear(128, self.num_actions)) + nn.ELU()) + + self.mean_layer = nn.Linear(128, self.num_actions) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + self.value_layer = nn.Linear(128, 1) - def compute(self, states, taken_actions, role): - return self.net(states), self.log_std_parameter - -class Value(DeterministicMixin, Model): - def __init__(self, observation_space, action_space, device, clip_actions=False): - Model.__init__(self, observation_space, action_space, device) - DeterministicMixin.__init__(self, clip_actions) - - self.net = nn.Sequential(nn.Linear(self.num_observations, 256), - nn.ELU(), - nn.Linear(256, 256), - nn.ELU(), - nn.Linear(256, 128), - nn.ELU(), - nn.Linear(128, 1)) + def act(self, states, taken_actions, role): + if role == "policy": + return GaussianMixin.act(self, states, taken_actions, role) + elif role == "value": + return DeterministicMixin.act(self, states, taken_actions, role) def compute(self, states, taken_actions, role): - return self.net(states) + if role == "policy": + return self.mean_layer(self.net(states)), self.log_std_parameter + elif role == "value": + return self.value_layer(self.net(states)) # Load and wrap the Isaac Gym environment using the easy-to-use API from NVIDIA -env = isaacgymenvs.make(seed=42, +env = isaacgymenvs.make(seed=seed, task="Ingenuity", num_envs=4096, sim_device="cuda:0", rl_device="cuda:0", graphics_device_id=0, - headless=False) + headless=True) env = wrap_env(env) device = env.device @@ -78,12 +73,8 @@ def compute(self, states, taken_actions, role): # PPO requires 2 models, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models models_ppo = {} -models_ppo["policy"] = Policy(env.observation_space, env.action_space, device) -models_ppo["value"] = Value(env.observation_space, env.action_space, device) - -# Initialize the models' parameters (weights and biases) using a Gaussian distribution -for model in models_ppo.values(): - model.init_parameters(method_name="normal_", mean=0.0, std=0.1) +models_ppo["policy"] = Shared(env.observation_space, env.action_space, device) +models_ppo["value"] = models_ppo["policy"] # same instance: shared model # Configure and instantiate the agent. diff --git a/docs/source/examples/isaacgym/ppo_quadcopter.py b/docs/source/examples/isaacgym/ppo_quadcopter.py index 06289885..34ed47ed 100644 --- a/docs/source/examples/isaacgym/ppo_quadcopter.py +++ b/docs/source/examples/isaacgym/ppo_quadcopter.py @@ -19,42 +19,37 @@ set_seed(42) -# Define the models (stochastic and deterministic models) for the agent using mixins. 
-# - Policy: takes as input the environment's observation/state and returns an action -# - Value: takes the state as input and provides a value to guide the policy -class Policy(GaussianMixin, Model): +# Define the shared model (stochastic and deterministic models) for the agent using mixins. +class Shared(GaussianMixin, DeterministicMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False, - clip_log_std=True, min_log_std=-20, max_log_std=2): + clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"): Model.__init__(self, observation_space, action_space, device) - GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction) + DeterministicMixin.__init__(self, clip_actions) self.net = nn.Sequential(nn.Linear(self.num_observations, 256), nn.ELU(), nn.Linear(256, 256), nn.ELU(), nn.Linear(256, 128), - nn.ELU(), - nn.Linear(128, self.num_actions)) + nn.ELU()) + + self.mean_layer = nn.Linear(128, self.num_actions) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + self.value_layer = nn.Linear(128, 1) - def compute(self, states, taken_actions, role): - return self.net(states), self.log_std_parameter - -class Value(DeterministicMixin, Model): - def __init__(self, observation_space, action_space, device, clip_actions=False): - Model.__init__(self, observation_space, action_space, device) - DeterministicMixin.__init__(self, clip_actions) - - self.net = nn.Sequential(nn.Linear(self.num_observations, 256), - nn.ELU(), - nn.Linear(256, 256), - nn.ELU(), - nn.Linear(256, 128), - nn.ELU(), - nn.Linear(128, 1)) + def act(self, states, taken_actions, role): + if role == "policy": + return GaussianMixin.act(self, states, taken_actions, role) + elif role == "value": + return DeterministicMixin.act(self, states, taken_actions, role) def compute(self, states, taken_actions, role): - return self.net(states) + if role == "policy": + return self.mean_layer(self.net(states)), self.log_std_parameter + elif role == "value": + return self.value_layer(self.net(states)) # Load and wrap the Isaac Gym environment @@ -72,12 +67,8 @@ def compute(self, states, taken_actions, role): # PPO requires 2 models, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models models_ppo = {} -models_ppo["policy"] = Policy(env.observation_space, env.action_space, device) -models_ppo["value"] = Value(env.observation_space, env.action_space, device) - -# Initialize the models' parameters (weights and biases) using a Gaussian distribution -for model in models_ppo.values(): - model.init_parameters(method_name="normal_", mean=0.0, std=0.1) +models_ppo["policy"] = Shared(env.observation_space, env.action_space, device) +models_ppo["value"] = models_ppo["policy"] # same instance: shared model # Configure and instantiate the agent. diff --git a/docs/source/examples/isaacgym/ppo_shadow_hand.py b/docs/source/examples/isaacgym/ppo_shadow_hand.py index 2f184912..c7e8636a 100644 --- a/docs/source/examples/isaacgym/ppo_shadow_hand.py +++ b/docs/source/examples/isaacgym/ppo_shadow_hand.py @@ -19,14 +19,13 @@ set_seed(42) -# Define the models (stochastic and deterministic models) for the agent using mixins. 
-# - Policy: takes as input the environment's observation/state and returns an action -# - Value: takes the state as input and provides a value to guide the policy -class Policy(GaussianMixin, Model): +# Define the shared model (stochastic and deterministic models) for the agent using mixins. +class Shared(GaussianMixin, DeterministicMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False, - clip_log_std=True, min_log_std=-20, max_log_std=2): + clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"): Model.__init__(self, observation_space, action_space, device) - GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction) + DeterministicMixin.__init__(self, clip_actions) self.net = nn.Sequential(nn.Linear(self.num_observations, 512), nn.ELU(), @@ -35,30 +34,24 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, nn.Linear(512, 256), nn.ELU(), nn.Linear(256, 128), - nn.ELU(), - nn.Linear(128, self.num_actions)) + nn.ELU()) + + self.mean_layer = nn.Linear(128, self.num_actions) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + self.value_layer = nn.Linear(128, 1) - def compute(self, states, taken_actions, role): - return self.net(states), self.log_std_parameter - -class Value(DeterministicMixin, Model): - def __init__(self, observation_space, action_space, device, clip_actions=False): - Model.__init__(self, observation_space, action_space, device) - DeterministicMixin.__init__(self, clip_actions) - - self.net = nn.Sequential(nn.Linear(self.num_observations, 512), - nn.ELU(), - nn.Linear(512, 512), - nn.ELU(), - nn.Linear(512, 256), - nn.ELU(), - nn.Linear(256, 128), - nn.ELU(), - nn.Linear(128, 1)) + def act(self, states, taken_actions, role): + if role == "policy": + return GaussianMixin.act(self, states, taken_actions, role) + elif role == "value": + return DeterministicMixin.act(self, states, taken_actions, role) def compute(self, states, taken_actions, role): - return self.net(states) + if role == "policy": + return self.mean_layer(self.net(states)), self.log_std_parameter + elif role == "value": + return self.value_layer(self.net(states)) # Load and wrap the Isaac Gym environment @@ -76,12 +69,8 @@ def compute(self, states, taken_actions, role): # PPO requires 2 models, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models models_ppo = {} -models_ppo["policy"] = Policy(env.observation_space, env.action_space, device) -models_ppo["value"] = Value(env.observation_space, env.action_space, device) - -# Initialize the models' parameters (weights and biases) using a Gaussian distribution -for model in models_ppo.values(): - model.init_parameters(method_name="normal_", mean=0.0, std=0.1) +models_ppo["policy"] = Shared(env.observation_space, env.action_space, device) +models_ppo["value"] = models_ppo["policy"] # same instance: shared model # Configure and instantiate the agent. diff --git a/docs/source/examples/isaacgym/ppo_trifinger.py b/docs/source/examples/isaacgym/ppo_trifinger.py index 898d5bb6..87591dcf 100644 --- a/docs/source/examples/isaacgym/ppo_trifinger.py +++ b/docs/source/examples/isaacgym/ppo_trifinger.py @@ -19,14 +19,13 @@ set_seed(42) -# Define the models (stochastic and deterministic models) for the agent using mixins. 
-# - Policy: takes as input the environment's observation/state and returns an action -# - Value: takes the state as input and provides a value to guide the policy -class Policy(GaussianMixin, Model): +# Define the shared model (stochastic and deterministic models) for the agent using mixins. +class Shared(GaussianMixin, DeterministicMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False, - clip_log_std=True, min_log_std=-20, max_log_std=2): + clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"): Model.__init__(self, observation_space, action_space, device) - GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction) + DeterministicMixin.__init__(self, clip_actions) self.net = nn.Sequential(nn.Linear(self.num_observations, 256), nn.ELU(), @@ -35,30 +34,24 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, nn.Linear(256, 128), nn.ELU(), nn.Linear(128, 128), - nn.ELU(), - nn.Linear(128, self.num_actions)) + nn.ELU()) + + self.mean_layer = nn.Linear(128, self.num_actions) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + self.value_layer = nn.Linear(128, 1) - def compute(self, states, taken_actions, role): - return self.net(states), self.log_std_parameter - -class Value(DeterministicMixin, Model): - def __init__(self, observation_space, action_space, device, clip_actions=False): - Model.__init__(self, observation_space, action_space, device) - DeterministicMixin.__init__(self, clip_actions) - - self.net = nn.Sequential(nn.Linear(self.num_observations, 256), - nn.ELU(), - nn.Linear(256, 256), - nn.ELU(), - nn.Linear(256, 128), - nn.ELU(), - nn.Linear(128, 128), - nn.ELU(), - nn.Linear(128, 1)) + def act(self, states, taken_actions, role): + if role == "policy": + return GaussianMixin.act(self, states, taken_actions, role) + elif role == "value": + return DeterministicMixin.act(self, states, taken_actions, role) def compute(self, states, taken_actions, role): - return self.net(states) + if role == "policy": + return self.mean_layer(self.net(states)), self.log_std_parameter + elif role == "value": + return self.value_layer(self.net(states)) # Load and wrap the Isaac Gym environment @@ -76,12 +69,8 @@ def compute(self, states, taken_actions, role): # PPO requires 2 models, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models models_ppo = {} -models_ppo["policy"] = Policy(env.observation_space, env.action_space, device) -models_ppo["value"] = Value(env.observation_space, env.action_space, device) - -# Initialize the models' parameters (weights and biases) using a Gaussian distribution -for model in models_ppo.values(): - model.init_parameters(method_name="normal_", mean=0.0, std=0.1) +models_ppo["policy"] = Shared(env.observation_space, env.action_space, device) +models_ppo["value"] = models_ppo["policy"] # same instance: shared model # Configure and instantiate the agent. 
@@ -94,8 +83,6 @@ def compute(self, states, taken_actions, role): cfg_ppo["discount_factor"] = 0.99 cfg_ppo["lambda"] = 0.95 cfg_ppo["learning_rate"] = 3e-4 -cfg_ppo["learning_rate_scheduler"] = KLAdaptiveRL -cfg_ppo["learning_rate_scheduler_kwargs"] = {"kl_threshold": 0.016} cfg_ppo["random_timesteps"] = 0 cfg_ppo["learning_starts"] = 0 cfg_ppo["grad_norm_clip"] = 1.0 @@ -104,7 +91,7 @@ def compute(self, states, taken_actions, role): cfg_ppo["clip_predicted_values"] = True cfg_ppo["entropy_loss_scale"] = 0.0 cfg_ppo["value_loss_scale"] = 2.0 -cfg_ppo["kl_threshold"] = 0 +cfg_ppo["kl_threshold"] = 0.016 cfg_ppo["rewards_shaper"] = lambda rewards, timestep, timesteps: rewards * 0.01 cfg_ppo["state_preprocessor"] = RunningStandardScaler cfg_ppo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device} From 23dc7ceeda2a8f947d86d9706cc574a0428eb3f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Sat, 1 Oct 2022 09:59:34 +0200 Subject: [PATCH 093/108] Update the getting started section --- docs/source/intro/getting_started.rst | 531 +++++++++++++++++++++++++- 1 file changed, 510 insertions(+), 21 deletions(-) diff --git a/docs/source/intro/getting_started.rst b/docs/source/intro/getting_started.rst index a6259c1c..112a80ef 100644 --- a/docs/source/intro/getting_started.rst +++ b/docs/source/intro/getting_started.rst @@ -4,60 +4,549 @@ Getting Started **Reinforcement Learning (RL)** is a Machine Learning sub-field for decision making that allows an agent to learn from its interaction with the environment as shown in the following schema: .. image:: ../_static/imgs/rl_schema.svg - :width: 100% - :align: center - :alt: Reinforcement Learning schema + :width: 100% + :align: center + :alt: Reinforcement Learning schema .. raw:: html -
+
-At each step (also called timestep) of interaction with the environment, the agent sees an observation :math:`o_t` of the complete description of the state :math:`s_t \in S` of the environment. Then, it decides which action :math:`a_t \in A` to take from the action space using a policy. The environment, which changes in response to the agent's action (or by itself), returns a reward signal :math:`r_t = R(s_t, a_t, s_{t+1})` as a measure of how good or bad the action was that moved it to its new state :math:`s_{t+1}`. The agent aims to maximize the cumulative reward (discounted or not by a factor :math:`\gamma \in (0,1]`) by adjusting the policy's behaviour via some optimization algorithm +At each step (also called timestep) of interaction with the environment, the agent sees an observation :math:`o_t` of the complete description of the state :math:`s_t \in S` of the environment. Then, it decides which action :math:`a_t \in A` to take from the action space using a policy. The environment, which changes in response to the agent's action (or by itself), returns a reward signal :math:`r_t = R(s_t, a_t, s_{t+1})` as a measure of how good or bad the action was that moved it to its new state :math:`s_{t+1}`. The agent aims to maximize the cumulative reward (discounted or not by a factor :math:`\gamma \in (0,1]`) by adjusting the policy's behaviour via some optimization algorithm. -**Based on this schema, this section intends to guide, step by step, in the creation of an RL system** +**From this schema, this section is intended to guide in the creation of a RL system using skrl**. Visit the :ref:`Examples ` section for training and evaluation demonstrations with different environment interfaces and highlighted practices, among others. 1. Environments --------------- -The environment plays a fundamental role in the definition of the RL schema. For example, the selection of the agent depends strongly on the observation and action space nature. There are several interfaces to interact with the environments such as OpenAI Gym or DeepMind. However, each of them has a different API and work with non-compatible data types +The environment plays a fundamental role in the definition of the RL schema. For example, the selection of the agent depends strongly on the observation and action space nature. There are several interfaces to interact with the environments such as OpenAI Gym or DeepMind. However, each of them has a different API and work with non-compatible data types. -skrl offers a function to **wrap environments** based on the OpenAI Gym, DeepMind, Isaac Gym and Omniverse Isaac Gym interfaces (the last two have slight differences with OpenAI Gym) and offer, for library components, a common interface (based on OpenAI Gym) as shown in the following figure. Refer to the :doc:`Wrapping <../modules/skrl.envs.wrapping>` section for more information +skrl offers a function to **wrap environments** based on the OpenAI Gym, DeepMind, Isaac Gym and Omniverse Isaac Gym interfaces (the last two have slight differences with OpenAI Gym) and offer, for library components, a common interface (based on OpenAI Gym) as shown in the following figure. Refer to the :doc:`Wrapping <../modules/skrl.envs.wrapping>` section for more information. .. image:: ../_static/imgs/wrapping.svg - :width: 100% - :align: center - :alt: Environment wrapping + :width: 100% + :align: center + :alt: Environment wrapping -.. 
raw:: html +Within the methods and properties defined in the wrapped environment, the observation and action space are one of the most relevant for instantiating other library components. The following code snippets show how to load and wrap environments based on the supported interfaces: + +.. tabs:: + + .. tab:: Omniverse Isaac Gym + + .. tabs:: + + .. tab:: Common environment + + .. code-block:: python + + # import the environment wrapper and loader + from skrl.envs.torch import wrap_env + from skrl.envs.torch import load_omniverse_isaacgym_env + + # load the environment + env = load_omniverse_isaacgym_env(task_name="Cartpole") + + # wrap the environment + env = wrap_env(env) # or 'env = wrap_env(env, wrapper="omniverse-isaacgym")' + + .. tab:: Multi-threaded environment + + .. code-block:: python + + # import the environment wrapper and loader + from skrl.envs.torch import wrap_env + from skrl.envs.torch import load_omniverse_isaacgym_env + + # load the multi-threaded environment + env = load_omniverse_isaacgym_env(task_name="Cartpole", multi_threaded=True, timeout=30) + + # wrap the environment + env = wrap_env(env) # or 'env = wrap_env(env, wrapper="omniverse-isaacgym")' + + .. tab:: Isaac Gym + + .. tabs:: + + .. tab:: Preview 4 (isaacgymenvs.make) + + .. code-block:: python + + import isaacgymenvs + + # import the environment wrapper + from skrl.envs.torch import wrap_env + + # create/load the environment using the easy-to-use API from NVIDIA + env = isaacgymenvs.make(seed=0, + task="Cartpole", + num_envs=512, + sim_device="cuda:0", + rl_device="cuda:0", + graphics_device_id=0, + headless=False) + + # wrap the environment + env = wrap_env(env) # or 'env = wrap_env(env, wrapper="isaacgym-preview4")' + + .. tab:: Preview 4 + + .. code-block:: python + + # import the environment wrapper and loader + from skrl.envs.torch import wrap_env + from skrl.envs.torch import load_isaacgym_env_preview4 + + # load the environment + env = load_isaacgym_env_preview4(task_name="Cartpole") + + # wrap the environment + env = wrap_env(env) # or 'env = wrap_env(env, wrapper="isaacgym-preview4")' + + .. tab:: Preview 3 + + .. code-block:: python + + # import the environment wrapper and loader + from skrl.envs.torch import wrap_env + from skrl.envs.torch import load_isaacgym_env_preview3 + + # load the environment + env = load_isaacgym_env_preview3(task_name="Cartpole") + + # wrap the environment + env = wrap_env(env) # or 'env = wrap_env(env, wrapper="isaacgym-preview3")' + + .. tab:: Preview 2 + + .. code-block:: python + + # import the environment wrapper and loader + from skrl.envs.torch import wrap_env + from skrl.envs.torch import load_isaacgym_env_preview2 + + # load the environment + env = load_isaacgym_env_preview2(task_name="Cartpole") + + # wrap the environment + env = wrap_env(env) # or 'env = wrap_env(env, wrapper="isaacgym-preview2")' + + .. tab:: OpenAI Gym + + .. tabs:: + + .. tab:: Single environment + + .. code-block:: python + + # import the environment wrapper and gym + from skrl.envs.torch import wrap_env + import gym + + # load environment + env = gym.make('Pendulum-v1') + + # wrap the environment + env = wrap_env(env) # or 'env = wrap_env(env, wrapper="gym")' + + .. tab:: Vectorized environment -
+ Visit the OpenAI Gym documentation (`Vector API `_) for more information about the creation and usage of vectorized environments. + + .. code-block:: python + + # import the environment wrapper and gym + from skrl.envs.torch import wrap_env + import gym + + # load a vectorized environment + env = gym.vector.make("Pendulum-v1", num_envs=10, asynchronous=False) + + # wrap the environment + env = wrap_env(env) # or 'env = wrap_env(env, wrapper="gym")' + + .. tab:: DeepMind + + .. code-block:: python + + # import the environment wrapper and the deepmind suite + from skrl.envs.torch import wrap_env + from dm_control import suite + + # load environment + env = suite.load(domain_name="cartpole", task_name="swingup") + + # wrap the environment + env = wrap_env(env) # or 'env = wrap_env(env, wrapper="dm")' + +Once the environment is known (and instantiated), it is time to configure and instantiate the agent. Agents are composed, apart from the optimization algorithm, by several components, such as memories, models or noises, for example, according to their nature. The following subsections focus on those components. 2. Memories ----------- -:red:`Under construction...` +Memories are storage components that allow agents to collect and use/reuse recent or past experiences or other types of information. These can be large in size (such as replay buffers used by off-policy algorithms like DDPG, TD3 or SAC) or small in size (such as rollout buffers used by on-policy algorithms like PPO or TRPO to store batches that are discarded after use). + +skrl provides **generic memory definitions** that are not tied to the agent implementation and can be used for any role, such as rollout or replay buffers. They are empty shells when they are instantiated and the agents are in charge of defining the tensors according to their needs. The total space occupied is the product of the memory size (:literal:`memory_size`), the number of environments (:literal:`num_envs`) obtained from the wrapped environment and the data size for each defined tensor. + +The following code snippets show how to instantiate a memory: + +.. tabs:: + + .. tab:: Random memory + + .. code-block:: python + + from skrl.memories.torch import RandomMemory + + # instantiate a memory + memory = RandomMemory(memory_size=100000, num_envs=env.num_envs) + +Memories are passed directly to the agent constructor, if required (not all agents require memory, such as Q-learning or SARSA, for example), during its instantiation under the argument :literal:`memory`. 3. Models --------- -:red:`Under construction...` +Models are the agents' brains. Agents can have one or several models and their parameters are adjusted via the optimization algorithms. + +In contrast to other libraries, skrl does not provide predefined models or fixed templates (this practice tends to hide and reduce the flexibility of the system, forcing developers to deeply inspect the code to make even small changes). Nevertheless, **helper classes/mixins are provided** to create discrete and continuous (stochastic or deterministic) models with the library. In this way, the user/researcher should only be concerned with the definition of the approximation functions (tables or artificial neural networks), having all the control in his hands. + +The following code snippets show how to define a model, based on the concept of each respective image, using the provided classes/mixins. 
For more information refer to :ref:`Categorical `, :ref:`Gaussian `, :ref:`Multivariate Gaussian ` and :ref:`Deterministic ` sections for artificial neural networks models, and :ref:`Tabular ` section for tabular models. + +.. tabs:: + + .. tab:: Categorical + + .. image:: ../_static/imgs/model_categorical.svg + :width: 100% + :align: center + :alt: Categorical model + + .. raw:: html + +
+ + .. code-block:: python + + import torch + import torch.nn as nn + from skrl.models.torch import Model, CategoricalMixin + + # define the model + class Policy(CategoricalMixin, Model): + def __init__(self, observation_space, action_space, device="cuda:0", unnormalized_log_prob=True): + Model.__init__(self, observation_space, action_space, device) + CategoricalMixin.__init__(self, unnormalized_log_prob) + + self.net = nn.Sequential(nn.Linear(self.num_observations, 32), + nn.ELU(), + nn.Linear(32, 32), + nn.ELU(), + nn.Linear(32, self.num_actions)) + + def compute(self, states, taken_actions, role): + return self.net(states) + + .. tab:: Gaussian + + .. image:: ../_static/imgs/model_gaussian.svg + :width: 100% + :align: center + :alt: Gaussian model + + .. raw:: html + +
+ + .. code-block:: python + + import torch + import torch.nn as nn + from skrl.models.torch import Model, GaussianMixin + + # define the model + class Policy(GaussianMixin, Model): + def __init__(self, observation_space, action_space, device="cuda:0", + clip_actions=False, clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"): + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction) + + self.net = nn.Sequential(nn.Linear(self.num_observations, 32), + nn.ELU(), + nn.Linear(32, 32), + nn.ELU(), + nn.Linear(32, self.num_actions)) + self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + def compute(self, states, taken_actions, role): + return self.net(states), self.log_std_parameter + + .. tab:: Multivariate Gaussian + + .. image:: ../_static/imgs/model_multivariate_gaussian.svg + :width: 100% + :align: center + :alt: Multivariate Gaussian model + + .. raw:: html + +
+ + .. code-block:: python + + import torch + import torch.nn as nn + from skrl.models.torch import Model, MultivariateGaussianMixin + + # define the model + class Policy(MultivariateGaussianMixin, Model): + def __init__(self, observation_space, action_space, device="cuda:0", + clip_actions=False, clip_log_std=True, min_log_std=-20, max_log_std=2): + Model.__init__(self, observation_space, action_space, device) + MultivariateGaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) + + self.net = nn.Sequential(nn.Linear(self.num_observations, 32), + nn.ELU(), + nn.Linear(32, 32), + nn.ELU(), + nn.Linear(32, self.num_actions)) + self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + def compute(self, states, taken_actions, role): + return self.net(states), self.log_std_parameter + + .. tab:: Deterministic + + .. image:: ../_static/imgs/model_deterministic.svg + :width: 60% + :align: center + :alt: Deterministic model + + .. raw:: html + +
+ + .. code-block:: python + + import torch + import torch.nn as nn + from skrl.models.torch import Model, DeterministicMixin + + # define the model + class Policy(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device="cuda:0", clip_actions=False): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) + + self.net = nn.Sequential(nn.Linear(self.num_observations, 32), + nn.ELU(), + nn.Linear(32, 32), + nn.ELU(), + nn.Linear(32, self.num_actions)) + + def compute(self, states, taken_actions, role): + return self.net(states) + + .. tab:: Tabular + + .. code-block:: python + + import torch + from skrl.models.torch import Model, TabularMixin + + # define the model + class Policy(TabularMixin, Model): + def __init__(self, observation_space, action_space, device="cuda:0", num_envs=1): + Model.__init__(self, observation_space, action_space, device) + TabularMixin.__init__(self, num_envs) + + self.table = torch.ones((num_envs, self.num_observations, self.num_actions), + dtype=torch.float32, device=self.device) + + def compute(self, states, taken_actions, role): + actions = torch.argmax(self.table[torch.arange(self.num_envs).view(-1, 1), states], + dim=-1, keepdim=True).view(-1,1) + return actions + +Models must be collected in a dictionary and passed to the agent constructor during its instantiation under the argument :literal:`models`. The dictionary keys are specific to each agent. Visit their respective documentation for more details (under *Spaces and models* section). For example, the PPO agent requires the policy and value models as shown below: + +.. code-block:: python + + models = {} + models["policy"] = Policy(env.observation_space, env.action_space, env.device) + models["value"] = Value(env.observation_space, env.action_space, env.device) + +Models can be saved and loaded to and from the file system. However, the recommended practice for loading checkpoints to perform evaluations or continue an interrupted training is through the agents (they include, in addition to the models, other components and internal instances such as preprocessors or optimizers). Refer to :ref:`Saving, loading and logging ` (under *Checkpoints* section) for more information. 4. Noises --------- -:red:`Under construction...` +Noise plays a fundamental role in the exploration stage, especially in agents of a deterministic nature, such as DDPG or TD3. + +skrl provides, as part of its resources, **classes for instantiating noises** as shown in the following code snippets. Refer to :ref:`Noises ` documentation for more information. + +.. tabs:: + + .. tab:: Gaussian noise + + .. code-block:: python + + from skrl.resources.noises.torch import GaussianNoise + + # instantiate a noise + noise = GaussianNoise(mean=0, std=0.2, device=env.device) + + .. tab:: Ornstein-Uhlenbeck noise + + .. code-block:: python + + from skrl.resources.noises.torch import OrnsteinUhlenbeckNoise + + # instantiate a noise + noise = OrnsteinUhlenbeckNoise(theta=0.15, sigma=0.2, base_scale=1.0, device=env.device) + +Noise instances are passed to the agents in their respective configuration dictionaries. For example, the DDPG agent requires the exploration noise as shown below: + +.. code-block:: python + + from skrl.agents.torch.ddpg import DDPG, DDPG_DEFAULT_CONFIG + + agent_cfg = DDPG_DEFAULT_CONFIG.copy() + agent_cfg["exploration"]["noise"] = noise 5.
Learning rate schedulers --------------------------- -:red:`Under construction...` +Learning rate schedulers help the RL system converge faster and improve accuracy. + +skrl **supports all PyTorch learning rate schedulers** and provides, as part of its resources, **additional schedulers**. Refer to :ref:`Learning rate schedulers ` documentation for more information. + +Learning rate scheduler classes and their respective arguments (except the :literal:`optimizer` argument) are passed to the agents in their respective configuration dictionaries. For example, for the PPO agent, one of the schedulers can be configured as shown below: + +.. code-block:: python + + from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG + from skrl.resources.schedulers.torch import KLAdaptiveRL + + agent_cfg = PPO_DEFAULT_CONFIG.copy() + agent_cfg["learning_rate_scheduler"] = KLAdaptiveRL + agent_cfg["learning_rate_scheduler_kwargs"] = {"kl_threshold": 0.008} + +6. Preprocessors +---------------- -6. Agents +Data preprocessing can help increase the accuracy and efficiency of training by cleaning the data or making it suitable for machine learning models. + +skrl provides, as part of its resources, **preprocessor** classes. Refer to :ref:`Preprocessors ` documentation for more information. + +Preprocessor classes and their respective arguments are passed to the agents in their respective configuration dictionaries. For example, for the PPO agent, one of the preprocessors can be configured as shown below: + +.. code-block:: python + + from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG + from skrl.resources.preprocessors.torch import RunningStandardScaler + + agent_cfg = PPO_DEFAULT_CONFIG.copy() + agent_cfg["state_preprocessor"] = RunningStandardScaler + agent_cfg["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": env.device} + agent_cfg["value_preprocessor"] = RunningStandardScaler + agent_cfg["value_preprocessor_kwargs"] = {"size": 1, "device": env.device} + +7. Agents +--------- -:red:`Under construction...` +Agents are the components in charge of decision making. They are much more than models (neural networks, for example) and include the optimization algorithms that compute the optimal policy. + +skrl provides **state-of-the-art agents**. Their implementations are focused on readability, simplicity and code transparency. Each agent is implemented independently, even when two or more agents may contain code in common. Refer to each agent's documentation for more information about the models and spaces they support, their respective configurations, algorithm details and more.
+ + * :doc:`Advantage Actor Critic <../modules/skrl.agents.a2c>` (**A2C**) + * :doc:`Adversarial Motion Priors <../modules/skrl.agents.amp>` (**AMP**) + * :doc:`Cross-Entropy Method <../modules/skrl.agents.cem>` (**CEM**) + * :doc:`Deep Deterministic Policy Gradient <../modules/skrl.agents.ddpg>` (**DDPG**) + * :doc:`Double Deep Q-Network <../modules/skrl.agents.ddqn>` (**DDQN**) + * :doc:`Deep Q-Network <../modules/skrl.agents.dqn>` (**DQN**) + * :doc:`Proximal Policy Optimization <../modules/skrl.agents.ppo>` (**PPO**) + * :doc:`Q-learning <../modules/skrl.agents.q_learning>` (**Q-learning**) + * :doc:`Soft Actor-Critic <../modules/skrl.agents.sac>` (**SAC**) + * :doc:`State Action Reward State Action <../modules/skrl.agents.sarsa>` (**SARSA**) + * :doc:`Twin-Delayed DDPG <../modules/skrl.agents.td3>` (**TD3**) + * :doc:`Trust Region Policy Optimization <../modules/skrl.agents.trpo>` (**TRPO**) + +Agents generally expect, as arguments, the following components: models and memories, as well as the following variables: observation and action spaces, the device where their logic is executed, and a configuration dictionary with hyperparameters and other values. The remaining components, mentioned above, are collected through the configuration dictionary. For example, the PPO agent can be instantiated as follows: + +.. code-block:: python -7. Trainers + from skrl.agents.torch.ppo import PPO + + agent = PPO(models=models, # models dict + memory=memory, # memory instance, or None if not required + cfg=agent_cfg, # configuration dict (preprocessors, learning rate schedulers, etc.) + observation_space=env.observation_space, + action_space=env.action_space, + device=env.device) + +Agents can be saved and loaded to and from the file system. This is the **recommended practice** for loading checkpoints to perform evaluations or to continue interrupted training (since they include, in addition to models, other internal components and instances such as preprocessors or optimizers). Refer to :ref:`Saving, loading and logging ` (under *Checkpoints* section) for more information. + +8. Trainers ----------- -:red:`Under construction...` +Now that both actors, the environment and the agent, are instantiated, it is time to put the RL system in motion. + +skrl offers classes (called **trainers**) that manage the interaction cycle between the environment and the agent(s) for both training and evaluation. These classes also enable the simultaneous training and evaluation of several agents by scope (subsets of environments among all available environments), which may or may not share resources, in the same run. + +The following code snippets show how to instantiate and use the available trainers to train or evaluate the agent(s): + +.. tabs:: + + .. tab:: Sequential trainer + + .. code-block:: python + + from skrl.trainers.torch import SequentialTrainer + + # create a sequential trainer + cfg = {"timesteps": 50000, "headless": False} + trainer = SequentialTrainer(env=env, agents=[agent], cfg=cfg) + + # train the agent(s) + trainer.train() + + # evaluate the agent(s) + trainer.eval() + + .. tab:: Parallel trainer + + .. code-block:: python + + from skrl.trainers.torch import ParallelTrainer + + # create a parallel trainer + cfg = {"timesteps": 50000, "headless": False} + trainer = ParallelTrainer(env=env, agents=[agent], cfg=cfg) + + # train the agent(s) + trainer.train() + + # evaluate the agent(s) + trainer.eval() + + .. tab:: Manual trainer + + ..
code-block:: python + + from skrl.trainers.torch import ManualTrainer + + # create a manual trainer + cfg = {"timesteps": 50000, "headless": False} + trainer = ManualTrainer(env=env, agents=[agent], cfg=cfg) + + # train the agent(s) + trainer.train() + + # evaluate the agent(s) + trainer.eval() + +.. raw:: html + +
+ +**What's next?** + +Visit the :ref:`Examples ` section for training and evaluation demonstrations with different environment interfaces and highlighted practices, among others. From 66d4dd81fcc672ee00ad91b9b8af3c55effb1bf0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Sat, 1 Oct 2022 12:16:24 +0200 Subject: [PATCH 094/108] Update data checkpoints section --- docs/source/intro/data.rst | 164 ++++++++++++++++++++++++++++--------- 1 file changed, 125 insertions(+), 39 deletions(-) diff --git a/docs/source/intro/data.rst b/docs/source/intro/data.rst index fa018b8f..eb09f06c 100644 --- a/docs/source/intro/data.rst +++ b/docs/source/intro/data.rst @@ -1,3 +1,5 @@ +.. _data: + Saving, loading and logging =========================== @@ -7,9 +9,9 @@ Tracking metrics (TensorBoard) Configuration ^^^^^^^^^^^^^ -`TensorBoard `_ is used for tracking and visualizing metrics and scalars (coefficients, losses, etc.). The tracking and writing of metrics and scalars is the responsibility of the agents (**can be customized independently for each agent using its configuration dictionary**) +`TensorBoard `_ is used for tracking and visualizing metrics and scalars (coefficients, losses, etc.). The tracking and writing of metrics and scalars is the responsibility of the agents (**can be customized independently for each agent using its configuration dictionary**). -Each agent offers the following parameters under the :literal:`"experiment"` key +Each agent offers the following parameters under the :literal:`"experiment"` key: .. code-block:: python :emphasize-lines: 5,6,7 @@ -27,16 +29,16 @@ Each agent offers the following parameters under the :literal:`"experiment"` key } } -* **directory**: directory path where the data generated by the experiments (a subdirectory) are stored. If no value is set, the :literal:`runs` folder (inside the current working directory) will be used (and created if it does not exist) +* **directory**: directory path where the data generated by the experiments (a subdirectory) are stored. If no value is set, the :literal:`runs` folder (inside the current working directory) will be used (and created if it does not exist). -* **experiment_name**: name of the experiment (subdirectory). If no value is set, it will be the current date and time and the agent's name (e.g. :literal:`22-01-09_22-48-49-816281_DDPG`) +* **experiment_name**: name of the experiment (subdirectory). If no value is set, it will be the current date and time and the agent's name (e.g. :literal:`22-01-09_22-48-49-816281_DDPG`). -* **write_interval**: interval for writing metrics and values to TensorBoard (default is 250 timesteps). A value equal to or less than 0 disables tracking and writing to TensorBoard +* **write_interval**: interval for writing metrics and values to TensorBoard (default is 250 timesteps). A value equal to or less than 0 disables tracking and writing to TensorBoard. Tracked metrics/scales visualization ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -To visualize the tracked metrics/scales, during or after the training, TensorBoard can be launched using the following command in a terminal +To visualize the tracked metrics/scales, during or after the training, TensorBoard can be launched using the following command in a terminal: .. 
code-block:: bash @@ -104,7 +106,7 @@ Tracking custom metrics/scales * **Tracking custom data attached to the agent's control and timing logic (recommended)** - Although the TensorBoard's writing control and timing logic is controlled by the base class Agent, it is possible to track custom data. The :literal:`track_data` method can be used (see :doc:`Agent <../modules/skrl.agents.base_class>` class for more details), passing as arguments the data identification (tag) and the scalar value to be recorded + Although the TensorBoard's writing control and timing logic is controlled by the base class Agent, it is possible to track custom data. The :literal:`track_data` method can be used (see :doc:`Agent <../modules/skrl.agents.base_class>` class for more details), passing as arguments the data identification (tag) and the scalar value to be recorded. For example, to track the current CPU usage, the following code can be used: @@ -115,7 +117,7 @@ Tracking custom metrics/scales * **Tracking custom data directly to Tensorboard** - It is also feasible to access directly to the `SummaryWriter `_ instance through the :literal:`writer` property if it is desired to write directly to Tensorboard, avoiding the base class's control and timing logic + It is also feasible to access directly to the `SummaryWriter `_ instance through the :literal:`writer` property if it is desired to write directly to Tensorboard, avoiding the base class's control and timing logic. For example, to write directly to TensorBoard: @@ -132,9 +134,9 @@ Checkpoints Saving checkpoints ^^^^^^^^^^^^^^^^^^ -The checkpoints are saved in the :literal:`checkpoints` subdirectory of the experiment's directory (its path can be customized using the options described in the previous subsection). The checkpoint name is the key referring to the agent (or models, optimizers and preprocessors) and the current timestep (e.g. :literal:`runs/22-01-09_22-48-49-816281_DDPG/checkpoints/agent_2500.pt`) +The checkpoints are saved in the :literal:`checkpoints` subdirectory of the experiment's directory (its path can be customized using the options described in the previous subsection). The checkpoint name is the key referring to the agent (or models, optimizers and preprocessors) and the current timestep (e.g. :literal:`runs/22-01-09_22-48-49-816281_DDPG/checkpoints/agent_2500.pt`). -The checkpoint management, as in the previous case, is the responsibility of the agents (**can be customized independently for each agent using its configuration dictionary**) +The checkpoint management, as in the previous case, is the responsibility of the agents (**can be customized independently for each agent using its configuration dictionary**). .. code-block:: python :emphasize-lines: 9,10 @@ -152,57 +154,141 @@ The checkpoint management, as in the previous case, is the responsibility of the } } -* **checkpoint_interval**: interval for checkpoints (default is 1000 timesteps). A value equal to or less than 0 disables the checkpoint creation +* **checkpoint_interval**: interval for checkpoints (default is 1000 timesteps). A value equal to or less than 0 disables the checkpoint creation. -* **store_separately**: if set to :literal:`True`, all the modules that an agent contains (models, optimizers, preprocessors, etc.) will be saved each one in a separate file. 
By default (:literal:`False`) the modules are grouped in a dictionary and stored in the same file +* **store_separately**: if set to :literal:`True`, all the modules that an agent contains (models, optimizers, preprocessors, etc.) will each be saved in a separate file. By default (:literal:`False`) the modules are grouped in a dictionary and stored in the same file. **Checkpointing the best models** -The best models, attending the mean total reward, will be saved in the :literal:`checkpoints` subdirectory of the experiment's directory. The checkpoint name is the word :literal:`best` and the key referring to the model (e.g. :literal:`runs/22-01-09_22-48-49-816281_DDPG/checkpoints/best_agent.pt`) +The best models, according to the mean total reward, will be saved in the :literal:`checkpoints` subdirectory of the experiment's directory. The checkpoint name is the word :literal:`best` and the key referring to the model (e.g. :literal:`runs/22-01-09_22-48-49-816281_DDPG/checkpoints/best_agent.pt`). -The best models are updated internally on each TensorBoard writing interval :literal:`"write_interval"` and they are saved on each checkpoint interval :literal:`"checkpoint_interval"`. The :literal:`"store_separately"` key specifies whether the best modules are grouped and stored together or separately +The best models are updated internally on each TensorBoard writing interval :literal:`"write_interval"` and they are saved on each checkpoint interval :literal:`"checkpoint_interval"`. The :literal:`"store_separately"` key specifies whether the best modules are grouped and stored together or separately. Loading checkpoints ^^^^^^^^^^^^^^^^^^^ -Checkpoints can be loaded for each of the instantiated agents (or models) independently via the :literal:`.load(...)` method (`Agent.load <../modules/skrl.agents.base_class.html#skrl.agents.torch.base.Agent.load>`_ or `Model.load <../modules/skrl.models.base_class.html#skrl.models.torch.base.Model.load>`_). It accepts the path (relative or absolute) of the checkpoint to load as the only argument. The checkpoint will be dynamically mapped to the device specified as argument in the class constructor (internally the torch load's :literal:`map_location` method is used during loading) +Checkpoints can be loaded for each of the instantiated agents (or models) independently via the :literal:`.load(...)` method (`Agent.load <../modules/skrl.agents.base_class.html#skrl.agents.torch.base.Agent.load>`_ or `Model.load <../modules/skrl.models.base_class.html#skrl.models.torch.base.Model.load>`_). It accepts the path (relative or absolute) of the checkpoint to load as the only argument. The checkpoint will be dynamically mapped to the device specified as argument in the class constructor (internally, torch load's :literal:`map_location` argument is used during loading). .. note:: - The agents or models instances must have the same architecture/structure as the one used to save the checkpoint. The current implementation load the model's `state_dict `_ directly + The agent or model instances must have the same architecture/structure as the one used to save the checkpoint. The current implementation loads the model's `state_dict `_ directly. -The following code shows how to load the checkpoint (e.g. :literal:`runs/22-01-09_22-48-49-816281_DDPG/checkpoints/2500_policy.pt`) of an instantiated policy from a specific definition.
See the :ref:`Examples ` section for showcases about how to load control points and use them to continue the training or evaluate experiments +The following code snippets show how to load the checkpoints through the instantiated agent (recommended) or models. See the :ref:`Examples ` section for showcases about how to load control points and use them to continue the training or evaluate experiments. -.. code-block:: python - :emphasize-lines: 21 +.. tabs:: + + .. tab:: Agent (recommended) + + .. code-block:: python + :emphasize-lines: 12 + + from skrl.agents.torch.ppo import PPO + + # Instantiate the agent + agent = PPO(models=models, # models dict + memory=memory, # memory instance, or None if not required + cfg=agent_cfg, # configuration dict (preprocessors, learning rate schedulers, etc.) + observation_space=env.observation_space, + action_space=env.action_space, + device=env.device) + + # Load the checkpoint + agent.load("./runs/22-09-29_22-48-49-816281_DDPG/checkpoints/agent_1200.pt") + + .. tab:: Model - from skrl.models.torch import DeterministicModel + .. code-block:: python + :emphasize-lines: 22 - # Define the model - class Policy(DeterministicModel): - def __init__(self, observation_space, action_space, device, clip_actions = False): - super().__init__(observation_space, action_space, device, clip_actions) + from skrl.models.torch import Model, DeterministicMixin - self.net = nn.Sequential(nn.Linear(self.num_observations, 32), - nn.ReLU(), - nn.Linear(32, 32), - nn.ReLU(), - nn.Linear(32, self.num_actions)) + # Define the model + class Policy(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) - def compute(self, states, taken_actions): - return self.net(states) + self.net = nn.Sequential(nn.Linear(self.num_observations, 32), + nn.ReLU(), + nn.Linear(32, 32), + nn.ReLU(), + nn.Linear(32, self.num_actions)) - # Instantiate the agent's model - policy = Policy(env.observation_space, env.action_space, device, clip_actions=True) + def compute(self, states, taken_actions, role): + return self.net(states) - # Load the checkpoint - policy.load("./runs/22-01-09_22-48-49-816281_DDPG/checkpoints/2500_policy.pt") + # Instantiate the model + policy = Policy(env.observation_space, env.action_space, env.device, clip_actions=True) + + # Load the checkpoint + policy.load("./runs/22-09-29_22-48-49-816281_DDPG/checkpoints/2500_policy.pt") Migrating external checkpoints ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ It is possible to load checkpoints generated with external reinforcement learning libraries into skrl agents (or models) via the :literal:`.migrate(...)` method (`Agent.migrate <../modules/skrl.agents.base_class.html#skrl.agents.torch.base.Agent.migrate>`_ or `Model.migrate <../modules/skrl.models.base_class.html#skrl.models.torch.base.Model.migrate>`_). +.. note:: + + In some cases it will be necessary to specify a parameter mapping, especially in ambiguous models (where 2 or more parameters, for source or current model, have equal shape). Refer to the respective method documentation for more details in these cases. + +The following code snippets show how to migrate checkpoints from other libraries to the agents or models implemented in skrl: + +.. tabs:: + + .. tab:: Agent + + .. 
code-block:: python + :emphasize-lines: 12 + + from skrl.agents.torch.ppo import PPO + + # Instantiate the agent + agent = PPO(models=models, # models dict + memory=memory, # memory instance, or None if not required + cfg=agent_cfg, # configuration dict (preprocessors, learning rate schedulers, etc.) + observation_space=env.observation_space, + action_space=env.action_space, + device=env.device) + + # Migrate a rl_games checkpoint + agent.migrate(path="./runs/Cartpole/nn/Cartpole.pth") + + .. tab:: Model + + .. code-block:: python + :emphasize-lines: 22, 25, 28-29 + + from skrl.models.torch import Model, DeterministicMixin + + # Define the model + class Policy(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) + + self.net = nn.Sequential(nn.Linear(self.num_observations, 32), + nn.ReLU(), + nn.Linear(32, 32), + nn.ReLU(), + nn.Linear(32, self.num_actions)) + + def compute(self, states, taken_actions, role): + return self.net(states) + + # Instantiate the model + policy = Policy(env.observation_space, env.action_space, env.device, clip_actions=True) + + # Migrate a rl_games checkpoint (only the model) + policy.migrate(path="./runs/Cartpole/nn/Cartpole.pth") + + # or migrate a stable-baselines3 checkpoint + policy.migrate(path="./ddpg_pendulum.zip") + + # or migrate a checkpoint of any other library + state_dict = torch.load("./external_model.pt") + policy.migrate(state_dict=state_dict) + -------------------- Memory export/import @@ -211,7 +297,7 @@ Memory export/import Exporting memories ^^^^^^^^^^^^^^^^^^ -Memories can be automatically exported to files at each filling cycle (before data overwriting is performed). Its activation, the output files' format and their path can be modified through the constructor parameters when an instance is created +Memories can be automatically exported to files at each filling cycle (before data overwriting is performed). Its activation, the output files' format and their path can be modified through the constructor parameters when an instance is created. .. code-block:: python :emphasize-lines: 7-9 @@ -226,11 +312,11 @@ Memories can be automatically exported to files at each filling cycle (before da export_format="pt", export_directory="./memories") -* **export**: enable or disable the memory export (default is disabled) +* **export**: enable or disable the memory export (default is disabled). -* **export_format**: the format of the exported memory (default is :literal:`"pt"`). Supported formats are PyTorch (:literal:`"pt"`), NumPy (:literal:`"np"`) and Comma-separated values (:literal:`"csv"`) +* **export_format**: the format of the exported memory (default is :literal:`"pt"`). Supported formats are PyTorch (:literal:`"pt"`), NumPy (:literal:`"np"`) and Comma-separated values (:literal:`"csv"`). -* **export_directory**: the directory where the memory will be exported (default is :literal:`"memory"`) +* **export_directory**: the directory where the memory will be exported (default is :literal:`"memory"`). 
Importing memories ^^^^^^^^^^^^^^^^^^ From 8d0f6633c7042812a8a8558ca17c137e18f0625d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Sat, 1 Oct 2022 14:26:00 +0200 Subject: [PATCH 095/108] Update Isaac Gym and Omniverse Isaac Gym examples in docs --- docs/source/intro/examples.rst | 114 +++++++++++++++++++++++++++------ 1 file changed, 94 insertions(+), 20 deletions(-) diff --git a/docs/source/intro/examples.rst b/docs/source/intro/examples.rst index 0b4c5b88..ffeb0791 100644 --- a/docs/source/intro/examples.rst +++ b/docs/source/intro/examples.rst @@ -300,6 +300,10 @@ The PPO agent configuration is mapped, as far as possible, from the rl_games' A2 .. code-block:: bash + # memory + memory_size = horizon_length + + # agent rollouts = horizon_length learning_epochs = mini_epochs mini_batches = horizon_length * num_actors / minibatch_size @@ -319,6 +323,9 @@ The PPO agent configuration is mapped, as far as possible, from the rl_games' A2 kl_threshold = 0 rewards_shaper = lambda rewards, timestep, timesteps: rewards * scale_value + # trainer + timesteps = horizon_length * max_epochs + .. note:: Isaac Gym environments implement a functionality to get their configuration from the command line. Because of this feature, setting the :literal:`headless` option from the trainer configuration will not work. In this case, it is necessary to invoke the scripts as follows: :literal:`python script.py headless=True` for Isaac Gym environments (preview 3 and preview 4) or :literal:`python script.py --headless` for Isaac Gym environments (preview 2) @@ -335,7 +342,7 @@ The PPO agent configuration is mapped, as far as possible, from the rl_games' A2 .. literalinclude:: ../examples/isaacgym/ppo_allegro_hand.py :language: python - :emphasize-lines: 2, 61-67 + :emphasize-lines: 2, 19, 56-62 .. tab:: Ant @@ -343,7 +350,7 @@ The PPO agent configuration is mapped, as far as possible, from the rl_games' A2 .. literalinclude:: ../examples/isaacgym/ppo_ant.py :language: python - :emphasize-lines: 13-14, 61-62 + :emphasize-lines: 13-14, 56-57 .. tab:: Anymal @@ -351,7 +358,7 @@ The PPO agent configuration is mapped, as far as possible, from the rl_games' A2 .. literalinclude:: ../examples/isaacgym/ppo_anymal.py :language: python - :emphasize-lines: 13-14, 61-62 + :emphasize-lines: 13-14, 56-57 .. tab:: AnymalTerrain @@ -359,7 +366,7 @@ The PPO agent configuration is mapped, as far as possible, from the rl_games' A2 .. literalinclude:: ../examples/isaacgym/ppo_anymal_terrain.py :language: python - :emphasize-lines: 11, 105-108 + :emphasize-lines: 11, 101-104 .. tab:: BallBalance @@ -367,7 +374,7 @@ The PPO agent configuration is mapped, as far as possible, from the rl_games' A2 .. literalinclude:: ../examples/isaacgym/ppo_ball_balance.py :language: python - :emphasize-lines: 11, 104-107 + :emphasize-lines: 11, 96-99 .. tab:: Cartpole @@ -391,7 +398,7 @@ The PPO agent configuration is mapped, as far as possible, from the rl_games' A2 .. literalinclude:: ../examples/isaacgym/ppo_franka_cabinet.py :language: python - :emphasize-lines: 10, 93-94 + :emphasize-lines: 10, 84-85 .. tab:: Humanoid @@ -399,7 +406,7 @@ The PPO agent configuration is mapped, as far as possible, from the rl_games' A2 .. literalinclude:: ../examples/isaacgym/ppo_humanoid.py :language: python - :emphasize-lines: 10, 93-94 + :emphasize-lines: 10, 84-85 .. tab:: Humanoid (AMP) @@ -415,7 +422,7 @@ The PPO agent configuration is mapped, as far as possible, from the rl_games' A2 .. 
literalinclude:: ../examples/isaacgym/ppo_ingenuity.py :language: python - :emphasize-lines: 2, 61-67 + :emphasize-lines: 2, 19, 56-62 .. tab:: Quadcopter @@ -423,7 +430,7 @@ The PPO agent configuration is mapped, as far as possible, from the rl_games' A2 .. literalinclude:: ../examples/isaacgym/ppo_quadcopter.py :language: python - :emphasize-lines: 104 + :emphasize-lines: 95 .. tab:: ShadowHand @@ -431,7 +438,7 @@ The PPO agent configuration is mapped, as far as possible, from the rl_games' A2 .. literalinclude:: ../examples/isaacgym/ppo_shadow_hand.py :language: python - :emphasize-lines: 108 + :emphasize-lines: 97 .. tab:: Trifinger @@ -439,7 +446,7 @@ The PPO agent configuration is mapped, as far as possible, from the rl_games' A2 .. literalinclude:: ../examples/isaacgym/ppo_trifinger.py :language: python - :emphasize-lines: 108 + :emphasize-lines: 95 .. tab:: Isaac Gym environments (evaluation) @@ -582,13 +589,21 @@ These examples perform the training of an agent in the `Omniverse Isaac Gym envi The following components or practices are exemplified (highlighted): - - Load and wrap an Omniverse Isaac Gym environment: **AllegroHand**, **Ant**, **Cartpole**, **Humanoid**, **ShadowHand** + - Load and wrap an Omniverse Isaac Gym environment: **AllegroHand**, **Ant**, **Anymal** - Load and wrap an Omniverse Isaac Gym multi-threaded environment: **Ant (multi-threaded)**, **Cartpole (multi-threaded)** + - Set an input preprocessor: **AnymalTerrain**, **BallBalance** + - Set a random seed for reproducibility: **Cartpole**, **Crazyflie** + - Set a learning rate scheduler: **FrankaCabinet**, **Humanoid** + - Define a reward shaping function: **Ingenuity**, **Quadcopter**, **ShadowHand** The PPO agent configuration is mapped, as far as possible, from the rl_games' A2C-PPO `configuration for Omniverse Isaac Gym environments `_. The following list shows the mapping between the two configurations .. code-block:: bash + # memory + memory_size = horizon_length + + # agent rollouts = horizon_length learning_epochs = mini_epochs mini_batches = horizon_length * num_actors / minibatch_size @@ -608,6 +623,9 @@ The PPO agent configuration is mapped, as far as possible, from the rl_games' A2 kl_threshold = 0 rewards_shaper = lambda rewards, timestep, timesteps: rewards * scale_value + # trainer + timesteps = horizon_length * max_epochs + .. note:: Omniverse Isaac Gym environments implement a functionality to get their configuration from the command line. Because of this feature, setting the :literal:`headless` option from the trainer configuration will not work. In this case, it is necessary to invoke the scripts as follows: :literal:`python script.py headless=True` @@ -624,7 +642,7 @@ The PPO agent configuration is mapped, as far as possible, from the rl_games' A2 .. literalinclude:: ../examples/omniisaacgym/ppo_allegro_hand.py :language: python - :emphasize-lines: 11-12, 59-60 + :emphasize-lines: 11-12, 54-55 .. tab:: Ant @@ -632,7 +650,7 @@ The PPO agent configuration is mapped, as far as possible, from the rl_games' A2 .. literalinclude:: ../examples/omniisaacgym/ppo_ant.py :language: python - :emphasize-lines: 11-12, 59-60 + :emphasize-lines: 11-12, 54-55 .. tab:: Ant (multi-threaded) @@ -640,7 +658,31 @@ The PPO agent configuration is mapped, as far as possible, from the rl_games' A2 .. literalinclude:: ../examples/omniisaacgym/ppo_ant_mt.py :language: python - :emphasize-lines: 1, 13-14, 61-62, 126, 130 + :emphasize-lines: 1, 13-14, 56-57, 117, 121 + + .. 
tab:: Anymal + + :download:`ppo_anymal.py <../examples/omniisaacgym/ppo_anymal.py>` + + .. literalinclude:: ../examples/omniisaacgym/ppo_anymal.py + :language: python + :emphasize-lines: 11-12, 54-55 + + .. tab:: AnymalTerrain + + :download:`ppo_anymal_terrain.py <../examples/omniisaacgym/ppo_anymal_terrain.py>` + + .. literalinclude:: ../examples/omniisaacgym/ppo_anymal_terrain.py + :language: python + :emphasize-lines: 9, 99-102 + + .. tab:: BallBalance + + :download:`ppo_ball_balance.py <../examples/omniisaacgym/ppo_ball_balance.py>` + + .. literalinclude:: ../examples/omniisaacgym/ppo_ball_balance.py + :language: python + :emphasize-lines: 9, 94-97 .. tab:: Cartpole @@ -648,7 +690,7 @@ The PPO agent configuration is mapped, as far as possible, from the rl_games' A2 .. literalinclude:: ../examples/omniisaacgym/ppo_cartpole.py :language: python - :emphasize-lines: 11-12, 55-56 + :emphasize-lines: 13, 17 .. tab:: Cartpole (multi-threaded) @@ -656,23 +698,55 @@ The PPO agent configuration is mapped, as far as possible, from the rl_games' A2 .. literalinclude:: ../examples/omniisaacgym/ppo_cartpole_mt.py :language: python - :emphasize-lines: 1, 13-14, 57-58, 122, 126 - + :emphasize-lines: 1, 13-14, 54-55, 115, 119 + + .. tab:: Crazyflie + + :download:`ppo_crazy_flie.py <../examples/omniisaacgym/ppo_crazy_flie.py>` + + .. literalinclude:: ../examples/omniisaacgym/ppo_crazy_flie.py + :language: python + :emphasize-lines: 13, 17 + + .. tab:: FrankaCabinet + + :download:`ppo_franka_cabinet.py <../examples/omniisaacgym/ppo_franka_cabinet.py>` + + .. literalinclude:: ../examples/omniisaacgym/ppo_franka_cabinet.py + :language: python + :emphasize-lines: 8, 82-83 + .. tab:: Humanoid :download:`ppo_humanoid.py <../examples/omniisaacgym/ppo_humanoid.py>` .. literalinclude:: ../examples/omniisaacgym/ppo_humanoid.py :language: python - :emphasize-lines: 11-12, 59-60 + :emphasize-lines: 8, 82-83 + .. tab:: Ingenuity + + :download:`ppo_ingenuity.py <../examples/omniisaacgym/ppo_ingenuity.py>` + + .. literalinclude:: ../examples/omniisaacgym/ppo_ingenuity.py + :language: python + :emphasize-lines: 93 + + .. tab:: Quadcopter + + :download:`ppo_quadcopter.py <../examples/omniisaacgym/ppo_quadcopter.py>` + + .. literalinclude:: ../examples/omniisaacgym/ppo_quadcopter.py + :language: python + :emphasize-lines: 93 + .. tab:: ShadowHand :download:`ppo_shadow_hand.py <../examples/omniisaacgym/ppo_shadow_hand.py>` .. literalinclude:: ../examples/omniisaacgym/ppo_shadow_hand.py :language: python - :emphasize-lines: 11-12, 63-64 + :emphasize-lines: 95 .. 
raw:: html From 534faba9417fb79dc861be1b11129c19333279da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Sun, 2 Oct 2022 20:20:11 +0200 Subject: [PATCH 096/108] Add multivariate gaussian model to the table of agent models in docs --- docs/source/modules/skrl.agents.a2c.rst | 6 +++--- docs/source/modules/skrl.agents.amp.rst | 8 ++++---- docs/source/modules/skrl.agents.cem.rst | 4 ++-- docs/source/modules/skrl.agents.ddpg.rst | 10 +++++----- docs/source/modules/skrl.agents.ddqn.rst | 6 +++--- docs/source/modules/skrl.agents.dqn.rst | 6 +++--- docs/source/modules/skrl.agents.ppo.rst | 6 +++--- docs/source/modules/skrl.agents.q_learning.rst | 4 ++-- docs/source/modules/skrl.agents.sac.rst | 12 ++++++------ docs/source/modules/skrl.agents.sarsa.rst | 4 ++-- docs/source/modules/skrl.agents.td3.rst | 14 +++++++------- docs/source/modules/skrl.agents.trpo.rst | 6 +++--- 12 files changed, 43 insertions(+), 43 deletions(-) diff --git a/docs/source/modules/skrl.agents.a2c.rst b/docs/source/modules/skrl.agents.a2c.rst index f2d52e46..bcbf2a0e 100644 --- a/docs/source/modules/skrl.agents.a2c.rst +++ b/docs/source/modules/skrl.agents.a2c.rst @@ -108,21 +108,21 @@ The implementation uses 1 stochastic (discrete or continuous) and 1 deterministi * - Notation - Concept - Key - - Type - Input shape - Output shape + - Type * - :math:`\pi_\theta(s)` - Policy - :literal:`"policy"` - - :ref:`Categorical ` / :ref:`Gaussian ` - observation - action + - :ref:`Categorical ` / :ref:`Gaussian ` / :ref:`MultivariateGaussian ` * - :math:`V_\phi(s)` - Value - :literal:`"value"` - - :ref:`Deterministic ` - observation - 1 + - :ref:`Deterministic ` API ^^^ diff --git a/docs/source/modules/skrl.agents.amp.rst b/docs/source/modules/skrl.agents.amp.rst index 4ee3db5b..2c337b77 100644 --- a/docs/source/modules/skrl.agents.amp.rst +++ b/docs/source/modules/skrl.agents.amp.rst @@ -133,27 +133,27 @@ The implementation uses 1 stochastic (continuous) and 2 deterministic function a * - Notation - Concept - Key - - Type - Input shape - Output shape + - Type * - :math:`\pi_\theta(s)` - Policy - :literal:`"policy"` - - :ref:`Gaussian ` - observation - action + - :ref:`Gaussian ` / :ref:`MultivariateGaussian ` * - :math:`V_\phi(s)` - Value - :literal:`"value"` - - :ref:`Deterministic ` - observation - 1 + - :ref:`Deterministic ` * - :math:`D_\psi(s_{_{AMP}})` - Discriminator - :literal:`"discriminator"` - - :ref:`Deterministic ` - AMP observation - 1 + - :ref:`Deterministic ` API ^^^ diff --git a/docs/source/modules/skrl.agents.cem.rst b/docs/source/modules/skrl.agents.cem.rst index 6ef63e4c..0a774635 100644 --- a/docs/source/modules/skrl.agents.cem.rst +++ b/docs/source/modules/skrl.agents.cem.rst @@ -64,15 +64,15 @@ The implementation uses 1 discrete function approximator. This function approxim * - Notation - Concept - Key - - Type - Input shape - Output shape + - Type * - :math:`\pi(s)` - Policy - :literal:`"policy"` - - :ref:`Categorical ` - observation - action + - :ref:`Categorical ` API ^^^ diff --git a/docs/source/modules/skrl.agents.ddpg.rst b/docs/source/modules/skrl.agents.ddpg.rst index ae57fc80..f7547713 100644 --- a/docs/source/modules/skrl.agents.ddpg.rst +++ b/docs/source/modules/skrl.agents.ddpg.rst @@ -92,33 +92,33 @@ The implementation uses 4 deterministic function approximators. 
These function a * - Notation - Concept - Key - - Type - Input shape - Output shape + - Type * - :math:`\mu_\theta(s)` - Policy (actor) - :literal:`"policy"` - - :ref:`Deterministic ` - observation - action + - :ref:`Deterministic ` * - :math:`\mu_{\theta_{target}}(s)` - Target policy - :literal:`"target_policy"` - - :ref:`Deterministic ` - observation - action + - :ref:`Deterministic ` * - :math:`Q_\phi(s, a)` - Q-network (critic) - :literal:`"critic"` - - :ref:`Deterministic ` - observation + action - 1 + - :ref:`Deterministic ` * - :math:`Q_{\phi_{target}}(s, a)` - Target Q-network - :literal:`"target_critic"` - - :ref:`Deterministic ` - observation + action - 1 + - :ref:`Deterministic ` API ^^^ diff --git a/docs/source/modules/skrl.agents.ddqn.rst b/docs/source/modules/skrl.agents.ddqn.rst index f12ac7c1..4af5ea0f 100644 --- a/docs/source/modules/skrl.agents.ddqn.rst +++ b/docs/source/modules/skrl.agents.ddqn.rst @@ -71,21 +71,21 @@ The implementation uses 2 deterministic function approximators. These function a * - Notation - Concept - Key - - Type - Input shape - Output shape + - Type * - :math:`Q_\phi(s, a)` - Q-network - :literal:`"q_network"` - - :ref:`Deterministic ` - observation - action + - :ref:`Deterministic ` * - :math:`Q_{\phi_{target}}(s, a)` - Target Q-network - :literal:`"target_q_network"` - - :ref:`Deterministic ` - observation - action + - :ref:`Deterministic ` API ^^^ diff --git a/docs/source/modules/skrl.agents.dqn.rst b/docs/source/modules/skrl.agents.dqn.rst index 0ed111f8..200baf9a 100644 --- a/docs/source/modules/skrl.agents.dqn.rst +++ b/docs/source/modules/skrl.agents.dqn.rst @@ -71,21 +71,21 @@ The implementation uses 2 deterministic function approximators. These function a * - Notation - Concept - Key - - Type - Input shape - Output shape + - Type * - :math:`Q_\phi(s, a)` - Q-network - :literal:`"q_network"` - - :ref:`Deterministic ` - observation - action + - :ref:`Deterministic ` * - :math:`Q_{\phi_{target}}(s, a)` - Target Q-network - :literal:`"target_q_network"` - - :ref:`Deterministic ` - observation - action + - :ref:`Deterministic ` API ^^^ diff --git a/docs/source/modules/skrl.agents.ppo.rst b/docs/source/modules/skrl.agents.ppo.rst index 8be4f1f1..11fdb703 100644 --- a/docs/source/modules/skrl.agents.ppo.rst +++ b/docs/source/modules/skrl.agents.ppo.rst @@ -124,21 +124,21 @@ The implementation uses 1 stochastic (discrete or continuous) and 1 deterministi * - Notation - Concept - Key - - Type - Input shape - Output shape + - Type * - :math:`\pi_\theta(s)` - Policy - :literal:`"policy"` - - :ref:`Categorical ` / :ref:`Gaussian ` - observation - action + - :ref:`Categorical ` / :ref:`Gaussian ` / :ref:`MultivariateGaussian ` * - :math:`V_\phi(s)` - Value - :literal:`"value"` - - :ref:`Deterministic ` - observation - 1 + - :ref:`Deterministic ` API ^^^ diff --git a/docs/source/modules/skrl.agents.q_learning.rst b/docs/source/modules/skrl.agents.q_learning.rst index 9b270188..2ea947f6 100644 --- a/docs/source/modules/skrl.agents.q_learning.rst +++ b/docs/source/modules/skrl.agents.q_learning.rst @@ -62,15 +62,15 @@ The implementation uses 1 table. 
This table (model) must be collected in a dicti * - Notation - Concept - Key - - Type - Input shape - Output shape + - Type * - :math:`\pi_{Q[s,a]}(s)` - Policy (:math:`\epsilon`-greedy) - :literal:`"policy"` - - :ref:`Tabular ` - observation - action + - :ref:`Tabular ` API ^^^ diff --git a/docs/source/modules/skrl.agents.sac.rst b/docs/source/modules/skrl.agents.sac.rst index 20dbe252..0af74b70 100644 --- a/docs/source/modules/skrl.agents.sac.rst +++ b/docs/source/modules/skrl.agents.sac.rst @@ -99,39 +99,39 @@ The implementation uses 1 stochastic and 4 deterministic function approximators. * - Notation - Concept - Key - - Type - Input shape - Output shape + - Type * - :math:`\pi_\theta(s)` - Policy (actor) - :literal:`"policy"` - - :ref:`Gaussian ` - observation - action + - :ref:`Gaussian ` / :ref:`MultivariateGaussian ` * - :math:`Q_{\phi 1}(s, a)` - Q1-network (critic 1) - :literal:`"critic_1"` - - :ref:`Deterministic ` - observation + action - 1 + - :ref:`Deterministic ` * - :math:`Q_{\phi 2}(s, a)` - Q2-network (critic 2) - :literal:`"critic_2"` - - :ref:`Deterministic ` - observation + action - 1 + - :ref:`Deterministic ` * - :math:`Q_{{\phi 1}_{target}}(s, a)` - Target Q1-network - :literal:`"target_critic_1"` - - :ref:`Deterministic ` - observation + action - 1 + - :ref:`Deterministic ` * - :math:`Q_{{\phi 2}_{target}}(s, a)` - Target Q2-network - :literal:`"target_critic_2"` - - :ref:`Deterministic ` - observation + action - 1 + - :ref:`Deterministic ` API ^^^ diff --git a/docs/source/modules/skrl.agents.sarsa.rst b/docs/source/modules/skrl.agents.sarsa.rst index dacfb320..9bb4f23f 100644 --- a/docs/source/modules/skrl.agents.sarsa.rst +++ b/docs/source/modules/skrl.agents.sarsa.rst @@ -61,15 +61,15 @@ The implementation uses 1 table. This table (model) must be collected in a dicti * - Notation - Concept - Key - - Type - Input shape - Output shape + - Type * - :math:`\pi_{Q[s,a]}(s)` - Policy (:math:`\epsilon`-greedy) - :literal:`"policy"` - - :ref:`Tabular ` - observation - action + - :ref:`Tabular ` API ^^^ diff --git a/docs/source/modules/skrl.agents.td3.rst b/docs/source/modules/skrl.agents.td3.rst index 74bb068e..eb30fb80 100644 --- a/docs/source/modules/skrl.agents.td3.rst +++ b/docs/source/modules/skrl.agents.td3.rst @@ -102,45 +102,45 @@ The implementation uses 6 deterministic function approximators. 
These function a * - Notation - Concept - Key - - Type - Input shape - Output shape + - Type * - :math:`\mu_\theta(s)` - Policy (actor) - :literal:`"policy"` - - :ref:`Deterministic ` - observation - action + - :ref:`Deterministic ` * - :math:`\mu_{\theta_{target}}(s)` - Target policy - :literal:`"target_policy"` - - :ref:`Deterministic ` - observation - action + - :ref:`Deterministic ` * - :math:`Q_{\phi 1}(s, a)` - Q1-network (critic 1) - :literal:`"critic_1"` - - :ref:`Deterministic ` - observation + action - 1 + - :ref:`Deterministic ` * - :math:`Q_{\phi 2}(s, a)` - Q2-network (critic 2) - :literal:`"critic_2"` - - :ref:`Deterministic ` - observation + action - 1 + - :ref:`Deterministic ` * - :math:`Q_{{\phi 1}_{target}}(s, a)` - Target Q1-network - :literal:`"target_critic_1"` - - :ref:`Deterministic ` - observation + action - 1 + - :ref:`Deterministic ` * - :math:`Q_{{\phi 2}_{target}}(s, a)` - Target Q2-network - :literal:`"target_critic_2"` - - :ref:`Deterministic ` - observation + action - 1 + - :ref:`Deterministic ` API ^^^ diff --git a/docs/source/modules/skrl.agents.trpo.rst b/docs/source/modules/skrl.agents.trpo.rst index 52da1555..bd2b0ad6 100644 --- a/docs/source/modules/skrl.agents.trpo.rst +++ b/docs/source/modules/skrl.agents.trpo.rst @@ -162,21 +162,21 @@ The implementation uses 1 stochastic and 1 deterministic function approximator. * - Notation - Concept - Key - - Type - Input shape - Output shape + - Type * - :math:`\pi_\theta(s)` - Policy - :literal:`"policy"` - - :ref:`Gaussian ` - observation - action + - :ref:`Gaussian ` / :ref:`MultivariateGaussian ` * - :math:`V_\phi(s)` - Value - :literal:`"value"` - - :ref:`Deterministic ` - observation - 1 + - :ref:`Deterministic ` API ^^^ From 539d62836ca76a4d7b984cdfb3149c6cf4b80d28 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Sun, 2 Oct 2022 20:22:52 +0200 Subject: [PATCH 097/108] Add a short introduction to each model mixin in docs --- .../source/modules/skrl.models.base_class.rst | 2 ++ .../modules/skrl.models.categorical.rst | 24 +++++++++++++++++ .../modules/skrl.models.deterministic.rst | 24 +++++++++++++++++ docs/source/modules/skrl.models.gaussian.rst | 26 +++++++++++++++++++ .../skrl.models.multivariate_gaussian.rst | 26 +++++++++++++++++++ docs/source/modules/skrl.models.tabular.rst | 24 +++++++++++++++++ 6 files changed, 126 insertions(+) diff --git a/docs/source/modules/skrl.models.base_class.rst b/docs/source/modules/skrl.models.base_class.rst index 221221f3..8422448e 100644 --- a/docs/source/modules/skrl.models.base_class.rst +++ b/docs/source/modules/skrl.models.base_class.rst @@ -1,3 +1,5 @@ +.. _models_base_class: + Base class ========== diff --git a/docs/source/modules/skrl.models.categorical.rst b/docs/source/modules/skrl.models.categorical.rst index 66c075c9..6dc185c2 100644 --- a/docs/source/modules/skrl.models.categorical.rst +++ b/docs/source/modules/skrl.models.categorical.rst @@ -3,6 +3,30 @@ Categorical model ================= +Categorical models run **discrete-domain stochastic** policies. + +skrl provides a Python mixin (:literal:`CategoricalMixin`) to assist in the creation of these types of models, allowing users to have full control over the function approximator definitions and architectures. Note that the use of this mixin must comply with the following rules: + +* The definition of multiple inheritance must always include the :ref:`Model ` base class at the end. + + .. 
code-block:: python + :emphasize-lines: 1 + + class CategoricalModel(CategoricalMixin, Model): + def __init__(self, observation_space, action_space, device, unnormalized_log_prob=True): + Model.__init__(self, observation_space, action_space, device) + CategoricalMixin.__init__(self, unnormalized_log_prob) + +* The :ref:`Model ` base class constructor must be invoked before the mixins constructor. + + .. code-block:: python + :emphasize-lines: 3-4 + + class CategoricalModel(CategoricalMixin, Model): + def __init__(self, observation_space, action_space, device, unnormalized_log_prob=True): + Model.__init__(self, observation_space, action_space, device) + CategoricalMixin.__init__(self, unnormalized_log_prob) + Concept ------- diff --git a/docs/source/modules/skrl.models.deterministic.rst b/docs/source/modules/skrl.models.deterministic.rst index 04266a88..bccbf334 100644 --- a/docs/source/modules/skrl.models.deterministic.rst +++ b/docs/source/modules/skrl.models.deterministic.rst @@ -3,6 +3,30 @@ Deterministic model =================== +Deterministic models run **continuous-domain deterministic** policies. + +skrl provides a Python mixin (:literal:`DeterministicMixin`) to assist in the creation of these types of models, allowing users to have full control over the function approximator definitions and architectures. Note that the use of this mixin must comply with the following rules: + +* The definition of multiple inheritance must always include the :ref:`Model ` base class at the end. + + .. code-block:: python + :emphasize-lines: 1 + + class DeterministicModel(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device="cuda:0", clip_actions=False): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) + +* The :ref:`Model ` base class constructor must be invoked before the mixins constructor. + + .. code-block:: python + :emphasize-lines: 3-4 + + class DeterministicModel(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device="cuda:0", clip_actions=False): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) + Concept ------- diff --git a/docs/source/modules/skrl.models.gaussian.rst b/docs/source/modules/skrl.models.gaussian.rst index 7a8b788b..0b7dd56a 100644 --- a/docs/source/modules/skrl.models.gaussian.rst +++ b/docs/source/modules/skrl.models.gaussian.rst @@ -3,6 +3,32 @@ Gaussian model ============== +Gaussian models run **continuous-domain stochastic** policies. + +skrl provides a Python mixin (:literal:`GaussianMixin`) to assist in the creation of these types of models, allowing users to have full control over the function approximator definitions and architectures. Note that the use of this mixin must comply with the following rules: + +* The definition of multiple inheritance must always include the :ref:`Model ` base class at the end. + + .. code-block:: python + :emphasize-lines: 1 + + class GaussianModel(GaussianMixin, Model): + def __init__(self, observation_space, action_space, device="cuda:0", + clip_actions=False, clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"): + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction) + +* The :ref:`Model ` base class constructor must be invoked before the mixins constructor. + + .. 
code-block:: python + :emphasize-lines: 4-5 + + class GaussianModel(GaussianMixin, Model): + def __init__(self, observation_space, action_space, device="cuda:0", + clip_actions=False, clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"): + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction) + Concept ------- diff --git a/docs/source/modules/skrl.models.multivariate_gaussian.rst b/docs/source/modules/skrl.models.multivariate_gaussian.rst index c4a20066..13661349 100644 --- a/docs/source/modules/skrl.models.multivariate_gaussian.rst +++ b/docs/source/modules/skrl.models.multivariate_gaussian.rst @@ -3,6 +3,32 @@ Multivariate Gaussian model =========================== +Multivariate Gaussian models run **continuous-domain stochastic** policies. + +skrl provides a Python mixin (:literal:`MultivariateGaussianMixin`) to assist in the creation of these types of models, allowing users to have full control over the function approximator definitions and architectures. Note that the use of this mixin must comply with the following rules: + +* The definition of multiple inheritance must always include the :ref:`Model ` base class at the end. + + .. code-block:: python + :emphasize-lines: 1 + + class MultivariateGaussianModel(MultivariateGaussianMixin, Model): + def __init__(self, observation_space, action_space, device="cuda:0", + clip_actions=False, clip_log_std=True, min_log_std=-20, max_log_std=2): + Model.__init__(self, observation_space, action_space, device) + MultivariateGaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) + +* The :ref:`Model ` base class constructor must be invoked before the mixins constructor. + + .. code-block:: python + :emphasize-lines: 4-5 + + class MultivariateGaussianModel(MultivariateGaussianMixin, Model): + def __init__(self, observation_space, action_space, device="cuda:0", + clip_actions=False, clip_log_std=True, min_log_std=-20, max_log_std=2): + Model.__init__(self, observation_space, action_space, device) + MultivariateGaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) + Concept ------- diff --git a/docs/source/modules/skrl.models.tabular.rst b/docs/source/modules/skrl.models.tabular.rst index 774b338a..1190c245 100644 --- a/docs/source/modules/skrl.models.tabular.rst +++ b/docs/source/modules/skrl.models.tabular.rst @@ -3,6 +3,30 @@ Tabular model ============= +Tabular models run **discrete-domain deterministic/stochastic** policies. + +skrl provides a Python mixin (:literal:`TabularMixin`) to assist in the creation of these types of models, allowing users to have full control over the table definitions. Note that the use of this mixin must comply with the following rules: + +* The definition of multiple inheritance must always include the :ref:`Model ` base class at the end. + + .. code-block:: python + :emphasize-lines: 1 + + class TabularModel(TabularMixin, Model): + def __init__(self, observation_space, action_space, device="cuda:0", num_envs=1): + Model.__init__(self, observation_space, action_space, device) + TabularMixin.__init__(self, num_envs) + +* The :ref:`Model ` base class constructor must be invoked before the mixins constructor. + + .. 
code-block:: python + :emphasize-lines: 3-4 + + class TabularModel(TabularMixin, Model): + def __init__(self, observation_space, action_space, device="cuda:0", num_envs=1): + Model.__init__(self, observation_space, action_space, device) + TabularMixin.__init__(self, num_envs) + Basic usage ----------- From e096609512ac61a6b949ad9aaf4c85d7d8b5ba14 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Sun, 2 Oct 2022 23:22:24 +0200 Subject: [PATCH 098/108] Add shared models to docs --- docs/source/index.rst | 5 ++- .../modules/skrl.models.shared_model.rst | 41 +++++++++++++++++++ 2 files changed, 44 insertions(+), 2 deletions(-) create mode 100644 docs/source/modules/skrl.models.shared_model.rst diff --git a/docs/source/index.rst b/docs/source/index.rst index d48d8ce9..52ca475d 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -12,7 +12,7 @@ SKRL - Reinforcement Learning library (|version|) .. warning:: - **skrl** is under **active continuous development**. Make sure you always have the latest version + **skrl** is under **active continuous development**. Make sure you always have the latest version. Visit the `develop `_ branch or its `documentation `_ to access the latest updates to be released. | **GitHub repository:** https://github.com/Toni-SM/skrl | **Questions or discussions:** https://github.com/Toni-SM/skrl/discussions @@ -123,7 +123,7 @@ Memories Models ^^^^^^ - Definition of helper classes for the construction of tabular functions or function approximators using artificial neural networks. This library does not provide predefined policies but helper classes to create discrete and continuous (stochastic or deterministic) policies in which the user only has to define the tables (tensors) or artificial neural networks. All models inherit from one :doc:`base class ` that defines a uniform interface and provides for common functionalities + Definition of helper mixins for the construction of tabular functions or function approximators using artificial neural networks. This library does not provide predefined policies but helper mixins to create discrete and continuous (stochastic or deterministic) policies in which the user only has to define the tables (tensors) or artificial neural networks. All models inherit from one :doc:`base class ` that defines a uniform interface and provides for common functionalities. In addition, it is possible to create :doc:`shared model ` by combining the implemented definitions * :doc:`Tabular model ` (discrete domain) * :doc:`Categorical model ` (discrete domain) @@ -142,6 +142,7 @@ Models modules/skrl.models.gaussian modules/skrl.models.multivariate_gaussian modules/skrl.models.deterministic + modules/skrl.models.shared_model Trainers ^^^^^^^^ diff --git a/docs/source/modules/skrl.models.shared_model.rst b/docs/source/modules/skrl.models.shared_model.rst new file mode 100644 index 00000000..fe16afcf --- /dev/null +++ b/docs/source/modules/skrl.models.shared_model.rst @@ -0,0 +1,41 @@ +Shared model +============ + +Sometimes it is desirable to define models that use shared layers or network to represent multiple function approximators. This practice, known as *shared parameters* (or *parameter sharing*), *shared layers*, *shared model*, *shared networks* or *joint architecture* among others, is typically justified by the following criteria: + +* Learning the same characteristics, especially when processing large inputs (such as images, e.g.). + +* Reduce the number of parameters in the whole system. 
+ +* Make the computation more efficient. + +Implementation +-------------- + +By combining the implemented mixins, it is possible to define shared models with skrl. In these cases, the use of the :literal:`role` argument (a Python string) is relevant. The agents will call the models by setting the :literal:`role` argument according to their requirements. Visit each agent's documentation (*Key* column of the table under *Spaces and models* section) to know the possible values that this parameter can take. + +The code snippet below shows how to define a shared model. The following practices for building shared models can be identified: + +* The definition of multiple inheritance must always include the :ref:`Model ` base class at the end. + +* The :ref:`Model ` base class constructor must be invoked before the mixin constructors. + +* All mixin constructors must be invoked. + + * Specifying the :literal:`role` argument is optional if all constructors belong to different mixins. + + * If multiple models of the same mixin type are required, the same constructor must be invoked as many times as needed. To do so, it is mandatory to specify the :literal:`role` argument. + +* The :literal:`.act(...)` method needs to be overridden to disambiguate its call. + +* The same instance of the shared model must be passed to all keys involved. + +.. raw:: html + +
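+
+The following is a minimal usage sketch (it assumes the :literal:`SharedModel` class defined in the snippet included below, an already wrapped environment :literal:`env` and a batch of observations :literal:`states`):
+
+.. code-block:: python
+
+    # instantiate the model once; the same instance is passed to all the keys involved
+    model = SharedModel(env.observation_space, env.action_space, env.device)
+
+    # the role argument determines which mixin handles the call
+    policy_output = model.act(states, None, role="policy")   # handled by GaussianMixin.act
+    value_output = model.act(states, None, role="value")     # handled by DeterministicMixin.act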
+ +.. literalinclude:: ../snippets/shared_model.py + :language: python + :linenos: + :start-after: [start-mlp] + :end-before: [end-mlp] From c52560f12d1a02d5d747f0025f39139a3baeecf4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Sun, 2 Oct 2022 23:32:05 +0200 Subject: [PATCH 099/108] Add benchmark results link to docs --- docs/source/intro/examples.rst | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/docs/source/intro/examples.rst b/docs/source/intro/examples.rst index ffeb0791..7348ee4d 100644 --- a/docs/source/intro/examples.rst +++ b/docs/source/intro/examples.rst @@ -296,7 +296,7 @@ The following components or practices are exemplified (highlighted): - Access to environment-specific properties and methods: **Humanoid (AMP)** - Load a checkpoint during evaluation: **Cartpole** -The PPO agent configuration is mapped, as far as possible, from the rl_games' A2C-PPO `configuration for Isaac Gym preview environments `_. The following list shows the mapping between the two configurations +The PPO agent configuration is mapped, as far as possible, from the rl_games' A2C-PPO `configuration for Isaac Gym preview environments `_. Shared models or separated models are used depending on the value of the :literal:`network.separate` variable. The following list shows the mapping between the two configurations: .. code-block:: bash @@ -326,6 +326,8 @@ The PPO agent configuration is mapped, as far as possible, from the rl_games' A2 # trainer timesteps = horizon_length * max_epochs +**Benchmark results** for Isaac Gym are listed in `Benchmark results #32 `_. + .. note:: Isaac Gym environments implement a functionality to get their configuration from the command line. Because of this feature, setting the :literal:`headless` option from the trainer configuration will not work. In this case, it is necessary to invoke the scripts as follows: :literal:`python script.py headless=True` for Isaac Gym environments (preview 3 and preview 4) or :literal:`python script.py --headless` for Isaac Gym environments (preview 2) @@ -596,7 +598,7 @@ The following components or practices are exemplified (highlighted): - Set a learning rate scheduler: **FrankaCabinet**, **Humanoid** - Define a reward shaping function: **Ingenuity**, **Quadcopter**, **ShadowHand** -The PPO agent configuration is mapped, as far as possible, from the rl_games' A2C-PPO `configuration for Omniverse Isaac Gym environments `_. The following list shows the mapping between the two configurations +The PPO agent configuration is mapped, as far as possible, from the rl_games' A2C-PPO `configuration for Omniverse Isaac Gym environments `_. Shared models or separated models are used depending on the value of the :literal:`network.separate` variable. The following list shows the mapping between the two configurations:configurations .. code-block:: bash @@ -626,6 +628,8 @@ The PPO agent configuration is mapped, as far as possible, from the rl_games' A2 # trainer timesteps = horizon_length * max_epochs +**Benchmark results** for Omniverse Isaac Gym are listed in `Benchmark results #32 `_. + .. note:: Omniverse Isaac Gym environments implement a functionality to get their configuration from the command line. Because of this feature, setting the :literal:`headless` option from the trainer configuration will not work. 
In this case, it is necessary to invoke the scripts as follows: :literal:`python script.py headless=True` From ceffaf89161eacfd3029d6a85f555f26231219b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Sun, 2 Oct 2022 23:33:22 +0200 Subject: [PATCH 100/108] Add shared model snippet to docs --- docs/source/snippets/shared_model.py | 48 ++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 docs/source/snippets/shared_model.py diff --git a/docs/source/snippets/shared_model.py b/docs/source/snippets/shared_model.py new file mode 100644 index 00000000..40182f61 --- /dev/null +++ b/docs/source/snippets/shared_model.py @@ -0,0 +1,48 @@ +# [start-mlp] +import torch +import torch.nn as nn + +from skrl.models.torch import Model, GaussianMixin, DeterministicMixin + + +# define the shared model +class SharedModel(GaussianMixin, DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"): + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction, role="policy") + DeterministicMixin.__init__(self, clip_actions, role="value") + + # shared layers/network + self.net = nn.Sequential(nn.Linear(self.num_observations, 32), + nn.ELU(), + nn.Linear(32, 32), + nn.ELU()) + + # separated layers ("policy") + self.mean_layer = nn.Linear(32, self.num_actions) + self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + # separated layer ("value") + self.value_layer = nn.Linear(32, 1) + + # override the .act(...) method to disambiguate its call + def act(self, states, taken_actions, role): + if role == "policy": + return GaussianMixin.act(self, states, taken_actions, role) + elif role == "value": + return DeterministicMixin.act(self, states, taken_actions, role) + + # forward the input to compute model output according to the specified role + def compute(self, states, taken_actions, role): + if role == "policy": + return self.mean_layer(self.net(states)), self.log_std_parameter + elif role == "value": + return self.value_layer(self.net(states)) + + +# instantiate the shared model and pass the same instance to the other key +models = {} +models["policy"] = SharedModel(env.observation_space, env.action_space, env.device) +models["value"] = models["policy"] +# [end-mlp] From c326041cbe557f274271f77ce504ba7ea9e740de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Sun, 2 Oct 2022 23:36:17 +0200 Subject: [PATCH 101/108] Disable gradient computation when tensors are updated by their names --- skrl/memories/torch/base.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/skrl/memories/torch/base.py b/skrl/memories/torch/base.py index 7765854a..aaa18ab5 100644 --- a/skrl/memories/torch/base.py +++ b/skrl/memories/torch/base.py @@ -139,7 +139,8 @@ def set_tensor_by_name(self, name: str, tensor: torch.Tensor) -> None: :raises KeyError: The tensor does not exist """ - self.tensors[name].copy_(tensor) + with torch.no_grad(): + self.tensors[name].copy_(tensor) def create_tensor(self, name: str, size: Union[int, Tuple[int], gym.Space], dtype: Union[torch.dtype, None] = None) -> bool: """Create a new internal tensor in memory From a46baa321856a1ad9746e31535859cc36f6d6ebc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Sun, 2 Oct 2022 23:39:44 +0200 Subject: [PATCH 
102/108] Fix docstring example --- skrl/utils/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skrl/utils/__init__.py b/skrl/utils/__init__.py index 778ef728..aaef12ab 100644 --- a/skrl/utils/__init__.py +++ b/skrl/utils/__init__.py @@ -32,8 +32,8 @@ def set_seed(seed: Optional[int] = None, deterministic: bool = False) -> int: 42 # random seed - >>> set_seed() >>> from skrl.utils import set_seed + >>> set_seed() [skrl:INFO] Seed: 1776118066 1776118066 From d0470fc8d31cea02b2fc4e4b25a5e30c7648fe38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Sun, 2 Oct 2022 23:54:04 +0200 Subject: [PATCH 103/108] Add Isaac Gym Franka Emika example to docs --- .../reaching_franka_isaacgym_env.py | 364 ++++++++++++++++++ .../reaching_franka_isaacgym_skrl_eval.py | 89 +++++ .../reaching_franka_isaacgym_skrl_train.py | 130 +++++++ 3 files changed, 583 insertions(+) create mode 100644 docs/source/examples/real_world/franka_emika_panda/reaching_franka_isaacgym_env.py create mode 100644 docs/source/examples/real_world/franka_emika_panda/reaching_franka_isaacgym_skrl_eval.py create mode 100644 docs/source/examples/real_world/franka_emika_panda/reaching_franka_isaacgym_skrl_train.py diff --git a/docs/source/examples/real_world/franka_emika_panda/reaching_franka_isaacgym_env.py b/docs/source/examples/real_world/franka_emika_panda/reaching_franka_isaacgym_env.py new file mode 100644 index 00000000..b4a19b7a --- /dev/null +++ b/docs/source/examples/real_world/franka_emika_panda/reaching_franka_isaacgym_env.py @@ -0,0 +1,364 @@ +import os +import numpy as np +import torch + +from isaacgym import gymtorch, gymapi + +# isaacgymenvs (VecTask class) +import sys +import isaacgymenvs +sys.path.append(list(isaacgymenvs.__path__)[0]) +from tasks.base.vec_task import VecTask + +from skrl.utils import isaacgym_utils + + +TASK_CFG = {"name": "ReachingFranka", + "physics_engine": "physx", + "rl_device": "cuda:0", + "sim_device": "cuda:0", + "graphics_device_id": 0, + "headless": False, + "virtual_screen_capture": False, + "force_render": True, + "env": {"numEnvs": 1024, + "envSpacing": 1.5, + "episodeLength": 100, + "enableDebugVis": False, + "clipObservations": 1000.0, + "clipActions": 1.0, + "controlFrequencyInv": 4, + "actionScale": 2.5, + "dofVelocityScale": 0.1, + "controlSpace": "cartesian", + "enableCameraSensors": False}, + "sim": {"dt": 0.0083, # 1 / 120 + "substeps": 1, + "up_axis": "z", + "use_gpu_pipeline": True, + "gravity": [0.0, 0.0, -9.81], + "physx": {"num_threads": 4, + "solver_type": 1, + "use_gpu": True, + "num_position_iterations": 4, + "num_velocity_iterations": 1, + "contact_offset": 0.005, + "rest_offset": 0.0, + "bounce_threshold_velocity": 0.2, + "max_depenetration_velocity": 1000.0, + "default_buffer_size_multiplier": 5.0, + "max_gpu_contact_pairs": 1048576, + "num_subscenes": 4, + "contact_collection": 0}}, + "task": {"randomize": False}} + + +class ReachingFrankaTask(VecTask): + def __init__(self, cfg): + self.cfg = cfg + rl_device = cfg["rl_device"] + sim_device = cfg["sim_device"] + graphics_device_id = cfg["graphics_device_id"] + headless = cfg["headless"] + virtual_screen_capture = cfg["virtual_screen_capture"] + force_render = cfg["force_render"] + + self.dt = 1 / 120.0 + + self._action_scale = self.cfg["env"]["actionScale"] + self._dof_vel_scale = self.cfg["env"]["dofVelocityScale"] + self._control_space = self.cfg["env"]["controlSpace"] + self.max_episode_length = self.cfg["env"]["episodeLength"] # name required for VecTask + + 
self.debug_viz = self.cfg["env"]["enableDebugVis"] + + # observation and action space + self.cfg["env"]["numObservations"] = 18 + if self._control_space == "joint": + self.cfg["env"]["numActions"] = 7 + elif self._control_space == "cartesian": + self.cfg["env"]["numActions"] = 3 + else: + raise ValueError("Invalid control space: {}".format(self._control_space)) + + self._end_effector_link = "panda_leftfinger" + + # setup VecTask + super().__init__(config=self.cfg, + rl_device=rl_device, + sim_device=sim_device, + graphics_device_id=graphics_device_id, + headless=headless, + virtual_screen_capture=virtual_screen_capture, + force_render=force_render) + + # tensors and views: DOFs, roots, rigid bodies + dof_state_tensor = self.gym.acquire_dof_state_tensor(self.sim) + root_state_tensor = self.gym.acquire_actor_root_state_tensor(self.sim) + rigid_body_state_tensor = self.gym.acquire_rigid_body_state_tensor(self.sim) + + self.gym.refresh_dof_state_tensor(self.sim) + self.gym.refresh_actor_root_state_tensor(self.sim) + self.gym.refresh_rigid_body_state_tensor(self.sim) + + self.dof_state = gymtorch.wrap_tensor(dof_state_tensor) + self.root_state = gymtorch.wrap_tensor(root_state_tensor) + self.rigid_body_state = gymtorch.wrap_tensor(rigid_body_state_tensor) + + self.dof_pos = self.dof_state.view(self.num_envs, -1, 2)[..., 0] + self.dof_vel = self.dof_state.view(self.num_envs, -1, 2)[..., 1] + + self.root_pos = self.root_state[:, 0:3].view(self.num_envs, -1, 3) + self.root_rot = self.root_state[:, 3:7].view(self.num_envs, -1, 4) + self.root_vel_lin = self.root_state[:, 7:10].view(self.num_envs, -1, 3) + self.root_vel_ang = self.root_state[:, 10:13].view(self.num_envs, -1, 3) + + self.rigid_body_pos = self.rigid_body_state[:, 0:3].view(self.num_envs, -1, 3) + self.rigid_body_rot = self.rigid_body_state[:, 3:7].view(self.num_envs, -1, 4) + self.rigid_body_vel_lin = self.rigid_body_state[:, 7:10].view(self.num_envs, -1, 3) + self.rigid_body_vel_ang = self.rigid_body_state[:, 10:13].view(self.num_envs, -1, 3) + + # tensors and views: jacobian + if self._control_space == "cartesian": + jacobian_tensor = self.gym.acquire_jacobian_tensor(self.sim, "robot") + self.jacobian = gymtorch.wrap_tensor(jacobian_tensor) + self.jacobian_end_effector = self.jacobian[:, self.rigid_body_dict_robot[self._end_effector_link] - 1, :, :7] + + self.reset_idx(torch.arange(self.num_envs, device=self.device)) + + def create_sim(self): + self.sim_params.up_axis = gymapi.UP_AXIS_Z + self.sim_params.gravity.x = 0 + self.sim_params.gravity.y = 0 + self.sim_params.gravity.z = -9.81 + self.sim = super().create_sim(self.device_id, self.graphics_device_id, self.physics_engine, self.sim_params) + self._create_ground_plane() + self._create_envs(self.num_envs, self.cfg["env"]["envSpacing"], int(np.sqrt(self.num_envs))) + + def _create_ground_plane(self): + plane_params = gymapi.PlaneParams() + plane_params.normal = gymapi.Vec3(0.0, 0.0, 1.0) + self.gym.add_ground(self.sim, plane_params) + + def _create_envs(self, num_envs, spacing, num_per_row): + lower = gymapi.Vec3(-spacing, -spacing, 0.0) + upper = gymapi.Vec3(spacing, spacing, spacing) + + asset_root = os.path.join(os.path.dirname(os.path.abspath(isaacgymenvs.__file__)), "../assets") + robot_asset_file = "urdf/franka_description/robots/franka_panda.urdf" + + # robot asset + asset_options = gymapi.AssetOptions() + asset_options.flip_visual_attachments = True + asset_options.fix_base_link = True + asset_options.collapse_fixed_joints = True + asset_options.disable_gravity = True + 
asset_options.thickness = 0.001 + asset_options.default_dof_drive_mode = gymapi.DOF_MODE_POS + asset_options.use_mesh_materials = True + robot_asset = self.gym.load_asset(self.sim, asset_root, robot_asset_file, asset_options) + + # target asset + asset_options = gymapi.AssetOptions() + asset_options.fix_base_link = True + asset_options.collapse_fixed_joints = False + asset_options.disable_gravity = True + asset_options.thickness = 0.001 + asset_options.use_mesh_materials = True + target_asset = self.gym.create_sphere(self.sim, 0.025, asset_options) + + robot_dof_stiffness = torch.tensor([400, 400, 400, 400, 400, 400, 400, 1.0e6, 1.0e6], dtype=torch.float32, device=self.device) + robot_dof_damping = torch.tensor([80, 80, 80, 80, 80, 80, 80, 1.0e2, 1.0e2], dtype=torch.float, device=self.device) + + # set robot dof properties + robot_dof_props = self.gym.get_asset_dof_properties(robot_asset) + self.robot_dof_lower_limits = [] + self.robot_dof_upper_limits = [] + for i in range(9): + robot_dof_props["driveMode"][i] = gymapi.DOF_MODE_POS + if self.physics_engine == gymapi.SIM_PHYSX: + robot_dof_props["stiffness"][i] = robot_dof_stiffness[i] + robot_dof_props["damping"][i] = robot_dof_damping[i] + else: + robot_dof_props["stiffness"][i] = 7000.0 + robot_dof_props["damping"][i] = 50.0 + + self.robot_dof_lower_limits.append(robot_dof_props["lower"][i]) + self.robot_dof_upper_limits.append(robot_dof_props["upper"][i]) + + self.robot_dof_lower_limits = torch.tensor(self.robot_dof_lower_limits, device=self.device) + self.robot_dof_upper_limits = torch.tensor(self.robot_dof_upper_limits, device=self.device) + self.robot_dof_speed_scales = torch.ones_like(self.robot_dof_lower_limits) + robot_dof_props["effort"][7] = 200 + robot_dof_props["effort"][8] = 200 + + self.handle_targets = [] + self.handle_robots = [] + self.handle_envs = [] + + indexes_sim_robot = [] + indexes_sim_target = [] + + for i in range(self.num_envs): + # create env instance + env_ptr = self.gym.create_env(self.sim, lower, upper, num_per_row) + + # create robot instance + pose = gymapi.Transform() + pose.p = gymapi.Vec3(0.0, 0.0, 0.0) + pose.r = gymapi.Quat(0.0, 0.0, 0.0, 1) + + robot_actor = self.gym.create_actor(env=env_ptr, + asset=robot_asset, + pose=pose, + name="robot", + group=i, # collision group + filter=1, # mask off collision + segmentationId=0) + self.gym.set_actor_dof_properties(env_ptr, robot_actor, robot_dof_props) + indexes_sim_robot.append(self.gym.get_actor_index(env_ptr, robot_actor, gymapi.DOMAIN_SIM)) + + # create target instance + pose = gymapi.Transform() + pose.p = gymapi.Vec3(0.5, 0.0, 0.2) + pose.r = gymapi.Quat(0.0, 0.0, 0.0, 1) + + target_actor = self.gym.create_actor(env=env_ptr, + asset=target_asset, + pose=pose, + name="target", + group=i + 1, # collision group + filter=1, # mask off collision + segmentationId=1) + indexes_sim_target.append(self.gym.get_actor_index(env_ptr, target_actor, gymapi.DOMAIN_SIM)) + + self.gym.set_rigid_body_color(env_ptr, target_actor, 0, gymapi.MESH_VISUAL, gymapi.Vec3(1., 0., 0.)) + + self.handle_envs.append(env_ptr) + self.handle_robots.append(robot_actor) + self.handle_targets.append(target_actor) + + self.indexes_sim_robot = torch.tensor(indexes_sim_robot, dtype=torch.int32, device=self.device) + self.indexes_sim_target = torch.tensor(indexes_sim_target, dtype=torch.int32, device=self.device) + + self.num_robot_dofs = self.gym.get_asset_dof_count(robot_asset) + self.rigid_body_dict_robot = self.gym.get_asset_rigid_body_dict(robot_asset) + + self.init_data() + + def 
init_data(self): + self.robot_default_dof_pos = torch.tensor(np.radians([0, -45, 0, -135, 0, 90, 45, 0, 0]), device=self.device, dtype=torch.float32) + self.robot_dof_targets = torch.zeros((self.num_envs, self.num_robot_dofs), device=self.device, dtype=torch.float32) + + if self._control_space == "cartesian": + self.end_effector_pos = torch.zeros((self.num_envs, 3), device=self.device) + self.end_effector_rot = torch.zeros((self.num_envs, 4), device=self.device) + + def compute_reward(self): + self.rew_buf[:] = -self._computed_distance + + self.reset_buf.fill_(0) + # target reached + self.reset_buf = torch.where(self._computed_distance <= 0.035, torch.ones_like(self.reset_buf), self.reset_buf) + # max episode length + self.reset_buf = torch.where(self.progress_buf >= self.max_episode_length - 1, torch.ones_like(self.reset_buf), self.reset_buf) + + # double restart correction (why?, is it necessary?) + self.rew_buf = torch.where(self.progress_buf == 0, -0.75 * torch.ones_like(self.reset_buf), self.rew_buf) + self.reset_buf = torch.where(self.progress_buf == 0, torch.zeros_like(self.reset_buf), self.reset_buf) + + def compute_observations(self): + self.gym.refresh_dof_state_tensor(self.sim) + self.gym.refresh_actor_root_state_tensor(self.sim) + self.gym.refresh_rigid_body_state_tensor(self.sim) + + if self._control_space == "cartesian": + self.gym.refresh_jacobian_tensors(self.sim) + + robot_dof_pos = self.dof_pos + robot_dof_vel = self.dof_vel + self.end_effector_pos = self.rigid_body_pos[:, self.rigid_body_dict_robot[self._end_effector_link]] + self.end_effector_rot = self.rigid_body_rot[:, self.rigid_body_dict_robot[self._end_effector_link]] + target_pos = self.root_pos[:, 1] + target_rot = self.root_rot[:, 1] + + dof_pos_scaled = 2.0 * (robot_dof_pos - self.robot_dof_lower_limits) \ + / (self.robot_dof_upper_limits - self.robot_dof_lower_limits) - 1.0 + dof_vel_scaled = robot_dof_vel * self._dof_vel_scale + + generalization_noise = torch.rand((dof_vel_scaled.shape[0], 7), device=self.device) + 0.5 + + self.obs_buf[:, 0] = self.progress_buf / self.max_episode_length + self.obs_buf[:, 1:8] = dof_pos_scaled[:, :7] + self.obs_buf[:, 8:15] = dof_vel_scaled[:, :7] * generalization_noise + self.obs_buf[:, 15:18] = target_pos + + # compute distance for compute_reward() + self._computed_distance = torch.norm(self.end_effector_pos - target_pos, dim=-1) + + def reset_idx(self, env_ids): + # reset robot + pos = torch.clamp(self.robot_default_dof_pos.unsqueeze(0) + 0.25 * (torch.rand((len(env_ids), self.num_robot_dofs), device=self.device) - 0.5), + self.robot_dof_lower_limits, self.robot_dof_upper_limits) + pos[:, 7:] = 0 + + self.robot_dof_targets[env_ids, :] = pos[:] + self.dof_pos[env_ids, :] = pos[:] + self.dof_vel[env_ids, :] = 0 + + indexes = self.indexes_sim_robot[env_ids] + self.gym.set_dof_position_target_tensor_indexed(self.sim, + gymtorch.unwrap_tensor(self.robot_dof_targets), + gymtorch.unwrap_tensor(indexes), + len(env_ids)) + + self.gym.set_dof_state_tensor_indexed(self.sim, + gymtorch.unwrap_tensor(self.dof_state), + gymtorch.unwrap_tensor(indexes), + len(env_ids)) + + # reset targets + pos = (torch.rand((len(env_ids), 3), device=self.device) - 0.5) * 2 + pos[:, 0] = 0.50 + pos[:, 0] * 0.25 + pos[:, 1] = 0.00 + pos[:, 1] * 0.25 + pos[:, 2] = 0.20 + pos[:, 2] * 0.10 + + self.root_pos[env_ids, 1, :] = pos[:] + + indexes = self.indexes_sim_target[env_ids] + self.gym.set_actor_root_state_tensor_indexed(self.sim, + gymtorch.unwrap_tensor(self.root_state), + gymtorch.unwrap_tensor(indexes), 
+ len(env_ids)) + + # bookkeeping + self.reset_buf[env_ids] = 0 + self.progress_buf[env_ids] = 0 + + def pre_physics_step(self, actions): + actions = actions.clone().to(self.device) + + if self._control_space == "joint": + targets = self.robot_dof_targets[:, :7] + self.robot_dof_speed_scales[:7] * self.dt * actions * self._action_scale + + elif self._control_space == "cartesian": + goal_position = self.end_effector_pos + actions / 100.0 + delta_dof_pos = isaacgym_utils.ik(jacobian_end_effector=self.jacobian_end_effector, + current_position=self.end_effector_pos, + current_orientation=self.end_effector_rot, + goal_position=goal_position, + goal_orientation=None) + targets = self.robot_dof_targets[:, :7] + delta_dof_pos + + self.robot_dof_targets[:, :7] = torch.clamp(targets, self.robot_dof_lower_limits[:7], self.robot_dof_upper_limits[:7]) + self.gym.set_dof_position_target_tensor(self.sim, gymtorch.unwrap_tensor(self.robot_dof_targets)) + + def post_physics_step(self): + self.progress_buf += 1 + + env_ids = self.reset_buf.nonzero(as_tuple=False).squeeze(-1) + if len(env_ids) > 0: + self.reset_idx(env_ids) + + self.compute_observations() + self.compute_reward() diff --git a/docs/source/examples/real_world/franka_emika_panda/reaching_franka_isaacgym_skrl_eval.py b/docs/source/examples/real_world/franka_emika_panda/reaching_franka_isaacgym_skrl_eval.py new file mode 100644 index 00000000..cf30f686 --- /dev/null +++ b/docs/source/examples/real_world/franka_emika_panda/reaching_franka_isaacgym_skrl_eval.py @@ -0,0 +1,89 @@ +import isaacgym + +import torch +import torch.nn as nn + +# Import the skrl components to build the RL system +from skrl.models.torch import Model, GaussianMixin +from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG +from skrl.resources.preprocessors.torch import RunningStandardScaler +from skrl.trainers.torch import SequentialTrainer +from skrl.envs.torch import wrap_env + + +# Define only the policy for evaluation +class Policy(GaussianMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + clip_log_std=True, min_log_std=-20, max_log_std=2): + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) + + self.net = nn.Sequential(nn.Linear(self.num_observations, 256), + nn.ELU(), + nn.Linear(256, 128), + nn.ELU(), + nn.Linear(128, 64), + nn.ELU(), + nn.Linear(64, self.num_actions)) + self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + def compute(self, states, taken_actions, role): + return self.net(states), self.log_std_parameter + + +# instantiate and configure the task +headless = True # set headless to False for rendering + +from reaching_franka_isaacgym_env import ReachingFrankaTask, TASK_CFG + +TASK_CFG["headless"] = headless +TASK_CFG["env"]["numEnvs"] = 64 +TASK_CFG["env"]["controlSpace"] = "joint" # "joint" or "cartesian" + +env = ReachingFrankaTask(cfg=TASK_CFG) + +# wrap the environment +env = wrap_env(env, "isaacgym-preview4") + +device = env.device + + +# Instantiate the agent's policy. +# PPO requires 2 models, visit its documentation for more details +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models +models_ppo = {} +models_ppo["policy"] = Policy(env.observation_space, env.action_space, device) + + +# Configure and instantiate the agent. 
+# Only modify some of the default configuration, visit its documentation to see all the options +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters +cfg_ppo = PPO_DEFAULT_CONFIG.copy() +cfg_ppo["random_timesteps"] = 0 +cfg_ppo["learning_starts"] = 0 +cfg_ppo["state_preprocessor"] = RunningStandardScaler +cfg_ppo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device} +# logging to TensorBoard each 32 timesteps and ignore checkpoints +cfg_ppo["experiment"]["write_interval"] = 32 +cfg_ppo["experiment"]["checkpoint_interval"] = 0 + +agent = PPO(models=models_ppo, + memory=None, + cfg=cfg_ppo, + observation_space=env.observation_space, + action_space=env.action_space, + device=device) + +# load checkpoints +if TASK_CFG["env"]["controlSpace"] == "joint": + agent.load("./agent_joint_isaacgym.pt") +elif TASK_CFG["env"]["controlSpace"] == "cartesian": + agent.load("./agent_cartesian_isaacgym.pt") + + +# Configure and instantiate the RL trainer +cfg_trainer = {"timesteps": 1000, "headless": True} +trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent) + +# start evaluation +trainer.eval() diff --git a/docs/source/examples/real_world/franka_emika_panda/reaching_franka_isaacgym_skrl_train.py b/docs/source/examples/real_world/franka_emika_panda/reaching_franka_isaacgym_skrl_train.py new file mode 100644 index 00000000..841129aa --- /dev/null +++ b/docs/source/examples/real_world/franka_emika_panda/reaching_franka_isaacgym_skrl_train.py @@ -0,0 +1,130 @@ +import isaacgym + +import torch +import torch.nn as nn + +# Import the skrl components to build the RL system +from skrl.models.torch import Model, GaussianMixin, DeterministicMixin +from skrl.memories.torch import RandomMemory +from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG +from skrl.resources.schedulers.torch import KLAdaptiveRL +from skrl.resources.preprocessors.torch import RunningStandardScaler +from skrl.trainers.torch import SequentialTrainer +from skrl.envs.torch import wrap_env +from skrl.utils import set_seed + + +# set the seed for reproducibility +set_seed(42) + + +# Define the models (stochastic and deterministic models) for the agent using helper mixins. 
+# - Policy: takes as input the environment's observation/state and returns an action +# - Value: takes the state as input and provides a value to guide the policy +class Policy(GaussianMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + clip_log_std=True, min_log_std=-20, max_log_std=2): + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) + + self.net = nn.Sequential(nn.Linear(self.num_observations, 256), + nn.ELU(), + nn.Linear(256, 128), + nn.ELU(), + nn.Linear(128, 64), + nn.ELU(), + nn.Linear(64, self.num_actions)) + self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + def compute(self, states, taken_actions, role): + return self.net(states), self.log_std_parameter + +class Value(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) + + self.net = nn.Sequential(nn.Linear(self.num_observations, 256), + nn.ELU(), + nn.Linear(256, 128), + nn.ELU(), + nn.Linear(128, 64), + nn.ELU(), + nn.Linear(64, 1)) + + def compute(self, states, taken_actions, role): + return self.net(states) + + +# instantiate and configure the task +headless = True # set headless to False for rendering + +from reaching_franka_isaacgym_env import ReachingFrankaTask, TASK_CFG + +TASK_CFG["headless"] = headless +TASK_CFG["env"]["numEnvs"] = 1024 +TASK_CFG["env"]["controlSpace"] = "joint" # "joint" or "cartesian" + +env = ReachingFrankaTask(cfg=TASK_CFG) + +# wrap the environment +env = wrap_env(env, "isaacgym-preview4") + +device = env.device + + +# Instantiate a RandomMemory as rollout buffer (any memory can be used for this) +memory = RandomMemory(memory_size=16, num_envs=env.num_envs, device=device) + + +# Instantiate the agent's models (function approximators). +# PPO requires 2 models, visit its documentation for more details +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models +models_ppo = {} +models_ppo["policy"] = Policy(env.observation_space, env.action_space, device) +models_ppo["value"] = Value(env.observation_space, env.action_space, device) + + +# Configure and instantiate the agent. 
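+# configuration sketch: 16 rollout steps per environment are collected before each update;
+# the batch is then split into 8 mini-batches and iterated over for 8 learning epochs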
+# Only modify some of the default configuration, visit its documentation to see all the options +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters +cfg_ppo = PPO_DEFAULT_CONFIG.copy() +cfg_ppo["rollouts"] = 16 +cfg_ppo["learning_epochs"] = 8 +cfg_ppo["mini_batches"] = 8 +cfg_ppo["discount_factor"] = 0.99 +cfg_ppo["lambda"] = 0.95 +cfg_ppo["learning_rate"] = 5e-4 +cfg_ppo["learning_rate_scheduler"] = KLAdaptiveRL +cfg_ppo["learning_rate_scheduler_kwargs"] = {"kl_threshold": 0.008} +cfg_ppo["random_timesteps"] = 0 +cfg_ppo["learning_starts"] = 0 +cfg_ppo["grad_norm_clip"] = 1.0 +cfg_ppo["ratio_clip"] = 0.2 +cfg_ppo["value_clip"] = 0.2 +cfg_ppo["clip_predicted_values"] = True +cfg_ppo["entropy_loss_scale"] = 0.0 +cfg_ppo["value_loss_scale"] = 2.0 +cfg_ppo["kl_threshold"] = 0 +cfg_ppo["state_preprocessor"] = RunningStandardScaler +cfg_ppo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device} +cfg_ppo["value_preprocessor"] = RunningStandardScaler +cfg_ppo["value_preprocessor_kwargs"] = {"size": 1, "device": device} +# logging to TensorBoard and write checkpoints each 32 and 250 timesteps respectively +cfg_ppo["experiment"]["write_interval"] = 5 +cfg_ppo["experiment"]["checkpoint_interval"] = 250 + +agent = PPO(models=models_ppo, + memory=memory, + cfg=cfg_ppo, + observation_space=env.observation_space, + action_space=env.action_space, + device=device) + + +# Configure and instantiate the RL trainer +cfg_trainer = {"timesteps": 5000, "headless": True} +trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent) + +# start training +trainer.train() From 010deab77b44b80d5ac22902bc6d51e4c788b6ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Sun, 2 Oct 2022 23:57:37 +0200 Subject: [PATCH 104/108] Rename Omniverse Isaac Gym example files for Franka Emika --- ...ranka_sim_env.py => reaching_franka_omniverse_isaacgym_env.py} | 0 ...rl_eval.py => reaching_franka_omniverse_isaacgym_skrl_eval.py} | 0 ..._train.py => reaching_franka_omniverse_isaacgym_skrl_train.py} | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename docs/source/examples/real_world/franka_emika_panda/{reaching_franka_sim_env.py => reaching_franka_omniverse_isaacgym_env.py} (100%) rename docs/source/examples/real_world/franka_emika_panda/{reaching_franka_sim_skrl_eval.py => reaching_franka_omniverse_isaacgym_skrl_eval.py} (100%) rename docs/source/examples/real_world/franka_emika_panda/{reaching_franka_sim_skrl_train.py => reaching_franka_omniverse_isaacgym_skrl_train.py} (100%) diff --git a/docs/source/examples/real_world/franka_emika_panda/reaching_franka_sim_env.py b/docs/source/examples/real_world/franka_emika_panda/reaching_franka_omniverse_isaacgym_env.py similarity index 100% rename from docs/source/examples/real_world/franka_emika_panda/reaching_franka_sim_env.py rename to docs/source/examples/real_world/franka_emika_panda/reaching_franka_omniverse_isaacgym_env.py diff --git a/docs/source/examples/real_world/franka_emika_panda/reaching_franka_sim_skrl_eval.py b/docs/source/examples/real_world/franka_emika_panda/reaching_franka_omniverse_isaacgym_skrl_eval.py similarity index 100% rename from docs/source/examples/real_world/franka_emika_panda/reaching_franka_sim_skrl_eval.py rename to docs/source/examples/real_world/franka_emika_panda/reaching_franka_omniverse_isaacgym_skrl_eval.py diff --git a/docs/source/examples/real_world/franka_emika_panda/reaching_franka_sim_skrl_train.py 
b/docs/source/examples/real_world/franka_emika_panda/reaching_franka_omniverse_isaacgym_skrl_train.py similarity index 100% rename from docs/source/examples/real_world/franka_emika_panda/reaching_franka_sim_skrl_train.py rename to docs/source/examples/real_world/franka_emika_panda/reaching_franka_omniverse_isaacgym_skrl_train.py From 71dc0ef3673ffb9096f63dd81a1119688b9ee0c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Mon, 3 Oct 2022 11:42:35 +0200 Subject: [PATCH 105/108] Add Franka Emika Isaac Gym example to docs --- docs/source/intro/examples.rst | 70 +++++++++++++++++++++++++++++----- 1 file changed, 61 insertions(+), 9 deletions(-) diff --git a/docs/source/intro/examples.rst b/docs/source/intro/examples.rst index 7348ee4d..c15e6c92 100644 --- a/docs/source/intro/examples.rst +++ b/docs/source/intro/examples.rst @@ -942,6 +942,10 @@ These examples show basic real-world use cases to guide and support advanced RL **Main environment configuration:** + .. note:: + + In the joint control space the final control of the robot is performed through the Cartesian pose (forward kinematics from specified values for the joints) + The control space (Cartesian or joint), the robot motion type (waypoint or impedance) and the target position acquisition (command prompt / automatically generated or USB-camera) can be specified in the environment class constructor (from :literal:`reaching_franka_real_skrl_eval.py`) as follow: .. code-block:: python @@ -950,7 +954,7 @@ These examples show basic real-world use cases to guide and support advanced RL motion_type = "waypoint" # waypoint or impedance camera_tracking = False # True for USB-camera tracking - .. tab:: Simulation + .. tab:: Simulation (Omniverse Isaac Gym) .. raw:: html @@ -970,9 +974,9 @@ These examples show basic real-world use cases to guide and support advanced RL **Files** (the implementation is self-contained so no specific location is required): - * Environment: :download:`reaching_franka_sim_env.py <../examples/real_world/franka_emika_panda/reaching_franka_sim_env.py>` - * Training script: :download:`reaching_franka_sim_skrl_train.py <../examples/real_world/franka_emika_panda/reaching_franka_sim_skrl_train.py>` - * Evaluation script: :download:`reaching_franka_sim_skrl_eval.py <../examples/real_world/franka_emika_panda/reaching_franka_sim_skrl_eval.py>` + * Environment: :download:`reaching_franka_omniverse_isaacgym_env.py <../examples/real_world/franka_emika_panda/reaching_franka_omniverse_isaacgym_env.py>` + * Training script: :download:`reaching_franka_omniverse_isaacgym_skrl_train.py <../examples/real_world/franka_emika_panda/reaching_franka_omniverse_isaacgym_skrl_train.py>` + * Evaluation script: :download:`reaching_franka_omniverse_isaacgym_skrl_eval.py <../examples/real_world/franka_emika_panda/reaching_franka_omniverse_isaacgym_skrl_eval.py>` * Checkpoints (:literal:`agent_joint.pt`, :literal:`agent_cartesian.pt`): :download:`trained_checkpoints.zip ` **Training and evaluation:** @@ -980,27 +984,75 @@ These examples show basic real-world use cases to guide and support advanced RL .. code-block:: bash # training (local workstation) - ~/.local/share/ov/pkg/isaac_sim-*/python.sh reaching_franka_sim_skrl_train.py + ~/.local/share/ov/pkg/isaac_sim-*/python.sh reaching_franka_omniverse_isaacgym_skrl_train.py # training (docker container) - /isaac-sim/python.sh reaching_franka_sim_skrl_train.py + /isaac-sim/python.sh reaching_franka_omniverse_isaacgym_skrl_train.py .. 
code-block:: bash # evaluation (local workstation) - ~/.local/share/ov/pkg/isaac_sim-*/python.sh reaching_franka_sim_skrl_eval.py + ~/.local/share/ov/pkg/isaac_sim-*/python.sh reaching_franka_omniverse_isaacgym_skrl_eval.py # evaluation (docker container) - /isaac-sim/python.sh reaching_franka_sim_skrl_eval.py + /isaac-sim/python.sh reaching_franka_omniverse_isaacgym_skrl_eval.py **Main environment configuration:** - The control space (Cartesian or joint) can be specified in the task configuration dictionary (from :literal:`reaching_franka_sim_skrl_train.py`) as follow: + The control space (Cartesian or joint) can be specified in the task configuration dictionary (from :literal:`reaching_franka_omniverse_isaacgym_skrl_train.py`) as follow: .. code-block:: python TASK_CFG["task"]["env"]["controlSpace"] = "joint" # "joint" or "cartesian" + .. tab:: Simulation (Isaac Gym) + + .. raw:: html + + + + .. raw:: html + + + + | + + **Prerequisites:** + + All installation steps described in Isaac Gym's `Installation `_ section must be fulfilled + + **Files** (the implementation is self-contained so no specific location is required): + + * Environment: :download:`reaching_franka_isaacgym_env.py <../examples/real_world/franka_emika_panda/reaching_franka_isaacgym_env.py>` + * Training script: :download:`reaching_franka_isaacgym_skrl_train.py <../examples/real_world/franka_emika_panda/reaching_franka_isaacgym_skrl_train.py>` + * Evaluation script: :download:`reaching_franka_isaacgym_skrl_eval.py <../examples/real_world/franka_emika_panda/reaching_franka_isaacgym_skrl_eval.py>` + + **Training and evaluation:** + + .. note:: + + The checkpoints obtained in Isaac Gym were not evaluated with the real robot. However, they were evaluated in Omniverse Isaac Gym showing successful performance + + .. code-block:: bash + + # training (with the Python virtual environment active) + python reaching_franka_isaacgym_skrl_train.py + + .. code-block:: bash + + # evaluation (with the Python virtual environment active) + python reaching_franka_isaacgym_skrl_eval.py + + **Main environment configuration:** + + The control space (Cartesian or joint) can be specified in the task configuration dictionary (from :literal:`reaching_franka_isaacgym_skrl_train.py`) as follow: + + .. code-block:: python + + TASK_CFG["env"]["controlSpace"] = "joint" # "joint" or "cartesian" + .. _library_utilities: Library utilities (skrl.utils module) From 1d6ee4b00e57f4daf28427c85b070b0f95b97dcf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Mon, 3 Oct 2022 11:45:59 +0200 Subject: [PATCH 106/108] Update CHANGELOG --- CHANGELOG.md | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bc1a765f..2b66df84 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,7 +2,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). -## [0.8.0] - Unreleased +## [0.8.0] - 2022-10-03 ### Added - AMP agent for physics-based character animation - Manual trainer @@ -14,16 +14,21 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). 
- Migrate checkpoints/models from other RL libraries to skrl models/agents - Configuration parameter `store_separately` to agent configuration dict - Save/load agent modules (models, optimizers, preprocessors) +- Set random seed and configure deterministic behavior for reproducibility +- Benchmark results for Isaac Gym and Omniverse Isaac Gym on the GitHub discussion page +- Franka Emika real-world example ### Changed - Models implementation as Python mixin [**breaking change**] - Multivariate Gaussian model (`GaussianModel` until 0.7.0) to `MultivariateGaussianMixin` - Trainer's `cfg` parameter position and default values -- Show training/evaluadion display progress using `tqdm` (by @JohannLange) +- Show training/evaluation display progress using `tqdm` (by @JohannLange) +- Update Isaac Gym and Omniverse Isaac Gym examples ### Fixed - Missing recursive arguments during model weights initialization - Tensor dimension when computing preprocessor parallel variance +- Models' clip tensors dtype to `float32` ### Removed - Parameter `inference` from model methods From 996b8eed5a160338be962d7009dd30be736a3513 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Mon, 3 Oct 2022 11:55:40 +0200 Subject: [PATCH 107/108] Clean the code and add comment for Cartesian-impedance control --- .../reaching_franka_real_env.py | 27 ++++--------------- 1 file changed, 5 insertions(+), 22 deletions(-) diff --git a/docs/source/examples/real_world/franka_emika_panda/reaching_franka_real_env.py b/docs/source/examples/real_world/franka_emika_panda/reaching_franka_real_env.py index d67cfb0b..8afb649d 100644 --- a/docs/source/examples/real_world/franka_emika_panda/reaching_franka_real_env.py +++ b/docs/source/examples/real_world/franka_emika_panda/reaching_franka_real_env.py @@ -17,7 +17,11 @@ def __init__(self, robot_ip="172.16.0.2", device="cuda:0", control_space="joint" self.motion_type = motion_type # waypoint or impedance if self.control_space == "cartesian" and self.motion_type == "impedance": - raise ValueError("Unsafe robot operation in cartesian/impedance configuration") + # The operation of this mode (Cartesian-impedance) was adjusted later without being able to test it on the real robot. + # Dangerous movements may occur for the operator and the robot. + # Comment the following line of code if you want to proceed with this mode. + raise ValueError("See comment in the code to proceed with this mode") + pass # camera tracking (disabled by default) self.camera_tracking = camera_tracking @@ -253,24 +257,3 @@ def render(self, *args, **kwargs): def close(self): pass - - - - -if __name__ == "__main__": - - # test camera capturing - ReachingFranka._update_target_from_camera(None) - exit() - - - env = ReachingFranka() - - observation = env.reset() - for _ in range(100): - observation, reward, done, info = env.step(env.action_space.sample()) - env.render() - if done: - observation = env.reset() - - env.close() From 77029f8c8cc33517c4510e087bd78b3c7887ca8d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Serrano=20Mu=C3=B1oz?= Date: Mon, 3 Oct 2022 12:00:16 +0200 Subject: [PATCH 108/108] Update README --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index f9fa13a8..c4ae3499 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@


SKRL - Reinforcement Learning library


@@ -14,7 +14,7 @@ https://skrl.readthedocs.io/en/latest/
-> **Note:** This project is under **active continuous development**. Please make sure you always have the latest version +> **Note:** This project is under **active continuous development**. Please make sure you always have the latest version. Visit the [develop](https://github.com/Toni-SM/skrl/tree/develop) branch or its [documentation](https://skrl.readthedocs.io/en/develop) to access the latest updates to be released.