feat: support isaac gym interface (#325)
Gaiejj committed May 2, 2024
1 parent 3c9a235 commit 924f74c
Showing 15 changed files with 692 additions and 19 deletions.
5 changes: 3 additions & 2 deletions .pre-commit-config.yaml
@@ -90,12 +90,13 @@ repos:
^examples/|
^tests/|
^setup.py$|
^docs/source/conf.py$|
^omnisafe/envs/classic_control/envs_from_crabs.py$|
^omnisafe/common/control_barrier_function/crabs/models.py$|
^omnisafe/common/control_barrier_function/crabs/optimizers.py$|
^omnisafe/common/control_barrier_function/crabs/utils.py$|
^omnisafe/algorithms/off_policy/crabs.py$
^omnisafe/algorithms/off_policy/crabs.py$|
^omnisafe/utils/isaac_gym_utils.py$|
^docs/source/conf.py$
)
- repo: https://github.com/pycqa/pydocstyle
rev: 6.3.0
2 changes: 1 addition & 1 deletion .pylintrc
@@ -48,7 +48,7 @@ ignore=CVS,.vscode,.history
# ignore-list. The regex matches against paths and can be in Posix or Windows
# format. Because '\' represents the directory delimiter on Windows systems, it
# can't be used as an escape character.
ignore-paths=^examples/$,^tests/$
ignore-paths=^examples/$,^tests/$,^omnisafe/utils/isaac_gym_utils.py$,

# Files or directories matching the regular expression patterns are skipped.
# The regex matches against base names, not paths. The default value ignores
15 changes: 15 additions & 0 deletions README.md
@@ -287,6 +287,21 @@ Here is a list of environments that [Safety-Gymnasium](https://www.safety-gymnas
<td>HalfCheetah, Hopper, Swimmer, Walker2d, Ant, Humanoid</td>
<td>SafetyHumanoidVelocity-v1</td>
</tr>
<tr>
<td rowspan="4">Safe Isaac Gym</td>
<td>OverSafeFinger</td>
<td rowspan="4">ShadowHand</td>
<td rowspan="4">ShadowHandOverSafeFinger</td>
</tr>
<tr>
<td>OverSafeJoint</td>
</tr>
<tr>
<td>CatchOver2UnderarmSafeFinger</td>
</tr>
<tr>
<td>CatchOver2UnderarmSafeJoint</td>
</tr>
</tbody>
</table>
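
The new Safe Isaac Gym tasks plug into the standard OmniSafe training entry point. Below is a minimal, hypothetical sketch, assuming Isaac Gym is installed and the task ID from the table above is registered in OmniSafe:

```python
import omnisafe

# Minimal training sketch (assumption: isaacgym is installed and the
# 'ShadowHandOverSafeFinger' task listed above is registered in OmniSafe).
env_id = 'ShadowHandOverSafeFinger'
agent = omnisafe.Agent('PPO', env_id)
agent.learn()
```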

6 changes: 6 additions & 0 deletions omnisafe/__init__.py
@@ -14,6 +14,12 @@
# ==============================================================================
"""OmniSafe: A comprehensive and reliable benchmark for safe reinforcement learning."""

from contextlib import suppress


with suppress(ImportError):
from isaacgym import gymutil

from omnisafe import algorithms
from omnisafe.algorithms import ALGORITHMS
from omnisafe.algorithms.algo_wrapper import AlgoWrapper as Agent
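
The guarded import above makes Isaac Gym an optional dependency: if `isaacgym` is not installed, the `ImportError` is silently suppressed and OmniSafe loads normally. A standalone sketch of the same pattern; the import-before-torch ordering reflects Isaac Gym's usual requirement and is noted here as an assumption:

```python
from contextlib import suppress

# Optional-dependency import: suppress the ImportError when isaacgym is absent.
with suppress(ImportError):
    from isaacgym import gymutil  # noqa: F401

# Assumption: isaacgym is imported before torch, matching Isaac Gym's usual
# requirement that its modules load ahead of PyTorch.
import torch  # noqa: F401
```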
4 changes: 2 additions & 2 deletions omnisafe/adapter/crabs_adapter.py
@@ -74,13 +74,13 @@ def eval_policy( # pylint: disable=too-many-locals
"""
for _ in range(episode):
ep_ret, ep_cost, ep_len = 0.0, 0.0, 0
obs, _ = self._eval_env.reset()
obs, _ = self._eval_env.reset() # type: ignore
obs = obs.to(self._device)

done = False
while not done:
act = agent.step(obs, deterministic=False)
obs, reward, cost, terminated, truncated, info = self._eval_env.step(act)
obs, reward, cost, terminated, truncated, info = self._eval_env.step(act) # type: ignore
obs, reward, cost, terminated, truncated = (
torch.as_tensor(x, dtype=torch.float32, device=self._device)
for x in (obs, reward, cost, terminated, truncated)
1 change: 1 addition & 0 deletions omnisafe/adapter/offpolicy_adapter.py
@@ -76,6 +76,7 @@ def eval_policy( # pylint: disable=too-many-locals
agent (ConstraintActorCritic): Agent.
logger (Logger): Logger, to log ``EpRet``, ``EpCost``, ``EpLen``.
"""
assert self._eval_env, 'Environment for evaluation has not been set!'
for _ in range(episode):
ep_ret, ep_cost, ep_len = 0.0, 0.0, 0
obs, _ = self._eval_env.reset()
46 changes: 35 additions & 11 deletions omnisafe/adapter/online_adapter.py
@@ -69,14 +69,17 @@ def __init__( # pylint: disable=too-many-arguments
env_cfgs = self._cfgs.env_cfgs.todict()

self._env: CMDP = make(env_id, num_envs=num_envs, device=self._device, **env_cfgs)
self._eval_env: CMDP = make(env_id, num_envs=1, device=self._device, **env_cfgs)

self._wrapper(
obs_normalize=cfgs.algo_cfgs.obs_normalize,
reward_normalize=cfgs.algo_cfgs.reward_normalize,
cost_normalize=cfgs.algo_cfgs.cost_normalize,
)

self._eval_env: CMDP | None = None
if self._env.need_evaluation:
self._eval_env = make(env_id, num_envs=1, device=self._device, **env_cfgs)
self._wrapper_eval(obs_normalize=cfgs.algo_cfgs.obs_normalize)

self._env.set_seed(seed)

def _wrapper(
@@ -116,32 +119,53 @@ def _wrapper(
"""
if self._env.need_time_limit_wrapper:
assert (
self._env.max_episode_steps and self._eval_env.max_episode_steps
self._env.max_episode_steps
), 'You must define max_episode_steps as an integer\
or cancel the use of the time_limit wrapper.'
\nor cancel the use of the time_limit wrapper.'
self._env = TimeLimit(
self._env,
time_limit=self._env.max_episode_steps,
device=self._device,
)
self._eval_env = TimeLimit(
self._eval_env,
time_limit=self._eval_env.max_episode_steps,
device=self._device,
)
if self._env.need_auto_reset_wrapper:
self._env = AutoReset(self._env, device=self._device)
if obs_normalize:
self._env = ObsNormalize(self._env, device=self._device)
self._eval_env = ObsNormalize(self._eval_env, device=self._device)
if reward_normalize:
self._env = RewardNormalize(self._env, device=self._device)
if cost_normalize:
self._env = CostNormalize(self._env, device=self._device)
self._env = ActionScale(self._env, low=-1.0, high=1.0, device=self._device)
self._eval_env = ActionScale(self._eval_env, low=-1.0, high=1.0, device=self._device)
if self._env.num_envs == 1:
self._env = Unsqueeze(self._env, device=self._device)

def _wrapper_eval(
self,
obs_normalize: bool = True,
) -> None:
"""Wrapper the environment for evaluation.
Args:
obs_normalize (bool, optional): Whether to normalize the observation. Defaults to True.
reward_normalize (bool, optional): Whether to normalize the reward. Defaults to True.
cost_normalize (bool, optional): Whether to normalize the cost. Defaults to True.
"""
assert self._eval_env, 'Your environment for evaluation does not exist!'
if self._env.need_time_limit_wrapper:
assert (
self._eval_env.max_episode_steps
), 'You must define max_episode_steps as an\
\ninteger or cancel the use of the time_limit wrapper.'
self._eval_env = TimeLimit(
self._eval_env,
time_limit=self._eval_env.max_episode_steps,
device=self._device,
)
if self._env.need_auto_reset_wrapper:
self._eval_env = AutoReset(self._eval_env, device=self._device)
if obs_normalize:
self._eval_env = ObsNormalize(self._eval_env, device=self._device)
self._eval_env = ActionScale(self._eval_env, low=-1.0, high=1.0, device=self._device)
self._eval_env = Unsqueeze(self._eval_env, device=self._device)

@property
11 changes: 8 additions & 3 deletions omnisafe/adapter/onpolicy_adapter.py
@@ -103,15 +103,20 @@ def rollout( # pylint: disable=too-many-locals

obs = next_obs
epoch_end = step >= steps_per_epoch - 1
if epoch_end:
num_dones = int(terminated.contiguous().sum())
if self._env.num_envs - num_dones:
logger.log(
f'\nWarning: trajectory cut off when rollout by epoch\
in {self._env.num_envs - num_dones} of {self._env.num_envs} environments.',
)

for idx, (done, time_out) in enumerate(zip(terminated, truncated)):
if epoch_end or done or time_out:
last_value_r = torch.zeros(1)
last_value_c = torch.zeros(1)
if not done:
if epoch_end:
logger.log(
f'Warning: trajectory cut off when rollout by epoch at {self._ep_len[idx]} steps.',
)
_, last_value_r, last_value_c, _ = agent.step(obs[idx])
if time_out:
_, last_value_r, last_value_c, _ = agent.step(
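
For clarity, a tiny standalone illustration (with hypothetical flag values) of what the epoch-end bookkeeping above computes across vectorized environments:

```python
import torch

# Hypothetical termination flags for 4 vectorized environments at the epoch
# boundary: two finished their episodes, two were cut off mid-trajectory.
terminated = torch.tensor([1.0, 0.0, 1.0, 0.0])
num_envs = terminated.numel()

num_dones = int(terminated.contiguous().sum())  # 2 environments finished
cut_off = num_envs - num_dones                  # 2 trajectories cut off
if cut_off:
    print(f'Warning: trajectory cut off in {cut_off} of {num_envs} environments.')
```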
156 changes: 156 additions & 0 deletions omnisafe/configs/on-policy/PPO.yaml
@@ -118,3 +118,159 @@ defaults:
activation: tanh
# learning rate
lr: 0.0003

ShadowHandCatchOver2UnderarmSafeFinger:
# training configurations
train_cfgs:
# number of vectorized environments
vector_env_nums: 256
# total number of steps to train
total_steps: 100000000
# algorithm configurations
algo_cfgs:
# number of steps to update the policy
steps_per_epoch: 38400
# number of iterations to update the policy
update_iters: 8
# batch size for each iteration
batch_size: 8192
# target kl divergence
target_kl: 0.016
# max gradient norm
max_grad_norm: 1.0
# use critic norm
use_critic_norm: False
# reward discount factor
gamma: 0.96
# normalize reward
reward_normalize: False
# normalize cost
cost_normalize: False
# normalize observation
obs_normalize: False
# model configurations
model_cfgs:
# actor network configurations
actor:
# hidden layer sizes
hidden_sizes: [1024, 1024, 512]
critic:
# hidden layer sizes
hidden_sizes: [1024, 1024, 512]
# learning rate
lr: 0.0006

ShadowHandOverSafeFinger:
# training configurations
train_cfgs:
# number of vectorized environments
vector_env_nums: 256
# total number of steps to train
total_steps: 100000000
# algorithm configurations
algo_cfgs:
# number of steps to update the policy
steps_per_epoch: 38400
# number of iterations to update the policy
update_iters: 8
# batch size for each iteration
batch_size: 8192
# target kl divergence
target_kl: 0.016
# max gradient norm
max_grad_norm: 1.0
# use critic norm
use_critic_norm: False
# reward discount factor
gamma: 0.96
# normalize observation
obs_normalize: False
# model configurations
model_cfgs:
# actor network configurations
actor:
# hidden layer sizes
hidden_sizes: [1024, 1024, 512]
critic:
# hidden layer sizes
hidden_sizes: [1024, 1024, 512]
# learning rate
lr: 0.0006

ShadowHandCatchOver2UnderarmSafeJoint:
# training configurations
train_cfgs:
# number of vectorized environments
vector_env_nums: 256
# total number of steps to train
total_steps: 100000000
# algorithm configurations
algo_cfgs:
# number of steps to update the policy
steps_per_epoch: 38400
# number of iterations to update the policy
update_iters: 8
# batch size for each iteration
batch_size: 8192
# target kl divergence
target_kl: 0.016
# max gradient norm
max_grad_norm: 1.0
# use critic norm
use_critic_norm: False
# reward discount factor
gamma: 0.96
# normalize reward
reward_normalize: False
# normalize cost
cost_normalize: False
# normalize observation
obs_normalize: False
# model configurations
model_cfgs:
# actor network configurations
actor:
# hidden layer sizes
hidden_sizes: [1024, 1024, 512]
critic:
# hidden layer sizes
hidden_sizes: [1024, 1024, 512]
# learning rate
lr: 0.0006

ShadowHandOverSafeJoint:
# training configurations
train_cfgs:
# number of vectorized environments
vector_env_nums: 256
# total number of steps to train
total_steps: 100000000
# algorithm configurations
algo_cfgs:
# number of steps to update the policy
steps_per_epoch: 38400
# number of iterations to update the policy
update_iters: 8
# batch size for each iteration
batch_size: 8192
# target kl divergence
target_kl: 0.016
# max gradient norm
max_grad_norm: 1.0
# use critic norm
use_critic_norm: False
# reward discount factor
gamma: 0.96
# normalize observation
obs_normalize: False
# model configurations
model_cfgs:
# actor network configurations
actor:
# hidden layer sizes
hidden_sizes: [1024, 1024, 512]
critic:
# hidden layer sizes
hidden_sizes: [1024, 1024, 512]
# learning rate
lr: 0.0006
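
These per-task defaults can be overridden at call time through OmniSafe's `custom_cfgs` mechanism; a brief sketch, with purely illustrative values rather than recommended settings:

```python
import omnisafe

# Hypothetical override of the Safe Isaac Gym PPO defaults above; the keys
# mirror the YAML structure (train_cfgs, algo_cfgs), the values are examples.
custom_cfgs = {
    'train_cfgs': {'total_steps': 1_000_000, 'vector_env_nums': 64},
    'algo_cfgs': {'steps_per_epoch': 9600, 'update_iters': 8},
}
agent = omnisafe.Agent('PPO', 'ShadowHandCatchOver2UnderarmSafeFinger', custom_cfgs=custom_cfgs)
agent.learn()
```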