From 51818709f1bef340f1abc67ce3f6d096c75338c3 Mon Sep 17 00:00:00 2001 From: Jiayi Zhou <108712610+Gaiejj@users.noreply.github.com> Date: Tue, 3 Sep 2024 23:47:50 +0800 Subject: [PATCH] docs: update appendix (#350) --- .github/workflows/lint.yml | 4 - docs/source/benchmark/case-study.md | 54 + docs/source/benchmark/modelbased.md | 223 ++ docs/source/benchmark/off-policy.md | 938 ++++++ docs/source/benchmark/offline.md | 275 ++ docs/source/benchmark/on-policy.md | 2841 +++++++++++++++++ docs/source/index.rst | 15 + docs/source/spelling_wordlist.txt | 11 + docs/source/start/algo.md | 111 + docs/source/start/efficiency.rst | 60 + docs/source/start/exp-grid.md | 31 + docs/source/start/features.md | 257 ++ omnisafe/adapter/modelbased_adapter.py | 2 +- omnisafe/common/logger.py | 4 +- omnisafe/common/offline/data_collector.py | 2 +- .../envs/classic_control/envs_from_crabs.py | 2 +- omnisafe/envs/safety_gymnasium_modelbased.py | 9 +- omnisafe/evaluator.py | 2 +- omnisafe/utils/plotter.py | 3 +- pyproject.toml | 20 +- 20 files changed, 4842 insertions(+), 22 deletions(-) create mode 100644 docs/source/benchmark/case-study.md create mode 100644 docs/source/benchmark/modelbased.md create mode 100644 docs/source/benchmark/off-policy.md create mode 100644 docs/source/benchmark/offline.md create mode 100644 docs/source/benchmark/on-policy.md create mode 100644 docs/source/start/algo.md create mode 100644 docs/source/start/efficiency.rst create mode 100644 docs/source/start/exp-grid.md create mode 100644 docs/source/start/features.md diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 80db45035..e6f178cb0 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -45,10 +45,6 @@ jobs: run: | make pre-commit - - name: ruff - run: | - make ruff - - name: flake8 run: | make flake8 diff --git a/docs/source/benchmark/case-study.md b/docs/source/benchmark/case-study.md new file mode 100644 index 000000000..28f93ba72 --- /dev/null +++ b/docs/source/benchmark/case-study.md @@ -0,0 +1,54 @@ +# Case Study + +One important motivation for SafeRL is to enable agents to explore and +learn safely. Therefore, evaluating algorithm performance concerning +*procedural constraint violations* is also important. We have selected +representative experimental results and report as shown in Figure 1 and Figure 2: + +#### Radical vs. Conservative + +*Radical* policies often explore higher rewards but violate more safety +constraints, whereas *Conservative* policies are the opposite. +Figure 1 illustrates this: during training, CPO and +PPOLag consistently pursue the highest rewards among all algorithms, as +depicted in the first row. However, as shown in the second row, they +experience significant fluctuations in constraint violations, especially +for PPOLag. So, they are relatively radical, *i.e.,* higher rewards but +higher costs. In comparison, while P3O achieves slightly lower rewards +than PPOLag, it maintains fewer oscillations in constraint violations, +making it safer in adhering to safety constraints, evident from the +smaller proportion of its distribution crossing the black dashed line. A +similar pattern is also observed when comparing PCPO with CPO. +Therefore, P3O and PCPO are relatively conservative, *i.e.,* lower costs +but lower rewards. + + +
+ +**Figure 1:** PPOLag, P3O, CPO, and PCPO training on four tasks in for 1e7 steps, showing the distribution of all episodic rewards and costs. All data covers over 5 random seeds and filters out data points over 3 standard deviations. The black dashed line in the graph represents the preset `cost_limit`. + + +#### Oscillation vs. Stability + +The oscillations in the degree of constraint violations during the +training process can indicate the performance of SafeRL algorithms. +These oscillations are quantified by *Extremes*, *i.e.,* the maximum +constraint violation, and *Distributions*, *i.e.,* the frequency of +violations remaining below a predefined `cost_limit`. As shown in +Figure 2, PPOLag, a popular baseline in SafeRL, +utilizes the Lagrangian multiplier for constraint handling. Despite its +simplicity and ease of implementation, PPOLag often suffers from +significant oscillations due to challenges in setting appropriate +initial values and learning rates. It consistently seeks higher rewards +but always leads to larger extremes and unsafe distributions. +Conversely, CPPOPID, which employs a PID controller for updating the +Lagrangian multiplier, markedly reduces these extremes. CUP implements a +two-stage projection method that constrains violations' distribution +below the `cost_limit`. Lastly, PPOSaute integrates state observations +with constraints, resulting in smaller extremes and safer distributions +of violations. + + +
+ +**Figure 2:** PPOLag, CPPOPID, CUP, and PPOSaute trained on four tasks in for all 1e7 steps, showing the distribution of all episodic rewards and costs. All data covers over 5 random seeds and filters out data points over 3 standard deviations. The black dashed line in the graph represents the preset `cost_limit`. diff --git a/docs/source/benchmark/modelbased.md b/docs/source/benchmark/modelbased.md new file mode 100644 index 000000000..abbc1fce2 --- /dev/null +++ b/docs/source/benchmark/modelbased.md @@ -0,0 +1,223 @@ +# Model-based Algorithms + +The OmniSafe Navigation Benchmark for model-based algorithms evaluates the effectiveness of OmniSafe's model-based algorithms across two different environments from the [Safety-Gymnasium](https://github.com/PKU-Alignment/safety-gymnasium) task suite. For each supported algorithm and environment, we offer the following: + +- Default hyperparameters used for the benchmark and scripts that enable result replication. +- Graphs and raw data that can be utilized for research purposes. +- Detailed logs obtained during training. + +Supported algorithms are listed below: + +- **[NeurIPS 2001]** [Deep Reinforcement Learning in a Handful of Trials using Probabilistic Dynamics Models (PETS))](https://arxiv.org/abs/1805.12114) +- **[CoRL 2021]** [Learning Off-Policy with Online Planning (LOOP and SafeLOOP)](https://arxiv.org/abs/2008.10066) +- **[AAAI 2022]** [Conservative and Adaptive Penalty for Model-Based Safe Reinforcement Learning (CAP)](https://arxiv.org/abs/2112.07701) +- **[ICML 2022 Workshop]** [Constrained Model-based Reinforcement Learning with Robust Cross-Entropy Method (RCE)](https://arxiv.org/abs/2010.07968) +- **[NeurIPS 2018]** [Constrained Cross-Entropy Method for Safe Reinforcement Learning (CCE)](https://proceedings.neurips.cc/paper/2018/hash/34ffeb359a192eb8174b6854643cc046-Abstract.html) + +## Safety-Gymnasium + +We highly recommend using **Safety-Gymnasium** to run the following experiments. To install, in a linux machine, type: + +```bash +pip install safety_gymnasium +``` + +## Run the Benchmark + +You can set the main function of ``examples/benchmarks/experiment_grid.py`` as: + +```python +if __name__ == '__main__': + eg = ExperimentGrid(exp_name='Model-Based-Benchmarks') + + # set up the algorithms. + model_based_base_policy = ['LOOP', 'PETS'] + model_based_safe_policy = ['SafeLOOP', 'CCEPETS', 'CAPPETS', 'RCEPETS'] + eg.add('algo', model_based_base_policy + model_based_safe_policy) + + # you can use wandb to monitor the experiment. + eg.add('logger_cfgs:use_wandb', [False]) + # you can use tensorboard to monitor the experiment. + eg.add('logger_cfgs:use_tensorboard', [True]) + eg.add('train_cfgs:total_steps', [1000000]) + + # set up the environment. 
+ eg.add('env_id', [ + 'SafetyPointGoal1-v0-modelbased', + 'SafetyCarGoal1-v0-modelbased', + ]) + eg.add('seed', [0, 5, 10, 15, 20]) + + # total experiment num must can be divided by num_pool + # meanwhile, users should decide this value according to their machine + eg.run(train, num_pool=5) +``` + +After that, you can run the following command to run the benchmark: + +```bash +cd examples/benchmarks +python run_experiment_grid.py +``` + +You can set the path of ``examples/benchmarks/experiment_grid.py`` : +example: + +```python +path ='omnisafe/examples/benchmarks/exp-x/Model-Based-Benchmarks' +``` + +You can also plot the results by running the following command: + +```bash +cd examples +python analyze_experiment_results.py +``` + +**For a detailed usage of OmniSafe statistics tool, please refer to [this tutorial](https://omnisafe.readthedocs.io/en/latest/common/stastics_tool.html).** + +## OmniSafe Benchmark + +To demonstrate the high reliability of the algorithms implemented, OmniSafe offers performance insights within the Safety-Gymnasium environment. It should be noted that all data is procured under the constraint of `cost_limit=1.00`. The results are presented in Table 1 and Figure 1. + +### Performance Table + + + + + + + + + +
| Environment | PETS Reward | PETS Cost | LOOP Reward | LOOP Cost | SafeLOOP Reward | SafeLOOP Cost |
|---|---|---|---|---|---|---|
| SafetyCarGoal1-v0 | 33.07 ± 1.33 | 61.20 ± 7.23 | 25.41 ± 1.23 | 62.64 ± 8.34 | 22.09 ± 0.30 | 0.16 ± 0.15 |
| SafetyPointGoal1-v0 | 27.66 ± 0.07 | 49.16 ± 2.69 | 25.08 ± 1.47 | 55.23 ± 2.64 | 22.94 ± 0.72 | 0.04 ± 0.07 |

| Environment | CCEPETS Reward | CCEPETS Cost | RCEPETS Reward | RCEPETS Cost | CAPPETS Reward | CAPPETS Cost |
|---|---|---|---|---|---|---|
| SafetyCarGoal1-v0 | 27.60 ± 1.21 | 1.03 ± 0.29 | 29.08 ± 1.63 | 1.02 ± 0.88 | 23.33 ± 6.34 | 0.48 ± 0.17 |
| SafetyPointGoal1-v0 | 24.98 ± 0.05 | 1.87 ± 1.27 | 25.39 ± 0.28 | 2.46 ± 0.58 | 9.45 ± 8.62 | 0.64 ± 0.77 |

**Table 1:** Reward and cost of OmniSafe's model-based algorithms in the Safety-Gymnasium environments. All model-based algorithms were evaluated after 1e6 training steps.
### Performance Curves

Training-curve plots are provided for SafetyCarGoal1-v0 and SafetyPointGoal1-v0.

**Figure 1:** Training curves in Safety-Gymnasium environments, covering the base and safe model-based algorithms mentioned in Table 1.
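The safe model-based algorithms above (SafeLOOP, CCEPETS, RCEPETS, CAPPETS) all plan through a learned dynamics model while filtering or penalizing action sequences whose predicted cost exceeds the budget. The snippet below is a minimal, self-contained sketch of constrained cross-entropy planning; the `rollout` stand-in and all hyperparameters are illustrative assumptions, not OmniSafe's implementation.

```python
import numpy as np


def rollout(model, obs, action_seq):
    """Stand-in for a learned dynamics/cost model: returns (total_reward, total_cost).

    A real planner would unroll a probabilistic ensemble here; this placeholder
    keeps the sketch self-contained and runnable.
    """
    rng = np.random.default_rng(abs(hash(action_seq.tobytes())) % (2**32))
    reward = rng.normal(action_seq.sum(), 1.0)
    cost = abs(rng.normal(0.5 * np.abs(action_seq).sum(), 1.0))
    return reward, cost


def constrained_cem_plan(model, obs, horizon=5, act_dim=2, pop=64, elites=8,
                         iters=4, cost_limit=1.0):
    """Constrained cross-entropy planning: prefer feasible candidates, refit the
    sampling distribution to the elites, and return the first planned action."""
    mean = np.zeros((horizon, act_dim))
    std = np.ones((horizon, act_dim))
    for _ in range(iters):
        samples = mean + std * np.random.randn(pop, horizon, act_dim)
        scores = np.array([rollout(model, obs, a) for a in samples])  # shape (pop, 2)
        rewards, costs = scores[:, 0], scores[:, 1]
        feasible = costs <= cost_limit
        if feasible.sum() >= elites:
            # rank feasible candidates by predicted reward
            idx = np.flatnonzero(feasible)[np.argsort(-rewards[feasible])[:elites]]
        else:
            # not enough feasible candidates: fall back to the lowest-cost ones
            idx = np.argsort(costs)[:elites]
        elite = samples[idx]
        mean, std = elite.mean(axis=0), elite.std(axis=0) + 1e-6
    return mean[0]  # first action of the planned sequence


if __name__ == '__main__':
    print(constrained_cem_plan(model=None, obs=None))
```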

diff --git a/docs/source/benchmark/off-policy.md b/docs/source/benchmark/off-policy.md new file mode 100644 index 000000000..077f1a3d5 --- /dev/null +++ b/docs/source/benchmark/off-policy.md @@ -0,0 +1,938 @@ +# Off-Policy Algorithms + +The OmniSafe Safety-Gymnasium Benchmark for off-policy algorithms evaluates the effectiveness of OmniSafe's off-policy algorithms across multiple environments from the [Safety-Gymnasium](https://github.com/PKU-Alignment/safety-gymnasium) task suite. For each supported algorithm and environment, we offer the following: + +- Default hyperparameters used for the benchmark and scripts that enable result replication. +- Performance comparison with other open-source implementations. +- Graphs and raw data that can be utilized for research purposes. +- Detailed logs obtained during training. + +Supported algorithms are listed below: + +- **[ICLR 2016]** [Deep Deterministic Policy Gradient (DDPG)](https://arxiv.org/pdf/1509.02971.pdf) +- **[ICML 2018]** [Twin Delayed DDPG (TD3)](https://arxiv.org/pdf/1802.09477.pdf) +- **[ICML 2018]** [Soft Actor-Critic (SAC)](https://arxiv.org/pdf/1812.05905.pdf) +- **[Preprint 2019][[1]](#footnote1)** [The Lagrangian version of DDPG (DDPGLag)](https://cdn.openai.com/safexp-short.pdf) +- **[Preprint 2019][[1]](#footnote1)** [The Lagrangian version of TD3 (TD3Lag)](https://cdn.openai.com/safexp-short.pdf) +- **[Preprint 2019][[1]](#footnote1)** [The Lagrangian version of SAC (SACLag)](https://cdn.openai.com/safexp-short.pdf) +- **[ICML 2020]** [Responsive Safety in Reinforcement Learning by PID Lagrangian Methods (DDPGPID)](https://arxiv.org/abs/2007.03964) +- **[ICML 2020]** [Responsive Safety in Reinforcement Learning by PID Lagrangian Methods (TD3PID)](https://arxiv.org/abs/2007.03964) +- **[ICML 2020]** [Responsive Safety in Reinforcement Learning by PID Lagrangian Methods (SACPID)](https://arxiv.org/abs/2007.03964) + +## Safety-Gymnasium + +We highly recommend using **Safety-Gymnasium** to run the following experiments. To install, in a linux machine, type: + +```bash +pip install safety_gymnasium +``` + +## Run the Benchmark +You can set the main function of `examples/benchmarks/experiment_grid.py` as: + +```python +if __name__ == '__main__': + eg = ExperimentGrid(exp_name='Off-Policy-Benchmarks') + + # set up the algorithms. + off_policy = ['DDPG', 'SAC', 'TD3', 'DDPGLag', 'TD3Lag', 'SACLag', 'DDPGPID', 'TD3PID', 'SACPID'] + eg.add('algo', off_policy) + + # you can use wandb to monitor the experiment. + eg.add('logger_cfgs:use_wandb', [False]) + # you can use tensorboard to monitor the experiment. + eg.add('logger_cfgs:use_tensorboard', [True]) + + # the default configs here are as follows: + # eg.add('algo_cfgs:steps_per_epoch', [2000]) + # eg.add('train_cfgs:total_steps', [2000 * 500]) + # which can reproduce results of 1e6 steps. + + # if you want to reproduce results of 3e6 steps, using + # eg.add('algo_cfgs:steps_per_epoch', [2000]) + # eg.add('train_cfgs:total_steps', [2000 * 1500]) + + # set the device. + avaliable_gpus = list(range(torch.cuda.device_count())) + gpu_id = [0, 1, 2, 3] + # if you want to use CPU, please set gpu_id = None + # gpu_id = None + + if gpu_id and not set(gpu_id).issubset(avaliable_gpus): + warnings.warn('The GPU ID is not available, use CPU instead.', stacklevel=1) + gpu_id = None + + # set up the environments. 
+ eg.add('env_id', [ + 'SafetyHopper', + 'SafetyWalker2d', + 'SafetySwimmer', + 'SafetyAnt', + 'SafetyHalfCheetah', + 'SafetyHumanoid' + ]) + eg.add('seed', [0, 5, 10, 15, 20]) + eg.run(train, num_pool=5, gpu_id=gpu_id) +``` + +After that, you can run the following command to run the benchmark: + +```bash +cd examples/benchmarks +python run_experiment_grid.py +``` + +You can also plot the results by running the following command: + +```bash +cd examples +python analyze_experiment_results.py +``` + +**For a detailed usage of OmniSafe statistics tool, please refer to [this tutorial](https://omnisafe.readthedocs.io/en/latest/common/stastics_tool.html).** + +Logs are saved in `examples/benchmarks/exp-x` and can be monitored with tensorboard or wandb. + +```bash +tensorboard --logdir examples/benchmarks/exp-x +``` + +After the experiment is finished, you can use the following command to generate the video of the trained agent: + +```bash +cd examples +python evaluate_saved_policy.py +``` +Please note that before you evaluate, please set the `LOG_DIR` in `evaluate_saved_policy.py`. + +For example, if I train `DDPG` in `SafetyHumanoid` + +```python +LOG_DIR = '~/omnisafe/examples/runs/DDPG-/seed-000' +play = True +save_replay = True +if __name__ == '__main__': + evaluator = omnisafe.Evaluator(play=play, save_replay=save_replay) + for item in os.scandir(os.path.join(LOG_DIR, 'torch_save')): + if item.is_file() and item.name.split('.')[-1] == 'pt': + evaluator.load_saved( + save_dir=LOG_DIR, model_name=item.name, camera_name='track', width=256, height=256 + ) + evaluator.render(num_episodes=1) + evaluator.evaluate(num_episodes=1) +``` + +## OmniSafe Benchmark + +### Classic Reinforcement Learning Algorithms + +In an effort to ascertain the credibility of OmniSafe’s algorithmic implementation, a comparative assessment was conducted, juxtaposing the performance of classical reinforcement +learning algorithms, such as DDPG, TD3 and SAC. The performance table is provided in Table 1, with +well-established open-source implementations, specifically [Tianshou](https://github.com/thu-ml/tianshou) and +[Stable-Baselines3](https://github.com/DLR-RM/stable-baselines3). + + + + + + + + + + +
| Environment | DDPG (OmniSafe) | DDPG (Tianshou) | DDPG (SB3) | TD3 (OmniSafe) | TD3 (Tianshou) | TD3 (SB3) | SAC (OmniSafe) | SAC (Tianshou) | SAC (SB3) |
|---|---|---|---|---|---|---|---|---|---|
| SafetyAntVelocity-v1 | 860.86 ± 198.03 | 308.60 ± 318.60 | 2654.58 ± 1738.21 | 5246.86 ± 580.50 | 5379.55 ± 224.69 | 3079.45 ± 1456.81 | 5456.31 ± 156.04 | 6012.30 ± 102.64 | 2404.50 ± 1152.65 |
| SafetyHalfCheetahVelocity-v1 | 11377.10 ± 75.29 | 12493.55 ± 437.54 | 7796.63 ± 3541.64 | 11246.12 ± 488.62 | 10246.77 ± 908.39 | 8631.27 ± 2869.15 | 11488.86 ± 513.09 | 12083.89 ± 564.51 | 7767.74 ± 3159.07 |
| SafetyHopperVelocity-v1 | 1462.56 ± 591.14 | 2018.97 ± 1045.20 | 2214.06 ± 1219.57 | 3404.41 ± 82.57 | 2682.53 ± 1004.84 | 2542.67 ± 1253.33 | 3597.70 ± 32.23 | 3546.59 ± 76.00 | 2158.54 ± 1343.24 |
| SafetyHumanoidVelocity-v1 | 1537.39 ± 335.62 | 124.96 ± 61.68 | 2276.92 ± 2299.68 | 5798.01 ± 160.72 | 3838.06 ± 1832.90 | 3511.06 ± 2214.12 | 6039.77 ± 167.82 | 5424.55 ± 118.52 | 2713.60 ± 2256.89 |
| SafetySwimmerVelocity-v1 | 139.39 ± 11.74 | 138.98 ± 8.60 | 210.40 ± 148.01 | 98.39 ± 32.28 | 94.43 ± 9.63 | 247.09 ± 131.69 | 46.44 ± 1.23 | 44.34 ± 2.01 | 247.33 ± 122.02 |
| SafetyWalker2dVelocity-v1 | 1911.70 ± 395.97 | 543.23 ± 316.10 | 3917.46 ± 1077.38 | 3034.83 ± 1374.72 | 4267.05 ± 678.65 | 4087.94 ± 755.10 | 4419.29 ± 232.06 | 4619.34 ± 274.43 | 3906.78 ± 795.48 |

**Table 1:** Performance of OmniSafe evaluated against published baselines, Tianshou and Stable-Baselines3 (SB3), in the Safety-Gymnasium environments. Each entry reports the mean and standard deviation over 10 evaluation episodes across multiple random seeds. Note that Stable-Baselines3 uses hyperparameters tuned per environment, whereas OmniSafe keeps a single parameter set across all environments.
+ +### Safe Reinforcement Learning Algorithms + +To demonstrate the high reliability of the algorithms implemented, OmniSafe offers performance insights within the Safety-Gymnasium environment. It should be noted that all data is procured under the constraint of `cost_limit=25.00`. The results are presented in Table 2, Figure 1, Figure 2, Figure 3. + +#### Performance Table + +
**DDPG, TD3, and SAC**

| Environment | DDPG Reward | DDPG Cost | TD3 Reward | TD3 Cost | SAC Reward | SAC Cost |
|---|---|---|---|---|---|---|
| SafetyAntVelocity-v1 | 860.86 ± 198.03 | 234.80 ± 40.63 | 5246.86 ± 580.50 | 912.90 ± 93.73 | 5456.31 ± 156.04 | 943.10 ± 47.51 |
| SafetyHalfCheetahVelocity-v1 | 11377.10 ± 75.29 | 980.93 ± 1.05 | 11246.12 ± 488.62 | 981.27 ± 0.31 | 11488.86 ± 513.09 | 981.93 ± 0.33 |
| SafetyHopperVelocity-v1 | 1462.56 ± 591.14 | 429.17 ± 220.05 | 3404.41 ± 82.57 | 973.80 ± 4.92 | 3537.70 ± 32.23 | 975.23 ± 2.39 |
| SafetyHumanoidVelocity-v1 | 1537.39 ± 335.62 | 48.79 ± 13.06 | 5798.01 ± 160.72 | 255.43 ± 437.13 | 6039.77 ± 167.82 | 41.42 ± 49.78 |
| SafetySwimmerVelocity-v1 | 139.39 ± 11.74 | 200.53 ± 43.28 | 98.39 ± 32.28 | 115.27 ± 44.90 | 46.44 ± 1.23 | 40.97 ± 0.47 |
| SafetyWalker2dVelocity-v1 | 1911.70 ± 395.97 | 318.10 ± 71.03 | 3034.83 ± 1374.72 | 606.47 ± 337.33 | 4419.29 ± 232.06 | 877.70 ± 8.95 |
| SafetyCarCircle1-v0 | 44.64 ± 2.15 | 371.93 ± 38.75 | 44.57 ± 2.71 | 383.37 ± 62.03 | 43.46 ± 4.39 | 406.87 ± 78.78 |
| SafetyCarGoal1-v0 | 36.99 ± 1.66 | 57.13 ± 38.40 | 36.26 ± 2.35 | 69.70 ± 52.18 | 35.71 ± 2.24 | 54.73 ± 46.74 |
| SafetyPointCircle1-v0 | 113.67 ± 1.33 | 421.53 ± 142.66 | 115.15 ± 2.24 | 391.07 ± 38.34 | 115.06 ± 2.04 | 403.43 ± 44.78 |
| SafetyPointGoal1-v0 | 25.55 ± 2.62 | 41.60 ± 37.17 | 27.28 ± 1.21 | 51.43 ± 33.05 | 27.04 ± 1.49 | 67.57 ± 32.13 |

**DDPGLag, TD3Lag, and SACLag**

| Environment | DDPGLag Reward | DDPGLag Cost | TD3Lag Reward | TD3Lag Cost | SACLag Reward | SACLag Cost |
|---|---|---|---|---|---|---|
| SafetyAntVelocity-v1 | 1271.48 ± 581.71 | 33.27 ± 13.34 | 1944.38 ± 759.20 | 63.27 ± 46.89 | 1897.32 ± 1213.74 | 5.73 ± 7.83 |
| SafetyHalfCheetahVelocity-v1 | 2743.06 ± 21.77 | 0.33 ± 0.12 | 2741.08 ± 49.13 | 10.47 ± 14.45 | 2833.72 ± 3.62 | 0.00 ± 0.00 |
| SafetyHopperVelocity-v1 | 1093.25 ± 81.55 | 15.00 ± 21.21 | 928.79 ± 389.48 | 40.67 ± 30.99 | 963.49 ± 291.64 | 20.23 ± 28.47 |
| SafetyHumanoidVelocity-v1 | 2059.96 ± 485.68 | 19.71 ± 4.05 | 5751.99 ± 157.28 | 10.71 ± 23.60 | 5940.04 ± 121.93 | 17.59 ± 6.24 |
| SafetySwimmerVelocity-v1 | 13.18 ± 20.31 | 28.27 ± 32.27 | 15.58 ± 16.97 | 13.27 ± 17.64 | 11.03 ± 11.17 | 22.70 ± 32.10 |
| SafetyWalker2dVelocity-v1 | 2238.92 ± 400.67 | 33.43 ± 20.08 | 2996.21 ± 74.40 | 22.50 ± 16.97 | 2676.47 ± 300.43 | 30.67 ± 32.30 |
| SafetyCarCircle1-v0 | 33.29 ± 6.55 | 20.67 ± 28.48 | 34.38 ± 1.55 | 2.25 ± 3.90 | 31.42 ± 11.67 | 22.33 ± 26.16 |
| SafetyCarGoal1-v0 | 22.80 ± 8.75 | 17.33 ± 21.40 | 7.31 ± 5.34 | 33.83 ± 31.03 | 10.83 ± 11.29 | 22.67 ± 28.91 |
| SafetyPointCircle1-v0 | 70.71 ± 13.61 | 22.00 ± 32.80 | 83.07 ± 3.49 | 7.83 ± 15.79 | 83.68 ± 3.32 | 12.83 ± 19.53 |
| SafetyPointGoal1-v0 | 17.17 ± 10.03 | 20.33 ± 31.59 | 25.27 ± 2.74 | 28.00 ± 15.75 | 21.45 ± 6.97 | 19.17 ± 9.72 |

**DDPGPID, TD3PID, and SACPID**

| Environment | DDPGPID Reward | DDPGPID Cost | TD3PID Reward | TD3PID Cost | SACPID Reward | SACPID Cost |
|---|---|---|---|---|---|---|
| SafetyAntVelocity-v1 | 2078.27 ± 704.77 | 18.20 ± 7.21 | 2410.46 ± 217.00 | 44.50 ± 38.39 | 1940.55 ± 482.41 | 13.73 ± 7.24 |
| SafetyHalfCheetahVelocity-v1 | 2737.61 ± 45.93 | 36.10 ± 11.03 | 2695.64 ± 29.42 | 35.93 ± 14.03 | 2689.01 ± 15.46 | 21.43 ± 5.49 |
| SafetyHopperVelocity-v1 | 1034.42 ± 350.59 | 29.53 ± 34.54 | 1225.97 ± 224.71 | 46.87 ± 65.28 | 812.80 ± 381.86 | 92.23 ± 77.64 |
| SafetyHumanoidVelocity-v1 | 1082.36 ± 486.48 | 15.00 ± 19.51 | 6179.38 ± 105.70 | 5.60 ± 6.23 | 6107.36 ± 113.24 | 6.20 ± 10.14 |
| SafetySwimmerVelocity-v1 | 23.99 ± 7.76 | 30.70 ± 21.81 | 28.62 ± 8.48 | 22.47 ± 7.69 | 7.50 ± 10.42 | 7.77 ± 8.48 |
| SafetyWalker2dVelocity-v1 | 1378.75 ± 896.73 | 14.77 ± 13.02 | 2769.64 ± 67.23 | 6.53 ± 8.86 | 1251.87 ± 721.54 | 41.23 ± 73.33 |
| SafetyCarCircle1-v0 | 26.89 ± 11.18 | 31.83 ± 33.59 | 34.77 ± 3.24 | 47.00 ± 39.53 | 34.41 ± 7.19 | 5.00 ± 11.18 |
| SafetyCarGoal1-v0 | 19.35 ± 14.63 | 17.50 ± 21.31 | 27.28 ± 4.50 | 9.50 ± 12.15 | 16.21 ± 12.65 | 6.67 ± 14.91 |
| SafetyPointCircle1-v0 | 71.63 ± 8.39 | 0.00 ± 0.00 | 70.95 ± 6.00 | 0.00 ± 0.00 | 75.15 ± 6.65 | 4.50 ± 4.65 |
| SafetyPointGoal1-v0 | 19.85 ± 5.32 | 22.67 ± 13.73 | 18.76 ± 7.87 | 12.17 ± 9.39 | 15.87 ± 6.73 | 27.50 ± 15.25 |

**Table 2:** Performance of the OmniSafe off-policy algorithms under the experimental setting of `cost_limit=25.00`. During experimentation, we observed that the off-policy algorithms did not violate safety constraints in `SafetyHumanoidVelocity-v1` within 1e6 steps, which suggests the agent had not yet fully learned to run; consequently, the 3e6-step results are reported for `SafetyHumanoidVelocity-v1`. Likewise, in environments with strong stochasticity, such as `SafetyCarCircle1-v0`, `SafetyCarGoal1-v0`, `SafetyPointCircle1-v0`, and `SafetyPointGoal1-v0`, off-policy methods require more training steps to estimate an accurate Q-function, so these four environments were also evaluated after 3e6 training steps. For all other environments, the results after 1e6 training steps are reported.

#### Performance Curves
**DDPG, TD3, and SAC**

Panels: SafetyAntVelocity-v1, SafetyHalfCheetahVelocity-v1, SafetyHopperVelocity-v1, SafetyHumanoidVelocity-v1, SafetySwimmerVelocity-v1, SafetyWalker2dVelocity-v1, SafetyCarCircle1-v0, SafetyCarGoal1-v0, SafetyPointCircle1-v0, SafetyPointGoal1-v0.

**Figure 1:** Training curves in Safety-Gymnasium environments for the classic reinforcement learning algorithms (DDPG, TD3, and SAC) reported in Table 1.
**DDPGLag, TD3Lag, and SACLag**

Panels: SafetyAntVelocity-v1, SafetyHalfCheetahVelocity-v1, SafetyHopperVelocity-v1, SafetyHumanoidVelocity-v1, SafetySwimmerVelocity-v1, SafetyWalker2dVelocity-v1, SafetyCarCircle1-v0, SafetyCarGoal1-v0, SafetyPointCircle1-v0, SafetyPointGoal1-v0.

**Figure 2:** Training curves in Safety-Gymnasium environments for the Lagrangian methods (DDPGLag, TD3Lag, and SACLag) reported in Table 2.
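The Lagrangian variants above differ from their base algorithms mainly in one extra quantity: a multiplier that scales the cost critic's contribution to the actor loss and grows whenever the observed episodic cost exceeds the budget. A minimal sketch of that dual update is shown below; the learning rate and initialization are illustrative assumptions, not OmniSafe's defaults.

```python
# Minimal sketch of the Lagrangian dual update used by DDPGLag/TD3Lag/SACLag-style methods.
# `lambda_lr` and the starting multiplier are illustrative assumptions.
def update_lagrange_multiplier(lagrange: float, mean_ep_cost: float,
                               cost_limit: float = 25.0, lambda_lr: float = 0.01) -> float:
    """Gradient ascent on the dual variable: grow lambda when the constraint is
    violated, shrink it (never below zero) when there is slack."""
    lagrange += lambda_lr * (mean_ep_cost - cost_limit)
    return max(0.0, lagrange)

# The actor then minimizes  -Q_reward(s, pi(s)) + lagrange * Q_cost(s, pi(s)),
# optionally normalized by (1 + lagrange) to keep the update scale stable.
```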

+
+ +
**DDPGPID, TD3PID, and SACPID**

Panels: SafetyAntVelocity-v1, SafetyHalfCheetahVelocity-v1, SafetyHopperVelocity-v1, SafetyHumanoidVelocity-v1, SafetySwimmerVelocity-v1, SafetyWalker2dVelocity-v1, SafetyCarCircle1-v0, SafetyCarGoal1-v0, SafetyPointCircle1-v0, SafetyPointGoal1-v0.

**Figure 3:** Training curves in Safety-Gymnasium environments for the PID-Lagrangian methods (DDPGPID, TD3PID, and SACPID) reported in Table 2.
diff --git a/docs/source/benchmark/offline.md b/docs/source/benchmark/offline.md new file mode 100644 index 000000000..58bb5cebf --- /dev/null +++ b/docs/source/benchmark/offline.md @@ -0,0 +1,275 @@ +# Offline Algorithms + +OmniSafe's Mujoco Velocity Benchmark evaluated the performance of OmniSafe's offline algorithm implementations in SafetyPointCirlce, SafetyPointCirlce from the Safety-Gymnasium task suite. For each algorithm and environment supported, we provide: + +- Default hyperparameters used for the benchmark and scripts to reproduce the results. +- A comparison of performance or code-level details with other open-source implementations or classic papers. +- Graphs and raw data that can be used for research purposes. +- Log details obtained during training. + +Supported algorithms are listed below: + +- **[ICML 2019]** [Batch-Constrained deep Q-learning(BCQ)](https://arxiv.org/pdf/1812.02900.pdf) +- [The Lagrange version of BCQ (BCQ-Lag)](https://arxiv.org/pdf/1812.02900.pdf) +- **[NeurIPS 2020]** [Critic Regularized Regression](https://proceedings.neurips.cc//paper/2020/file/588cb956d6bbe67078f29f8de420a13d-Paper.pdf) +- [The Constrained version of CRR (C-CRR)](https://proceedings.neurips.cc/paper/2020/hash/588cb956d6bbe67078f29f8de420a13d-Abstract.html) +- **[ICLR 2022 (Spotlight)]** [COptiDICE: Offline Constrained Reinforcement Learning via Stationary Distribution Correction Estimation](https://arxiv.org/abs/2204.08957?context=cs.AI) + +## Safety-Gymnasium + +We highly recommend using ``safety-gymnasium`` to run the following experiments. To install, in a linux machine, type: + +```bash +pip install safety_gymnasium +``` + +## Training agents used to generate data + +```bash +omnisafe train --env-id SafetyAntVelocity-v1 --algo PPO +omnisafe train --env-id SafetyAntVelocity-v1 --algo PPOLag +``` + +## Collect offline data + +```python +from omnisafe.common.offline.data_collector import OfflineDataCollector + + +# please change agent path +env_name = 'SafetyAntVelocity-v1' +size = 1_000_000 +agents = [ + ('./runs/PPO', 'epoch-500', 500_000), + ('./runs/PPOLag', 'epoch-500', 500_000), +] +save_dir = './data' + +if __name__ == '__main__': + col = OfflineDataCollector(size, env_name) + for agent, model_name, num in agents: + col.register_agent(agent, model_name, num) + col.collect(save_dir) +``` + +## Run the Benchmark + +You can set the main function of ``examples/benchmarks/experimrnt_grid.py`` as: + +```python +if __name__ == '__main__': + eg = ExperimentGrid(exp_name='offline-Benchmarks') + + # set up the algorithms. + offline_policy = ['VAEBC', 'BCQ', 'BCQLag', 'CCR', 'CCRR', 'COptiDICE'] + + eg.add('algo', offline_policy) + + # you can use wandb to monitor the experiment. + eg.add('logger_cfgs:use_wandb', [False]) + # you can use tensorboard to monitor the experiment. + eg.add('logger_cfgs:use_tensorboard', [True]) + # add dataset path + eg.add('train_cfgs:dataset', [dataset_path]) + + # set up the environment. 
+ eg.add('env_id', [ + 'SafetyAntVelocity-v1', + ]) + eg.add('seed', [0, 5, 10, 15, 20]) + + # total experiment num must can be divided by num_pool + # meanwhile, users should decide this value according to their machine + eg.run(train, num_pool=5) +``` + +After that, you can run the following command to run the benchmark: + +```bash +cd examples/benchmarks +python run_experiment_grid.py +``` + +You can also plot the results by running the following command: + +```bash +cd examples +python plot.py --log-dir ALGODIR +``` + +## OmniSafe Benchmark + +### Performance Table + + + + + + + + + +
| Environment | VAE-BC Reward | VAE-BC Cost | C-CRR Reward | C-CRR Cost | BCQLag Reward | BCQLag Cost | COptiDICE Reward | COptiDICE Cost |
|---|---|---|---|---|---|---|---|---|
| SafetyPointCircle1-v0 (beta=0.25) | 43.66 ± 0.90 | 109.86 ± 13.24 | 45.48 ± 0.87 | 127.30 ± 12.60 | 43.31 ± 0.76 | 113.39 ± 12.81 | 40.68 ± 0.93 | 67.11 ± 13.15 |
| SafetyPointCircle1-v0 (beta=0.50) | 42.84 ± 1.36 | 62.34 ± 14.84 | 45.99 ± 1.36 | 97.20 ± 13.57 | 44.68 ± 1.97 | 95.06 ± 33.07 | 39.55 ± 1.39 | 53.87 ± 13.27 |
| SafetyPointCircle1-v0 (beta=0.75) | 40.23 ± 0.75 | 41.25 ± 10.12 | 40.66 ± 0.88 | 49.90 ± 10.81 | 42.94 ± 1.04 | 85.37 ± 23.41 | 40.98 ± 0.89 | 70.40 ± 12.14 |
| SafetyCarCircle1-v0 (beta=0.25) | 19.62 ± 0.28 | 150.54 ± 7.63 | 18.53 ± 0.45 | 122.63 ± 13.14 | 18.88 ± 0.61 | 125.44 ± 15.68 | 17.25 ± 0.37 | 90.86 ± 10.75 |
| SafetyCarCircle1-v0 (beta=0.50) | 18.69 ± 0.33 | 125.97 ± 10.36 | 17.24 ± 0.43 | 89.47 ± 11.55 | 18.14 ± 0.96 | 108.07 ± 20.70 | 16.38 ± 0.43 | 70.54 ± 12.36 |
| SafetyCarCircle1-v0 (beta=0.75) | 17.31 ± 0.33 | 85.53 ± 11.33 | 15.74 ± 0.42 | 48.38 ± 10.31 | 17.10 ± 0.84 | 77.54 ± 14.07 | 15.58 ± 0.37 | 49.42 ± 8.70 |

**Table 1:** Performance of the OmniSafe offline algorithms, evaluated after 1e6 training steps under the experimental setting of `cost_limit=25.00`. We introduce a quantization parameter `beta` that controls the proportion of safe trajectories in the mixed dataset; to some extent, `beta` indicates the difficulty of the dataset. A smaller `beta` means fewer safe trajectories in the dataset, and therefore less safety-related information for the algorithm to learn from.
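To make the role of `beta` concrete, the sketch below shows one way such a mixed dataset could be assembled from trajectories collected by a constraint-satisfying policy (e.g., PPOLag) and an unconstrained one (e.g., PPO). The helper name and trajectory format are hypothetical; the point is only that `beta` is the fraction of safe trajectories kept in the mixture.

```python
import random


def mix_dataset(safe_trajs: list, unsafe_trajs: list, beta: float, size: int, seed: int = 0):
    """Build a mixed offline dataset in which roughly `beta` of the trajectories come
    from the safe (constraint-satisfying) policy and 1 - beta from the unsafe one."""
    rng = random.Random(seed)
    n_safe = int(beta * size)
    n_unsafe = size - n_safe
    mixed = rng.sample(safe_trajs, n_safe) + rng.sample(unsafe_trajs, n_unsafe)
    rng.shuffle(mixed)
    return mixed

# Example: beta = 0.25 keeps only a quarter of the trajectories from the safe policy,
# which corresponds to the hardest setting reported in Table 1.
```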

### Performance Curves

Training-curve plots are provided for SafetyPointCircle1-v0 and SafetyCarCircle1-v0 at beta = 0.25, 0.50, and 0.75.
diff --git a/docs/source/benchmark/on-policy.md b/docs/source/benchmark/on-policy.md new file mode 100644 index 000000000..798551e8e --- /dev/null +++ b/docs/source/benchmark/on-policy.md @@ -0,0 +1,2841 @@ +# On-Policy Algorithms + +The OmniSafe Safety-Gymnasium Benchmark for on-policy algorithms evaluates the effectiveness of OmniSafe's on-policy algorithms across multiple environments from the [Safety-Gymnasium](https://github.com/PKU-Alignment/safety-gymnasium) task suite. For each supported algorithm and environment, we offer the following: + +- Default hyperparameters used for the benchmark and scripts that enable result replication. +- Performance comparison with other open-source implementations. +- Graphs and raw data that can be utilized for research purposes. +- Detailed logs obtained during training. + +Supported algorithms are listed below: + +**First-Order** + +- **[NIPS 1999]** [Policy Gradient (PG)](https://papers.nips.cc/paper/1999/file/464d828b85b0bed98e80ade0a5c43b0f-Paper.pdf) +- **[Preprint 2017]** [Proximal Policy Optimization (PPO)](https://arxiv.org/pdf/1707.06347.pdf) +- [The Lagrange version of PPO (PPOLag)](https://cdn.openai.com/safexp-short.pdf) +- **[IJCAI 2022]** [Penalized Proximal Policy Optimization for Safe Reinforcement Learning (P3O)]( https://arxiv.org/pdf/2205.11814.pdf) +- **[NeurIPS 2020]** [First Order Constrained Optimization in Policy Space (FOCOPS)](https://arxiv.org/abs/2002.06506) +- **[NeurIPS 2022]** [Constrained Update Projection Approach to Safe Policy Optimization (CUP)](https://arxiv.org/abs/2209.07089) + +**Second-Order** + +- **[NeurIPS 2001]** [A Natural Policy Gradient (NaturalPG))](https://proceedings.neurips.cc/paper/2001/file/4b86abe48d358ecf194c56c69108433e-Paper.pdf) +- **[PMLR 2015]** [Trust Region Policy Optimization (TRPO)](https://arxiv.org/abs/1502.05477) +- [The Lagrange version of TRPO (TRPOLag)](https://cdn.openai.com/safexp-short.pdf) +- **[ICML 2017]** [Constrained Policy Optimization (CPO)](https://proceedings.mlr.press/v70/achiam17a) +- **[ICML 2017]** [Proximal Constrained Policy Optimization (PCPO)](https://proceedings.mlr.press/v70/achiam17a) +- **[ICLR 2019]** [Reward Constrained Policy Optimization (RCPO)](https://openreview.net/forum?id=SkfrvsA9FX) + +**Saute RL** + +- **[ICML 2022]** [Sauté RL: Almost Surely Safe Reinforcement Learning Using State Augmentation (PPOSaute, TRPOSaute)](https://arxiv.org/abs/2202.06558) + +**Simmer** + +- **[NeurIPS 2022]** [Effects of Safety State Augmentation on Safe Exploration (PPOSimmerPID, TRPOSimmerPID)](https://arxiv.org/abs/2206.02675) + +**PID-Lagrangian** + +- **[ICML 2020]** [Responsive Safety in Reinforcement Learning by PID Lagrangian Methods (CPPOPID, TRPOPID)](https://arxiv.org/abs/2007.03964) + +**Early Terminated MDP** + +- **[Preprint 2021]** [Safe Exploration by Solving Early Terminated MDP (PPOEarlyTerminated, TRPOEarlyTerminated)](https://arxiv.org/pdf/2107.04200.pdf) + + + + +## Safety-Gymnasium + +We highly recommend using **Safety-Gymnasium** to run the following experiments. To install, in a linux machine, type: + +```bash +pip install safety_gymnasium +``` + +## Run the Benchmark + +You can set the main function of `examples/benchmarks/experiment_grid.py` as: + +```python +if __name__ == '__main__': + eg = ExperimentGrid(exp_name='On-Policy-Benchmarks') + + # set up the algorithms. 
+ base_policy = ['PolicyGradient', 'NaturalPG', 'TRPO', 'PPO'] + naive_lagrange_policy = ['PPOLag', 'TRPOLag', 'RCPO'] + first_order_policy = ['CUP', 'FOCOPS', 'P3O'] + second_order_policy = ['CPO', 'PCPO'] + saute_policy = ['PPOSaute', 'TRPOSaute'] + simmer_policy = ['PPOSimmerPID', 'TRPOSimmerPID'] + pid_policy = ['CPPOPID', 'TRPOPID'] + early_mdp_policy = ['PPOEarlyTerminated', 'TRPOEarlyTerminated'] + + eg.add( + 'algo', + base_policy + + naive_lagrange_policy + + first_order_policy + + second_order_policy + + saute_policy + + simmer_policy + + pid_policy + + early_mdp_policy + ) + + # you can use wandb to monitor the experiment. + eg.add('logger_cfgs:use_wandb', [False]) + # you can use tensorboard to monitor the experiment. + eg.add('logger_cfgs:use_tensorboard', [True]) + + # the default configs here are as follows: + # eg.add('algo_cfgs:steps_per_epoch', [20000]) + # eg.add('train_cfgs:total_steps', [20000 * 500]) + # which can reproduce results of 1e7 steps. + + # if you want to reproduce results of 1e6 steps, using + # eg.add('algo_cfgs:steps_per_epoch', [2048]) + # eg.add('train_cfgs:total_steps', [2048 * 500]) + + # set the device. + avaliable_gpus = list(range(torch.cuda.device_count())) + # if you want to use GPU, please set gpu_id like follows: + # gpu_id = [0, 1, 2, 3] + # if you want to use CPU, please set gpu_id = None + # we recommends using CPU to obtain results as consistent + # as possible with our publicly available results, + # since the performance of all on-policy algorithms + # in OmniSafe is tested on CPU. + gpu_id = None + + if gpu_id and not set(gpu_id).issubset(avaliable_gpus): + warnings.warn('The GPU ID is not available, use CPU instead.', stacklevel=1) + gpu_id = None + + # set up the environment. + eg.add('env_id', [ + 'SafetyHopper', + 'SafetyWalker2d', + 'SafetySwimmer', + 'SafetyAnt', + 'SafetyHalfCheetah', + 'SafetyHumanoid' + ]) + eg.add('seed', [0, 5, 10, 15, 20]) + + # total experiment num must can be divided by num_pool. + # meanwhile, users should decide this value according to their machine. + eg.run(train, num_pool=5, gpu_id=gpu_id) +``` + +After that, you can run the following command to run the benchmark: + +```bash +cd examples/benchmarks +python run_experiment_grid.py +``` + +You can also plot the results by running the following command: + +```bash +cd examples +python analyze_experiment_results.py +``` + +**For a detailed usage of OmniSafe statistics tool, please refer to [this tutorial](https://omnisafe.readthedocs.io/en/latest/common/stastics_tool.html).** + +Logs is saved in `examples/benchmarks/exp-x` and can be monitored with tensorboard or wandb. + +```bash +tensorboard --logdir examples/benchmarks/exp-x +``` + +After the experiment is finished, you can use the following command to generate the video of the trained agent: + +```bash +cd examples +python evaluate_saved_policy.py +``` + +Please note that before you evaluate, set the `LOG_DIR` in `evaluate_saved_policy.py`. 
+ +For example, if I train `PPOLag` in `SafetyHumanoid` + +```python +LOG_DIR = '~/omnisafe/examples/runs/PPOLag-/seed-000' +play = True +save_replay = True +if __name__ == '__main__': + evaluator = omnisafe.Evaluator(play=play, save_replay=save_replay) + for item in os.scandir(os.path.join(LOG_DIR, 'torch_save')): + if item.is_file() and item.name.split('.')[-1] == 'pt': + evaluator.load_saved( + save_dir=LOG_DIR, model_name=item.name, camera_name='track', width=256, height=256 + ) + evaluator.render(num_episodes=1) + evaluator.evaluate(num_episodes=1) +``` + +## OmniSafe Benchmark + +### Classic Reinforcement Learning Algorithms +To ascertain the credibility of OmniSafe ’s algorithmic implementation, a comparative assessment was conducted, juxtaposing the performance of classical reinforcement learning algorithms. Such as Policy Gradient, Natural Policy Gradient, TRPO and PPO. The performance table is provided in Table 1. with well-established open-source implementations, specifically [Tianshou](https://github.com/thu-ml/tianshou) and [Stable-Baselines3](https://github.com/DLR-RM/stable-baselines3). + + + + + + + + + +
| Environment | Policy Gradient (OmniSafe) | Policy Gradient (Tianshou) | Policy Gradient (SB3) | PPO (OmniSafe) | PPO (Tianshou) | PPO (SB3) |
|---|---|---|---|---|---|---|
| SafetyAntVelocity-v1 | 2769.45 ± 550.71 | 145.33 ± 127.55 | - | 4295.96 ± 658.2 | 2607.48 ± 1415.78 | 1780.61 ± 780.65 |
| SafetyHalfCheetahVelocity-v1 | 2625.44 ± 1079.04 | 707.56 ± 158.59 | - | 3507.47 ± 1563.69 | 6299.27 ± 1692.38 | 5074.85 ± 2225.47 |
| SafetyHopperVelocity-v1 | 1884.38 ± 825.13 | 343.88 ± 51.85 | - | 2679.98 ± 921.96 | 1834.7 ± 862.06 | 838.96 ± 351.10 |
| SafetyHumanoidVelocity-v1 | 647.52 ± 154.82 | 438.97 ± 123.68 | - | 1106.09 ± 607.6 | 677.43 ± 189.96 | 762.73 ± 170.22 |
| SafetySwimmerVelocity-v1 | 47.31 ± 16.19 | 27.12 ± 7.47 | - | 113.28 ± 20.22 | 37.93 ± 8.68 | 273.86 ± 87.76 |
| SafetyWalker2dVelocity-v1 | 1665.00 ± 930.18 | 373.63 ± 129.2 | - | 3806.39 ± 1547.48 | 3748.26 ± 1832.83 | 3304.35 ± 706.13 |

| Environment | NaturalPG (OmniSafe) | NaturalPG (Tianshou) | NaturalPG (SB3) | TRPO (OmniSafe) | TRPO (Tianshou) | TRPO (SB3) |
|---|---|---|---|---|---|---|
| SafetyAntVelocity-v1 | 3793.70 ± 583.66 | 2062.45 ± 876.43 | - | 4362.43 ± 640.54 | 2521.36 ± 1442.10 | 3233.58 ± 1437.16 |
| SafetyHalfCheetahVelocity-v1 | 4096.77 ± 1223.70 | 3430.9 ± 239.38 | - | 3313.31 ± 1048.78 | 4255.73 ± 1053.82 | 7185.06 ± 3650.82 |
| SafetyHopperVelocity-v1 | 2590.54 ± 631.05 | 993.63 ± 489.42 | - | 2698.19 ± 568.80 | 1346.94 ± 984.09 | 2467.10 ± 1160.25 |
| SafetyHumanoidVelocity-v1 | 3838.67 ± 1654.79 | 810.76 ± 270.69 | - | 1461.51 ± 602.23 | 749.42 ± 149.81 | 2828.18 ± 2256.38 |
| SafetySwimmerVelocity-v1 | 116.33 ± 5.97 | 29.75 ± 12.00 | - | 105.08 ± 31.00 | 37.21 ± 4.04 | 258.62 ± 124.91 |
| SafetyWalker2dVelocity-v1 | 4054.62 ± 1266.76 | 3372.59 ± 1049.14 | - | 4099.97 ± 409.05 | 3372.59 ± 961.74 | 4227.91 ± 760.93 |

**Table 1:** Performance of OmniSafe evaluated against published baselines, Tianshou and Stable-Baselines3 (SB3), in the Safety-Gymnasium MuJoCo Velocity environments. Each entry reports the mean and standard deviation over 10 evaluation episodes across multiple random seeds. Entries marked "-" are not available.
+ +### Safe Reinforcement Learning Algorithms + +To demonstrate the high reliability of the algorithms implemented, OmniSafe offers performance insights within the Safety-Gymnasium environment. It should be noted that all data is procured under the constraint of `cost_limit=25.00`. The results are presented in Table 2 and the training curves are in the following sections (Please click the triangle button to see the training curves). + +#### Performance Table + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Policy GradientNatural PGTRPOPPO
EnvironmentRewardCostRewardCostRewardCostRewardCost
SafetyAntVelocity-v15292.29 ± 913.44919.42 ± 158.615547.20 ± 807.89895.56 ± 77.136026.79 ± 314.98933.46 ± 41.285977.73 ± 885.65958.13 ± 134.5
SafetyHalfCheetahVelocity-v15188.46 ± 1202.76896.55 ± 184.75878.28 ± 2012.24847.74 ± 249.026490.76 ± 2507.18734.26 ± 321.886921.83 ± 1721.79919.2 ±173.08
SafetyHopperVelocity-v13218.17 ± 672.88881.76 ± 198.462613.95 ± 866.13587.78 ± 220.972047.35 ± 447.33448.12 ± 103.872337.11 ± 942.06550.02 ± 237.70
SafetyHumanoidVelocity-v17001.78 ± 419.67834.11 ± 212.438055.20 ± 641.67946.40 ± 9.118681.24 ± 3934.08718.42 ± 323.309115.93 ± 596.88960.44 ± 7.06
SafetySwimmerVelocity-v177.05 ±33.44107.1 ±60.58120.19 ± 7.74161.78 ± 17.51124.91 ± 6.13176.56 ± 15.95119.77 ± 13.8165.27 ± 20.15
SafetyWalker2dVelocity-v14832.34 ± 685.76866.59 ± 93.475347.35 ± 436.86914.74 ± 32.616096.67 ± 723.06914.46 ± 27.856239.52 ± 879.99902.68 ± 100.93
SafetyCarGoal1-v035.86 ±1.9757.46 ±48.3436.07 ±1.2558.06 ±10.0336.60 ±0.2255.58 ±12.6833.41 ±2.8958.06 ±42.06
SafetyCarButton1-v019.76 ±10.15353.26 ± 177.0822.16 ±4.48333.98 ± 67.4921.98 ±2.06343.22 ± 24.6017.51 ±9.46373.98 ± 156.64
SafetyCarGoal2-v029.43 ±4.62179.2 ±84.8630.26 ±0.38209.62 ± 29.9732.17 ±1.24190.74 ± 21.0529.88 ±4.55194.16 ± 106.2
SafetyCarButton2-v018.06 ±10.53349.82 ± 187.0720.85 ±3.14313.88 ± 58.2020.51 ±3.34316.42 ± 35.2821.35 ±8.22312.64 ± 138.4
SafetyPointGoal1-v026.19 ±3.44201.22 ± 80.426.92 ±0.5857.92 ±9.9727.20 ±0.4445.88 ±11.2725.44 ±5.4355.72 ±35.55
SafetyPointButton1-v029.98 ±5.24141.74 ± 75.1331.95 ±1.53123.98 ± 32.0530.61 ±0.40134.38 ± 22.0627.03 ±6.14152.48 ± 80.39
SafetyPointGoal2-v025.18 ±3.62204.96 ± 104.9726.19 ±0.84193.60 ± 18.5425.61 ±0.89202.26 ± 15.1525.49 ±2.46159.28 ± 87.13
SafetyPointButton2-v026.88 ±4.38153.88 ± 65.5428.45 ±1.49160.40 ± 20.0828.78 ±2.05170.30 ± 30.5925.91 ±6.15166.6 ±111.21
RCPOTRPOLagPPOLagP3O
EnvironmentRewardCostRewardCostRewardCostRewardCost
SafetyAntVelocity-v13139.52 ± 110.3412.34 ±3.113041.89 ± 180.7719.52 ±20.213261.87 ± 80.0012.05 ±6.572636.62 ± 181.0920.69 ±10.23
SafetyHalfCheetahVelocity-v12440.97 ± 451.889.02 ±9.342884.68 ± 77.479.04 ±11.832946.15 ± 306.353.44 ±4.772117.84 ± 313.5527.6 ±8.36
SafetyHopperVelocity-v11428.58 ± 199.8711.12 ±12.661391.79 ± 269.0711.22 ±9.97961.92 ± 752.8713.96 ±19.331231.52 ± 465.3516.33 ±11.38
SafetyHumanoidVelocity-v16286.51 ± 151.0319.47 ±7.746551.30 ± 58.4259.56 ±117.376624.46 ± 25.95.87 ±9.466342.47 ± 82.45126.4 ±193.76
SafetySwimmerVelocity-v161.29 ±18.1222.60 ±1.1681.18 ±16.3322.24 ±3.9164.74 ±17.6728.02 ±4.0938.02 ±34.1818.4 ±12.13
SafetyWalker2dVelocity-v13064.43 ± 218.833.02 ±1.483207.10 ± 7.8814.98 ±9.272982.27 ± 681.5513.49 ±14.552713.57 ± 313.220.51 ±14.09
SafetyCarGoal1-v018.71 ±2.7223.10 ±12.5727.04 ±1.8226.80 ±5.6413.27 ±9.2621.72 ±32.06-1.10 ±6.85150.58 ±99.24
SafetyCarButton1-v0-2.04 ±2.9843.48 ±31.52-0.38 ±0.8537.54 ±31.720.33 ±1.9655.5 ±89.64-2.06 ±7.243.78 ±98.01
SafetyCarGoal2-v02.30 ±1.7622.90 ±16.223.65 ±1.0939.98 ±20.291.58 ±2.4913.82 ±24.62-0.07 ±1.6243.86 ±99.58
SafetyCarButton2-v0-1.35 ±2.4142.02 ±31.77-1.68 ±2.5520.36 ±13.670.76 ±2.5247.86 ±103.270.11 ±0.7285.94 ±122.01
SafetyPointGoal1-v015.27 ±4.0530.56 ±19.1518.51 ±3.8322.98 ±8.4512.96 ±6.9525.80 ±34.991.6 ±3.0131.1 ±80.03
SafetyPointButton1-v03.65 ±4.4726.30 ±9.226.93 ±1.8431.16 ±20.584.60 ±4.7320.8 ±35.78-0.34 ±1.5352.86 ±85.62
SafetyPointGoal2-v02.17 ±1.4633.82 ±21.934.64 ±1.4326.00 ±4.701.98 ±3.8641.20 ±61.030.34 ±2.265.84 ±195.76
SafetyPointButton2-v07.18 ±1.9345.02 ±25.285.43 ±3.4425.10 ±8.980.93 ±3.6933.72 ±58.750.33 ±2.4428.5 ±49.79
CUPPCPOFOCOPSCPO
EnvironmentRewardCostRewardCostRewardCostRewardCost
SafetyAntVelocity-v13215.79 ± 346.6818.25 ±17.122257.07 ± 47.9710.44 ±5.223184.48 ± 305.5914.75 ±6.363098.54 ± 78.9014.12 ±3.41
SafetyHalfCheetahVelocity-v12850.6 ± 244.654.27 ±4.461677.93 ± 217.3119.06 ±15.262965.2 ± 290.432.37 ±3.52786.48 ± 173.454.70 ±6.72
SafetyHopperVelocity-v11716.08 ± 5.937.48 ±5.5351551.22 ± 85.1615.46 ±9.831437.75 ± 446.8710.13 ±8.871713.71 ± 18.2613.40 ±5.82
SafetyHumanoidVelocity-v16109.94 ± 497.5624.69 ±20.545852.25 ± 78.010.24 ±0.486489.39 ± 35.113.86 ±39.336465.34 ± 79.870.18 ±0.36
SafetySwimmerVelocity-v163.83 ±46.4521.95 ±11.0454.42 ±38.6517.34 ±1.5753.87 ±17.929.75 ±7.3365.30 ±43.2518.22 ±8.01
SafetyWalker2dVelocity-v12466.95 ± 1114.136.63 ±8.251802.86 ± 714.0418.82 ±5.573117.05 ± 53.608.78 ±12.382074.76 ± 962.4521.90 ±9.41
SafetyCarGoal1-v06.14 ±6.9736.12 ±89.5621.56 ±2.8738.42 ±8.3615.23 ±10.7631.66 ±93.5125.52 ±2.6543.32 ±14.35
SafetyCarButton1-v01.49 ±2.84103.24 ± 123.120.36 ±0.8540.52 ±21.250.21 ±2.2731.78 ±47.030.82 ±1.6037.86 ±27.41
SafetyCarGoal2-v01.78 ±4.0395.4 ±129.641.62 ±0.5648.12 ±31.192.09 ±4.3331.56 ±58.933.56 ±0.9232.66 ±3.31
SafetyCarButton2-v01.49 ±2.64173.68 ± 163.770.66 ±0.4249.72 ±36.501.14 ±3.1846.78 ±57.470.17 ±1.1948.56 ±29.34
SafetyPointGoal1-v014.42 ±6.7419.02 ±20.0818.57 ±1.7122.98 ±6.5614.97 ±9.0133.72 ±42.2420.46 ±1.3828.84 ±7.76
SafetyPointButton1-v03.5 ±7.0739.56 ±54.262.66 ±1.8349.40 ±36.765.89 ±7.6638.24 ±42.964.04 ±4.5440.00 ±4.52
SafetyPointGoal2-v01.06 ±2.67107.3 ±204.261.06 ±0.6951.92 ±47.402.21 ±4.1537.92 ±111.812.50 ±1.2540.84 ±23.31
SafetyPointButton2-v02.88 ±3.6554.24 ±71.071.05 ±1.2741.14 ±12.352.43 ±3.3317.92 ±26.15.09 ±1.8348.92 ±17.79
PPOSauteTRPOSautePPOSimmerPIDTRPOSimmerPID
EnvironmentRewardCostRewardCostRewardCostRewardCost
SafetyAntVelocity-v12978.74 ± 93.6516.77 ±0.922507.65 ± 63.978.036 ±0.392944.84 ± 60.5316.20 ±0.663018.95 ± 66.4416.52 ±0.23
SafetyHalfCheetahVelocity-v12901.40 ± 25.4916.20 ± 0.602521.80 ± 477.297.61 ±0.392922.17 ± 24.8416.14 ±0.142737.79 ± 37.5316.44 ±0.21
SafetyHopperVelocity-v11650.91 ± 152.6517.87 ±1.331368.28 ± 576.0810.38 ±4.381699.94 ± 24.2517.04 ±0.411608.41 ± 88.2316.30 ±0.30
SafetyHumanoidVelocity-v16401.00 ± 32.2317.10 ±2.415759.44 ± 75.7315.84 ±1.426401.85 ± 57.6211.06 ±5.356411.32 ± 44.2613.04 ±2.68
SafetySwimmerVelocity-v135.61 ±4.373.44 ±1.3534.72 ±1.3710.19 ±2.3277.52 ±40.200.98 ±1.9151.39 ±40.090.00 ±0.00
SafetyWalker2dVelocity-v12410.89 ± 241.2218.88 ±2.382548.82 ± 891.6513.21 ±6.093187.56 ± 32.6617.10 ±0.493156.99 ± 30.9317.14 ±0.54
SafetyCarGoal1-v07.12 ±5.4121.68 ±29.1116.67 ±10.5723.58 ±26.398.45 ±7.1618.98 ±25.6315.08 ±13.4123.22 ±19.80
SafetyCarButton1-v0-1.72 ±0.8951.88 ±28.18-2.03 ±0.406.24 ±6.14-0.57 ±0.6349.14 ±37.77-1.24 ±0.4717.26 ±16.13
SafetyCarGoal2-v00.90 ±1.2019.98 ±10.121.76 ±5.2031.50 ±45.501.02 ±1.4127.32 ±60.120.93 ±2.2126.66 ±60.07
SafetyCarButton2-v0-1.89 ±1.8647.33 ±28.90-2.60 ±0.4074.57 ±84.95-1.31 ±0.9352.33 ±19.96-0.99 ±0.6320.40 ±12.77
SafetyPointGoal1-v07.06 ±5.8520.04 ±21.9116.18 ±9.5529.94 ±26.688.30 ±6.0325.32 ±31.9111.64 ±8.4630.00 ±27.67
SafetyPointButton1-v0-1.47 ±0.9822.60 ±13.91-3.13 ±3.519.04 ±3.94-1.97 ±1.4112.80 ±7.84-1.36 ±0.372.14 ±1.73
SafetyPointGoal2-v00.84 ±2.9314.06 ±30.211.64 ±4.0219.00 ±34.690.56 ±2.5212.36 ±43.391.55 ±4.6814.90 ±27.82
SafetyPointButton2-v0-1.38 ±0.1112.00 ±8.60-2.56 ±0.6717.27 ±10.01-1.70 ±0.297.90 ±3.30-1.66 ±0.996.70 ±4.74
CPPOPIDTRPOPIDPPOEarlyTerminatedTRPOEarlyTerminated
EnvironmentRewardCostRewardCostRewardCostRewardCost
SafetyAntVelocity-v13213.36 ± 146.7814.30 ±7.393052.94 ± 139.6715.22 ±3.682801.53 ± 19.660.23 ±0.093052.63 ± 58.410.40 ±0.23
SafetyHalfCheetahVelocity-v12837.89 ± 398.528.06 ±9.622796.75 ± 190.8411.16 ±9.802447.25 ± 346.843.47 ±4.902555.70 ± 368.170.06 ±0.08
SafetyHopperVelocity-v11713.29 ± 10.218.96 ±4.281178.59 ± 646.7118.76 ±8.931643.39 ± 2.580.77 ±0.261646.47 ± 49.950.42 ±0.84
SafetyHumanoidVelocity-v16579.26 ± 55.703.76 ±3.616407.95 ± 254.067.38 ±11.346321.45 ± 35.730.00 ±0.006332.14 ± 89.860.00 ±0.00
SafetySwimmerVelocity-v191.05 ±62.6819.12 ±8.3369.75 ±46.5220.48 ±9.1333.02 ±7.2624.23 ±0.5439.24 ±5.0123.20 ±0.48
SafetyWalker2dVelocity-v12183.43 ± 1300.6914.12 ±10.282707.75 ± 980.569.60 ±8.942195.57 ± 1046.297.63 ±10.442079.64 ± 1028.7313.74 ±15.94
SafetyCarGoal1-v010.60 ±2.5130.66 ±7.5325.49 ±1.3128.92 ±7.6617.92 ±1.5421.60 ±0.8322.09 ±3.0717.97 ±1.35
SafetyCarButton1-v0-1.36 ±0.6814.62 ±9.40-0.31 ±0.4915.24 ±17.014.47 ±1.1225.00 ±0.004.34 ±0.7225.00 ±0.00
SafetyCarGoal2-v00.13 ±1.1123.50 ±1.221.77 ±1.2017.43 ±12.136.59 ±0.5825.00 ±0.007.12 ±4.0623.37 ±1.35
SafetyCarButton2-v0-1.59 ±0.7039.97 ±26.91-2.95 ±4.0327.90 ±6.374.86 ±1.5725.00 ±0.005.07 ±1.2425.00 ±0.00
SafetyPointGoal1-v08.43 ±3.4325.74 ±7.8319.24 ±3.9421.38 ±6.9616.03 ±8.6019.17 ±9.4216.31 ±6.9922.10 ±6.13
SafetyPointButton1-v01.18 ±1.0229.42 ±12.106.40 ±1.4327.90 ±13.277.48 ±8.4724.27 ±3.959.52 ±7.8625.00 ±0.00
SafetyPointGoal2-v0-0.56 ±0.0648.43 ±40.551.67 ±1.4323.50 ±11.176.09 ±5.0325.00 ±0.008.62 ±7.1325.00 ±0.00
SafetyPointButton2-v00.42 ±0.6328.87 ±11.271.00 ±1.0030.00 ±9.506.94 ±4.4725.00 ±0.008.35 ±10.4425.00 ±0.00
+
+ +

Table 2: The performance of OmniSafe on-policy algorithms, encompassing both reward and cost, was assessed within the Safety-Gymnasium environments. It is crucial to highlight that all on-policy algorithms underwent evaluation following 1e7 training steps.
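For reference, the `cost_limit` quoted in these captions is the budget d in the constrained-MDP objective that every safe algorithm in Table 2 approximates in some way. It is written here with undiscounted episodic sums to match how the tables report reward and cost; individual algorithms may work with discounted or surrogate versions of both terms.

```math
\max_{\pi}\ \mathbb{E}_{\tau \sim \pi}\left[\sum_{t=0}^{T} r(s_t, a_t)\right]
\quad \text{s.t.} \quad
\mathbb{E}_{\tau \sim \pi}\left[\sum_{t=0}^{T} c(s_t, a_t)\right] \le d,
\qquad d = 25.00 \ \text{in these experiments.}
```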

+ +#### First Order Algorithms + +
**1e6 Steps Velocity Results**

Panels: SafetyAntVelocity-v1, SafetyHalfCheetahVelocity-v1, SafetyHopperVelocity-v1, SafetyHumanoidVelocity-v1, SafetyWalker2dVelocity-v1, SafetySwimmerVelocity-v1.

**Figure 1.1:** Training curves in Safety-Gymnasium MuJoCo Velocity environments within 1e6 steps.

**1e7 Steps Velocity Results**

Panels: the same six Velocity environments.

**Figure 1.2:** Training curves in Safety-Gymnasium MuJoCo Velocity environments within 1e7 steps.

**1e7 Steps Navigation Results**

Panels: SafetyCarButton1-v0, SafetyCarButton2-v0, SafetyCarGoal1-v0, SafetyCarGoal2-v0, SafetyPointButton1-v0, SafetyPointButton2-v0, SafetyPointGoal1-v0, SafetyPointGoal2-v0.

**Figure 1.3:** Training curves in Safety-Gymnasium MuJoCo Navigation environments within 1e7 steps.
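Among the first-order methods, PPOLag is the simplest: it keeps the clipped PPO surrogate and only changes which advantage it optimizes. The sketch below shows one common way to fold the cost advantage in; the normalization by 1 + lambda is a typical choice in Lagrangian PPO variants, not necessarily OmniSafe's exact update.

```python
import torch


def ppolag_surrogate(ratio: torch.Tensor, adv_r: torch.Tensor, adv_c: torch.Tensor,
                     lagrange: float, clip: float = 0.2) -> torch.Tensor:
    """Clipped PPO loss on a penalized advantage: reward advantage minus the
    lambda-weighted cost advantage, rescaled so the step size stays comparable."""
    adv = (adv_r - lagrange * adv_c) / (1.0 + lagrange)
    surr = torch.min(ratio * adv, torch.clamp(ratio, 1 - clip, 1 + clip) * adv)
    return -surr.mean()

# `ratio` is pi_theta(a|s) / pi_old(a|s); `lagrange` is updated from the episodic
# cost in the same spirit as the Lagrangian sketch in the off-policy section.
```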

+ +#### Second Order Algorithms + +
**1e6 Steps Velocity Results**

Panels: SafetyAntVelocity-v1, SafetyHalfCheetahVelocity-v1, SafetyHopperVelocity-v1, SafetyHumanoidVelocity-v1, SafetyWalker2dVelocity-v1, SafetySwimmerVelocity-v1.

**Figure 2.1:** Training curves of second-order algorithms in Safety-Gymnasium MuJoCo Velocity environments within 1e6 steps.

**1e7 Steps Velocity Results**

Panels: the same six Velocity environments.

**Figure 2.2:** Training curves of second-order algorithms in Safety-Gymnasium MuJoCo Velocity environments within 1e7 steps.

**1e7 Steps Navigation Results**

Panels: SafetyCarButton1-v0, SafetyCarButton2-v0, SafetyCarGoal1-v0, SafetyCarGoal2-v0, SafetyPointButton1-v0, SafetyPointButton2-v0, SafetyPointGoal1-v0, SafetyPointGoal2-v0.

**Figure 2.3:** Training curves of second-order algorithms in Safety-Gymnasium MuJoCo Navigation environments within 1e7 steps.
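The second-order methods (TRPO-style updates such as NaturalPG, TRPO, TRPOLag, CPO, and PCPO) need products with the inverse Fisher matrix but never form the matrix itself: they solve Hx = g with conjugate gradient using only matrix-vector products. A minimal, self-contained sketch follows; here H is an explicit matrix purely for illustration, whereas in practice `hvp` would be a Fisher-vector-product routine.

```python
import numpy as np


def conjugate_gradient(hvp, g, iters=10, tol=1e-10):
    """Solve H x = g given only a matrix-vector product `hvp`, as TRPO/CPO-style
    updates do to avoid building the Fisher information matrix explicitly."""
    x = np.zeros_like(g)
    r = g.copy()  # residual g - H x, with x = 0 initially
    p = g.copy()
    rs_old = r @ r
    for _ in range(iters):
        Hp = hvp(p)
        alpha = rs_old / (p @ Hp + 1e-12)
        x += alpha * p
        r -= alpha * Hp
        rs_new = r @ r
        if rs_new < tol:
            break
        p = r + (rs_new / rs_old) * p
        rs_old = rs_new
    return x


if __name__ == '__main__':
    # toy example with an explicit SPD matrix standing in for the Fisher matrix
    H = np.array([[4.0, 1.0], [1.0, 3.0]])
    g = np.array([1.0, 2.0])
    x = conjugate_gradient(lambda v: H @ v, g)
    print(x, H @ x)  # H @ x should be close to g
```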

+ +#### Saute Algorithms + +
**1e6 Steps Velocity Results**

Panels: SafetyAntVelocity-v1, SafetyHalfCheetahVelocity-v1, SafetyHopperVelocity-v1, SafetyHumanoidVelocity-v1, SafetyWalker2dVelocity-v1, SafetySwimmerVelocity-v1.

**Figure 3.1:** Training curves of Saute MDP algorithms in Safety-Gymnasium MuJoCo Velocity environments within 1e6 steps.

**1e7 Steps Velocity Results**

Panels: the same six Velocity environments.

**Figure 3.2:** Training curves of Saute MDP algorithms in Safety-Gymnasium MuJoCo Velocity environments within 1e7 steps.

**1e7 Steps Navigation Results**

Panels: SafetyCarButton1-v0, SafetyCarButton2-v0, SafetyCarCircle1-v0, SafetyCarCircle2-v0, SafetyCarGoal1-v0, SafetyCarGoal2-v0, SafetyPointButton1-v0, SafetyPointButton2-v0, SafetyPointCircle1-v0, SafetyPointCircle2-v0, SafetyPointGoal1-v0, SafetyPointGoal2-v0.

**Figure 3.3:** Training curves of Saute MDP algorithms in Safety-Gymnasium MuJoCo Navigation environments within 1e7 steps.
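PPOSaute and TRPOSaute do not change the policy optimizer; they change the environment, tracking the remaining safety budget, appending it to the observation, and replacing the reward with a penalty once the budget is spent. The wrapper below is a rough illustration of that idea, not OmniSafe's implementation: it assumes the standard Gymnasium step API, a Box observation space, and that the step cost arrives in `info['cost']`; the penalty value and the normalization by `cost_limit` are also assumptions.

```python
import gymnasium as gym
import numpy as np


class SauteWrapper(gym.Wrapper):
    """Rough sketch of Saute-style state augmentation: keep a normalized remaining
    cost budget z, append it to the observation, and replace the reward with a
    penalty once the budget is exhausted."""

    def __init__(self, env, cost_limit=25.0, unsafe_reward=-1.0):
        super().__init__(env)
        self.cost_limit = cost_limit
        self.unsafe_reward = unsafe_reward
        low = np.append(env.observation_space.low, -np.inf)
        high = np.append(env.observation_space.high, np.inf)
        self.observation_space = gym.spaces.Box(low=low, high=high, dtype=np.float64)
        self.z = 1.0

    def reset(self, **kwargs):
        obs, info = self.env.reset(**kwargs)
        self.z = 1.0  # full budget, normalized by cost_limit
        return np.append(obs, self.z), info

    def step(self, action):
        obs, reward, terminated, truncated, info = self.env.step(action)
        cost = info.get('cost', 0.0)
        self.z -= cost / self.cost_limit
        if self.z <= 0.0:
            reward = self.unsafe_reward  # budget spent: the agent only sees a penalty
        return np.append(obs, self.z), reward, terminated, truncated, info
```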

+ +#### Simmer Algorithms + +
**1e6 Steps Velocity Results**

Panels: SafetyAntVelocity-v1, SafetyHalfCheetahVelocity-v1, SafetyHopperVelocity-v1, SafetyHumanoidVelocity-v1, SafetyWalker2dVelocity-v1, SafetySwimmerVelocity-v1.

**Figure 4.1:** Training curves of Simmer MDP algorithms in Safety-Gymnasium MuJoCo Velocity environments within 1e6 steps.

**1e7 Steps Velocity Results**

Panels: the same six Velocity environments.

**Figure 4.2:** Training curves of Simmer MDP algorithms in Safety-Gymnasium MuJoCo Velocity environments within 1e7 steps.

**1e7 Steps Navigation Results**

Panels: SafetyCarButton1-v0, SafetyCarButton2-v0, SafetyCarGoal1-v0, SafetyCarGoal2-v0, SafetyPointButton1-v0, SafetyPointButton2-v0, SafetyPointGoal1-v0, SafetyPointGoal2-v0.

**Figure 4.3:** Training curves of Simmer MDP algorithms in Safety-Gymnasium MuJoCo Navigation environments within 1e7 steps.
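PPOSimmerPID and TRPOSimmerPID keep the Saute-style safety state but additionally schedule the budget itself during training, nudging an intermediate cost limit toward the target with a controller. The class below is only a rough illustration of that scheduling idea; all gains and the update rule are made-up values for exposition and are not OmniSafe's implementation.

```python
class SimmerBudgetScheduler:
    """Illustrative PI-style schedule for an intermediate safety budget that is
    nudged toward the target cost limit based on observed episodic cost."""

    def __init__(self, target_limit=25.0, init_limit=5.0, kp=0.05, ki=0.01):
        self.target_limit = target_limit
        self.limit = init_limit
        self.kp, self.ki = kp, ki
        self.integral = 0.0

    def update(self, mean_ep_cost: float) -> float:
        # positive error -> the agent still has slack, so the budget may grow
        error = self.limit - mean_ep_cost
        self.integral += error
        self.limit = min(self.target_limit,
                         max(0.0, self.limit + self.kp * error + self.ki * self.integral))
        return self.limit
```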

+ +#### PID-Lagrangian Algorithms + +
+1e6 Steps Velocity Results + + + + +
+ +
+
+ SafetyAntVelocity-v1 +
+
+ + + + +
+ +
+
+ SafetyHalfCheetahVelocity-v1 +
+
+ + + +
+ +
+
+ SafetyHopperVelocity-v1 +
+
+ + + + +
+ +
+
+ SafetyHumanoidVelocity-v1 +
+
+ + + + +
+ +
+
+ SafetyWalker2dVelocity-v1 +
+
+ + + + +
+ +
+
+ SafetySwimmerVelocity-v1 +
+
+

Figure 5.1: Training curves of PID-Lagrangian algorithms in Safety-Gymnasium MuJoCo Velocity environments within 1e6 steps + +

**1e7 Steps Velocity Results.** Panels: SafetyAntVelocity-v1, SafetyHalfCheetahVelocity-v1, SafetyHopperVelocity-v1, SafetyHumanoidVelocity-v1, SafetyWalker2dVelocity-v1, SafetySwimmerVelocity-v1.

Figure 5.2: Training curves of PID-Lagrangian algorithms in Safety-Gymnasium MuJoCo Velocity environments within 1e7 steps.

**1e7 Steps Navigation Results.** Panels: SafetyCarButton1-v0, SafetyCarButton2-v0, SafetyCarGoal1-v0, SafetyCarGoal2-v0, SafetyPointButton1-v0, SafetyPointButton2-v0, SafetyPointGoal1-v0, SafetyPointGoal2-v0.

Figure 5.3: Training curves of PID-Lagrangian algorithms in Safety-Gymnasium MuJoCo Navigation environments within 1e7 steps.

#### Early Terminated MDP Algorithms

**1e6 Steps Velocity Results.** Panels: SafetyAntVelocity-v1, SafetyHalfCheetahVelocity-v1, SafetyHopperVelocity-v1, SafetyHumanoidVelocity-v1, SafetyWalker2dVelocity-v1, SafetySwimmerVelocity-v1.

Figure 6.1: Training curves of early terminated MDP algorithms in Safety-Gymnasium MuJoCo Velocity environments within 1e6 steps.

**1e7 Steps Velocity Results.** Panels: SafetyAntVelocity-v1, SafetyHalfCheetahVelocity-v1, SafetyHopperVelocity-v1, SafetyHumanoidVelocity-v1, SafetyWalker2dVelocity-v1, SafetySwimmerVelocity-v1.

Figure 6.2: Training curves of early terminated MDP algorithms in Safety-Gymnasium MuJoCo Velocity environments within 1e7 steps.

**1e7 Steps Navigation Results.** Panels: SafetyCarButton1-v0, SafetyCarButton2-v0, SafetyCarGoal1-v0, SafetyCarGoal2-v0, SafetyPointButton1-v0, SafetyPointButton2-v0, SafetyPointGoal1-v0, SafetyPointGoal2-v0.

Figure 6.3: Training curves of early terminated MDP algorithms in Safety-Gymnasium MuJoCo Navigation environments within 1e7 steps.

diff --git a/docs/source/index.rst b/docs/source/index.rst
index 792f62052..595844afc 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -365,8 +365,23 @@ this project, don't hesitate to ask your question on `the GitHub issue page
| Domains | Types | Algorithms Registry |
|---|---|---|
| On Policy | Primal Dual | TRPOLag; PPOLag; PDO; RCPO |
| | | TRPOPID; CPPOPID |
| | Convex Optimization | CPO; PCPO; FOCOPS; CUP |
| | Penalty Function | IPO; P3O |
| | Primal | OnCRPO |
| Off Policy | Primal-Dual | DDPGLag; TD3Lag; SACLag |
| | | DDPGPID; TD3PID; SACPID |
| | Control Barrier Function | DDPGCBF, SACRCBF, CRABS |
| Model-based | Online Plan | SafeLOOP; CCEPETS; RCEPETS |
| | Pessimistic Estimate | CAPPETS |
| Offline | Q-Learning Based | BCQLag; C-CRR |
| | DICE Based | COptDICE |
| Other Formulation MDP | ET-MDP | PPOEarlyTerminated; TRPOEarlyTerminated |
| | SauteRL | PPOSaute; TRPOSaute |
| | SimmerRL | PPOSimmerPID; TRPOSimmerPID |

Table 1: OmniSafe supports a wide variety of SafeRL algorithms. From the perspective of classic RL, OmniSafe includes on-policy, off-policy, offline, and model-based algorithms; from the perspective of the SafeRL learning paradigm, OmniSafe supports primal-dual, projection, penalty-function, primal, and other approaches.
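For orientation, the entries in the registry above are the algorithm names accepted by OmniSafe's Python entry point. The snippet below is a minimal sketch rather than part of this patch: `omnisafe.Agent` and `agent.learn()` follow the library's public quickstart interface, while the nested `custom_cfgs` keys mirror configuration fields referenced elsewhere in these docs (`algo_cfgs`, `update_iters`, `steps_per_epoch`) and the parallelism settings discussed in the efficiency comparison later in this patch; treat the exact key names and values as assumptions to verify against the installed version.

```python
import omnisafe

# Minimal sketch: run one algorithm from the registry above on a Safety-Gymnasium task.
# The parallelism values (10 vectorized environments, 2 asynchronous agents) mirror the
# settings described in the efficiency comparison and are assumptions, not requirements.
custom_cfgs = {
    'train_cfgs': {
        'total_steps': 1_000_000,  # much shorter than the 1e7-step benchmark runs
        'vector_env_nums': 10,     # vectorized environment parallelism
        'parallel': 2,             # asynchronous agent parallelism via torch.distributed
    },
    'algo_cfgs': {
        'steps_per_epoch': 20_000,
        'update_iters': 1,
    },
}

agent = omnisafe.Agent('PPOLag', 'SafetyPointGoal1-v0', custom_cfgs=custom_cfgs)
agent.learn()
```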

diff --git a/docs/source/start/efficiency.rst b/docs/source/start/efficiency.rst new file mode 100644 index 000000000..c0b9bb428 --- /dev/null +++ b/docs/source/start/efficiency.rst @@ -0,0 +1,60 @@ +Efficiency +========== + +To demonstrate the effectiveness and resource utilization of OmniSafe as a SafeRL infrastructure, we have added a comparison of the runtime efficiency between OmniSafe and other SafeRL libraries, *i.e.*, `SafePO `_, `RL-Safety-Algorithms `_, and `Safety-starter-agents `_. The test results are shown in Table 1: + + + +.. table:: **Table 1**: Comparison of computational time consumption between OmniSafe and other libraries in one thread (unit: seconds). We selected classic algorithms PPOLag and CPO for analysis and tested the average single epoch time consumption over 10 epochs with different sizes of neural networks on SG's SafetyPointGoal1-v0. + :name: appendix_f + :width: 100 % + + +------------------------------------+----------------------------+------------------+------------------+------------------+ + | | **PPOLag** | | **CPO** | | + +------------------------------------+----------------------------+------------------+------------------+------------------+ + |**Hidden Layers Size** | 64 x 64 | 1024 x 1024 | 64 x 64 | 1024 x 1024 | + +------------------------------------+----------------------------+------------------+------------------+------------------+ + |**Safety-starter-agents** | 51.64 ± 1.56 | 63.99 ± 1.75 | 50.70 ± 1.17 | 83.09 ± 0.92 | + +------------------------------------+----------------------------+------------------+------------------+------------------+ + | **RL-Safety-Algorithms** | 46.25 ± 0.43 | 107.50 ± 2.18 | 47.24 ± 0.43 | 134.12 ± 0.71 | + +------------------------------------+----------------------------+------------------+------------------+------------------+ + | **SafePO** | 15.91 ± 0.46 | 20.84 ± 0.26 | 16.50 ± 0.50 | 19.72 ± 0.16 | + +------------------------------------+----------------------------+------------------+------------------+------------------+ + | **OmniSafe** | **10.59 ± 0.15** | **14.02 ± 0.16** | **10.06 ± 0.09** | **12.28 ± 0.81** | + +------------------------------------+----------------------------+------------------+------------------+------------------+ + + +In our comparative experiments, we rigorously ensure uniformity across all experimental settings. More specifically, PPOLag and CPO implement early stopping techniques, which vary the number of +updates based on the KL divergence between the current and reference policies. This introduces +randomness into the time measurements. To control for consistent variables, we fixed the number of +``update_iters`` at 1, ``steps_per_epoch`` at 20,000, and ``batch_size`` at 64, conducting the tests on the same machine with no other processes running. The specific device parameters are: + +- **CPU**: AMD Ryzen Threadripper PRO 3975WX 32-Cores +- **GPU**: NVIDIA GeForce RTX 3090, Driver Version: 535.154.05 + +Under these consistent conditions, **OmniSafe achieved the lowest computational time consumption on +the same baseline algorithms**, which we attribute to 3 factors: *vectorized environment +parallelism* for accelerated data collections, `asynchronous agent parallelism `_ for parallelized learning, and *GPU resource utilization* for immense network +support. We will elaborate on how these features contribute to OmniSafe's computational efficiency. + +**Vectorized Environment Parallelism**: OmniSafe and SafePO support vectorized environment +interfaces and buffers. 
In this experiment, we set the parallelism number of vectorized environments to 10, meaning that a single agent can simultaneously generate 10 actions based on 10 vectorized observations and perform batch updates through a vectorized buffer. This feature enhances the efficiency of agents' data sampling from environments.

**Asynchronous Agent Parallelism**: OmniSafe supports *Asynchronous Advantage Actor-Critic (A3C)* parallelism based on the distributed framework ``torch.distributed``. In this experiment, we set the parallelism number of asynchronous agents to 2, meaning two agents were instantiated to sample and learn simultaneously, synchronizing their neural network parameters at the end of each epoch. This feature further enhances the efficiency of agent sampling and updating.

**GPU Resource Utilization**: Since only OmniSafe and SafePO utilize GPU computing resources, we used the NVIDIA GeForce RTX 3090 as the computing device in this experiment. As shown in :ref:`Table 1 <appendix_f>`, when the hidden layer parameters increased from 64 x 64 to 1024 x 1024, the runtime of RL-Safety-Algorithms and Safety-starter-agents increased significantly, whereas the runtime increase for OmniSafe and SafePO was comparatively small. This trend is particularly notable with the CPO algorithm, which requires computing a Hessian matrix during updates. If computed on a CPU, this overhead grows with the size of the neural network; however, OmniSafe and SafePO, which support GPU acceleration, are almost unaffected.

diff --git a/docs/source/start/exp-grid.md b/docs/source/start/exp-grid.md
new file mode 100644
index 000000000..ddd0b818f
--- /dev/null
+++ b/docs/source/start/exp-grid.md
@@ -0,0 +1,31 @@

# Experiment Grid

In the context of RL experiments, it is imperative to assess the performance of various algorithms across multiple environments. However, the inherent influence of randomness necessitates repeated evaluations employing distinct random seeds. To tackle this challenge, OmniSafe introduces an `Experiment Grid`, which facilitates the simultaneous launch of multiple sets of experiments. Researchers merely need to pre-configure the experimental parameters and can then execute multiple experiment sets in parallel via a single file. An example of this process is shown in Figure 1.

**Figure 1:** OmniSafe's `Experiment Grid`. The left side of the figure displays the main function of the `run_experiment_grid.py` file, while the right side shows the status of the `Experiment Grid` execution. In this example, three distinct random seeds are selected for the `SafetyAntVelocity-v1` and `SafetyWalker2dVelocity-v1` environments, and the PPOLag and TRPO-Lag algorithms are executed.
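As a rough textual counterpart to the grid shown in Figure 1, the example can be configured along three axes (algorithm, environment, seed). This is a sketch modeled on the `run_experiment_grid.py` script mentioned below; the module paths `omnisafe.common.experiment_grid.ExperimentGrid` and `omnisafe.utils.exp_grid_tools.train`, as well as the `num_pool` argument, are assumptions to confirm against the installed OmniSafe version.

```python
from omnisafe.common.experiment_grid import ExperimentGrid
from omnisafe.utils.exp_grid_tools import train  # assumed helper that trains one (algo, env, seed) cell

if __name__ == '__main__':
    eg = ExperimentGrid(exp_name='Example_Grid')
    # Three axes, matching the example in Figure 1.
    eg.add('algo', ['PPOLag', 'TRPOLag'])
    eg.add('env_id', ['SafetyAntVelocity-v1', 'SafetyWalker2dVelocity-v1'])
    eg.add('seed', [0, 5, 10])
    # Launch all 2 x 2 x 3 = 12 runs across a pool of parallel worker processes.
    eg.run(train, num_pool=12)
```

Every axis added with `eg.add` is crossed with the others, which is why two algorithms, two environments, and three seeds yield twelve runs.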
Figure 2 panels: `SafetyAntVelocity-v1` and `SafetyWalker2dVelocity-v1`.
**Figure 2:** Analysis of the example experiment results. The blue lines are the results from PPOLag, while the orange ones are from TRPO-Lag. The solid line represents the mean over multiple random seeds, while the shaded region represents the standard deviation across random seeds 0, 5, and 10.

The `run_experiment_grid.py` script executes experiments in parallel based on user-configured parameters and generates corresponding graphs of the experimental results. In the example presented in Figure 1, we specified that the script should draw curves grouped by environment, obtaining the training curves of PPOLag and TRPO-Lag in `SafetyAntVelocity-v1` and `SafetyWalker2dVelocity-v1` with seeds aggregated.

Moreover, combined with `Statistics Tools`, the `Experiment Grid` is a powerful tool for parameter tuning. As illustrated in Figure 3, we utilized the `Experiment Grid` to explore the impact of `batch_size` on the performance of PPOLag and TRPO-Lag in `SafetyWalker2dVelocity-v1` and `SafetyAntVelocity-v1`, and then used `Statistics Tools` to analyze the results. It is evident that the `batch_size` has a significant influence on the performance of PPOLag in `SafetyWalker2dVelocity-v1`, and that the optimal `batch_size` is 128. Reaching this conclusion requires repeating the experiment multiple times, and the `Experiment Grid` significantly expedites the process.
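The `batch_size` sweep of Figure 3 then amounts to one extra axis on the same grid. Again a hedged sketch: the colon-separated nested key `algo_cfgs:batch_size` follows the `algo_cfgs` naming used in the caption below, and both the key syntax and the helper imports are assumptions to check against the installed version.

```python
from omnisafe.common.experiment_grid import ExperimentGrid
from omnisafe.utils.exp_grid_tools import train  # assumed helper, as in the previous sketch

if __name__ == '__main__':
    eg = ExperimentGrid(exp_name='BatchSize_Tuning')
    eg.add('algo', ['PPOLag', 'TRPOLag'])
    eg.add('env_id', ['SafetyAntVelocity-v1', 'SafetyWalker2dVelocity-v1'])
    # Assumed colon syntax for reaching a field nested under algo_cfgs.
    eg.add('algo_cfgs:batch_size', [64, 128, 256])
    eg.add('seed', [0, 5, 10])
    eg.run(train, num_pool=6)
```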
**Figure 3:** An example of how the `Experiment Grid` can be utilized for parameter tuning. In this particular example, we set the `batch_size` in the `algo_cfgs` to 64, 128, and 256, ran multiple experiments using the `Experiment Grid`, and finally used `Statistics Tools` to analyze the impact of the `batch_size` on the performance of the algorithm. Note that different colors denote different `batch_size` values. The results show that the `batch_size` has a significant effect on the performance of the algorithm, and the optimal `batch_size` was found to be 128. The `Experiment Grid` enabled us to efficiently explore the effect of different parameter values on the algorithm's performance.

diff --git a/docs/source/start/features.md b/docs/source/start/features.md
new file mode 100644
index 000000000..e84e0e98f
--- /dev/null
+++ b/docs/source/start/features.md
@@ -0,0 +1,257 @@

# Features

OmniSafe transcends its role as a mere SafeRL library, functioning concurrently as a standardized and user-friendly SafeRL infrastructure. We compared the features of OmniSafe with those of popular open-source RL libraries. [See comparison results](#compare_with_repo).

> **Note:** All results in [compare_with_repo](#compare_with_repo) are accurate as of 2024. Please refer to the latest versions of these libraries if you find any discrepancies in these data.

**Table 1:** Comparison of OmniSafe to a representative subset of RL or SafeRL libraries.
The comparison covers OmniSafe, TianShou, Stable-Baselines3, SafePO, RL-Safety-Algorithms, and Safety-starter-agents across the following features: Algorithm Tutorial, API Documentation, Command Line Interface, Custom Environment, Docker Support, GPU Support, Ipython / Notebook, PEP8 Code Style, Statistics Tools, Test Coverage (97%, 91%, 96%, 91%, -, and -, respectively), Type Hints, Vectorized Environments, and Video Examples.
Compared to the classic RL open-source libraries [TianShou](https://www.jmlr.org/papers/v23/21-1127.html) and [Stable-Baselines3](https://jmlr.org/papers/v22/20-1364.html), OmniSafe adheres to the same engineering standards and supports equally user-friendly features. Compared to the SafeRL libraries [SafePO](https://proceedings.neurips.cc/paper_files/paper/2023/file/3c557a3d6a48cc99444f85e924c66753-Paper-Datasets_and_Benchmarks.pdf), [RL-Safety-Algorithms](https://github.com/SvenGronauer/RL-Safety-Algorithms), and [Safety-starter-agents](https://github.com/openai/safety-starter-agents), OmniSafe offers greater ease of use and robustness, making it a foundational infrastructure for accelerating SafeRL research. The complete codebase of OmniSafe adheres to the PEP8 style, with each commit undergoing stringent checks such as `isort`, `pylint`, `black`, and `ruff`. Before merging into the main branch, code modifications require approval from at least two reviewers. These practices enhance the reliability of OmniSafe and provide assurance for effective ongoing development.

OmniSafe includes a tutorial on `Colab` that provides a step-by-step guide to the training process, as illustrated in [Figure 2](#figure_2). For those who are new to SafeRL, the tutorial allows for interactive learning of the training procedure. By clicking on `Colab Tutorial`, users can access it and follow the instructions to better understand how to use OmniSafe. Seasoned researchers can capitalize on OmniSafe's informative command-line interface, as demonstrated in [Figure 1](#figure_1) and [Figure 3](#figure_3), to quickly grasp how to use the platform and expedite their scientific investigations.

Regarding the experiment execution process, OmniSafe presents an array of tools for analyzing experimental outcomes, encompassing `WandB`, `TensorBoard`, and `Statistics Tools`. Furthermore, OmniSafe has submitted its experimental benchmark to the `WandB` report [1], as depicted in [Figure 4](#figure_4). This report furnishes more detailed training curves and evaluation demonstrations of classic algorithms, serving as a valuable reference for researchers.

[1]: [https://api.wandb.ai/links/pku_rl/mv1eeetb](https://api.wandb.ai/links/pku_rl/mv1eeetb) | [https://api.wandb.ai/links/pku_rl/scvni0oj](https://api.wandb.ai/links/pku_rl/scvni0oj)

[cli]: #cli
[tutorial]: #tutorial
[cli_details]: #cli_details
[wandb_video]: #wandb_video
+ + **Figure 1:** An illustration of the OmniSafe command line interface. Users can view the commands supported by OmniSafe and a brief usage guide by simply typing `omnisafe --help` in the command line. If a user wants to further understand how to use a specific command, they can obtain additional prompts by using the command `omnisafe COMMAND --help`, as shown in [Figure 3](#figure_3). + + +
**Figure 2:** An example demonstrating the Colab tutorial provided by OmniSafe for using the `Experiment Grid`. The tutorial includes detailed usage descriptions and allows users to run it themselves and then inspect the results.
+ +(a) Example of `omnisafe analyze-grid --help` in command line. + + +
+ +(b) Example of `omnisafe benchmark --help` in command line. + + +
+ +(c) Example of `omnisafe eval --help` in command line. + + +
+ +(d) Example of `omnisafe train-config --help` in command line. + + +
**Figure 3:** Further details on using the `omnisafe --help` command. Users can input `omnisafe COMMAND --help` to get help, where `COMMAND` includes all the items listed under `Commands` in [Figure 1](#figure_1). This feature enables users to quickly become proficient in the common operations provided by OmniSafe via the command line and to customize them further to meet their specific requirements.
Figure 4 panels: (a) SafetyPointGoal1-v0, (b) SafetyPointButton1-v0, (c) SafetyCarGoal1-v0, (d) SafetyCarButton1-v0.

Figure 4: An exemplification of OmniSafe's WandB report videos. This example supplies videos of PPO and PPOLag in the SafetyPointGoal1-v0, SafetyPointButton1-v0, SafetyCarGoal1-v0, and SafetyCarButton1-v0 environments. The left of each sub-figure is PPO, while the right is PPOLag. Through these videos, we can intuitively see the difference between safe and unsafe behavior. This is exactly what OmniSafe pursues: not just the safety of the training curve, but true safety in a real sense.

+ +**Figure 5:** An exemplification of OmniSafe's `WandB` reports training curve in `SafetyPointGoal1-v0`: The left panel represents the episode reward, and the right panel denotes the episode cost, with both encompassing the performance over 1e7 steps. diff --git a/omnisafe/adapter/modelbased_adapter.py b/omnisafe/adapter/modelbased_adapter.py index 8abbd90d7..5d4321bbf 100644 --- a/omnisafe/adapter/modelbased_adapter.py +++ b/omnisafe/adapter/modelbased_adapter.py @@ -330,7 +330,7 @@ def rollout( # pylint: disable=too-many-arguments,too-many-locals eval_start = time.time() eval_func(current_step, True) self._last_eval = current_step - eval_time += time.time() - eval_start + eval_time += time.time() - eval_start # pylint: disable=undefined-variable if not self._first_log or current_step >= self._cfgs.train_cfgs.total_steps: self._log_metrics(logger) diff --git a/omnisafe/common/logger.py b/omnisafe/common/logger.py index 43d447800..9fc753e46 100644 --- a/omnisafe/common/logger.py +++ b/omnisafe/common/logger.py @@ -144,10 +144,10 @@ def __init__( # pylint: disable=too-many-arguments,too-many-locals config=config, ) if config is not None: - wandb.config.update(config) + wandb.config.update(config) # type: ignore if models is not None: for model in models: - wandb.watch(model) + wandb.watch(model) # type: ignore def log(self, msg: str, color: str = 'green', bold: bool = False) -> None: """Log the message to the console and the file. diff --git a/omnisafe/common/offline/data_collector.py b/omnisafe/common/offline/data_collector.py index 35d1e1b75..fc95fda00 100644 --- a/omnisafe/common/offline/data_collector.py +++ b/omnisafe/common/offline/data_collector.py @@ -110,7 +110,7 @@ def register_agent(self, save_dir: str, model_name: str, size: int) -> None: model_path = os.path.join(save_dir, 'torch_save', model_name) try: - model_params = torch.load(model_path) + model_params = torch.load(model_path, weights_only=False) except FileNotFoundError as error: raise FileNotFoundError(f'Model {model_name} not found in {save_dir}') from error diff --git a/omnisafe/envs/classic_control/envs_from_crabs.py b/omnisafe/envs/classic_control/envs_from_crabs.py index a3fd3b404..4f6933db0 100644 --- a/omnisafe/envs/classic_control/envs_from_crabs.py +++ b/omnisafe/envs/classic_control/envs_from_crabs.py @@ -238,7 +238,7 @@ def _get_obs(self): else: return np.array([np.cos(th), np.sin(th), thdot], dtype=np.float32) - def reset(self): + def reset(self): # type: ignore """Reset the environment.""" self.state = self.init_state self.last_u = None diff --git a/omnisafe/envs/safety_gymnasium_modelbased.py b/omnisafe/envs/safety_gymnasium_modelbased.py index fe5ae5071..372ccc4e8 100644 --- a/omnisafe/envs/safety_gymnasium_modelbased.py +++ b/omnisafe/envs/safety_gymnasium_modelbased.py @@ -174,6 +174,7 @@ def get_cost_from_obs_tensor(self, obs: torch.Tensor, is_binary: bool = True) -> cost: Batch cost. 
""" assert torch.is_tensor(obs), 'obs must be tensor' + assert len(obs.shape) == 2 or len(obs.shape) == 3 hazards_key = self.key_to_slice_tensor['hazards'] if len(obs.shape) == 2: batch_size = obs.shape[0] @@ -181,7 +182,11 @@ def get_cost_from_obs_tensor(self, obs: torch.Tensor, is_binary: bool = True) -> elif len(obs.shape) == 3: batch_size = obs.shape[0] * obs.shape[1] hazard_obs = obs[:, :, hazards_key].reshape(batch_size, -1, 2) - hazards_dist = torch.sqrt(torch.sum(torch.square(hazard_obs), dim=2)).reshape( + else: + raise RuntimeError('observation size mismatch') + hazards_dist = torch.sqrt( + torch.sum(torch.square(hazard_obs), dim=2), + ).reshape( batch_size, -1, ) @@ -499,7 +504,7 @@ def reset( info['goal_met'] = False obs = torch.as_tensor(flat_coordinate_obs, dtype=torch.float32, device=self._device) - return obs, info + return obs, info # pylint: disable=possibly-used-before-assignment def set_seed(self, seed: int) -> None: """Set the seed for the environment. diff --git a/omnisafe/evaluator.py b/omnisafe/evaluator.py index 8732d6e34..13eac7263 100644 --- a/omnisafe/evaluator.py +++ b/omnisafe/evaluator.py @@ -150,7 +150,7 @@ def __load_model_and_env( # load the saved model model_path = os.path.join(save_dir, 'torch_save', model_name) try: - model_params = torch.load(model_path) + model_params = torch.load(model_path, weights_only=False) except FileNotFoundError as error: raise FileNotFoundError('The model is not found in the save directory.') from error diff --git a/omnisafe/utils/plotter.py b/omnisafe/utils/plotter.py index 5bdbb7ec2..f24a97bb4 100644 --- a/omnisafe/utils/plotter.py +++ b/omnisafe/utils/plotter.py @@ -118,8 +118,7 @@ def plot_data( smoothed_x = np.convolve(x, y, 'same') / np.convolve(z, y, 'same') datum['Costs'] = smoothed_x - if isinstance(data, list): - data_to_plot = pd.concat(data, ignore_index=True) + data_to_plot = pd.concat(data, ignore_index=True) sns.lineplot( data=data_to_plot, x=xaxis, diff --git a/pyproject.toml b/pyproject.toml index a74b46723..d7351aeb5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,6 +39,11 @@ dependencies = [ "matplotlib >= 3.7.1", "gdown >= 4.6.0", "pytorch_lightning >= 2.2.2", + "cvxopt== 1.3.2", + "gpytorch== 1.11", + "joblib == 1.3.2", + "qpth == 0.0.16", + "scikit_learn == 1.3.2" ] dynamic = ["version", "entry-points"] @@ -125,9 +130,8 @@ ignore-words = "docs/source/spelling_wordlist.txt" # Sync with requires-python target-version = "py38" line-length = 100 -show-source = true src = ["omnisafe", "tests", "examples"] -select = [ +lint.select = [ "E", "W", # pycodestyle "F", # pyflakes "UP", # pyupgrade @@ -148,7 +152,7 @@ select = [ "TID", # flake8-tidy-imports "RUF", # ruff ] -ignore = [ +lint.ignore = [ # E501: line too long # W505: doc line too long # too long docstring due to long example blocks @@ -167,9 +171,9 @@ ignore = [ # use alias for import convention (e.g., `import torch.nn as nn`) "PLR0402", ] -typing-modules = ["omnisafe.typing"] +lint.typing-modules = ["omnisafe.typing"] -[tool.ruff.per-file-ignores] +[tool.ruff.lint.per-file-ignores] "__init__.py" = [ "F401", # unused-import ] @@ -231,15 +235,15 @@ typing-modules = ["omnisafe.typing"] "ANN003", # Missing type annotation ] -[tool.ruff.flake8-annotations] +[tool.ruff.lint.flake8-annotations] allow-star-arg-any = true -[tool.ruff.flake8-quotes] +[tool.ruff.lint.flake8-quotes] docstring-quotes = "double" multiline-quotes = "double" inline-quotes = "single" -[tool.ruff.flake8-tidy-imports] +[tool.ruff.lint.flake8-tidy-imports] ban-relative-imports = 
"all" [tool.pytest.ini_options]