diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml new file mode 100644 index 00000000..ffaffd29 --- /dev/null +++ b/.github/workflows/pre-commit.yml @@ -0,0 +1,24 @@ +name: pre-commit + +on: [ push, pull_request ] + +jobs: + + pre-commit: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v3 + with: + python-version: '3.8' + + - name: Install dependencies + run: | + python -m pip install pre-commit + + - name: Run pre-commit + run: | + pre-commit run --all-files diff --git a/.github/workflows/python-publish-manual.yml b/.github/workflows/python-publish-manual.yml index c3caae87..1f3de732 100644 --- a/.github/workflows/python-publish-manual.yml +++ b/.github/workflows/python-publish-manual.yml @@ -1,4 +1,4 @@ -name: Upload Python Package (manually triggered workflow) +name: pypi (manually triggered workflow) on: workflow_dispatch: @@ -12,22 +12,28 @@ permissions: contents: read jobs: + pypi: name: Publish package to PyPI runs-on: ubuntu-latest if: ${{ github.event.inputs.job == 'pypi'}} + steps: - uses: actions/checkout@v3 + - name: Set up Python uses: actions/setup-python@v3 with: python-version: '3.7' + - name: Install dependencies run: | python -m pip install --upgrade pip pip install build + - name: Build package run: python -m build + - name: Publish package to PyPI uses: pypa/gh-action-pypi-publish@release/v1 with: @@ -39,18 +45,23 @@ jobs: name: Publish package to TestPyPI runs-on: ubuntu-latest if: ${{ github.event.inputs.job == 'test-pypi'}} + steps: - uses: actions/checkout@v3 + - name: Set up Python uses: actions/setup-python@v3 with: python-version: '3.7' + - name: Install dependencies run: | python -m pip install --upgrade pip pip install build + - name: Build package run: python -m build + - name: Publish package to TestPyPI uses: pypa/gh-action-pypi-publish@release/v1 with: diff --git a/.github/workflows/python-test.yml b/.github/workflows/python-test.yml new file mode 100644 index 00000000..b644c064 --- /dev/null +++ b/.github/workflows/python-test.yml @@ -0,0 +1,32 @@ +name: tests + +on: [ push, pull_request ] + +jobs: + + tests: + strategy: + fail-fast: false + matrix: + os: ["ubuntu-latest", "macos-latest", "windows-latest"] + python-version: ["3.7", "3.8", "3.9", "3.10"] + + runs-on: ${{ matrix.os }} + + steps: + - uses: actions/checkout@v3 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v3 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install flake8 pytest hypothesis + python -m pip install -e . 
+ + - name: Tests + run: | + pytest --ignore-glob "tests/test_examples*" diff --git a/.gitignore b/.gitignore index 65099c3b..6b0ca87a 100644 --- a/.gitignore +++ b/.gitignore @@ -2,11 +2,18 @@ # Created by https://www.toptal.com/developers/gitignore/api/python,c,c++,visualstudiocode,cmake # Edit at https://www.toptal.com/developers/gitignore?templates=python,c,c++,visualstudiocode,cmake +### Other files ### +*.xml +*.mp4 +*.zip +*_old.* + ### Library experiments ### *.pt *.npz *.npy *.csv +*.onnx events.out.tfevents.* runs diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 00000000..17d0cb62 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,11 @@ +repos: +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v2.3.0 + hooks: + - id: check-ast + - id: check-case-conflict + - id: check-docstring-first + - id: check-merge-conflict + - id: check-yaml + - id: end-of-file-fixer + - id: trailing-whitespace diff --git a/.readthedocs.yaml b/.readthedocs.yaml index f8924d00..7aed1192 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -23,4 +23,4 @@ build: # Python requirements required to build your docs python: install: - - requirements: docs/requirements.txt \ No newline at end of file + - requirements: docs/requirements.txt diff --git a/CHANGELOG.md b/CHANGELOG.md index 2b66df84..00b260c4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,35 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). +## [0.9.0] - 2023-01-13 +### Added +- Support for Farama Gymnasium interface +- Wrapper for robosuite environments +- Weights & Biases integration (by @juhannc) +- Set the running mode (training or evaluation) of the agents +- Allow clipping the gradient norm for DDPG, TD3 and SAC agents +- Initialize model biases +- Add RNN (RNN, LSTM, GRU and any other variant) support for A2C, DDPG, PPO, SAC, TD3 and TRPO agents +- Allow disabling training/evaluation progressbar +- Farama Shimmy and robosuite examples +- KUKA LBR iiwa real-world example + +### Changed +- Forward model inputs as a Python dictionary [**breaking change**] +- Returns a Python dictionary with extra output values in model calls [**breaking change**] +- Adopt the implementation of `terminated` and `truncated` over `done` for all environments + +### Fixed +- Omniverse Isaac Gym simulation speed for the Franka Emika real-world example +- Call agents' method `record_transition` instead of parent method +to allow storing samples in memories during evaluation +- Move TRPO policy optimization out of the value optimization loop +- Access to the categorical model distribution +- Call reset only once for Gym/Gymnasium vectorized environments + +### Removed +- Deprecated method `start` in trainers + ## [0.8.0] - 2022-10-03 ### Added - AMP agent for physics-based character animation @@ -9,7 +38,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). - Gaussian model mixin - Support for creating shared models - Parameter `role` to model methods -- Wrapper compatibility with the new OpenAI Gym environment API (by @JohannLange) +- Wrapper compatibility with the new OpenAI Gym environment API (by @juhannc) - Internal library colored logger - Migrate checkpoints/models from other RL libraries to skrl models/agents - Configuration parameter `store_separately` to agent configuration dict @@ -22,7 +51,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). 
- Models implementation as Python mixin [**breaking change**] - Multivariate Gaussian model (`GaussianModel` until 0.7.0) to `MultivariateGaussianMixin` - Trainer's `cfg` parameter position and default values -- Show training/evaluation display progress using `tqdm` (by @JohannLange) +- Show training/evaluation display progress using `tqdm` (by @juhannc) - Update Isaac Gym and Omniverse Isaac Gym examples ### Fixed @@ -100,7 +129,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). - More examples and contents in the documentation ### Fixed -- Clip actions using the whole space's limits +- Clip actions using the whole space's limits ## [0.2.0] - 2022-01-18 ### Added diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 53004b68..1017b999 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -18,7 +18,7 @@ If you have a question, please do not open an issue for this. Instead, use the f Open an issue on [skrl's GitHub issues](https://github.com/Toni-SM/skrl/issues) and describe the bug. If possible, please provide some of the following items: - Minimum code that reproduces the bug... -- or the exact steps to reproduce it +- or the exact steps to reproduce it - The error log or a screenshot of it - A link to the source code of the library that you are using (some problems may be due to the use of older versions. If possible, always use the latest version) - Any other information that you think may be useful or help to reproduce/describe the problem @@ -31,7 +31,9 @@ There is a [board](https://github.com/users/Toni-SM/projects/2/views/8) containi - Try to **communicate your change first** to [discuss](https://github.com/Toni-SM/skrl/discussions) the implementation if you want to add a new feature or change an existing one - Modify only the minimum amount of code required and the files needed to make the change +- Use the provided [pre-commit](https://pre-commit.com/) hooks to format the code. Install it by running `pre-commit install` in the root of the repository, running it periodically using `pre-commit run --all` helps reducing commit errors - Changes that are cosmetic in nature (code formatting, removing whitespace, etc.) or that correct grammatical, spelling or typo errors, and that do not add anything substantial to the functionality of the library will generally not be accepted as a pull request + - The only exception are changes that results from the use of the pre-commit hooks #### Coding conventions @@ -51,7 +53,7 @@ Read the code a little bit and you will understand it at first glance... Also ```ini function annotation (e.g. typing) - # insert an empty line + # insert an empty line python libraries and other libraries (e.g. gym, numpy, time, etc.) # insert an empty line machine learning framework modules (e.g. torch, torch.nn) @@ -63,4 +65,4 @@ Read the code a little bit and you will understand it at first glance... 
Also Thank you once again, -Toni \ No newline at end of file +Toni diff --git a/README.md b/README.md index c4ae3499..7313d5f1 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,18 @@ +[![license](https://img.shields.io/pypi/l/skrl)](https://github.com/Toni-SM/skrl) +[![docs](https://readthedocs.org/projects/skrl/badge/?version=latest)](https://skrl.readthedocs.io/en/latest/?badge=latest) +[![pypi](https://img.shields.io/pypi/v/skrl)](https://pypi.org/project/skrl) +   +[![pytest](https://github.com/Toni-SM/skrl/actions/workflows/python-test.yml/badge.svg)](https://github.com/Toni-SM/skrl/actions/workflows/python-test.yml) +[![pre-commit](https://github.com/Toni-SM/skrl/actions/workflows/pre-commit.yml/badge.svg)](https://github.com/Toni-SM/skrl/actions/workflows/pre-commit.yml) + +

SKRL - Reinforcement Learning library


-**skrl** is an open-source modular library for Reinforcement Learning written in Python (using [PyTorch](https://pytorch.org/)) and designed with a focus on readability, simplicity, and transparency of algorithm implementation. In addition to supporting the [OpenAI Gym](https://www.gymlibrary.dev) and [DeepMind](https://github.com/deepmind/dm_env) environment interfaces, it allows loading and configuring [NVIDIA Isaac Gym](https://developer.nvidia.com/isaac-gym/) and [NVIDIA Omniverse Isaac Gym](https://docs.omniverse.nvidia.com/app_isaacsim/app_isaacsim/tutorial_gym_isaac_gym.html) environments, enabling agents' simultaneous training by scopes (subsets of environments among all available environments), which may or may not share resources, in the same run +**skrl** is an open-source modular library for Reinforcement Learning written in Python (using [PyTorch](https://pytorch.org/)) and designed with a focus on readability, simplicity, and transparency of algorithm implementation. In addition to supporting the OpenAI [Gym](https://www.gymlibrary.dev) / Farama [Gymnasium](https://gymnasium.farama.org) and [DeepMind](https://github.com/deepmind/dm_env) environment interfaces, it allows loading and configuring [NVIDIA Isaac Gym](https://developer.nvidia.com/isaac-gym/) and [NVIDIA Omniverse Isaac Gym](https://docs.omniverse.nvidia.com/app_isaacsim/app_isaacsim/tutorial_gym_isaac_gym.html) environments, enabling agents' simultaneous training by scopes (subsets of environments among all available environments), which may or may not share resources, in the same run
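The paragraph above summarizes the environment interfaces the library supports. As a minimal sketch of the wrapped-environment API this changeset moves to (drawn from the new Gym examples and the updated `wrapping.svg`; the environment name, number of environments and the zero action are illustrative only, not part of the diff):

```python
# Minimal sketch, not part of the changeset: wrap a (vectorized) Gym environment
# and interact with it through the Gymnasium-style API shown in wrapping.svg.
import gym
import torch

from skrl.envs.torch import wrap_env

# vectorized Gym environment, as in the new a2c_gym_pendulum.py example
env = wrap_env(gym.vector.make("Pendulum-v1", num_envs=4, asynchronous=False))
device = env.device  # ML-framework-specific device exposed by the wrapper

states, infos = env.reset()                               # reset() -> states, infos
actions = torch.zeros((env.num_envs, 1), device=device)   # zero torque; Pendulum's action is 1-D
# step(actions) -> states, rewards, terminated, truncated, infos
states, rewards, terminated, truncated, infos = env.step(actions)
env.close()
```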
diff --git a/docs/README.md b/docs/README.md index 2e5da518..d06d59ce 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,6 +1,6 @@ -## Documentation +# Documentation -### Install Sphinx and Read the Docs Sphinx Theme +## Install Sphinx and Read the Docs Sphinx Theme ```bash pip install sphinx @@ -9,7 +9,7 @@ pip install sphinx-autobuild pip install sphinx-tabs==3.2.0 ``` -### Building the documentation +## Building the documentation ```bash cd docs diff --git a/docs/requirements.txt b/docs/requirements.txt index dc80cf40..c8ca4fdc 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -3,6 +3,7 @@ sphinx_rtd_theme sphinx-autobuild sphinx-tabs==3.2.0 gym +gymnasium torch tensorboard tqdm diff --git a/docs/source/_static/css/s5defs-roles.css b/docs/source/_static/css/s5defs-roles.css index 05c5dfdb..00feb4b9 100644 --- a/docs/source/_static/css/s5defs-roles.css +++ b/docs/source/_static/css/s5defs-roles.css @@ -81,4 +81,4 @@ .purple { color: purple; -} \ No newline at end of file +} diff --git a/docs/source/_static/imgs/example_robosuite.png b/docs/source/_static/imgs/example_robosuite.png new file mode 100644 index 00000000..5a6a9a83 Binary files /dev/null and b/docs/source/_static/imgs/example_robosuite.png differ diff --git a/docs/source/_static/imgs/example_shimmy.png b/docs/source/_static/imgs/example_shimmy.png new file mode 100644 index 00000000..9500d48b Binary files /dev/null and b/docs/source/_static/imgs/example_shimmy.png differ diff --git a/docs/source/_static/imgs/manual_trainer.svg b/docs/source/_static/imgs/manual_trainer.svg index 460497a3..b2ecb545 100755 --- a/docs/source/_static/imgs/manual_trainer.svg +++ b/docs/source/_static/imgs/manual_trainer.svg @@ -1 +1 @@ -step environmentsenv.step(…)a0ana0ana0an.........a0an...scope0scope1scope2scope3scopemrs0sns0sns0sns0snrrrddddrenderenv.render(…)agent.record_transitions(…)agent.act(…)record transitionscompute actions. . .. . ...............................Execute each agent method sequentially (one agent after the other) in the same processscope0scope1scope2scope3scopem𝒂𝒕𝒅𝒕+𝟏𝒓𝒕+𝟏𝒔𝒕+𝟏A1A2A3. . .Amagentenvagent.post_interaction(…)agent.pre_interaction(…)post-interactionpre-interaction𝒔𝒕resetenv.reset(…)training / evaluationiteration \ No newline at end of file +step environmentsenv.step(…)a0ana0ana0an.........a0an...renderenv.render(…)agent.act(…)record transitionscompute actions.........Execute each agent method sequentially (one agent after the other) in the same processscope0scope1scope2scope3scopem𝒂𝒕A1A2A3. . .Amagentenvagent.post_interaction(…)agent.pre_interaction(…)post-interactionpre-interaction𝒔𝒕resetenv.reset(…)training / evaluationiteration𝑖𝑛𝑓𝑜𝒕𝒕+𝟏𝑻𝒕+𝟏𝒓𝒕+𝟏𝒔𝒕+𝟏scope0scope1scope2scope3scopemrs0sns0sns0sns0snrrrTTTT. . .. . ......................tttt. . 
.agent.record_transitions(…) diff --git a/docs/source/_static/imgs/model_categorical.svg b/docs/source/_static/imgs/model_categorical.svg old mode 100644 new mode 100755 index 8e48383e..19f8b106 --- a/docs/source/_static/imgs/model_categorical.svg +++ b/docs/source/_static/imgs/model_categorical.svg @@ -1 +1 @@ -inputhiddenoutput.compute(…)states (𝒔𝒕)with or withoutactions (𝒂𝒕)unnormalized_log_problog probabilities(logits)probabilities(probs)categoricaldistribution1 2 3 … n𝑃(𝑥)actions (𝒂𝒕+𝟏)log probevaluated at𝑎𝑡+1networkoutput \ No newline at end of file +inputhiddenoutput.compute(…)inputs (𝒔𝒕)unnormalized_log_problog probabilities(logits)probabilities(probs)categoricaldistribution1 2 3 … n𝑃(𝑥)actions (𝒂𝒕+𝟏)log probevaluated at𝑎𝑡+1outputs.act(…) /.forward(…) diff --git a/docs/source/_static/imgs/model_categorical_cnn.svg b/docs/source/_static/imgs/model_categorical_cnn.svg new file mode 100755 index 00000000..da4893c4 --- /dev/null +++ b/docs/source/_static/imgs/model_categorical_cnn.svg @@ -0,0 +1 @@ +FC3+ TanhFC4+TanhFC5logits6432n165121024FC2+ TanhFC1+ ReLUflatten646432341286156415128Conv3+ ReLUConv2+ ReLUConv1+ ReLUstates12288reshape (view)and permute(channels first) diff --git a/docs/source/_static/imgs/model_categorical_mlp.svg b/docs/source/_static/imgs/model_categorical_mlp.svg new file mode 100755 index 00000000..51c4e25e --- /dev/null +++ b/docs/source/_static/imgs/model_categorical_mlp.svg @@ -0,0 +1 @@ +FC1+ ReLUFC2+ReLUFC3stateslogits6432n diff --git a/docs/source/_static/imgs/model_categorical_rnn.svg b/docs/source/_static/imgs/model_categorical_rnn.svg new file mode 100755 index 00000000..3a6130e6 --- /dev/null +++ b/docs/source/_static/imgs/model_categorical_rnn.svg @@ -0,0 +1 @@ +statesreshape(view)(N*L, Hin)(N, L, Hin)finalstatesinitialstates(N, L, D*Hout)flattenRNNHoutFC1+ ReLUFC2+ReLU6432FC3logitsn diff --git a/docs/source/_static/imgs/model_deterministic.svg b/docs/source/_static/imgs/model_deterministic.svg old mode 100644 new mode 100755 index ee65e6d5..182e3023 --- a/docs/source/_static/imgs/model_deterministic.svg +++ b/docs/source/_static/imgs/model_deterministic.svg @@ -1 +1 @@ -inputhiddenoutput.compute(…)states (𝒔𝒕)with or withoutactions (𝒂𝒕)actions (𝒂𝒕+𝟏)NoneNoneclip_actions \ No newline at end of file +inputhiddenoutput.compute(…)actions (𝒂𝒕+𝟏)Noneoutputsclip_actionsinputs (𝒔𝒕)actions (𝒂𝒕+𝟏)Noneoutputs.act(…) /.forward(…) diff --git a/docs/source/_static/imgs/model_deterministic_cnn.svg b/docs/source/_static/imgs/model_deterministic_cnn.svg new file mode 100755 index 00000000..7ddb76f7 --- /dev/null +++ b/docs/source/_static/imgs/model_deterministic_cnn.svg @@ -0,0 +1 @@ +165121024FC2+ TanhFC1+ ReLUflatten646432341286156415128Conv3+ ReLUConv2+ ReLUConv1+ ReLUstatesreshape (view)and permute(channels first)12288FC3+ TanhFC4+Tanh6432FC51takenactions diff --git a/docs/source/_static/imgs/model_deterministic_mlp.svg b/docs/source/_static/imgs/model_deterministic_mlp.svg new file mode 100755 index 00000000..e5a26ce9 --- /dev/null +++ b/docs/source/_static/imgs/model_deterministic_mlp.svg @@ -0,0 +1 @@ +FC1+ ReLUFC2+ReLUFC3statestakenactions64321 diff --git a/docs/source/_static/imgs/model_deterministic_rnn.svg b/docs/source/_static/imgs/model_deterministic_rnn.svg new file mode 100755 index 00000000..8e6dc222 --- /dev/null +++ b/docs/source/_static/imgs/model_deterministic_rnn.svg @@ -0,0 +1 @@ +FC1+ ReLUFC2+ReLUFC364321takenactionsstatesreshape(view)(N*L, Hin)(N, L, Hin)finalstatesinitialstates(N, L, D*Hout)flattenRNNHout diff --git 
a/docs/source/_static/imgs/model_gaussian.svg b/docs/source/_static/imgs/model_gaussian.svg index 92fa89ca..d4cfe41d 100755 --- a/docs/source/_static/imgs/model_gaussian.svg +++ b/docs/source/_static/imgs/model_gaussian.svg @@ -1 +1 @@ -inputhiddenoutput.compute(…)states (𝒔𝒕)with or withoutactions (𝒂𝒕)log standarddeviations(𝑙𝑜𝑔(𝜎))mean actions(𝑎𝑡+1)gaussiandistribution𝒩(𝜇,𝜎)actions (𝒂𝒕+𝟏)log prob evaluated at𝑎𝑡+1mean actions (𝒂𝒕+𝟏)paramclip_log_stdclip_actionsreduction \ No newline at end of file +inputhiddenoutput.compute(…)log standarddeviations(𝑙𝑜𝑔(𝜎))mean actions(𝑎𝑡+1)gaussiandistribution𝒩(𝜇,𝜎)actions (𝒂𝒕+𝟏)log prob evaluated at𝑎𝑡+1outputsparamclip_log_stdclip_actionsreductioninputs (𝒔𝒕).act(…) /.forward(…)actions (𝒂𝒕+𝟏)log probevaluated at𝑎𝑡+1outputs diff --git a/docs/source/_static/imgs/model_gaussian_cnn.svg b/docs/source/_static/imgs/model_gaussian_cnn.svg new file mode 100755 index 00000000..426753a5 --- /dev/null +++ b/docs/source/_static/imgs/model_gaussian_cnn.svg @@ -0,0 +1 @@ +FC3+ TanhFC4+Tanh6432165121024FC2+ TanhFC1+ ReLUflatten646432341286156415128Conv3+ ReLUConv2+ ReLUConv1+ ReLUstatesreshape (view)and permute(channels first)12288meanactionsFC5numactions diff --git a/docs/source/_static/imgs/model_gaussian_mlp.svg b/docs/source/_static/imgs/model_gaussian_mlp.svg new file mode 100755 index 00000000..d1765944 --- /dev/null +++ b/docs/source/_static/imgs/model_gaussian_mlp.svg @@ -0,0 +1 @@ +FC1+ ReLUFC2+ReLUFC3+Tanhstatesmeanactions6432numactions diff --git a/docs/source/_static/imgs/model_gaussian_rnn.svg b/docs/source/_static/imgs/model_gaussian_rnn.svg new file mode 100755 index 00000000..dd33f668 --- /dev/null +++ b/docs/source/_static/imgs/model_gaussian_rnn.svg @@ -0,0 +1 @@ +statesreshape(view)(N*L, Hin)(N, L, Hin)finalstatesinitialstates(N, L, D*Hout)flattenRNNHoutFC1+ ReLUFC2+ReLUFC3+Tanhmeanactions6432numactions diff --git a/docs/source/_static/imgs/model_multivariate_gaussian.svg b/docs/source/_static/imgs/model_multivariate_gaussian.svg old mode 100644 new mode 100755 index 19bd5771..3647f3e2 --- a/docs/source/_static/imgs/model_multivariate_gaussian.svg +++ b/docs/source/_static/imgs/model_multivariate_gaussian.svg @@ -1 +1 @@ -inputhiddenoutput.compute(…)states (𝒔𝒕)with or withoutactions (𝒂𝒕)log standarddeviations(𝑙𝑜𝑔(𝜎))mean actions(𝑎𝑡+1)multivariategaussian distribution𝒩(𝜇,𝛴)actions (𝒂𝒕+𝟏)log prob evaluated at𝑎𝑡+1mean actions (𝒂𝒕+𝟏)paramclip_log_stdclip_actions \ No newline at end of file +inputhiddenoutputlog standarddeviations(𝑙𝑜𝑔(𝜎))mean actions(𝑎𝑡+1)multivariategaussian distribution𝒩(𝜇,𝛴)actions (𝒂𝒕+𝟏)log prob evaluated at𝑎𝑡+1outputsparamclip_log_stdclip_actions.compute(…)inputs (𝒔𝒕).act(…) /.forward(…)actions (𝒂𝒕+𝟏)log probevaluated at𝑎𝑡+1outputs diff --git a/docs/source/_static/imgs/parallel_trainer.svg b/docs/source/_static/imgs/parallel_trainer.svg old mode 100644 new mode 100755 index 07d82364..d950d8b0 --- a/docs/source/_static/imgs/parallel_trainer.svg +++ b/docs/source/_static/imgs/parallel_trainer.svg @@ -1 +1 @@ -step environmentsenv.step(…)a0ana0ana0an.........a0an...scope0scope1scope2scope3scopemrs0sns0sns0sns0snrrrddddrenderenv.render(…)agent.record_transitions(…)agent.post_interaction(…)agent.pre_interaction(…)agent.act(…)record transitionspost-interactionpre-interactioncompute actionsA0A1A2Ambarrier. . .. . .. . 
...............................Execute each agent method in a different process and control synchronization through a multiprocessing.Barrierobjectscope0scope1scope2scope3scopem𝒂𝒕𝒅𝒕+𝟏𝒓𝒕+𝟏𝒔𝒕+𝟏𝒔𝒕agentenvresetenv.reset(…)init \ No newline at end of file +step environmentsenv.step(…)a0ana0ana0an.........a0an...scope0scope1scope2scope3scopemrs0sns0sns0sns0snrrrTTTTrenderenv.render(…)agent.record_transitions(…)agent.post_interaction(…)agent.pre_interaction(…)agent.act(…)record transitionspost-interactionpre-interactioncompute actionsA0A1A2Ambarrier. . .. . .. . ..........Execute each agent method in a different process and control synchronization through a multiprocessing.Barrierobjectscope0scope1scope2scope3scopem𝒂𝒕𝒔𝒕agentenvresetenv.reset(…)init𝑖𝑛𝑓𝑜𝒕𝒕+𝟏𝑻𝒕+𝟏𝒓𝒕+𝟏𝒔𝒕+𝟏.....................tttt. . . diff --git a/docs/source/_static/imgs/rl_schema.svg b/docs/source/_static/imgs/rl_schema.svg index aac7fe7a..0a45da66 100755 --- a/docs/source/_static/imgs/rl_schema.svg +++ b/docs/source/_static/imgs/rl_schema.svg @@ -1 +1 @@ -action (𝒂𝒕)state (𝒔𝒕+𝟏)reward (𝒓𝒕+𝟏)state (𝒔𝒕)reward (𝒓𝒕)agentenv17234586 \ No newline at end of file +action (𝒂𝒕)state (𝒔𝒕+𝟏)reward (𝒓𝒕+𝟏)state (𝒔𝒕)reward (𝒓𝒕)agentenv17234586 diff --git a/docs/source/_static/imgs/sequential_trainer.svg b/docs/source/_static/imgs/sequential_trainer.svg old mode 100644 new mode 100755 index 3495aa86..7565f44a --- a/docs/source/_static/imgs/sequential_trainer.svg +++ b/docs/source/_static/imgs/sequential_trainer.svg @@ -1 +1 @@ -step environmentsenv.step(…)a0ana0ana0an.........a0an...scope0scope1scope2scope3scopemrs0sns0sns0sns0snrrrddddrenderenv.render(…)agent.record_transitions(…)agent.act(…)record transitionscompute actions. . .. . ...............................Execute each agent method sequentially (one agent after the other) in the same processscope0scope1scope2scope3scopem𝒂𝒕𝒅𝒕+𝟏𝒓𝒕+𝟏𝒔𝒕+𝟏A1A2A3. . .Amagentenvagent.post_interaction(…)agent.pre_interaction(…)post-interactionpre-interaction𝒔𝒕resetenv.reset(…)init \ No newline at end of file +step environmentsenv.step(…)a0ana0ana0an.........a0an...renderenv.render(…)agent.record_transitions(…)agent.act(…)record transitionscompute actions.........Execute each agent method sequentially (one agent after the other) in the same processscope0scope1scope2scope3scopem𝒂𝒕A1A2A3. . .Amagentenvagent.post_interaction(…)agent.pre_interaction(…)post-interactionpre-interaction𝒔𝒕resetenv.reset(…)init𝑖𝑛𝑓𝑜𝒕𝒕+𝟏𝑻𝒕+𝟏𝒓𝒕+𝟏𝒔𝒕+𝟏scope0scope1scope2scope3scopemrs0sns0sns0sns0snrrrTTTT. . .. . ......................tttt. . . 
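The sequential, parallel and manual trainer diagrams above all describe the same per-iteration loop: reset, compute actions, step and render the environments, record transitions, and the agents' pre/post-interaction calls. A rough sketch of how the sequential variant is driven in the new example scripts, assuming `env` and `agent` are an already wrapped environment and a configured skrl agent:

```python
# Sketch only: `env` and `agent` are assumed to be a wrapped environment and a
# skrl agent instance, built as in the example scripts added in this changeset.
from skrl.trainers.torch import SequentialTrainer

def run(env, agent, timesteps=100000):
    # trainer configuration used throughout the new examples
    cfg_trainer = {"timesteps": timesteps, "headless": True}
    trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent)
    trainer.train()   # trainer.eval() runs the evaluation loop instead
```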
diff --git a/docs/source/_static/imgs/utils_tensorboard_file_iterator.svg b/docs/source/_static/imgs/utils_tensorboard_file_iterator.svg
index ac016140..43925923 100644
--- a/docs/source/_static/imgs/utils_tensorboard_file_iterator.svg
+++ b/docs/source/_static/imgs/utils_tensorboard_file_iterator.svg
[SVG figure diff omitted: only unrecoverable path/coordinate data from the regenerated utils_tensorboard_file_iterator figure survived extraction]
diff --git a/docs/source/_static/imgs/wrapping.svg b/docs/source/_static/imgs/wrapping.svg
index 02138b44..cd8605c0 100755
--- a/docs/source/_static/imgs/wrapping.svg
+++ b/docs/source/_static/imgs/wrapping.svg
@@ -1 +1 @@
-DeepMindIsaac GymGymnum_envs: intdevice: ML framework specific devicestate_space: gym.Spaceobservation_space: gym.Spaceaction_space: gym.Spacereset()-> observationsstep(actions)-> observations, rewards, dones, infosrender()close()propertiesmethodswrap_env()Omniverse
\ No newline at end of file
+DeepMindOmniverseIsaac GymGymnum_envs: intdevice: ML framework specific devicestate_space: gym/gymnasiumspaceobservation_space: gym/gymnasiumspaceaction_space: gym/gymnasiumspacereset()-> states, infosstep(actions)-> states, rewards, terminated, truncated, infosrender()close()propertiesmethodswrap_env()Isaac GymGymnasium
diff --git a/docs/source/examples/deepmind/dm_manipulation_stack_sac.py b/docs/source/examples/deepmind/dm_manipulation_stack_sac.py
index 55cc071f..2cb943d0
100644 --- a/docs/source/examples/deepmind/dm_manipulation_stack_sac.py +++ b/docs/source/examples/deepmind/dm_manipulation_stack_sac.py @@ -40,33 +40,35 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - def compute(self, states, taken_actions, role): + def compute(self, inputs, role): + states = inputs["states"] + # The dm_control.manipulation tasks have as observation/state spec a `collections.OrderedDict` object as follows: - # OrderedDict([('front_close', BoundedArray(shape=(1, 84, 84, 3), dtype=dtype('uint8'), name='front_close', minimum=0, maximum=255)), - # ('jaco_arm/joints_pos', Array(shape=(1, 6, 2), dtype=dtype('float64'), name='jaco_arm/joints_pos')), - # ('jaco_arm/joints_torque', Array(shape=(1, 6), dtype=dtype('float64'), name='jaco_arm/joints_torque')), - # ('jaco_arm/joints_vel', Array(shape=(1, 6), dtype=dtype('float64'), name='jaco_arm/joints_vel')), - # ('jaco_arm/jaco_hand/joints_pos', Array(shape=(1, 3), dtype=dtype('float64'), name='jaco_arm/jaco_hand/joints_pos')), - # ('jaco_arm/jaco_hand/joints_vel', Array(shape=(1, 3), dtype=dtype('float64'), name='jaco_arm/jaco_hand/joints_vel')), - # ('jaco_arm/jaco_hand/pinch_site_pos', Array(shape=(1, 3), dtype=dtype('float64'), name='jaco_arm/jaco_hand/pinch_site_pos')), + # OrderedDict([('front_close', BoundedArray(shape=(1, 84, 84, 3), dtype=dtype('uint8'), name='front_close', minimum=0, maximum=255)), + # ('jaco_arm/joints_pos', Array(shape=(1, 6, 2), dtype=dtype('float64'), name='jaco_arm/joints_pos')), + # ('jaco_arm/joints_torque', Array(shape=(1, 6), dtype=dtype('float64'), name='jaco_arm/joints_torque')), + # ('jaco_arm/joints_vel', Array(shape=(1, 6), dtype=dtype('float64'), name='jaco_arm/joints_vel')), + # ('jaco_arm/jaco_hand/joints_pos', Array(shape=(1, 3), dtype=dtype('float64'), name='jaco_arm/jaco_hand/joints_pos')), + # ('jaco_arm/jaco_hand/joints_vel', Array(shape=(1, 3), dtype=dtype('float64'), name='jaco_arm/jaco_hand/joints_vel')), + # ('jaco_arm/jaco_hand/pinch_site_pos', Array(shape=(1, 3), dtype=dtype('float64'), name='jaco_arm/jaco_hand/pinch_site_pos')), # ('jaco_arm/jaco_hand/pinch_site_rmat', Array(shape=(1, 9), dtype=dtype('float64'), name='jaco_arm/jaco_hand/pinch_site_rmat'))]) # This spec is converted to a `gym.spaces.Dict` space by the `wrap_env` function as follows: - # Dict(front_close: Box(0, 255, (1, 84, 84, 3), uint8), - # jaco_arm/jaco_hand/joints_pos: Box(-inf, inf, (1, 3), float64), - # jaco_arm/jaco_hand/joints_vel: Box(-inf, inf, (1, 3), float64), - # jaco_arm/jaco_hand/pinch_site_pos: Box(-inf, inf, (1, 3), float64), - # jaco_arm/jaco_hand/pinch_site_rmat: Box(-inf, inf, (1, 9), float64), - # jaco_arm/joints_pos: Box(-inf, inf, (1, 6, 2), float64), - # jaco_arm/joints_torque: Box(-inf, inf, (1, 6), float64), + # Dict(front_close: Box(0, 255, (1, 84, 84, 3), uint8), + # jaco_arm/jaco_hand/joints_pos: Box(-inf, inf, (1, 3), float64), + # jaco_arm/jaco_hand/joints_vel: Box(-inf, inf, (1, 3), float64), + # jaco_arm/jaco_hand/pinch_site_pos: Box(-inf, inf, (1, 3), float64), + # jaco_arm/jaco_hand/pinch_site_rmat: Box(-inf, inf, (1, 9), float64), + # jaco_arm/joints_pos: Box(-inf, inf, (1, 6, 2), float64), + # jaco_arm/joints_torque: Box(-inf, inf, (1, 6), float64), # jaco_arm/joints_vel: Box(-inf, inf, (1, 6), float64)) - + # The `spaces` parameter is a flat tensor of the flattened observation/state space with shape (batch_size, size_of_flat_space). 
# Using the model's method `tensor_to_space` we can convert the flattened tensor to the original space. # https://skrl.readthedocs.io/en/latest/modules/skrl.models.base_class.html#skrl.models.torch.base.Model.tensor_to_space - input = self.tensor_to_space(states, self.observation_space) - - # For this case, the `input` variable is a Python dictionary with the following structure and shapes: + space = self.tensor_to_space(states, self.observation_space) + + # For this case, the `space` variable is a Python dictionary with the following structure and shapes: # {'front_close': torch.Tensor(shape=[batch_size, 1, 84, 84, 3], dtype=torch.float32), # 'jaco_arm/jaco_hand/joints_pos': torch.Tensor(shape=[batch_size, 1, 3], dtype=torch.float32) # 'jaco_arm/jaco_hand/joints_vel': torch.Tensor(shape=[batch_size, 1, 3], dtype=torch.float32) @@ -77,12 +79,14 @@ def compute(self, states, taken_actions, role): # 'jaco_arm/joints_vel': torch.Tensor(shape=[batch_size, 1, 6], dtype=torch.float32)} # permute and normalize the images (samples, width, height, channels) -> (samples, channels, width, height) - features = self.features_extractor(input['front_close'][:,0].permute(0, 3, 1, 2) / 255.0) + features = self.features_extractor(space['front_close'][:,0].permute(0, 3, 1, 2) / 255.0) + + mean_actions = torch.tanh(self.net(torch.cat([features, + space["jaco_arm/joints_pos"].view(states.shape[0], -1), + space["jaco_arm/joints_vel"].view(states.shape[0], -1)], dim=-1))) + + return mean_actions, self.log_std_parameter, {} - return torch.tanh(self.net(torch.cat([features, - input["jaco_arm/joints_pos"].view(states.shape[0], -1), - input["jaco_arm/joints_vel"].view(states.shape[0], -1)], dim=-1))), self.log_std_parameter - class Critic(DeterministicMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False): Model.__init__(self, observation_space, action_space, device) @@ -106,18 +110,20 @@ def __init__(self, observation_space, action_space, device, clip_actions=False): nn.ReLU(), nn.Linear(32, 1)) - def compute(self, states, taken_actions, role): - # map the observations/states to the original space. + def compute(self, inputs, role): + states = inputs["states"] + + # map the observations/states to the original space. 
# See the explanation above (StochasticActor.compute) - input = self.tensor_to_space(states, self.observation_space) - + space = self.tensor_to_space(states, self.observation_space) + # permute and normalize the images (samples, width, height, channels) -> (samples, channels, width, height) - features = self.features_extractor(input['front_close'][:,0].permute(0, 3, 1, 2) / 255.0) + features = self.features_extractor(space['front_close'][:,0].permute(0, 3, 1, 2) / 255.0) - return self.net(torch.cat([features, - input["jaco_arm/joints_pos"].view(states.shape[0], -1), - input["jaco_arm/joints_vel"].view(states.shape[0], -1), - taken_actions], dim=-1)) + return self.net(torch.cat([features, + space["jaco_arm/joints_pos"].view(states.shape[0], -1), + space["jaco_arm/joints_vel"].view(states.shape[0], -1), + inputs["taken_actions"]], dim=-1)), {} # Load and wrap the DeepMind environment @@ -160,10 +166,10 @@ def compute(self, states, taken_actions, role): cfg_sac["experiment"]["checkpoint_interval"] = 5000 -agent_sac = SAC(models=models_sac, - memory=memory, - cfg=cfg_sac, - observation_space=env.observation_space, +agent_sac = SAC(models=models_sac, + memory=memory, + cfg=cfg_sac, + observation_space=env.observation_space, action_space=env.action_space, device=device) diff --git a/docs/source/examples/deepmind/dm_suite_cartpole_swingup_ddpg.py b/docs/source/examples/deepmind/dm_suite_cartpole_swingup_ddpg.py index f0469fb3..1e498c20 100644 --- a/docs/source/examples/deepmind/dm_suite_cartpole_swingup_ddpg.py +++ b/docs/source/examples/deepmind/dm_suite_cartpole_swingup_ddpg.py @@ -16,7 +16,7 @@ # Define the models (deterministic models) for the DDPG agent using mixins # and programming with two approaches (torch functional and torch.nn.Sequential class). 
# - Actor (policy): takes as input the environment's observation/state and returns an action -# - Critic: takes the state and action as input and provides a value to guide the policy +# - Critic: takes the state and action as input and provides a value to guide the policy class DeterministicActor(DeterministicMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False): Model.__init__(self, observation_space, action_space, device) @@ -26,10 +26,10 @@ def __init__(self, observation_space, action_space, device, clip_actions=False): self.linear_layer_2 = nn.Linear(400, 300) self.action_layer = nn.Linear(300, self.num_actions) - def compute(self, states, taken_actions, role): - x = F.relu(self.linear_layer_1(states)) + def compute(self, inputs, role): + x = F.relu(self.linear_layer_1(inputs["states"])) x = F.relu(self.linear_layer_2(x)) - return torch.tanh(self.action_layer(x)) + return torch.tanh(self.action_layer(x)), {} class DeterministicCritic(DeterministicMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False): @@ -42,8 +42,8 @@ def __init__(self, observation_space, action_space, device, clip_actions=False): nn.ReLU(), nn.Linear(300, 1)) - def compute(self, states, taken_actions, role): - return self.net(torch.cat([states, taken_actions], dim=1)) + def compute(self, inputs, role): + return self.net(torch.cat([inputs["states"], inputs["taken_actions"]], dim=1)), {} # Load and wrap the DeepMind environment @@ -83,10 +83,10 @@ def compute(self, states, taken_actions, role): cfg_ddpg["experiment"]["write_interval"] = 1000 cfg_ddpg["experiment"]["checkpoint_interval"] = 5000 -agent_ddpg = DDPG(models=models_ddpg, - memory=memory, - cfg=cfg_ddpg, - observation_space=env.observation_space, +agent_ddpg = DDPG(models=models_ddpg, + memory=memory, + cfg=cfg_ddpg, + observation_space=env.observation_space, action_space=env.action_space, device=device) diff --git a/docs/source/examples/gym/a2c_gym_pendulum.py b/docs/source/examples/gym/a2c_gym_pendulum.py new file mode 100644 index 00000000..8590d020 --- /dev/null +++ b/docs/source/examples/gym/a2c_gym_pendulum.py @@ -0,0 +1,114 @@ +import gym + +import torch +import torch.nn as nn + +# Import the skrl components to build the RL system +from skrl.models.torch import Model, GaussianMixin, DeterministicMixin +from skrl.memories.torch import RandomMemory +from skrl.agents.torch.a2c import A2C, A2C_DEFAULT_CONFIG +from skrl.trainers.torch import SequentialTrainer +from skrl.resources.preprocessors.torch import RunningStandardScaler +from skrl.resources.schedulers.torch import KLAdaptiveRL +from skrl.envs.torch import wrap_env +from skrl.utils import set_seed + + +# Define the models (stochastic and deterministic models) for the agent using mixins. 
+# - Policy: takes as input the environment's observation/state and returns an action +# - Value: takes the state as input and provides a value to guide the policy +class Policy(GaussianMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"): + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction) + + self.net = nn.Sequential(nn.Linear(self.num_observations, 64), + nn.ReLU(), + nn.Linear(64, 64), + nn.ReLU(), + nn.Linear(64, self.num_actions)) + self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + def compute(self, inputs, role): + # Pendulum-v1 action_space is -2 to 2 + return 2 * torch.tanh(self.net(inputs["states"])), self.log_std_parameter, {} + +class Value(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) + + self.net = nn.Sequential(nn.Linear(self.num_observations, 64), + nn.ReLU(), + nn.Linear(64, 64), + nn.ReLU(), + nn.Linear(64, 1)) + + def compute(self, inputs, role): + return self.net(inputs["states"]), {} + + +# Load and wrap the Gym environment. +# Note: the environment version may change depending on the gym version +try: + env = gym.vector.make("Pendulum-v1", num_envs=10, asynchronous=False) +except gym.error.DeprecatedEnv as e: + env_id = [spec.id for spec in gym.envs.registry.all() if spec.id.startswith("Pendulum-v")][0] + print("Pendulum-v1 not found. Trying {}".format(env_id)) + env = gym.vector.make(env_id, num_envs=10, asynchronous=False) +env = wrap_env(env) + +device = env.device + + +# Instantiate a RandomMemory as rollout buffer (any memory can be used for this) +memory = RandomMemory(memory_size=1024, num_envs=env.num_envs, device=device) + + +# Instantiate the agent's models (function approximators). +# A2C requires 2 models, visit its documentation for more details +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.a2c.html#spaces-and-models +models_a2c = {} +models_a2c["policy"] = Policy(env.observation_space, env.action_space, device, clip_actions=True) +models_a2c["value"] = Value(env.observation_space, env.action_space, device) + + +# Configure and instantiate the agent. 
+# Only modify some of the default configuration, visit its documentation to see all the options +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.a2c.html#configuration-and-hyperparameters +cfg_a2c = A2C_DEFAULT_CONFIG.copy() +cfg_a2c["rollouts"] = 1024 # memory_size +cfg_a2c["learning_epochs"] = 10 +cfg_a2c["mini_batches"] = 32 +cfg_a2c["discount_factor"] = 0.9 +cfg_a2c["lambda"] = 0.95 +cfg_a2c["learning_rate"] = 1e-3 +cfg_a2c["learning_rate_scheduler"] = KLAdaptiveRL +cfg_a2c["learning_rate_scheduler_kwargs"] = {"kl_threshold": 0.008, "min_lr": 5e-4} +cfg_a2c["random_timesteps"] = 0 +cfg_a2c["learning_starts"] = 0 +cfg_a2c["grad_norm_clip"] = 0.5 +cfg_a2c["entropy_loss_scale"] = 0.0 +cfg_a2c["state_preprocessor"] = RunningStandardScaler +cfg_a2c["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device} +cfg_a2c["value_preprocessor"] = RunningStandardScaler +cfg_a2c["value_preprocessor_kwargs"] = {"size": 1, "device": device} +# logging to TensorBoard and write checkpoints each 500 and 5000 timesteps respectively +cfg_a2c["experiment"]["write_interval"] = 500 +cfg_a2c["experiment"]["checkpoint_interval"] = 5000 + +agent_ddpg = A2C(models=models_a2c, + memory=memory, + cfg=cfg_a2c, + observation_space=env.observation_space, + action_space=env.action_space, + device=device) + + +# Configure and instantiate the RL trainer +cfg_trainer = {"timesteps": 100000, "headless": True} +trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent_ddpg) + +# start training +trainer.train() diff --git a/docs/source/examples/gym/a2c_gym_pendulumnovel.py b/docs/source/examples/gym/a2c_gym_pendulumnovel.py new file mode 100644 index 00000000..49401866 --- /dev/null +++ b/docs/source/examples/gym/a2c_gym_pendulumnovel.py @@ -0,0 +1,116 @@ +import gym + +import torch +import torch.nn as nn +import numpy as np + +# Import the skrl components to build the RL system +from skrl.models.torch import Model, GaussianMixin, DeterministicMixin +from skrl.memories.torch import RandomMemory +from skrl.agents.torch.a2c import A2C, A2C_DEFAULT_CONFIG +from skrl.trainers.torch import SequentialTrainer +from skrl.resources.preprocessors.torch import RunningStandardScaler +from skrl.resources.schedulers.torch import KLAdaptiveRL +from skrl.envs.torch import wrap_env + + +# Define the models (stochastic and deterministic models) for the agent using mixins. 
+# - Policy: takes as input the environment's observation/state and returns an action +# - Value: takes the state as input and provides a value to guide the policy +class Policy(GaussianMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"): + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction) + + self.net = nn.Sequential(nn.Linear(self.num_observations, 64), + nn.ReLU(), + nn.Linear(64, 64), + nn.ReLU(), + nn.Linear(64, self.num_actions)) + self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + def compute(self, inputs, role): + # Pendulum-v1 action_space is -2 to 2 + return 2 * torch.tanh(self.net(inputs["states"])), self.log_std_parameter, {} + +class Value(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) + + self.net = nn.Sequential(nn.Linear(self.num_observations, 64), + nn.ReLU(), + nn.Linear(64, 64), + nn.ReLU(), + nn.Linear(64, 1)) + + def compute(self, inputs, role): + return self.net(inputs["states"]), {} + + +# Gym environment observation wrapper used to mask velocity. Adapted from rl_zoo3 (rl_zoo3/wrappers.py) +class NoVelocityWrapper(gym.ObservationWrapper): + def observation(self, observation): + # observation: x, y, angular velocity + return observation * np.array([1, 1, 0]) + +gym.envs.registration.register(id="PendulumNoVel-v1", entry_point=lambda: NoVelocityWrapper(gym.make("Pendulum-v1"))) + +# Load and wrap the Gym environment +env = gym.vector.make("PendulumNoVel-v1", num_envs=4, asynchronous=False) +env = wrap_env(env) + +device = env.device + + +# Instantiate a RandomMemory as rollout buffer (any memory can be used for this) +memory = RandomMemory(memory_size=1024, num_envs=env.num_envs, device=device) + + +# Instantiate the agent's models (function approximators). +# A2C requires 2 models, visit its documentation for more details +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.a2c.html#spaces-and-models +models_a2c = {} +models_a2c["policy"] = Policy(env.observation_space, env.action_space, device, clip_actions=True) +models_a2c["value"] = Value(env.observation_space, env.action_space, device) + + +# Configure and instantiate the agent. 
+# Only modify some of the default configuration, visit its documentation to see all the options +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.a2c.html#configuration-and-hyperparameters +cfg_a2c = A2C_DEFAULT_CONFIG.copy() +cfg_a2c["rollouts"] = 1024 # memory_size +cfg_a2c["learning_epochs"] = 10 +cfg_a2c["mini_batches"] = 32 +cfg_a2c["discount_factor"] = 0.9 +cfg_a2c["lambda"] = 0.95 +cfg_a2c["learning_rate"] = 1e-3 +cfg_a2c["learning_rate_scheduler"] = KLAdaptiveRL +cfg_a2c["learning_rate_scheduler_kwargs"] = {"kl_threshold": 0.008, "min_lr": 5e-4} +cfg_a2c["random_timesteps"] = 0 +cfg_a2c["learning_starts"] = 0 +cfg_a2c["grad_norm_clip"] = 0.5 +cfg_a2c["entropy_loss_scale"] = 0.0 +cfg_a2c["state_preprocessor"] = RunningStandardScaler +cfg_a2c["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device} +cfg_a2c["value_preprocessor"] = RunningStandardScaler +cfg_a2c["value_preprocessor_kwargs"] = {"size": 1, "device": device} +# logging to TensorBoard and write checkpoints each 500 and 5000 timesteps respectively +cfg_a2c["experiment"]["write_interval"] = 500 +cfg_a2c["experiment"]["checkpoint_interval"] = 5000 + +agent_ddpg = A2C(models=models_a2c, + memory=memory, + cfg=cfg_a2c, + observation_space=env.observation_space, + action_space=env.action_space, + device=device) + + +# Configure and instantiate the RL trainer +cfg_trainer = {"timesteps": 100000, "headless": True} +trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent_ddpg) + +# start training +trainer.train() diff --git a/docs/source/examples/gym/a2c_gym_pendulumnovel_gru.py b/docs/source/examples/gym/a2c_gym_pendulumnovel_gru.py new file mode 100644 index 00000000..a27b5f87 --- /dev/null +++ b/docs/source/examples/gym/a2c_gym_pendulumnovel_gru.py @@ -0,0 +1,215 @@ +import gym + +import torch +import torch.nn as nn +import numpy as np + +# Import the skrl components to build the RL system +from skrl.models.torch import Model, GaussianMixin, DeterministicMixin +from skrl.memories.torch import RandomMemory +from skrl.agents.torch.a2c import A2C, A2C_DEFAULT_CONFIG +from skrl.trainers.torch import SequentialTrainer +from skrl.resources.preprocessors.torch import RunningStandardScaler +from skrl.resources.schedulers.torch import KLAdaptiveRL +from skrl.envs.torch import wrap_env + + +# Define the models (stochastic and deterministic models) for the agent using mixins. 
+# - Policy: takes as input the environment's observation/state and returns an action +# - Value: takes the state as input and provides a value to guide the policy +class Policy(GaussianMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum", + num_envs=1, num_layers=1, hidden_size=64, sequence_length=128): + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction) + + self.num_envs = num_envs + self.num_layers = num_layers + self.hidden_size = hidden_size # Hout + self.sequence_length = sequence_length + + self.gru = nn.GRU(input_size=self.num_observations, + hidden_size=self.hidden_size, + num_layers=self.num_layers, + batch_first=True) # batch_first -> (batch, sequence, features) + + self.net = nn.Sequential(nn.Linear(self.hidden_size, 64), + nn.ReLU(), + nn.Linear(64, self.num_actions)) + self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + def get_specification(self): + # batch size (N) is the number of envs + return {"rnn": {"sequence_length": self.sequence_length, + "sizes": [(self.num_layers, self.num_envs, self.hidden_size)]}} # hidden states (D ∗ num_layers, N, Hout) + + def compute(self, inputs, role): + states = inputs["states"] + terminated = inputs.get("terminated", None) + hidden_states = inputs["rnn"][0] + + # training + if self.training: + rnn_input = states.view(-1, self.sequence_length, states.shape[-1]) # (N, L, Hin): N=batch_size, L=sequence_length + hidden_states = hidden_states.view(self.num_layers, -1, self.sequence_length, hidden_states.shape[-1]) # (D * num_layers, N, L, Hout) + # get the hidden states corresponding to the initial sequence + hidden_states = hidden_states[:,:,0,:].contiguous() # (D * num_layers, N, Hout) + + # reset the RNN state in the middle of a sequence + if terminated is not None and torch.any(terminated): + rnn_outputs = [] + terminated = terminated.view(-1, self.sequence_length) + indexes = [0] + (terminated[:,:-1].any(dim=0).nonzero(as_tuple=True)[0] + 1).tolist() + [self.sequence_length] + + for i in range(len(indexes) - 1): + i0, i1 = indexes[i], indexes[i + 1] + rnn_output, hidden_states = self.gru(rnn_input[:,i0:i1,:], hidden_states) + hidden_states[:, (terminated[:,i1-1]), :] = 0 + rnn_outputs.append(rnn_output) + + rnn_output = torch.cat(rnn_outputs, dim=1) + # no need to reset the RNN state in the sequence + else: + rnn_output, hidden_states = self.gru(rnn_input, hidden_states) + # rollout + else: + rnn_input = states.view(-1, 1, states.shape[-1]) # (N, L, Hin): N=num_envs, L=1 + rnn_output, hidden_states = self.gru(rnn_input, hidden_states) + + # flatten the RNN output + rnn_output = torch.flatten(rnn_output, start_dim=0, end_dim=1) # (N, L, D ∗ Hout) -> (N * L, D ∗ Hout) + + # Pendulum-v1 action_space is -2 to 2 + return 2 * torch.tanh(self.net(rnn_output)), self.log_std_parameter, {"rnn": [hidden_states]} + +class Value(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + num_envs=1, num_layers=1, hidden_size=64, sequence_length=128): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) + + self.num_envs = num_envs + self.num_layers = num_layers + self.hidden_size = hidden_size # Hout + self.sequence_length = sequence_length + + self.gru = nn.GRU(input_size=self.num_observations, + 
hidden_size=self.hidden_size, + num_layers=self.num_layers, + batch_first=True) # batch_first -> (batch, sequence, features) + + self.net = nn.Sequential(nn.Linear(self.hidden_size, 64), + nn.ReLU(), + nn.Linear(64, 1)) + + def get_specification(self): + # batch size (N) is the number of envs + return {"rnn": {"sequence_length": self.sequence_length, + "sizes": [(self.num_layers, self.num_envs, self.hidden_size)]}} # hidden states (D ∗ num_layers, N, Hout) + + def compute(self, inputs, role): + states = inputs["states"] + terminated = inputs.get("terminated", None) + hidden_states = inputs["rnn"][0] + + # training + if self.training: + rnn_input = states.view(-1, self.sequence_length, states.shape[-1]) # (N, L, Hin): N=batch_size, L=sequence_length + + hidden_states = hidden_states.view(self.num_layers, -1, self.sequence_length, hidden_states.shape[-1]) # (D * num_layers, N, L, Hout) + # get the hidden states corresponding to the initial sequence + hidden_states = hidden_states[:,:,0,:].contiguous() # (D * num_layers, N, Hout) + + # reset the RNN state in the middle of a sequence + if terminated is not None and torch.any(terminated): + rnn_outputs = [] + terminated = terminated.view(-1, self.sequence_length) + indexes = [0] + (terminated[:,:-1].any(dim=0).nonzero(as_tuple=True)[0] + 1).tolist() + [self.sequence_length] + + for i in range(len(indexes) - 1): + i0, i1 = indexes[i], indexes[i + 1] + rnn_output, hidden_states = self.gru(rnn_input[:,i0:i1,:], hidden_states) + hidden_states[:, (terminated[:,i1-1]), :] = 0 + rnn_outputs.append(rnn_output) + + rnn_output = torch.cat(rnn_outputs, dim=1) + # no need to reset the RNN state in the sequence + else: + rnn_output, hidden_states = self.gru(rnn_input, hidden_states) + # rollout + else: + rnn_input = states.view(-1, 1, states.shape[-1]) # (N, L, Hin): N=num_envs, L=1 + rnn_output, hidden_states = self.gru(rnn_input, hidden_states) + + # flatten the RNN output + rnn_output = torch.flatten(rnn_output, start_dim=0, end_dim=1) # (N, L, D ∗ Hout) -> (N * L, D ∗ Hout) + + return self.net(rnn_output), {"rnn": [hidden_states]} + + +# Gym environment observation wrapper used to mask velocity. Adapted from rl_zoo3 (rl_zoo3/wrappers.py) +class NoVelocityWrapper(gym.ObservationWrapper): + def observation(self, observation): + # observation: x, y, angular velocity + return observation * np.array([1, 1, 0]) + +gym.envs.registration.register(id="PendulumNoVel-v1", entry_point=lambda: NoVelocityWrapper(gym.make("Pendulum-v1"))) + +# Load and wrap the Gym environment +env = gym.vector.make("PendulumNoVel-v1", num_envs=4, asynchronous=False) +env = wrap_env(env) + +device = env.device + + +# Instantiate a RandomMemory as rollout buffer (any memory can be used for this) +memory = RandomMemory(memory_size=1024, num_envs=env.num_envs, device=device) + + +# Instantiate the agent's models (function approximators). +# A2C requires 2 models, visit its documentation for more details +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.a2c.html#spaces-and-models +models_a2c = {} +models_a2c["policy"] = Policy(env.observation_space, env.action_space, device, clip_actions=True, num_envs=env.num_envs) +models_a2c["value"] = Value(env.observation_space, env.action_space, device, num_envs=env.num_envs) + + +# Configure and instantiate the agent. 
+# Only modify some of the default configuration, visit its documentation to see all the options +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.a2c.html#configuration-and-hyperparameters +cfg_a2c = A2C_DEFAULT_CONFIG.copy() +cfg_a2c["rollouts"] = 1024 # memory_size +cfg_a2c["learning_epochs"] = 10 +cfg_a2c["mini_batches"] = 32 +cfg_a2c["discount_factor"] = 0.9 +cfg_a2c["lambda"] = 0.95 +cfg_a2c["learning_rate"] = 1e-3 +cfg_a2c["learning_rate_scheduler"] = KLAdaptiveRL +cfg_a2c["learning_rate_scheduler_kwargs"] = {"kl_threshold": 0.008, "min_lr": 5e-4} +cfg_a2c["random_timesteps"] = 0 +cfg_a2c["learning_starts"] = 0 +cfg_a2c["grad_norm_clip"] = 0.5 +cfg_a2c["entropy_loss_scale"] = 0.0 +cfg_a2c["state_preprocessor"] = RunningStandardScaler +cfg_a2c["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device} +cfg_a2c["value_preprocessor"] = RunningStandardScaler +cfg_a2c["value_preprocessor_kwargs"] = {"size": 1, "device": device} +# logging to TensorBoard and write checkpoints each 500 and 5000 timesteps respectively +cfg_a2c["experiment"]["write_interval"] = 500 +cfg_a2c["experiment"]["checkpoint_interval"] = 5000 + +agent_ddpg = A2C(models=models_a2c, + memory=memory, + cfg=cfg_a2c, + observation_space=env.observation_space, + action_space=env.action_space, + device=device) + + +# Configure and instantiate the RL trainer +cfg_trainer = {"timesteps": 100000, "headless": True} +trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent_ddpg) + +# start training +trainer.train() diff --git a/docs/source/examples/gym/a2c_gym_pendulumnovel_lstm.py b/docs/source/examples/gym/a2c_gym_pendulumnovel_lstm.py new file mode 100644 index 00000000..ab0731ad --- /dev/null +++ b/docs/source/examples/gym/a2c_gym_pendulumnovel_lstm.py @@ -0,0 +1,225 @@ +import gym + +import torch +import torch.nn as nn +import numpy as np + +# Import the skrl components to build the RL system +from skrl.models.torch import Model, GaussianMixin, DeterministicMixin +from skrl.memories.torch import RandomMemory +from skrl.agents.torch.a2c import A2C, A2C_DEFAULT_CONFIG +from skrl.trainers.torch import SequentialTrainer +from skrl.resources.preprocessors.torch import RunningStandardScaler +from skrl.resources.schedulers.torch import KLAdaptiveRL +from skrl.envs.torch import wrap_env + + +# Define the models (stochastic and deterministic models) for the agent using mixins. 
+# - Policy: takes as input the environment's observation/state and returns an action +# - Value: takes the state as input and provides a value to guide the policy +class Policy(GaussianMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum", + num_envs=1, num_layers=1, hidden_size=64, sequence_length=128): + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction) + + self.num_envs = num_envs + self.num_layers = num_layers + self.hidden_size = hidden_size # Hcell (Hout is Hcell because proj_size = 0) + self.sequence_length = sequence_length + + self.lstm = nn.LSTM(input_size=self.num_observations, + hidden_size=self.hidden_size, + num_layers=self.num_layers, + batch_first=True) # batch_first -> (batch, sequence, features) + + self.net = nn.Sequential(nn.Linear(self.hidden_size, 64), + nn.ReLU(), + nn.Linear(64, self.num_actions)) + self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + def get_specification(self): + # batch size (N) is the number of envs + return {"rnn": {"sequence_length": self.sequence_length, + "sizes": [(self.num_layers, self.num_envs, self.hidden_size), # hidden states (D ∗ num_layers, N, Hout) + (self.num_layers, self.num_envs, self.hidden_size)]}} # cell states (D ∗ num_layers, N, Hcell) + + def compute(self, inputs, role): + states = inputs["states"] + terminated = inputs.get("terminated", None) + hidden_states, cell_states = inputs["rnn"][0], inputs["rnn"][1] + + # training + if self.training: + rnn_input = states.view(-1, self.sequence_length, states.shape[-1]) # (N, L, Hin): N=batch_size, L=sequence_length + hidden_states = hidden_states.view(self.num_layers, -1, self.sequence_length, hidden_states.shape[-1]) # (D * num_layers, N, L, Hout) + cell_states = cell_states.view(self.num_layers, -1, self.sequence_length, cell_states.shape[-1]) # (D * num_layers, N, L, Hcell) + # get the hidden/cell states corresponding to the initial sequence + hidden_states = hidden_states[:,:,0,:].contiguous() # (D * num_layers, N, Hout) + cell_states = cell_states[:,:,0,:].contiguous() # (D * num_layers, N, Hcell) + + # reset the RNN state in the middle of a sequence + if terminated is not None and torch.any(terminated): + rnn_outputs = [] + terminated = terminated.view(-1, self.sequence_length) + indexes = [0] + (terminated[:,:-1].any(dim=0).nonzero(as_tuple=True)[0] + 1).tolist() + [self.sequence_length] + + for i in range(len(indexes) - 1): + i0, i1 = indexes[i], indexes[i + 1] + rnn_output, (hidden_states, cell_states) = self.lstm(rnn_input[:,i0:i1,:], (hidden_states, cell_states)) + hidden_states[:, (terminated[:,i1-1]), :] = 0 + cell_states[:, (terminated[:,i1-1]), :] = 0 + rnn_outputs.append(rnn_output) + + rnn_states = (hidden_states, cell_states) + rnn_output = torch.cat(rnn_outputs, dim=1) + # no need to reset the RNN state in the sequence + else: + rnn_output, rnn_states = self.lstm(rnn_input, (hidden_states, cell_states)) + # rollout + else: + rnn_input = states.view(-1, 1, states.shape[-1]) # (N, L, Hin): N=num_envs, L=1 + rnn_output, rnn_states = self.lstm(rnn_input, (hidden_states, cell_states)) + + # flatten the RNN output + rnn_output = torch.flatten(rnn_output, start_dim=0, end_dim=1) # (N, L, D ∗ Hout) -> (N * L, D ∗ Hout) + + # Pendulum-v1 action_space is -2 to 2 + return 2 * torch.tanh(self.net(rnn_output)), self.log_std_parameter, 
{"rnn": [rnn_states[0], rnn_states[1]]} + +class Value(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + num_envs=1, num_layers=1, hidden_size=64, sequence_length=128): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) + + self.num_envs = num_envs + self.num_layers = num_layers + self.hidden_size = hidden_size # Hcell (Hout is Hcell because proj_size = 0) + self.sequence_length = sequence_length + + self.lstm = nn.LSTM(input_size=self.num_observations, + hidden_size=self.hidden_size, + num_layers=self.num_layers, + batch_first=True) # batch_first -> (batch, sequence, features) + + self.net = nn.Sequential(nn.Linear(self.hidden_size, 64), + nn.ReLU(), + nn.Linear(64, 1)) + + def get_specification(self): + # batch size (N) is the number of envs + return {"rnn": {"sequence_length": self.sequence_length, + "sizes": [(self.num_layers, self.num_envs, self.hidden_size), # hidden states (D ∗ num_layers, N, Hout) + (self.num_layers, self.num_envs, self.hidden_size)]}} # cell states (D ∗ num_layers, N, Hcell) + + def compute(self, inputs, role): + states = inputs["states"] + terminated = inputs.get("terminated", None) + hidden_states, cell_states = inputs["rnn"][0], inputs["rnn"][1] + + # training + if self.training: + rnn_input = states.view(-1, self.sequence_length, states.shape[-1]) # (N, L, Hin): N=batch_size, L=sequence_length + + hidden_states = hidden_states.view(self.num_layers, -1, self.sequence_length, hidden_states.shape[-1]) # (D * num_layers, N, L, Hout) + cell_states = cell_states.view(self.num_layers, -1, self.sequence_length, cell_states.shape[-1]) # (D * num_layers, N, L, Hcell) + # get the hidden/cell states corresponding to the initial sequence + hidden_states = hidden_states[:,:,0,:].contiguous() # (D * num_layers, N, Hout) + cell_states = cell_states[:,:,0,:].contiguous() # (D * num_layers, N, Hcell) + + # reset the RNN state in the middle of a sequence + if terminated is not None and torch.any(terminated): + rnn_outputs = [] + terminated = terminated.view(-1, self.sequence_length) + indexes = [0] + (terminated[:,:-1].any(dim=0).nonzero(as_tuple=True)[0] + 1).tolist() + [self.sequence_length] + + for i in range(len(indexes) - 1): + i0, i1 = indexes[i], indexes[i + 1] + rnn_output, (hidden_states, cell_states) = self.lstm(rnn_input[:,i0:i1,:], (hidden_states, cell_states)) + hidden_states[:, (terminated[:,i1-1]), :] = 0 + cell_states[:, (terminated[:,i1-1]), :] = 0 + rnn_outputs.append(rnn_output) + + rnn_states = (hidden_states, cell_states) + rnn_output = torch.cat(rnn_outputs, dim=1) + # no need to reset the RNN state in the sequence + else: + rnn_output, rnn_states = self.lstm(rnn_input, (hidden_states, cell_states)) + # rollout + else: + rnn_input = states.view(-1, 1, states.shape[-1]) # (N, L, Hin): N=num_envs, L=1 + rnn_output, rnn_states = self.lstm(rnn_input, (hidden_states, cell_states)) + + # flatten the RNN output + rnn_output = torch.flatten(rnn_output, start_dim=0, end_dim=1) # (N, L, D ∗ Hout) -> (N * L, D ∗ Hout) + + return self.net(rnn_output), {"rnn": [rnn_states[0], rnn_states[1]]} + + +# Gym environment observation wrapper used to mask velocity. 
Adapted from rl_zoo3 (rl_zoo3/wrappers.py)
+class NoVelocityWrapper(gym.ObservationWrapper):
+    def observation(self, observation):
+        # observation: x, y, angular velocity
+        return observation * np.array([1, 1, 0])
+
+gym.envs.registration.register(id="PendulumNoVel-v1", entry_point=lambda: NoVelocityWrapper(gym.make("Pendulum-v1")))
+
+# Load and wrap the Gym environment
+env = gym.vector.make("PendulumNoVel-v1", num_envs=4, asynchronous=False)
+env = wrap_env(env)
+
+device = env.device
+
+
+# Instantiate a RandomMemory as rollout buffer (any memory can be used for this)
+memory = RandomMemory(memory_size=1024, num_envs=env.num_envs, device=device)
+
+
+# Instantiate the agent's models (function approximators).
+# A2C requires 2 models, visit its documentation for more details
+# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.a2c.html#spaces-and-models
+models_a2c = {}
+models_a2c["policy"] = Policy(env.observation_space, env.action_space, device, clip_actions=True, num_envs=env.num_envs)
+models_a2c["value"] = Value(env.observation_space, env.action_space, device, num_envs=env.num_envs)
+
+
+# Configure and instantiate the agent.
+# Only modify some of the default configuration, visit its documentation to see all the options
+# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.a2c.html#configuration-and-hyperparameters
+cfg_a2c = A2C_DEFAULT_CONFIG.copy()
+cfg_a2c["rollouts"] = 1024  # memory_size
+cfg_a2c["learning_epochs"] = 10
+cfg_a2c["mini_batches"] = 32
+cfg_a2c["discount_factor"] = 0.9
+cfg_a2c["lambda"] = 0.95
+cfg_a2c["learning_rate"] = 1e-3
+cfg_a2c["learning_rate_scheduler"] = KLAdaptiveRL
+cfg_a2c["learning_rate_scheduler_kwargs"] = {"kl_threshold": 0.008, "min_lr": 5e-4}
+cfg_a2c["random_timesteps"] = 0
+cfg_a2c["learning_starts"] = 0
+cfg_a2c["grad_norm_clip"] = 0.5
+cfg_a2c["entropy_loss_scale"] = 0.0
+cfg_a2c["state_preprocessor"] = RunningStandardScaler
+cfg_a2c["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device}
+cfg_a2c["value_preprocessor"] = RunningStandardScaler
+cfg_a2c["value_preprocessor_kwargs"] = {"size": 1, "device": device}
+# logging to TensorBoard and write checkpoints each 500 and 5000 timesteps respectively
+cfg_a2c["experiment"]["write_interval"] = 500
+cfg_a2c["experiment"]["checkpoint_interval"] = 5000
+
+agent_a2c = A2C(models=models_a2c,
+                memory=memory,
+                cfg=cfg_a2c,
+                observation_space=env.observation_space,
+                action_space=env.action_space,
+                device=device)
+
+
+# Configure and instantiate the RL trainer
+cfg_trainer = {"timesteps": 100000, "headless": True}
+trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent_a2c)
+
+# start training
+trainer.train()
diff --git a/docs/source/examples/gym/a2c_gym_pendulumnovel_rnn.py b/docs/source/examples/gym/a2c_gym_pendulumnovel_rnn.py
new file mode 100644
index 00000000..6357a894
--- /dev/null
+++ b/docs/source/examples/gym/a2c_gym_pendulumnovel_rnn.py
@@ -0,0 +1,215 @@
+import gym
+
+import torch
+import torch.nn as nn
+import numpy as np
+
+# Import the skrl components to build the RL system
+from skrl.models.torch import Model, GaussianMixin, DeterministicMixin
+from skrl.memories.torch import RandomMemory
+from skrl.agents.torch.a2c import A2C, A2C_DEFAULT_CONFIG
+from skrl.trainers.torch import SequentialTrainer
+from skrl.resources.preprocessors.torch import RunningStandardScaler
+from skrl.resources.schedulers.torch import KLAdaptiveRL
+from skrl.envs.torch import wrap_env
+
+
+# Define the models (stochastic and deterministic models) for
the agent using mixins. +# - Policy: takes as input the environment's observation/state and returns an action +# - Value: takes the state as input and provides a value to guide the policy +class Policy(GaussianMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum", + num_envs=1, num_layers=1, hidden_size=64, sequence_length=128): + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction) + + self.num_envs = num_envs + self.num_layers = num_layers + self.hidden_size = hidden_size # Hout + self.sequence_length = sequence_length + + self.rnn = nn.RNN(input_size=self.num_observations, + hidden_size=self.hidden_size, + num_layers=self.num_layers, + batch_first=True) # batch_first -> (batch, sequence, features) + + self.net = nn.Sequential(nn.Linear(self.hidden_size, 64), + nn.ReLU(), + nn.Linear(64, self.num_actions)) + self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + def get_specification(self): + # batch size (N) is the number of envs + return {"rnn": {"sequence_length": self.sequence_length, + "sizes": [(self.num_layers, self.num_envs, self.hidden_size)]}} # hidden states (D ∗ num_layers, N, Hout) + + def compute(self, inputs, role): + states = inputs["states"] + terminated = inputs.get("terminated", None) + hidden_states = inputs["rnn"][0] + + # training + if self.training: + rnn_input = states.view(-1, self.sequence_length, states.shape[-1]) # (N, L, Hin): N=batch_size, L=sequence_length + hidden_states = hidden_states.view(self.num_layers, -1, self.sequence_length, hidden_states.shape[-1]) # (D * num_layers, N, L, Hout) + # get the hidden states corresponding to the initial sequence + hidden_states = hidden_states[:,:,0,:].contiguous() # (D * num_layers, N, Hout) + + # reset the RNN state in the middle of a sequence + if terminated is not None and torch.any(terminated): + rnn_outputs = [] + terminated = terminated.view(-1, self.sequence_length) + indexes = [0] + (terminated[:,:-1].any(dim=0).nonzero(as_tuple=True)[0] + 1).tolist() + [self.sequence_length] + + for i in range(len(indexes) - 1): + i0, i1 = indexes[i], indexes[i + 1] + rnn_output, hidden_states = self.rnn(rnn_input[:,i0:i1,:], hidden_states) + hidden_states[:, (terminated[:,i1-1]), :] = 0 + rnn_outputs.append(rnn_output) + + rnn_output = torch.cat(rnn_outputs, dim=1) + # no need to reset the RNN state in the sequence + else: + rnn_output, hidden_states = self.rnn(rnn_input, hidden_states) + # rollout + else: + rnn_input = states.view(-1, 1, states.shape[-1]) # (N, L, Hin): N=num_envs, L=1 + rnn_output, hidden_states = self.rnn(rnn_input, hidden_states) + + # flatten the RNN output + rnn_output = torch.flatten(rnn_output, start_dim=0, end_dim=1) # (N, L, D ∗ Hout) -> (N * L, D ∗ Hout) + + # Pendulum-v1 action_space is -2 to 2 + return 2 * torch.tanh(self.net(rnn_output)), self.log_std_parameter, {"rnn": [hidden_states]} + +class Value(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + num_envs=1, num_layers=1, hidden_size=64, sequence_length=128): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) + + self.num_envs = num_envs + self.num_layers = num_layers + self.hidden_size = hidden_size # Hout + self.sequence_length = sequence_length + + self.rnn = 
nn.RNN(input_size=self.num_observations, + hidden_size=self.hidden_size, + num_layers=self.num_layers, + batch_first=True) # batch_first -> (batch, sequence, features) + + self.net = nn.Sequential(nn.Linear(self.hidden_size, 64), + nn.ReLU(), + nn.Linear(64, 1)) + + def get_specification(self): + # batch size (N) is the number of envs + return {"rnn": {"sequence_length": self.sequence_length, + "sizes": [(self.num_layers, self.num_envs, self.hidden_size)]}} # hidden states (D ∗ num_layers, N, Hout) + + def compute(self, inputs, role): + states = inputs["states"] + terminated = inputs.get("terminated", None) + hidden_states = inputs["rnn"][0] + + # training + if self.training: + rnn_input = states.view(-1, self.sequence_length, states.shape[-1]) # (N, L, Hin): N=batch_size, L=sequence_length + + hidden_states = hidden_states.view(self.num_layers, -1, self.sequence_length, hidden_states.shape[-1]) # (D * num_layers, N, L, Hout) + # get the hidden states corresponding to the initial sequence + hidden_states = hidden_states[:,:,0,:].contiguous() # (D * num_layers, N, Hout) + + # reset the RNN state in the middle of a sequence + if terminated is not None and torch.any(terminated): + rnn_outputs = [] + terminated = terminated.view(-1, self.sequence_length) + indexes = [0] + (terminated[:,:-1].any(dim=0).nonzero(as_tuple=True)[0] + 1).tolist() + [self.sequence_length] + + for i in range(len(indexes) - 1): + i0, i1 = indexes[i], indexes[i + 1] + rnn_output, hidden_states = self.rnn(rnn_input[:,i0:i1,:], hidden_states) + hidden_states[:, (terminated[:,i1-1]), :] = 0 + rnn_outputs.append(rnn_output) + + rnn_output = torch.cat(rnn_outputs, dim=1) + # no need to reset the RNN state in the sequence + else: + rnn_output, hidden_states = self.rnn(rnn_input, hidden_states) + # rollout + else: + rnn_input = states.view(-1, 1, states.shape[-1]) # (N, L, Hin): N=num_envs, L=1 + rnn_output, hidden_states = self.rnn(rnn_input, hidden_states) + + # flatten the RNN output + rnn_output = torch.flatten(rnn_output, start_dim=0, end_dim=1) # (N, L, D ∗ Hout) -> (N * L, D ∗ Hout) + + return self.net(rnn_output), {"rnn": [hidden_states]} + + +# Gym environment observation wrapper used to mask velocity. Adapted from rl_zoo3 (rl_zoo3/wrappers.py) +class NoVelocityWrapper(gym.ObservationWrapper): + def observation(self, observation): + # observation: x, y, angular velocity + return observation * np.array([1, 1, 0]) + +gym.envs.registration.register(id="PendulumNoVel-v1", entry_point=lambda: NoVelocityWrapper(gym.make("Pendulum-v1"))) + +# Load and wrap the Gym environment +env = gym.vector.make("PendulumNoVel-v1", num_envs=4, asynchronous=False) +env = wrap_env(env) + +device = env.device + + +# Instantiate a RandomMemory as rollout buffer (any memory can be used for this) +memory = RandomMemory(memory_size=1024, num_envs=env.num_envs, device=device) + + +# Instantiate the agent's models (function approximators). +# A2C requires 2 models, visit its documentation for more details +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.a2c.html#spaces-and-models +models_a2c = {} +models_a2c["policy"] = Policy(env.observation_space, env.action_space, device, clip_actions=True, num_envs=env.num_envs) +models_a2c["value"] = Value(env.observation_space, env.action_space, device, num_envs=env.num_envs) + + +# Configure and instantiate the agent. 
+# Only modify some of the default configuration, visit its documentation to see all the options
+# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.a2c.html#configuration-and-hyperparameters
+cfg_a2c = A2C_DEFAULT_CONFIG.copy()
+cfg_a2c["rollouts"] = 1024  # memory_size
+cfg_a2c["learning_epochs"] = 10
+cfg_a2c["mini_batches"] = 32
+cfg_a2c["discount_factor"] = 0.9
+cfg_a2c["lambda"] = 0.95
+cfg_a2c["learning_rate"] = 1e-3
+cfg_a2c["learning_rate_scheduler"] = KLAdaptiveRL
+cfg_a2c["learning_rate_scheduler_kwargs"] = {"kl_threshold": 0.008, "min_lr": 5e-4}
+cfg_a2c["random_timesteps"] = 0
+cfg_a2c["learning_starts"] = 0
+cfg_a2c["grad_norm_clip"] = 0.5
+cfg_a2c["entropy_loss_scale"] = 0.0
+cfg_a2c["state_preprocessor"] = RunningStandardScaler
+cfg_a2c["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device}
+cfg_a2c["value_preprocessor"] = RunningStandardScaler
+cfg_a2c["value_preprocessor_kwargs"] = {"size": 1, "device": device}
+# logging to TensorBoard and write checkpoints each 500 and 5000 timesteps respectively
+cfg_a2c["experiment"]["write_interval"] = 500
+cfg_a2c["experiment"]["checkpoint_interval"] = 5000
+
+agent_a2c = A2C(models=models_a2c,
+                memory=memory,
+                cfg=cfg_a2c,
+                observation_space=env.observation_space,
+                action_space=env.action_space,
+                device=device)
+
+
+# Configure and instantiate the RL trainer
+cfg_trainer = {"timesteps": 100000, "headless": True}
+trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent_a2c)
+
+# start training
+trainer.train()
diff --git a/docs/source/examples/gym/gym_cartpole_cem.py b/docs/source/examples/gym/cem_gym_cartpole.py
similarity index 90%
rename from docs/source/examples/gym/gym_cartpole_cem.py
rename to docs/source/examples/gym/cem_gym_cartpole.py
index ac78940b..96082e71 100644
--- a/docs/source/examples/gym/gym_cartpole_cem.py
+++ b/docs/source/examples/gym/cem_gym_cartpole.py
@@ -22,10 +22,10 @@ def __init__(self, observation_space, action_space, device, unnormalized_log_pro
         self.linear_layer_2 = nn.Linear(64, 64)
         self.output_layer = nn.Linear(64, self.num_actions)
 
-    def compute(self, states, taken_actions, role):
-        x = F.relu(self.linear_layer_1(states))
+    def compute(self, inputs, role):
+        x = F.relu(self.linear_layer_1(inputs["states"]))
         x = F.relu(self.linear_layer_2(x))
-        return self.output_layer(x)
+        return self.output_layer(x), {}
 
 
 # Load and wrap the Gym environment.
@@ -66,10 +66,10 @@ def compute(self, states, taken_actions, role): cfg_cem["experiment"]["write_interval"] = 1000 cfg_cem["experiment"]["checkpoint_interval"] = 5000 -agent_cem = CEM(models=models_cem, - memory=memory, - cfg=cfg_cem, - observation_space=env.observation_space, +agent_cem = CEM(models=models_cem, + memory=memory, + cfg=cfg_cem, + observation_space=env.observation_space, action_space=env.action_space, device=device) diff --git a/docs/source/examples/gym/gym_cartpole_cem_eval.py b/docs/source/examples/gym/cem_gym_cartpole_eval.py similarity index 89% rename from docs/source/examples/gym/gym_cartpole_cem_eval.py rename to docs/source/examples/gym/cem_gym_cartpole_eval.py index 72a3633e..e3c33247 100644 --- a/docs/source/examples/gym/gym_cartpole_cem_eval.py +++ b/docs/source/examples/gym/cem_gym_cartpole_eval.py @@ -21,10 +21,10 @@ def __init__(self, observation_space, action_space, device, unnormalized_log_pro self.linear_layer_2 = nn.Linear(64, 64) self.output_layer = nn.Linear(64, self.num_actions) - def compute(self, states, taken_actions, role): - x = F.relu(self.linear_layer_1(states)) + def compute(self, inputs, role): + x = F.relu(self.linear_layer_1(inputs["states"])) x = F.relu(self.linear_layer_2(x)) - return self.output_layer(x) + return self.output_layer(x), {} # Load and wrap the Gym environment. @@ -57,10 +57,10 @@ def compute(self, states, taken_actions, role): cfg_cem["experiment"]["write_interval"] = 1000 cfg_cem["experiment"]["checkpoint_interval"] = 0 -agent_cem = CEM(models=models_cem, - memory=None, - cfg=cfg_cem, - observation_space=env.observation_space, +agent_cem = CEM(models=models_cem, + memory=None, + cfg=cfg_cem, + observation_space=env.observation_space, action_space=env.action_space, device=device) diff --git a/docs/source/examples/gym/ddpg_gym_pendulum.py b/docs/source/examples/gym/ddpg_gym_pendulum.py new file mode 100644 index 00000000..e57eb411 --- /dev/null +++ b/docs/source/examples/gym/ddpg_gym_pendulum.py @@ -0,0 +1,106 @@ +import gym + +import torch +import torch.nn as nn +import torch.nn.functional as F + +# Import the skrl components to build the RL system +from skrl.models.torch import Model, DeterministicMixin +from skrl.memories.torch import RandomMemory +from skrl.agents.torch.ddpg import DDPG, DDPG_DEFAULT_CONFIG +from skrl.resources.noises.torch import OrnsteinUhlenbeckNoise +from skrl.trainers.torch import SequentialTrainer +from skrl.envs.torch import wrap_env + + +# Define the models (deterministic models) for the DDPG agent using mixin +# - Actor (policy): takes as input the environment's observation/state and returns an action +# - Critic: takes the state and action as input and provides a value to guide the policy +class Actor(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) + + self.linear_layer_1 = nn.Linear(self.num_observations, 400) + self.linear_layer_2 = nn.Linear(400, 300) + self.action_layer = nn.Linear(300, self.num_actions) + + def compute(self, inputs, role): + x = F.relu(self.linear_layer_1(inputs["states"])) + x = F.relu(self.linear_layer_2(x)) + # Pendulum-v1 action_space is -2 to 2 + return 2 * torch.tanh(self.action_layer(x)), {} + +class Critic(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False): + Model.__init__(self, observation_space, action_space, device) + 
DeterministicMixin.__init__(self, clip_actions) + + self.linear_layer_1 = nn.Linear(self.num_observations + self.num_actions, 400) + self.linear_layer_2 = nn.Linear(400, 300) + self.linear_layer_3 = nn.Linear(300, 1) + + def compute(self, inputs, role): + x = F.relu(self.linear_layer_1(torch.cat([inputs["states"], inputs["taken_actions"]], dim=1))) + x = F.relu(self.linear_layer_2(x)) + return self.linear_layer_3(x), {} + + +# Load and wrap the Gym environment. +# Note: the environment version may change depending on the gym version +try: + env = gym.make("Pendulum-v1") +except gym.error.DeprecatedEnv as e: + env_id = [spec.id for spec in gym.envs.registry.all() if spec.id.startswith("Pendulum-v")][0] + print("Pendulum-v1 not found. Trying {}".format(env_id)) + env = gym.make(env_id) +env = wrap_env(env) + +device = env.device + + +# Instantiate a RandomMemory (without replacement) as experience replay memory +memory = RandomMemory(memory_size=20000, num_envs=env.num_envs, device=device, replacement=False) + + +# Instantiate the agent's models (function approximators). +# DDPG requires 4 models, visit its documentation for more details +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ddpg.html#spaces-and-models +models_ddpg = {} +models_ddpg["policy"] = Actor(env.observation_space, env.action_space, device) +models_ddpg["target_policy"] = Actor(env.observation_space, env.action_space, device) +models_ddpg["critic"] = Critic(env.observation_space, env.action_space, device) +models_ddpg["target_critic"] = Critic(env.observation_space, env.action_space, device) + +# Initialize the models' parameters (weights and biases) using a Gaussian distribution +for model in models_ddpg.values(): + model.init_parameters(method_name="normal_", mean=0.0, std=0.1) + + +# Configure and instantiate the agent. 
+# Only modify some of the default configuration, visit its documentation to see all the options +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ddpg.html#configuration-and-hyperparameters +cfg_ddpg = DDPG_DEFAULT_CONFIG.copy() +cfg_ddpg["exploration"]["noise"] = OrnsteinUhlenbeckNoise(theta=0.15, sigma=0.1, base_scale=1.0, device=device) +cfg_ddpg["discount_factor"] = 0.98 +cfg_ddpg["batch_size"] = 100 +cfg_ddpg["random_timesteps"] = 1000 +cfg_ddpg["learning_starts"] = 1000 +# logging to TensorBoard and write checkpoints each 75 and 750 timesteps respectively +cfg_ddpg["experiment"]["write_interval"] = 75 +cfg_ddpg["experiment"]["checkpoint_interval"] = 750 + +agent_ddpg = DDPG(models=models_ddpg, + memory=memory, + cfg=cfg_ddpg, + observation_space=env.observation_space, + action_space=env.action_space, + device=device) + + +# Configure and instantiate the RL trainer +cfg_trainer = {"timesteps": 15000, "headless": True} +trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent_ddpg) + +# start training +trainer.train() diff --git a/docs/source/examples/gym/gym_pendulum_ddpg_eval.py b/docs/source/examples/gym/ddpg_gym_pendulum_eval.py similarity index 85% rename from docs/source/examples/gym/gym_pendulum_ddpg_eval.py rename to docs/source/examples/gym/ddpg_gym_pendulum_eval.py index 350cc85c..226f6523 100644 --- a/docs/source/examples/gym/gym_pendulum_ddpg_eval.py +++ b/docs/source/examples/gym/ddpg_gym_pendulum_eval.py @@ -11,7 +11,7 @@ from skrl.envs.torch import wrap_env -# Define only the policy for evaluation +# Define only the policy for evaluation class DeterministicActor(DeterministicMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False): Model.__init__(self, observation_space, action_space, device) @@ -21,10 +21,10 @@ def __init__(self, observation_space, action_space, device, clip_actions=False): self.linear_layer_2 = nn.Linear(400, 300) self.action_layer = nn.Linear(300, self.num_actions) - def compute(self, states, taken_actions, role): - x = F.relu(self.linear_layer_1(states)) + def compute(self, inputs, role): + x = F.relu(self.linear_layer_1(inputs["states"])) x = F.relu(self.linear_layer_2(x)) - return 2 * torch.tanh(self.action_layer(x)) # Pendulum-v1 action_space is -2 to 2 + return 2 * torch.tanh(self.action_layer(x)), {} # Pendulum-v1 action_space is -2 to 2 # Load and wrap the Gym environment. 
@@ -56,10 +56,10 @@ def compute(self, states, taken_actions, role): cfg_ddpg["experiment"]["write_interval"] = 300 cfg_ddpg["experiment"]["checkpoint_interval"] = 0 -agent_ddpg = DDPG(models=models_ddpg, - memory=None, - cfg=cfg_ddpg, - observation_space=env.observation_space, +agent_ddpg = DDPG(models=models_ddpg, + memory=None, + cfg=cfg_ddpg, + observation_space=env.observation_space, action_space=env.action_space, device=device) diff --git a/docs/source/examples/gym/gym_vector_pendulum_ddpg.py b/docs/source/examples/gym/ddpg_gym_pendulum_vector.py similarity index 88% rename from docs/source/examples/gym/gym_vector_pendulum_ddpg.py rename to docs/source/examples/gym/ddpg_gym_pendulum_vector.py index c6bf729f..72990c3e 100644 --- a/docs/source/examples/gym/gym_vector_pendulum_ddpg.py +++ b/docs/source/examples/gym/ddpg_gym_pendulum_vector.py @@ -15,7 +15,7 @@ # Define the models (deterministic models) for the DDPG agent using mixin # - Actor (policy): takes as input the environment's observation/state and returns an action -# - Critic: takes the state and action as input and provides a value to guide the policy +# - Critic: takes the state and action as input and provides a value to guide the policy class DeterministicActor(DeterministicMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False): Model.__init__(self, observation_space, action_space, device) @@ -25,10 +25,10 @@ def __init__(self, observation_space, action_space, device, clip_actions=False): self.linear_layer_2 = nn.Linear(400, 300) self.action_layer = nn.Linear(300, self.num_actions) - def compute(self, states, taken_actions, role): - x = F.relu(self.linear_layer_1(states)) + def compute(self, inputs, role): + x = F.relu(self.linear_layer_1(inputs["states"])) x = F.relu(self.linear_layer_2(x)) - return 2 * torch.tanh(self.action_layer(x)) # Pendulum-v1 action_space is -2 to 2 + return 2 * torch.tanh(self.action_layer(x)), {} # Pendulum-v1 action_space is -2 to 2 class DeterministicCritic(DeterministicMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False): @@ -39,10 +39,10 @@ def __init__(self, observation_space, action_space, device, clip_actions=False): self.linear_layer_2 = nn.Linear(400, 300) self.linear_layer_3 = nn.Linear(300, 1) - def compute(self, states, taken_actions, role): - x = F.relu(self.linear_layer_1(torch.cat([states, taken_actions], dim=1))) + def compute(self, inputs, role): + x = F.relu(self.linear_layer_1(torch.cat([inputs["states"], inputs["taken_actions"]], dim=1))) x = F.relu(self.linear_layer_2(x)) - return self.linear_layer_3(x) + return self.linear_layer_3(x), {} # Load and wrap the Gym environment. 
@@ -88,10 +88,10 @@ def compute(self, states, taken_actions, role): cfg_ddpg["experiment"]["write_interval"] = 1000 cfg_ddpg["experiment"]["checkpoint_interval"] = 1000 -agent_ddpg = DDPG(models=models_ddpg, - memory=memory, - cfg=cfg_ddpg, - observation_space=env.observation_space, +agent_ddpg = DDPG(models=models_ddpg, + memory=memory, + cfg=cfg_ddpg, + observation_space=env.observation_space, action_space=env.action_space, device=device) diff --git a/docs/source/examples/gym/ddpg_gym_pendulumnovel.py b/docs/source/examples/gym/ddpg_gym_pendulumnovel.py new file mode 100644 index 00000000..d1adc59d --- /dev/null +++ b/docs/source/examples/gym/ddpg_gym_pendulumnovel.py @@ -0,0 +1,109 @@ +import gym + +import torch +import torch.nn as nn +import torch.nn.functional as F +import numpy as np + +# Import the skrl components to build the RL system +from skrl.models.torch import Model, DeterministicMixin +from skrl.memories.torch import RandomMemory +from skrl.agents.torch.ddpg import DDPG, DDPG_DEFAULT_CONFIG +from skrl.resources.noises.torch import OrnsteinUhlenbeckNoise +from skrl.trainers.torch import SequentialTrainer +from skrl.envs.torch import wrap_env + + +# Define the models (deterministic models) for the DDPG agent using mixin +# - Actor (policy): takes as input the environment's observation/state and returns an action +# - Critic: takes the state and action as input and provides a value to guide the policy +class Actor(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) + + self.linear_layer_1 = nn.Linear(self.num_observations, 400) + self.linear_layer_2 = nn.Linear(400, 300) + self.action_layer = nn.Linear(300, self.num_actions) + + def compute(self, inputs, role): + x = F.relu(self.linear_layer_1(inputs["states"])) + x = F.relu(self.linear_layer_2(x)) + # Pendulum-v1 action_space is -2 to 2 + return 2 * torch.tanh(self.action_layer(x)), {} + +class Critic(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) + + self.linear_layer_1 = nn.Linear(self.num_observations + self.num_actions, 400) + self.linear_layer_2 = nn.Linear(400, 300) + self.linear_layer_3 = nn.Linear(300, 1) + + def compute(self, inputs, role): + x = F.relu(self.linear_layer_1(torch.cat([inputs["states"], inputs["taken_actions"]], dim=1))) + x = F.relu(self.linear_layer_2(x)) + return self.linear_layer_3(x), {} + + +# Gym environment observation wrapper used to mask velocity. Adapted from rl_zoo3 (rl_zoo3/wrappers.py) +class NoVelocityWrapper(gym.ObservationWrapper): + def observation(self, observation): + # observation: x, y, angular velocity + return observation * np.array([1, 1, 0]) + +gym.envs.registration.register(id="PendulumNoVel-v1", entry_point=lambda: NoVelocityWrapper(gym.make("Pendulum-v1"))) + +# Load and wrap the Gym environment +env = gym.make("PendulumNoVel-v1") +env = wrap_env(env) + +device = env.device + + +# Instantiate a RandomMemory (without replacement) as experience replay memory +memory = RandomMemory(memory_size=20000, num_envs=env.num_envs, device=device, replacement=False) + + +# Instantiate the agent's models (function approximators). 
+# DDPG requires 4 models, visit its documentation for more details +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ddpg.html#spaces-and-models +models_ddpg = {} +models_ddpg["policy"] = Actor(env.observation_space, env.action_space, device) +models_ddpg["target_policy"] = Actor(env.observation_space, env.action_space, device) +models_ddpg["critic"] = Critic(env.observation_space, env.action_space, device) +models_ddpg["target_critic"] = Critic(env.observation_space, env.action_space, device) + +# Initialize the models' parameters (weights and biases) using a Gaussian distribution +for model in models_ddpg.values(): + model.init_parameters(method_name="normal_", mean=0.0, std=0.1) + + +# Configure and instantiate the agent. +# Only modify some of the default configuration, visit its documentation to see all the options +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ddpg.html#configuration-and-hyperparameters +cfg_ddpg = DDPG_DEFAULT_CONFIG.copy() +cfg_ddpg["exploration"]["noise"] = OrnsteinUhlenbeckNoise(theta=0.15, sigma=0.1, base_scale=1.0, device=device) +cfg_ddpg["discount_factor"] = 0.98 +cfg_ddpg["batch_size"] = 100 +cfg_ddpg["random_timesteps"] = 1000 +cfg_ddpg["learning_starts"] = 1000 +# logging to TensorBoard and write checkpoints each 75 and 750 timesteps respectively +cfg_ddpg["experiment"]["write_interval"] = 75 +cfg_ddpg["experiment"]["checkpoint_interval"] = 750 + +agent_ddpg = DDPG(models=models_ddpg, + memory=memory, + cfg=cfg_ddpg, + observation_space=env.observation_space, + action_space=env.action_space, + device=device) + + +# Configure and instantiate the RL trainer +cfg_trainer = {"timesteps": 15000, "headless": True} +trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent_ddpg) + +# start training +trainer.train() diff --git a/docs/source/examples/gym/ddpg_gym_pendulumnovel_gru.py b/docs/source/examples/gym/ddpg_gym_pendulumnovel_gru.py new file mode 100644 index 00000000..90f36533 --- /dev/null +++ b/docs/source/examples/gym/ddpg_gym_pendulumnovel_gru.py @@ -0,0 +1,211 @@ +import gym + +import torch +import torch.nn as nn +import torch.nn.functional as F +import numpy as np + +# Import the skrl components to build the RL system +from skrl.models.torch import Model, DeterministicMixin +from skrl.memories.torch import RandomMemory +from skrl.agents.torch.ddpg import DDPG, DDPG_DEFAULT_CONFIG +from skrl.resources.noises.torch import OrnsteinUhlenbeckNoise +from skrl.trainers.torch import SequentialTrainer +from skrl.envs.torch import wrap_env + + +# Define the models (deterministic models) for the DDPG agent using mixin +# - Actor (policy): takes as input the environment's observation/state and returns an action +# - Critic: takes the state and action as input and provides a value to guide the policy +class Actor(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + num_envs=1, num_layers=1, hidden_size=400, sequence_length=20): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) + + self.num_envs = num_envs + self.num_layers = num_layers + self.hidden_size = hidden_size # Hout + self.sequence_length = sequence_length + + self.gru = nn.GRU(input_size=self.num_observations, + hidden_size=self.hidden_size, + num_layers=self.num_layers, + batch_first=True) # batch_first -> (batch, sequence, features) + + self.linear_layer_1 = nn.Linear(self.hidden_size, 400) + self.linear_layer_2 = nn.Linear(400, 300) + 
self.action_layer = nn.Linear(300, self.num_actions) + + def get_specification(self): + # batch size (N) is the number of envs + return {"rnn": {"sequence_length": self.sequence_length, + "sizes": [(self.num_layers, self.num_envs, self.hidden_size)]}} # hidden states (D ∗ num_layers, N, Hout) + + def compute(self, inputs, role): + states = inputs["states"] + terminated = inputs.get("terminated", None) + hidden_states = inputs["rnn"][0] + + # training + if self.training: + rnn_input = states.view(-1, self.sequence_length, states.shape[-1]) # (N, L, Hin): N=batch_size, L=sequence_length + hidden_states = hidden_states.view(self.num_layers, -1, self.sequence_length, hidden_states.shape[-1]) # (D * num_layers, N, L, Hout) + # get the hidden states corresponding to the initial sequence + sequence_index = 1 if role == "target_policy" else 0 # target networks act on the next state of the environment + hidden_states = hidden_states[:,:,sequence_index,:].contiguous() # (D * num_layers, N, Hout) + + # reset the RNN state in the middle of a sequence + if terminated is not None and torch.any(terminated): + rnn_outputs = [] + terminated = terminated.view(-1, self.sequence_length) + indexes = [0] + (terminated[:,:-1].any(dim=0).nonzero(as_tuple=True)[0] + 1).tolist() + [self.sequence_length] + + for i in range(len(indexes) - 1): + i0, i1 = indexes[i], indexes[i + 1] + rnn_output, hidden_states = self.gru(rnn_input[:,i0:i1,:], hidden_states) + hidden_states[:, (terminated[:,i1-1]), :] = 0 + rnn_outputs.append(rnn_output) + + rnn_output = torch.cat(rnn_outputs, dim=1) + # no need to reset the RNN state in the sequence + else: + rnn_output, hidden_states = self.gru(rnn_input, hidden_states) + # rollout + else: + rnn_input = states.view(-1, 1, states.shape[-1]) # (N, L, Hin): N=num_envs, L=1 + rnn_output, hidden_states = self.gru(rnn_input, hidden_states) + + # flatten the RNN output + rnn_output = torch.flatten(rnn_output, start_dim=0, end_dim=1) # (N, L, D ∗ Hout) -> (N * L, D ∗ Hout) + + x = F.relu(self.linear_layer_1(rnn_output)) + x = F.relu(self.linear_layer_2(x)) + + # Pendulum-v1 action_space is -2 to 2 + return 2 * torch.tanh(self.action_layer(x)), {"rnn": [hidden_states]} + +class Critic(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + num_envs=1, num_layers=1, hidden_size=400, sequence_length=20): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) + + self.num_envs = num_envs + self.num_layers = num_layers + self.hidden_size = hidden_size # Hout + self.sequence_length = sequence_length + + self.gru = nn.GRU(input_size=self.num_observations, + hidden_size=self.hidden_size, + num_layers=self.num_layers, + batch_first=True) # batch_first -> (batch, sequence, features) + + self.linear_layer_1 = nn.Linear(self.hidden_size + self.num_actions, 400) + self.linear_layer_2 = nn.Linear(400, 300) + self.linear_layer_3 = nn.Linear(300, 1) + + def get_specification(self): + # batch size (N) is the number of envs + return {"rnn": {"sequence_length": self.sequence_length, + "sizes": [(self.num_layers, self.num_envs, self.hidden_size)]}} # hidden states (D ∗ num_layers, N, Hout) + + def compute(self, inputs, role): + states = inputs["states"] + terminated = inputs.get("terminated", None) + hidden_states = inputs["rnn"][0] + + # critic is only used during training + rnn_input = states.view(-1, self.sequence_length, states.shape[-1]) # (N, L, Hin): N=batch_size, L=sequence_length + + 
hidden_states = hidden_states.view(self.num_layers, -1, self.sequence_length, hidden_states.shape[-1]) # (D * num_layers, N, L, Hout) + # get the hidden states corresponding to the initial sequence + sequence_index = 1 if role == "target_critic" else 0 # target networks act on the next state of the environment + hidden_states = hidden_states[:,:,sequence_index,:].contiguous() # (D * num_layers, N, Hout) + + # reset the RNN state in the middle of a sequence + if terminated is not None and torch.any(terminated): + rnn_outputs = [] + terminated = terminated.view(-1, self.sequence_length) + indexes = [0] + (terminated[:,:-1].any(dim=0).nonzero(as_tuple=True)[0] + 1).tolist() + [self.sequence_length] + + for i in range(len(indexes) - 1): + i0, i1 = indexes[i], indexes[i + 1] + rnn_output, hidden_states = self.gru(rnn_input[:,i0:i1,:], hidden_states) + hidden_states[:, (terminated[:,i1-1]), :] = 0 + rnn_outputs.append(rnn_output) + + rnn_output = torch.cat(rnn_outputs, dim=1) + # no need to reset the RNN state in the sequence + else: + rnn_output, hidden_states = self.gru(rnn_input, hidden_states) + + # flatten the RNN output + rnn_output = torch.flatten(rnn_output, start_dim=0, end_dim=1) # (N, L, D ∗ Hout) -> (N * L, D ∗ Hout) + + x = F.relu(self.linear_layer_1(torch.cat([rnn_output, inputs["taken_actions"]], dim=1))) + x = F.relu(self.linear_layer_2(x)) + + return self.linear_layer_3(x), {"rnn": [hidden_states]} + + +# Gym environment observation wrapper used to mask velocity. Adapted from rl_zoo3 (rl_zoo3/wrappers.py) +class NoVelocityWrapper(gym.ObservationWrapper): + def observation(self, observation): + # observation: x, y, angular velocity + return observation * np.array([1, 1, 0]) + +gym.envs.registration.register(id="PendulumNoVel-v1", entry_point=lambda: NoVelocityWrapper(gym.make("Pendulum-v1"))) + +# Load and wrap the Gym environment +env = gym.make("PendulumNoVel-v1") +env = wrap_env(env) + +device = env.device + + +# Instantiate a RandomMemory (without replacement) as experience replay memory +memory = RandomMemory(memory_size=20000, num_envs=env.num_envs, device=device, replacement=False) + + +# Instantiate the agent's models (function approximators). +# DDPG requires 4 models, visit its documentation for more details +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ddpg.html#spaces-and-models +models_ddpg = {} +models_ddpg["policy"] = Actor(env.observation_space, env.action_space, device, num_envs=env.num_envs) +models_ddpg["target_policy"] = Actor(env.observation_space, env.action_space, device, num_envs=env.num_envs) +models_ddpg["critic"] = Critic(env.observation_space, env.action_space, device, num_envs=env.num_envs) +models_ddpg["target_critic"] = Critic(env.observation_space, env.action_space, device, num_envs=env.num_envs) + +# Initialize the models' parameters (weights and biases) using a Gaussian distribution +for model in models_ddpg.values(): + model.init_parameters(method_name="normal_", mean=0.0, std=0.1) + + +# Configure and instantiate the agent. 
+# Only modify some of the default configuration, visit its documentation to see all the options +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ddpg.html#configuration-and-hyperparameters +cfg_ddpg = DDPG_DEFAULT_CONFIG.copy() +cfg_ddpg["exploration"]["noise"] = OrnsteinUhlenbeckNoise(theta=0.15, sigma=0.1, base_scale=1.0, device=device) +cfg_ddpg["discount_factor"] = 0.98 +cfg_ddpg["batch_size"] = 100 +cfg_ddpg["random_timesteps"] = 0 +cfg_ddpg["learning_starts"] = 1000 +# logging to TensorBoard and write checkpoints each 75 and 750 timesteps respectively +cfg_ddpg["experiment"]["write_interval"] = 75 +cfg_ddpg["experiment"]["checkpoint_interval"] = 750 + +agent_ddpg = DDPG(models=models_ddpg, + memory=memory, + cfg=cfg_ddpg, + observation_space=env.observation_space, + action_space=env.action_space, + device=device) + + +# Configure and instantiate the RL trainer +cfg_trainer = {"timesteps": 15000, "headless": True} +trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent_ddpg) + +# start training +trainer.train() diff --git a/docs/source/examples/gym/ddpg_gym_pendulumnovel_lstm.py b/docs/source/examples/gym/ddpg_gym_pendulumnovel_lstm.py new file mode 100644 index 00000000..d7310c74 --- /dev/null +++ b/docs/source/examples/gym/ddpg_gym_pendulumnovel_lstm.py @@ -0,0 +1,221 @@ +import gym + +import torch +import torch.nn as nn +import torch.nn.functional as F +import numpy as np + +# Import the skrl components to build the RL system +from skrl.models.torch import Model, DeterministicMixin +from skrl.memories.torch import RandomMemory +from skrl.agents.torch.ddpg import DDPG, DDPG_DEFAULT_CONFIG +from skrl.resources.noises.torch import OrnsteinUhlenbeckNoise +from skrl.trainers.torch import SequentialTrainer +from skrl.envs.torch import wrap_env + + +# Define the models (deterministic models) for the DDPG agent using mixin +# - Actor (policy): takes as input the environment's observation/state and returns an action +# - Critic: takes the state and action as input and provides a value to guide the policy +class Actor(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + num_envs=1, num_layers=1, hidden_size=400, sequence_length=20): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) + + self.num_envs = num_envs + self.num_layers = num_layers + self.hidden_size = hidden_size # Hcell (Hout is Hcell because proj_size = 0) + self.sequence_length = sequence_length + + self.lstm = nn.LSTM(input_size=self.num_observations, + hidden_size=self.hidden_size, + num_layers=self.num_layers, + batch_first=True) # batch_first -> (batch, sequence, features) + + self.linear_layer_1 = nn.Linear(self.hidden_size, 400) + self.linear_layer_2 = nn.Linear(400, 300) + self.action_layer = nn.Linear(300, self.num_actions) + + def get_specification(self): + # batch size (N) is the number of envs + return {"rnn": {"sequence_length": self.sequence_length, + "sizes": [(self.num_layers, self.num_envs, self.hidden_size), # hidden states (D ∗ num_layers, N, Hout) + (self.num_layers, self.num_envs, self.hidden_size)]}} # cell states (D ∗ num_layers, N, Hcell) + + def compute(self, inputs, role): + states = inputs["states"] + terminated = inputs.get("terminated", None) + hidden_states, cell_states = inputs["rnn"][0], inputs["rnn"][1] + + # training + if self.training: + rnn_input = states.view(-1, self.sequence_length, states.shape[-1]) # (N, L, Hin): N=batch_size, L=sequence_length 
+ hidden_states = hidden_states.view(self.num_layers, -1, self.sequence_length, hidden_states.shape[-1]) # (D * num_layers, N, L, Hout) + cell_states = cell_states.view(self.num_layers, -1, self.sequence_length, cell_states.shape[-1]) # (D * num_layers, N, L, Hcell) + # get the hidden/cell states corresponding to the initial sequence + sequence_index = 1 if role == "target_policy" else 0 # target networks act on the next state of the environment + hidden_states = hidden_states[:,:,sequence_index,:].contiguous() # (D * num_layers, N, Hout) + cell_states = cell_states[:,:,sequence_index,:].contiguous() # (D * num_layers, N, Hcell) + + # reset the RNN state in the middle of a sequence + if terminated is not None and torch.any(terminated): + rnn_outputs = [] + terminated = terminated.view(-1, self.sequence_length) + indexes = [0] + (terminated[:,:-1].any(dim=0).nonzero(as_tuple=True)[0] + 1).tolist() + [self.sequence_length] + + for i in range(len(indexes) - 1): + i0, i1 = indexes[i], indexes[i + 1] + rnn_output, (hidden_states, cell_states) = self.lstm(rnn_input[:,i0:i1,:], (hidden_states, cell_states)) + hidden_states[:, (terminated[:,i1-1]), :] = 0 + cell_states[:, (terminated[:,i1-1]), :] = 0 + rnn_outputs.append(rnn_output) + + rnn_states = (hidden_states, cell_states) + rnn_output = torch.cat(rnn_outputs, dim=1) + # no need to reset the RNN state in the sequence + else: + rnn_output, rnn_states = self.lstm(rnn_input, (hidden_states, cell_states)) + # rollout + else: + rnn_input = states.view(-1, 1, states.shape[-1]) # (N, L, Hin): N=num_envs, L=1 + rnn_output, rnn_states = self.lstm(rnn_input, (hidden_states, cell_states)) + + # flatten the RNN output + rnn_output = torch.flatten(rnn_output, start_dim=0, end_dim=1) # (N, L, D ∗ Hout) -> (N * L, D ∗ Hout) + + x = F.relu(self.linear_layer_1(rnn_output)) + x = F.relu(self.linear_layer_2(x)) + + # Pendulum-v1 action_space is -2 to 2 + return 2 * torch.tanh(self.action_layer(x)), {"rnn": [rnn_states[0], rnn_states[1]]} + +class Critic(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + num_envs=1, num_layers=1, hidden_size=400, sequence_length=20): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) + + self.num_envs = num_envs + self.num_layers = num_layers + self.hidden_size = hidden_size # Hcell (Hout is Hcell because proj_size = 0) + self.sequence_length = sequence_length + + self.lstm = nn.LSTM(input_size=self.num_observations, + hidden_size=self.hidden_size, + num_layers=self.num_layers, + batch_first=True) # batch_first -> (batch, sequence, features) + + self.linear_layer_1 = nn.Linear(self.hidden_size + self.num_actions, 400) + self.linear_layer_2 = nn.Linear(400, 300) + self.linear_layer_3 = nn.Linear(300, 1) + + def get_specification(self): + # batch size (N) is the number of envs + return {"rnn": {"sequence_length": self.sequence_length, + "sizes": [(self.num_layers, self.num_envs, self.hidden_size), # hidden states (D ∗ num_layers, N, Hout) + (self.num_layers, self.num_envs, self.hidden_size)]}} # cell states (D ∗ num_layers, N, Hcell) + + def compute(self, inputs, role): + states = inputs["states"] + terminated = inputs.get("terminated", None) + hidden_states, cell_states = inputs["rnn"][0], inputs["rnn"][1] + + # critic is only used during training + rnn_input = states.view(-1, self.sequence_length, states.shape[-1]) # (N, L, Hin): N=batch_size, L=sequence_length + + hidden_states = 
hidden_states.view(self.num_layers, -1, self.sequence_length, hidden_states.shape[-1]) # (D * num_layers, N, L, Hout) + cell_states = cell_states.view(self.num_layers, -1, self.sequence_length, cell_states.shape[-1]) # (D * num_layers, N, L, Hcell) + # get the hidden/cell states corresponding to the initial sequence + sequence_index = 1 if role == "target_critic" else 0 # target networks act on the next state of the environment + hidden_states = hidden_states[:,:,sequence_index,:].contiguous() # (D * num_layers, N, Hout) + cell_states = cell_states[:,:,sequence_index,:].contiguous() # (D * num_layers, N, Hcell) + + # reset the RNN state in the middle of a sequence + if terminated is not None and torch.any(terminated): + rnn_outputs = [] + terminated = terminated.view(-1, self.sequence_length) + indexes = [0] + (terminated[:,:-1].any(dim=0).nonzero(as_tuple=True)[0] + 1).tolist() + [self.sequence_length] + + for i in range(len(indexes) - 1): + i0, i1 = indexes[i], indexes[i + 1] + rnn_output, (hidden_states, cell_states) = self.lstm(rnn_input[:,i0:i1,:], (hidden_states, cell_states)) + hidden_states[:, (terminated[:,i1-1]), :] = 0 + cell_states[:, (terminated[:,i1-1]), :] = 0 + rnn_outputs.append(rnn_output) + + rnn_states = (hidden_states, cell_states) + rnn_output = torch.cat(rnn_outputs, dim=1) + # no need to reset the RNN state in the sequence + else: + rnn_output, rnn_states = self.lstm(rnn_input, (hidden_states, cell_states)) + + # flatten the RNN output + rnn_output = torch.flatten(rnn_output, start_dim=0, end_dim=1) # (N, L, D ∗ Hout) -> (N * L, D ∗ Hout) + + x = F.relu(self.linear_layer_1(torch.cat([rnn_output, inputs["taken_actions"]], dim=1))) + x = F.relu(self.linear_layer_2(x)) + + return self.linear_layer_3(x), {"rnn": [rnn_states[0], rnn_states[1]]} + + +# Gym environment observation wrapper used to mask velocity. Adapted from rl_zoo3 (rl_zoo3/wrappers.py) +class NoVelocityWrapper(gym.ObservationWrapper): + def observation(self, observation): + # observation: x, y, angular velocity + return observation * np.array([1, 1, 0]) + +gym.envs.registration.register(id="PendulumNoVel-v1", entry_point=lambda: NoVelocityWrapper(gym.make("Pendulum-v1"))) + +# Load and wrap the Gym environment +env = gym.make("PendulumNoVel-v1") +env = wrap_env(env) + +device = env.device + + +# Instantiate a RandomMemory (without replacement) as experience replay memory +memory = RandomMemory(memory_size=20000, num_envs=env.num_envs, device=device, replacement=False) + + +# Instantiate the agent's models (function approximators). +# DDPG requires 4 models, visit its documentation for more details +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ddpg.html#spaces-and-models +models_ddpg = {} +models_ddpg["policy"] = Actor(env.observation_space, env.action_space, device, num_envs=env.num_envs) +models_ddpg["target_policy"] = Actor(env.observation_space, env.action_space, device, num_envs=env.num_envs) +models_ddpg["critic"] = Critic(env.observation_space, env.action_space, device, num_envs=env.num_envs) +models_ddpg["target_critic"] = Critic(env.observation_space, env.action_space, device, num_envs=env.num_envs) + +# Initialize the models' parameters (weights and biases) using a Gaussian distribution +for model in models_ddpg.values(): + model.init_parameters(method_name="normal_", mean=0.0, std=0.1) + + +# Configure and instantiate the agent. 
+# Only modify some of the default configuration, visit its documentation to see all the options +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ddpg.html#configuration-and-hyperparameters +cfg_ddpg = DDPG_DEFAULT_CONFIG.copy() +cfg_ddpg["exploration"]["noise"] = OrnsteinUhlenbeckNoise(theta=0.15, sigma=0.1, base_scale=1.0, device=device) +cfg_ddpg["discount_factor"] = 0.98 +cfg_ddpg["batch_size"] = 100 +cfg_ddpg["random_timesteps"] = 0 +cfg_ddpg["learning_starts"] = 1000 +# logging to TensorBoard and write checkpoints each 75 and 750 timesteps respectively +cfg_ddpg["experiment"]["write_interval"] = 75 +cfg_ddpg["experiment"]["checkpoint_interval"] = 750 + +agent_ddpg = DDPG(models=models_ddpg, + memory=memory, + cfg=cfg_ddpg, + observation_space=env.observation_space, + action_space=env.action_space, + device=device) + + +# Configure and instantiate the RL trainer +cfg_trainer = {"timesteps": 15000, "headless": True} +trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent_ddpg) + +# start training +trainer.train() diff --git a/docs/source/examples/gym/ddpg_gym_pendulumnovel_rnn.py b/docs/source/examples/gym/ddpg_gym_pendulumnovel_rnn.py new file mode 100644 index 00000000..a6a8df71 --- /dev/null +++ b/docs/source/examples/gym/ddpg_gym_pendulumnovel_rnn.py @@ -0,0 +1,211 @@ +import gym + +import torch +import torch.nn as nn +import torch.nn.functional as F +import numpy as np + +# Import the skrl components to build the RL system +from skrl.models.torch import Model, DeterministicMixin +from skrl.memories.torch import RandomMemory +from skrl.agents.torch.ddpg import DDPG, DDPG_DEFAULT_CONFIG +from skrl.resources.noises.torch import OrnsteinUhlenbeckNoise +from skrl.trainers.torch import SequentialTrainer +from skrl.envs.torch import wrap_env + + +# Define the models (deterministic models) for the DDPG agent using mixin +# - Actor (policy): takes as input the environment's observation/state and returns an action +# - Critic: takes the state and action as input and provides a value to guide the policy +class Actor(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + num_envs=1, num_layers=1, hidden_size=400, sequence_length=20): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) + + self.num_envs = num_envs + self.num_layers = num_layers + self.hidden_size = hidden_size # Hout + self.sequence_length = sequence_length + + self.rnn = nn.RNN(input_size=self.num_observations, + hidden_size=self.hidden_size, + num_layers=self.num_layers, + batch_first=True) # batch_first -> (batch, sequence, features) + + self.linear_layer_1 = nn.Linear(self.hidden_size, 400) + self.linear_layer_2 = nn.Linear(400, 300) + self.action_layer = nn.Linear(300, self.num_actions) + + def get_specification(self): + # batch size (N) is the number of envs + return {"rnn": {"sequence_length": self.sequence_length, + "sizes": [(self.num_layers, self.num_envs, self.hidden_size)]}} # hidden states (D ∗ num_layers, N, Hout) + + def compute(self, inputs, role): + states = inputs["states"] + terminated = inputs.get("terminated", None) + hidden_states = inputs["rnn"][0] + + # training + if self.training: + rnn_input = states.view(-1, self.sequence_length, states.shape[-1]) # (N, L, Hin): N=batch_size, L=sequence_length + hidden_states = hidden_states.view(self.num_layers, -1, self.sequence_length, hidden_states.shape[-1]) # (D * num_layers, N, L, Hout) + # get the hidden states 
corresponding to the initial sequence + sequence_index = 1 if role == "target_policy" else 0 # target networks act on the next state of the environment + hidden_states = hidden_states[:,:,sequence_index,:].contiguous() # (D * num_layers, N, Hout) + + # reset the RNN state in the middle of a sequence + if terminated is not None and torch.any(terminated): + rnn_outputs = [] + terminated = terminated.view(-1, self.sequence_length) + indexes = [0] + (terminated[:,:-1].any(dim=0).nonzero(as_tuple=True)[0] + 1).tolist() + [self.sequence_length] + + for i in range(len(indexes) - 1): + i0, i1 = indexes[i], indexes[i + 1] + rnn_output, hidden_states = self.rnn(rnn_input[:,i0:i1,:], hidden_states) + hidden_states[:, (terminated[:,i1-1]), :] = 0 + rnn_outputs.append(rnn_output) + + rnn_output = torch.cat(rnn_outputs, dim=1) + # no need to reset the RNN state in the sequence + else: + rnn_output, hidden_states = self.rnn(rnn_input, hidden_states) + # rollout + else: + rnn_input = states.view(-1, 1, states.shape[-1]) # (N, L, Hin): N=num_envs, L=1 + rnn_output, hidden_states = self.rnn(rnn_input, hidden_states) + + # flatten the RNN output + rnn_output = torch.flatten(rnn_output, start_dim=0, end_dim=1) # (N, L, D ∗ Hout) -> (N * L, D ∗ Hout) + + x = F.relu(self.linear_layer_1(rnn_output)) + x = F.relu(self.linear_layer_2(x)) + + # Pendulum-v1 action_space is -2 to 2 + return 2 * torch.tanh(self.action_layer(x)), {"rnn": [hidden_states]} + +class Critic(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + num_envs=1, num_layers=1, hidden_size=400, sequence_length=20): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) + + self.num_envs = num_envs + self.num_layers = num_layers + self.hidden_size = hidden_size # Hout + self.sequence_length = sequence_length + + self.rnn = nn.RNN(input_size=self.num_observations, + hidden_size=self.hidden_size, + num_layers=self.num_layers, + batch_first=True) # batch_first -> (batch, sequence, features) + + self.linear_layer_1 = nn.Linear(self.hidden_size + self.num_actions, 400) + self.linear_layer_2 = nn.Linear(400, 300) + self.linear_layer_3 = nn.Linear(300, 1) + + def get_specification(self): + # batch size (N) is the number of envs + return {"rnn": {"sequence_length": self.sequence_length, + "sizes": [(self.num_layers, self.num_envs, self.hidden_size)]}} # hidden states (D ∗ num_layers, N, Hout) + + def compute(self, inputs, role): + states = inputs["states"] + terminated = inputs.get("terminated", None) + hidden_states = inputs["rnn"][0] + + # critic is only used during training + rnn_input = states.view(-1, self.sequence_length, states.shape[-1]) # (N, L, Hin): N=batch_size, L=sequence_length + + hidden_states = hidden_states.view(self.num_layers, -1, self.sequence_length, hidden_states.shape[-1]) # (D * num_layers, N, L, Hout) + # get the hidden states corresponding to the initial sequence + sequence_index = 1 if role == "target_critic" else 0 # target networks act on the next state of the environment + hidden_states = hidden_states[:,:,sequence_index,:].contiguous() # (D * num_layers, N, Hout) + + # reset the RNN state in the middle of a sequence + if terminated is not None and torch.any(terminated): + rnn_outputs = [] + terminated = terminated.view(-1, self.sequence_length) + indexes = [0] + (terminated[:,:-1].any(dim=0).nonzero(as_tuple=True)[0] + 1).tolist() + [self.sequence_length] + + for i in range(len(indexes) - 1): + i0, i1 = 
indexes[i], indexes[i + 1] + rnn_output, hidden_states = self.rnn(rnn_input[:,i0:i1,:], hidden_states) + hidden_states[:, (terminated[:,i1-1]), :] = 0 + rnn_outputs.append(rnn_output) + + rnn_output = torch.cat(rnn_outputs, dim=1) + # no need to reset the RNN state in the sequence + else: + rnn_output, hidden_states = self.rnn(rnn_input, hidden_states) + + # flatten the RNN output + rnn_output = torch.flatten(rnn_output, start_dim=0, end_dim=1) # (N, L, D ∗ Hout) -> (N * L, D ∗ Hout) + + x = F.relu(self.linear_layer_1(torch.cat([rnn_output, inputs["taken_actions"]], dim=1))) + x = F.relu(self.linear_layer_2(x)) + + return self.linear_layer_3(x), {"rnn": [hidden_states]} + + +# Gym environment observation wrapper used to mask velocity. Adapted from rl_zoo3 (rl_zoo3/wrappers.py) +class NoVelocityWrapper(gym.ObservationWrapper): + def observation(self, observation): + # observation: x, y, angular velocity + return observation * np.array([1, 1, 0]) + +gym.envs.registration.register(id="PendulumNoVel-v1", entry_point=lambda: NoVelocityWrapper(gym.make("Pendulum-v1"))) + +# Load and wrap the Gym environment +env = gym.make("PendulumNoVel-v1") +env = wrap_env(env) + +device = env.device + + +# Instantiate a RandomMemory (without replacement) as experience replay memory +memory = RandomMemory(memory_size=20000, num_envs=env.num_envs, device=device, replacement=False) + + +# Instantiate the agent's models (function approximators). +# DDPG requires 4 models, visit its documentation for more details +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ddpg.html#spaces-and-models +models_ddpg = {} +models_ddpg["policy"] = Actor(env.observation_space, env.action_space, device, num_envs=env.num_envs) +models_ddpg["target_policy"] = Actor(env.observation_space, env.action_space, device, num_envs=env.num_envs) +models_ddpg["critic"] = Critic(env.observation_space, env.action_space, device, num_envs=env.num_envs) +models_ddpg["target_critic"] = Critic(env.observation_space, env.action_space, device, num_envs=env.num_envs) + +# Initialize the models' parameters (weights and biases) using a Gaussian distribution +for model in models_ddpg.values(): + model.init_parameters(method_name="normal_", mean=0.0, std=0.1) + + +# Configure and instantiate the agent. 
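+# Note: the recurrent actor/critic above declare their hidden-state shapes through
+# get_specification() and exchange those states with the agent via the "rnn" entry of the
+# inputs/outputs dictionaries; during training, compute() reshapes the sampled batch into
+# whole sequences of sequence_length steps before feeding the RNN.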
+# Only modify some of the default configuration, visit its documentation to see all the options +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ddpg.html#configuration-and-hyperparameters +cfg_ddpg = DDPG_DEFAULT_CONFIG.copy() +cfg_ddpg["exploration"]["noise"] = OrnsteinUhlenbeckNoise(theta=0.15, sigma=0.1, base_scale=1.0, device=device) +cfg_ddpg["discount_factor"] = 0.98 +cfg_ddpg["batch_size"] = 100 +cfg_ddpg["random_timesteps"] = 0 +cfg_ddpg["learning_starts"] = 1000 +# logging to TensorBoard and write checkpoints each 75 and 750 timesteps respectively +cfg_ddpg["experiment"]["write_interval"] = 75 +cfg_ddpg["experiment"]["checkpoint_interval"] = 750 + +agent_ddpg = DDPG(models=models_ddpg, + memory=memory, + cfg=cfg_ddpg, + observation_space=env.observation_space, + action_space=env.action_space, + device=device) + + +# Configure and instantiate the RL trainer +cfg_trainer = {"timesteps": 15000, "headless": True} +trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent_ddpg) + +# start training +trainer.train() diff --git a/docs/source/examples/gym/gym_cartpole_dqn.py b/docs/source/examples/gym/dqn_gym_cartpole.py similarity index 95% rename from docs/source/examples/gym/gym_cartpole_dqn.py rename to docs/source/examples/gym/dqn_gym_cartpole.py index 98c7f6fe..1e574283 100644 --- a/docs/source/examples/gym/gym_cartpole_dqn.py +++ b/docs/source/examples/gym/dqn_gym_cartpole.py @@ -29,10 +29,10 @@ # DQN requires 2 models, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.dqn.html#spaces-and-models models_dqn = {} -models_dqn["q_network"] = deterministic_model(observation_space=env.observation_space, +models_dqn["q_network"] = deterministic_model(observation_space=env.observation_space, action_space=env.action_space, device=device, - clip_actions=False, + clip_actions=False, input_shape=Shape.OBSERVATIONS, hiddens=[64, 64], hidden_activation=["relu", "relu"], @@ -66,10 +66,10 @@ cfg_dqn["experiment"]["write_interval"] = 1000 cfg_dqn["experiment"]["checkpoint_interval"] = 5000 -agent_dqn = DQN(models=models_dqn, - memory=memory, - cfg=cfg_dqn, - observation_space=env.observation_space, +agent_dqn = DQN(models=models_dqn, + memory=memory, + cfg=cfg_dqn, + observation_space=env.observation_space, action_space=env.action_space, device=device) diff --git a/docs/source/examples/gym/gym_cartpole_dqn_eval.py b/docs/source/examples/gym/dqn_gym_cartpole_eval.py similarity index 93% rename from docs/source/examples/gym/gym_cartpole_dqn_eval.py rename to docs/source/examples/gym/dqn_gym_cartpole_eval.py index 1cd9c90a..baa5902d 100644 --- a/docs/source/examples/gym/gym_cartpole_dqn_eval.py +++ b/docs/source/examples/gym/dqn_gym_cartpole_eval.py @@ -24,10 +24,10 @@ # DQN requires 2 models, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.dqn.html#spaces-and-models models_dqn = {} -models_dqn["q_network"] = deterministic_model(observation_space=env.observation_space, +models_dqn["q_network"] = deterministic_model(observation_space=env.observation_space, action_space=env.action_space, device=device, - clip_actions=False, + clip_actions=False, input_shape=Shape.OBSERVATIONS, hiddens=[64, 64], hidden_activation=["relu", "relu"], @@ -45,10 +45,10 @@ cfg_dqn["experiment"]["write_interval"] = 1000 cfg_dqn["experiment"]["checkpoint_interval"] = 0 -agent_dqn = DQN(models=models_dqn, - memory=None, - cfg=cfg_dqn, - observation_space=env.observation_space, +agent_dqn = 
DQN(models=models_dqn, + memory=None, + cfg=cfg_dqn, + observation_space=env.observation_space, action_space=env.action_space, device=device) diff --git a/docs/source/examples/gym/gym_vector_cartpole_dqn.py b/docs/source/examples/gym/dqn_gym_cartpole_vector.py similarity index 95% rename from docs/source/examples/gym/gym_vector_cartpole_dqn.py rename to docs/source/examples/gym/dqn_gym_cartpole_vector.py index 8e1ea656..56f4a0ef 100644 --- a/docs/source/examples/gym/gym_vector_cartpole_dqn.py +++ b/docs/source/examples/gym/dqn_gym_cartpole_vector.py @@ -29,10 +29,10 @@ # DQN requires 2 models, visit its documentation for more details # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.dqn.html#spaces-and-models models_dqn = {} -models_dqn["q_network"] = deterministic_model(observation_space=env.observation_space, +models_dqn["q_network"] = deterministic_model(observation_space=env.observation_space, action_space=env.action_space, device=device, - clip_actions=False, + clip_actions=False, input_shape=Shape.OBSERVATIONS, hiddens=[64, 64], hidden_activation=["relu", "relu"], @@ -66,10 +66,10 @@ cfg_dqn["experiment"]["write_interval"] = 1000 cfg_dqn["experiment"]["checkpoint_interval"] = 5000 -agent_dqn = DQN(models=models_dqn, - memory=memory, - cfg=cfg_dqn, - observation_space=env.observation_space, +agent_dqn = DQN(models=models_dqn, + memory=memory, + cfg=cfg_dqn, + observation_space=env.observation_space, action_space=env.action_space, device=device) diff --git a/docs/source/examples/gym/ppo_gym_pendulum.py b/docs/source/examples/gym/ppo_gym_pendulum.py new file mode 100644 index 00000000..70917ec6 --- /dev/null +++ b/docs/source/examples/gym/ppo_gym_pendulum.py @@ -0,0 +1,116 @@ +import gym + +import torch +import torch.nn as nn + +# Import the skrl components to build the RL system +from skrl.models.torch import Model, GaussianMixin, DeterministicMixin +from skrl.memories.torch import RandomMemory +from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG +from skrl.trainers.torch import SequentialTrainer +from skrl.resources.preprocessors.torch import RunningStandardScaler +from skrl.resources.schedulers.torch import KLAdaptiveRL +from skrl.envs.torch import wrap_env + + +# Define the models (stochastic and deterministic models) for the agent using mixins. 
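+# (GaussianMixin samples actions from a Normal distribution whose mean is the value returned by
+# compute() and whose log-standard deviation is log_std_parameter; DeterministicMixin simply
+# returns the computed value)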
+# - Policy: takes as input the environment's observation/state and returns an action +# - Value: takes the state as input and provides a value to guide the policy +class Policy(GaussianMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"): + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction) + + self.net = nn.Sequential(nn.Linear(self.num_observations, 64), + nn.ReLU(), + nn.Linear(64, 64), + nn.ReLU(), + nn.Linear(64, self.num_actions)) + self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + def compute(self, inputs, role): + # Pendulum-v1 action_space is -2 to 2 + return 2 * torch.tanh(self.net(inputs["states"])), self.log_std_parameter, {} + +class Value(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) + + self.net = nn.Sequential(nn.Linear(self.num_observations, 64), + nn.ReLU(), + nn.Linear(64, 64), + nn.ReLU(), + nn.Linear(64, 1)) + + def compute(self, inputs, role): + return self.net(inputs["states"]), {} + + +# Load and wrap the Gym environment. +# Note: the environment version may change depending on the gym version +try: + env = gym.vector.make("Pendulum-v1", num_envs=4, asynchronous=False) +except gym.error.DeprecatedEnv as e: + env_id = [spec.id for spec in gym.envs.registry.all() if spec.id.startswith("Pendulum-v")][0] + print("Pendulum-v1 not found. Trying {}".format(env_id)) + env = gym.vector.make(env_id, num_envs=4, asynchronous=False) +env = wrap_env(env) + +device = env.device + + +# Instantiate a RandomMemory as rollout buffer (any memory can be used for this) +memory = RandomMemory(memory_size=1024, num_envs=env.num_envs, device=device) + + +# Instantiate the agent's models (function approximators). +# PPO requires 2 models, visit its documentation for more details +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models +models_ppo = {} +models_ppo["policy"] = Policy(env.observation_space, env.action_space, device, clip_actions=True) +models_ppo["value"] = Value(env.observation_space, env.action_space, device) + + +# Configure and instantiate the agent. 
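+# With 4 vectorized environments and rollouts = 1024, each update consumes 4 * 1024 = 4096
+# transitions, split into 32 mini-batches of 128 samples over 10 learning epochs; the
+# KLAdaptiveRL scheduler adjusts the learning rate according to the measured KL divergence
+# and the 0.008 threshold set below.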
+# Only modify some of the default configuration, visit its documentation to see all the options +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters +cfg_ppo = PPO_DEFAULT_CONFIG.copy() +cfg_ppo["rollouts"] = 1024 # memory_size +cfg_ppo["learning_epochs"] = 10 +cfg_ppo["mini_batches"] = 32 +cfg_ppo["discount_factor"] = 0.9 +cfg_ppo["lambda"] = 0.95 +cfg_ppo["learning_rate"] = 1e-3 +cfg_ppo["learning_rate_scheduler"] = KLAdaptiveRL +cfg_ppo["learning_rate_scheduler_kwargs"] = {"kl_threshold": 0.008} +cfg_ppo["grad_norm_clip"] = 0.5 +cfg_ppo["ratio_clip"] = 0.2 +cfg_ppo["value_clip"] = 0.2 +cfg_ppo["clip_predicted_values"] = False +cfg_ppo["entropy_loss_scale"] = 0.0 +cfg_ppo["value_loss_scale"] = 0.5 +cfg_ppo["kl_threshold"] = 0 +cfg_ppo["state_preprocessor"] = RunningStandardScaler +cfg_ppo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device} +cfg_ppo["value_preprocessor"] = RunningStandardScaler +cfg_ppo["value_preprocessor_kwargs"] = {"size": 1, "device": device} +# logging to TensorBoard and write checkpoints each 500 and 5000 timesteps respectively +cfg_ppo["experiment"]["write_interval"] = 500 +cfg_ppo["experiment"]["checkpoint_interval"] = 5000 + +agent_ppo = PPO(models=models_ppo, + memory=memory, + cfg=cfg_ppo, + observation_space=env.observation_space, + action_space=env.action_space, + device=device) + + +# Configure and instantiate the RL trainer +cfg_trainer = {"timesteps": 100000, "headless": True} +trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent_ppo) + +# start training +trainer.train() diff --git a/docs/source/examples/gym/ppo_gym_pendulumnovel.py b/docs/source/examples/gym/ppo_gym_pendulumnovel.py new file mode 100644 index 00000000..79134c8d --- /dev/null +++ b/docs/source/examples/gym/ppo_gym_pendulumnovel.py @@ -0,0 +1,119 @@ +import gym + +import torch +import torch.nn as nn +import numpy as np + +# Import the skrl components to build the RL system +from skrl.models.torch import Model, GaussianMixin, DeterministicMixin +from skrl.memories.torch import RandomMemory +from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG +from skrl.trainers.torch import SequentialTrainer +from skrl.resources.preprocessors.torch import RunningStandardScaler +from skrl.resources.schedulers.torch import KLAdaptiveRL +from skrl.envs.torch import wrap_env + + +# Define the models (stochastic and deterministic models) for the agent using mixins. 
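+# (same feed-forward networks as the plain Pendulum example; since PendulumNoVel masks the
+# angular velocity, these memoryless models act as the baseline for the RNN/GRU/LSTM variants
+# of this example)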
+# - Policy: takes as input the environment's observation/state and returns an action +# - Value: takes the state as input and provides a value to guide the policy +class Policy(GaussianMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"): + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction) + + self.net = nn.Sequential(nn.Linear(self.num_observations, 64), + nn.ReLU(), + nn.Linear(64, 64), + nn.ReLU(), + nn.Linear(64, self.num_actions)) + self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + def compute(self, inputs, role): + # Pendulum-v1 action_space is -2 to 2 + return 2 * torch.tanh(self.net(inputs["states"])), self.log_std_parameter, {} + +class Value(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) + + self.net = nn.Sequential(nn.Linear(self.num_observations, 64), + nn.ReLU(), + nn.Linear(64, 64), + nn.ReLU(), + nn.Linear(64, 1)) + + def compute(self, inputs, role): + return self.net(inputs["states"]), {} + + +# Gym environment observation wrapper used to mask velocity. Adapted from rl_zoo3 (rl_zoo3/wrappers.py) +class NoVelocityWrapper(gym.ObservationWrapper): + def observation(self, observation): + # observation: x, y, angular velocity + return observation * np.array([1, 1, 0]) + +gym.envs.registration.register(id="PendulumNoVel-v1", entry_point=lambda: NoVelocityWrapper(gym.make("Pendulum-v1"))) + +# Load and wrap the Gym environment +env = gym.vector.make("PendulumNoVel-v1", num_envs=4, asynchronous=False) +env = wrap_env(env) + +device = env.device + + +# Instantiate a RandomMemory as rollout buffer (any memory can be used for this) +memory = RandomMemory(memory_size=1024, num_envs=env.num_envs, device=device) + + +# Instantiate the agent's models (function approximators). +# PPO requires 2 models, visit its documentation for more details +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models +models_ppo = {} +models_ppo["policy"] = Policy(env.observation_space, env.action_space, device, clip_actions=True) +models_ppo["value"] = Value(env.observation_space, env.action_space, device) + + +# Configure and instantiate the agent. 
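+# RunningStandardScaler keeps running mean/standard-deviation statistics: used as
+# state_preprocessor it standardizes the observations, and as value_preprocessor it
+# standardizes the predicted/target values.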
+# Only modify some of the default configuration, visit its documentation to see all the options +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters +cfg_ppo = PPO_DEFAULT_CONFIG.copy() +cfg_ppo["rollouts"] = 1024 # memory_size +cfg_ppo["learning_epochs"] = 10 +cfg_ppo["mini_batches"] = 32 +cfg_ppo["discount_factor"] = 0.9 +cfg_ppo["lambda"] = 0.95 +cfg_ppo["learning_rate"] = 1e-3 +cfg_ppo["learning_rate_scheduler"] = KLAdaptiveRL +cfg_ppo["learning_rate_scheduler_kwargs"] = {"kl_threshold": 0.008} +cfg_ppo["grad_norm_clip"] = 0.5 +cfg_ppo["ratio_clip"] = 0.2 +cfg_ppo["value_clip"] = 0.2 +cfg_ppo["clip_predicted_values"] = False +cfg_ppo["entropy_loss_scale"] = 0.0 +cfg_ppo["value_loss_scale"] = 0.5 +cfg_ppo["kl_threshold"] = 0 +cfg_ppo["state_preprocessor"] = RunningStandardScaler +cfg_ppo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device} +cfg_ppo["value_preprocessor"] = RunningStandardScaler +cfg_ppo["value_preprocessor_kwargs"] = {"size": 1, "device": device} +# logging to TensorBoard and write checkpoints each 500 and 5000 timesteps respectively +cfg_ppo["experiment"]["write_interval"] = 500 +cfg_ppo["experiment"]["checkpoint_interval"] = 5000 + +agent_ppo = PPO(models=models_ppo, + memory=memory, + cfg=cfg_ppo, + observation_space=env.observation_space, + action_space=env.action_space, + device=device) + + +# Configure and instantiate the RL trainer +cfg_trainer = {"timesteps": 100000, "headless": True} +trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent_ppo) + +# start training +trainer.train() diff --git a/docs/source/examples/gym/ppo_gym_pendulumnovel_gru.py b/docs/source/examples/gym/ppo_gym_pendulumnovel_gru.py new file mode 100644 index 00000000..ce4c8fa7 --- /dev/null +++ b/docs/source/examples/gym/ppo_gym_pendulumnovel_gru.py @@ -0,0 +1,218 @@ +import gym + +import torch +import torch.nn as nn +import numpy as np + +# Import the skrl components to build the RL system +from skrl.models.torch import Model, GaussianMixin, DeterministicMixin +from skrl.memories.torch import RandomMemory +from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG +from skrl.trainers.torch import SequentialTrainer +from skrl.resources.preprocessors.torch import RunningStandardScaler +from skrl.resources.schedulers.torch import KLAdaptiveRL +from skrl.envs.torch import wrap_env + + +# Define the models (stochastic and deterministic models) for the agent using mixins. 
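+# (both models place a GRU layer in front of a small MLP head and exchange their hidden states
+# with the agent through the "rnn" entry of the inputs/outputs dictionaries)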
+# - Policy: takes as input the environment's observation/state and returns an action +# - Value: takes the state as input and provides a value to guide the policy +class Policy(GaussianMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum", + num_envs=1, num_layers=1, hidden_size=64, sequence_length=128): + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction) + + self.num_envs = num_envs + self.num_layers = num_layers + self.hidden_size = hidden_size # Hout + self.sequence_length = sequence_length + + self.gru = nn.GRU(input_size=self.num_observations, + hidden_size=self.hidden_size, + num_layers=self.num_layers, + batch_first=True) # batch_first -> (batch, sequence, features) + + self.net = nn.Sequential(nn.Linear(self.hidden_size, 64), + nn.ReLU(), + nn.Linear(64, self.num_actions)) + self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + def get_specification(self): + # batch size (N) is the number of envs + return {"rnn": {"sequence_length": self.sequence_length, + "sizes": [(self.num_layers, self.num_envs, self.hidden_size)]}} # hidden states (D ∗ num_layers, N, Hout) + + def compute(self, inputs, role): + states = inputs["states"] + terminated = inputs.get("terminated", None) + hidden_states = inputs["rnn"][0] + + # training + if self.training: + rnn_input = states.view(-1, self.sequence_length, states.shape[-1]) # (N, L, Hin): N=batch_size, L=sequence_length + hidden_states = hidden_states.view(self.num_layers, -1, self.sequence_length, hidden_states.shape[-1]) # (D * num_layers, N, L, Hout) + # get the hidden states corresponding to the initial sequence + hidden_states = hidden_states[:,:,0,:].contiguous() # (D * num_layers, N, Hout) + + # reset the RNN state in the middle of a sequence + if terminated is not None and torch.any(terminated): + rnn_outputs = [] + terminated = terminated.view(-1, self.sequence_length) + indexes = [0] + (terminated[:,:-1].any(dim=0).nonzero(as_tuple=True)[0] + 1).tolist() + [self.sequence_length] + + for i in range(len(indexes) - 1): + i0, i1 = indexes[i], indexes[i + 1] + rnn_output, hidden_states = self.gru(rnn_input[:,i0:i1,:], hidden_states) + hidden_states[:, (terminated[:,i1-1]), :] = 0 + rnn_outputs.append(rnn_output) + + rnn_output = torch.cat(rnn_outputs, dim=1) + # no need to reset the RNN state in the sequence + else: + rnn_output, hidden_states = self.gru(rnn_input, hidden_states) + # rollout + else: + rnn_input = states.view(-1, 1, states.shape[-1]) # (N, L, Hin): N=num_envs, L=1 + rnn_output, hidden_states = self.gru(rnn_input, hidden_states) + + # flatten the RNN output + rnn_output = torch.flatten(rnn_output, start_dim=0, end_dim=1) # (N, L, D ∗ Hout) -> (N * L, D ∗ Hout) + + # Pendulum-v1 action_space is -2 to 2 + return 2 * torch.tanh(self.net(rnn_output)), self.log_std_parameter, {"rnn": [hidden_states]} + +class Value(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + num_envs=1, num_layers=1, hidden_size=64, sequence_length=128): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) + + self.num_envs = num_envs + self.num_layers = num_layers + self.hidden_size = hidden_size # Hout + self.sequence_length = sequence_length + + self.gru = nn.GRU(input_size=self.num_observations, + 
hidden_size=self.hidden_size, + num_layers=self.num_layers, + batch_first=True) # batch_first -> (batch, sequence, features) + + self.net = nn.Sequential(nn.Linear(self.hidden_size, 64), + nn.ReLU(), + nn.Linear(64, 1)) + + def get_specification(self): + # batch size (N) is the number of envs + return {"rnn": {"sequence_length": self.sequence_length, + "sizes": [(self.num_layers, self.num_envs, self.hidden_size)]}} # hidden states (D ∗ num_layers, N, Hout) + + def compute(self, inputs, role): + states = inputs["states"] + terminated = inputs.get("terminated", None) + hidden_states = inputs["rnn"][0] + + # training + if self.training: + rnn_input = states.view(-1, self.sequence_length, states.shape[-1]) # (N, L, Hin): N=batch_size, L=sequence_length + + hidden_states = hidden_states.view(self.num_layers, -1, self.sequence_length, hidden_states.shape[-1]) # (D * num_layers, N, L, Hout) + # get the hidden states corresponding to the initial sequence + hidden_states = hidden_states[:,:,0,:].contiguous() # (D * num_layers, N, Hout) + + # reset the RNN state in the middle of a sequence + if terminated is not None and torch.any(terminated): + rnn_outputs = [] + terminated = terminated.view(-1, self.sequence_length) + indexes = [0] + (terminated[:,:-1].any(dim=0).nonzero(as_tuple=True)[0] + 1).tolist() + [self.sequence_length] + + for i in range(len(indexes) - 1): + i0, i1 = indexes[i], indexes[i + 1] + rnn_output, hidden_states = self.gru(rnn_input[:,i0:i1,:], hidden_states) + hidden_states[:, (terminated[:,i1-1]), :] = 0 + rnn_outputs.append(rnn_output) + + rnn_output = torch.cat(rnn_outputs, dim=1) + # no need to reset the RNN state in the sequence + else: + rnn_output, hidden_states = self.gru(rnn_input, hidden_states) + # rollout + else: + rnn_input = states.view(-1, 1, states.shape[-1]) # (N, L, Hin): N=num_envs, L=1 + rnn_output, hidden_states = self.gru(rnn_input, hidden_states) + + # flatten the RNN output + rnn_output = torch.flatten(rnn_output, start_dim=0, end_dim=1) # (N, L, D ∗ Hout) -> (N * L, D ∗ Hout) + + return self.net(rnn_output), {"rnn": [hidden_states]} + + +# Gym environment observation wrapper used to mask velocity. Adapted from rl_zoo3 (rl_zoo3/wrappers.py) +class NoVelocityWrapper(gym.ObservationWrapper): + def observation(self, observation): + # observation: x, y, angular velocity + return observation * np.array([1, 1, 0]) + +gym.envs.registration.register(id="PendulumNoVel-v1", entry_point=lambda: NoVelocityWrapper(gym.make("Pendulum-v1"))) + +# Load and wrap the Gym environment +env = gym.vector.make("PendulumNoVel-v1", num_envs=4, asynchronous=False) +env = wrap_env(env) + +device = env.device + + +# Instantiate a RandomMemory as rollout buffer (any memory can be used for this) +memory = RandomMemory(memory_size=1024, num_envs=env.num_envs, device=device) + + +# Instantiate the agent's models (function approximators). +# PPO requires 2 models, visit its documentation for more details +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models +models_ppo = {} +models_ppo["policy"] = Policy(env.observation_space, env.action_space, device, clip_actions=True, num_envs=env.num_envs) +models_ppo["value"] = Value(env.observation_space, env.action_space, device, num_envs=env.num_envs) + + +# Configure and instantiate the agent. 
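+# Note: the compute() implementations above split each training sequence at terminated steps and
+# zero the hidden state there, so recurrent state is never carried across episode boundaries.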
+# Only modify some of the default configuration, visit its documentation to see all the options +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters +cfg_ppo = PPO_DEFAULT_CONFIG.copy() +cfg_ppo["rollouts"] = 1024 # memory_size +cfg_ppo["learning_epochs"] = 10 +cfg_ppo["mini_batches"] = 32 +cfg_ppo["discount_factor"] = 0.9 +cfg_ppo["lambda"] = 0.95 +cfg_ppo["learning_rate"] = 1e-3 +cfg_ppo["learning_rate_scheduler"] = KLAdaptiveRL +cfg_ppo["learning_rate_scheduler_kwargs"] = {"kl_threshold": 0.008} +cfg_ppo["grad_norm_clip"] = 0.5 +cfg_ppo["ratio_clip"] = 0.2 +cfg_ppo["value_clip"] = 0.2 +cfg_ppo["clip_predicted_values"] = False +cfg_ppo["entropy_loss_scale"] = 0.0 +cfg_ppo["value_loss_scale"] = 0.5 +cfg_ppo["kl_threshold"] = 0 +cfg_ppo["state_preprocessor"] = RunningStandardScaler +cfg_ppo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device} +cfg_ppo["value_preprocessor"] = RunningStandardScaler +cfg_ppo["value_preprocessor_kwargs"] = {"size": 1, "device": device} +# logging to TensorBoard and write checkpoints each 500 and 5000 timesteps respectively +cfg_ppo["experiment"]["write_interval"] = 500 +cfg_ppo["experiment"]["checkpoint_interval"] = 5000 + +agent_ppo = PPO(models=models_ppo, + memory=memory, + cfg=cfg_ppo, + observation_space=env.observation_space, + action_space=env.action_space, + device=device) + + +# Configure and instantiate the RL trainer +cfg_trainer = {"timesteps": 100000, "headless": True} +trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent_ppo) + +# start training +trainer.train() diff --git a/docs/source/examples/gym/ppo_gym_pendulumnovel_lstm.py b/docs/source/examples/gym/ppo_gym_pendulumnovel_lstm.py new file mode 100644 index 00000000..b5901411 --- /dev/null +++ b/docs/source/examples/gym/ppo_gym_pendulumnovel_lstm.py @@ -0,0 +1,228 @@ +import gym + +import torch +import torch.nn as nn +import numpy as np + +# Import the skrl components to build the RL system +from skrl.models.torch import Model, GaussianMixin, DeterministicMixin +from skrl.memories.torch import RandomMemory +from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG +from skrl.trainers.torch import SequentialTrainer +from skrl.resources.preprocessors.torch import RunningStandardScaler +from skrl.resources.schedulers.torch import KLAdaptiveRL +from skrl.envs.torch import wrap_env + + +# Define the models (stochastic and deterministic models) for the agent using mixins. 
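+# (unlike the GRU/RNN variants, the LSTM keeps both a hidden state and a cell state, so
+# get_specification() declares two tensors and compute() receives and returns both through "rnn")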
+# - Policy: takes as input the environment's observation/state and returns an action +# - Value: takes the state as input and provides a value to guide the policy +class Policy(GaussianMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum", + num_envs=1, num_layers=1, hidden_size=64, sequence_length=128): + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction) + + self.num_envs = num_envs + self.num_layers = num_layers + self.hidden_size = hidden_size # Hcell (Hout is Hcell because proj_size = 0) + self.sequence_length = sequence_length + + self.lstm = nn.LSTM(input_size=self.num_observations, + hidden_size=self.hidden_size, + num_layers=self.num_layers, + batch_first=True) # batch_first -> (batch, sequence, features) + + self.net = nn.Sequential(nn.Linear(self.hidden_size, 64), + nn.ReLU(), + nn.Linear(64, self.num_actions)) + self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + def get_specification(self): + # batch size (N) is the number of envs + return {"rnn": {"sequence_length": self.sequence_length, + "sizes": [(self.num_layers, self.num_envs, self.hidden_size), # hidden states (D ∗ num_layers, N, Hout) + (self.num_layers, self.num_envs, self.hidden_size)]}} # cell states (D ∗ num_layers, N, Hcell) + + def compute(self, inputs, role): + states = inputs["states"] + terminated = inputs.get("terminated", None) + hidden_states, cell_states = inputs["rnn"][0], inputs["rnn"][1] + + # training + if self.training: + rnn_input = states.view(-1, self.sequence_length, states.shape[-1]) # (N, L, Hin): N=batch_size, L=sequence_length + hidden_states = hidden_states.view(self.num_layers, -1, self.sequence_length, hidden_states.shape[-1]) # (D * num_layers, N, L, Hout) + cell_states = cell_states.view(self.num_layers, -1, self.sequence_length, cell_states.shape[-1]) # (D * num_layers, N, L, Hcell) + # get the hidden/cell states corresponding to the initial sequence + hidden_states = hidden_states[:,:,0,:].contiguous() # (D * num_layers, N, Hout) + cell_states = cell_states[:,:,0,:].contiguous() # (D * num_layers, N, Hcell) + + # reset the RNN state in the middle of a sequence + if terminated is not None and torch.any(terminated): + rnn_outputs = [] + terminated = terminated.view(-1, self.sequence_length) + indexes = [0] + (terminated[:,:-1].any(dim=0).nonzero(as_tuple=True)[0] + 1).tolist() + [self.sequence_length] + + for i in range(len(indexes) - 1): + i0, i1 = indexes[i], indexes[i + 1] + rnn_output, (hidden_states, cell_states) = self.lstm(rnn_input[:,i0:i1,:], (hidden_states, cell_states)) + hidden_states[:, (terminated[:,i1-1]), :] = 0 + cell_states[:, (terminated[:,i1-1]), :] = 0 + rnn_outputs.append(rnn_output) + + rnn_states = (hidden_states, cell_states) + rnn_output = torch.cat(rnn_outputs, dim=1) + # no need to reset the RNN state in the sequence + else: + rnn_output, rnn_states = self.lstm(rnn_input, (hidden_states, cell_states)) + # rollout + else: + rnn_input = states.view(-1, 1, states.shape[-1]) # (N, L, Hin): N=num_envs, L=1 + rnn_output, rnn_states = self.lstm(rnn_input, (hidden_states, cell_states)) + + # flatten the RNN output + rnn_output = torch.flatten(rnn_output, start_dim=0, end_dim=1) # (N, L, D ∗ Hout) -> (N * L, D ∗ Hout) + + # Pendulum-v1 action_space is -2 to 2 + return 2 * torch.tanh(self.net(rnn_output)), self.log_std_parameter, 
{"rnn": [rnn_states[0], rnn_states[1]]} + +class Value(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + num_envs=1, num_layers=1, hidden_size=64, sequence_length=128): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) + + self.num_envs = num_envs + self.num_layers = num_layers + self.hidden_size = hidden_size # Hcell (Hout is Hcell because proj_size = 0) + self.sequence_length = sequence_length + + self.lstm = nn.LSTM(input_size=self.num_observations, + hidden_size=self.hidden_size, + num_layers=self.num_layers, + batch_first=True) # batch_first -> (batch, sequence, features) + + self.net = nn.Sequential(nn.Linear(self.hidden_size, 64), + nn.ReLU(), + nn.Linear(64, 1)) + + def get_specification(self): + # batch size (N) is the number of envs + return {"rnn": {"sequence_length": self.sequence_length, + "sizes": [(self.num_layers, self.num_envs, self.hidden_size), # hidden states (D ∗ num_layers, N, Hout) + (self.num_layers, self.num_envs, self.hidden_size)]}} # cell states (D ∗ num_layers, N, Hcell) + + def compute(self, inputs, role): + states = inputs["states"] + terminated = inputs.get("terminated", None) + hidden_states, cell_states = inputs["rnn"][0], inputs["rnn"][1] + + # training + if self.training: + rnn_input = states.view(-1, self.sequence_length, states.shape[-1]) # (N, L, Hin): N=batch_size, L=sequence_length + + hidden_states = hidden_states.view(self.num_layers, -1, self.sequence_length, hidden_states.shape[-1]) # (D * num_layers, N, L, Hout) + cell_states = cell_states.view(self.num_layers, -1, self.sequence_length, cell_states.shape[-1]) # (D * num_layers, N, L, Hcell) + # get the hidden/cell states corresponding to the initial sequence + hidden_states = hidden_states[:,:,0,:].contiguous() # (D * num_layers, N, Hout) + cell_states = cell_states[:,:,0,:].contiguous() # (D * num_layers, N, Hcell) + + # reset the RNN state in the middle of a sequence + if terminated is not None and torch.any(terminated): + rnn_outputs = [] + terminated = terminated.view(-1, self.sequence_length) + indexes = [0] + (terminated[:,:-1].any(dim=0).nonzero(as_tuple=True)[0] + 1).tolist() + [self.sequence_length] + + for i in range(len(indexes) - 1): + i0, i1 = indexes[i], indexes[i + 1] + rnn_output, (hidden_states, cell_states) = self.lstm(rnn_input[:,i0:i1,:], (hidden_states, cell_states)) + hidden_states[:, (terminated[:,i1-1]), :] = 0 + cell_states[:, (terminated[:,i1-1]), :] = 0 + rnn_outputs.append(rnn_output) + + rnn_states = (hidden_states, cell_states) + rnn_output = torch.cat(rnn_outputs, dim=1) + # no need to reset the RNN state in the sequence + else: + rnn_output, rnn_states = self.lstm(rnn_input, (hidden_states, cell_states)) + # rollout + else: + rnn_input = states.view(-1, 1, states.shape[-1]) # (N, L, Hin): N=num_envs, L=1 + rnn_output, rnn_states = self.lstm(rnn_input, (hidden_states, cell_states)) + + # flatten the RNN output + rnn_output = torch.flatten(rnn_output, start_dim=0, end_dim=1) # (N, L, D ∗ Hout) -> (N * L, D ∗ Hout) + + return self.net(rnn_output), {"rnn": [rnn_states[0], rnn_states[1]]} + + +# Gym environment observation wrapper used to mask velocity. 
Adapted from rl_zoo3 (rl_zoo3/wrappers.py) +class NoVelocityWrapper(gym.ObservationWrapper): + def observation(self, observation): + # observation: x, y, angular velocity + return observation * np.array([1, 1, 0]) + +gym.envs.registration.register(id="PendulumNoVel-v1", entry_point=lambda: NoVelocityWrapper(gym.make("Pendulum-v1"))) + +# Load and wrap the Gym environment +env = gym.vector.make("PendulumNoVel-v1", num_envs=4, asynchronous=False) +env = wrap_env(env) + +device = env.device + + +# Instantiate a RandomMemory as rollout buffer (any memory can be used for this) +memory = RandomMemory(memory_size=1024, num_envs=env.num_envs, device=device) + + +# Instantiate the agent's models (function approximators). +# PPO requires 2 models, visit its documentation for more details +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models +models_ppo = {} +models_ppo["policy"] = Policy(env.observation_space, env.action_space, device, clip_actions=True, num_envs=env.num_envs) +models_ppo["value"] = Value(env.observation_space, env.action_space, device, num_envs=env.num_envs) + + +# Configure and instantiate the agent. +# Only modify some of the default configuration, visit its documentation to see all the options +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters +cfg_ppo = PPO_DEFAULT_CONFIG.copy() +cfg_ppo["rollouts"] = 1024 # memory_size +cfg_ppo["learning_epochs"] = 10 +cfg_ppo["mini_batches"] = 32 +cfg_ppo["discount_factor"] = 0.9 +cfg_ppo["lambda"] = 0.95 +cfg_ppo["learning_rate"] = 1e-3 +cfg_ppo["learning_rate_scheduler"] = KLAdaptiveRL +cfg_ppo["learning_rate_scheduler_kwargs"] = {"kl_threshold": 0.008} +cfg_ppo["grad_norm_clip"] = 0.5 +cfg_ppo["ratio_clip"] = 0.2 +cfg_ppo["value_clip"] = 0.2 +cfg_ppo["clip_predicted_values"] = False +cfg_ppo["entropy_loss_scale"] = 0.0 +cfg_ppo["value_loss_scale"] = 0.5 +cfg_ppo["kl_threshold"] = 0 +cfg_ppo["state_preprocessor"] = RunningStandardScaler +cfg_ppo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device} +cfg_ppo["value_preprocessor"] = RunningStandardScaler +cfg_ppo["value_preprocessor_kwargs"] = {"size": 1, "device": device} +# logging to TensorBoard and write checkpoints each 500 and 5000 timesteps respectively +cfg_ppo["experiment"]["write_interval"] = 500 +cfg_ppo["experiment"]["checkpoint_interval"] = 5000 + +agent_ppo = PPO(models=models_ppo, + memory=memory, + cfg=cfg_ppo, + observation_space=env.observation_space, + action_space=env.action_space, + device=device) + + +# Configure and instantiate the RL trainer +cfg_trainer = {"timesteps": 100000, "headless": True} +trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent_ppo) + +# start training +trainer.train() diff --git a/docs/source/examples/gym/ppo_gym_pendulumnovel_rnn.py b/docs/source/examples/gym/ppo_gym_pendulumnovel_rnn.py new file mode 100644 index 00000000..de595dc5 --- /dev/null +++ b/docs/source/examples/gym/ppo_gym_pendulumnovel_rnn.py @@ -0,0 +1,218 @@ +import gym + +import torch +import torch.nn as nn +import numpy as np + +# Import the skrl components to build the RL system +from skrl.models.torch import Model, GaussianMixin, DeterministicMixin +from skrl.memories.torch import RandomMemory +from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG +from skrl.trainers.torch import SequentialTrainer +from skrl.resources.preprocessors.torch import RunningStandardScaler +from skrl.resources.schedulers.torch import KLAdaptiveRL +from skrl.envs.torch 
import wrap_env + + +# Define the models (stochastic and deterministic models) for the agent using mixins. +# - Policy: takes as input the environment's observation/state and returns an action +# - Value: takes the state as input and provides a value to guide the policy +class Policy(GaussianMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum", + num_envs=1, num_layers=1, hidden_size=64, sequence_length=128): + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction) + + self.num_envs = num_envs + self.num_layers = num_layers + self.hidden_size = hidden_size # Hout + self.sequence_length = sequence_length + + self.rnn = nn.RNN(input_size=self.num_observations, + hidden_size=self.hidden_size, + num_layers=self.num_layers, + batch_first=True) # batch_first -> (batch, sequence, features) + + self.net = nn.Sequential(nn.Linear(self.hidden_size, 64), + nn.ReLU(), + nn.Linear(64, self.num_actions)) + self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + def get_specification(self): + # batch size (N) is the number of envs + return {"rnn": {"sequence_length": self.sequence_length, + "sizes": [(self.num_layers, self.num_envs, self.hidden_size)]}} # hidden states (D ∗ num_layers, N, Hout) + + def compute(self, inputs, role): + states = inputs["states"] + terminated = inputs.get("terminated", None) + hidden_states = inputs["rnn"][0] + + # training + if self.training: + rnn_input = states.view(-1, self.sequence_length, states.shape[-1]) # (N, L, Hin): N=batch_size, L=sequence_length + hidden_states = hidden_states.view(self.num_layers, -1, self.sequence_length, hidden_states.shape[-1]) # (D * num_layers, N, L, Hout) + # get the hidden states corresponding to the initial sequence + hidden_states = hidden_states[:,:,0,:].contiguous() # (D * num_layers, N, Hout) + + # reset the RNN state in the middle of a sequence + if terminated is not None and torch.any(terminated): + rnn_outputs = [] + terminated = terminated.view(-1, self.sequence_length) + indexes = [0] + (terminated[:,:-1].any(dim=0).nonzero(as_tuple=True)[0] + 1).tolist() + [self.sequence_length] + + for i in range(len(indexes) - 1): + i0, i1 = indexes[i], indexes[i + 1] + rnn_output, hidden_states = self.rnn(rnn_input[:,i0:i1,:], hidden_states) + hidden_states[:, (terminated[:,i1-1]), :] = 0 + rnn_outputs.append(rnn_output) + + rnn_output = torch.cat(rnn_outputs, dim=1) + # no need to reset the RNN state in the sequence + else: + rnn_output, hidden_states = self.rnn(rnn_input, hidden_states) + # rollout + else: + rnn_input = states.view(-1, 1, states.shape[-1]) # (N, L, Hin): N=num_envs, L=1 + rnn_output, hidden_states = self.rnn(rnn_input, hidden_states) + + # flatten the RNN output + rnn_output = torch.flatten(rnn_output, start_dim=0, end_dim=1) # (N, L, D ∗ Hout) -> (N * L, D ∗ Hout) + + # Pendulum-v1 action_space is -2 to 2 + return 2 * torch.tanh(self.net(rnn_output)), self.log_std_parameter, {"rnn": [hidden_states]} + +class Value(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + num_envs=1, num_layers=1, hidden_size=64, sequence_length=128): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) + + self.num_envs = num_envs + self.num_layers = num_layers + self.hidden_size = hidden_size # 
Hout + self.sequence_length = sequence_length + + self.rnn = nn.RNN(input_size=self.num_observations, + hidden_size=self.hidden_size, + num_layers=self.num_layers, + batch_first=True) # batch_first -> (batch, sequence, features) + + self.net = nn.Sequential(nn.Linear(self.hidden_size, 64), + nn.ReLU(), + nn.Linear(64, 1)) + + def get_specification(self): + # batch size (N) is the number of envs + return {"rnn": {"sequence_length": self.sequence_length, + "sizes": [(self.num_layers, self.num_envs, self.hidden_size)]}} # hidden states (D ∗ num_layers, N, Hout) + + def compute(self, inputs, role): + states = inputs["states"] + terminated = inputs.get("terminated", None) + hidden_states = inputs["rnn"][0] + + # training + if self.training: + rnn_input = states.view(-1, self.sequence_length, states.shape[-1]) # (N, L, Hin): N=batch_size, L=sequence_length + + hidden_states = hidden_states.view(self.num_layers, -1, self.sequence_length, hidden_states.shape[-1]) # (D * num_layers, N, L, Hout) + # get the hidden states corresponding to the initial sequence + hidden_states = hidden_states[:,:,0,:].contiguous() # (D * num_layers, N, Hout) + + # reset the RNN state in the middle of a sequence + if terminated is not None and torch.any(terminated): + rnn_outputs = [] + terminated = terminated.view(-1, self.sequence_length) + indexes = [0] + (terminated[:,:-1].any(dim=0).nonzero(as_tuple=True)[0] + 1).tolist() + [self.sequence_length] + + for i in range(len(indexes) - 1): + i0, i1 = indexes[i], indexes[i + 1] + rnn_output, hidden_states = self.rnn(rnn_input[:,i0:i1,:], hidden_states) + hidden_states[:, (terminated[:,i1-1]), :] = 0 + rnn_outputs.append(rnn_output) + + rnn_output = torch.cat(rnn_outputs, dim=1) + # no need to reset the RNN state in the sequence + else: + rnn_output, hidden_states = self.rnn(rnn_input, hidden_states) + # rollout + else: + rnn_input = states.view(-1, 1, states.shape[-1]) # (N, L, Hin): N=num_envs, L=1 + rnn_output, hidden_states = self.rnn(rnn_input, hidden_states) + + # flatten the RNN output + rnn_output = torch.flatten(rnn_output, start_dim=0, end_dim=1) # (N, L, D ∗ Hout) -> (N * L, D ∗ Hout) + + return self.net(rnn_output), {"rnn": [hidden_states]} + + +# Gym environment observation wrapper used to mask velocity. Adapted from rl_zoo3 (rl_zoo3/wrappers.py) +class NoVelocityWrapper(gym.ObservationWrapper): + def observation(self, observation): + # observation: x, y, angular velocity + return observation * np.array([1, 1, 0]) + +gym.envs.registration.register(id="PendulumNoVel-v1", entry_point=lambda: NoVelocityWrapper(gym.make("Pendulum-v1"))) + +# Load and wrap the Gym environment +env = gym.vector.make("PendulumNoVel-v1", num_envs=4, asynchronous=False) +env = wrap_env(env) + +device = env.device + + +# Instantiate a RandomMemory as rollout buffer (any memory can be used for this) +memory = RandomMemory(memory_size=1024, num_envs=env.num_envs, device=device) + + +# Instantiate the agent's models (function approximators). +# PPO requires 2 models, visit its documentation for more details +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models +models_ppo = {} +models_ppo["policy"] = Policy(env.observation_space, env.action_space, device, clip_actions=True, num_envs=env.num_envs) +models_ppo["value"] = Value(env.observation_space, env.action_space, device, num_envs=env.num_envs) + + +# Configure and instantiate the agent. 
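+# rollouts = 1024 is a multiple of the models' sequence_length = 128, so the collected rollout
+# can be reshaped into whole sequences by the view(...) calls in compute() during training.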
+# Only modify some of the default configuration, visit its documentation to see all the options +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters +cfg_ppo = PPO_DEFAULT_CONFIG.copy() +cfg_ppo["rollouts"] = 1024 # memory_size +cfg_ppo["learning_epochs"] = 10 +cfg_ppo["mini_batches"] = 32 +cfg_ppo["discount_factor"] = 0.9 +cfg_ppo["lambda"] = 0.95 +cfg_ppo["learning_rate"] = 1e-3 +cfg_ppo["learning_rate_scheduler"] = KLAdaptiveRL +cfg_ppo["learning_rate_scheduler_kwargs"] = {"kl_threshold": 0.008} +cfg_ppo["grad_norm_clip"] = 0.5 +cfg_ppo["ratio_clip"] = 0.2 +cfg_ppo["value_clip"] = 0.2 +cfg_ppo["clip_predicted_values"] = False +cfg_ppo["entropy_loss_scale"] = 0.0 +cfg_ppo["value_loss_scale"] = 0.5 +cfg_ppo["kl_threshold"] = 0 +cfg_ppo["state_preprocessor"] = RunningStandardScaler +cfg_ppo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device} +cfg_ppo["value_preprocessor"] = RunningStandardScaler +cfg_ppo["value_preprocessor_kwargs"] = {"size": 1, "device": device} +# logging to TensorBoard and write checkpoints each 500 and 5000 timesteps respectively +cfg_ppo["experiment"]["write_interval"] = 500 +cfg_ppo["experiment"]["checkpoint_interval"] = 5000 + +agent_ppo = PPO(models=models_ppo, + memory=memory, + cfg=cfg_ppo, + observation_space=env.observation_space, + action_space=env.action_space, + device=device) + + +# Configure and instantiate the RL trainer +cfg_trainer = {"timesteps": 100000, "headless": True} +trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent_ppo) + +# start training +trainer.train() diff --git a/docs/source/examples/gym/gym_frozen_lake_q_learning.py b/docs/source/examples/gym/q_learning_gym_frozen_lake.py similarity index 88% rename from docs/source/examples/gym/gym_frozen_lake_q_learning.py rename to docs/source/examples/gym/q_learning_gym_frozen_lake.py index c3141f38..881e86bb 100644 --- a/docs/source/examples/gym/gym_frozen_lake_q_learning.py +++ b/docs/source/examples/gym/q_learning_gym_frozen_lake.py @@ -16,18 +16,18 @@ def __init__(self, observation_space, action_space, device, num_envs=1, epsilon= TabularMixin.__init__(self, num_envs) self.epsilon = epsilon - self.q_table = torch.ones((num_envs, self.num_observations, self.num_actions), + self.q_table = torch.ones((num_envs, self.num_observations, self.num_actions), dtype=torch.float32, device=self.device) - - def compute(self, states, taken_actions, role): - actions = torch.argmax(self.q_table[torch.arange(self.num_envs).view(-1, 1), states], + + def compute(self, inputs, role): + actions = torch.argmax(self.q_table[torch.arange(self.num_envs).view(-1, 1), inputs["states"]], dim=-1, keepdim=True).view(-1,1) - + # choose random actions for exploration according to epsilon - indexes = (torch.rand(states.shape[0], device=self.device) < self.epsilon).nonzero().view(-1) + indexes = (torch.rand(inputs["states"].shape[0], device=self.device) < self.epsilon).nonzero().view(-1) if indexes.numel(): actions[indexes] = torch.randint(self.num_actions, (indexes.numel(), 1), device=self.device) - return actions + return actions, {} # Load and wrap the Gym environment. 
@@ -55,15 +55,15 @@ def compute(self, states, taken_actions, role): # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.q_learning.html#configuration-and-hyperparameters cfg_q_learning = Q_LEARNING_DEFAULT_CONFIG.copy() cfg_q_learning["discount_factor"] = 0.999 -cfg_q_learning["alpha"] = 0.4 +cfg_q_learning["alpha"] = 0.4 # logging to TensorBoard and write checkpoints each 1600 and 8000 timesteps respectively cfg_q_learning["experiment"]["write_interval"] = 1600 cfg_q_learning["experiment"]["checkpoint_interval"] = 8000 agent_q_learning = Q_LEARNING(models=models_q_learning, - memory=None, - cfg=cfg_q_learning, - observation_space=env.observation_space, + memory=None, + cfg=cfg_q_learning, + observation_space=env.observation_space, action_space=env.action_space, device=device) diff --git a/docs/source/examples/gym/gym_frozen_lake_q_learning_eval.py b/docs/source/examples/gym/q_learning_gym_frozen_lake_eval.py similarity index 89% rename from docs/source/examples/gym/gym_frozen_lake_q_learning_eval.py rename to docs/source/examples/gym/q_learning_gym_frozen_lake_eval.py index bfdc9452..74487c8c 100644 --- a/docs/source/examples/gym/gym_frozen_lake_q_learning_eval.py +++ b/docs/source/examples/gym/q_learning_gym_frozen_lake_eval.py @@ -16,18 +16,18 @@ def __init__(self, observation_space, action_space, device, num_envs=1, epsilon= TabularMixin.__init__(self, num_envs) self.epsilon = epsilon - self.q_table = torch.ones((num_envs, self.num_observations, self.num_actions), + self.q_table = torch.ones((num_envs, self.num_observations, self.num_actions), dtype=torch.float32, device=self.device) - - def compute(self, states, taken_actions, role): - actions = torch.argmax(self.q_table[torch.arange(self.num_envs).view(-1, 1), states], + + def compute(self, inputs, role): + actions = torch.argmax(self.q_table[torch.arange(self.num_envs).view(-1, 1), inputs["states"]], dim=-1, keepdim=True).view(-1,1) - + # choose random actions for exploration according to epsilon - indexes = (torch.rand(states.shape[0], device=self.device) < self.epsilon).nonzero().view(-1) + indexes = (torch.rand(inputs["states"].shape[0], device=self.device) < self.epsilon).nonzero().view(-1) if indexes.numel(): actions[indexes] = torch.randint(self.num_actions, (indexes.numel(), 1), device=self.device) - return actions + return actions, {} # Load and wrap the Gym environment. 
@@ -60,9 +60,9 @@ def compute(self, states, taken_actions, role): cfg_q_learning["experiment"]["checkpoint_interval"] = 0 agent_q_learning = Q_LEARNING(models=models_q_learning, - memory=None, - cfg=cfg_q_learning, - observation_space=env.observation_space, + memory=None, + cfg=cfg_q_learning, + observation_space=env.observation_space, action_space=env.action_space, device=device) diff --git a/docs/source/examples/gym/gym_vector_frozen_lake_q_learning.py b/docs/source/examples/gym/q_learning_gym_frozen_lake_vector.py similarity index 88% rename from docs/source/examples/gym/gym_vector_frozen_lake_q_learning.py rename to docs/source/examples/gym/q_learning_gym_frozen_lake_vector.py index efc912be..a16908f4 100644 --- a/docs/source/examples/gym/gym_vector_frozen_lake_q_learning.py +++ b/docs/source/examples/gym/q_learning_gym_frozen_lake_vector.py @@ -16,18 +16,18 @@ def __init__(self, observation_space, action_space, device, num_envs=1, epsilon= TabularMixin.__init__(self, num_envs) self.epsilon = epsilon - self.q_table = torch.ones((num_envs, self.num_observations, self.num_actions), + self.q_table = torch.ones((num_envs, self.num_observations, self.num_actions), dtype=torch.float32, device=self.device) - - def compute(self, states, taken_actions, role): - actions = torch.argmax(self.q_table[torch.arange(self.num_envs).view(-1, 1), states], + + def compute(self, inputs, role): + actions = torch.argmax(self.q_table[torch.arange(self.num_envs).view(-1, 1), inputs["states"]], dim=-1, keepdim=True).view(-1,1) - + # choose random actions for exploration according to epsilon - indexes = (torch.rand(states.shape[0], device=self.device) < self.epsilon).nonzero().view(-1) + indexes = (torch.rand(inputs["states"].shape[0], device=self.device) < self.epsilon).nonzero().view(-1) if indexes.numel(): actions[indexes] = torch.randint(self.num_actions, (indexes.numel(), 1), device=self.device) - return actions + return actions, {} # Load and wrap the Gym environment. 
@@ -55,15 +55,15 @@ def compute(self, states, taken_actions, role): # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.q_learning.html#configuration-and-hyperparameters cfg_q_learning = Q_LEARNING_DEFAULT_CONFIG.copy() cfg_q_learning["discount_factor"] = 0.999 -cfg_q_learning["alpha"] = 0.4 +cfg_q_learning["alpha"] = 0.4 # logging to TensorBoard and write checkpoints each 1600 and 8000 timesteps respectively cfg_q_learning["experiment"]["write_interval"] = 1600 cfg_q_learning["experiment"]["checkpoint_interval"] = 8000 agent_q_learning = Q_LEARNING(models=models_q_learning, - memory=None, - cfg=cfg_q_learning, - observation_space=env.observation_space, + memory=None, + cfg=cfg_q_learning, + observation_space=env.observation_space, action_space=env.action_space, device=device) diff --git a/docs/source/examples/gym/sac_gym_pendulum.py b/docs/source/examples/gym/sac_gym_pendulum.py new file mode 100644 index 00000000..2c08048c --- /dev/null +++ b/docs/source/examples/gym/sac_gym_pendulum.py @@ -0,0 +1,109 @@ +import gym + +import torch +import torch.nn as nn +import torch.nn.functional as F + +# Import the skrl components to build the RL system +from skrl.models.torch import Model, DeterministicMixin, GaussianMixin +from skrl.memories.torch import RandomMemory +from skrl.agents.torch.sac import SAC, SAC_DEFAULT_CONFIG +from skrl.trainers.torch import SequentialTrainer +from skrl.envs.torch import wrap_env + + +# Define the models (deterministic models) for the SAC agent using mixin +# - Actor (policy): takes as input the environment's observation/state and returns an action +# - Critic: takes the state and action as input and provides a value to guide the policy +class Actor(GaussianMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"): + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction) + + self.linear_layer_1 = nn.Linear(self.num_observations, 400) + self.linear_layer_2 = nn.Linear(400, 300) + self.action_layer = nn.Linear(300, self.num_actions) + + self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + def compute(self, inputs, role): + x = F.relu(self.linear_layer_1(inputs["states"])) + x = F.relu(self.linear_layer_2(x)) + # Pendulum-v1 action_space is -2 to 2 + return 2 * torch.tanh(self.action_layer(x)), self.log_std_parameter, {} + +class Critic(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) + + self.linear_layer_1 = nn.Linear(self.num_observations + self.num_actions, 400) + self.linear_layer_2 = nn.Linear(400, 300) + self.linear_layer_3 = nn.Linear(300, 1) + + def compute(self, inputs, role): + x = F.relu(self.linear_layer_1(torch.cat([inputs["states"], inputs["taken_actions"]], dim=1))) + x = F.relu(self.linear_layer_2(x)) + return self.linear_layer_3(x), {} + + +# Load and wrap the Gym environment. +# Note: the environment version may change depending on the gym version +try: + env = gym.make("Pendulum-v1") +except gym.error.DeprecatedEnv as e: + env_id = [spec.id for spec in gym.envs.registry.all() if spec.id.startswith("Pendulum-v")][0] + print("Pendulum-v1 not found. 
Trying {}".format(env_id)) + env = gym.make(env_id) +env = wrap_env(env) + +device = env.device + + +# Instantiate a RandomMemory (without replacement) as experience replay memory +memory = RandomMemory(memory_size=20000, num_envs=env.num_envs, device=device, replacement=False) + + +# Instantiate the agent's models (function approximators). +# SAC requires 5 models, visit its documentation for more details +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.sac.html#spaces-and-models +models_sac = {} +models_sac["policy"] = Actor(env.observation_space, env.action_space, device, clip_actions=True) +models_sac["critic_1"] = Critic(env.observation_space, env.action_space, device) +models_sac["critic_2"] = Critic(env.observation_space, env.action_space, device) +models_sac["target_critic_1"] = Critic(env.observation_space, env.action_space, device) +models_sac["target_critic_2"] = Critic(env.observation_space, env.action_space, device) + +# Initialize the models' parameters (weights and biases) using a Gaussian distribution +for model in models_sac.values(): + model.init_parameters(method_name="normal_", mean=0.0, std=0.1) + + +# Configure and instantiate the agent. +# Only modify some of the default configuration, visit its documentation to see all the options +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.sac.html#configuration-and-hyperparameters +cfg_sac = SAC_DEFAULT_CONFIG.copy() +cfg_sac["discount_factor"] = 0.98 +cfg_sac["batch_size"] = 100 +cfg_sac["random_timesteps"] = 0 +cfg_sac["learning_starts"] = 1000 +cfg_sac["learn_entropy"] = True +# logging to TensorBoard and write checkpoints each 75 and 750 timesteps respectively +cfg_sac["experiment"]["write_interval"] = 75 +cfg_sac["experiment"]["checkpoint_interval"] = 750 + +agent_sac = SAC(models=models_sac, + memory=memory, + cfg=cfg_sac, + observation_space=env.observation_space, + action_space=env.action_space, + device=device) + + +# Configure and instantiate the RL trainer +cfg_trainer = {"timesteps": 15000, "headless": True} +trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent_sac) + +# start training +trainer.train() diff --git a/docs/source/examples/gym/sac_gym_pendulumnovel.py b/docs/source/examples/gym/sac_gym_pendulumnovel.py new file mode 100644 index 00000000..50debf6f --- /dev/null +++ b/docs/source/examples/gym/sac_gym_pendulumnovel.py @@ -0,0 +1,112 @@ +import gym + +import torch +import torch.nn as nn +import torch.nn.functional as F +import numpy as np + +# Import the skrl components to build the RL system +from skrl.models.torch import Model, DeterministicMixin, GaussianMixin +from skrl.memories.torch import RandomMemory +from skrl.agents.torch.sac import SAC, SAC_DEFAULT_CONFIG +from skrl.trainers.torch import SequentialTrainer +from skrl.envs.torch import wrap_env + + +# Define the models (deterministic models) for the SAC agent using mixin +# - Actor (policy): takes as input the environment's observation/state and returns an action +# - Critic: takes the state and action as input and provides a value to guide the policy +class Actor(GaussianMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"): + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction) + + self.linear_layer_1 = nn.Linear(self.num_observations, 400) + self.linear_layer_2 = nn.Linear(400, 300) + 
self.action_layer = nn.Linear(300, self.num_actions) + + self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + def compute(self, inputs, role): + x = F.relu(self.linear_layer_1(inputs["states"])) + x = F.relu(self.linear_layer_2(x)) + # Pendulum-v1 action_space is -2 to 2 + return 2 * torch.tanh(self.action_layer(x)), self.log_std_parameter, {} + +class Critic(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) + + self.linear_layer_1 = nn.Linear(self.num_observations + self.num_actions, 400) + self.linear_layer_2 = nn.Linear(400, 300) + self.linear_layer_3 = nn.Linear(300, 1) + + def compute(self, inputs, role): + x = F.relu(self.linear_layer_1(torch.cat([inputs["states"], inputs["taken_actions"]], dim=1))) + x = F.relu(self.linear_layer_2(x)) + return self.linear_layer_3(x), {} + + +# Gym environment observation wrapper used to mask velocity. Adapted from rl_zoo3 (rl_zoo3/wrappers.py) +class NoVelocityWrapper(gym.ObservationWrapper): + def observation(self, observation): + # observation: x, y, angular velocity + return observation * np.array([1, 1, 0]) + +gym.envs.registration.register(id="PendulumNoVel-v1", entry_point=lambda: NoVelocityWrapper(gym.make("Pendulum-v1"))) + +# Load and wrap the Gym environment +env = gym.make("PendulumNoVel-v1") +env = wrap_env(env) + +device = env.device + + +# Instantiate a RandomMemory (without replacement) as experience replay memory +memory = RandomMemory(memory_size=20000, num_envs=env.num_envs, device=device, replacement=False) + + +# Instantiate the agent's models (function approximators). +# SAC requires 5 models, visit its documentation for more details +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.sac.html#spaces-and-models +models_sac = {} +models_sac["policy"] = Actor(env.observation_space, env.action_space, device, clip_actions=True) +models_sac["critic_1"] = Critic(env.observation_space, env.action_space, device) +models_sac["critic_2"] = Critic(env.observation_space, env.action_space, device) +models_sac["target_critic_1"] = Critic(env.observation_space, env.action_space, device) +models_sac["target_critic_2"] = Critic(env.observation_space, env.action_space, device) + +# Initialize the models' parameters (weights and biases) using a Gaussian distribution +for model in models_sac.values(): + model.init_parameters(method_name="normal_", mean=0.0, std=0.1) + + +# Configure and instantiate the agent. 
+# Only modify some of the default configuration, visit its documentation to see all the options +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.sac.html#configuration-and-hyperparameters +cfg_sac = SAC_DEFAULT_CONFIG.copy() +cfg_sac["discount_factor"] = 0.98 +cfg_sac["batch_size"] = 100 +cfg_sac["random_timesteps"] = 0 +cfg_sac["learning_starts"] = 1000 +cfg_sac["learn_entropy"] = True +# logging to TensorBoard and write checkpoints each 75 and 750 timesteps respectively +cfg_sac["experiment"]["write_interval"] = 75 +cfg_sac["experiment"]["checkpoint_interval"] = 750 + +agent_sac = SAC(models=models_sac, + memory=memory, + cfg=cfg_sac, + observation_space=env.observation_space, + action_space=env.action_space, + device=device) + + +# Configure and instantiate the RL trainer +cfg_trainer = {"timesteps": 15000, "headless": True} +trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent_sac) + +# start training +trainer.train() diff --git a/docs/source/examples/gym/sac_gym_pendulumnovel_gru.py b/docs/source/examples/gym/sac_gym_pendulumnovel_gru.py new file mode 100644 index 00000000..66e6d111 --- /dev/null +++ b/docs/source/examples/gym/sac_gym_pendulumnovel_gru.py @@ -0,0 +1,213 @@ +import gym + +import torch +import torch.nn as nn +import torch.nn.functional as F +import numpy as np + +# Import the skrl components to build the RL system +from skrl.models.torch import Model, DeterministicMixin, GaussianMixin +from skrl.memories.torch import RandomMemory +from skrl.agents.torch.sac import SAC, SAC_DEFAULT_CONFIG +from skrl.trainers.torch import SequentialTrainer +from skrl.envs.torch import wrap_env + + +# Define the models (deterministic models) for the SAC agent using mixin +# - Actor (policy): takes as input the environment's observation/state and returns an action +# - Critic: takes the state and action as input and provides a value to guide the policy +class Actor(GaussianMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum", + num_envs=1, num_layers=1, hidden_size=400, sequence_length=20): + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction) + + self.num_envs = num_envs + self.num_layers = num_layers + self.hidden_size = hidden_size # Hout + self.sequence_length = sequence_length + + self.gru = nn.GRU(input_size=self.num_observations, + hidden_size=self.hidden_size, + num_layers=self.num_layers, + batch_first=True) # batch_first -> (batch, sequence, features) + + self.linear_layer_1 = nn.Linear(self.hidden_size, 400) + self.linear_layer_2 = nn.Linear(400, 300) + self.action_layer = nn.Linear(300, self.num_actions) + + self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + def get_specification(self): + # batch size (N) is the number of envs + return {"rnn": {"sequence_length": self.sequence_length, + "sizes": [(self.num_layers, self.num_envs, self.hidden_size)]}} # hidden states (D ∗ num_layers, N, Hout) + + def compute(self, inputs, role): + states = inputs["states"] + terminated = inputs.get("terminated", None) + hidden_states = inputs["rnn"][0] + + # training + if self.training: + rnn_input = states.view(-1, self.sequence_length, states.shape[-1]) # (N, L, Hin): N=batch_size, L=sequence_length + hidden_states = hidden_states.view(self.num_layers, -1, self.sequence_length, hidden_states.shape[-1]) # (D * num_layers, N, L, Hout) 
+ # get the hidden states corresponding to the initial sequence + hidden_states = hidden_states[:,:,0,:].contiguous() # (D * num_layers, N, Hout) + + # reset the RNN state in the middle of a sequence + if terminated is not None and torch.any(terminated): + rnn_outputs = [] + terminated = terminated.view(-1, self.sequence_length) + indexes = [0] + (terminated[:,:-1].any(dim=0).nonzero(as_tuple=True)[0] + 1).tolist() + [self.sequence_length] + + for i in range(len(indexes) - 1): + i0, i1 = indexes[i], indexes[i + 1] + rnn_output, hidden_states = self.gru(rnn_input[:,i0:i1,:], hidden_states) + hidden_states[:, (terminated[:,i1-1]), :] = 0 + rnn_outputs.append(rnn_output) + + rnn_output = torch.cat(rnn_outputs, dim=1) + # no need to reset the RNN state in the sequence + else: + rnn_output, hidden_states = self.gru(rnn_input, hidden_states) + # rollout + else: + rnn_input = states.view(-1, 1, states.shape[-1]) # (N, L, Hin): N=num_envs, L=1 + rnn_output, hidden_states = self.gru(rnn_input, hidden_states) + + # flatten the RNN output + rnn_output = torch.flatten(rnn_output, start_dim=0, end_dim=1) # (N, L, D ∗ Hout) -> (N * L, D ∗ Hout) + + x = F.relu(self.linear_layer_1(rnn_output)) + x = F.relu(self.linear_layer_2(x)) + + # Pendulum-v1 action_space is -2 to 2 + return 2 * torch.tanh(self.action_layer(x)), self.log_std_parameter, {"rnn": [hidden_states]} + +class Critic(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + num_envs=1, num_layers=1, hidden_size=400, sequence_length=20): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) + + self.num_envs = num_envs + self.num_layers = num_layers + self.hidden_size = hidden_size # Hout + self.sequence_length = sequence_length + + self.gru = nn.GRU(input_size=self.num_observations, + hidden_size=self.hidden_size, + num_layers=self.num_layers, + batch_first=True) # batch_first -> (batch, sequence, features) + + self.linear_layer_1 = nn.Linear(self.hidden_size + self.num_actions, 400) + self.linear_layer_2 = nn.Linear(400, 300) + self.linear_layer_3 = nn.Linear(300, 1) + + def get_specification(self): + # batch size (N) is the number of envs + return {"rnn": {"sequence_length": self.sequence_length, + "sizes": [(self.num_layers, self.num_envs, self.hidden_size)]}} # hidden states (D ∗ num_layers, N, Hout) + + def compute(self, inputs, role): + states = inputs["states"] + terminated = inputs.get("terminated", None) + hidden_states = inputs["rnn"][0] + + # critic is only used during training + rnn_input = states.view(-1, self.sequence_length, states.shape[-1]) # (N, L, Hin): N=batch_size, L=sequence_length + + hidden_states = hidden_states.view(self.num_layers, -1, self.sequence_length, hidden_states.shape[-1]) # (D * num_layers, N, L, Hout) + # get the hidden states corresponding to the initial sequence + sequence_index = 1 if role in ["target_critic_1", "target_critic_2"] else 0 # target networks act on the next state of the environment + hidden_states = hidden_states[:,:,sequence_index,:].contiguous() # (D * num_layers, N, Hout) + + # reset the RNN state in the middle of a sequence + if terminated is not None and torch.any(terminated): + rnn_outputs = [] + terminated = terminated.view(-1, self.sequence_length) + indexes = [0] + (terminated[:,:-1].any(dim=0).nonzero(as_tuple=True)[0] + 1).tolist() + [self.sequence_length] + + for i in range(len(indexes) - 1): + i0, i1 = indexes[i], indexes[i + 1] + rnn_output, hidden_states = 
self.gru(rnn_input[:,i0:i1,:], hidden_states) + hidden_states[:, (terminated[:,i1-1]), :] = 0 + rnn_outputs.append(rnn_output) + + rnn_output = torch.cat(rnn_outputs, dim=1) + # no need to reset the RNN state in the sequence + else: + rnn_output, hidden_states = self.gru(rnn_input, hidden_states) + + # flatten the RNN output + rnn_output = torch.flatten(rnn_output, start_dim=0, end_dim=1) # (N, L, D ∗ Hout) -> (N * L, D ∗ Hout) + + x = F.relu(self.linear_layer_1(torch.cat([rnn_output, inputs["taken_actions"]], dim=1))) + x = F.relu(self.linear_layer_2(x)) + + return self.linear_layer_3(x), {"rnn": [hidden_states]} + + +# Gym environment observation wrapper used to mask velocity. Adapted from rl_zoo3 (rl_zoo3/wrappers.py) +class NoVelocityWrapper(gym.ObservationWrapper): + def observation(self, observation): + # observation: x, y, angular velocity + return observation * np.array([1, 1, 0]) + +gym.envs.registration.register(id="PendulumNoVel-v1", entry_point=lambda: NoVelocityWrapper(gym.make("Pendulum-v1"))) + +# Load and wrap the Gym environment +env = gym.make("PendulumNoVel-v1") +env = wrap_env(env) + +device = env.device + + +# Instantiate a RandomMemory (without replacement) as experience replay memory +memory = RandomMemory(memory_size=20000, num_envs=env.num_envs, device=device, replacement=False) + + +# Instantiate the agent's models (function approximators). +# SAC requires 5 models, visit its documentation for more details +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.sac.html#spaces-and-models +models_sac = {} +models_sac["policy"] = Actor(env.observation_space, env.action_space, device, clip_actions=True, num_envs=env.num_envs) +models_sac["critic_1"] = Critic(env.observation_space, env.action_space, device, num_envs=env.num_envs) +models_sac["critic_2"] = Critic(env.observation_space, env.action_space, device, num_envs=env.num_envs) +models_sac["target_critic_1"] = Critic(env.observation_space, env.action_space, device, num_envs=env.num_envs) +models_sac["target_critic_2"] = Critic(env.observation_space, env.action_space, device, num_envs=env.num_envs) + +# Initialize the models' parameters (weights and biases) using a Gaussian distribution +for model in models_sac.values(): + model.init_parameters(method_name="normal_", mean=0.0, std=0.1) + + +# Configure and instantiate the agent. 
+# Only modify some of the default configuration, visit its documentation to see all the options +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.sac.html#configuration-and-hyperparameters +cfg_sac = SAC_DEFAULT_CONFIG.copy() +cfg_sac["discount_factor"] = 0.98 +cfg_sac["batch_size"] = 100 +cfg_sac["random_timesteps"] = 0 +cfg_sac["learning_starts"] = 1000 +cfg_sac["learn_entropy"] = True +# logging to TensorBoard and write checkpoints each 75 and 750 timesteps respectively +cfg_sac["experiment"]["write_interval"] = 75 +cfg_sac["experiment"]["checkpoint_interval"] = 750 + +agent_sac = SAC(models=models_sac, + memory=memory, + cfg=cfg_sac, + observation_space=env.observation_space, + action_space=env.action_space, + device=device) + + +# Configure and instantiate the RL trainer +cfg_trainer = {"timesteps": 15000, "headless": True} +trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent_sac) + +# start training +trainer.train() diff --git a/docs/source/examples/gym/sac_gym_pendulumnovel_lstm.py b/docs/source/examples/gym/sac_gym_pendulumnovel_lstm.py new file mode 100644 index 00000000..8866a4bb --- /dev/null +++ b/docs/source/examples/gym/sac_gym_pendulumnovel_lstm.py @@ -0,0 +1,223 @@ +import gym + +import torch +import torch.nn as nn +import torch.nn.functional as F +import numpy as np + +# Import the skrl components to build the RL system +from skrl.models.torch import Model, DeterministicMixin, GaussianMixin +from skrl.memories.torch import RandomMemory +from skrl.agents.torch.sac import SAC, SAC_DEFAULT_CONFIG +from skrl.trainers.torch import SequentialTrainer +from skrl.envs.torch import wrap_env + + +# Define the models (deterministic models) for the SAC agent using mixin +# - Actor (policy): takes as input the environment's observation/state and returns an action +# - Critic: takes the state and action as input and provides a value to guide the policy +class Actor(GaussianMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum", + num_envs=1, num_layers=1, hidden_size=400, sequence_length=20): + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction) + + self.num_envs = num_envs + self.num_layers = num_layers + self.hidden_size = hidden_size # Hcell (Hout is Hcell because proj_size = 0) + self.sequence_length = sequence_length + + self.lstm = nn.LSTM(input_size=self.num_observations, + hidden_size=self.hidden_size, + num_layers=self.num_layers, + batch_first=True) # batch_first -> (batch, sequence, features) + + self.linear_layer_1 = nn.Linear(self.hidden_size, 400) + self.linear_layer_2 = nn.Linear(400, 300) + self.action_layer = nn.Linear(300, self.num_actions) + + self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + def get_specification(self): + # batch size (N) is the number of envs + return {"rnn": {"sequence_length": self.sequence_length, + "sizes": [(self.num_layers, self.num_envs, self.hidden_size), # hidden states (D ∗ num_layers, N, Hout) + (self.num_layers, self.num_envs, self.hidden_size)]}} # cell states (D ∗ num_layers, N, Hcell) + + def compute(self, inputs, role): + states = inputs["states"] + terminated = inputs.get("terminated", None) + hidden_states, cell_states = inputs["rnn"][0], inputs["rnn"][1] + + # training + if self.training: + rnn_input = states.view(-1, self.sequence_length, states.shape[-1]) # (N, L, 
Hin): N=batch_size, L=sequence_length + hidden_states = hidden_states.view(self.num_layers, -1, self.sequence_length, hidden_states.shape[-1]) # (D * num_layers, N, L, Hout) + cell_states = cell_states.view(self.num_layers, -1, self.sequence_length, cell_states.shape[-1]) # (D * num_layers, N, L, Hcell) + # get the hidden/cell states corresponding to the initial sequence + hidden_states = hidden_states[:,:,0,:].contiguous() # (D * num_layers, N, Hout) + cell_states = cell_states[:,:,0,:].contiguous() # (D * num_layers, N, Hcell) + + # reset the RNN state in the middle of a sequence + if terminated is not None and torch.any(terminated): + rnn_outputs = [] + terminated = terminated.view(-1, self.sequence_length) + indexes = [0] + (terminated[:,:-1].any(dim=0).nonzero(as_tuple=True)[0] + 1).tolist() + [self.sequence_length] + + for i in range(len(indexes) - 1): + i0, i1 = indexes[i], indexes[i + 1] + rnn_output, (hidden_states, cell_states) = self.lstm(rnn_input[:,i0:i1,:], (hidden_states, cell_states)) + hidden_states[:, (terminated[:,i1-1]), :] = 0 + cell_states[:, (terminated[:,i1-1]), :] = 0 + rnn_outputs.append(rnn_output) + + rnn_states = (hidden_states, cell_states) + rnn_output = torch.cat(rnn_outputs, dim=1) + # no need to reset the RNN state in the sequence + else: + rnn_output, rnn_states = self.lstm(rnn_input, (hidden_states, cell_states)) + # rollout + else: + rnn_input = states.view(-1, 1, states.shape[-1]) # (N, L, Hin): N=num_envs, L=1 + rnn_output, rnn_states = self.lstm(rnn_input, (hidden_states, cell_states)) + + # flatten the RNN output + rnn_output = torch.flatten(rnn_output, start_dim=0, end_dim=1) # (N, L, D ∗ Hout) -> (N * L, D ∗ Hout) + + x = F.relu(self.linear_layer_1(rnn_output)) + x = F.relu(self.linear_layer_2(x)) + + # Pendulum-v1 action_space is -2 to 2 + return 2 * torch.tanh(self.action_layer(x)), self.log_std_parameter, {"rnn": [rnn_states[0], rnn_states[1]]} + +class Critic(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + num_envs=1, num_layers=1, hidden_size=400, sequence_length=20): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) + + self.num_envs = num_envs + self.num_layers = num_layers + self.hidden_size = hidden_size # Hcell (Hout is Hcell because proj_size = 0) + self.sequence_length = sequence_length + + self.lstm = nn.LSTM(input_size=self.num_observations, + hidden_size=self.hidden_size, + num_layers=self.num_layers, + batch_first=True) # batch_first -> (batch, sequence, features) + + self.linear_layer_1 = nn.Linear(self.hidden_size + self.num_actions, 400) + self.linear_layer_2 = nn.Linear(400, 300) + self.linear_layer_3 = nn.Linear(300, 1) + + def get_specification(self): + # batch size (N) is the number of envs + return {"rnn": {"sequence_length": self.sequence_length, + "sizes": [(self.num_layers, self.num_envs, self.hidden_size), # hidden states (D ∗ num_layers, N, Hout) + (self.num_layers, self.num_envs, self.hidden_size)]}} # cell states (D ∗ num_layers, N, Hcell) + + def compute(self, inputs, role): + states = inputs["states"] + terminated = inputs.get("terminated", None) + hidden_states, cell_states = inputs["rnn"][0], inputs["rnn"][1] + + # critic is only used during training + rnn_input = states.view(-1, self.sequence_length, states.shape[-1]) # (N, L, Hin): N=batch_size, L=sequence_length + + hidden_states = hidden_states.view(self.num_layers, -1, self.sequence_length, hidden_states.shape[-1]) # (D * 
num_layers, N, L, Hout) + cell_states = cell_states.view(self.num_layers, -1, self.sequence_length, cell_states.shape[-1]) # (D * num_layers, N, L, Hcell) + # get the hidden/cell states corresponding to the initial sequence + sequence_index = 1 if role in ["target_critic_1", "target_critic_2"] else 0 # target networks act on the next state of the environment + hidden_states = hidden_states[:,:,sequence_index,:].contiguous() # (D * num_layers, N, Hout) + cell_states = cell_states[:,:,sequence_index,:].contiguous() # (D * num_layers, N, Hcell) + + # reset the RNN state in the middle of a sequence + if terminated is not None and torch.any(terminated): + rnn_outputs = [] + terminated = terminated.view(-1, self.sequence_length) + indexes = [0] + (terminated[:,:-1].any(dim=0).nonzero(as_tuple=True)[0] + 1).tolist() + [self.sequence_length] + + for i in range(len(indexes) - 1): + i0, i1 = indexes[i], indexes[i + 1] + rnn_output, (hidden_states, cell_states) = self.lstm(rnn_input[:,i0:i1,:], (hidden_states, cell_states)) + hidden_states[:, (terminated[:,i1-1]), :] = 0 + cell_states[:, (terminated[:,i1-1]), :] = 0 + rnn_outputs.append(rnn_output) + + rnn_states = (hidden_states, cell_states) + rnn_output = torch.cat(rnn_outputs, dim=1) + # no need to reset the RNN state in the sequence + else: + rnn_output, rnn_states = self.lstm(rnn_input, (hidden_states, cell_states)) + + # flatten the RNN output + rnn_output = torch.flatten(rnn_output, start_dim=0, end_dim=1) # (N, L, D ∗ Hout) -> (N * L, D ∗ Hout) + + x = F.relu(self.linear_layer_1(torch.cat([rnn_output, inputs["taken_actions"]], dim=1))) + x = F.relu(self.linear_layer_2(x)) + + return self.linear_layer_3(x), {"rnn": [rnn_states[0], rnn_states[1]]} + + +# Gym environment observation wrapper used to mask velocity. Adapted from rl_zoo3 (rl_zoo3/wrappers.py) +class NoVelocityWrapper(gym.ObservationWrapper): + def observation(self, observation): + # observation: x, y, angular velocity + return observation * np.array([1, 1, 0]) + +gym.envs.registration.register(id="PendulumNoVel-v1", entry_point=lambda: NoVelocityWrapper(gym.make("Pendulum-v1"))) + +# Load and wrap the Gym environment +env = gym.make("PendulumNoVel-v1") +env = wrap_env(env) + +device = env.device + + +# Instantiate a RandomMemory (without replacement) as experience replay memory +memory = RandomMemory(memory_size=20000, num_envs=env.num_envs, device=device, replacement=False) + + +# Instantiate the agent's models (function approximators). +# SAC requires 5 models, visit its documentation for more details +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.sac.html#spaces-and-models +models_sac = {} +models_sac["policy"] = Actor(env.observation_space, env.action_space, device, clip_actions=True, num_envs=env.num_envs) +models_sac["critic_1"] = Critic(env.observation_space, env.action_space, device, num_envs=env.num_envs) +models_sac["critic_2"] = Critic(env.observation_space, env.action_space, device, num_envs=env.num_envs) +models_sac["target_critic_1"] = Critic(env.observation_space, env.action_space, device, num_envs=env.num_envs) +models_sac["target_critic_2"] = Critic(env.observation_space, env.action_space, device, num_envs=env.num_envs) + +# Initialize the models' parameters (weights and biases) using a Gaussian distribution +for model in models_sac.values(): + model.init_parameters(method_name="normal_", mean=0.0, std=0.1) + + +# Configure and instantiate the agent. 
+# Only modify some of the default configuration, visit its documentation to see all the options +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.sac.html#configuration-and-hyperparameters +cfg_sac = SAC_DEFAULT_CONFIG.copy() +cfg_sac["discount_factor"] = 0.98 +cfg_sac["batch_size"] = 100 +cfg_sac["random_timesteps"] = 0 +cfg_sac["learning_starts"] = 1000 +cfg_sac["learn_entropy"] = True +# logging to TensorBoard and write checkpoints each 75 and 750 timesteps respectively +cfg_sac["experiment"]["write_interval"] = 75 +cfg_sac["experiment"]["checkpoint_interval"] = 750 + +agent_sac = SAC(models=models_sac, + memory=memory, + cfg=cfg_sac, + observation_space=env.observation_space, + action_space=env.action_space, + device=device) + + +# Configure and instantiate the RL trainer +cfg_trainer = {"timesteps": 15000, "headless": True} +trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent_sac) + +# start training +trainer.train() diff --git a/docs/source/examples/gym/sac_gym_pendulumnovel_rnn.py b/docs/source/examples/gym/sac_gym_pendulumnovel_rnn.py new file mode 100644 index 00000000..03ea028d --- /dev/null +++ b/docs/source/examples/gym/sac_gym_pendulumnovel_rnn.py @@ -0,0 +1,213 @@ +import gym + +import torch +import torch.nn as nn +import torch.nn.functional as F +import numpy as np + +# Import the skrl components to build the RL system +from skrl.models.torch import Model, DeterministicMixin, GaussianMixin +from skrl.memories.torch import RandomMemory +from skrl.agents.torch.sac import SAC, SAC_DEFAULT_CONFIG +from skrl.trainers.torch import SequentialTrainer +from skrl.envs.torch import wrap_env + + +# Define the models (deterministic models) for the SAC agent using mixin +# - Actor (policy): takes as input the environment's observation/state and returns an action +# - Critic: takes the state and action as input and provides a value to guide the policy +class Actor(GaussianMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum", + num_envs=1, num_layers=1, hidden_size=400, sequence_length=20): + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction) + + self.num_envs = num_envs + self.num_layers = num_layers + self.hidden_size = hidden_size # Hout + self.sequence_length = sequence_length + + self.rnn = nn.RNN(input_size=self.num_observations, + hidden_size=self.hidden_size, + num_layers=self.num_layers, + batch_first=True) # batch_first -> (batch, sequence, features) + + self.linear_layer_1 = nn.Linear(self.hidden_size, 400) + self.linear_layer_2 = nn.Linear(400, 300) + self.action_layer = nn.Linear(300, self.num_actions) + + self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + def get_specification(self): + # batch size (N) is the number of envs + return {"rnn": {"sequence_length": self.sequence_length, + "sizes": [(self.num_layers, self.num_envs, self.hidden_size)]}} # hidden states (D ∗ num_layers, N, Hout) + + def compute(self, inputs, role): + states = inputs["states"] + terminated = inputs.get("terminated", None) + hidden_states = inputs["rnn"][0] + + # training + if self.training: + rnn_input = states.view(-1, self.sequence_length, states.shape[-1]) # (N, L, Hin): N=batch_size, L=sequence_length + hidden_states = hidden_states.view(self.num_layers, -1, self.sequence_length, hidden_states.shape[-1]) # (D * num_layers, N, L, Hout) 
+ # get the hidden states corresponding to the initial sequence + hidden_states = hidden_states[:,:,0,:].contiguous() # (D * num_layers, N, Hout) + + # reset the RNN state in the middle of a sequence + if terminated is not None and torch.any(terminated): + rnn_outputs = [] + terminated = terminated.view(-1, self.sequence_length) + indexes = [0] + (terminated[:,:-1].any(dim=0).nonzero(as_tuple=True)[0] + 1).tolist() + [self.sequence_length] + + for i in range(len(indexes) - 1): + i0, i1 = indexes[i], indexes[i + 1] + rnn_output, hidden_states = self.rnn(rnn_input[:,i0:i1,:], hidden_states) + hidden_states[:, (terminated[:,i1-1]), :] = 0 + rnn_outputs.append(rnn_output) + + rnn_output = torch.cat(rnn_outputs, dim=1) + # no need to reset the RNN state in the sequence + else: + rnn_output, hidden_states = self.rnn(rnn_input, hidden_states) + # rollout + else: + rnn_input = states.view(-1, 1, states.shape[-1]) # (N, L, Hin): N=num_envs, L=1 + rnn_output, hidden_states = self.rnn(rnn_input, hidden_states) + + # flatten the RNN output + rnn_output = torch.flatten(rnn_output, start_dim=0, end_dim=1) # (N, L, D ∗ Hout) -> (N * L, D ∗ Hout) + + x = F.relu(self.linear_layer_1(rnn_output)) + x = F.relu(self.linear_layer_2(x)) + + # Pendulum-v1 action_space is -2 to 2 + return 2 * torch.tanh(self.action_layer(x)), self.log_std_parameter, {"rnn": [hidden_states]} + +class Critic(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + num_envs=1, num_layers=1, hidden_size=400, sequence_length=20): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) + + self.num_envs = num_envs + self.num_layers = num_layers + self.hidden_size = hidden_size # Hout + self.sequence_length = sequence_length + + self.rnn = nn.RNN(input_size=self.num_observations, + hidden_size=self.hidden_size, + num_layers=self.num_layers, + batch_first=True) # batch_first -> (batch, sequence, features) + + self.linear_layer_1 = nn.Linear(self.hidden_size + self.num_actions, 400) + self.linear_layer_2 = nn.Linear(400, 300) + self.linear_layer_3 = nn.Linear(300, 1) + + def get_specification(self): + # batch size (N) is the number of envs + return {"rnn": {"sequence_length": self.sequence_length, + "sizes": [(self.num_layers, self.num_envs, self.hidden_size)]}} # hidden states (D ∗ num_layers, N, Hout) + + def compute(self, inputs, role): + states = inputs["states"] + terminated = inputs.get("terminated", None) + hidden_states = inputs["rnn"][0] + + # critic is only used during training + rnn_input = states.view(-1, self.sequence_length, states.shape[-1]) # (N, L, Hin): N=batch_size, L=sequence_length + + hidden_states = hidden_states.view(self.num_layers, -1, self.sequence_length, hidden_states.shape[-1]) # (D * num_layers, N, L, Hout) + # get the hidden states corresponding to the initial sequence + sequence_index = 1 if role in ["target_critic_1", "target_critic_2"] else 0 # target networks act on the next state of the environment + hidden_states = hidden_states[:,:,sequence_index,:].contiguous() # (D * num_layers, N, Hout) + + # reset the RNN state in the middle of a sequence + if terminated is not None and torch.any(terminated): + rnn_outputs = [] + terminated = terminated.view(-1, self.sequence_length) + indexes = [0] + (terminated[:,:-1].any(dim=0).nonzero(as_tuple=True)[0] + 1).tolist() + [self.sequence_length] + + for i in range(len(indexes) - 1): + i0, i1 = indexes[i], indexes[i + 1] + rnn_output, hidden_states = 
self.rnn(rnn_input[:,i0:i1,:], hidden_states) + hidden_states[:, (terminated[:,i1-1]), :] = 0 + rnn_outputs.append(rnn_output) + + rnn_output = torch.cat(rnn_outputs, dim=1) + # no need to reset the RNN state in the sequence + else: + rnn_output, hidden_states = self.rnn(rnn_input, hidden_states) + + # flatten the RNN output + rnn_output = torch.flatten(rnn_output, start_dim=0, end_dim=1) # (N, L, D ∗ Hout) -> (N * L, D ∗ Hout) + + x = F.relu(self.linear_layer_1(torch.cat([rnn_output, inputs["taken_actions"]], dim=1))) + x = F.relu(self.linear_layer_2(x)) + + return self.linear_layer_3(x), {"rnn": [hidden_states]} + + +# Gym environment observation wrapper used to mask velocity. Adapted from rl_zoo3 (rl_zoo3/wrappers.py) +class NoVelocityWrapper(gym.ObservationWrapper): + def observation(self, observation): + # observation: x, y, angular velocity + return observation * np.array([1, 1, 0]) + +gym.envs.registration.register(id="PendulumNoVel-v1", entry_point=lambda: NoVelocityWrapper(gym.make("Pendulum-v1"))) + +# Load and wrap the Gym environment +env = gym.make("PendulumNoVel-v1") +env = wrap_env(env) + +device = env.device + + +# Instantiate a RandomMemory (without replacement) as experience replay memory +memory = RandomMemory(memory_size=20000, num_envs=env.num_envs, device=device, replacement=False) + + +# Instantiate the agent's models (function approximators). +# SAC requires 5 models, visit its documentation for more details +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.sac.html#spaces-and-models +models_sac = {} +models_sac["policy"] = Actor(env.observation_space, env.action_space, device, clip_actions=True, num_envs=env.num_envs) +models_sac["critic_1"] = Critic(env.observation_space, env.action_space, device, num_envs=env.num_envs) +models_sac["critic_2"] = Critic(env.observation_space, env.action_space, device, num_envs=env.num_envs) +models_sac["target_critic_1"] = Critic(env.observation_space, env.action_space, device, num_envs=env.num_envs) +models_sac["target_critic_2"] = Critic(env.observation_space, env.action_space, device, num_envs=env.num_envs) + +# Initialize the models' parameters (weights and biases) using a Gaussian distribution +for model in models_sac.values(): + model.init_parameters(method_name="normal_", mean=0.0, std=0.1) + + +# Configure and instantiate the agent. 
+# Only modify some of the default configuration, visit its documentation to see all the options +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.sac.html#configuration-and-hyperparameters +cfg_sac = SAC_DEFAULT_CONFIG.copy() +cfg_sac["discount_factor"] = 0.98 +cfg_sac["batch_size"] = 100 +cfg_sac["random_timesteps"] = 0 +cfg_sac["learning_starts"] = 1000 +cfg_sac["learn_entropy"] = True +# logging to TensorBoard and write checkpoints each 75 and 750 timesteps respectively +cfg_sac["experiment"]["write_interval"] = 75 +cfg_sac["experiment"]["checkpoint_interval"] = 750 + +agent_sac = SAC(models=models_sac, + memory=memory, + cfg=cfg_sac, + observation_space=env.observation_space, + action_space=env.action_space, + device=device) + + +# Configure and instantiate the RL trainer +cfg_trainer = {"timesteps": 15000, "headless": True} +trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent_sac) + +# start training +trainer.train() diff --git a/docs/source/examples/gym/gym_taxi_sarsa.py b/docs/source/examples/gym/sarsa_gym_taxi.py similarity index 86% rename from docs/source/examples/gym/gym_taxi_sarsa.py rename to docs/source/examples/gym/sarsa_gym_taxi.py index 5f7ba8dd..f991bc09 100644 --- a/docs/source/examples/gym/gym_taxi_sarsa.py +++ b/docs/source/examples/gym/sarsa_gym_taxi.py @@ -16,18 +16,18 @@ def __init__(self, observation_space, action_space, device, num_envs=1, epsilon= TabularMixin.__init__(self, num_envs) self.epsilon = epsilon - self.q_table = torch.ones((num_envs, self.num_observations, self.num_actions), + self.q_table = torch.ones((num_envs, self.num_observations, self.num_actions), dtype=torch.float32, device=self.device) - - def compute(self, states, taken_actions, role): - actions = torch.argmax(self.q_table[torch.arange(self.num_envs).view(-1, 1), states], + + def compute(self, inputs, role): + actions = torch.argmax(self.q_table[torch.arange(self.num_envs).view(-1, 1), inputs["states"]], dim=-1, keepdim=True).view(-1,1) - + # choose random actions for exploration according to epsilon - indexes = (torch.rand(states.shape[0], device=self.device) < self.epsilon).nonzero().view(-1) + indexes = (torch.rand(inputs["states"].shape[0], device=self.device) < self.epsilon).nonzero().view(-1) if indexes.numel(): actions[indexes] = torch.randint(self.num_actions, (indexes.numel(), 1), device=self.device) - return actions + return actions, {} # Load and wrap the Gym environment. 
@@ -55,15 +55,15 @@ def compute(self, states, taken_actions, role): # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.sarsa.html#configuration-and-hyperparameters cfg_sarsa = SARSA_DEFAULT_CONFIG.copy() cfg_sarsa["discount_factor"] = 0.999 -cfg_sarsa["alpha"] = 0.4 +cfg_sarsa["alpha"] = 0.4 # logging to TensorBoard and write checkpoints each 1600 and 8000 timesteps respectively cfg_sarsa["experiment"]["write_interval"] = 1600 cfg_sarsa["experiment"]["checkpoint_interval"] = 8000 agent_sarsa = SARSA(models=models_sarsa, - memory=None, - cfg=cfg_sarsa, - observation_space=env.observation_space, + memory=None, + cfg=cfg_sarsa, + observation_space=env.observation_space, action_space=env.action_space, device=device) diff --git a/docs/source/examples/gym/gym_taxi_sarsa_eval.py b/docs/source/examples/gym/sarsa_gym_taxi_eval.py similarity index 85% rename from docs/source/examples/gym/gym_taxi_sarsa_eval.py rename to docs/source/examples/gym/sarsa_gym_taxi_eval.py index 4cd64ac3..430e3585 100644 --- a/docs/source/examples/gym/gym_taxi_sarsa_eval.py +++ b/docs/source/examples/gym/sarsa_gym_taxi_eval.py @@ -9,25 +9,25 @@ from skrl.envs.torch import wrap_env -# Define the model (tabular models) for the SARSA agent using a helper class +# Define the model (tabular model) for the SARSA agent using a helper class class EpilonGreedyPolicy(TabularMixin, Model): def __init__(self, observation_space, action_space, device, num_envs=1, epsilon=0.1): Model.__init__(self, observation_space, action_space, device) TabularMixin.__init__(self, num_envs) self.epsilon = epsilon - self.q_table = torch.ones((num_envs, self.num_observations, self.num_actions), + self.q_table = torch.ones((num_envs, self.num_observations, self.num_actions), dtype=torch.float32, device=self.device) - - def compute(self, states, taken_actions, role): - actions = torch.argmax(self.q_table[torch.arange(self.num_envs).view(-1, 1), states], + + def compute(self, inputs, role): + actions = torch.argmax(self.q_table[torch.arange(self.num_envs).view(-1, 1), inputs["states"]], dim=-1, keepdim=True).view(-1,1) - + # choose random actions for exploration according to epsilon - indexes = (torch.rand(states.shape[0], device=self.device) < self.epsilon).nonzero().view(-1) + indexes = (torch.rand(inputs["states"].shape[0], device=self.device) < self.epsilon).nonzero().view(-1) if indexes.numel(): actions[indexes] = torch.randint(self.num_actions, (indexes.numel(), 1), device=self.device) - return actions + return actions, {} # Load and wrap the Gym environment. 
@@ -60,9 +60,9 @@ def compute(self, states, taken_actions, role): cfg_sarsa["experiment"]["checkpoint_interval"] = 0 agent_sarsa = SARSA(models=models_sarsa, - memory=None, - cfg=cfg_sarsa, - observation_space=env.observation_space, + memory=None, + cfg=cfg_sarsa, + observation_space=env.observation_space, action_space=env.action_space, device=device) diff --git a/docs/source/examples/gym/gym_vector_taxi_sarsa.py b/docs/source/examples/gym/sarsa_gym_taxi_vector.py similarity index 87% rename from docs/source/examples/gym/gym_vector_taxi_sarsa.py rename to docs/source/examples/gym/sarsa_gym_taxi_vector.py index 581489b2..a4370569 100644 --- a/docs/source/examples/gym/gym_vector_taxi_sarsa.py +++ b/docs/source/examples/gym/sarsa_gym_taxi_vector.py @@ -16,18 +16,18 @@ def __init__(self, observation_space, action_space, device, num_envs=1, epsilon= TabularMixin.__init__(self, num_envs) self.epsilon = epsilon - self.q_table = torch.ones((num_envs, self.num_observations, self.num_actions), + self.q_table = torch.ones((num_envs, self.num_observations, self.num_actions), dtype=torch.float32, device=self.device) - - def compute(self, states, taken_actions, role): - actions = torch.argmax(self.q_table[torch.arange(self.num_envs).view(-1, 1), states], + + def compute(self, inputs, role): + actions = torch.argmax(self.q_table[torch.arange(self.num_envs).view(-1, 1), inputs["states"]], dim=-1, keepdim=True).view(-1,1) - + # choose random actions for exploration according to epsilon - indexes = (torch.rand(states.shape[0], device=self.device) < self.epsilon).nonzero().view(-1) + indexes = (torch.rand(inputs["states"].shape[0], device=self.device) < self.epsilon).nonzero().view(-1) if indexes.numel(): actions[indexes] = torch.randint(self.num_actions, (indexes.numel(), 1), device=self.device) - return actions + return actions, {} # Load and wrap the Gym environment. 
@@ -55,15 +55,15 @@ def compute(self, states, taken_actions, role): # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.sarsa.html#configuration-and-hyperparameters cfg_sarsa = SARSA_DEFAULT_CONFIG.copy() cfg_sarsa["discount_factor"] = 0.999 -cfg_sarsa["alpha"] = 0.4 +cfg_sarsa["alpha"] = 0.4 # logging to TensorBoard and write checkpoints each 1600 and 8000 timesteps respectively cfg_sarsa["experiment"]["write_interval"] = 1600 cfg_sarsa["experiment"]["checkpoint_interval"] = 8000 agent_sarsa = SARSA(models=models_sarsa, - memory=None, - cfg=cfg_sarsa, - observation_space=env.observation_space, + memory=None, + cfg=cfg_sarsa, + observation_space=env.observation_space, action_space=env.action_space, device=device) diff --git a/docs/source/examples/gym/td3_gym_pendulum.py b/docs/source/examples/gym/td3_gym_pendulum.py new file mode 100644 index 00000000..8f4be4b5 --- /dev/null +++ b/docs/source/examples/gym/td3_gym_pendulum.py @@ -0,0 +1,110 @@ +import gym + +import torch +import torch.nn as nn +import torch.nn.functional as F + +# Import the skrl components to build the RL system +from skrl.models.torch import Model, DeterministicMixin +from skrl.memories.torch import RandomMemory +from skrl.agents.torch.td3 import TD3, TD3_DEFAULT_CONFIG +from skrl.resources.noises.torch import GaussianNoise +from skrl.trainers.torch import SequentialTrainer +from skrl.envs.torch import wrap_env + + +# Define the models (deterministic models) for the TD3 agent using mixin +# - Actor (policy): takes as input the environment's observation/state and returns an action +# - Critic: takes the state and action as input and provides a value to guide the policy +class Actor(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) + + self.linear_layer_1 = nn.Linear(self.num_observations, 400) + self.linear_layer_2 = nn.Linear(400, 300) + self.action_layer = nn.Linear(300, self.num_actions) + + def compute(self, inputs, role): + x = F.relu(self.linear_layer_1(inputs["states"])) + x = F.relu(self.linear_layer_2(x)) + # Pendulum-v1 action_space is -2 to 2 + return 2 * torch.tanh(self.action_layer(x)), {} + +class Critic(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) + + self.linear_layer_1 = nn.Linear(self.num_observations + self.num_actions, 400) + self.linear_layer_2 = nn.Linear(400, 300) + self.linear_layer_3 = nn.Linear(300, 1) + + def compute(self, inputs, role): + x = F.relu(self.linear_layer_1(torch.cat([inputs["states"], inputs["taken_actions"]], dim=1))) + x = F.relu(self.linear_layer_2(x)) + return self.linear_layer_3(x), {} + + +# Load and wrap the Gym environment. +# Note: the environment version may change depending on the gym version +try: + env = gym.make("Pendulum-v1") +except gym.error.DeprecatedEnv as e: + env_id = [spec.id for spec in gym.envs.registry.all() if spec.id.startswith("Pendulum-v")][0] + print("Pendulum-v1 not found. 
Trying {}".format(env_id)) + env = gym.make(env_id) +env = wrap_env(env) + +device = env.device + + +# Instantiate a RandomMemory (without replacement) as experience replay memory +memory = RandomMemory(memory_size=20000, num_envs=env.num_envs, device=device, replacement=False) + + +# Instantiate the agent's models (function approximators). +# TD3 requires 6 models, visit its documentation for more details +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.td3.html#spaces-and-models +models_td3 = {} +models_td3["policy"] = Actor(env.observation_space, env.action_space, device) +models_td3["target_policy"] = Actor(env.observation_space, env.action_space, device) +models_td3["critic_1"] = Critic(env.observation_space, env.action_space, device) +models_td3["critic_2"] = Critic(env.observation_space, env.action_space, device) +models_td3["target_critic_1"] = Critic(env.observation_space, env.action_space, device) +models_td3["target_critic_2"] = Critic(env.observation_space, env.action_space, device) + +# Initialize the models' parameters (weights and biases) using a Gaussian distribution +for model in models_td3.values(): + model.init_parameters(method_name="normal_", mean=0.0, std=0.1) + + +# Configure and instantiate the agent. +# Only modify some of the default configuration, visit its documentation to see all the options +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.td3.html#configuration-and-hyperparameters +cfg_td3 = TD3_DEFAULT_CONFIG.copy() +cfg_td3["exploration"]["noise"] = GaussianNoise(0, 0.1, device=device) +cfg_td3["smooth_regularization_noise"] = GaussianNoise(0, 0.2, device=device) +cfg_td3["smooth_regularization_clip"] = 0.5 +cfg_td3["discount_factor"] = 0.98 +cfg_td3["batch_size"] = 100 +cfg_td3["random_timesteps"] = 1000 +cfg_td3["learning_starts"] = 1000 +# logging to TensorBoard and write checkpoints each 75 and 750 timesteps respectively +cfg_td3["experiment"]["write_interval"] = 75 +cfg_td3["experiment"]["checkpoint_interval"] = 750 + +agent_td3 = TD3(models=models_td3, + memory=memory, + cfg=cfg_td3, + observation_space=env.observation_space, + action_space=env.action_space, + device=device) + + +# Configure and instantiate the RL trainer +cfg_trainer = {"timesteps": 15000, "headless": True} +trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent_td3) + +# start training +trainer.train() diff --git a/docs/source/examples/gym/td3_gym_pendulumnovel.py b/docs/source/examples/gym/td3_gym_pendulumnovel.py new file mode 100644 index 00000000..1e85427a --- /dev/null +++ b/docs/source/examples/gym/td3_gym_pendulumnovel.py @@ -0,0 +1,113 @@ +import gym + +import torch +import torch.nn as nn +import torch.nn.functional as F +import numpy as np + +# Import the skrl components to build the RL system +from skrl.models.torch import Model, DeterministicMixin +from skrl.memories.torch import RandomMemory +from skrl.agents.torch.td3 import TD3, TD3_DEFAULT_CONFIG +from skrl.resources.noises.torch import GaussianNoise +from skrl.trainers.torch import SequentialTrainer +from skrl.envs.torch import wrap_env + + +# Define the models (deterministic models) for the TD3 agent using mixin +# - Actor (policy): takes as input the environment's observation/state and returns an action +# - Critic: takes the state and action as input and provides a value to guide the policy +class Actor(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False): + Model.__init__(self, observation_space, action_space, device) + 
DeterministicMixin.__init__(self, clip_actions) + + self.linear_layer_1 = nn.Linear(self.num_observations, 400) + self.linear_layer_2 = nn.Linear(400, 300) + self.action_layer = nn.Linear(300, self.num_actions) + + def compute(self, inputs, role): + x = F.relu(self.linear_layer_1(inputs["states"])) + x = F.relu(self.linear_layer_2(x)) + # Pendulum-v1 action_space is -2 to 2 + return 2 * torch.tanh(self.action_layer(x)), {} + +class Critic(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) + + self.linear_layer_1 = nn.Linear(self.num_observations + self.num_actions, 400) + self.linear_layer_2 = nn.Linear(400, 300) + self.linear_layer_3 = nn.Linear(300, 1) + + def compute(self, inputs, role): + x = F.relu(self.linear_layer_1(torch.cat([inputs["states"], inputs["taken_actions"]], dim=1))) + x = F.relu(self.linear_layer_2(x)) + return self.linear_layer_3(x), {} + + +# Gym environment observation wrapper used to mask velocity. Adapted from rl_zoo3 (rl_zoo3/wrappers.py) +class NoVelocityWrapper(gym.ObservationWrapper): + def observation(self, observation): + # observation: x, y, angular velocity + return observation * np.array([1, 1, 0]) + +gym.envs.registration.register(id="PendulumNoVel-v1", entry_point=lambda: NoVelocityWrapper(gym.make("Pendulum-v1"))) + +# Load and wrap the Gym environment +env = gym.make("PendulumNoVel-v1") +env = wrap_env(env) + +device = env.device + + +# Instantiate a RandomMemory (without replacement) as experience replay memory +memory = RandomMemory(memory_size=20000, num_envs=env.num_envs, device=device, replacement=False) + + +# Instantiate the agent's models (function approximators). +# TD3 requires 6 models, visit its documentation for more details +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.td3.html#spaces-and-models +models_td3 = {} +models_td3["policy"] = Actor(env.observation_space, env.action_space, device) +models_td3["target_policy"] = Actor(env.observation_space, env.action_space, device) +models_td3["critic_1"] = Critic(env.observation_space, env.action_space, device) +models_td3["critic_2"] = Critic(env.observation_space, env.action_space, device) +models_td3["target_critic_1"] = Critic(env.observation_space, env.action_space, device) +models_td3["target_critic_2"] = Critic(env.observation_space, env.action_space, device) + +# Initialize the models' parameters (weights and biases) using a Gaussian distribution +for model in models_td3.values(): + model.init_parameters(method_name="normal_", mean=0.0, std=0.1) + + +# Configure and instantiate the agent. 
+# Only modify some of the default configuration, visit its documentation to see all the options +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.td3.html#configuration-and-hyperparameters +cfg_td3 = TD3_DEFAULT_CONFIG.copy() +cfg_td3["exploration"]["noise"] = GaussianNoise(0, 0.1, device=device) +cfg_td3["smooth_regularization_noise"] = GaussianNoise(0, 0.2, device=device) +cfg_td3["smooth_regularization_clip"] = 0.5 +cfg_td3["discount_factor"] = 0.98 +cfg_td3["batch_size"] = 100 +cfg_td3["random_timesteps"] = 1000 +cfg_td3["learning_starts"] = 1000 +# logging to TensorBoard and write checkpoints each 75 and 750 timesteps respectively +cfg_td3["experiment"]["write_interval"] = 75 +cfg_td3["experiment"]["checkpoint_interval"] = 750 + +agent_td3 = TD3(models=models_td3, + memory=memory, + cfg=cfg_td3, + observation_space=env.observation_space, + action_space=env.action_space, + device=device) + + +# Configure and instantiate the RL trainer +cfg_trainer = {"timesteps": 15000, "headless": True} +trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent_td3) + +# start training +trainer.train() diff --git a/docs/source/examples/gym/td3_gym_pendulumnovel_gru.py b/docs/source/examples/gym/td3_gym_pendulumnovel_gru.py new file mode 100644 index 00000000..af7c93da --- /dev/null +++ b/docs/source/examples/gym/td3_gym_pendulumnovel_gru.py @@ -0,0 +1,215 @@ +import gym + +import torch +import torch.nn as nn +import torch.nn.functional as F +import numpy as np + +# Import the skrl components to build the RL system +from skrl.models.torch import Model, DeterministicMixin +from skrl.memories.torch import RandomMemory +from skrl.agents.torch.td3 import TD3, TD3_DEFAULT_CONFIG +from skrl.resources.noises.torch import GaussianNoise +from skrl.trainers.torch import SequentialTrainer +from skrl.envs.torch import wrap_env + + +# Define the models (deterministic models) for the TD3 agent using mixin +# - Actor (policy): takes as input the environment's observation/state and returns an action +# - Critic: takes the state and action as input and provides a value to guide the policy +class Actor(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + num_envs=1, num_layers=1, hidden_size=400, sequence_length=20): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) + + self.num_envs = num_envs + self.num_layers = num_layers + self.hidden_size = hidden_size # Hout + self.sequence_length = sequence_length + + self.gru = nn.GRU(input_size=self.num_observations, + hidden_size=self.hidden_size, + num_layers=self.num_layers, + batch_first=True) # batch_first -> (batch, sequence, features) + + self.linear_layer_1 = nn.Linear(self.hidden_size, 400) + self.linear_layer_2 = nn.Linear(400, 300) + self.action_layer = nn.Linear(300, self.num_actions) + + def get_specification(self): + # batch size (N) is the number of envs + return {"rnn": {"sequence_length": self.sequence_length, + "sizes": [(self.num_layers, self.num_envs, self.hidden_size)]}} # hidden states (D ∗ num_layers, N, Hout) + + def compute(self, inputs, role): + states = inputs["states"] + terminated = inputs.get("terminated", None) + hidden_states = inputs["rnn"][0] + + # training + if self.training: + rnn_input = states.view(-1, self.sequence_length, states.shape[-1]) # (N, L, Hin): N=batch_size, L=sequence_length + hidden_states = hidden_states.view(self.num_layers, -1, self.sequence_length, hidden_states.shape[-1]) # (D * 
num_layers, N, L, Hout) + # get the hidden states corresponding to the initial sequence + sequence_index = 1 if role == "target_policy" else 0 # target networks act on the next state of the environment + hidden_states = hidden_states[:,:,sequence_index,:].contiguous() # (D * num_layers, N, Hout) + + # reset the RNN state in the middle of a sequence + if terminated is not None and torch.any(terminated): + rnn_outputs = [] + terminated = terminated.view(-1, self.sequence_length) + indexes = [0] + (terminated[:,:-1].any(dim=0).nonzero(as_tuple=True)[0] + 1).tolist() + [self.sequence_length] + + for i in range(len(indexes) - 1): + i0, i1 = indexes[i], indexes[i + 1] + rnn_output, hidden_states = self.gru(rnn_input[:,i0:i1,:], hidden_states) + hidden_states[:, (terminated[:,i1-1]), :] = 0 + rnn_outputs.append(rnn_output) + + rnn_output = torch.cat(rnn_outputs, dim=1) + # no need to reset the RNN state in the sequence + else: + rnn_output, hidden_states = self.gru(rnn_input, hidden_states) + # rollout + else: + rnn_input = states.view(-1, 1, states.shape[-1]) # (N, L, Hin): N=num_envs, L=1 + rnn_output, hidden_states = self.gru(rnn_input, hidden_states) + + # flatten the RNN output + rnn_output = torch.flatten(rnn_output, start_dim=0, end_dim=1) # (N, L, D ∗ Hout) -> (N * L, D ∗ Hout) + + x = F.relu(self.linear_layer_1(rnn_output)) + x = F.relu(self.linear_layer_2(x)) + + # Pendulum-v1 action_space is -2 to 2 + return 2 * torch.tanh(self.action_layer(x)), {"rnn": [hidden_states]} + +class Critic(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + num_envs=1, num_layers=1, hidden_size=400, sequence_length=20): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) + + self.num_envs = num_envs + self.num_layers = num_layers + self.hidden_size = hidden_size # Hout + self.sequence_length = sequence_length + + self.gru = nn.GRU(input_size=self.num_observations, + hidden_size=self.hidden_size, + num_layers=self.num_layers, + batch_first=True) # batch_first -> (batch, sequence, features) + + self.linear_layer_1 = nn.Linear(self.hidden_size + self.num_actions, 400) + self.linear_layer_2 = nn.Linear(400, 300) + self.linear_layer_3 = nn.Linear(300, 1) + + def get_specification(self): + # batch size (N) is the number of envs + return {"rnn": {"sequence_length": self.sequence_length, + "sizes": [(self.num_layers, self.num_envs, self.hidden_size)]}} # hidden states (D ∗ num_layers, N, Hout) + + def compute(self, inputs, role): + states = inputs["states"] + terminated = inputs.get("terminated", None) + hidden_states = inputs["rnn"][0] + + # critic is only used during training + rnn_input = states.view(-1, self.sequence_length, states.shape[-1]) # (N, L, Hin): N=batch_size, L=sequence_length + + hidden_states = hidden_states.view(self.num_layers, -1, self.sequence_length, hidden_states.shape[-1]) # (D * num_layers, N, L, Hout) + # get the hidden states corresponding to the initial sequence + sequence_index = 1 if role in ["target_critic_1", "target_critic_2"] else 0 # target networks act on the next state of the environment + hidden_states = hidden_states[:,:,sequence_index,:].contiguous() # (D * num_layers, N, Hout) + + # reset the RNN state in the middle of a sequence + if terminated is not None and torch.any(terminated): + rnn_outputs = [] + terminated = terminated.view(-1, self.sequence_length) + indexes = [0] + (terminated[:,:-1].any(dim=0).nonzero(as_tuple=True)[0] + 1).tolist() + 
[self.sequence_length] + + for i in range(len(indexes) - 1): + i0, i1 = indexes[i], indexes[i + 1] + rnn_output, hidden_states = self.gru(rnn_input[:,i0:i1,:], hidden_states) + hidden_states[:, (terminated[:,i1-1]), :] = 0 + rnn_outputs.append(rnn_output) + + rnn_output = torch.cat(rnn_outputs, dim=1) + # no need to reset the RNN state in the sequence + else: + rnn_output, hidden_states = self.gru(rnn_input, hidden_states) + + # flatten the RNN output + rnn_output = torch.flatten(rnn_output, start_dim=0, end_dim=1) # (N, L, D ∗ Hout) -> (N * L, D ∗ Hout) + + x = F.relu(self.linear_layer_1(torch.cat([rnn_output, inputs["taken_actions"]], dim=1))) + x = F.relu(self.linear_layer_2(x)) + + return self.linear_layer_3(x), {"rnn": [hidden_states]} + + +# Gym environment observation wrapper used to mask velocity. Adapted from rl_zoo3 (rl_zoo3/wrappers.py) +class NoVelocityWrapper(gym.ObservationWrapper): + def observation(self, observation): + # observation: x, y, angular velocity + return observation * np.array([1, 1, 0]) + +gym.envs.registration.register(id="PendulumNoVel-v1", entry_point=lambda: NoVelocityWrapper(gym.make("Pendulum-v1"))) + +# Load and wrap the Gym environment +env = gym.make("PendulumNoVel-v1") +env = wrap_env(env) + +device = env.device + + +# Instantiate a RandomMemory (without replacement) as experience replay memory +memory = RandomMemory(memory_size=20000, num_envs=env.num_envs, device=device, replacement=False) + + +# Instantiate the agent's models (function approximators). +# TD3 requires 6 models, visit its documentation for more details +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.td3.html#spaces-and-models +models_td3 = {} +models_td3["policy"] = Actor(env.observation_space, env.action_space, device, num_envs=env.num_envs) +models_td3["target_policy"] = Actor(env.observation_space, env.action_space, device, num_envs=env.num_envs) +models_td3["critic_1"] = Critic(env.observation_space, env.action_space, device, num_envs=env.num_envs) +models_td3["critic_2"] = Critic(env.observation_space, env.action_space, device, num_envs=env.num_envs) +models_td3["target_critic_1"] = Critic(env.observation_space, env.action_space, device, num_envs=env.num_envs) +models_td3["target_critic_2"] = Critic(env.observation_space, env.action_space, device, num_envs=env.num_envs) + +# Initialize the models' parameters (weights and biases) using a Gaussian distribution +for model in models_td3.values(): + model.init_parameters(method_name="normal_", mean=0.0, std=0.1) + + +# Configure and instantiate the agent. 
+# Only modify some of the default configuration, visit its documentation to see all the options +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.td3.html#configuration-and-hyperparameters +cfg_td3 = TD3_DEFAULT_CONFIG.copy() +cfg_td3["exploration"]["noise"] = GaussianNoise(0, 0.1, device=device) +cfg_td3["smooth_regularization_noise"] = GaussianNoise(0, 0.2, device=device) +cfg_td3["smooth_regularization_clip"] = 0.5 +cfg_td3["discount_factor"] = 0.98 +cfg_td3["batch_size"] = 100 +cfg_td3["random_timesteps"] = 0 +cfg_td3["learning_starts"] = 1000 +# logging to TensorBoard and write checkpoints each 75 and 750 timesteps respectively +cfg_td3["experiment"]["write_interval"] = 75 +cfg_td3["experiment"]["checkpoint_interval"] = 750 + +agent_td3 = TD3(models=models_td3, + memory=memory, + cfg=cfg_td3, + observation_space=env.observation_space, + action_space=env.action_space, + device=device) + + +# Configure and instantiate the RL trainer +cfg_trainer = {"timesteps": 15000, "headless": True} +trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent_td3) + +# start training +trainer.train() diff --git a/docs/source/examples/gym/td3_gym_pendulumnovel_lstm.py b/docs/source/examples/gym/td3_gym_pendulumnovel_lstm.py new file mode 100644 index 00000000..d2f79407 --- /dev/null +++ b/docs/source/examples/gym/td3_gym_pendulumnovel_lstm.py @@ -0,0 +1,225 @@ +import gym + +import torch +import torch.nn as nn +import torch.nn.functional as F +import numpy as np + +# Import the skrl components to build the RL system +from skrl.models.torch import Model, DeterministicMixin +from skrl.memories.torch import RandomMemory +from skrl.agents.torch.td3 import TD3, TD3_DEFAULT_CONFIG +from skrl.resources.noises.torch import GaussianNoise +from skrl.trainers.torch import SequentialTrainer +from skrl.envs.torch import wrap_env + + +# Define the models (deterministic models) for the TD3 agent using mixin +# - Actor (policy): takes as input the environment's observation/state and returns an action +# - Critic: takes the state and action as input and provides a value to guide the policy +class Actor(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + num_envs=1, num_layers=1, hidden_size=400, sequence_length=20): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) + + self.num_envs = num_envs + self.num_layers = num_layers + self.hidden_size = hidden_size # Hcell (Hout is Hcell because proj_size = 0) + self.sequence_length = sequence_length + + self.lstm = nn.LSTM(input_size=self.num_observations, + hidden_size=self.hidden_size, + num_layers=self.num_layers, + batch_first=True) # batch_first -> (batch, sequence, features) + + self.linear_layer_1 = nn.Linear(self.hidden_size, 400) + self.linear_layer_2 = nn.Linear(400, 300) + self.action_layer = nn.Linear(300, self.num_actions) + + def get_specification(self): + # batch size (N) is the number of envs + return {"rnn": {"sequence_length": self.sequence_length, + "sizes": [(self.num_layers, self.num_envs, self.hidden_size), # hidden states (D ∗ num_layers, N, Hout) + (self.num_layers, self.num_envs, self.hidden_size)]}} # cell states (D ∗ num_layers, N, Hcell) + + def compute(self, inputs, role): + states = inputs["states"] + terminated = inputs.get("terminated", None) + hidden_states, cell_states = inputs["rnn"][0], inputs["rnn"][1] + + # training + if self.training: + rnn_input = states.view(-1, self.sequence_length, 
states.shape[-1]) # (N, L, Hin): N=batch_size, L=sequence_length + hidden_states = hidden_states.view(self.num_layers, -1, self.sequence_length, hidden_states.shape[-1]) # (D * num_layers, N, L, Hout) + cell_states = cell_states.view(self.num_layers, -1, self.sequence_length, cell_states.shape[-1]) # (D * num_layers, N, L, Hcell) + # get the hidden/cell states corresponding to the initial sequence + sequence_index = 1 if role == "target_policy" else 0 # target networks act on the next state of the environment + hidden_states = hidden_states[:,:,sequence_index,:].contiguous() # (D * num_layers, N, Hout) + cell_states = cell_states[:,:,sequence_index,:].contiguous() # (D * num_layers, N, Hcell) + + # reset the RNN state in the middle of a sequence + if terminated is not None and torch.any(terminated): + rnn_outputs = [] + terminated = terminated.view(-1, self.sequence_length) + indexes = [0] + (terminated[:,:-1].any(dim=0).nonzero(as_tuple=True)[0] + 1).tolist() + [self.sequence_length] + + for i in range(len(indexes) - 1): + i0, i1 = indexes[i], indexes[i + 1] + rnn_output, (hidden_states, cell_states) = self.lstm(rnn_input[:,i0:i1,:], (hidden_states, cell_states)) + hidden_states[:, (terminated[:,i1-1]), :] = 0 + cell_states[:, (terminated[:,i1-1]), :] = 0 + rnn_outputs.append(rnn_output) + + rnn_states = (hidden_states, cell_states) + rnn_output = torch.cat(rnn_outputs, dim=1) + # no need to reset the RNN state in the sequence + else: + rnn_output, rnn_states = self.lstm(rnn_input, (hidden_states, cell_states)) + # rollout + else: + rnn_input = states.view(-1, 1, states.shape[-1]) # (N, L, Hin): N=num_envs, L=1 + rnn_output, rnn_states = self.lstm(rnn_input, (hidden_states, cell_states)) + + # flatten the RNN output + rnn_output = torch.flatten(rnn_output, start_dim=0, end_dim=1) # (N, L, D ∗ Hout) -> (N * L, D ∗ Hout) + + x = F.relu(self.linear_layer_1(rnn_output)) + x = F.relu(self.linear_layer_2(x)) + + # Pendulum-v1 action_space is -2 to 2 + return 2 * torch.tanh(self.action_layer(x)), {"rnn": [rnn_states[0], rnn_states[1]]} + +class Critic(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + num_envs=1, num_layers=1, hidden_size=400, sequence_length=20): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) + + self.num_envs = num_envs + self.num_layers = num_layers + self.hidden_size = hidden_size # Hcell (Hout is Hcell because proj_size = 0) + self.sequence_length = sequence_length + + self.lstm = nn.LSTM(input_size=self.num_observations, + hidden_size=self.hidden_size, + num_layers=self.num_layers, + batch_first=True) # batch_first -> (batch, sequence, features) + + self.linear_layer_1 = nn.Linear(self.hidden_size + self.num_actions, 400) + self.linear_layer_2 = nn.Linear(400, 300) + self.linear_layer_3 = nn.Linear(300, 1) + + def get_specification(self): + # batch size (N) is the number of envs + return {"rnn": {"sequence_length": self.sequence_length, + "sizes": [(self.num_layers, self.num_envs, self.hidden_size), # hidden states (D ∗ num_layers, N, Hout) + (self.num_layers, self.num_envs, self.hidden_size)]}} # cell states (D ∗ num_layers, N, Hcell) + + def compute(self, inputs, role): + states = inputs["states"] + terminated = inputs.get("terminated", None) + hidden_states, cell_states = inputs["rnn"][0], inputs["rnn"][1] + + # critic is only used during training + rnn_input = states.view(-1, self.sequence_length, states.shape[-1]) # (N, L, Hin): N=batch_size, 
L=sequence_length + + hidden_states = hidden_states.view(self.num_layers, -1, self.sequence_length, hidden_states.shape[-1]) # (D * num_layers, N, L, Hout) + cell_states = cell_states.view(self.num_layers, -1, self.sequence_length, cell_states.shape[-1]) # (D * num_layers, N, L, Hcell) + # get the hidden/cell states corresponding to the initial sequence + sequence_index = 1 if role in ["target_critic_1", "target_critic_2"] else 0 # target networks act on the next state of the environment + hidden_states = hidden_states[:,:,sequence_index,:].contiguous() # (D * num_layers, N, Hout) + cell_states = cell_states[:,:,sequence_index,:].contiguous() # (D * num_layers, N, Hcell) + + # reset the RNN state in the middle of a sequence + if terminated is not None and torch.any(terminated): + rnn_outputs = [] + terminated = terminated.view(-1, self.sequence_length) + indexes = [0] + (terminated[:,:-1].any(dim=0).nonzero(as_tuple=True)[0] + 1).tolist() + [self.sequence_length] + + for i in range(len(indexes) - 1): + i0, i1 = indexes[i], indexes[i + 1] + rnn_output, (hidden_states, cell_states) = self.lstm(rnn_input[:,i0:i1,:], (hidden_states, cell_states)) + hidden_states[:, (terminated[:,i1-1]), :] = 0 + cell_states[:, (terminated[:,i1-1]), :] = 0 + rnn_outputs.append(rnn_output) + + rnn_states = (hidden_states, cell_states) + rnn_output = torch.cat(rnn_outputs, dim=1) + # no need to reset the RNN state in the sequence + else: + rnn_output, rnn_states = self.lstm(rnn_input, (hidden_states, cell_states)) + + # flatten the RNN output + rnn_output = torch.flatten(rnn_output, start_dim=0, end_dim=1) # (N, L, D ∗ Hout) -> (N * L, D ∗ Hout) + + x = F.relu(self.linear_layer_1(torch.cat([rnn_output, inputs["taken_actions"]], dim=1))) + x = F.relu(self.linear_layer_2(x)) + + return self.linear_layer_3(x), {"rnn": [rnn_states[0], rnn_states[1]]} + + +# Gym environment observation wrapper used to mask velocity. Adapted from rl_zoo3 (rl_zoo3/wrappers.py) +class NoVelocityWrapper(gym.ObservationWrapper): + def observation(self, observation): + # observation: x, y, angular velocity + return observation * np.array([1, 1, 0]) + +gym.envs.registration.register(id="PendulumNoVel-v1", entry_point=lambda: NoVelocityWrapper(gym.make("Pendulum-v1"))) + +# Load and wrap the Gym environment +env = gym.make("PendulumNoVel-v1") +env = wrap_env(env) + +device = env.device + + +# Instantiate a RandomMemory (without replacement) as experience replay memory +memory = RandomMemory(memory_size=20000, num_envs=env.num_envs, device=device, replacement=False) + + +# Instantiate the agent's models (function approximators). 
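# A minimal standalone sketch (plain PyTorch, toy sizes) of the tensor shapes the Actor
# above handles during a rollout step (L=1), where both a hidden and a cell state are
# carried between calls because the recurrent layer is an LSTM.
import torch
import torch.nn as nn

num_envs, num_layers, hidden_size, num_observations = 4, 1, 400, 3
lstm = nn.LSTM(input_size=num_observations, hidden_size=hidden_size,
               num_layers=num_layers, batch_first=True)
states = torch.randn(num_envs, num_observations)      # one observation per environment
rnn_input = states.view(-1, 1, states.shape[-1])      # (N, L, Hin) with L=1
h0 = torch.zeros(num_layers, num_envs, hidden_size)   # hidden states (D * num_layers, N, Hout)
c0 = torch.zeros(num_layers, num_envs, hidden_size)   # cell states (D * num_layers, N, Hcell)
rnn_output, (hn, cn) = lstm(rnn_input, (h0, c0))
print(rnn_output.shape, hn.shape, cn.shape)           # (4, 1, 400), (1, 4, 400), (1, 4, 400)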
+# TD3 requires 6 models, visit its documentation for more details +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.td3.html#spaces-and-models +models_td3 = {} +models_td3["policy"] = Actor(env.observation_space, env.action_space, device, num_envs=env.num_envs) +models_td3["target_policy"] = Actor(env.observation_space, env.action_space, device, num_envs=env.num_envs) +models_td3["critic_1"] = Critic(env.observation_space, env.action_space, device, num_envs=env.num_envs) +models_td3["critic_2"] = Critic(env.observation_space, env.action_space, device, num_envs=env.num_envs) +models_td3["target_critic_1"] = Critic(env.observation_space, env.action_space, device, num_envs=env.num_envs) +models_td3["target_critic_2"] = Critic(env.observation_space, env.action_space, device, num_envs=env.num_envs) + +# Initialize the models' parameters (weights and biases) using a Gaussian distribution +for model in models_td3.values(): + model.init_parameters(method_name="normal_", mean=0.0, std=0.1) + + +# Configure and instantiate the agent. +# Only modify some of the default configuration, visit its documentation to see all the options +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.td3.html#configuration-and-hyperparameters +cfg_td3 = TD3_DEFAULT_CONFIG.copy() +cfg_td3["exploration"]["noise"] = GaussianNoise(0, 0.1, device=device) +cfg_td3["smooth_regularization_noise"] = GaussianNoise(0, 0.2, device=device) +cfg_td3["smooth_regularization_clip"] = 0.5 +cfg_td3["discount_factor"] = 0.98 +cfg_td3["batch_size"] = 100 +cfg_td3["random_timesteps"] = 0 +cfg_td3["learning_starts"] = 1000 +# logging to TensorBoard and write checkpoints each 75 and 750 timesteps respectively +cfg_td3["experiment"]["write_interval"] = 75 +cfg_td3["experiment"]["checkpoint_interval"] = 750 + +agent_td3 = TD3(models=models_td3, + memory=memory, + cfg=cfg_td3, + observation_space=env.observation_space, + action_space=env.action_space, + device=device) + + +# Configure and instantiate the RL trainer +cfg_trainer = {"timesteps": 15000, "headless": True} +trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent_td3) + +# start training +trainer.train() diff --git a/docs/source/examples/gym/td3_gym_pendulumnovel_rnn.py b/docs/source/examples/gym/td3_gym_pendulumnovel_rnn.py new file mode 100644 index 00000000..4abb5b9c --- /dev/null +++ b/docs/source/examples/gym/td3_gym_pendulumnovel_rnn.py @@ -0,0 +1,215 @@ +import gym + +import torch +import torch.nn as nn +import torch.nn.functional as F +import numpy as np + +# Import the skrl components to build the RL system +from skrl.models.torch import Model, DeterministicMixin +from skrl.memories.torch import RandomMemory +from skrl.agents.torch.td3 import TD3, TD3_DEFAULT_CONFIG +from skrl.resources.noises.torch import GaussianNoise +from skrl.trainers.torch import SequentialTrainer +from skrl.envs.torch import wrap_env + + +# Define the models (deterministic models) for the TD3 agent using mixin +# - Actor (policy): takes as input the environment's observation/state and returns an action +# - Critic: takes the state and action as input and provides a value to guide the policy +class Actor(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + num_envs=1, num_layers=1, hidden_size=400, sequence_length=20): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) + + self.num_envs = num_envs + self.num_layers = num_layers + self.hidden_size = 
hidden_size # Hout + self.sequence_length = sequence_length + + self.rnn = nn.RNN(input_size=self.num_observations, + hidden_size=self.hidden_size, + num_layers=self.num_layers, + batch_first=True) # batch_first -> (batch, sequence, features) + + self.linear_layer_1 = nn.Linear(self.hidden_size, 400) + self.linear_layer_2 = nn.Linear(400, 300) + self.action_layer = nn.Linear(300, self.num_actions) + + def get_specification(self): + # batch size (N) is the number of envs + return {"rnn": {"sequence_length": self.sequence_length, + "sizes": [(self.num_layers, self.num_envs, self.hidden_size)]}} # hidden states (D ∗ num_layers, N, Hout) + + def compute(self, inputs, role): + states = inputs["states"] + terminated = inputs.get("terminated", None) + hidden_states = inputs["rnn"][0] + + # training + if self.training: + rnn_input = states.view(-1, self.sequence_length, states.shape[-1]) # (N, L, Hin): N=batch_size, L=sequence_length + hidden_states = hidden_states.view(self.num_layers, -1, self.sequence_length, hidden_states.shape[-1]) # (D * num_layers, N, L, Hout) + # get the hidden states corresponding to the initial sequence + sequence_index = 1 if role == "target_policy" else 0 # target networks act on the next state of the environment + hidden_states = hidden_states[:,:,sequence_index,:].contiguous() # (D * num_layers, N, Hout) + + # reset the RNN state in the middle of a sequence + if terminated is not None and torch.any(terminated): + rnn_outputs = [] + terminated = terminated.view(-1, self.sequence_length) + indexes = [0] + (terminated[:,:-1].any(dim=0).nonzero(as_tuple=True)[0] + 1).tolist() + [self.sequence_length] + + for i in range(len(indexes) - 1): + i0, i1 = indexes[i], indexes[i + 1] + rnn_output, hidden_states = self.rnn(rnn_input[:,i0:i1,:], hidden_states) + hidden_states[:, (terminated[:,i1-1]), :] = 0 + rnn_outputs.append(rnn_output) + + rnn_output = torch.cat(rnn_outputs, dim=1) + # no need to reset the RNN state in the sequence + else: + rnn_output, hidden_states = self.rnn(rnn_input, hidden_states) + # rollout + else: + rnn_input = states.view(-1, 1, states.shape[-1]) # (N, L, Hin): N=num_envs, L=1 + rnn_output, hidden_states = self.rnn(rnn_input, hidden_states) + + # flatten the RNN output + rnn_output = torch.flatten(rnn_output, start_dim=0, end_dim=1) # (N, L, D ∗ Hout) -> (N * L, D ∗ Hout) + + x = F.relu(self.linear_layer_1(rnn_output)) + x = F.relu(self.linear_layer_2(x)) + + # Pendulum-v1 action_space is -2 to 2 + return 2 * torch.tanh(self.action_layer(x)), {"rnn": [hidden_states]} + +class Critic(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + num_envs=1, num_layers=1, hidden_size=400, sequence_length=20): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) + + self.num_envs = num_envs + self.num_layers = num_layers + self.hidden_size = hidden_size # Hout + self.sequence_length = sequence_length + + self.rnn = nn.RNN(input_size=self.num_observations, + hidden_size=self.hidden_size, + num_layers=self.num_layers, + batch_first=True) # batch_first -> (batch, sequence, features) + + self.linear_layer_1 = nn.Linear(self.hidden_size + self.num_actions, 400) + self.linear_layer_2 = nn.Linear(400, 300) + self.linear_layer_3 = nn.Linear(300, 1) + + def get_specification(self): + # batch size (N) is the number of envs + return {"rnn": {"sequence_length": self.sequence_length, + "sizes": [(self.num_layers, self.num_envs, self.hidden_size)]}} # hidden 
states (D ∗ num_layers, N, Hout) + + def compute(self, inputs, role): + states = inputs["states"] + terminated = inputs.get("terminated", None) + hidden_states = inputs["rnn"][0] + + # critic is only used during training + rnn_input = states.view(-1, self.sequence_length, states.shape[-1]) # (N, L, Hin): N=batch_size, L=sequence_length + + hidden_states = hidden_states.view(self.num_layers, -1, self.sequence_length, hidden_states.shape[-1]) # (D * num_layers, N, L, Hout) + # get the hidden states corresponding to the initial sequence + sequence_index = 1 if role in ["target_critic_1", "target_critic_2"] else 0 # target networks act on the next state of the environment + hidden_states = hidden_states[:,:,sequence_index,:].contiguous() # (D * num_layers, N, Hout) + + # reset the RNN state in the middle of a sequence + if terminated is not None and torch.any(terminated): + rnn_outputs = [] + terminated = terminated.view(-1, self.sequence_length) + indexes = [0] + (terminated[:,:-1].any(dim=0).nonzero(as_tuple=True)[0] + 1).tolist() + [self.sequence_length] + + for i in range(len(indexes) - 1): + i0, i1 = indexes[i], indexes[i + 1] + rnn_output, hidden_states = self.rnn(rnn_input[:,i0:i1,:], hidden_states) + hidden_states[:, (terminated[:,i1-1]), :] = 0 + rnn_outputs.append(rnn_output) + + rnn_output = torch.cat(rnn_outputs, dim=1) + # no need to reset the RNN state in the sequence + else: + rnn_output, hidden_states = self.rnn(rnn_input, hidden_states) + + # flatten the RNN output + rnn_output = torch.flatten(rnn_output, start_dim=0, end_dim=1) # (N, L, D ∗ Hout) -> (N * L, D ∗ Hout) + + x = F.relu(self.linear_layer_1(torch.cat([rnn_output, inputs["taken_actions"]], dim=1))) + x = F.relu(self.linear_layer_2(x)) + + return self.linear_layer_3(x), {"rnn": [hidden_states]} + + +# Gym environment observation wrapper used to mask velocity. Adapted from rl_zoo3 (rl_zoo3/wrappers.py) +class NoVelocityWrapper(gym.ObservationWrapper): + def observation(self, observation): + # observation: x, y, angular velocity + return observation * np.array([1, 1, 0]) + +gym.envs.registration.register(id="PendulumNoVel-v1", entry_point=lambda: NoVelocityWrapper(gym.make("Pendulum-v1"))) + +# Load and wrap the Gym environment +env = gym.make("PendulumNoVel-v1") +env = wrap_env(env) + +device = env.device + + +# Instantiate a RandomMemory (without replacement) as experience replay memory +memory = RandomMemory(memory_size=20000, num_envs=env.num_envs, device=device, replacement=False) + + +# Instantiate the agent's models (function approximators). 
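# A rough sketch, with made-up sizes, of why compute() above selects sequence_index 1
# for the target networks: assuming the sampled hidden states arrive flattened over
# (mini-batch x sequence_length), as the view in compute() suggests, index 0 along L is
# the state at the first step of each sequence and index 1 the state one step later,
# which is the starting point for networks that act on the next states.
import torch

num_layers, mini_batch, sequence_length, hidden_size = 1, 2, 20, 400
sampled = torch.randn(num_layers, mini_batch * sequence_length, hidden_size)
per_step = sampled.view(num_layers, -1, sequence_length, hidden_size)  # (D * num_layers, N, L, Hout)
h0_policy = per_step[:, :, 0, :].contiguous()   # sequence_index = 0 (policy, critics)
h0_target = per_step[:, :, 1, :].contiguous()   # sequence_index = 1 (target networks)
print(h0_policy.shape, h0_target.shape)         # both (1, 2, 400)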
+# TD3 requires 6 models, visit its documentation for more details +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.td3.html#spaces-and-models +models_td3 = {} +models_td3["policy"] = Actor(env.observation_space, env.action_space, device, num_envs=env.num_envs) +models_td3["target_policy"] = Actor(env.observation_space, env.action_space, device, num_envs=env.num_envs) +models_td3["critic_1"] = Critic(env.observation_space, env.action_space, device, num_envs=env.num_envs) +models_td3["critic_2"] = Critic(env.observation_space, env.action_space, device, num_envs=env.num_envs) +models_td3["target_critic_1"] = Critic(env.observation_space, env.action_space, device, num_envs=env.num_envs) +models_td3["target_critic_2"] = Critic(env.observation_space, env.action_space, device, num_envs=env.num_envs) + +# Initialize the models' parameters (weights and biases) using a Gaussian distribution +for model in models_td3.values(): + model.init_parameters(method_name="normal_", mean=0.0, std=0.1) + + +# Configure and instantiate the agent. +# Only modify some of the default configuration, visit its documentation to see all the options +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.td3.html#configuration-and-hyperparameters +cfg_td3 = TD3_DEFAULT_CONFIG.copy() +cfg_td3["exploration"]["noise"] = GaussianNoise(0, 0.1, device=device) +cfg_td3["smooth_regularization_noise"] = GaussianNoise(0, 0.2, device=device) +cfg_td3["smooth_regularization_clip"] = 0.5 +cfg_td3["discount_factor"] = 0.98 +cfg_td3["batch_size"] = 100 +cfg_td3["random_timesteps"] = 0 +cfg_td3["learning_starts"] = 1000 +# logging to TensorBoard and write checkpoints each 75 and 750 timesteps respectively +cfg_td3["experiment"]["write_interval"] = 75 +cfg_td3["experiment"]["checkpoint_interval"] = 750 + +agent_td3 = TD3(models=models_td3, + memory=memory, + cfg=cfg_td3, + observation_space=env.observation_space, + action_space=env.action_space, + device=device) + + +# Configure and instantiate the RL trainer +cfg_trainer = {"timesteps": 15000, "headless": True} +trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent_td3) + +# start training +trainer.train() diff --git a/docs/source/examples/gym/trpo_gym_pendulum.py b/docs/source/examples/gym/trpo_gym_pendulum.py new file mode 100644 index 00000000..3595d38a --- /dev/null +++ b/docs/source/examples/gym/trpo_gym_pendulum.py @@ -0,0 +1,107 @@ +import gym + +import torch +import torch.nn as nn + +# Import the skrl components to build the RL system +from skrl.models.torch import Model, GaussianMixin, DeterministicMixin +from skrl.memories.torch import RandomMemory +from skrl.agents.torch.trpo import TRPO, TRPO_DEFAULT_CONFIG +from skrl.trainers.torch import SequentialTrainer +from skrl.resources.preprocessors.torch import RunningStandardScaler +from skrl.envs.torch import wrap_env + + +# Define the models (stochastic and deterministic models) for the agent using mixins. 
+# - Policy: takes as input the environment's observation/state and returns an action +# - Value: takes the state as input and provides a value to guide the policy +class Policy(GaussianMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"): + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction) + + self.net = nn.Sequential(nn.Linear(self.num_observations, 64), + nn.ReLU(), + nn.Linear(64, 64), + nn.ReLU(), + nn.Linear(64, self.num_actions)) + self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + def compute(self, inputs, role): + # Pendulum-v1 action_space is -2 to 2 + return 2 * torch.tanh(self.net(inputs["states"])), self.log_std_parameter, {} + +class Value(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) + + self.net = nn.Sequential(nn.Linear(self.num_observations, 64), + nn.ReLU(), + nn.Linear(64, 64), + nn.ReLU(), + nn.Linear(64, 1)) + + def compute(self, inputs, role): + return self.net(inputs["states"]), {} + + +# Load and wrap the Gym environment. +# Note: the environment version may change depending on the gym version +try: + env = gym.vector.make("Pendulum-v1", num_envs=4, asynchronous=False) +except gym.error.DeprecatedEnv as e: + env_id = [spec.id for spec in gym.envs.registry.all() if spec.id.startswith("Pendulum-v")][0] + print("Pendulum-v1 not found. Trying {}".format(env_id)) + env = gym.vector.make(env_id, num_envs=4, asynchronous=False) +env = wrap_env(env) + +device = env.device + + +# Instantiate a RandomMemory as rollout buffer (any memory can be used for this) +memory = RandomMemory(memory_size=1024, num_envs=env.num_envs, device=device) + + +# Instantiate the agent's models (function approximators). +# TRPO requires 2 models, visit its documentation for more details +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.trpo.html#spaces-and-models +models_trpo = {} +models_trpo["policy"] = Policy(env.observation_space, env.action_space, device, clip_actions=True) +models_trpo["value"] = Value(env.observation_space, env.action_space, device) + + +# Configure and instantiate the agent. 
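# A rough sketch in plain PyTorch (not the skrl API) of what the GaussianMixin does,
# conceptually, with the values returned by Policy.compute() above: the network output
# is used as the mean, log_std_parameter provides a state-independent standard
# deviation, and the action and its log-probability come from the resulting Normal
# distribution (log-probabilities summed over the action dimension, reduction="sum").
import torch

mean = torch.tensor([[0.5]])              # 2 * tanh(net(states)) for a single state (made-up value)
log_std = torch.zeros(1)                  # log_std_parameter
dist = torch.distributions.Normal(mean, log_std.exp())
action = dist.sample()
log_prob = dist.log_prob(action).sum(-1)
print(action.shape, log_prob.shape)       # (1, 1) and (1,)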
+# Only modify some of the default configuration, visit its documentation to see all the options +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.trpo.html#configuration-and-hyperparameters +cfg_trpo = TRPO_DEFAULT_CONFIG.copy() +cfg_trpo["rollouts"] = 1024 # memory_size +cfg_trpo["learning_epochs"] = 10 +cfg_trpo["mini_batches"] = 32 +cfg_trpo["discount_factor"] = 0.99 +cfg_trpo["lambda"] = 0.95 +cfg_trpo["learning_rate"] = 1e-3 +cfg_trpo["grad_norm_clip"] = 0.5 +cfg_trpo["state_preprocessor"] = RunningStandardScaler +cfg_trpo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device} +cfg_trpo["value_preprocessor"] = RunningStandardScaler +cfg_trpo["value_preprocessor_kwargs"] = {"size": 1, "device": device} +# logging to TensorBoard and write checkpoints each 500 and 5000 timesteps respectively +cfg_trpo["experiment"]["write_interval"] = 500 +cfg_trpo["experiment"]["checkpoint_interval"] = 5000 + +agent_trpo = TRPO(models=models_trpo, + memory=memory, + cfg=cfg_trpo, + observation_space=env.observation_space, + action_space=env.action_space, + device=device) + + +# Configure and instantiate the RL trainer +cfg_trainer = {"timesteps": 100000, "headless": True} +trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent_trpo) + +# start training +trainer.train() diff --git a/docs/source/examples/gym/trpo_gym_pendulumnovel.py b/docs/source/examples/gym/trpo_gym_pendulumnovel.py new file mode 100644 index 00000000..28ed1005 --- /dev/null +++ b/docs/source/examples/gym/trpo_gym_pendulumnovel.py @@ -0,0 +1,110 @@ +import gym + +import torch +import torch.nn as nn +import numpy as np + +# Import the skrl components to build the RL system +from skrl.models.torch import Model, GaussianMixin, DeterministicMixin +from skrl.memories.torch import RandomMemory +from skrl.agents.torch.trpo import TRPO, TRPO_DEFAULT_CONFIG +from skrl.trainers.torch import SequentialTrainer +from skrl.resources.preprocessors.torch import RunningStandardScaler +from skrl.envs.torch import wrap_env + + +# Define the models (stochastic and deterministic models) for the agent using mixins. +# - Policy: takes as input the environment's observation/state and returns an action +# - Value: takes the state as input and provides a value to guide the policy +class Policy(GaussianMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"): + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction) + + self.net = nn.Sequential(nn.Linear(self.num_observations, 64), + nn.ReLU(), + nn.Linear(64, 64), + nn.ReLU(), + nn.Linear(64, self.num_actions)) + self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + def compute(self, inputs, role): + # Pendulum-v1 action_space is -2 to 2 + return 2 * torch.tanh(self.net(inputs["states"])), self.log_std_parameter, {} + +class Value(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) + + self.net = nn.Sequential(nn.Linear(self.num_observations, 64), + nn.ReLU(), + nn.Linear(64, 64), + nn.ReLU(), + nn.Linear(64, 1)) + + def compute(self, inputs, role): + return self.net(inputs["states"]), {} + + +# Gym environment observation wrapper used to mask velocity. 
Adapted from rl_zoo3 (rl_zoo3/wrappers.py) +class NoVelocityWrapper(gym.ObservationWrapper): + def observation(self, observation): + # observation: x, y, angular velocity + return observation * np.array([1, 1, 0]) + +gym.envs.registration.register(id="PendulumNoVel-v1", entry_point=lambda: NoVelocityWrapper(gym.make("Pendulum-v1"))) + +# Load and wrap the Gym environment +env = gym.vector.make("PendulumNoVel-v1", num_envs=4, asynchronous=False) +env = wrap_env(env) + +device = env.device + + +# Instantiate a RandomMemory as rollout buffer (any memory can be used for this) +memory = RandomMemory(memory_size=1024, num_envs=env.num_envs, device=device) + + +# Instantiate the agent's models (function approximators). +# TRPO requires 2 models, visit its documentation for more details +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.trpo.html#spaces-and-models +models_trpo = {} +models_trpo["policy"] = Policy(env.observation_space, env.action_space, device, clip_actions=True) +models_trpo["value"] = Value(env.observation_space, env.action_space, device) + + +# Configure and instantiate the agent. +# Only modify some of the default configuration, visit its documentation to see all the options +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.trpo.html#configuration-and-hyperparameters +cfg_trpo = TRPO_DEFAULT_CONFIG.copy() +cfg_trpo["rollouts"] = 1024 # memory_size +cfg_trpo["learning_epochs"] = 10 +cfg_trpo["mini_batches"] = 32 +cfg_trpo["discount_factor"] = 0.99 +cfg_trpo["lambda"] = 0.95 +cfg_trpo["learning_rate"] = 1e-3 +cfg_trpo["grad_norm_clip"] = 0.5 +cfg_trpo["state_preprocessor"] = RunningStandardScaler +cfg_trpo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device} +cfg_trpo["value_preprocessor"] = RunningStandardScaler +cfg_trpo["value_preprocessor_kwargs"] = {"size": 1, "device": device} +# logging to TensorBoard and write checkpoints each 500 and 5000 timesteps respectively +cfg_trpo["experiment"]["write_interval"] = 500 +cfg_trpo["experiment"]["checkpoint_interval"] = 5000 + +agent_trpo = TRPO(models=models_trpo, + memory=memory, + cfg=cfg_trpo, + observation_space=env.observation_space, + action_space=env.action_space, + device=device) + + +# Configure and instantiate the RL trainer +cfg_trainer = {"timesteps": 100000, "headless": True} +trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent_trpo) + +# start training +trainer.train() diff --git a/docs/source/examples/gym/trpo_gym_pendulumnovel_gru.py b/docs/source/examples/gym/trpo_gym_pendulumnovel_gru.py new file mode 100644 index 00000000..9135d440 --- /dev/null +++ b/docs/source/examples/gym/trpo_gym_pendulumnovel_gru.py @@ -0,0 +1,209 @@ +import gym + +import torch +import torch.nn as nn +import numpy as np + +# Import the skrl components to build the RL system +from skrl.models.torch import Model, GaussianMixin, DeterministicMixin +from skrl.memories.torch import RandomMemory +from skrl.agents.torch.trpo import TRPO, TRPO_DEFAULT_CONFIG +from skrl.trainers.torch import SequentialTrainer +from skrl.resources.preprocessors.torch import RunningStandardScaler +from skrl.envs.torch import wrap_env + + +# Define the models (stochastic and deterministic models) for the agent using mixins. 
+# - Policy: takes as input the environment's observation/state and returns an action +# - Value: takes the state as input and provides a value to guide the policy +class Policy(GaussianMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum", + num_envs=1, num_layers=1, hidden_size=64, sequence_length=128): + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction) + + self.num_envs = num_envs + self.num_layers = num_layers + self.hidden_size = hidden_size # Hout + self.sequence_length = sequence_length + + self.gru = nn.GRU(input_size=self.num_observations, + hidden_size=self.hidden_size, + num_layers=self.num_layers, + batch_first=True) # batch_first -> (batch, sequence, features) + + self.net = nn.Sequential(nn.Linear(self.hidden_size, 64), + nn.ReLU(), + nn.Linear(64, self.num_actions)) + self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + def get_specification(self): + # batch size (N) is the number of envs + return {"rnn": {"sequence_length": self.sequence_length, + "sizes": [(self.num_layers, self.num_envs, self.hidden_size)]}} # hidden states (D ∗ num_layers, N, Hout) + + def compute(self, inputs, role): + states = inputs["states"] + terminated = inputs.get("terminated", None) + hidden_states = inputs["rnn"][0] + + # training + if self.training: + rnn_input = states.view(-1, self.sequence_length, states.shape[-1]) # (N, L, Hin): N=batch_size, L=sequence_length + hidden_states = hidden_states.view(self.num_layers, -1, self.sequence_length, hidden_states.shape[-1]) # (D * num_layers, N, L, Hout) + # get the hidden states corresponding to the initial sequence + hidden_states = hidden_states[:,:,0,:].contiguous() # (D * num_layers, N, Hout) + + # reset the RNN state in the middle of a sequence + if terminated is not None and torch.any(terminated): + rnn_outputs = [] + terminated = terminated.view(-1, self.sequence_length) + indexes = [0] + (terminated[:,:-1].any(dim=0).nonzero(as_tuple=True)[0] + 1).tolist() + [self.sequence_length] + + for i in range(len(indexes) - 1): + i0, i1 = indexes[i], indexes[i + 1] + rnn_output, hidden_states = self.gru(rnn_input[:,i0:i1,:], hidden_states) + hidden_states[:, (terminated[:,i1-1]), :] = 0 + rnn_outputs.append(rnn_output) + + rnn_output = torch.cat(rnn_outputs, dim=1) + # no need to reset the RNN state in the sequence + else: + rnn_output, hidden_states = self.gru(rnn_input, hidden_states) + # rollout + else: + rnn_input = states.view(-1, 1, states.shape[-1]) # (N, L, Hin): N=num_envs, L=1 + rnn_output, hidden_states = self.gru(rnn_input, hidden_states) + + # flatten the RNN output + rnn_output = torch.flatten(rnn_output, start_dim=0, end_dim=1) # (N, L, D ∗ Hout) -> (N * L, D ∗ Hout) + + # Pendulum-v1 action_space is -2 to 2 + return 2 * torch.tanh(self.net(rnn_output)), self.log_std_parameter, {"rnn": [hidden_states]} + +class Value(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + num_envs=1, num_layers=1, hidden_size=64, sequence_length=128): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) + + self.num_envs = num_envs + self.num_layers = num_layers + self.hidden_size = hidden_size # Hout + self.sequence_length = sequence_length + + self.gru = nn.GRU(input_size=self.num_observations, + 
hidden_size=self.hidden_size, + num_layers=self.num_layers, + batch_first=True) # batch_first -> (batch, sequence, features) + + self.net = nn.Sequential(nn.Linear(self.hidden_size, 64), + nn.ReLU(), + nn.Linear(64, 1)) + + def get_specification(self): + # batch size (N) is the number of envs + return {"rnn": {"sequence_length": self.sequence_length, + "sizes": [(self.num_layers, self.num_envs, self.hidden_size)]}} # hidden states (D ∗ num_layers, N, Hout) + + def compute(self, inputs, role): + states = inputs["states"] + terminated = inputs.get("terminated", None) + hidden_states = inputs["rnn"][0] + + # training + if self.training: + rnn_input = states.view(-1, self.sequence_length, states.shape[-1]) # (N, L, Hin): N=batch_size, L=sequence_length + + hidden_states = hidden_states.view(self.num_layers, -1, self.sequence_length, hidden_states.shape[-1]) # (D * num_layers, N, L, Hout) + # get the hidden states corresponding to the initial sequence + hidden_states = hidden_states[:,:,0,:].contiguous() # (D * num_layers, N, Hout) + + # reset the RNN state in the middle of a sequence + if terminated is not None and torch.any(terminated): + rnn_outputs = [] + terminated = terminated.view(-1, self.sequence_length) + indexes = [0] + (terminated[:,:-1].any(dim=0).nonzero(as_tuple=True)[0] + 1).tolist() + [self.sequence_length] + + for i in range(len(indexes) - 1): + i0, i1 = indexes[i], indexes[i + 1] + rnn_output, hidden_states = self.gru(rnn_input[:,i0:i1,:], hidden_states) + hidden_states[:, (terminated[:,i1-1]), :] = 0 + rnn_outputs.append(rnn_output) + + rnn_output = torch.cat(rnn_outputs, dim=1) + # no need to reset the RNN state in the sequence + else: + rnn_output, hidden_states = self.gru(rnn_input, hidden_states) + # rollout + else: + rnn_input = states.view(-1, 1, states.shape[-1]) # (N, L, Hin): N=num_envs, L=1 + rnn_output, hidden_states = self.gru(rnn_input, hidden_states) + + # flatten the RNN output + rnn_output = torch.flatten(rnn_output, start_dim=0, end_dim=1) # (N, L, D ∗ Hout) -> (N * L, D ∗ Hout) + + return self.net(rnn_output), {"rnn": [hidden_states]} + + +# Gym environment observation wrapper used to mask velocity. Adapted from rl_zoo3 (rl_zoo3/wrappers.py) +class NoVelocityWrapper(gym.ObservationWrapper): + def observation(self, observation): + # observation: x, y, angular velocity + return observation * np.array([1, 1, 0]) + +gym.envs.registration.register(id="PendulumNoVel-v1", entry_point=lambda: NoVelocityWrapper(gym.make("Pendulum-v1"))) + +# Load and wrap the Gym environment +env = gym.vector.make("PendulumNoVel-v1", num_envs=4, asynchronous=False) +env = wrap_env(env) + +device = env.device + + +# Instantiate a RandomMemory as rollout buffer (any memory can be used for this) +memory = RandomMemory(memory_size=1024, num_envs=env.num_envs, device=device) + + +# Instantiate the agent's models (function approximators). +# TRPO requires 2 models, visit its documentation for more details +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.trpo.html#spaces-and-models +models_trpo = {} +models_trpo["policy"] = Policy(env.observation_space, env.action_space, device, clip_actions=True, num_envs=env.num_envs) +models_trpo["value"] = Value(env.observation_space, env.action_space, device, num_envs=env.num_envs) + + +# Configure and instantiate the agent. 
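# A quick standalone check, with made-up numbers, of what the NoVelocityWrapper defined
# above does to a Pendulum observation [cos(theta), sin(theta), angular velocity]: the
# velocity component is zeroed, which is what makes the task partially observable and
# motivates the recurrent (GRU) models in this example.
import numpy as np

observation = np.array([0.70, 0.71, -1.5])
print(observation * np.array([1, 1, 0]))  # [0.7, 0.71, -0.0] -> velocity hidden from the agent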
+# Only modify some of the default configuration, visit its documentation to see all the options +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.trpo.html#configuration-and-hyperparameters +cfg_trpo = TRPO_DEFAULT_CONFIG.copy() +cfg_trpo["rollouts"] = 1024 # memory_size +cfg_trpo["learning_epochs"] = 10 +cfg_trpo["mini_batches"] = 32 +cfg_trpo["discount_factor"] = 0.9 +cfg_trpo["lambda"] = 0.95 +cfg_trpo["learning_rate"] = 1e-3 +cfg_trpo["grad_norm_clip"] = 0.5 +cfg_trpo["state_preprocessor"] = RunningStandardScaler +cfg_trpo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device} +cfg_trpo["value_preprocessor"] = RunningStandardScaler +cfg_trpo["value_preprocessor_kwargs"] = {"size": 1, "device": device} +# logging to TensorBoard and write checkpoints each 500 and 5000 timesteps respectively +cfg_trpo["experiment"]["write_interval"] = 500 +cfg_trpo["experiment"]["checkpoint_interval"] = 5000 + +agent_trpo = TRPO(models=models_trpo, + memory=memory, + cfg=cfg_trpo, + observation_space=env.observation_space, + action_space=env.action_space, + device=device) + + +# Configure and instantiate the RL trainer +cfg_trainer = {"timesteps": 100000, "headless": True} +trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent_trpo) + +# start training +trainer.train() diff --git a/docs/source/examples/gym/trpo_gym_pendulumnovel_lstm.py b/docs/source/examples/gym/trpo_gym_pendulumnovel_lstm.py new file mode 100644 index 00000000..120fec50 --- /dev/null +++ b/docs/source/examples/gym/trpo_gym_pendulumnovel_lstm.py @@ -0,0 +1,219 @@ +import gym + +import torch +import torch.nn as nn +import numpy as np + +# Import the skrl components to build the RL system +from skrl.models.torch import Model, GaussianMixin, DeterministicMixin +from skrl.memories.torch import RandomMemory +from skrl.agents.torch.trpo import TRPO, TRPO_DEFAULT_CONFIG +from skrl.trainers.torch import SequentialTrainer +from skrl.resources.preprocessors.torch import RunningStandardScaler +from skrl.envs.torch import wrap_env + + +# Define the models (stochastic and deterministic models) for the agent using mixins. 
+# - Policy: takes as input the environment's observation/state and returns an action +# - Value: takes the state as input and provides a value to guide the policy +class Policy(GaussianMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum", + num_envs=1, num_layers=1, hidden_size=64, sequence_length=128): + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction) + + self.num_envs = num_envs + self.num_layers = num_layers + self.hidden_size = hidden_size # Hcell (Hout is Hcell because proj_size = 0) + self.sequence_length = sequence_length + + self.lstm = nn.LSTM(input_size=self.num_observations, + hidden_size=self.hidden_size, + num_layers=self.num_layers, + batch_first=True) # batch_first -> (batch, sequence, features) + + self.net = nn.Sequential(nn.Linear(self.hidden_size, 64), + nn.ReLU(), + nn.Linear(64, self.num_actions)) + self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + def get_specification(self): + # batch size (N) is the number of envs + return {"rnn": {"sequence_length": self.sequence_length, + "sizes": [(self.num_layers, self.num_envs, self.hidden_size), # hidden states (D ∗ num_layers, N, Hout) + (self.num_layers, self.num_envs, self.hidden_size)]}} # cell states (D ∗ num_layers, N, Hcell) + + def compute(self, inputs, role): + states = inputs["states"] + terminated = inputs.get("terminated", None) + hidden_states, cell_states = inputs["rnn"][0], inputs["rnn"][1] + + # training + if self.training: + rnn_input = states.view(-1, self.sequence_length, states.shape[-1]) # (N, L, Hin): N=batch_size, L=sequence_length + hidden_states = hidden_states.view(self.num_layers, -1, self.sequence_length, hidden_states.shape[-1]) # (D * num_layers, N, L, Hout) + cell_states = cell_states.view(self.num_layers, -1, self.sequence_length, cell_states.shape[-1]) # (D * num_layers, N, L, Hcell) + # get the hidden/cell states corresponding to the initial sequence + hidden_states = hidden_states[:,:,0,:].contiguous() # (D * num_layers, N, Hout) + cell_states = cell_states[:,:,0,:].contiguous() # (D * num_layers, N, Hcell) + + # reset the RNN state in the middle of a sequence + if terminated is not None and torch.any(terminated): + rnn_outputs = [] + terminated = terminated.view(-1, self.sequence_length) + indexes = [0] + (terminated[:,:-1].any(dim=0).nonzero(as_tuple=True)[0] + 1).tolist() + [self.sequence_length] + + for i in range(len(indexes) - 1): + i0, i1 = indexes[i], indexes[i + 1] + rnn_output, (hidden_states, cell_states) = self.lstm(rnn_input[:,i0:i1,:], (hidden_states, cell_states)) + hidden_states[:, (terminated[:,i1-1]), :] = 0 + cell_states[:, (terminated[:,i1-1]), :] = 0 + rnn_outputs.append(rnn_output) + + rnn_states = (hidden_states, cell_states) + rnn_output = torch.cat(rnn_outputs, dim=1) + # no need to reset the RNN state in the sequence + else: + rnn_output, rnn_states = self.lstm(rnn_input, (hidden_states, cell_states)) + # rollout + else: + rnn_input = states.view(-1, 1, states.shape[-1]) # (N, L, Hin): N=num_envs, L=1 + rnn_output, rnn_states = self.lstm(rnn_input, (hidden_states, cell_states)) + + # flatten the RNN output + rnn_output = torch.flatten(rnn_output, start_dim=0, end_dim=1) # (N, L, D ∗ Hout) -> (N * L, D ∗ Hout) + + # Pendulum-v1 action_space is -2 to 2 + return 2 * torch.tanh(self.net(rnn_output)), self.log_std_parameter, 
{"rnn": [rnn_states[0], rnn_states[1]]} + +class Value(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + num_envs=1, num_layers=1, hidden_size=64, sequence_length=128): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) + + self.num_envs = num_envs + self.num_layers = num_layers + self.hidden_size = hidden_size # Hcell (Hout is Hcell because proj_size = 0) + self.sequence_length = sequence_length + + self.lstm = nn.LSTM(input_size=self.num_observations, + hidden_size=self.hidden_size, + num_layers=self.num_layers, + batch_first=True) # batch_first -> (batch, sequence, features) + + self.net = nn.Sequential(nn.Linear(self.hidden_size, 64), + nn.ReLU(), + nn.Linear(64, 1)) + + def get_specification(self): + # batch size (N) is the number of envs + return {"rnn": {"sequence_length": self.sequence_length, + "sizes": [(self.num_layers, self.num_envs, self.hidden_size), # hidden states (D ∗ num_layers, N, Hout) + (self.num_layers, self.num_envs, self.hidden_size)]}} # cell states (D ∗ num_layers, N, Hcell) + + def compute(self, inputs, role): + states = inputs["states"] + terminated = inputs.get("terminated", None) + hidden_states, cell_states = inputs["rnn"][0], inputs["rnn"][1] + + # training + if self.training: + rnn_input = states.view(-1, self.sequence_length, states.shape[-1]) # (N, L, Hin): N=batch_size, L=sequence_length + + hidden_states = hidden_states.view(self.num_layers, -1, self.sequence_length, hidden_states.shape[-1]) # (D * num_layers, N, L, Hout) + cell_states = cell_states.view(self.num_layers, -1, self.sequence_length, cell_states.shape[-1]) # (D * num_layers, N, L, Hcell) + # get the hidden/cell states corresponding to the initial sequence + hidden_states = hidden_states[:,:,0,:].contiguous() # (D * num_layers, N, Hout) + cell_states = cell_states[:,:,0,:].contiguous() # (D * num_layers, N, Hcell) + + # reset the RNN state in the middle of a sequence + if terminated is not None and torch.any(terminated): + rnn_outputs = [] + terminated = terminated.view(-1, self.sequence_length) + indexes = [0] + (terminated[:,:-1].any(dim=0).nonzero(as_tuple=True)[0] + 1).tolist() + [self.sequence_length] + + for i in range(len(indexes) - 1): + i0, i1 = indexes[i], indexes[i + 1] + rnn_output, (hidden_states, cell_states) = self.lstm(rnn_input[:,i0:i1,:], (hidden_states, cell_states)) + hidden_states[:, (terminated[:,i1-1]), :] = 0 + cell_states[:, (terminated[:,i1-1]), :] = 0 + rnn_outputs.append(rnn_output) + + rnn_states = (hidden_states, cell_states) + rnn_output = torch.cat(rnn_outputs, dim=1) + # no need to reset the RNN state in the sequence + else: + rnn_output, rnn_states = self.lstm(rnn_input, (hidden_states, cell_states)) + # rollout + else: + rnn_input = states.view(-1, 1, states.shape[-1]) # (N, L, Hin): N=num_envs, L=1 + rnn_output, rnn_states = self.lstm(rnn_input, (hidden_states, cell_states)) + + # flatten the RNN output + rnn_output = torch.flatten(rnn_output, start_dim=0, end_dim=1) # (N, L, D ∗ Hout) -> (N * L, D ∗ Hout) + + return self.net(rnn_output), {"rnn": [rnn_states[0], rnn_states[1]]} + + +# Gym environment observation wrapper used to mask velocity. 
Adapted from rl_zoo3 (rl_zoo3/wrappers.py) +class NoVelocityWrapper(gym.ObservationWrapper): + def observation(self, observation): + # observation: x, y, angular velocity + return observation * np.array([1, 1, 0]) + +gym.envs.registration.register(id="PendulumNoVel-v1", entry_point=lambda: NoVelocityWrapper(gym.make("Pendulum-v1"))) + +# Load and wrap the Gym environment +env = gym.vector.make("PendulumNoVel-v1", num_envs=4, asynchronous=False) +env = wrap_env(env) + +device = env.device + + +# Instantiate a RandomMemory as rollout buffer (any memory can be used for this) +memory = RandomMemory(memory_size=1024, num_envs=env.num_envs, device=device) + + +# Instantiate the agent's models (function approximators). +# TRPO requires 2 models, visit its documentation for more details +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.trpo.html#spaces-and-models +models_trpo = {} +models_trpo["policy"] = Policy(env.observation_space, env.action_space, device, clip_actions=True, num_envs=env.num_envs) +models_trpo["value"] = Value(env.observation_space, env.action_space, device, num_envs=env.num_envs) + + +# Configure and instantiate the agent. +# Only modify some of the default configuration, visit its documentation to see all the options +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.trpo.html#configuration-and-hyperparameters +cfg_trpo = TRPO_DEFAULT_CONFIG.copy() +cfg_trpo["rollouts"] = 1024 # memory_size +cfg_trpo["learning_epochs"] = 10 +cfg_trpo["mini_batches"] = 32 +cfg_trpo["discount_factor"] = 0.9 +cfg_trpo["lambda"] = 0.95 +cfg_trpo["learning_rate"] = 1e-3 +cfg_trpo["grad_norm_clip"] = 0.5 +cfg_trpo["state_preprocessor"] = RunningStandardScaler +cfg_trpo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device} +cfg_trpo["value_preprocessor"] = RunningStandardScaler +cfg_trpo["value_preprocessor_kwargs"] = {"size": 1, "device": device} +# logging to TensorBoard and write checkpoints each 500 and 5000 timesteps respectively +cfg_trpo["experiment"]["write_interval"] = 500 +cfg_trpo["experiment"]["checkpoint_interval"] = 5000 + +agent_trpo = TRPO(models=models_trpo, + memory=memory, + cfg=cfg_trpo, + observation_space=env.observation_space, + action_space=env.action_space, + device=device) + + +# Configure and instantiate the RL trainer +cfg_trainer = {"timesteps": 100000, "headless": True} +trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent_trpo) + +# start training +trainer.train() diff --git a/docs/source/examples/gym/trpo_gym_pendulumnovel_rnn.py b/docs/source/examples/gym/trpo_gym_pendulumnovel_rnn.py new file mode 100644 index 00000000..cb15682f --- /dev/null +++ b/docs/source/examples/gym/trpo_gym_pendulumnovel_rnn.py @@ -0,0 +1,209 @@ +import gym + +import torch +import torch.nn as nn +import numpy as np + +# Import the skrl components to build the RL system +from skrl.models.torch import Model, GaussianMixin, DeterministicMixin +from skrl.memories.torch import RandomMemory +from skrl.agents.torch.trpo import TRPO, TRPO_DEFAULT_CONFIG +from skrl.trainers.torch import SequentialTrainer +from skrl.resources.preprocessors.torch import RunningStandardScaler +from skrl.envs.torch import wrap_env + + +# Define the models (stochastic and deterministic models) for the agent using mixins. 
+# - Policy: takes as input the environment's observation/state and returns an action +# - Value: takes the state as input and provides a value to guide the policy +class Policy(GaussianMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum", + num_envs=1, num_layers=1, hidden_size=64, sequence_length=128): + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction) + + self.num_envs = num_envs + self.num_layers = num_layers + self.hidden_size = hidden_size # Hout + self.sequence_length = sequence_length + + self.rnn = nn.RNN(input_size=self.num_observations, + hidden_size=self.hidden_size, + num_layers=self.num_layers, + batch_first=True) # batch_first -> (batch, sequence, features) + + self.net = nn.Sequential(nn.Linear(self.hidden_size, 64), + nn.ReLU(), + nn.Linear(64, self.num_actions)) + self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + def get_specification(self): + # batch size (N) is the number of envs + return {"rnn": {"sequence_length": self.sequence_length, + "sizes": [(self.num_layers, self.num_envs, self.hidden_size)]}} # hidden states (D ∗ num_layers, N, Hout) + + def compute(self, inputs, role): + states = inputs["states"] + terminated = inputs.get("terminated", None) + hidden_states = inputs["rnn"][0] + + # training + if self.training: + rnn_input = states.view(-1, self.sequence_length, states.shape[-1]) # (N, L, Hin): N=batch_size, L=sequence_length + hidden_states = hidden_states.view(self.num_layers, -1, self.sequence_length, hidden_states.shape[-1]) # (D * num_layers, N, L, Hout) + # get the hidden states corresponding to the initial sequence + hidden_states = hidden_states[:,:,0,:].contiguous() # (D * num_layers, N, Hout) + + # reset the RNN state in the middle of a sequence + if terminated is not None and torch.any(terminated): + rnn_outputs = [] + terminated = terminated.view(-1, self.sequence_length) + indexes = [0] + (terminated[:,:-1].any(dim=0).nonzero(as_tuple=True)[0] + 1).tolist() + [self.sequence_length] + + for i in range(len(indexes) - 1): + i0, i1 = indexes[i], indexes[i + 1] + rnn_output, hidden_states = self.rnn(rnn_input[:,i0:i1,:], hidden_states) + hidden_states[:, (terminated[:,i1-1]), :] = 0 + rnn_outputs.append(rnn_output) + + rnn_output = torch.cat(rnn_outputs, dim=1) + # no need to reset the RNN state in the sequence + else: + rnn_output, hidden_states = self.rnn(rnn_input, hidden_states) + # rollout + else: + rnn_input = states.view(-1, 1, states.shape[-1]) # (N, L, Hin): N=num_envs, L=1 + rnn_output, hidden_states = self.rnn(rnn_input, hidden_states) + + # flatten the RNN output + rnn_output = torch.flatten(rnn_output, start_dim=0, end_dim=1) # (N, L, D ∗ Hout) -> (N * L, D ∗ Hout) + + # Pendulum-v1 action_space is -2 to 2 + return 2 * torch.tanh(self.net(rnn_output)), self.log_std_parameter, {"rnn": [hidden_states]} + +class Value(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + num_envs=1, num_layers=1, hidden_size=64, sequence_length=128): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) + + self.num_envs = num_envs + self.num_layers = num_layers + self.hidden_size = hidden_size # Hout + self.sequence_length = sequence_length + + self.rnn = nn.RNN(input_size=self.num_observations, + 
hidden_size=self.hidden_size, + num_layers=self.num_layers, + batch_first=True) # batch_first -> (batch, sequence, features) + + self.net = nn.Sequential(nn.Linear(self.hidden_size, 64), + nn.ReLU(), + nn.Linear(64, 1)) + + def get_specification(self): + # batch size (N) is the number of envs + return {"rnn": {"sequence_length": self.sequence_length, + "sizes": [(self.num_layers, self.num_envs, self.hidden_size)]}} # hidden states (D ∗ num_layers, N, Hout) + + def compute(self, inputs, role): + states = inputs["states"] + terminated = inputs.get("terminated", None) + hidden_states = inputs["rnn"][0] + + # training + if self.training: + rnn_input = states.view(-1, self.sequence_length, states.shape[-1]) # (N, L, Hin): N=batch_size, L=sequence_length + + hidden_states = hidden_states.view(self.num_layers, -1, self.sequence_length, hidden_states.shape[-1]) # (D * num_layers, N, L, Hout) + # get the hidden states corresponding to the initial sequence + hidden_states = hidden_states[:,:,0,:].contiguous() # (D * num_layers, N, Hout) + + # reset the RNN state in the middle of a sequence + if terminated is not None and torch.any(terminated): + rnn_outputs = [] + terminated = terminated.view(-1, self.sequence_length) + indexes = [0] + (terminated[:,:-1].any(dim=0).nonzero(as_tuple=True)[0] + 1).tolist() + [self.sequence_length] + + for i in range(len(indexes) - 1): + i0, i1 = indexes[i], indexes[i + 1] + rnn_output, hidden_states = self.rnn(rnn_input[:,i0:i1,:], hidden_states) + hidden_states[:, (terminated[:,i1-1]), :] = 0 + rnn_outputs.append(rnn_output) + + rnn_output = torch.cat(rnn_outputs, dim=1) + # no need to reset the RNN state in the sequence + else: + rnn_output, hidden_states = self.rnn(rnn_input, hidden_states) + # rollout + else: + rnn_input = states.view(-1, 1, states.shape[-1]) # (N, L, Hin): N=num_envs, L=1 + rnn_output, hidden_states = self.rnn(rnn_input, hidden_states) + + # flatten the RNN output + rnn_output = torch.flatten(rnn_output, start_dim=0, end_dim=1) # (N, L, D ∗ Hout) -> (N * L, D ∗ Hout) + + return self.net(rnn_output), {"rnn": [hidden_states]} + + +# Gym environment observation wrapper used to mask velocity. Adapted from rl_zoo3 (rl_zoo3/wrappers.py) +class NoVelocityWrapper(gym.ObservationWrapper): + def observation(self, observation): + # observation: x, y, angular velocity + return observation * np.array([1, 1, 0]) + +gym.envs.registration.register(id="PendulumNoVel-v1", entry_point=lambda: NoVelocityWrapper(gym.make("Pendulum-v1"))) + +# Load and wrap the Gym environment +env = gym.vector.make("PendulumNoVel-v1", num_envs=4, asynchronous=False) +env = wrap_env(env) + +device = env.device + + +# Instantiate a RandomMemory as rollout buffer (any memory can be used for this) +memory = RandomMemory(memory_size=1024, num_envs=env.num_envs, device=device) + + +# Instantiate the agent's models (function approximators). +# TRPO requires 2 models, visit its documentation for more details +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.trpo.html#spaces-and-models +models_trpo = {} +models_trpo["policy"] = Policy(env.observation_space, env.action_space, device, clip_actions=True, num_envs=env.num_envs) +models_trpo["value"] = Value(env.observation_space, env.action_space, device, num_envs=env.num_envs) + + +# Configure and instantiate the agent. 
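# A minimal sketch, with toy sizes, of how compute() above reshapes the flat batch of
# states into (N, L, Hin) for the RNN: during a rollout there is one step per
# environment (L=1), while during training the memory provides whole sequences.
import torch

sequence_length, num_envs, obs_dim = 128, 4, 3
rollout_states = torch.randn(num_envs, obs_dim)
print(rollout_states.view(-1, 1, obs_dim).shape)              # (4, 1, 3): N=num_envs, L=1
train_states = torch.randn(2 * sequence_length, obs_dim)
print(train_states.view(-1, sequence_length, obs_dim).shape)  # (2, 128, 3): N=batch, L=sequence_length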
+# Only modify some of the default configuration, visit its documentation to see all the options +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.trpo.html#configuration-and-hyperparameters +cfg_trpo = TRPO_DEFAULT_CONFIG.copy() +cfg_trpo["rollouts"] = 1024 # memory_size +cfg_trpo["learning_epochs"] = 10 +cfg_trpo["mini_batches"] = 32 +cfg_trpo["discount_factor"] = 0.9 +cfg_trpo["lambda"] = 0.95 +cfg_trpo["learning_rate"] = 1e-3 +cfg_trpo["grad_norm_clip"] = 0.5 +cfg_trpo["state_preprocessor"] = RunningStandardScaler +cfg_trpo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device} +cfg_trpo["value_preprocessor"] = RunningStandardScaler +cfg_trpo["value_preprocessor_kwargs"] = {"size": 1, "device": device} +# logging to TensorBoard and write checkpoints each 500 and 5000 timesteps respectively +cfg_trpo["experiment"]["write_interval"] = 500 +cfg_trpo["experiment"]["checkpoint_interval"] = 5000 + +agent_trpo = TRPO(models=models_trpo, + memory=memory, + cfg=cfg_trpo, + observation_space=env.observation_space, + action_space=env.action_space, + device=device) + + +# Configure and instantiate the RL trainer +cfg_trainer = {"timesteps": 100000, "headless": True} +trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent_trpo) + +# start training +trainer.train() diff --git a/docs/source/examples/gymnasium/cem_gymnasium_cartpole.py b/docs/source/examples/gymnasium/cem_gymnasium_cartpole.py new file mode 100644 index 00000000..64d1f872 --- /dev/null +++ b/docs/source/examples/gymnasium/cem_gymnasium_cartpole.py @@ -0,0 +1,82 @@ +import gymnasium as gym + +import torch.nn as nn +import torch.nn.functional as F + +# Import the skrl components to build the RL system +from skrl.models.torch import Model, CategoricalMixin +from skrl.memories.torch import RandomMemory +from skrl.agents.torch.cem import CEM, CEM_DEFAULT_CONFIG +from skrl.trainers.torch import SequentialTrainer +from skrl.envs.torch import wrap_env + + +# Define the model (categorical model) for the CEM agent using mixin +# - Policy: takes as input the environment's observation/state and returns an action +class Policy(CategoricalMixin, Model): + def __init__(self, observation_space, action_space, device, unnormalized_log_prob=True): + Model.__init__(self, observation_space, action_space, device) + CategoricalMixin.__init__(self, unnormalized_log_prob) + + self.linear_layer_1 = nn.Linear(self.num_observations, 64) + self.linear_layer_2 = nn.Linear(64, 64) + self.output_layer = nn.Linear(64, self.num_actions) + + def compute(self, inputs, role): + x = F.relu(self.linear_layer_1(inputs["states"])) + x = F.relu(self.linear_layer_2(x)) + return self.output_layer(x), {} + + +# Load and wrap the Gymnasium environment. +# Note: the environment version may change depending on the gymnasium version +try: + env = gym.make("CartPole-v1") +except (gym.error.DeprecatedEnv, gym.error.VersionNotFound) as e: + env_id = [spec for spec in gym.envs.registry if spec.startswith("CartPole-v")][0] + print("CartPole-v0 not found. Trying {}".format(env_id)) + env = gym.make(env_id) +env = wrap_env(env) + +device = env.device + + +# Instantiate a RandomMemory (without replacement) as experience replay memory +memory = RandomMemory(memory_size=1000, num_envs=env.num_envs, device=device, replacement=False) + + +# Instantiate the agent's model (function approximator). 
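+# (the categorical Policy defined above returns unnormalized log-probabilities (logits) over CartPole's discrete actions, as indicated by unnormalized_log_prob=True)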
+# CEM requires 1 model, visit its documentation for more details +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.cem.html#spaces-and-models +models_cem = {} +models_cem["policy"] = Policy(env.observation_space, env.action_space, device) + +# Initialize the models' parameters (weights and biases) using a Gaussian distribution +for model in models_cem.values(): + model.init_parameters(method_name="normal_", mean=0.0, std=0.1) + + +# Configure and instantiate the agent. +# Only modify some of the default configuration, visit its documentation to see all the options +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.cem.html#configuration-and-hyperparameters +cfg_cem = CEM_DEFAULT_CONFIG.copy() +cfg_cem["rollouts"] = 1000 +cfg_cem["learning_starts"] = 100 +# logging to TensorBoard and write checkpoints each 1000 and 5000 timesteps respectively +cfg_cem["experiment"]["write_interval"] = 1000 +cfg_cem["experiment"]["checkpoint_interval"] = 5000 + +agent_cem = CEM(models=models_cem, + memory=memory, + cfg=cfg_cem, + observation_space=env.observation_space, + action_space=env.action_space, + device=device) + + +# Configure and instantiate the RL trainer +cfg_trainer = {"timesteps": 100000, "headless": True} +trainer = SequentialTrainer(env=env, agents=[agent_cem], cfg=cfg_trainer) + +# start training +trainer.train() diff --git a/docs/source/examples/gymnasium/cem_gymnasium_cartpole_eval.py b/docs/source/examples/gymnasium/cem_gymnasium_cartpole_eval.py new file mode 100644 index 00000000..39cb9719 --- /dev/null +++ b/docs/source/examples/gymnasium/cem_gymnasium_cartpole_eval.py @@ -0,0 +1,75 @@ +import gymnasium as gym + +import torch.nn as nn +import torch.nn.functional as F + +# Import the skrl components to build the RL system +from skrl.models.torch import Model, CategoricalMixin +from skrl.agents.torch.cem import CEM, CEM_DEFAULT_CONFIG +from skrl.trainers.torch import SequentialTrainer +from skrl.envs.torch import wrap_env + + +# Define the model (categorical model) for the CEM agent using mixin +# - Policy: takes as input the environment's observation/state and returns an action +class Policy(CategoricalMixin, Model): + def __init__(self, observation_space, action_space, device, unnormalized_log_prob=True): + Model.__init__(self, observation_space, action_space, device) + CategoricalMixin.__init__(self, unnormalized_log_prob) + + self.linear_layer_1 = nn.Linear(self.num_observations, 64) + self.linear_layer_2 = nn.Linear(64, 64) + self.output_layer = nn.Linear(64, self.num_actions) + + def compute(self, inputs, role): + x = F.relu(self.linear_layer_1(inputs["states"])) + x = F.relu(self.linear_layer_2(x)) + return self.output_layer(x), {} + + +# Load and wrap the Gymnasium environment. +# Note: the environment version may change depending on the gymnasium version +try: + env = gym.make("CartPole-v1") +except (gym.error.DeprecatedEnv, gym.error.VersionNotFound) as e: + env_id = [spec for spec in gym.envs.registry if spec.startswith("CartPole-v")][0] + print("CartPole-v0 not found. Trying {}".format(env_id)) + env = gym.make(env_id) +env = wrap_env(env) + +device = env.device + + +# Instantiate the agent's model (function approximators). +# CEM requires 1 model, visit its documentation for more details +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.cem.html#spaces-and-models +models_cem = {} +models_cem["policy"] = Policy(env.observation_space, env.action_space, device) + + +# Configure and instantiate the agent. 
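+# (no memory is passed to the agent for evaluation, and checkpointing is disabled by setting checkpoint_interval to 0 below)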
+# Only modify some of the default configuration, visit its documentation to see all the options +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.cem.html#configuration-and-hyperparameters +cfg_cem = CEM_DEFAULT_CONFIG.copy() +cfg_cem["rollouts"] = 1000 +cfg_cem["learning_starts"] = 100 +# logging to TensorBoard each 1000 timesteps and ignore checkpoints +cfg_cem["experiment"]["write_interval"] = 1000 +cfg_cem["experiment"]["checkpoint_interval"] = 0 + +agent_cem = CEM(models=models_cem, + memory=None, + cfg=cfg_cem, + observation_space=env.observation_space, + action_space=env.action_space, + device=device) + +# load checkpoint +agent_cem.load("./runs/22-09-07_21-41-05-854385_CEM/checkpoints/best_agent.pt") + +# Configure and instantiate the RL trainer +cfg_trainer = {"timesteps": 100000, "headless": True} +trainer = SequentialTrainer(env=env, agents=[agent_cem], cfg=cfg_trainer) + +# evaluate the agent +trainer.eval() diff --git a/docs/source/examples/gym/gym_pendulum_ddpg.py b/docs/source/examples/gymnasium/ddpg_gymnasium_pendulum.py similarity index 81% rename from docs/source/examples/gym/gym_pendulum_ddpg.py rename to docs/source/examples/gymnasium/ddpg_gymnasium_pendulum.py index 7ac71623..41dd7c64 100644 --- a/docs/source/examples/gym/gym_pendulum_ddpg.py +++ b/docs/source/examples/gymnasium/ddpg_gymnasium_pendulum.py @@ -1,4 +1,4 @@ -import gym +import gymnasium as gym import torch import torch.nn as nn @@ -15,7 +15,7 @@ # Define the models (deterministic models) for the DDPG agent using mixin # - Actor (policy): takes as input the environment's observation/state and returns an action -# - Critic: takes the state and action as input and provides a value to guide the policy +# - Critic: takes the state and action as input and provides a value to guide the policy class DeterministicActor(DeterministicMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False): Model.__init__(self, observation_space, action_space, device) @@ -25,10 +25,10 @@ def __init__(self, observation_space, action_space, device, clip_actions=False): self.linear_layer_2 = nn.Linear(400, 300) self.action_layer = nn.Linear(300, self.num_actions) - def compute(self, states, taken_actions, role): - x = F.relu(self.linear_layer_1(states)) + def compute(self, inputs, role): + x = F.relu(self.linear_layer_1(inputs["states"])) x = F.relu(self.linear_layer_2(x)) - return 2 * torch.tanh(self.action_layer(x)) # Pendulum-v1 action_space is -2 to 2 + return 2 * torch.tanh(self.action_layer(x)), {} # Pendulum-v1 action_space is -2 to 2 class DeterministicCritic(DeterministicMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False): @@ -39,18 +39,18 @@ def __init__(self, observation_space, action_space, device, clip_actions=False): self.linear_layer_2 = nn.Linear(400, 300) self.linear_layer_3 = nn.Linear(300, 1) - def compute(self, states, taken_actions, role): - x = F.relu(self.linear_layer_1(torch.cat([states, taken_actions], dim=1))) + def compute(self, inputs, role): + x = F.relu(self.linear_layer_1(torch.cat([inputs["states"], inputs["taken_actions"]], dim=1))) x = F.relu(self.linear_layer_2(x)) - return self.linear_layer_3(x) + return self.linear_layer_3(x), {} -# Load and wrap the Gym environment. -# Note: the environment version may change depending on the gym version +# Load and wrap the Gymnasium environment. 
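+# (Gymnasium's registry is a dictionary keyed by environment id, hence the fallback below iterates gym.envs.registry directly instead of calling .all())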
+# Note: the environment version may change depending on the gymnasium version try: env = gym.make("Pendulum-v1") -except gym.error.DeprecatedEnv as e: - env_id = [spec.id for spec in gym.envs.registry.all() if spec.id.startswith("Pendulum-v")][0] +except (gym.error.DeprecatedEnv, gym.error.VersionNotFound) as e: + env_id = [spec for spec in gym.envs.registry if spec.startswith("Pendulum-v")][0] print("Pendulum-v1 not found. Trying {}".format(env_id)) env = gym.make(env_id) env = wrap_env(env) @@ -88,10 +88,10 @@ def compute(self, states, taken_actions, role): cfg_ddpg["experiment"]["write_interval"] = 300 cfg_ddpg["experiment"]["checkpoint_interval"] = 1500 -agent_ddpg = DDPG(models=models_ddpg, - memory=memory, - cfg=cfg_ddpg, - observation_space=env.observation_space, +agent_ddpg = DDPG(models=models_ddpg, + memory=memory, + cfg=cfg_ddpg, + observation_space=env.observation_space, action_space=env.action_space, device=device) diff --git a/docs/source/examples/gymnasium/ddpg_gymnasium_pendulum_eval.py b/docs/source/examples/gymnasium/ddpg_gymnasium_pendulum_eval.py new file mode 100644 index 00000000..12eac470 --- /dev/null +++ b/docs/source/examples/gymnasium/ddpg_gymnasium_pendulum_eval.py @@ -0,0 +1,75 @@ +import gymnasium as gym + +import torch +import torch.nn as nn +import torch.nn.functional as F + +# Import the skrl components to build the RL system +from skrl.models.torch import Model, DeterministicMixin +from skrl.agents.torch.ddpg import DDPG, DDPG_DEFAULT_CONFIG +from skrl.trainers.torch import SequentialTrainer +from skrl.envs.torch import wrap_env + + +# Define only the policy for evaluation +class DeterministicActor(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) + + self.linear_layer_1 = nn.Linear(self.num_observations, 400) + self.linear_layer_2 = nn.Linear(400, 300) + self.action_layer = nn.Linear(300, self.num_actions) + + def compute(self, inputs, role): + x = F.relu(self.linear_layer_1(inputs["states"])) + x = F.relu(self.linear_layer_2(x)) + return 2 * torch.tanh(self.action_layer(x)), {} # Pendulum-v1 action_space is -2 to 2 + + +# Load and wrap the Gymnasium environment. +# Note: the environment version may change depending on the gymnasium version +try: + env = gym.make("Pendulum-v1") +except (gym.error.DeprecatedEnv, gym.error.VersionNotFound) as e: + env_id = [spec for spec in gym.envs.registry if spec.startswith("Pendulum-v")][0] + print("Pendulum-v1 not found. Trying {}".format(env_id)) + env = gym.make(env_id) +env = wrap_env(env) + +device = env.device + + +# Instantiate the agent's policy. +# DDPG requires 4 models, visit its documentation for more details +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ddpg.html#spaces-and-models +models_ddpg = {} +models_ddpg["policy"] = DeterministicActor(env.observation_space, env.action_space, device) + + +# Configure and instantiate the agent. 
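+# (random_timesteps is set to 0 below so the loaded policy is used from the very first evaluation step, without initial random exploration)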
+# Only modify some of the default configuration, visit its documentation to see all the options +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ddpg.html#configuration-and-hyperparameters +cfg_ddpg = DDPG_DEFAULT_CONFIG.copy() +cfg_ddpg["random_timesteps"] = 0 +# logging to TensorBoard each 300 timesteps and ignore checkpoints +cfg_ddpg["experiment"]["write_interval"] = 300 +cfg_ddpg["experiment"]["checkpoint_interval"] = 0 + +agent_ddpg = DDPG(models=models_ddpg, + memory=None, + cfg=cfg_ddpg, + observation_space=env.observation_space, + action_space=env.action_space, + device=device) + +# load checkpoint +agent_ddpg.load("./runs/22-09-10_11-02-46-773796_DDPG/checkpoints/agent_15000.pt") + + +# Configure and instantiate the RL trainer +cfg_trainer = {"timesteps": 15000, "headless": True} +trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent_ddpg) + +# evaluate the agent +trainer.eval() diff --git a/docs/source/examples/gymnasium/ddpg_gymnasium_pendulum_vector.py b/docs/source/examples/gymnasium/ddpg_gymnasium_pendulum_vector.py new file mode 100644 index 00000000..72107b94 --- /dev/null +++ b/docs/source/examples/gymnasium/ddpg_gymnasium_pendulum_vector.py @@ -0,0 +1,104 @@ +import gymnasium as gym + +import torch +import torch.nn as nn +import torch.nn.functional as F + +# Import the skrl components to build the RL system +from skrl.models.torch import Model, DeterministicMixin +from skrl.memories.torch import RandomMemory +from skrl.agents.torch.ddpg import DDPG, DDPG_DEFAULT_CONFIG +from skrl.resources.noises.torch import OrnsteinUhlenbeckNoise +from skrl.trainers.torch import SequentialTrainer +from skrl.envs.torch import wrap_env + + +# Define the models (deterministic models) for the DDPG agent using mixin +# - Actor (policy): takes as input the environment's observation/state and returns an action +# - Critic: takes the state and action as input and provides a value to guide the policy +class DeterministicActor(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) + + self.linear_layer_1 = nn.Linear(self.num_observations, 400) + self.linear_layer_2 = nn.Linear(400, 300) + self.action_layer = nn.Linear(300, self.num_actions) + + def compute(self, inputs, role): + x = F.relu(self.linear_layer_1(inputs["states"])) + x = F.relu(self.linear_layer_2(x)) + return 2 * torch.tanh(self.action_layer(x)), {} # Pendulum-v1 action_space is -2 to 2 + +class DeterministicCritic(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) + + self.linear_layer_1 = nn.Linear(self.num_observations + self.num_actions, 400) + self.linear_layer_2 = nn.Linear(400, 300) + self.linear_layer_3 = nn.Linear(300, 1) + + def compute(self, inputs, role): + x = F.relu(self.linear_layer_1(torch.cat([inputs["states"], inputs["taken_actions"]], dim=1))) + x = F.relu(self.linear_layer_2(x)) + return self.linear_layer_3(x), {} + + +# Load and wrap the Gymnasium environment. 
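+# (gym.vector.make creates 10 synchronous copies of the environment; the wrapper exposes num_envs, which is used to size the replay memory below)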
+# Note: the environment version may change depending on the gymnasium version +try: + env = gym.vector.make("Pendulum-v1", num_envs=10, asynchronous=False) +except (gym.error.DeprecatedEnv, gym.error.VersionNotFound) as e: + env_id = [spec for spec in gym.envs.registry if spec.startswith("Pendulum-v")][0] + print("Pendulum-v1 not found. Trying {}".format(env_id)) + env = gym.vector.make(env_id, num_envs=10, asynchronous=False) +env = wrap_env(env) + +device = env.device + + +# Instantiate a RandomMemory (without replacement) as experience replay memory +memory = RandomMemory(memory_size=100000, num_envs=env.num_envs, device=device, replacement=False) + + +# Instantiate the agent's models (function approximators). +# DDPG requires 4 models, visit its documentation for more details +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ddpg.html#spaces-and-models +models_ddpg = {} +models_ddpg["policy"] = DeterministicActor(env.observation_space, env.action_space, device) +models_ddpg["target_policy"] = DeterministicActor(env.observation_space, env.action_space, device) +models_ddpg["critic"] = DeterministicCritic(env.observation_space, env.action_space, device) +models_ddpg["target_critic"] = DeterministicCritic(env.observation_space, env.action_space, device) + +# Initialize the models' parameters (weights and biases) using a Gaussian distribution +for model in models_ddpg.values(): + model.init_parameters(method_name="normal_", mean=0.0, std=0.1) + + +# Configure and instantiate the agent. +# Only modify some of the default configuration, visit its documentation to see all the options +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ddpg.html#configuration-and-hyperparameters +cfg_ddpg = DDPG_DEFAULT_CONFIG.copy() +cfg_ddpg["exploration"]["noise"] = OrnsteinUhlenbeckNoise(theta=0.15, sigma=0.1, base_scale=1.0, device=device) +cfg_ddpg["batch_size"] = 100 +cfg_ddpg["random_timesteps"] = 100 +cfg_ddpg["learning_starts"] = 100 +# logging to TensorBoard and write checkpoints each 1000 and 1000 timesteps respectively +cfg_ddpg["experiment"]["write_interval"] = 1000 +cfg_ddpg["experiment"]["checkpoint_interval"] = 1000 + +agent_ddpg = DDPG(models=models_ddpg, + memory=memory, + cfg=cfg_ddpg, + observation_space=env.observation_space, + action_space=env.action_space, + device=device) + + +# Configure and instantiate the RL trainer +cfg_trainer = {"timesteps": 15000, "headless": True} +trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent_ddpg) + +# start training +trainer.train() diff --git a/docs/source/examples/gymnasium/dqn_gymnasium_cartpole.py b/docs/source/examples/gymnasium/dqn_gymnasium_cartpole.py new file mode 100644 index 00000000..d55c1e48 --- /dev/null +++ b/docs/source/examples/gymnasium/dqn_gymnasium_cartpole.py @@ -0,0 +1,82 @@ +import gymnasium as gym + +# Import the skrl components to build the RL system +from skrl.utils.model_instantiators import deterministic_model, Shape +from skrl.memories.torch import RandomMemory +from skrl.agents.torch.dqn import DQN, DQN_DEFAULT_CONFIG +from skrl.trainers.torch import SequentialTrainer +from skrl.envs.torch import wrap_env + + +# Load and wrap the Gymnasium environment. +# Note: the environment version may change depending on the gymnasium version +try: + env = gym.make("CartPole-v1") +except (gym.error.DeprecatedEnv, gym.error.VersionNotFound) as e: + env_id = [spec for spec in gym.envs.registry if spec.startswith("CartPole-v")][0] + print("CartPole-v0 not found. 
Trying {}".format(env_id)) + env = gym.make(env_id) +env = wrap_env(env) + +device = env.device + + +# Instantiate a RandomMemory (without replacement) as experience replay memory +memory = RandomMemory(memory_size=50000, num_envs=env.num_envs, device=device, replacement=False) + + +# Instantiate the agent's models (function approximators) using the model instantiator utility +# DQN requires 2 models, visit its documentation for more details +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.dqn.html#spaces-and-models +models_dqn = {} +models_dqn["q_network"] = deterministic_model(observation_space=env.observation_space, + action_space=env.action_space, + device=device, + clip_actions=False, + input_shape=Shape.OBSERVATIONS, + hiddens=[64, 64], + hidden_activation=["relu", "relu"], + output_shape=Shape.ACTIONS, + output_activation=None, + output_scale=1.0) +models_dqn["target_q_network"] = deterministic_model(observation_space=env.observation_space, + action_space=env.action_space, + device=device, + clip_actions=False, + input_shape=Shape.OBSERVATIONS, + hiddens=[64, 64], + hidden_activation=["relu", "relu"], + output_shape=Shape.ACTIONS, + output_activation=None, + output_scale=1.0) + +# Initialize the models' parameters (weights and biases) using a Gaussian distribution +for model in models_dqn.values(): + model.init_parameters(method_name="normal_", mean=0.0, std=0.1) + + +# Configure and instantiate the agent. +# Only modify some of the default configuration, visit its documentation to see all the options +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.dqn.html#configuration-and-hyperparameters +cfg_dqn = DQN_DEFAULT_CONFIG.copy() +cfg_dqn["learning_starts"] = 100 +cfg_dqn["exploration"]["final_epsilon"] = 0.04 +cfg_dqn["exploration"]["timesteps"] = 1500 +# logging to TensorBoard and write checkpoints each 1000 and 5000 timesteps respectively +cfg_dqn["experiment"]["write_interval"] = 1000 +cfg_dqn["experiment"]["checkpoint_interval"] = 5000 + +agent_dqn = DQN(models=models_dqn, + memory=memory, + cfg=cfg_dqn, + observation_space=env.observation_space, + action_space=env.action_space, + device=device) + + +# Configure and instantiate the RL trainer +cfg_trainer = {"timesteps": 50000, "headless": True} +trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent_dqn) + +# start training +trainer.train() diff --git a/docs/source/examples/gymnasium/dqn_gymnasium_cartpole_eval.py b/docs/source/examples/gymnasium/dqn_gymnasium_cartpole_eval.py new file mode 100644 index 00000000..49421302 --- /dev/null +++ b/docs/source/examples/gymnasium/dqn_gymnasium_cartpole_eval.py @@ -0,0 +1,64 @@ +import gymnasium as gym + +# Import the skrl components to build the RL system +from skrl.utils.model_instantiators import deterministic_model, Shape +from skrl.agents.torch.dqn import DQN, DQN_DEFAULT_CONFIG +from skrl.trainers.torch import SequentialTrainer +from skrl.envs.torch import wrap_env + + +# Load and wrap the Gymnasium environment. +# Note: the environment version may change depending on the gymnasium version +try: + env = gym.make("CartPole-v1") +except (gym.error.DeprecatedEnv, gym.error.VersionNotFound) as e: + env_id = [spec for spec in gym.envs.registry if spec.startswith("CartPole-v")][0] + print("CartPole-v0 not found. Trying {}".format(env_id)) + env = gym.make(env_id) +env = wrap_env(env) + +device = env.device + + +# Instantiate only the policy for evaluation. 
+# DQN requires 2 models, visit its documentation for more details +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.dqn.html#spaces-and-models +models_dqn = {} +models_dqn["q_network"] = deterministic_model(observation_space=env.observation_space, + action_space=env.action_space, + device=device, + clip_actions=False, + input_shape=Shape.OBSERVATIONS, + hiddens=[64, 64], + hidden_activation=["relu", "relu"], + output_shape=Shape.ACTIONS, + output_activation=None, + output_scale=1.0) + + +# Configure and instantiate the agent. +# Only modify some of the default configuration, visit its documentation to see all the options +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.dqn.html#configuration-and-hyperparameters +cfg_dqn = DQN_DEFAULT_CONFIG.copy() +cfg_dqn["exploration"]["timesteps"] = 0 +# # logging to TensorBoard each 1000 timesteps and ignore checkpoints +cfg_dqn["experiment"]["write_interval"] = 1000 +cfg_dqn["experiment"]["checkpoint_interval"] = 0 + +agent_dqn = DQN(models=models_dqn, + memory=None, + cfg=cfg_dqn, + observation_space=env.observation_space, + action_space=env.action_space, + device=device) + +# load checkpoint +agent_dqn.load("./runs/22-09-10_10-48-10-551426_DQN/checkpoints/best_agent.pt") + + +# Configure and instantiate the RL trainer +cfg_trainer = {"timesteps": 50000, "headless": True} +trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent_dqn) + +# evaluate the agent +trainer.eval() diff --git a/docs/source/examples/gymnasium/dqn_gymnasium_cartpole_vector.py b/docs/source/examples/gymnasium/dqn_gymnasium_cartpole_vector.py new file mode 100644 index 00000000..6f89dbc5 --- /dev/null +++ b/docs/source/examples/gymnasium/dqn_gymnasium_cartpole_vector.py @@ -0,0 +1,82 @@ +import gymnasium as gym + +# Import the skrl components to build the RL system +from skrl.utils.model_instantiators import deterministic_model, Shape +from skrl.memories.torch import RandomMemory +from skrl.agents.torch.dqn import DQN, DQN_DEFAULT_CONFIG +from skrl.trainers.torch import SequentialTrainer +from skrl.envs.torch import wrap_env + + +# Load and wrap the Gymnasium environment. +# Note: the environment version may change depending on the gymnasium version +try: + env = gym.vector.make("CartPole-v1", num_envs=5, asynchronous=False) +except (gym.error.DeprecatedEnv, gym.error.VersionNotFound) as e: + env_id = [spec for spec in gym.envs.registry if spec.startswith("CartPole-v")][0] + print("CartPole-v0 not found. 
Trying {}".format(env_id)) + env = gym.vector.make(env_id, num_envs=5, asynchronous=False) +env = wrap_env(env) + +device = env.device + + +# Instantiate a RandomMemory (without replacement) as experience replay memory +memory = RandomMemory(memory_size=200000, num_envs=env.num_envs, device=device, replacement=False) + + +# Instantiate the agent's models (function approximators) using the model instantiator utility +# DQN requires 2 models, visit its documentation for more details +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.dqn.html#spaces-and-models +models_dqn = {} +models_dqn["q_network"] = deterministic_model(observation_space=env.observation_space, + action_space=env.action_space, + device=device, + clip_actions=False, + input_shape=Shape.OBSERVATIONS, + hiddens=[64, 64], + hidden_activation=["relu", "relu"], + output_shape=Shape.ACTIONS, + output_activation=None, + output_scale=1.0) +models_dqn["target_q_network"] = deterministic_model(observation_space=env.observation_space, + action_space=env.action_space, + device=device, + clip_actions=False, + input_shape=Shape.OBSERVATIONS, + hiddens=[64, 64], + hidden_activation=["relu", "relu"], + output_shape=Shape.ACTIONS, + output_activation=None, + output_scale=1.0) + +# Initialize the models' parameters (weights and biases) using a Gaussian distribution +for model in models_dqn.values(): + model.init_parameters(method_name="normal_", mean=0.0, std=0.1) + + +# Configure and instantiate the agent. +# Only modify some of the default configuration, visit its documentation to see all the options +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.dqn.html#configuration-and-hyperparameters +cfg_dqn = DQN_DEFAULT_CONFIG.copy() +cfg_dqn["learning_starts"] = 100 +cfg_dqn["exploration"]["final_epsilon"] = 0.04 +cfg_dqn["exploration"]["timesteps"] = 1500 +# logging to TensorBoard and write checkpoints each 1000 and 5000 timesteps respectively +cfg_dqn["experiment"]["write_interval"] = 1000 +cfg_dqn["experiment"]["checkpoint_interval"] = 5000 + +agent_dqn = DQN(models=models_dqn, + memory=memory, + cfg=cfg_dqn, + observation_space=env.observation_space, + action_space=env.action_space, + device=device) + + +# Configure and instantiate the RL trainer +cfg_trainer = {"timesteps": 50000, "headless": True} +trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent_dqn) + +# start training +trainer.train() diff --git a/docs/source/examples/gymnasium/q_learning_gymnasium_frozen_lake.py b/docs/source/examples/gymnasium/q_learning_gymnasium_frozen_lake.py new file mode 100644 index 00000000..30d339f5 --- /dev/null +++ b/docs/source/examples/gymnasium/q_learning_gymnasium_frozen_lake.py @@ -0,0 +1,76 @@ +import gymnasium as gym + +import torch + +# Import the skrl components to build the RL system +from skrl.models.torch import Model, TabularMixin +from skrl.agents.torch.q_learning import Q_LEARNING, Q_LEARNING_DEFAULT_CONFIG +from skrl.trainers.torch import SequentialTrainer +from skrl.envs.torch import wrap_env + + +# Define the model (tabular model) for the SARSA agent using mixin +class EpilonGreedyPolicy(TabularMixin, Model): + def __init__(self, observation_space, action_space, device, num_envs=1, epsilon=0.1): + Model.__init__(self, observation_space, action_space, device) + TabularMixin.__init__(self, num_envs) + + self.epsilon = epsilon + self.q_table = torch.ones((num_envs, self.num_observations, self.num_actions), + dtype=torch.float32, device=self.device) + + def compute(self, inputs, role): + actions = 
torch.argmax(self.q_table[torch.arange(self.num_envs).view(-1, 1), inputs["states"]], + dim=-1, keepdim=True).view(-1,1) + + # choose random actions for exploration according to epsilon + indexes = (torch.rand(inputs["states"].shape[0], device=self.device) < self.epsilon).nonzero().view(-1) + if indexes.numel(): + actions[indexes] = torch.randint(self.num_actions, (indexes.numel(), 1), device=self.device) + return actions, {} + + +# Load and wrap the Gymnasium environment. +# Note: the environment version may change depending on the gymnasium version +try: + env = gym.make("FrozenLake-v0") +except (gym.error.DeprecatedEnv, gym.error.VersionNotFound) as e: + env_id = [spec for spec in gym.envs.registry if spec.startswith("FrozenLake-v")][0] + print("FrozenLake-v0 not found. Trying {}".format(env_id)) + env = gym.make(env_id) +env = wrap_env(env) + +device = env.device + + +# Instantiate the agent's models (table) +# Q-learning requires 1 model, visit its documentation for more details +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.q_learning.html#spaces-and-models +models_q_learning = {} +models_q_learning["policy"] = EpilonGreedyPolicy(env.observation_space, env.action_space, device, num_envs=env.num_envs, epsilon=0.1) + + +# Configure and instantiate the agent. +# Only modify some of the default configuration, visit its documentation to see all the options +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.q_learning.html#configuration-and-hyperparameters +cfg_q_learning = Q_LEARNING_DEFAULT_CONFIG.copy() +cfg_q_learning["discount_factor"] = 0.999 +cfg_q_learning["alpha"] = 0.4 +# logging to TensorBoard and write checkpoints each 1600 and 8000 timesteps respectively +cfg_q_learning["experiment"]["write_interval"] = 1600 +cfg_q_learning["experiment"]["checkpoint_interval"] = 8000 + +agent_q_learning = Q_LEARNING(models=models_q_learning, + memory=None, + cfg=cfg_q_learning, + observation_space=env.observation_space, + action_space=env.action_space, + device=device) + + +# Configure and instantiate the RL trainer +cfg_trainer = {"timesteps": 80000, "headless": True} +trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent_q_learning) + +# start training +trainer.train() diff --git a/docs/source/examples/gymnasium/q_learning_gymnasium_frozen_lake_eval.py b/docs/source/examples/gymnasium/q_learning_gymnasium_frozen_lake_eval.py new file mode 100644 index 00000000..a07ec805 --- /dev/null +++ b/docs/source/examples/gymnasium/q_learning_gymnasium_frozen_lake_eval.py @@ -0,0 +1,78 @@ +import gymnasium as gym + +import torch + +# Import the skrl components to build the RL system +from skrl.models.torch import Model, TabularMixin +from skrl.agents.torch.q_learning import Q_LEARNING, Q_LEARNING_DEFAULT_CONFIG +from skrl.trainers.torch import SequentialTrainer +from skrl.envs.torch import wrap_env + + +# Define the model (tabular model) for the SARSA agent using mixin +class EpilonGreedyPolicy(TabularMixin, Model): + def __init__(self, observation_space, action_space, device, num_envs=1, epsilon=0.1): + Model.__init__(self, observation_space, action_space, device) + TabularMixin.__init__(self, num_envs) + + self.epsilon = epsilon + self.q_table = torch.ones((num_envs, self.num_observations, self.num_actions), + dtype=torch.float32, device=self.device) + + def compute(self, inputs, role): + actions = torch.argmax(self.q_table[torch.arange(self.num_envs).view(-1, 1), inputs["states"]], + dim=-1, keepdim=True).view(-1,1) + + # choose random actions for exploration 
according to epsilon + indexes = (torch.rand(inputs["states"].shape[0], device=self.device) < self.epsilon).nonzero().view(-1) + if indexes.numel(): + actions[indexes] = torch.randint(self.num_actions, (indexes.numel(), 1), device=self.device) + return actions, {} + + +# Load and wrap the Gymnasium environment. +# Note: the environment version may change depending on the gymnasium version +try: + env = gym.make("FrozenLake-v0") +except (gym.error.DeprecatedEnv, gym.error.VersionNotFound) as e: + env_id = [spec for spec in gym.envs.registry if spec.startswith("FrozenLake-v")][0] + print("FrozenLake-v0 not found. Trying {}".format(env_id)) + env = gym.make(env_id) +env = wrap_env(env) + +device = env.device + + +# Instantiate the agent's models (table) +# Q-learning requires 1 model, visit its documentation for more details +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.q_learning.html#spaces-and-models +models_q_learning = {} +models_q_learning["policy"] = EpilonGreedyPolicy(env.observation_space, env.action_space, device, num_envs=env.num_envs, epsilon=0.1) + + +# Configure and instantiate the agent. +# Only modify some of the default configuration, visit its documentation to see all the options +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.q_learning.html#configuration-and-hyperparameters +cfg_q_learning = Q_LEARNING_DEFAULT_CONFIG.copy() +cfg_q_learning["random_timesteps"] = 0 +# logging to TensorBoard and write checkpoints each 1600 and ignore checkpoints +cfg_q_learning["experiment"]["write_interval"] = 1600 +cfg_q_learning["experiment"]["checkpoint_interval"] = 0 + +agent_q_learning = Q_LEARNING(models=models_q_learning, + memory=None, + cfg=cfg_q_learning, + observation_space=env.observation_space, + action_space=env.action_space, + device=device) + +# load checkpoint +agent_q_learning.load("./runs/22-09-10_17-54-20-381109_Q_LEARNING/checkpoints/best_agent.pt") + + +# Configure and instantiate the RL trainer +cfg_trainer = {"timesteps": 80000, "headless": True} +trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent_q_learning) + +# evaluate the agent +trainer.eval() diff --git a/docs/source/examples/gymnasium/q_learning_gymnasium_frozen_lake_vector.py b/docs/source/examples/gymnasium/q_learning_gymnasium_frozen_lake_vector.py new file mode 100644 index 00000000..a0fe5860 --- /dev/null +++ b/docs/source/examples/gymnasium/q_learning_gymnasium_frozen_lake_vector.py @@ -0,0 +1,76 @@ +import gymnasium as gym + +import torch + +# Import the skrl components to build the RL system +from skrl.models.torch import Model, TabularMixin +from skrl.agents.torch.q_learning import Q_LEARNING, Q_LEARNING_DEFAULT_CONFIG +from skrl.trainers.torch import SequentialTrainer +from skrl.envs.torch import wrap_env + + +# Define the model (tabular model) for the SARSA agent using mixin +class EpilonGreedyPolicy(TabularMixin, Model): + def __init__(self, observation_space, action_space, device, num_envs=1, epsilon=0.1): + Model.__init__(self, observation_space, action_space, device) + TabularMixin.__init__(self, num_envs) + + self.epsilon = epsilon + self.q_table = torch.ones((num_envs, self.num_observations, self.num_actions), + dtype=torch.float32, device=self.device) + + def compute(self, inputs, role): + actions = torch.argmax(self.q_table[torch.arange(self.num_envs).view(-1, 1), inputs["states"]], + dim=-1, keepdim=True).view(-1,1) + + # choose random actions for exploration according to epsilon + indexes = (torch.rand(inputs["states"].shape[0], 
device=self.device) < self.epsilon).nonzero().view(-1) + if indexes.numel(): + actions[indexes] = torch.randint(self.num_actions, (indexes.numel(), 1), device=self.device) + return actions, {} + + +# Load and wrap the Gymnasium environment. +# Note: the environment version may change depending on the gymnasium version +try: + env = gym.vector.make("FrozenLake-v0", num_envs=10, asynchronous=False) +except (gym.error.DeprecatedEnv, gym.error.VersionNotFound) as e: + env_id = [spec for spec in gym.envs.registry if spec.startswith("FrozenLake-v")][0] + print("FrozenLake-v0 not found. Trying {}".format(env_id)) + env = gym.vector.make(env_id, num_envs=10, asynchronous=False) +env = wrap_env(env) + +device = env.device + + +# Instantiate the agent's models (table) +# Q-learning requires 1 model, visit its documentation for more details +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.q_learning.html#spaces-and-models +models_q_learning = {} +models_q_learning["policy"] = EpilonGreedyPolicy(env.observation_space, env.action_space, device, num_envs=env.num_envs, epsilon=0.1) + + +# Configure and instantiate the agent. +# Only modify some of the default configuration, visit its documentation to see all the options +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.q_learning.html#configuration-and-hyperparameters +cfg_q_learning = Q_LEARNING_DEFAULT_CONFIG.copy() +cfg_q_learning["discount_factor"] = 0.999 +cfg_q_learning["alpha"] = 0.4 +# logging to TensorBoard and write checkpoints each 1600 and 8000 timesteps respectively +cfg_q_learning["experiment"]["write_interval"] = 1600 +cfg_q_learning["experiment"]["checkpoint_interval"] = 8000 + +agent_q_learning = Q_LEARNING(models=models_q_learning, + memory=None, + cfg=cfg_q_learning, + observation_space=env.observation_space, + action_space=env.action_space, + device=device) + + +# Configure and instantiate the RL trainer +cfg_trainer = {"timesteps": 80000, "headless": True} +trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent_q_learning) + +# start training +trainer.train() diff --git a/docs/source/examples/gymnasium/sarsa_gymnasium_taxi.py b/docs/source/examples/gymnasium/sarsa_gymnasium_taxi.py new file mode 100644 index 00000000..000eca7b --- /dev/null +++ b/docs/source/examples/gymnasium/sarsa_gymnasium_taxi.py @@ -0,0 +1,76 @@ +import gymnasium as gym + +import torch + +# Import the skrl components to build the RL system +from skrl.models.torch import Model, TabularMixin +from skrl.agents.torch.sarsa import SARSA, SARSA_DEFAULT_CONFIG +from skrl.trainers.torch import SequentialTrainer +from skrl.envs.torch import wrap_env + + +# Define the model (tabular model) for the SARSA agent using mixin +class EpilonGreedyPolicy(TabularMixin, Model): + def __init__(self, observation_space, action_space, device, num_envs=1, epsilon=0.1): + Model.__init__(self, observation_space, action_space, device) + TabularMixin.__init__(self, num_envs) + + self.epsilon = epsilon + self.q_table = torch.ones((num_envs, self.num_observations, self.num_actions), + dtype=torch.float32, device=self.device) + + def compute(self, inputs, role): + actions = torch.argmax(self.q_table[torch.arange(self.num_envs).view(-1, 1), inputs["states"]], + dim=-1, keepdim=True).view(-1,1) + + # choose random actions for exploration according to epsilon + indexes = (torch.rand(inputs["states"].shape[0], device=self.device) < self.epsilon).nonzero().view(-1) + if indexes.numel(): + actions[indexes] = torch.randint(self.num_actions, (indexes.numel(), 1), 
device=self.device) + return actions, {} + + +# Load and wrap the Gymnasium environment. +# Note: the environment version may change depending on the gymnasium version +try: + env = gym.make("Taxi-v3") +except (gym.error.DeprecatedEnv, gym.error.VersionNotFound) as e: + env_id = [spec for spec in gym.envs.registry if spec.startswith("Taxi-v")][0] + print("Taxi-v3 not found. Trying {}".format(env_id)) + env = gym.make(env_id) +env = wrap_env(env) + +device = env.device + + +# Instantiate the agent's models (table) +# SARSA requires 1 model, visit its documentation for more details +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.sarsa.html#spaces-and-models +models_sarsa = {} +models_sarsa["policy"] = EpilonGreedyPolicy(env.observation_space, env.action_space, device, num_envs=env.num_envs, epsilon=0.1) + + +# Configure and instantiate the agent. +# Only modify some of the default configuration, visit its documentation to see all the options +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.sarsa.html#configuration-and-hyperparameters +cfg_sarsa = SARSA_DEFAULT_CONFIG.copy() +cfg_sarsa["discount_factor"] = 0.999 +cfg_sarsa["alpha"] = 0.4 +# logging to TensorBoard and write checkpoints each 1600 and 8000 timesteps respectively +cfg_sarsa["experiment"]["write_interval"] = 1600 +cfg_sarsa["experiment"]["checkpoint_interval"] = 8000 + +agent_sarsa = SARSA(models=models_sarsa, + memory=None, + cfg=cfg_sarsa, + observation_space=env.observation_space, + action_space=env.action_space, + device=device) + + +# Configure and instantiate the RL trainer +cfg_trainer = {"timesteps": 80000, "headless": True} +trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent_sarsa) + +# start training +trainer.train() diff --git a/docs/source/examples/gymnasium/sarsa_gymnasium_taxi_eval.py b/docs/source/examples/gymnasium/sarsa_gymnasium_taxi_eval.py new file mode 100644 index 00000000..3899c8a6 --- /dev/null +++ b/docs/source/examples/gymnasium/sarsa_gymnasium_taxi_eval.py @@ -0,0 +1,78 @@ +import gymnasium as gym + +import torch + +# Import the skrl components to build the RL system +from skrl.models.torch import Model, TabularMixin +from skrl.agents.torch.sarsa import SARSA, SARSA_DEFAULT_CONFIG +from skrl.trainers.torch import SequentialTrainer +from skrl.envs.torch import wrap_env + + +# Define the model (tabular model) for the SARSA agent using a helper class +class EpilonGreedyPolicy(TabularMixin, Model): + def __init__(self, observation_space, action_space, device, num_envs=1, epsilon=0.1): + Model.__init__(self, observation_space, action_space, device) + TabularMixin.__init__(self, num_envs) + + self.epsilon = epsilon + self.q_table = torch.ones((num_envs, self.num_observations, self.num_actions), + dtype=torch.float32, device=self.device) + + def compute(self, inputs, role): + actions = torch.argmax(self.q_table[torch.arange(self.num_envs).view(-1, 1), inputs["states"]], + dim=-1, keepdim=True).view(-1,1) + + # choose random actions for exploration according to epsilon + indexes = (torch.rand(inputs["states"].shape[0], device=self.device) < self.epsilon).nonzero().view(-1) + if indexes.numel(): + actions[indexes] = torch.randint(self.num_actions, (indexes.numel(), 1), device=self.device) + return actions, {} + + +# Load and wrap the Gymnasium environment. 
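+# (Taxi-v3 has a discrete observation space, so the epsilon-greedy policy above indexes its per-environment Q-table directly with the observed states)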
+# Note: the environment version may change depending on the gymnasium version +try: + env = gym.make("Taxi-v3") +except (gym.error.DeprecatedEnv, gym.error.VersionNotFound) as e: + env_id = [spec for spec in gym.envs.registry if spec.startswith("Taxi-v")][0] + print("Taxi-v3 not found. Trying {}".format(env_id)) + env = gym.make(env_id) +env = wrap_env(env) + +device = env.device + + +# Instantiate the agent's models (table) +# SARSA requires 1 model, visit its documentation for more details +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.sarsa.html#spaces-and-models +models_sarsa = {} +models_sarsa["policy"] = EpilonGreedyPolicy(env.observation_space, env.action_space, device, num_envs=env.num_envs, epsilon=0.1) + + +# Configure and instantiate the agent. +# Only modify some of the default configuration, visit its documentation to see all the options +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.sarsa.html#configuration-and-hyperparameters +cfg_sarsa = SARSA_DEFAULT_CONFIG.copy() +cfg_sarsa["random_timesteps"] = 0 +# logging to TensorBoard and write checkpoints each 1600 and ignore checkpoints +cfg_sarsa["experiment"]["write_interval"] = 1600 +cfg_sarsa["experiment"]["checkpoint_interval"] = 0 + +agent_sarsa = SARSA(models=models_sarsa, + memory=None, + cfg=cfg_sarsa, + observation_space=env.observation_space, + action_space=env.action_space, + device=device) + +# load checkpoint +agent_sarsa.load("./runs/22-09-10_13-13-41-011999_SARSA/checkpoints/agent_80000.pt") + + +# Configure and instantiate the RL trainer +cfg_trainer = {"timesteps": 80000, "headless": True} +trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent_sarsa) + +# evaluate the agent +trainer.eval() diff --git a/docs/source/examples/gymnasium/sarsa_gymnasium_taxi_vector.py b/docs/source/examples/gymnasium/sarsa_gymnasium_taxi_vector.py new file mode 100644 index 00000000..8ab8bd0e --- /dev/null +++ b/docs/source/examples/gymnasium/sarsa_gymnasium_taxi_vector.py @@ -0,0 +1,76 @@ +import gymnasium as gym + +import torch + +# Import the skrl components to build the RL system +from skrl.models.torch import Model, TabularMixin +from skrl.agents.torch.sarsa import SARSA, SARSA_DEFAULT_CONFIG +from skrl.trainers.torch import SequentialTrainer +from skrl.envs.torch import wrap_env + + +# Define the model (tabular model) for the SARSA agent using mixin +class EpilonGreedyPolicy(TabularMixin, Model): + def __init__(self, observation_space, action_space, device, num_envs=1, epsilon=0.1): + Model.__init__(self, observation_space, action_space, device) + TabularMixin.__init__(self, num_envs) + + self.epsilon = epsilon + self.q_table = torch.ones((num_envs, self.num_observations, self.num_actions), + dtype=torch.float32, device=self.device) + + def compute(self, inputs, role): + actions = torch.argmax(self.q_table[torch.arange(self.num_envs).view(-1, 1), inputs["states"]], + dim=-1, keepdim=True).view(-1,1) + + # choose random actions for exploration according to epsilon + indexes = (torch.rand(inputs["states"].shape[0], device=self.device) < self.epsilon).nonzero().view(-1) + if indexes.numel(): + actions[indexes] = torch.randint(self.num_actions, (indexes.numel(), 1), device=self.device) + return actions, {} + + +# Load and wrap the Gymnasium environment. 
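+# (10 Taxi environments are vectorized synchronously; the tabular policy keeps a separate Q-table per environment through its num_envs argument)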
+# Note: the environment version may change depending on the gymnasium version +try: + env = gym.vector.make("Taxi-v3", num_envs=10, asynchronous=False) +except (gym.error.DeprecatedEnv, gym.error.VersionNotFound) as e: + env_id = [spec for spec in gym.envs.registry if spec.startswith("Taxi-v")][0] + print("Taxi-v3 not found. Trying {}".format(env_id)) + env = gym.vector.make(env_id, num_envs=10, asynchronous=False) +env = wrap_env(env) + +device = env.device + + +# Instantiate the agent's models (table) +# SARSA requires 1 model, visit its documentation for more details +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.sarsa.html#spaces-and-models +models_sarsa = {} +models_sarsa["policy"] = EpilonGreedyPolicy(env.observation_space, env.action_space, device, num_envs=env.num_envs, epsilon=0.1) + + +# Configure and instantiate the agent. +# Only modify some of the default configuration, visit its documentation to see all the options +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.sarsa.html#configuration-and-hyperparameters +cfg_sarsa = SARSA_DEFAULT_CONFIG.copy() +cfg_sarsa["discount_factor"] = 0.999 +cfg_sarsa["alpha"] = 0.4 +# logging to TensorBoard and write checkpoints each 1600 and 8000 timesteps respectively +cfg_sarsa["experiment"]["write_interval"] = 1600 +cfg_sarsa["experiment"]["checkpoint_interval"] = 8000 + +agent_sarsa = SARSA(models=models_sarsa, + memory=None, + cfg=cfg_sarsa, + observation_space=env.observation_space, + action_space=env.action_space, + device=device) + + +# Configure and instantiate the RL trainer +cfg_trainer = {"timesteps": 80000, "headless": True} +trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent_sarsa) + +# start training +trainer.train() diff --git a/docs/source/examples/isaacgym/amp_humanoid.py b/docs/source/examples/isaacgym/amp_humanoid.py index 11a3e580..0dd3ca42 100644 --- a/docs/source/examples/isaacgym/amp_humanoid.py +++ b/docs/source/examples/isaacgym/amp_humanoid.py @@ -33,12 +33,12 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, nn.Linear(1024, 512), nn.ReLU(), nn.Linear(512, self.num_actions)) - + # set a fixed log standard deviation for the policy self.log_std_parameter = nn.Parameter(torch.full((self.num_actions,), fill_value=-2.9), requires_grad=False) - def compute(self, states, taken_actions, role): - return torch.tanh(self.net(states)), self.log_std_parameter + def compute(self, inputs, role): + return torch.tanh(self.net(inputs["states"])), self.log_std_parameter, {} class Value(DeterministicMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False): @@ -51,8 +51,8 @@ def __init__(self, observation_space, action_space, device, clip_actions=False): nn.ReLU(), nn.Linear(512, 1)) - def compute(self, states, taken_actions, role): - return self.net(states) + def compute(self, inputs, role): + return self.net(inputs["states"]), {} class Discriminator(DeterministicMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False): @@ -65,8 +65,8 @@ def __init__(self, observation_space, action_space, device, clip_actions=False): nn.ReLU(), nn.Linear(512, 1)) - def compute(self, states, taken_actions, role): - return self.net(states) + def compute(self, inputs, role): + return self.net(inputs["states"]), {} # Load and wrap the Isaac Gym environment @@ -127,9 +127,9 @@ def compute(self, states, taken_actions, role): cfg_amp["experiment"]["checkpoint_interval"] = 4000 agent = AMP(models=models_amp, - 
memory=memory, - cfg=cfg_amp, - observation_space=env.observation_space, + memory=memory, + cfg=cfg_amp, + observation_space=env.observation_space, action_space=env.action_space, device=device, amp_observation_space=env.amp_observation_space, diff --git a/docs/source/examples/isaacgym/isaacgym_parallel_no_shared_memory.py b/docs/source/examples/isaacgym/isaacgym_parallel_no_shared_memory.py index 6a1d8cf9..09283af9 100644 --- a/docs/source/examples/isaacgym/isaacgym_parallel_no_shared_memory.py +++ b/docs/source/examples/isaacgym/isaacgym_parallel_no_shared_memory.py @@ -32,8 +32,8 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, nn.Linear(32, self.num_actions)) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - def compute(self, states, taken_actions, role): - return self.net(states), self.log_std_parameter + def compute(self, inputs, role): + return self.net(inputs["states"]), self.log_std_parameter, {} class DeterministicActor(DeterministicMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False): @@ -46,8 +46,8 @@ def __init__(self, observation_space, action_space, device, clip_actions=False): nn.ELU(), nn.Linear(32, self.num_actions)) - def compute(self, states, taken_actions, role): - return self.net(states) + def compute(self, inputs, role): + return self.net(inputs["states"]), {} class Critic(DeterministicMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False): @@ -60,8 +60,8 @@ def __init__(self, observation_space, action_space, device, clip_actions=False): nn.ELU(), nn.Linear(32, 1)) - def compute(self, states, taken_actions, role): - return self.net(torch.cat([states, taken_actions], dim=1)) + def compute(self, inputs, role): + return self.net(torch.cat([inputs["states"], inputs["taken_actions"]], dim=1)), {} if __name__ == '__main__': @@ -112,7 +112,7 @@ def compute(self, states, taken_actions, role): model.init_parameters(method_name="normal_", mean=0.0, std=0.1) for model in models_sac.values(): model.init_parameters(method_name="normal_", mean=0.0, std=0.1) - + # Configure and instantiate the agent. 
# Only modify some of the default configuration, visit its documentation to see all the options @@ -149,32 +149,32 @@ def compute(self, states, taken_actions, role): cfg_sac["experiment"]["write_interval"] = 25 cfg_sac["experiment"]["checkpoint_interval"] = 1000 - agent_ddpg = DDPG(models=models_ddpg, - memory=memory_ddpg, - cfg=cfg_ddpg, - observation_space=env.observation_space, + agent_ddpg = DDPG(models=models_ddpg, + memory=memory_ddpg, + cfg=cfg_ddpg, + observation_space=env.observation_space, action_space=env.action_space, device=device) - agent_td3 = TD3(models=models_td3, - memory=memory_td3, - cfg=cfg_td3, - observation_space=env.observation_space, + agent_td3 = TD3(models=models_td3, + memory=memory_td3, + cfg=cfg_td3, + observation_space=env.observation_space, action_space=env.action_space, device=device) - agent_sac = SAC(models=models_sac, - memory=memory_sac, - cfg=cfg_sac, - observation_space=env.observation_space, + agent_sac = SAC(models=models_sac, + memory=memory_sac, + cfg=cfg_sac, + observation_space=env.observation_space, action_space=env.action_space, device=device) # Configure and instantiate the RL trainer and define the agent scopes cfg = {"timesteps": 8000, "headless": True} - trainer = ParallelTrainer(cfg=cfg, - env=env, + trainer = ParallelTrainer(cfg=cfg, + env=env, agents=[agent_ddpg, agent_td3, agent_sac], agents_scope=[100, 200, 212]) # agent scopes diff --git a/docs/source/examples/isaacgym/isaacgym_parallel_no_shared_memory_eval.py b/docs/source/examples/isaacgym/isaacgym_parallel_no_shared_memory_eval.py index a935e9c9..a4614752 100644 --- a/docs/source/examples/isaacgym/isaacgym_parallel_no_shared_memory_eval.py +++ b/docs/source/examples/isaacgym/isaacgym_parallel_no_shared_memory_eval.py @@ -27,8 +27,8 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, nn.Linear(32, self.num_actions)) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - def compute(self, states, taken_actions, role): - return self.net(states), self.log_std_parameter + def compute(self, inputs, role): + return self.net(inputs["states"]), self.log_std_parameter, {} class DeterministicActor(DeterministicMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False): @@ -41,8 +41,8 @@ def __init__(self, observation_space, action_space, device, clip_actions=False): nn.ELU(), nn.Linear(32, self.num_actions)) - def compute(self, states, taken_actions, role): - return self.net(states) + def compute(self, inputs, role): + return self.net(inputs["states"]), {} if __name__ == '__main__': @@ -90,24 +90,24 @@ def compute(self, states, taken_actions, role): cfg_sac["experiment"]["write_interval"] = 25 cfg_sac["experiment"]["checkpoint_interval"] = 0 - agent_ddpg = DDPG(models=models_ddpg, - memory=None, - cfg=cfg_ddpg, - observation_space=env.observation_space, + agent_ddpg = DDPG(models=models_ddpg, + memory=None, + cfg=cfg_ddpg, + observation_space=env.observation_space, action_space=env.action_space, device=device) - agent_td3 = TD3(models=models_td3, - memory=None, - cfg=cfg_td3, - observation_space=env.observation_space, + agent_td3 = TD3(models=models_td3, + memory=None, + cfg=cfg_td3, + observation_space=env.observation_space, action_space=env.action_space, device=device) - agent_sac = SAC(models=models_sac, - memory=None, - cfg=cfg_sac, - observation_space=env.observation_space, + agent_sac = SAC(models=models_sac, + memory=None, + cfg=cfg_sac, + observation_space=env.observation_space, 
action_space=env.action_space, device=device) @@ -119,8 +119,8 @@ def compute(self, states, taken_actions, role): # Configure and instantiate the RL trainer and define the agent scopes cfg = {"timesteps": 8000, "headless": True} - trainer = ParallelTrainer(cfg=cfg, - env=env, + trainer = ParallelTrainer(cfg=cfg, + env=env, agents=[agent_ddpg, agent_td3, agent_sac], agents_scope=[100, 200, 212]) # agent scopes diff --git a/docs/source/examples/isaacgym/isaacgym_sequential_no_shared_memory.py b/docs/source/examples/isaacgym/isaacgym_sequential_no_shared_memory.py index 3cb0dc54..9c1d1b9d 100644 --- a/docs/source/examples/isaacgym/isaacgym_sequential_no_shared_memory.py +++ b/docs/source/examples/isaacgym/isaacgym_sequential_no_shared_memory.py @@ -32,8 +32,8 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, nn.Linear(32, self.num_actions)) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - def compute(self, states, taken_actions, role): - return self.net(states), self.log_std_parameter + def compute(self, inputs, role): + return self.net(inputs["states"]), self.log_std_parameter, {} class DeterministicActor(DeterministicMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False): @@ -46,8 +46,8 @@ def __init__(self, observation_space, action_space, device, clip_actions=False): nn.ELU(), nn.Linear(32, self.num_actions)) - def compute(self, states, taken_actions, role): - return self.net(states) + def compute(self, inputs, role): + return self.net(inputs["states"]), {} class Critic(DeterministicMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False): @@ -60,8 +60,8 @@ def __init__(self, observation_space, action_space, device, clip_actions=False): nn.ELU(), nn.Linear(32, 1)) - def compute(self, states, taken_actions, role): - return self.net(torch.cat([states, taken_actions], dim=1)) + def compute(self, inputs, role): + return self.net(torch.cat([inputs["states"], inputs["taken_actions"]], dim=1)), {} # Load and wrap the Isaac Gym environment @@ -110,7 +110,7 @@ def compute(self, states, taken_actions, role): model.init_parameters(method_name="normal_", mean=0.0, std=0.1) for model in models_sac.values(): model.init_parameters(method_name="normal_", mean=0.0, std=0.1) - + # Configure and instantiate the agent. 
# Only modify some of the default configuration, visit its documentation to see all the options @@ -147,32 +147,32 @@ def compute(self, states, taken_actions, role): cfg_sac["experiment"]["write_interval"] = 25 cfg_sac["experiment"]["checkpoint_interval"] = 1000 -agent_ddpg = DDPG(models=models_ddpg, - memory=memory_ddpg, - cfg=cfg_ddpg, - observation_space=env.observation_space, +agent_ddpg = DDPG(models=models_ddpg, + memory=memory_ddpg, + cfg=cfg_ddpg, + observation_space=env.observation_space, action_space=env.action_space, device=device) -agent_td3 = TD3(models=models_td3, - memory=memory_td3, - cfg=cfg_td3, - observation_space=env.observation_space, +agent_td3 = TD3(models=models_td3, + memory=memory_td3, + cfg=cfg_td3, + observation_space=env.observation_space, action_space=env.action_space, device=device) -agent_sac = SAC(models=models_sac, - memory=memory_sac, - cfg=cfg_sac, - observation_space=env.observation_space, +agent_sac = SAC(models=models_sac, + memory=memory_sac, + cfg=cfg_sac, + observation_space=env.observation_space, action_space=env.action_space, device=device) # Configure and instantiate the RL trainer and define the agent scopes cfg = {"timesteps": 8000, "headless": True} -trainer = SequentialTrainer(cfg=cfg, - env=env, +trainer = SequentialTrainer(cfg=cfg, + env=env, agents=[agent_ddpg, agent_td3, agent_sac], agents_scope=[100, 200, 212]) # agent scopes diff --git a/docs/source/examples/isaacgym/isaacgym_sequential_no_shared_memory_eval.py b/docs/source/examples/isaacgym/isaacgym_sequential_no_shared_memory_eval.py index e775c3c7..9c599d66 100644 --- a/docs/source/examples/isaacgym/isaacgym_sequential_no_shared_memory_eval.py +++ b/docs/source/examples/isaacgym/isaacgym_sequential_no_shared_memory_eval.py @@ -27,8 +27,8 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, nn.Linear(32, self.num_actions)) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - def compute(self, states, taken_actions, role): - return self.net(states), self.log_std_parameter + def compute(self, inputs, role): + return self.net(inputs["states"]), self.log_std_parameter, {} class DeterministicActor(DeterministicMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False): @@ -41,8 +41,8 @@ def __init__(self, observation_space, action_space, device, clip_actions=False): nn.ELU(), nn.Linear(32, self.num_actions)) - def compute(self, states, taken_actions, role): - return self.net(states) + def compute(self, inputs, role): + return self.net(inputs["states"]), {} # Load and wrap the Isaac Gym environment @@ -88,24 +88,24 @@ def compute(self, states, taken_actions, role): cfg_sac["experiment"]["write_interval"] = 25 cfg_sac["experiment"]["checkpoint_interval"] = 0 -agent_ddpg = DDPG(models=models_ddpg, - memory=None, - cfg=cfg_ddpg, - observation_space=env.observation_space, +agent_ddpg = DDPG(models=models_ddpg, + memory=None, + cfg=cfg_ddpg, + observation_space=env.observation_space, action_space=env.action_space, device=device) -agent_td3 = TD3(models=models_td3, - memory=None, - cfg=cfg_td3, - observation_space=env.observation_space, +agent_td3 = TD3(models=models_td3, + memory=None, + cfg=cfg_td3, + observation_space=env.observation_space, action_space=env.action_space, device=device) -agent_sac = SAC(models=models_sac, - memory=None, - cfg=cfg_sac, - observation_space=env.observation_space, +agent_sac = SAC(models=models_sac, + memory=None, + cfg=cfg_sac, + observation_space=env.observation_space, 
action_space=env.action_space, device=device) @@ -117,8 +117,8 @@ def compute(self, states, taken_actions, role): # Configure and instantiate the RL trainer cfg = {"timesteps": 8000, "headless": True} -trainer = SequentialTrainer(cfg=cfg, - env=env, +trainer = SequentialTrainer(cfg=cfg, + env=env, agents=[agent_ddpg, agent_td3, agent_sac], agents_scope=[100, 200, 212]) diff --git a/docs/source/examples/isaacgym/isaacgym_sequential_shared_memory.py b/docs/source/examples/isaacgym/isaacgym_sequential_shared_memory.py index ef93756e..f8f24172 100644 --- a/docs/source/examples/isaacgym/isaacgym_sequential_shared_memory.py +++ b/docs/source/examples/isaacgym/isaacgym_sequential_shared_memory.py @@ -32,8 +32,8 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, nn.Linear(32, self.num_actions)) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - def compute(self, states, taken_actions, role): - return self.net(states), self.log_std_parameter + def compute(self, inputs, role): + return self.net(inputs["states"]), self.log_std_parameter, {} class DeterministicActor(DeterministicMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False): @@ -46,8 +46,8 @@ def __init__(self, observation_space, action_space, device, clip_actions=False): nn.ELU(), nn.Linear(32, self.num_actions)) - def compute(self, states, taken_actions, role): - return self.net(states) + def compute(self, inputs, role): + return self.net(inputs["states"]), {} class Critic(DeterministicMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False): @@ -60,8 +60,8 @@ def __init__(self, observation_space, action_space, device, clip_actions=False): nn.ELU(), nn.Linear(32, 1)) - def compute(self, states, taken_actions, role): - return self.net(torch.cat([states, taken_actions], dim=1)) + def compute(self, inputs, role): + return self.net(torch.cat([inputs["states"], inputs["taken_actions"]], dim=1)), {} # Load and wrap the Isaac Gym environment @@ -108,7 +108,7 @@ def compute(self, states, taken_actions, role): model.init_parameters(method_name="normal_", mean=0.0, std=0.1) for model in models_sac.values(): model.init_parameters(method_name="normal_", mean=0.0, std=0.1) - + # Configure and instantiate the agent. 
# Only modify some of the default configuration, visit its documentation to see all the options @@ -145,32 +145,32 @@ def compute(self, states, taken_actions, role): cfg_sac["experiment"]["write_interval"] = 25 cfg_sac["experiment"]["checkpoint_interval"] = 1000 -agent_ddpg = DDPG(models=models_ddpg, - memory=memory, - cfg=cfg_ddpg, - observation_space=env.observation_space, +agent_ddpg = DDPG(models=models_ddpg, + memory=memory, + cfg=cfg_ddpg, + observation_space=env.observation_space, action_space=env.action_space, device=device) -agent_td3 = TD3(models=models_td3, - memory=memory, - cfg=cfg_td3, - observation_space=env.observation_space, +agent_td3 = TD3(models=models_td3, + memory=memory, + cfg=cfg_td3, + observation_space=env.observation_space, action_space=env.action_space, device=device) -agent_sac = SAC(models=models_sac, - memory=memory, - cfg=cfg_sac, - observation_space=env.observation_space, +agent_sac = SAC(models=models_sac, + memory=memory, + cfg=cfg_sac, + observation_space=env.observation_space, action_space=env.action_space, device=device) # Configure and instantiate the RL trainer cfg = {"timesteps": 8000, "headless": True} -trainer = SequentialTrainer(cfg=cfg, - env=env, +trainer = SequentialTrainer(cfg=cfg, + env=env, agents=[agent_ddpg, agent_td3, agent_sac], agents_scope=[]) diff --git a/docs/source/examples/isaacgym/isaacgym_sequential_shared_memory_eval.py b/docs/source/examples/isaacgym/isaacgym_sequential_shared_memory_eval.py index 209a5d1c..841a99e9 100644 --- a/docs/source/examples/isaacgym/isaacgym_sequential_shared_memory_eval.py +++ b/docs/source/examples/isaacgym/isaacgym_sequential_shared_memory_eval.py @@ -27,8 +27,8 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, nn.Linear(32, self.num_actions)) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - def compute(self, states, taken_actions, role): - return self.net(states), self.log_std_parameter + def compute(self, inputs, role): + return self.net(inputs["states"]), self.log_std_parameter, {} class DeterministicActor(DeterministicMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False): @@ -41,8 +41,8 @@ def __init__(self, observation_space, action_space, device, clip_actions=False): nn.ELU(), nn.Linear(32, self.num_actions)) - def compute(self, states, taken_actions, role): - return self.net(states) + def compute(self, inputs, role): + return self.net(inputs["states"]), {} # Load and wrap the Isaac Gym environment @@ -88,24 +88,24 @@ def compute(self, states, taken_actions, role): cfg_sac["experiment"]["write_interval"] = 25 cfg_sac["experiment"]["checkpoint_interval"] = 0 -agent_ddpg = DDPG(models=models_ddpg, - memory=None, - cfg=cfg_ddpg, - observation_space=env.observation_space, +agent_ddpg = DDPG(models=models_ddpg, + memory=None, + cfg=cfg_ddpg, + observation_space=env.observation_space, action_space=env.action_space, device=device) -agent_td3 = TD3(models=models_td3, - memory=None, - cfg=cfg_td3, - observation_space=env.observation_space, +agent_td3 = TD3(models=models_td3, + memory=None, + cfg=cfg_td3, + observation_space=env.observation_space, action_space=env.action_space, device=device) -agent_sac = SAC(models=models_sac, - memory=None, - cfg=cfg_sac, - observation_space=env.observation_space, +agent_sac = SAC(models=models_sac, + memory=None, + cfg=cfg_sac, + observation_space=env.observation_space, action_space=env.action_space, device=device) @@ -117,8 +117,8 @@ def compute(self, states, 
taken_actions, role): # Configure and instantiate the RL trainer cfg = {"timesteps": 8000, "headless": True} -trainer = SequentialTrainer(cfg=cfg, - env=env, +trainer = SequentialTrainer(cfg=cfg, + env=env, agents=[agent_ddpg, agent_td3, agent_sac], agents_scope=[]) diff --git a/docs/source/examples/isaacgym/ppo_allegro_hand.py b/docs/source/examples/isaacgym/ppo_allegro_hand.py index 2214df9b..d3d3225e 100644 --- a/docs/source/examples/isaacgym/ppo_allegro_hand.py +++ b/docs/source/examples/isaacgym/ppo_allegro_hand.py @@ -33,29 +33,29 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, nn.ELU(), nn.Linear(256, 128), nn.ELU()) - + self.mean_layer = nn.Linear(128, self.num_actions) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - + self.value_layer = nn.Linear(128, 1) - def act(self, states, taken_actions, role): + def act(self, inputs, role): if role == "policy": - return GaussianMixin.act(self, states, taken_actions, role) + return GaussianMixin.act(self, inputs, role) elif role == "value": - return DeterministicMixin.act(self, states, taken_actions, role) + return DeterministicMixin.act(self, inputs, role) - def compute(self, states, taken_actions, role): + def compute(self, inputs, role): if role == "policy": - return self.mean_layer(self.net(states)), self.log_std_parameter + return self.mean_layer(self.net(inputs["states"])), self.log_std_parameter, {} elif role == "value": - return self.value_layer(self.net(states)) + return self.value_layer(self.net(inputs["states"])), {} # Load and wrap the Isaac Gym environment using the easy-to-use API from NVIDIA -env = isaacgymenvs.make(seed=seed, - task="AllegroHand", - num_envs=16384, +env = isaacgymenvs.make(seed=seed, + task="AllegroHand", + num_envs=16384, sim_device="cuda:0", rl_device="cuda:0", graphics_device_id=0, @@ -108,9 +108,9 @@ def compute(self, states, taken_actions, role): cfg_ppo["experiment"]["checkpoint_interval"] = 2000 agent = PPO(models=models_ppo, - memory=memory, - cfg=cfg_ppo, - observation_space=env.observation_space, + memory=memory, + cfg=cfg_ppo, + observation_space=env.observation_space, action_space=env.action_space, device=device) diff --git a/docs/source/examples/isaacgym/ppo_ant.py b/docs/source/examples/isaacgym/ppo_ant.py index 4bbc11f1..06e3eaec 100644 --- a/docs/source/examples/isaacgym/ppo_ant.py +++ b/docs/source/examples/isaacgym/ppo_ant.py @@ -33,23 +33,23 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, nn.ELU(), nn.Linear(128, 64), nn.ELU()) - + self.mean_layer = nn.Linear(64, self.num_actions) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - + self.value_layer = nn.Linear(64, 1) - def act(self, states, taken_actions, role): + def act(self, inputs, role): if role == "policy": - return GaussianMixin.act(self, states, taken_actions, role) + return GaussianMixin.act(self, inputs, role) elif role == "value": - return DeterministicMixin.act(self, states, taken_actions, role) + return DeterministicMixin.act(self, inputs, role) - def compute(self, states, taken_actions, role): + def compute(self, inputs, role): if role == "policy": - return self.mean_layer(self.net(states)), self.log_std_parameter + return self.mean_layer(self.net(inputs["states"])), self.log_std_parameter, {} elif role == "value": - return self.value_layer(self.net(states)) + return self.value_layer(self.net(inputs["states"])), {} # Load and wrap the Isaac Gym environment @@ -102,9 +102,9 @@ def compute(self, states, taken_actions, 
role): cfg_ppo["experiment"]["checkpoint_interval"] = 400 agent = PPO(models=models_ppo, - memory=memory, - cfg=cfg_ppo, - observation_space=env.observation_space, + memory=memory, + cfg=cfg_ppo, + observation_space=env.observation_space, action_space=env.action_space, device=device) diff --git a/docs/source/examples/isaacgym/ppo_anymal.py b/docs/source/examples/isaacgym/ppo_anymal.py index 1de573e8..53c4913d 100644 --- a/docs/source/examples/isaacgym/ppo_anymal.py +++ b/docs/source/examples/isaacgym/ppo_anymal.py @@ -33,23 +33,23 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, nn.ELU(), nn.Linear(128, 64), nn.ELU()) - + self.mean_layer = nn.Linear(64, self.num_actions) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - + self.value_layer = nn.Linear(64, 1) - def act(self, states, taken_actions, role): + def act(self, inputs, role): if role == "policy": - return GaussianMixin.act(self, states, taken_actions, role) + return GaussianMixin.act(self, inputs, role) elif role == "value": - return DeterministicMixin.act(self, states, taken_actions, role) + return DeterministicMixin.act(self, inputs, role) - def compute(self, states, taken_actions, role): + def compute(self, inputs, role): if role == "policy": - return self.mean_layer(self.net(states)), self.log_std_parameter + return self.mean_layer(self.net(inputs["states"])), self.log_std_parameter, {} elif role == "value": - return self.value_layer(self.net(states)) + return self.value_layer(self.net(inputs["states"])), {} # Load and wrap the Isaac Gym environment @@ -102,9 +102,9 @@ def compute(self, states, taken_actions, role): cfg_ppo["experiment"]["checkpoint_interval"] = 1200 agent = PPO(models=models_ppo, - memory=memory, - cfg=cfg_ppo, - observation_space=env.observation_space, + memory=memory, + cfg=cfg_ppo, + observation_space=env.observation_space, action_space=env.action_space, device=device) diff --git a/docs/source/examples/isaacgym/ppo_anymal_terrain.py b/docs/source/examples/isaacgym/ppo_anymal_terrain.py index f83e0c9f..550635c7 100644 --- a/docs/source/examples/isaacgym/ppo_anymal_terrain.py +++ b/docs/source/examples/isaacgym/ppo_anymal_terrain.py @@ -37,8 +37,8 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, nn.Linear(128, self.num_actions)) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - def compute(self, states, taken_actions, role): - return self.net(states), self.log_std_parameter + def compute(self, inputs, role): + return self.net(inputs["states"]), self.log_std_parameter, {} class Value(DeterministicMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False): @@ -53,8 +53,8 @@ def __init__(self, observation_space, action_space, device, clip_actions=False): nn.ELU(), nn.Linear(128, 1)) - def compute(self, states, taken_actions, role): - return self.net(states) + def compute(self, inputs, role): + return self.net(inputs["states"]), {} # Load and wrap the Isaac Gym environment @@ -107,9 +107,9 @@ def compute(self, states, taken_actions, role): cfg_ppo["experiment"]["checkpoint_interval"] = 1800 agent = PPO(models=models_ppo, - memory=memory, - cfg=cfg_ppo, - observation_space=env.observation_space, + memory=memory, + cfg=cfg_ppo, + observation_space=env.observation_space, action_space=env.action_space, device=device) diff --git a/docs/source/examples/isaacgym/ppo_ball_balance.py b/docs/source/examples/isaacgym/ppo_ball_balance.py index e3428f60..9cc9ccb8 100644 --- 
a/docs/source/examples/isaacgym/ppo_ball_balance.py +++ b/docs/source/examples/isaacgym/ppo_ball_balance.py @@ -33,23 +33,23 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, nn.ELU(), nn.Linear(64, 32), nn.ELU()) - + self.mean_layer = nn.Linear(32, self.num_actions) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - + self.value_layer = nn.Linear(32, 1) - def act(self, states, taken_actions, role): + def act(self, inputs, role): if role == "policy": - return GaussianMixin.act(self, states, taken_actions, role) + return GaussianMixin.act(self, inputs, role) elif role == "value": - return DeterministicMixin.act(self, states, taken_actions, role) + return DeterministicMixin.act(self, inputs, role) - def compute(self, states, taken_actions, role): + def compute(self, inputs, role): if role == "policy": - return self.mean_layer(self.net(states)), self.log_std_parameter + return self.mean_layer(self.net(inputs["states"])), self.log_std_parameter, {} elif role == "value": - return self.value_layer(self.net(states)) + return self.value_layer(self.net(inputs["states"])), {} # Load and wrap the Isaac Gym environment diff --git a/docs/source/examples/isaacgym/ppo_cartpole.py b/docs/source/examples/isaacgym/ppo_cartpole.py index e3f8c253..38aa7110 100644 --- a/docs/source/examples/isaacgym/ppo_cartpole.py +++ b/docs/source/examples/isaacgym/ppo_cartpole.py @@ -16,7 +16,7 @@ # set the seed for reproducibility -set_seed(42) +set_seed(32) # Define the shared model (stochastic and deterministic models) for the agent using mixins. @@ -31,23 +31,23 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, nn.ELU(), nn.Linear(32, 32), nn.ELU()) - + self.mean_layer = nn.Linear(32, self.num_actions) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - + self.value_layer = nn.Linear(32, 1) - def act(self, states, taken_actions, role): + def act(self, inputs, role): if role == "policy": - return GaussianMixin.act(self, states, taken_actions, role) + return GaussianMixin.act(self, inputs, role) elif role == "value": - return DeterministicMixin.act(self, states, taken_actions, role) + return DeterministicMixin.act(self, inputs, role) - def compute(self, states, taken_actions, role): + def compute(self, inputs, role): if role == "policy": - return self.mean_layer(self.net(states)), self.log_std_parameter + return self.mean_layer(self.net(inputs["states"])), self.log_std_parameter, {} elif role == "value": - return self.value_layer(self.net(states)) + return self.value_layer(self.net(inputs["states"])), {} # Load and wrap the Isaac Gym environment diff --git a/docs/source/examples/isaacgym/ppo_cartpole_eval.py b/docs/source/examples/isaacgym/ppo_cartpole_eval.py index dde950f6..c35a93db 100644 --- a/docs/source/examples/isaacgym/ppo_cartpole_eval.py +++ b/docs/source/examples/isaacgym/ppo_cartpole_eval.py @@ -24,23 +24,23 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, nn.ELU(), nn.Linear(32, 32), nn.ELU()) - + self.mean_layer = nn.Linear(32, self.num_actions) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - + self.value_layer = nn.Linear(32, 1) - def act(self, states, taken_actions, role): + def act(self, inputs, role): if role == "policy": - return GaussianMixin.act(self, states, taken_actions, role) + return GaussianMixin.act(self, inputs, role) elif role == "value": - return DeterministicMixin.act(self, states, taken_actions, role) + return 
DeterministicMixin.act(self, inputs, role) - def compute(self, states, taken_actions, role): + def compute(self, inputs, role): if role == "policy": - return self.mean_layer(self.net(states)), self.log_std_parameter + return self.mean_layer(self.net(inputs["states"])), self.log_std_parameter, {} elif role == "value": - return self.value_layer(self.net(states)) + return self.value_layer(self.net(inputs["states"])), {} # Load and wrap the Isaac Gym environment @@ -68,9 +68,9 @@ def compute(self, states, taken_actions, role): cfg_ppo["experiment"]["checkpoint_interval"] = 0 agent = PPO(models=models_ppo, - memory=None, - cfg=cfg_ppo, - observation_space=env.observation_space, + memory=None, + cfg=cfg_ppo, + observation_space=env.observation_space, action_space=env.action_space, device=device) diff --git a/docs/source/examples/isaacgym/ppo_franka_cabinet.py b/docs/source/examples/isaacgym/ppo_franka_cabinet.py index 3fd9e834..4d951ea6 100644 --- a/docs/source/examples/isaacgym/ppo_franka_cabinet.py +++ b/docs/source/examples/isaacgym/ppo_franka_cabinet.py @@ -33,23 +33,23 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, nn.ELU(), nn.Linear(128, 64), nn.ELU()) - + self.mean_layer = nn.Linear(64, self.num_actions) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - + self.value_layer = nn.Linear(64, 1) - def act(self, states, taken_actions, role): + def act(self, inputs, role): if role == "policy": - return GaussianMixin.act(self, states, taken_actions, role) + return GaussianMixin.act(self, inputs, role) elif role == "value": - return DeterministicMixin.act(self, states, taken_actions, role) + return DeterministicMixin.act(self, inputs, role) - def compute(self, states, taken_actions, role): + def compute(self, inputs, role): if role == "policy": - return self.mean_layer(self.net(states)), self.log_std_parameter + return self.mean_layer(self.net(inputs["states"])), self.log_std_parameter, {} elif role == "value": - return self.value_layer(self.net(states)) + return self.value_layer(self.net(inputs["states"])), {} # Load and wrap the Isaac Gym environment @@ -77,7 +77,7 @@ def compute(self, states, taken_actions, role): cfg_ppo = PPO_DEFAULT_CONFIG.copy() cfg_ppo["rollouts"] = 16 # memory_size cfg_ppo["learning_epochs"] = 8 -cfg_ppo["mini_batches"] = 8 # 16 * 4096 / 8192 +cfg_ppo["mini_batches"] = 8 # 16 * 4096 / 8192 cfg_ppo["discount_factor"] = 0.99 cfg_ppo["lambda"] = 0.95 cfg_ppo["learning_rate"] = 5e-4 @@ -102,9 +102,9 @@ def compute(self, states, taken_actions, role): cfg_ppo["experiment"]["checkpoint_interval"] = 1200 agent = PPO(models=models_ppo, - memory=memory, - cfg=cfg_ppo, - observation_space=env.observation_space, + memory=memory, + cfg=cfg_ppo, + observation_space=env.observation_space, action_space=env.action_space, device=device) diff --git a/docs/source/examples/isaacgym/ppo_humanoid.py b/docs/source/examples/isaacgym/ppo_humanoid.py index 19893771..2d5dd996 100644 --- a/docs/source/examples/isaacgym/ppo_humanoid.py +++ b/docs/source/examples/isaacgym/ppo_humanoid.py @@ -33,23 +33,23 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, nn.ELU(), nn.Linear(200, 100), nn.ELU()) - + self.mean_layer = nn.Linear(100, self.num_actions) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - + self.value_layer = nn.Linear(100, 1) - def act(self, states, taken_actions, role): + def act(self, inputs, role): if role == "policy": - return GaussianMixin.act(self, states, taken_actions, role) 
+ return GaussianMixin.act(self, inputs, role) elif role == "value": - return DeterministicMixin.act(self, states, taken_actions, role) + return DeterministicMixin.act(self, inputs, role) - def compute(self, states, taken_actions, role): + def compute(self, inputs, role): if role == "policy": - return self.mean_layer(self.net(states)), self.log_std_parameter + return self.mean_layer(self.net(inputs["states"])), self.log_std_parameter, {} elif role == "value": - return self.value_layer(self.net(states)) + return self.value_layer(self.net(inputs["states"])), {} # Load and wrap the Isaac Gym environment @@ -102,9 +102,9 @@ def compute(self, states, taken_actions, role): cfg_ppo["experiment"]["checkpoint_interval"] = 1600 agent = PPO(models=models_ppo, - memory=memory, - cfg=cfg_ppo, - observation_space=env.observation_space, + memory=memory, + cfg=cfg_ppo, + observation_space=env.observation_space, action_space=env.action_space, device=device) diff --git a/docs/source/examples/isaacgym/ppo_ingenuity.py b/docs/source/examples/isaacgym/ppo_ingenuity.py index f60d46aa..e1635033 100644 --- a/docs/source/examples/isaacgym/ppo_ingenuity.py +++ b/docs/source/examples/isaacgym/ppo_ingenuity.py @@ -33,29 +33,29 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, nn.ELU(), nn.Linear(256, 128), nn.ELU()) - + self.mean_layer = nn.Linear(128, self.num_actions) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - + self.value_layer = nn.Linear(128, 1) - def act(self, states, taken_actions, role): + def act(self, inputs, role): if role == "policy": - return GaussianMixin.act(self, states, taken_actions, role) + return GaussianMixin.act(self, inputs, role) elif role == "value": - return DeterministicMixin.act(self, states, taken_actions, role) + return DeterministicMixin.act(self, inputs, role) - def compute(self, states, taken_actions, role): + def compute(self, inputs, role): if role == "policy": - return self.mean_layer(self.net(states)), self.log_std_parameter + return self.mean_layer(self.net(inputs["states"])), self.log_std_parameter, {} elif role == "value": - return self.value_layer(self.net(states)) + return self.value_layer(self.net(inputs["states"])), {} # Load and wrap the Isaac Gym environment using the easy-to-use API from NVIDIA -env = isaacgymenvs.make(seed=seed, - task="Ingenuity", - num_envs=4096, +env = isaacgymenvs.make(seed=seed, + task="Ingenuity", + num_envs=4096, sim_device="cuda:0", rl_device="cuda:0", graphics_device_id=0, @@ -108,9 +108,9 @@ def compute(self, states, taken_actions, role): cfg_ppo["experiment"]["checkpoint_interval"] = 400 agent = PPO(models=models_ppo, - memory=memory, - cfg=cfg_ppo, - observation_space=env.observation_space, + memory=memory, + cfg=cfg_ppo, + observation_space=env.observation_space, action_space=env.action_space, device=device) diff --git a/docs/source/examples/isaacgym/ppo_quadcopter.py b/docs/source/examples/isaacgym/ppo_quadcopter.py index 34ed47ed..b7f34182 100644 --- a/docs/source/examples/isaacgym/ppo_quadcopter.py +++ b/docs/source/examples/isaacgym/ppo_quadcopter.py @@ -33,23 +33,23 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, nn.ELU(), nn.Linear(256, 128), nn.ELU()) - + self.mean_layer = nn.Linear(128, self.num_actions) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - + self.value_layer = nn.Linear(128, 1) - def act(self, states, taken_actions, role): + def act(self, inputs, role): if role == "policy": - return 
GaussianMixin.act(self, states, taken_actions, role) + return GaussianMixin.act(self, inputs, role) elif role == "value": - return DeterministicMixin.act(self, states, taken_actions, role) + return DeterministicMixin.act(self, inputs, role) - def compute(self, states, taken_actions, role): + def compute(self, inputs, role): if role == "policy": - return self.mean_layer(self.net(states)), self.log_std_parameter + return self.mean_layer(self.net(inputs["states"])), self.log_std_parameter, {} elif role == "value": - return self.value_layer(self.net(states)) + return self.value_layer(self.net(inputs["states"])), {} # Load and wrap the Isaac Gym environment diff --git a/docs/source/examples/isaacgym/ppo_shadow_hand.py b/docs/source/examples/isaacgym/ppo_shadow_hand.py index c7e8636a..3263777b 100644 --- a/docs/source/examples/isaacgym/ppo_shadow_hand.py +++ b/docs/source/examples/isaacgym/ppo_shadow_hand.py @@ -35,23 +35,23 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, nn.ELU(), nn.Linear(256, 128), nn.ELU()) - + self.mean_layer = nn.Linear(128, self.num_actions) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - + self.value_layer = nn.Linear(128, 1) - def act(self, states, taken_actions, role): + def act(self, inputs, role): if role == "policy": - return GaussianMixin.act(self, states, taken_actions, role) + return GaussianMixin.act(self, inputs, role) elif role == "value": - return DeterministicMixin.act(self, states, taken_actions, role) + return DeterministicMixin.act(self, inputs, role) - def compute(self, states, taken_actions, role): + def compute(self, inputs, role): if role == "policy": - return self.mean_layer(self.net(states)), self.log_std_parameter + return self.mean_layer(self.net(inputs["states"])), self.log_std_parameter, {} elif role == "value": - return self.value_layer(self.net(states)) + return self.value_layer(self.net(inputs["states"])), {} # Load and wrap the Isaac Gym environment @@ -104,9 +104,9 @@ def compute(self, states, taken_actions, role): cfg_ppo["experiment"]["checkpoint_interval"] = 2000 agent = PPO(models=models_ppo, - memory=memory, - cfg=cfg_ppo, - observation_space=env.observation_space, + memory=memory, + cfg=cfg_ppo, + observation_space=env.observation_space, action_space=env.action_space, device=device) diff --git a/docs/source/examples/isaacgym/ppo_trifinger.py b/docs/source/examples/isaacgym/ppo_trifinger.py index 87591dcf..9be44ef2 100644 --- a/docs/source/examples/isaacgym/ppo_trifinger.py +++ b/docs/source/examples/isaacgym/ppo_trifinger.py @@ -35,23 +35,23 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, nn.ELU(), nn.Linear(128, 128), nn.ELU()) - + self.mean_layer = nn.Linear(128, self.num_actions) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - + self.value_layer = nn.Linear(128, 1) - def act(self, states, taken_actions, role): + def act(self, inputs, role): if role == "policy": - return GaussianMixin.act(self, states, taken_actions, role) + return GaussianMixin.act(self, inputs, role) elif role == "value": - return DeterministicMixin.act(self, states, taken_actions, role) + return DeterministicMixin.act(self, inputs, role) - def compute(self, states, taken_actions, role): + def compute(self, inputs, role): if role == "policy": - return self.mean_layer(self.net(states)), self.log_std_parameter + return self.mean_layer(self.net(inputs["states"])), self.log_std_parameter, {} elif role == "value": - return 
self.value_layer(self.net(states)) + return self.value_layer(self.net(inputs["states"])), {} # Load and wrap the Isaac Gym environment @@ -70,7 +70,7 @@ def compute(self, states, taken_actions, role): # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models models_ppo = {} models_ppo["policy"] = Shared(env.observation_space, env.action_space, device) -models_ppo["value"] = models_ppo["policy"] # same instance: shared model +models_ppo["value"] = models_ppo["policy"] # same instance: shared model # Configure and instantiate the agent. @@ -102,9 +102,9 @@ def compute(self, states, taken_actions, role): cfg_ppo["experiment"]["checkpoint_interval"] = 8000 agent = PPO(models=models_ppo, - memory=memory, - cfg=cfg_ppo, - observation_space=env.observation_space, + memory=memory, + cfg=cfg_ppo, + observation_space=env.observation_space, action_space=env.action_space, device=device) diff --git a/docs/source/examples/isaacgym/trpo_cartpole.py b/docs/source/examples/isaacgym/trpo_cartpole.py index 8b0a76e7..2be1c716 100644 --- a/docs/source/examples/isaacgym/trpo_cartpole.py +++ b/docs/source/examples/isaacgym/trpo_cartpole.py @@ -34,8 +34,8 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, nn.Linear(32, self.num_actions)) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - def compute(self, states, taken_actions, role): - return self.net(states), self.log_std_parameter + def compute(self, inputs, role): + return self.net(inputs["states"]), self.log_std_parameter, {} class Value(DeterministicMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False): @@ -48,8 +48,8 @@ def __init__(self, observation_space, action_space, device, clip_actions=False): nn.ELU(), nn.Linear(32, 1)) - def compute(self, states, taken_actions, role): - return self.net(states) + def compute(self, inputs, role): + return self.net(inputs["states"]), {} # Load and wrap the Isaac Gym environment @@ -70,28 +70,26 @@ def compute(self, states, taken_actions, role): models_trpo["policy"] = Policy(env.observation_space, env.action_space, device) models_trpo["value"] = Value(env.observation_space, env.action_space, device) -# Initialize the models' parameters (weights and biases) using a Gaussian distribution -for model in models_trpo.values(): - model.init_parameters(method_name="normal_", mean=0.0, std=0.1) - # Configure and instantiate the agent. 
# Only modify some of the default configuration, visit its documentation to see all the options # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.trpo.html#configuration-and-hyperparameters cfg_trpo = TRPO_DEFAULT_CONFIG.copy() cfg_trpo["rollouts"] = 16 # memory_size -cfg_trpo["learning_epochs"] = 6 -cfg_trpo["mini_batches"] = 2 -cfg_trpo["grad_norm_clip"] = 0.5 -cfg_trpo["value_loss_scale"] = 2.0 +cfg_trpo["learning_epochs"] = 8 +cfg_trpo["mini_batches"] = 1 +cfg_trpo["discount_factor"] = 0.99 cfg_trpo["lambda"] = 0.95 +cfg_trpo["learning_rate"] = 3e-4 +cfg_trpo["grad_norm_clip"] = 1.0 +cfg_trpo["value_loss_scale"] = 2.0 cfg_trpo["state_preprocessor"] = RunningStandardScaler cfg_trpo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device} cfg_trpo["value_preprocessor"] = RunningStandardScaler cfg_trpo["value_preprocessor_kwargs"] = {"size": 1, "device": device} -# logging to TensorBoard and write checkpoints each 16 and 125 timesteps respectively +# logging to TensorBoard and write checkpoints each 16 and 80 timesteps respectively cfg_trpo["experiment"]["write_interval"] = 16 -cfg_trpo["experiment"]["checkpoint_interval"] = 125 +cfg_trpo["experiment"]["checkpoint_interval"] = 80 agent = TRPO(models=models_trpo, memory=memory, diff --git a/docs/source/examples/isaacsim/cartpole_example_skrl.py b/docs/source/examples/isaacsim/cartpole_example_skrl.py index d31c56e2..e54a4305 100644 --- a/docs/source/examples/isaacsim/cartpole_example_skrl.py +++ b/docs/source/examples/isaacsim/cartpole_example_skrl.py @@ -1,4 +1,4 @@ -# Omniverse Isaac Sim tutorial: Creating New RL Environment +# Omniverse Isaac Sim tutorial: Creating New RL Environment # https://docs.omniverse.nvidia.com/app_isaacsim/app_isaacsim/tutorial_gym_new_rl_example.html # Instance of VecEnvBase and create the task @@ -38,8 +38,8 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, nn.Linear(64, self.num_actions)) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - def compute(self, states, taken_actions, role): - return torch.tanh(self.net(states)), self.log_std_parameter + def compute(self, inputs, role): + return torch.tanh(self.net(inputs["states"])), self.log_std_parameter, {} class Value(DeterministicMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False): @@ -52,8 +52,8 @@ def __init__(self, observation_space, action_space, device, clip_actions=False): nn.Tanh(), nn.Linear(64, 1)) - def compute(self, states, taken_actions, role): - return self.net(states) + def compute(self, inputs, role): + return self.net(inputs["states"]), {} # Load and wrap the environment @@ -75,7 +75,7 @@ def compute(self, states, taken_actions, role): # Initialize the models' parameters (weights and biases) using a Gaussian distribution for model in models_ppo.values(): - model.init_parameters(method_name="normal_", mean=0.0, std=0.1) + model.init_parameters(method_name="normal_", mean=0.0, std=0.1) # Configure and instantiate the agent. 
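For reference, the API change applied throughout these example scripts is that model methods now take a single `inputs` dictionary instead of separate `states` and `taken_actions` arguments, and `compute` returns an extra (possibly empty) outputs dictionary. A minimal standalone sketch of a policy/critic pair under the new convention is shown below; class names and layer sizes are illustrative, and the mixin initialization follows the pattern used in the unchanged parts of these files.

import torch
import torch.nn as nn

from skrl.models.torch import Model, GaussianMixin, DeterministicMixin


class Policy(GaussianMixin, Model):
    def __init__(self, observation_space, action_space, device, clip_actions=False,
                 clip_log_std=True, min_log_std=-20, max_log_std=2):
        Model.__init__(self, observation_space, action_space, device)
        GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std)

        self.net = nn.Sequential(nn.Linear(self.num_observations, 32),
                                 nn.ELU(),
                                 nn.Linear(32, self.num_actions))
        self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions))

    def compute(self, inputs, role):
        # observations are passed under the "states" key;
        # the trailing {} is the extra-outputs dictionary
        return self.net(inputs["states"]), self.log_std_parameter, {}


class Critic(DeterministicMixin, Model):
    def __init__(self, observation_space, action_space, device, clip_actions=False):
        Model.__init__(self, observation_space, action_space, device)
        DeterministicMixin.__init__(self, clip_actions)

        self.net = nn.Sequential(nn.Linear(self.num_observations + self.num_actions, 32),
                                 nn.ELU(),
                                 nn.Linear(32, 1))

    def compute(self, inputs, role):
        # critics also receive the actions, under the "taken_actions" key
        return self.net(torch.cat([inputs["states"], inputs["taken_actions"]], dim=1)), {}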
@@ -101,9 +101,9 @@ def compute(self, states, taken_actions, role): cfg_ppo["experiment"]["checkpoint_interval"] = 10000 agent = PPO(models=models_ppo, - memory=memory, - cfg=cfg_ppo, - observation_space=env.observation_space, + memory=memory, + cfg=cfg_ppo, + observation_space=env.observation_space, action_space=env.action_space, device=device) diff --git a/docs/source/examples/isaacsim/isaacsim_jetbot_ppo.py b/docs/source/examples/isaacsim/isaacsim_jetbot_ppo.py index 84790767..6a0dadd0 100644 --- a/docs/source/examples/isaacsim/isaacsim_jetbot_ppo.py +++ b/docs/source/examples/isaacsim/isaacsim_jetbot_ppo.py @@ -39,11 +39,11 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, nn.Linear(32, self.num_actions)) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - def compute(self, states, taken_actions, role): + def compute(self, inputs, role): # view (samples, width * height * channels) -> (samples, width, height, channels) # permute (samples, width, height, channels) -> (samples, channels, width, height) - x = self.net(states.view(-1, *self.observation_space.shape).permute(0, 3, 1, 2)) - return 10 * torch.tanh(x), self.log_std_parameter # JetBotEnv action_space is -10 to 10 + x = self.net(inputs["states"].view(-1, *self.observation_space.shape).permute(0, 3, 1, 2)) + return 10 * torch.tanh(x), self.log_std_parameter, {} # JetBotEnv action_space is -10 to 10 class Value(DeterministicMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False): @@ -67,10 +67,10 @@ def __init__(self, observation_space, action_space, device, clip_actions=False): nn.Tanh(), nn.Linear(32, 1)) - def compute(self, states, taken_actions, role): + def compute(self, inputs, role): # view (samples, width * height * channels) -> (samples, width, height, channels) # permute (samples, width, height, channels) -> (samples, channels, width, height) - return self.net(states.view(-1, *self.observation_space.shape).permute(0, 3, 1, 2)) + return self.net(inputs["states"].view(-1, *self.observation_space.shape).permute(0, 3, 1, 2)), {} # Load and wrap the JetBot environment (a subclass of Gym) diff --git a/docs/source/examples/omniisaacgym/ppo_allegro_hand.py b/docs/source/examples/omniisaacgym/ppo_allegro_hand.py index b83b0933..74e50271 100644 --- a/docs/source/examples/omniisaacgym/ppo_allegro_hand.py +++ b/docs/source/examples/omniisaacgym/ppo_allegro_hand.py @@ -31,23 +31,23 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, nn.ELU(), nn.Linear(256, 128), nn.ELU()) - + self.mean_layer = nn.Linear(128, self.num_actions) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - + self.value_layer = nn.Linear(128, 1) - def act(self, states, taken_actions, role): + def act(self, inputs, role): if role == "policy": - return GaussianMixin.act(self, states, taken_actions, role) + return GaussianMixin.act(self, inputs, role) elif role == "value": - return DeterministicMixin.act(self, states, taken_actions, role) + return DeterministicMixin.act(self, inputs, role) - def compute(self, states, taken_actions, role): + def compute(self, inputs, role): if role == "policy": - return self.mean_layer(self.net(states)), self.log_std_parameter + return self.mean_layer(self.net(inputs["states"])), self.log_std_parameter, {} elif role == "value": - return self.value_layer(self.net(states)) + return self.value_layer(self.net(inputs["states"])), {} # Load and wrap the Omniverse Isaac Gym environment @@ -78,7 +78,7 @@ 
def compute(self, states, taken_actions, role): cfg_ppo["mini_batches"] = 4 # 16 * 8192 / 32768 cfg_ppo["discount_factor"] = 0.99 cfg_ppo["lambda"] = 0.95 -cfg_ppo["learning_rate"] = 5e-3 +cfg_ppo["learning_rate"] = 5e-4 cfg_ppo["learning_rate_scheduler"] = KLAdaptiveRL cfg_ppo["learning_rate_scheduler_kwargs"] = {"kl_threshold": 0.02} cfg_ppo["random_timesteps"] = 0 @@ -100,9 +100,9 @@ def compute(self, states, taken_actions, role): cfg_ppo["experiment"]["checkpoint_interval"] = 8000 agent = PPO(models=models_ppo, - memory=memory, - cfg=cfg_ppo, - observation_space=env.observation_space, + memory=memory, + cfg=cfg_ppo, + observation_space=env.observation_space, action_space=env.action_space, device=device) diff --git a/docs/source/examples/omniisaacgym/ppo_ant.py b/docs/source/examples/omniisaacgym/ppo_ant.py index 4282b76d..0e090a47 100644 --- a/docs/source/examples/omniisaacgym/ppo_ant.py +++ b/docs/source/examples/omniisaacgym/ppo_ant.py @@ -31,23 +31,23 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, nn.ELU(), nn.Linear(128, 64), nn.ELU()) - + self.mean_layer = nn.Linear(64, self.num_actions) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - + self.value_layer = nn.Linear(64, 1) - def act(self, states, taken_actions, role): + def act(self, inputs, role): if role == "policy": - return GaussianMixin.act(self, states, taken_actions, role) + return GaussianMixin.act(self, inputs, role) elif role == "value": - return DeterministicMixin.act(self, states, taken_actions, role) + return DeterministicMixin.act(self, inputs, role) - def compute(self, states, taken_actions, role): + def compute(self, inputs, role): if role == "policy": - return self.mean_layer(self.net(states)), self.log_std_parameter + return self.mean_layer(self.net(inputs["states"])), self.log_std_parameter, {} elif role == "value": - return self.value_layer(self.net(states)) + return self.value_layer(self.net(inputs["states"])), {} # Load and wrap the Omniverse Isaac Gym environment @@ -100,9 +100,9 @@ def compute(self, states, taken_actions, role): cfg_ppo["experiment"]["checkpoint_interval"] = 400 agent = PPO(models=models_ppo, - memory=memory, - cfg=cfg_ppo, - observation_space=env.observation_space, + memory=memory, + cfg=cfg_ppo, + observation_space=env.observation_space, action_space=env.action_space, device=device) diff --git a/docs/source/examples/omniisaacgym/ppo_ant_mt.py b/docs/source/examples/omniisaacgym/ppo_ant_mt.py index d3bdbc50..4aafeae8 100644 --- a/docs/source/examples/omniisaacgym/ppo_ant_mt.py +++ b/docs/source/examples/omniisaacgym/ppo_ant_mt.py @@ -33,23 +33,23 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, nn.ELU(), nn.Linear(128, 64), nn.ELU()) - + self.mean_layer = nn.Linear(64, self.num_actions) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - + self.value_layer = nn.Linear(64, 1) - def act(self, states, taken_actions, role): + def act(self, inputs, role): if role == "policy": - return GaussianMixin.act(self, states, taken_actions, role) + return GaussianMixin.act(self, inputs, role) elif role == "value": - return DeterministicMixin.act(self, states, taken_actions, role) + return DeterministicMixin.act(self, inputs, role) - def compute(self, states, taken_actions, role): + def compute(self, inputs, role): if role == "policy": - return self.mean_layer(self.net(states)), self.log_std_parameter + return self.mean_layer(self.net(inputs["states"])), self.log_std_parameter, {} elif role == 
"value": - return self.value_layer(self.net(states)) + return self.value_layer(self.net(inputs["states"])), {} # Load and wrap the Omniverse Isaac Gym environment @@ -68,7 +68,7 @@ def compute(self, states, taken_actions, role): # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models models_ppo = {} models_ppo["policy"] = Shared(env.observation_space, env.action_space, device) -models_ppo["value"] = models_ppo["policy"] # same instance: shared model +models_ppo["value"] = models_ppo["policy"] # same instance: shared model # Configure and instantiate the agent. @@ -102,9 +102,9 @@ def compute(self, states, taken_actions, role): cfg_ppo["experiment"]["checkpoint_interval"] = 400 agent = PPO(models=models_ppo, - memory=memory, - cfg=cfg_ppo, - observation_space=env.observation_space, + memory=memory, + cfg=cfg_ppo, + observation_space=env.observation_space, action_space=env.action_space, device=device) diff --git a/docs/source/examples/omniisaacgym/ppo_anymal.py b/docs/source/examples/omniisaacgym/ppo_anymal.py index 72505e7e..904a86c6 100644 --- a/docs/source/examples/omniisaacgym/ppo_anymal.py +++ b/docs/source/examples/omniisaacgym/ppo_anymal.py @@ -31,23 +31,23 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, nn.ELU(), nn.Linear(128, 64), nn.ELU()) - + self.mean_layer = nn.Linear(64, self.num_actions) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - + self.value_layer = nn.Linear(64, 1) - def act(self, states, taken_actions, role): + def act(self, inputs, role): if role == "policy": - return GaussianMixin.act(self, states, taken_actions, role) + return GaussianMixin.act(self, inputs, role) elif role == "value": - return DeterministicMixin.act(self, states, taken_actions, role) + return DeterministicMixin.act(self, inputs, role) - def compute(self, states, taken_actions, role): + def compute(self, inputs, role): if role == "policy": - return self.mean_layer(self.net(states)), self.log_std_parameter + return self.mean_layer(self.net(inputs["states"])), self.log_std_parameter, {} elif role == "value": - return self.value_layer(self.net(states)) + return self.value_layer(self.net(inputs["states"])), {} # Load and wrap the Omniverse Isaac Gym environment @@ -100,9 +100,9 @@ def compute(self, states, taken_actions, role): cfg_ppo["experiment"]["checkpoint_interval"] = 1200 agent = PPO(models=models_ppo, - memory=memory, - cfg=cfg_ppo, - observation_space=env.observation_space, + memory=memory, + cfg=cfg_ppo, + observation_space=env.observation_space, action_space=env.action_space, device=device) diff --git a/docs/source/examples/omniisaacgym/ppo_anymal_terrain.py b/docs/source/examples/omniisaacgym/ppo_anymal_terrain.py index 5b6aaf80..b581cdc1 100644 --- a/docs/source/examples/omniisaacgym/ppo_anymal_terrain.py +++ b/docs/source/examples/omniisaacgym/ppo_anymal_terrain.py @@ -35,8 +35,8 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, nn.Linear(128, self.num_actions)) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - def compute(self, states, taken_actions, role): - return self.net(states), self.log_std_parameter + def compute(self, inputs, role): + return self.net(inputs["states"]), self.log_std_parameter, {} class Value(DeterministicMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False): @@ -51,8 +51,8 @@ def __init__(self, observation_space, action_space, device, clip_actions=False): nn.ELU(), nn.Linear(128, 1)) 
- def compute(self, states, taken_actions, role): - return self.net(states) + def compute(self, inputs, role): + return self.net(inputs["states"]), {} # Load and wrap the Omniverse Isaac Gym environment @@ -105,9 +105,9 @@ def compute(self, states, taken_actions, role): cfg_ppo["experiment"]["checkpoint_interval"] = 4800 agent = PPO(models=models_ppo, - memory=memory, - cfg=cfg_ppo, - observation_space=env.observation_space, + memory=memory, + cfg=cfg_ppo, + observation_space=env.observation_space, action_space=env.action_space, device=device) diff --git a/docs/source/examples/omniisaacgym/ppo_ball_balance.py b/docs/source/examples/omniisaacgym/ppo_ball_balance.py index f63dbe3a..5d19e67b 100644 --- a/docs/source/examples/omniisaacgym/ppo_ball_balance.py +++ b/docs/source/examples/omniisaacgym/ppo_ball_balance.py @@ -31,23 +31,23 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, nn.ELU(), nn.Linear(64, 32), nn.ELU()) - + self.mean_layer = nn.Linear(32, self.num_actions) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - + self.value_layer = nn.Linear(32, 1) - def act(self, states, taken_actions, role): + def act(self, inputs, role): if role == "policy": - return GaussianMixin.act(self, states, taken_actions, role) + return GaussianMixin.act(self, inputs, role) elif role == "value": - return DeterministicMixin.act(self, states, taken_actions, role) + return DeterministicMixin.act(self, inputs, role) - def compute(self, states, taken_actions, role): + def compute(self, inputs, role): if role == "policy": - return self.mean_layer(self.net(states)), self.log_std_parameter + return self.mean_layer(self.net(inputs["states"])), self.log_std_parameter, {} elif role == "value": - return self.value_layer(self.net(states)) + return self.value_layer(self.net(inputs["states"])), {} # Load and wrap the Omniverse Isaac Gym environment diff --git a/docs/source/examples/omniisaacgym/ppo_cartpole.py b/docs/source/examples/omniisaacgym/ppo_cartpole.py index f091fdf9..9a5202ea 100644 --- a/docs/source/examples/omniisaacgym/ppo_cartpole.py +++ b/docs/source/examples/omniisaacgym/ppo_cartpole.py @@ -14,7 +14,7 @@ # set the seed for reproducibility -set_seed(42) +set_seed(40) # Define the shared model (stochastic and deterministic models) for the agent using mixins. 
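The shared model referred to above combines both mixins in a single class: `act` dispatches to the mixin matching the requested `role`, and `compute` returns either the action mean or the value estimate. Roughly, the complete class after this patch looks as follows; hidden sizes are taken from the cartpole example, and the mixin `role` arguments follow skrl's shared-model convention rather than the hunks shown here.

import torch
import torch.nn as nn

from skrl.models.torch import Model, GaussianMixin, DeterministicMixin


class Shared(GaussianMixin, DeterministicMixin, Model):
    def __init__(self, observation_space, action_space, device, clip_actions=False,
                 clip_log_std=True, min_log_std=-20, max_log_std=2):
        Model.__init__(self, observation_space, action_space, device)
        GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, role="policy")
        DeterministicMixin.__init__(self, clip_actions, role="value")

        self.net = nn.Sequential(nn.Linear(self.num_observations, 32),
                                 nn.ELU(),
                                 nn.Linear(32, 32),
                                 nn.ELU())

        self.mean_layer = nn.Linear(32, self.num_actions)
        self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions))
        self.value_layer = nn.Linear(32, 1)

    def act(self, inputs, role):
        # delegate to the mixin that matches the requested role
        if role == "policy":
            return GaussianMixin.act(self, inputs, role)
        elif role == "value":
            return DeterministicMixin.act(self, inputs, role)

    def compute(self, inputs, role):
        if role == "policy":
            return self.mean_layer(self.net(inputs["states"])), self.log_std_parameter, {}
        elif role == "value":
            return self.value_layer(self.net(inputs["states"])), {}


# a single instance serves both roles ("same instance: shared model");
# env and device are created earlier in each script
models_ppo = {}
models_ppo["policy"] = Shared(env.observation_space, env.action_space, device)
models_ppo["value"] = models_ppo["policy"]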
@@ -29,23 +29,23 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, nn.ELU(), nn.Linear(32, 32), nn.ELU()) - + self.mean_layer = nn.Linear(32, self.num_actions) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - + self.value_layer = nn.Linear(32, 1) - def act(self, states, taken_actions, role): + def act(self, inputs, role): if role == "policy": - return GaussianMixin.act(self, states, taken_actions, role) + return GaussianMixin.act(self, inputs, role) elif role == "value": - return DeterministicMixin.act(self, states, taken_actions, role) + return DeterministicMixin.act(self, inputs, role) - def compute(self, states, taken_actions, role): + def compute(self, inputs, role): if role == "policy": - return self.mean_layer(self.net(states)), self.log_std_parameter + return self.mean_layer(self.net(inputs["states"])), self.log_std_parameter, {} elif role == "value": - return self.value_layer(self.net(states)) + return self.value_layer(self.net(inputs["states"])), {} # Load and wrap the Omniverse Isaac Gym environment diff --git a/docs/source/examples/omniisaacgym/ppo_cartpole_mt.py b/docs/source/examples/omniisaacgym/ppo_cartpole_mt.py index 9838c6ca..7b84a708 100644 --- a/docs/source/examples/omniisaacgym/ppo_cartpole_mt.py +++ b/docs/source/examples/omniisaacgym/ppo_cartpole_mt.py @@ -31,23 +31,23 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, nn.ELU(), nn.Linear(32, 32), nn.ELU()) - + self.mean_layer = nn.Linear(32, self.num_actions) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - + self.value_layer = nn.Linear(32, 1) - def act(self, states, taken_actions, role): + def act(self, inputs, role): if role == "policy": - return GaussianMixin.act(self, states, taken_actions, role) + return GaussianMixin.act(self, inputs, role) elif role == "value": - return DeterministicMixin.act(self, states, taken_actions, role) + return DeterministicMixin.act(self, inputs, role) - def compute(self, states, taken_actions, role): + def compute(self, inputs, role): if role == "policy": - return self.mean_layer(self.net(states)), self.log_std_parameter + return self.mean_layer(self.net(inputs["states"])), self.log_std_parameter, {} elif role == "value": - return self.value_layer(self.net(states)) + return self.value_layer(self.net(inputs["states"])), {} # Load and wrap the multi-threaded Omniverse Isaac Gym environment diff --git a/docs/source/examples/omniisaacgym/ppo_crazy_flie.py b/docs/source/examples/omniisaacgym/ppo_crazy_flie.py index b8c16fd6..682305ba 100644 --- a/docs/source/examples/omniisaacgym/ppo_crazy_flie.py +++ b/docs/source/examples/omniisaacgym/ppo_crazy_flie.py @@ -31,23 +31,23 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, nn.Tanh(), nn.Linear(256, 128), nn.Tanh()) - + self.mean_layer = nn.Linear(128, self.num_actions) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - + self.value_layer = nn.Linear(128, 1) - def act(self, states, taken_actions, role): + def act(self, inputs, role): if role == "policy": - return GaussianMixin.act(self, states, taken_actions, role) + return GaussianMixin.act(self, inputs, role) elif role == "value": - return DeterministicMixin.act(self, states, taken_actions, role) + return DeterministicMixin.act(self, inputs, role) - def compute(self, states, taken_actions, role): + def compute(self, inputs, role): if role == "policy": - return self.mean_layer(self.net(states)), self.log_std_parameter + return 
self.mean_layer(self.net(inputs["states"])), self.log_std_parameter, {} elif role == "value": - return self.value_layer(self.net(states)) + return self.value_layer(self.net(inputs["states"])), {} # Load and wrap the Omniverse Isaac Gym environment @@ -75,7 +75,7 @@ def compute(self, states, taken_actions, role): cfg_ppo = PPO_DEFAULT_CONFIG.copy() cfg_ppo["rollouts"] = 16 # memory_size cfg_ppo["learning_epochs"] = 8 -cfg_ppo["mini_batches"] = 4 # 16 * 4096 / 16384 +cfg_ppo["mini_batches"] = 4 # 16 * 4096 / 16384 cfg_ppo["discount_factor"] = 0.99 cfg_ppo["lambda"] = 0.95 cfg_ppo["learning_rate"] = 1e-4 @@ -100,9 +100,9 @@ def compute(self, states, taken_actions, role): cfg_ppo["experiment"]["checkpoint_interval"] = 800 agent = PPO(models=models_ppo, - memory=memory, - cfg=cfg_ppo, - observation_space=env.observation_space, + memory=memory, + cfg=cfg_ppo, + observation_space=env.observation_space, action_space=env.action_space, device=device) diff --git a/docs/source/examples/omniisaacgym/ppo_franka_cabinet.py b/docs/source/examples/omniisaacgym/ppo_franka_cabinet.py index 310c2e2e..83483c7e 100644 --- a/docs/source/examples/omniisaacgym/ppo_franka_cabinet.py +++ b/docs/source/examples/omniisaacgym/ppo_franka_cabinet.py @@ -31,23 +31,23 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, nn.ELU(), nn.Linear(128, 64), nn.ELU()) - + self.mean_layer = nn.Linear(64, self.num_actions) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - + self.value_layer = nn.Linear(64, 1) - def act(self, states, taken_actions, role): + def act(self, inputs, role): if role == "policy": - return GaussianMixin.act(self, states, taken_actions, role) + return GaussianMixin.act(self, inputs, role) elif role == "value": - return DeterministicMixin.act(self, states, taken_actions, role) + return DeterministicMixin.act(self, inputs, role) - def compute(self, states, taken_actions, role): + def compute(self, inputs, role): if role == "policy": - return self.mean_layer(self.net(states)), self.log_std_parameter + return self.mean_layer(self.net(inputs["states"])), self.log_std_parameter, {} elif role == "value": - return self.value_layer(self.net(states)) + return self.value_layer(self.net(inputs["states"])), {} # Load and wrap the Omniverse Isaac Gym environment @@ -75,7 +75,7 @@ def compute(self, states, taken_actions, role): cfg_ppo = PPO_DEFAULT_CONFIG.copy() cfg_ppo["rollouts"] = 16 # memory_size cfg_ppo["learning_epochs"] = 8 -cfg_ppo["mini_batches"] = 8 # 16 * 4096 / 8192 +cfg_ppo["mini_batches"] = 8 # 16 * 4096 / 8192 cfg_ppo["discount_factor"] = 0.99 cfg_ppo["lambda"] = 0.95 cfg_ppo["learning_rate"] = 5e-4 @@ -100,9 +100,9 @@ def compute(self, states, taken_actions, role): cfg_ppo["experiment"]["checkpoint_interval"] = 1200 agent = PPO(models=models_ppo, - memory=memory, - cfg=cfg_ppo, - observation_space=env.observation_space, + memory=memory, + cfg=cfg_ppo, + observation_space=env.observation_space, action_space=env.action_space, device=device) diff --git a/docs/source/examples/omniisaacgym/ppo_humanoid.py b/docs/source/examples/omniisaacgym/ppo_humanoid.py index cf45a8a1..610324fb 100644 --- a/docs/source/examples/omniisaacgym/ppo_humanoid.py +++ b/docs/source/examples/omniisaacgym/ppo_humanoid.py @@ -31,23 +31,23 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, nn.ELU(), nn.Linear(200, 100), nn.ELU()) - + self.mean_layer = nn.Linear(100, self.num_actions) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - + 
self.value_layer = nn.Linear(100, 1) - def act(self, states, taken_actions, role): + def act(self, inputs, role): if role == "policy": - return GaussianMixin.act(self, states, taken_actions, role) + return GaussianMixin.act(self, inputs, role) elif role == "value": - return DeterministicMixin.act(self, states, taken_actions, role) + return DeterministicMixin.act(self, inputs, role) - def compute(self, states, taken_actions, role): + def compute(self, inputs, role): if role == "policy": - return self.mean_layer(self.net(states)), self.log_std_parameter + return self.mean_layer(self.net(inputs["states"])), self.log_std_parameter, {} elif role == "value": - return self.value_layer(self.net(states)) + return self.value_layer(self.net(inputs["states"])), {} # Load and wrap the Omniverse Isaac Gym environment @@ -66,7 +66,7 @@ def compute(self, states, taken_actions, role): # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models models_ppo = {} models_ppo["policy"] = Shared(env.observation_space, env.action_space, device) -models_ppo["value"] = models_ppo["policy"] # same instance: shared model +models_ppo["value"] = models_ppo["policy"] # same instance: shared model # Configure and instantiate the agent. @@ -100,9 +100,9 @@ def compute(self, states, taken_actions, role): cfg_ppo["experiment"]["checkpoint_interval"] = 1600 agent = PPO(models=models_ppo, - memory=memory, - cfg=cfg_ppo, - observation_space=env.observation_space, + memory=memory, + cfg=cfg_ppo, + observation_space=env.observation_space, action_space=env.action_space, device=device) diff --git a/docs/source/examples/omniisaacgym/ppo_ingenuity.py b/docs/source/examples/omniisaacgym/ppo_ingenuity.py index 345ec93c..c92e954a 100644 --- a/docs/source/examples/omniisaacgym/ppo_ingenuity.py +++ b/docs/source/examples/omniisaacgym/ppo_ingenuity.py @@ -31,23 +31,23 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, nn.ELU(), nn.Linear(256, 128), nn.ELU()) - + self.mean_layer = nn.Linear(128, self.num_actions) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - + self.value_layer = nn.Linear(128, 1) - def act(self, states, taken_actions, role): + def act(self, inputs, role): if role == "policy": - return GaussianMixin.act(self, states, taken_actions, role) + return GaussianMixin.act(self, inputs, role) elif role == "value": - return DeterministicMixin.act(self, states, taken_actions, role) + return DeterministicMixin.act(self, inputs, role) - def compute(self, states, taken_actions, role): + def compute(self, inputs, role): if role == "policy": - return self.mean_layer(self.net(states)), self.log_std_parameter + return self.mean_layer(self.net(inputs["states"])), self.log_std_parameter, {} elif role == "value": - return self.value_layer(self.net(states)) + return self.value_layer(self.net(inputs["states"])), {} # Load and wrap the Omniverse Isaac Gym environment @@ -100,9 +100,9 @@ def compute(self, states, taken_actions, role): cfg_ppo["experiment"]["checkpoint_interval"] = 320 agent = PPO(models=models_ppo, - memory=memory, - cfg=cfg_ppo, - observation_space=env.observation_space, + memory=memory, + cfg=cfg_ppo, + observation_space=env.observation_space, action_space=env.action_space, device=device) diff --git a/docs/source/examples/omniisaacgym/ppo_quadcopter.py b/docs/source/examples/omniisaacgym/ppo_quadcopter.py index 2425c4f2..1839a8ee 100644 --- a/docs/source/examples/omniisaacgym/ppo_quadcopter.py +++ 
b/docs/source/examples/omniisaacgym/ppo_quadcopter.py @@ -31,23 +31,23 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, nn.ELU(), nn.Linear(256, 128), nn.ELU()) - + self.mean_layer = nn.Linear(128, self.num_actions) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - + self.value_layer = nn.Linear(128, 1) - def act(self, states, taken_actions, role): + def act(self, inputs, role): if role == "policy": - return GaussianMixin.act(self, states, taken_actions, role) + return GaussianMixin.act(self, inputs, role) elif role == "value": - return DeterministicMixin.act(self, states, taken_actions, role) + return DeterministicMixin.act(self, inputs, role) - def compute(self, states, taken_actions, role): + def compute(self, inputs, role): if role == "policy": - return self.mean_layer(self.net(states)), self.log_std_parameter + return self.mean_layer(self.net(inputs["states"])), self.log_std_parameter, {} elif role == "value": - return self.value_layer(self.net(states)) + return self.value_layer(self.net(inputs["states"])), {} # Load and wrap the Omniverse Isaac Gym environment diff --git a/docs/source/examples/omniisaacgym/ppo_shadow_hand.py b/docs/source/examples/omniisaacgym/ppo_shadow_hand.py index 4908a92c..125c9e7f 100644 --- a/docs/source/examples/omniisaacgym/ppo_shadow_hand.py +++ b/docs/source/examples/omniisaacgym/ppo_shadow_hand.py @@ -33,23 +33,23 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, nn.ELU(), nn.Linear(256, 128), nn.ELU()) - + self.mean_layer = nn.Linear(128, self.num_actions) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - + self.value_layer = nn.Linear(128, 1) - def act(self, states, taken_actions, role): + def act(self, inputs, role): if role == "policy": - return GaussianMixin.act(self, states, taken_actions, role) + return GaussianMixin.act(self, inputs, role) elif role == "value": - return DeterministicMixin.act(self, states, taken_actions, role) + return DeterministicMixin.act(self, inputs, role) - def compute(self, states, taken_actions, role): + def compute(self, inputs, role): if role == "policy": - return self.mean_layer(self.net(states)), self.log_std_parameter + return self.mean_layer(self.net(inputs["states"])), self.log_std_parameter, {} elif role == "value": - return self.value_layer(self.net(states)) + return self.value_layer(self.net(inputs["states"])), {} # Load and wrap the Omniverse Isaac Gym environment @@ -68,7 +68,7 @@ def compute(self, states, taken_actions, role): # https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models models_ppo = {} models_ppo["policy"] = Shared(env.observation_space, env.action_space, device) -models_ppo["value"] = models_ppo["policy"] # same instance: shared model +models_ppo["value"] = models_ppo["policy"] # same instance: shared model # Configure and instantiate the agent. 
@@ -102,9 +102,9 @@ def compute(self, states, taken_actions, role): cfg_ppo["experiment"]["checkpoint_interval"] = 8000 agent = PPO(models=models_ppo, - memory=memory, - cfg=cfg_ppo, - observation_space=env.observation_space, + memory=memory, + cfg=cfg_ppo, + observation_space=env.observation_space, action_space=env.action_space, device=device) diff --git a/docs/source/examples/real_world/franka_emika_panda/reaching_franka_isaacgym_env.py b/docs/source/examples/real_world/franka_emika_panda/reaching_franka_isaacgym_env.py index b4a19b7a..848edbb5 100644 --- a/docs/source/examples/real_world/franka_emika_panda/reaching_franka_isaacgym_env.py +++ b/docs/source/examples/real_world/franka_emika_panda/reaching_franka_isaacgym_env.py @@ -13,43 +13,43 @@ from skrl.utils import isaacgym_utils -TASK_CFG = {"name": "ReachingFranka", - "physics_engine": "physx", - "rl_device": "cuda:0", - "sim_device": "cuda:0", - "graphics_device_id": 0, - "headless": False, - "virtual_screen_capture": False, +TASK_CFG = {"name": "ReachingFranka", + "physics_engine": "physx", + "rl_device": "cuda:0", + "sim_device": "cuda:0", + "graphics_device_id": 0, + "headless": False, + "virtual_screen_capture": False, "force_render": True, - "env": {"numEnvs": 1024, - "envSpacing": 1.5, - "episodeLength": 100, - "enableDebugVis": False, - "clipObservations": 1000.0, - "clipActions": 1.0, - "controlFrequencyInv": 4, - "actionScale": 2.5, - "dofVelocityScale": 0.1, + "env": {"numEnvs": 1024, + "envSpacing": 1.5, + "episodeLength": 100, + "enableDebugVis": False, + "clipObservations": 1000.0, + "clipActions": 1.0, + "controlFrequencyInv": 4, + "actionScale": 2.5, + "dofVelocityScale": 0.1, "controlSpace": "cartesian", - "enableCameraSensors": False}, + "enableCameraSensors": False}, "sim": {"dt": 0.0083, # 1 / 120 - "substeps": 1, - "up_axis": "z", - "use_gpu_pipeline": True, - "gravity": [0.0, 0.0, -9.81], - "physx": {"num_threads": 4, - "solver_type": 1, - "use_gpu": True, - "num_position_iterations": 4, - "num_velocity_iterations": 1, - "contact_offset": 0.005, - "rest_offset": 0.0, - "bounce_threshold_velocity": 0.2, - "max_depenetration_velocity": 1000.0, - "default_buffer_size_multiplier": 5.0, - "max_gpu_contact_pairs": 1048576, - "num_subscenes": 4, - "contact_collection": 0}}, + "substeps": 1, + "up_axis": "z", + "use_gpu_pipeline": True, + "gravity": [0.0, 0.0, -9.81], + "physx": {"num_threads": 4, + "solver_type": 1, + "use_gpu": True, + "num_position_iterations": 4, + "num_velocity_iterations": 1, + "contact_offset": 0.005, + "rest_offset": 0.0, + "bounce_threshold_velocity": 0.2, + "max_depenetration_velocity": 1000.0, + "default_buffer_size_multiplier": 5.0, + "max_gpu_contact_pairs": 1048576, + "num_subscenes": 4, + "contact_collection": 0}}, "task": {"randomize": False}} @@ -84,12 +84,12 @@ def __init__(self, cfg): self._end_effector_link = "panda_leftfinger" # setup VecTask - super().__init__(config=self.cfg, - rl_device=rl_device, - sim_device=sim_device, - graphics_device_id=graphics_device_id, - headless=headless, - virtual_screen_capture=virtual_screen_capture, + super().__init__(config=self.cfg, + rl_device=rl_device, + sim_device=sim_device, + graphics_device_id=graphics_device_id, + headless=headless, + virtual_screen_capture=virtual_screen_capture, force_render=force_render) # tensors and views: DOFs, roots, rigid bodies @@ -195,7 +195,7 @@ def _create_envs(self, num_envs, spacing, num_per_row): self.handle_targets = [] self.handle_robots = [] self.handle_envs = [] - + indexes_sim_robot = [] 
indexes_sim_target = [] @@ -208,10 +208,10 @@ def _create_envs(self, num_envs, spacing, num_per_row): pose.p = gymapi.Vec3(0.0, 0.0, 0.0) pose.r = gymapi.Quat(0.0, 0.0, 0.0, 1) - robot_actor = self.gym.create_actor(env=env_ptr, - asset=robot_asset, + robot_actor = self.gym.create_actor(env=env_ptr, + asset=robot_asset, pose=pose, - name="robot", + name="robot", group=i, # collision group filter=1, # mask off collision segmentationId=0) @@ -224,9 +224,9 @@ def _create_envs(self, num_envs, spacing, num_per_row): pose.r = gymapi.Quat(0.0, 0.0, 0.0, 1) target_actor = self.gym.create_actor(env=env_ptr, - asset=target_asset, + asset=target_asset, pose=pose, - name="target", + name="target", group=i + 1, # collision group filter=1, # mask off collision segmentationId=1) @@ -240,7 +240,7 @@ def _create_envs(self, num_envs, spacing, num_per_row): self.indexes_sim_robot = torch.tensor(indexes_sim_robot, dtype=torch.int32, device=self.device) self.indexes_sim_target = torch.tensor(indexes_sim_target, dtype=torch.int32, device=self.device) - + self.num_robot_dofs = self.gym.get_asset_dof_count(robot_asset) self.rigid_body_dict_robot = self.gym.get_asset_rigid_body_dict(robot_asset) @@ -301,7 +301,7 @@ def reset_idx(self, env_ids): pos = torch.clamp(self.robot_default_dof_pos.unsqueeze(0) + 0.25 * (torch.rand((len(env_ids), self.num_robot_dofs), device=self.device) - 0.5), self.robot_dof_lower_limits, self.robot_dof_upper_limits) pos[:, 7:] = 0 - + self.robot_dof_targets[env_ids, :] = pos[:] self.dof_pos[env_ids, :] = pos[:] self.dof_vel[env_ids, :] = 0 @@ -309,14 +309,14 @@ def reset_idx(self, env_ids): indexes = self.indexes_sim_robot[env_ids] self.gym.set_dof_position_target_tensor_indexed(self.sim, gymtorch.unwrap_tensor(self.robot_dof_targets), - gymtorch.unwrap_tensor(indexes), + gymtorch.unwrap_tensor(indexes), len(env_ids)) self.gym.set_dof_state_tensor_indexed(self.sim, gymtorch.unwrap_tensor(self.dof_state), - gymtorch.unwrap_tensor(indexes), + gymtorch.unwrap_tensor(indexes), len(env_ids)) - + # reset targets pos = (torch.rand((len(env_ids), 3), device=self.device) - 0.5) * 2 pos[:, 0] = 0.50 + pos[:, 0] * 0.25 @@ -328,7 +328,7 @@ def reset_idx(self, env_ids): indexes = self.indexes_sim_target[env_ids] self.gym.set_actor_root_state_tensor_indexed(self.sim, gymtorch.unwrap_tensor(self.root_state), - gymtorch.unwrap_tensor(indexes), + gymtorch.unwrap_tensor(indexes), len(env_ids)) # bookkeeping diff --git a/docs/source/examples/real_world/franka_emika_panda/reaching_franka_isaacgym_skrl_eval.py b/docs/source/examples/real_world/franka_emika_panda/reaching_franka_isaacgym_skrl_eval.py index cf30f686..fb1de758 100644 --- a/docs/source/examples/real_world/franka_emika_panda/reaching_franka_isaacgym_skrl_eval.py +++ b/docs/source/examples/real_world/franka_emika_panda/reaching_franka_isaacgym_skrl_eval.py @@ -11,7 +11,7 @@ from skrl.envs.torch import wrap_env -# Define only the policy for evaluation +# Define only the policy for evaluation class Policy(GaussianMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False, clip_log_std=True, min_log_std=-20, max_log_std=2): @@ -27,8 +27,8 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, nn.Linear(64, self.num_actions)) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - def compute(self, states, taken_actions, role): - return self.net(states), self.log_std_parameter + def compute(self, inputs, role): + return self.net(inputs["states"]), self.log_std_parameter, {} # 
instantiate and configure the task @@ -68,9 +68,9 @@ def compute(self, states, taken_actions, role): cfg_ppo["experiment"]["checkpoint_interval"] = 0 agent = PPO(models=models_ppo, - memory=None, - cfg=cfg_ppo, - observation_space=env.observation_space, + memory=None, + cfg=cfg_ppo, + observation_space=env.observation_space, action_space=env.action_space, device=device) diff --git a/docs/source/examples/real_world/franka_emika_panda/reaching_franka_isaacgym_skrl_train.py b/docs/source/examples/real_world/franka_emika_panda/reaching_franka_isaacgym_skrl_train.py index 841129aa..cbbec60b 100644 --- a/docs/source/examples/real_world/franka_emika_panda/reaching_franka_isaacgym_skrl_train.py +++ b/docs/source/examples/real_world/franka_emika_panda/reaching_franka_isaacgym_skrl_train.py @@ -36,8 +36,8 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, nn.Linear(64, self.num_actions)) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - def compute(self, states, taken_actions, role): - return self.net(states), self.log_std_parameter + def compute(self, inputs, role): + return self.net(inputs["states"]), self.log_std_parameter, {} class Value(DeterministicMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False): @@ -52,8 +52,8 @@ def __init__(self, observation_space, action_space, device, clip_actions=False): nn.ELU(), nn.Linear(64, 1)) - def compute(self, states, taken_actions, role): - return self.net(states) + def compute(self, inputs, role): + return self.net(inputs["states"]), {} # instantiate and configure the task @@ -91,7 +91,7 @@ def compute(self, states, taken_actions, role): cfg_ppo = PPO_DEFAULT_CONFIG.copy() cfg_ppo["rollouts"] = 16 cfg_ppo["learning_epochs"] = 8 -cfg_ppo["mini_batches"] = 8 +cfg_ppo["mini_batches"] = 8 cfg_ppo["discount_factor"] = 0.99 cfg_ppo["lambda"] = 0.95 cfg_ppo["learning_rate"] = 5e-4 @@ -115,9 +115,9 @@ def compute(self, states, taken_actions, role): cfg_ppo["experiment"]["checkpoint_interval"] = 250 agent = PPO(models=models_ppo, - memory=memory, - cfg=cfg_ppo, - observation_space=env.observation_space, + memory=memory, + cfg=cfg_ppo, + observation_space=env.observation_space, action_space=env.action_space, device=device) diff --git a/docs/source/examples/real_world/franka_emika_panda/reaching_franka_omniverse_isaacgym_env.py b/docs/source/examples/real_world/franka_emika_panda/reaching_franka_omniverse_isaacgym_env.py index d530abc7..2ee17f87 100644 --- a/docs/source/examples/real_world/franka_emika_panda/reaching_franka_omniverse_isaacgym_env.py +++ b/docs/source/examples/real_world/franka_emika_panda/reaching_franka_omniverse_isaacgym_env.py @@ -4,9 +4,9 @@ from omniisaacgymenvs.tasks.base.rl_task import RLTask from omniisaacgymenvs.robots.articulations.franka import Franka as Robot -from omni.isaac.core.prims import RigidPrimView, XFormPrimView +from omni.isaac.core.prims import RigidPrimView from omni.isaac.core.articulations import ArticulationView -from omni.isaac.core.objects import VisualSphere +from omni.isaac.core.objects import DynamicSphere from omni.isaac.core.utils.prims import get_prim_at_path from skrl.utils import omniverse_isaacgym_utils @@ -19,67 +19,79 @@ # - get_extras() -TASK_CFG = {"test": False, - "device_id": 0, +TASK_CFG = {"test": False, + "device_id": 0, "headless": True, - "sim_device": "gpu", - "task": {"name": "ReachingFranka", - "physics_engine": "physx", - "env": {"numEnvs": 1024, - "envSpacing": 1.5, - "episodeLength": 100, - 
"enableDebugVis": False, - "clipObservations": 1000.0, - "clipActions": 1.0, - "controlFrequencyInv": 4, - "actionScale": 2.5, - "dofVelocityScale": 0.1, - "controlSpace": "cartesian"}, + "sim_device": "gpu", + "task": {"name": "ReachingFranka", + "physics_engine": "physx", + "env": {"numEnvs": 1024, + "envSpacing": 1.5, + "episodeLength": 100, + "enableDebugVis": False, + "clipObservations": 1000.0, + "clipActions": 1.0, + "controlFrequencyInv": 4, + "actionScale": 2.5, + "dofVelocityScale": 0.1, + "controlSpace": "cartesian"}, "sim": {"dt": 0.0083, # 1 / 120 - "use_gpu_pipeline": True, - "gravity": [0.0, 0.0, -9.81], - "add_ground_plane": True, - "use_flatcache": True, - "enable_scene_query_support": False, - "enable_cameras": False, - "default_physics_material": {"static_friction": 1.0, - "dynamic_friction": 1.0, - "restitution": 0.0}, - "physx": {"worker_thread_count": 4, - "solver_type": 1, - "use_gpu": True, - "solver_position_iteration_count": 4, - "solver_velocity_iteration_count": 1, - "contact_offset": 0.005, - "rest_offset": 0.0, - "bounce_threshold_velocity": 0.2, - "friction_offset_threshold": 0.04, - "friction_correlation_distance": 0.025, - "enable_sleeping": True, - "enable_stabilization": True, - "max_depenetration_velocity": 1000.0, - "gpu_max_rigid_contact_count": 524288, - "gpu_max_rigid_patch_count": 33554432, - "gpu_found_lost_pairs_capacity": 524288, - "gpu_found_lost_aggregate_pairs_capacity": 262144, - "gpu_total_aggregate_pairs_capacity": 1048576, - "gpu_max_soft_body_contacts": 1048576, - "gpu_max_particle_contacts": 1048576, - "gpu_heap_capacity": 33554432, - "gpu_temp_buffer_capacity": 16777216, - "gpu_max_num_partitions": 8}, - "robot": {"override_usd_defaults": False, - "fixed_base": False, - "enable_self_collisions": False, - "enable_gyroscopic_forces": True, - "solver_position_iteration_count": 4, - "solver_velocity_iteration_count": 1, - "sleep_threshold": 0.005, - "stabilization_threshold": 0.001, - "density": -1, - "max_depenetration_velocity": 1000.0, - "contact_offset": 0.005, - "rest_offset": 0.0}}}} + "use_gpu_pipeline": True, + "gravity": [0.0, 0.0, -9.81], + "add_ground_plane": True, + "use_flatcache": True, + "enable_scene_query_support": False, + "enable_cameras": False, + "default_physics_material": {"static_friction": 1.0, + "dynamic_friction": 1.0, + "restitution": 0.0}, + "physx": {"worker_thread_count": 4, + "solver_type": 1, + "use_gpu": True, + "solver_position_iteration_count": 4, + "solver_velocity_iteration_count": 1, + "contact_offset": 0.005, + "rest_offset": 0.0, + "bounce_threshold_velocity": 0.2, + "friction_offset_threshold": 0.04, + "friction_correlation_distance": 0.025, + "enable_sleeping": True, + "enable_stabilization": True, + "max_depenetration_velocity": 1000.0, + "gpu_max_rigid_contact_count": 524288, + "gpu_max_rigid_patch_count": 33554432, + "gpu_found_lost_pairs_capacity": 524288, + "gpu_found_lost_aggregate_pairs_capacity": 262144, + "gpu_total_aggregate_pairs_capacity": 1048576, + "gpu_max_soft_body_contacts": 1048576, + "gpu_max_particle_contacts": 1048576, + "gpu_heap_capacity": 33554432, + "gpu_temp_buffer_capacity": 16777216, + "gpu_max_num_partitions": 8}, + "robot": {"override_usd_defaults": False, + "fixed_base": False, + "enable_self_collisions": False, + "enable_gyroscopic_forces": True, + "solver_position_iteration_count": 4, + "solver_velocity_iteration_count": 1, + "sleep_threshold": 0.005, + "stabilization_threshold": 0.001, + "density": -1, + "max_depenetration_velocity": 1000.0, + "contact_offset": 
0.005, + "rest_offset": 0.0}, + "target": {"override_usd_defaults": False, + "fixed_base": True, + "enable_self_collisions": False, + "enable_gyroscopic_forces": True, + "solver_position_iteration_count": 4, + "solver_velocity_iteration_count": 1, + "sleep_threshold": 0.005, + "stabilization_threshold": 0.001, + "density": -1, + "max_depenetration_velocity": 1000.0, + "contact_offset": 0.005, + "rest_offset": 0.0}}}} class RobotView(ArticulationView): @@ -118,7 +130,7 @@ def __init__(self, name, sim_config, env, offset=None) -> None: def set_up_scene(self, scene) -> None: self.get_robot() self.get_target() - + super().set_up_scene(scene) # robot view @@ -132,23 +144,24 @@ def set_up_scene(self, scene) -> None: self._hands = RigidPrimView(prim_paths_expr="/World/envs/.*/robot/panda_hand", name="hand_view", reset_xform_properties=False) scene.add(self._hands) # target view - self._targets = XFormPrimView(prim_paths_expr="/World/envs/.*/target", name="target_view", reset_xform_properties=False) + self._targets = RigidPrimView(prim_paths_expr="/World/envs/.*/target", name="target_view", reset_xform_properties=False) scene.add(self._targets) - + self.init_data() def get_robot(self): - robot = Robot(prim_path=self.default_zero_env_path + "/robot", - translation=torch.tensor([0.0, 0.0, 0.0]), + robot = Robot(prim_path=self.default_zero_env_path + "/robot", + translation=torch.tensor([0.0, 0.0, 0.0]), orientation=torch.tensor([1.0, 0.0, 0.0, 0.0]), name="robot") self._sim_config.apply_articulation_settings("robot", get_prim_at_path(robot.prim_path), self._sim_config.parse_actor_config("robot")) def get_target(self): - target = VisualSphere(prim_path=self.default_zero_env_path + "/target", - name="target", - radius=0.025, - color=torch.tensor([1, 0, 0])) + target = DynamicSphere(prim_path=self.default_zero_env_path + "/target", + name="target", + radius=0.025, + color=torch.tensor([1, 0, 0])) + self._sim_config.apply_articulation_settings("target", get_prim_at_path(target.prim_path), self._sim_config.parse_actor_config("target")) target.set_collision_enabled(False) def init_data(self) -> None: @@ -162,8 +175,8 @@ def init_data(self) -> None: def get_observations(self) -> dict: robot_dof_pos = self._robots.get_joint_positions(clone=False) robot_dof_vel = self._robots.get_joint_velocities(clone=False) - end_effector_pos, end_effector_rot = self._end_effectors.get_local_poses() - target_pos, target_rot = self._targets.get_local_poses() + end_effector_pos, end_effector_rot = self._end_effectors.get_world_poses(clone=False) + target_pos, target_rot = self._targets.get_world_poses(clone=False) dof_pos_scaled = 2.0 * (robot_dof_pos - self.robot_dof_lower_limits) \ / (self.robot_dof_upper_limits - self.robot_dof_lower_limits) - 1.0 @@ -174,14 +187,15 @@ def get_observations(self) -> dict: self.obs_buf[:, 0] = self.progress_buf / self._max_episode_length self.obs_buf[:, 1:8] = dof_pos_scaled[:, :7] self.obs_buf[:, 8:15] = dof_vel_scaled[:, :7] * generalization_noise - self.obs_buf[:, 15:18] = target_pos + self.obs_buf[:, 15:18] = target_pos - self._env_pos # compute distance for calculate_metrics() and is_done() self._computed_distance = torch.norm(end_effector_pos - target_pos, dim=-1) if self._control_space == "cartesian": self.jacobians = self._robots.get_jacobians(clone=False) - self.hand_pos, self.hand_rot = self._hands.get_local_poses() + self.hand_pos, self.hand_rot = self._hands.get_world_poses(clone=False) + self.hand_pos -= self._env_pos return {self._robots.name: {"obs_buf": self.obs_buf}} @@ 
-195,7 +209,7 @@ def pre_physics_step(self, actions) -> None: if self._control_space == "joint": targets = self.robot_dof_targets[:, :7] + self.robot_dof_speed_scales[:7] * self.dt * self.actions * self._action_scale - + elif self._control_space == "cartesian": goal_position = self.hand_pos + actions / 100.0 delta_dof_pos = omniverse_isaacgym_utils.ik(jacobian_end_effector=self.jacobians[:, 8 - 1, :, :7], # franka hand index: 8 @@ -227,12 +241,11 @@ def reset_idx(self, env_ids) -> None: self._robots.set_joint_velocities(dof_vel, indices=indices) # reset target - pos = (torch.rand((len(env_ids), 3), device=self._device) - 0.5) * 2 - pos[:, 0] = 0.50 + pos[:, 0] * 0.25 - pos[:, 1] = 0.00 + pos[:, 1] * 0.25 - pos[:, 2] = 0.20 + pos[:, 2] * 0.10 + pos = (torch.rand((len(env_ids), 3), device=self._device) - 0.5) * 2 \ + * torch.tensor([0.25, 0.25, 0.10], device=self._device) \ + + torch.tensor([0.50, 0.00, 0.20], device=self._device) - self._targets.set_local_poses(pos, indices=indices) + self._targets.set_world_poses(pos + self._env_pos[env_ids], indices=indices) # bookkeeping self.reset_buf[env_ids] = 0 diff --git a/docs/source/examples/real_world/franka_emika_panda/reaching_franka_omniverse_isaacgym_skrl_eval.py b/docs/source/examples/real_world/franka_emika_panda/reaching_franka_omniverse_isaacgym_skrl_eval.py index f9283779..50f5ae9b 100644 --- a/docs/source/examples/real_world/franka_emika_panda/reaching_franka_omniverse_isaacgym_skrl_eval.py +++ b/docs/source/examples/real_world/franka_emika_panda/reaching_franka_omniverse_isaacgym_skrl_eval.py @@ -10,7 +10,7 @@ from skrl.envs.torch import wrap_env -# Define only the policy for evaluation +# Define only the policy for evaluation class Policy(GaussianMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False, clip_log_std=True, min_log_std=-20, max_log_std=2): @@ -26,16 +26,16 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, nn.Linear(64, self.num_actions)) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - def compute(self, states, taken_actions, role): - return self.net(states), self.log_std_parameter + def compute(self, inputs, role): + return self.net(inputs["states"]), self.log_std_parameter, {} # instance VecEnvBase and setup task headless = True # set headless to False for rendering -env = get_env_instance(headless=headless) +env = get_env_instance(headless=headless) from omniisaacgymenvs.utils.config_utils.sim_config import SimConfig -from reaching_franka_sim_env import ReachingFrankaTask, TASK_CFG +from reaching_franka_omniverse_isaacgym_env import ReachingFrankaTask, TASK_CFG TASK_CFG["headless"] = headless TASK_CFG["task"]["env"]["numEnvs"] = 64 @@ -71,9 +71,9 @@ def compute(self, states, taken_actions, role): cfg_ppo["experiment"]["checkpoint_interval"] = 0 agent = PPO(models=models_ppo, - memory=None, - cfg=cfg_ppo, - observation_space=env.observation_space, + memory=None, + cfg=cfg_ppo, + observation_space=env.observation_space, action_space=env.action_space, device=device) diff --git a/docs/source/examples/real_world/franka_emika_panda/reaching_franka_omniverse_isaacgym_skrl_train.py b/docs/source/examples/real_world/franka_emika_panda/reaching_franka_omniverse_isaacgym_skrl_train.py index b9a31852..899df201 100644 --- a/docs/source/examples/real_world/franka_emika_panda/reaching_franka_omniverse_isaacgym_skrl_train.py +++ b/docs/source/examples/real_world/franka_emika_panda/reaching_franka_omniverse_isaacgym_skrl_train.py @@ -35,8 
+35,8 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, nn.Linear(64, self.num_actions)) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - def compute(self, states, taken_actions, role): - return self.net(states), self.log_std_parameter + def compute(self, inputs, role): + return self.net(inputs["states"]), self.log_std_parameter, {} class Value(DeterministicMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False): @@ -51,16 +51,16 @@ def __init__(self, observation_space, action_space, device, clip_actions=False): nn.ELU(), nn.Linear(64, 1)) - def compute(self, states, taken_actions, role): - return self.net(states) + def compute(self, inputs, role): + return self.net(inputs["states"]), {} # instance VecEnvBase and setup task headless = True # set headless to False for rendering -env = get_env_instance(headless=headless) +env = get_env_instance(headless=headless) from omniisaacgymenvs.utils.config_utils.sim_config import SimConfig -from reaching_franka_sim_env import ReachingFrankaTask, TASK_CFG +from reaching_franka_omniverse_isaacgym_env import ReachingFrankaTask, TASK_CFG TASK_CFG["headless"] = headless TASK_CFG["task"]["env"]["numEnvs"] = 1024 @@ -94,7 +94,7 @@ def compute(self, states, taken_actions, role): cfg_ppo = PPO_DEFAULT_CONFIG.copy() cfg_ppo["rollouts"] = 16 cfg_ppo["learning_epochs"] = 8 -cfg_ppo["mini_batches"] = 8 +cfg_ppo["mini_batches"] = 8 cfg_ppo["discount_factor"] = 0.99 cfg_ppo["lambda"] = 0.95 cfg_ppo["learning_rate"] = 5e-4 @@ -118,9 +118,9 @@ def compute(self, states, taken_actions, role): cfg_ppo["experiment"]["checkpoint_interval"] = 250 agent = PPO(models=models_ppo, - memory=memory, - cfg=cfg_ppo, - observation_space=env.observation_space, + memory=memory, + cfg=cfg_ppo, + observation_space=env.observation_space, action_space=env.action_space, device=device) diff --git a/docs/source/examples/real_world/franka_emika_panda/reaching_franka_real_env.py b/docs/source/examples/real_world/franka_emika_panda/reaching_franka_real_env.py index 8afb649d..06c0d20c 100644 --- a/docs/source/examples/real_world/franka_emika_panda/reaching_franka_real_env.py +++ b/docs/source/examples/real_world/franka_emika_panda/reaching_franka_real_env.py @@ -17,7 +17,7 @@ def __init__(self, robot_ip="172.16.0.2", device="cuda:0", control_space="joint" self.motion_type = motion_type # waypoint or impedance if self.control_space == "cartesian" and self.motion_type == "impedance": - # The operation of this mode (Cartesian-impedance) was adjusted later without being able to test it on the real robot. + # The operation of this mode (Cartesian-impedance) was adjusted later without being able to test it on the real robot. # Dangerous movements may occur for the operator and the robot. # Comment the following line of code if you want to proceed with this mode. 
raise ValueError("See comment in the code to proceed with this mode") @@ -42,7 +42,7 @@ def __init__(self, robot_ip="172.16.0.2", device="cuda:0", control_space="joint" self.robot = frankx.Robot(robot_ip) self.robot.set_default_behavior() self.robot.recover_from_errors() - + # the robot's response can be better managed by independently setting the following properties, for example: # - self.robot.velocity_rel = 0.2 # - self.robot.acceleration_rel = 0.1 @@ -149,12 +149,12 @@ def reset(self): self.motion_thread.join() self.motion = None self.motion_thread = None - + # open/close gripper # self.gripper.open() # self.gripper.clamp() - # go to 1) safe position, 2) random position + # go to 1) safe position, 2) random position self.robot.move(frankx.JointMotion(self.robot_default_dof_pos.tolist())) dof_pos = self.robot_default_dof_pos + 0.25 * (np.random.rand(7) - 0.5) self.robot.move(frankx.JointMotion(dof_pos.tolist())) @@ -178,7 +178,7 @@ def reset(self): # initial pose affine = frankx.Affine(frankx.Kinematics.forward(dof_pos.tolist())) affine = affine * frankx.Affine(x=0, y=0, z=-0.10335, a=np.pi/2) - + # motion type if self.motion_type == "waypoint": self.motion = frankx.WaypointMotion([frankx.Waypoint(affine)], return_when_finished=False) @@ -186,7 +186,7 @@ def reset(self): self.motion = frankx.ImpedanceMotion(500, 50) else: raise ValueError("Invalid motion type:", self.motion_type) - + self.motion_thread = self.robot.move_async(self.motion) if self.motion_type == "impedance": self.motion.target = affine @@ -200,7 +200,7 @@ def reset(self): return observation else: return observation, {} - + def step(self, action): self.progress_buf += 1 diff --git a/docs/source/examples/real_world/franka_emika_panda/reaching_franka_real_skrl_eval.py b/docs/source/examples/real_world/franka_emika_panda/reaching_franka_real_skrl_eval.py index 0d64487e..adcd1158 100644 --- a/docs/source/examples/real_world/franka_emika_panda/reaching_franka_real_skrl_eval.py +++ b/docs/source/examples/real_world/franka_emika_panda/reaching_franka_real_skrl_eval.py @@ -9,7 +9,7 @@ from skrl.envs.torch import wrap_env -# Define only the policy for evaluation +# Define only the policy for evaluation class Policy(GaussianMixin, Model): def __init__(self, observation_space, action_space, device, clip_actions=False, clip_log_std=True, min_log_std=-20, max_log_std=2): @@ -25,8 +25,8 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, nn.Linear(64, self.num_actions)) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - def compute(self, states, taken_actions, role): - return self.net(states), self.log_std_parameter + def compute(self, inputs, role): + return self.net(inputs["states"]), self.log_std_parameter, {} # Load the environment @@ -34,7 +34,7 @@ def compute(self, states, taken_actions, role): control_space = "joint" # joint or cartesian motion_type = "waypoint" # waypoint or impedance -camera_tracking = False # True for USB-camera tracking +camera_tracking = False # True for USB-camera tracking env = ReachingFranka(robot_ip="172.16.0.2", device="cpu", @@ -67,9 +67,9 @@ def compute(self, states, taken_actions, role): cfg_ppo["experiment"]["checkpoint_interval"] = 0 agent = PPO(models=models_ppo, - memory=None, - cfg=cfg_ppo, - observation_space=env.observation_space, + memory=None, + cfg=cfg_ppo, + observation_space=env.observation_space, action_space=env.action_space, device=device) diff --git 
a/docs/source/examples/real_world/kuka_lbr_iiwa/reaching_iiwa_omniverse_isaacgym_env.py b/docs/source/examples/real_world/kuka_lbr_iiwa/reaching_iiwa_omniverse_isaacgym_env.py new file mode 100644 index 00000000..f29d7eba --- /dev/null +++ b/docs/source/examples/real_world/kuka_lbr_iiwa/reaching_iiwa_omniverse_isaacgym_env.py @@ -0,0 +1,271 @@ +import torch +import numpy as np + +from omniisaacgymenvs.tasks.base.rl_task import RLTask + +from omni.isaac.core.prims import RigidPrimView +from omni.isaac.core.articulations import ArticulationView +from omni.isaac.core.objects import DynamicSphere +from omni.isaac.core.utils.prims import get_prim_at_path + +from robots.iiwa14 import Iiwa14 as Robot + +from skrl.utils import omniverse_isaacgym_utils + +# post_physics_step calls +# - get_observations() +# - get_states() +# - calculate_metrics() +# - is_done() +# - get_extras() + + +TASK_CFG = {"test": False, + "device_id": 0, + "headless": True, + "sim_device": "gpu", + "task": {"name": "ReachingIiwa", + "physics_engine": "physx", + "env": {"numEnvs": 1024, + "envSpacing": 1.5, + "episodeLength": 100, + "enableDebugVis": False, + "clipObservations": 1000.0, + "clipActions": 1.0, + "controlFrequencyInv": 4, + "actionScale": 2.5, + "dofVelocityScale": 0.1, + "controlSpace": "cartesian"}, + "sim": {"dt": 0.0083, # 1 / 120 + "use_gpu_pipeline": True, + "gravity": [0.0, 0.0, -9.81], + "add_ground_plane": True, + "use_flatcache": True, + "enable_scene_query_support": False, + "enable_cameras": False, + "default_physics_material": {"static_friction": 1.0, + "dynamic_friction": 1.0, + "restitution": 0.0}, + "physx": {"worker_thread_count": 4, + "solver_type": 1, + "use_gpu": True, + "solver_position_iteration_count": 4, + "solver_velocity_iteration_count": 1, + "contact_offset": 0.005, + "rest_offset": 0.0, + "bounce_threshold_velocity": 0.2, + "friction_offset_threshold": 0.04, + "friction_correlation_distance": 0.025, + "enable_sleeping": True, + "enable_stabilization": True, + "max_depenetration_velocity": 1000.0, + "gpu_max_rigid_contact_count": 524288, + "gpu_max_rigid_patch_count": 33554432, + "gpu_found_lost_pairs_capacity": 524288, + "gpu_found_lost_aggregate_pairs_capacity": 262144, + "gpu_total_aggregate_pairs_capacity": 1048576, + "gpu_max_soft_body_contacts": 1048576, + "gpu_max_particle_contacts": 1048576, + "gpu_heap_capacity": 33554432, + "gpu_temp_buffer_capacity": 16777216, + "gpu_max_num_partitions": 8}, + "robot": {"override_usd_defaults": False, + "fixed_base": False, + "enable_self_collisions": False, + "enable_gyroscopic_forces": True, + "solver_position_iteration_count": 4, + "solver_velocity_iteration_count": 1, + "sleep_threshold": 0.005, + "stabilization_threshold": 0.001, + "density": -1, + "max_depenetration_velocity": 1000.0, + "contact_offset": 0.005, + "rest_offset": 0.0}, + "target": {"override_usd_defaults": False, + "fixed_base": True, + "enable_self_collisions": False, + "enable_gyroscopic_forces": True, + "solver_position_iteration_count": 4, + "solver_velocity_iteration_count": 1, + "sleep_threshold": 0.005, + "stabilization_threshold": 0.001, + "density": -1, + "max_depenetration_velocity": 1000.0, + "contact_offset": 0.005, + "rest_offset": 0.0}}}} + + +class RobotView(ArticulationView): + def __init__(self, prim_paths_expr: str, name: str = "robot_view") -> None: + super().__init__(prim_paths_expr=prim_paths_expr, name=name, reset_xform_properties=False) + + +class ReachingIiwaTask(RLTask): + def __init__(self, name, sim_config, env, offset=None) -> None: + 
self._sim_config = sim_config + self._cfg = sim_config.config + self._task_cfg = sim_config.task_config + + self.dt = 1 / 120.0 + + self._num_envs = self._task_cfg["env"]["numEnvs"] + self._env_spacing = self._task_cfg["env"]["envSpacing"] + self._action_scale = self._task_cfg["env"]["actionScale"] + self._dof_vel_scale = self._task_cfg["env"]["dofVelocityScale"] + self._max_episode_length = self._task_cfg["env"]["episodeLength"] + self._control_space = self._task_cfg["env"]["controlSpace"] + + # observation and action space + self._num_observations = 18 + if self._control_space == "joint": + self._num_actions = 7 + elif self._control_space == "cartesian": + self._num_actions = 3 + else: + raise ValueError("Invalid control space: {}".format(self._control_space)) + + self._end_effector_link = "iiwa_link_7" + + RLTask.__init__(self, name, env) + + def set_up_scene(self, scene) -> None: + self.get_robot() + self.get_target() + + super().set_up_scene(scene) + + # robot view + self._robots = RobotView(prim_paths_expr="/World/envs/.*/robot", name="robot_view") + scene.add(self._robots) + # end-effectors view + self._end_effectors = RigidPrimView(prim_paths_expr="/World/envs/.*/robot/{}".format(self._end_effector_link), name="end_effector_view") + scene.add(self._end_effectors) + # target view + self._targets = RigidPrimView(prim_paths_expr="/World/envs/.*/target", name="target_view", reset_xform_properties=False) + scene.add(self._targets) + + self.init_data() + + def get_robot(self): + robot = Robot(prim_path=self.default_zero_env_path + "/robot", + translation=torch.tensor([0.0, 0.0, 0.0]), + orientation=torch.tensor([1.0, 0.0, 0.0, 0.0]), + name="robot") + self._sim_config.apply_articulation_settings("robot", get_prim_at_path(robot.prim_path), self._sim_config.parse_actor_config("robot")) + + def get_target(self): + target = DynamicSphere(prim_path=self.default_zero_env_path + "/target", + name="target", + radius=0.025, + color=torch.tensor([1, 0, 0])) + self._sim_config.apply_articulation_settings("target", get_prim_at_path(target.prim_path), self._sim_config.parse_actor_config("target")) + target.set_collision_enabled(False) + + def init_data(self) -> None: + self.robot_default_dof_pos = torch.tensor(np.radians([0, 0, 0, -90, 0, 90, 0]), device=self._device, dtype=torch.float32) + self.actions = torch.zeros((self._num_envs, self.num_actions), device=self._device) + + if self._control_space == "cartesian": + self.jacobians = torch.zeros((self._num_envs, 7, 6, 7), device=self._device) + self.end_effector_pos, self.end_effector_rot = torch.zeros((self._num_envs, 3), device=self._device), torch.zeros((self._num_envs, 4), device=self._device) + + def get_observations(self) -> dict: + robot_dof_pos = self._robots.get_joint_positions(clone=False) + robot_dof_vel = self._robots.get_joint_velocities(clone=False) + end_effector_pos, end_effector_rot = self._end_effectors.get_world_poses(clone=False) + target_pos, target_rot = self._targets.get_world_poses(clone=False) + + dof_pos_scaled = 2.0 * (robot_dof_pos - self.robot_dof_lower_limits) \ + / (self.robot_dof_upper_limits - self.robot_dof_lower_limits) - 1.0 + dof_vel_scaled = robot_dof_vel * self._dof_vel_scale + + generalization_noise = torch.rand((dof_vel_scaled.shape[0], 7), device=self._device) + 0.5 + + self.obs_buf[:, 0] = self.progress_buf / self._max_episode_length + self.obs_buf[:, 1:8] = dof_pos_scaled + self.obs_buf[:, 8:15] = dof_vel_scaled * generalization_noise + self.obs_buf[:, 15:18] = target_pos - self._env_pos + + # compute 
distance for calculate_metrics() and is_done() + self._computed_distance = torch.norm(end_effector_pos - target_pos, dim=-1) + + if self._control_space == "cartesian": + self.jacobians = self._robots.get_jacobians(clone=False) + self.end_effector_pos, self.end_effector_rot = end_effector_pos, end_effector_rot + self.end_effector_pos -= self._env_pos + + return {self._robots.name: {"obs_buf": self.obs_buf}} + + def pre_physics_step(self, actions) -> None: + reset_env_ids = self.reset_buf.nonzero(as_tuple=False).squeeze(-1) + if len(reset_env_ids) > 0: + self.reset_idx(reset_env_ids) + + self.actions = actions.clone().to(self._device) + env_ids_int32 = torch.arange(self._robots.count, dtype=torch.int32, device=self._device) + + if self._control_space == "joint": + targets = self.robot_dof_targets + self.robot_dof_speed_scales * self.dt * self.actions * self._action_scale + + elif self._control_space == "cartesian": + goal_position = self.end_effector_pos + actions / 100.0 + delta_dof_pos = omniverse_isaacgym_utils.ik(jacobian_end_effector=self.jacobians[:, 7 - 1, :, :7], # iiwa_link_7 index: 7 + current_position=self.end_effector_pos, + current_orientation=self.end_effector_rot, + goal_position=goal_position, + goal_orientation=None) + targets = self.robot_dof_targets[:, :7] + delta_dof_pos + + self.robot_dof_targets = torch.clamp(targets, self.robot_dof_lower_limits, self.robot_dof_upper_limits) + self._robots.set_joint_position_targets(self.robot_dof_targets, indices=env_ids_int32) + + def reset_idx(self, env_ids) -> None: + indices = env_ids.to(dtype=torch.int32) + + # reset robot + pos = torch.clamp(self.robot_default_dof_pos.unsqueeze(0) + 0.25 * (torch.rand((len(env_ids), self.num_robot_dofs), device=self._device) - 0.5), + self.robot_dof_lower_limits, self.robot_dof_upper_limits) + dof_pos = torch.zeros((len(indices), self._robots.num_dof), device=self._device) + dof_pos[:] = pos + dof_vel = torch.zeros((len(indices), self._robots.num_dof), device=self._device) + self.robot_dof_targets[env_ids, :] = pos + self.robot_dof_pos[env_ids, :] = pos + + self._robots.set_joint_position_targets(self.robot_dof_targets[env_ids], indices=indices) + self._robots.set_joint_positions(dof_pos, indices=indices) + self._robots.set_joint_velocities(dof_vel, indices=indices) + + # reset target + pos = (torch.rand((len(env_ids), 3), device=self._device) - 0.5) * 2 \ + * torch.tensor([0.10, 0.20, 0.20], device=self._device) \ + + torch.tensor([0.60, 0.00, 0.40], device=self._device) + + self._targets.set_world_poses(pos + self._env_pos[env_ids], indices=indices) + + # bookkeeping + self.reset_buf[env_ids] = 0 + self.progress_buf[env_ids] = 0 + + def post_reset(self): + self.num_robot_dofs = self._robots.num_dof + self.robot_dof_pos = torch.zeros((self.num_envs, self.num_robot_dofs), device=self._device) + dof_limits = self._robots.get_dof_limits() + self.robot_dof_lower_limits = dof_limits[0, :, 0].to(device=self._device) + self.robot_dof_upper_limits = dof_limits[0, :, 1].to(device=self._device) + + self.robot_dof_speed_scales = torch.ones_like(self.robot_dof_lower_limits) + self.robot_dof_targets = torch.zeros((self._num_envs, self.num_robot_dofs), dtype=torch.float, device=self._device) + + # randomize all envs + indices = torch.arange(self._num_envs, dtype=torch.int64, device=self._device) + self.reset_idx(indices) + + def calculate_metrics(self) -> None: + self.rew_buf[:] = -self._computed_distance + + def is_done(self) -> None: + self.reset_buf.fill_(0) + # target reached + self.reset_buf = 
torch.where(self._computed_distance <= 0.035, torch.ones_like(self.reset_buf), self.reset_buf) + # max episode length + self.reset_buf = torch.where(self.progress_buf >= self._max_episode_length - 1, torch.ones_like(self.reset_buf), self.reset_buf) diff --git a/docs/source/examples/real_world/kuka_lbr_iiwa/reaching_iiwa_omniverse_isaacgym_skrl_eval.py b/docs/source/examples/real_world/kuka_lbr_iiwa/reaching_iiwa_omniverse_isaacgym_skrl_eval.py new file mode 100644 index 00000000..3611f86a --- /dev/null +++ b/docs/source/examples/real_world/kuka_lbr_iiwa/reaching_iiwa_omniverse_isaacgym_skrl_eval.py @@ -0,0 +1,92 @@ +import torch +import torch.nn as nn + +# Import the skrl components to build the RL system +from skrl.models.torch import Model, GaussianMixin +from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG +from skrl.resources.preprocessors.torch import RunningStandardScaler +from skrl.trainers.torch import SequentialTrainer +from skrl.utils.omniverse_isaacgym_utils import get_env_instance +from skrl.envs.torch import wrap_env + + +# Define only the policy for evaluation +class Policy(GaussianMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + clip_log_std=True, min_log_std=-20, max_log_std=2): + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) + + self.net = nn.Sequential(nn.Linear(self.num_observations, 256), + nn.ELU(), + nn.Linear(256, 128), + nn.ELU(), + nn.Linear(128, 64), + nn.ELU(), + nn.Linear(64, self.num_actions)) + self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + def compute(self, inputs, role): + return self.net(inputs["states"]), self.log_std_parameter, {} + + +# instance VecEnvBase and setup task +headless = not True # set headless to False for rendering +env = get_env_instance(headless=headless) + +from omniisaacgymenvs.utils.config_utils.sim_config import SimConfig +from reaching_iiwa_omniverse_isaacgym_env import ReachingIiwaTask, TASK_CFG + +TASK_CFG["headless"] = headless +TASK_CFG["task"]["env"]["numEnvs"] = 64 +TASK_CFG["task"]["env"]["controlSpace"] = "joint" # "joint" or "cartesian" + +sim_config = SimConfig(TASK_CFG) +task = ReachingIiwaTask(name="ReachingIiwa", sim_config=sim_config, env=env) +env.set_task(task=task, sim_params=sim_config.get_physics_params(), backend="torch", init_sim=True) + +# wrap the environment +env = wrap_env(env, "omniverse-isaacgym") + +device = env.device + + +# Instantiate the agent's policy. +# PPO requires 2 models, visit its documentation for more details +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models +models_ppo = {} +models_ppo["policy"] = Policy(env.observation_space, env.action_space, device) + + +# Configure and instantiate the agent. 
+# Only modify some of the default configuration, visit its documentation to see all the options +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters +cfg_ppo = PPO_DEFAULT_CONFIG.copy() +cfg_ppo["random_timesteps"] = 0 +cfg_ppo["learning_starts"] = 0 +cfg_ppo["state_preprocessor"] = RunningStandardScaler +cfg_ppo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device} +# logging to TensorBoard each 32 timesteps an ignore checkpoints +cfg_ppo["experiment"]["write_interval"] = 32 +cfg_ppo["experiment"]["checkpoint_interval"] = 0 + +agent = PPO(models=models_ppo, + memory=None, + cfg=cfg_ppo, + observation_space=env.observation_space, + action_space=env.action_space, + device=device) + +# load checkpoints +if TASK_CFG["task"]["env"]["controlSpace"] == "joint": + agent.load("./agent_joint.pt") +elif TASK_CFG["task"]["env"]["controlSpace"] == "cartesian": + agent.load("./agent_cartesian.pt") + + +# Configure and instantiate the RL trainer +cfg_trainer = {"timesteps": 5000, "headless": True} +trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent) + +# start evaluation +trainer.eval() diff --git a/docs/source/examples/real_world/kuka_lbr_iiwa/reaching_iiwa_omniverse_isaacgym_skrl_train.py b/docs/source/examples/real_world/kuka_lbr_iiwa/reaching_iiwa_omniverse_isaacgym_skrl_train.py new file mode 100644 index 00000000..d109085a --- /dev/null +++ b/docs/source/examples/real_world/kuka_lbr_iiwa/reaching_iiwa_omniverse_isaacgym_skrl_train.py @@ -0,0 +1,133 @@ +import torch +import torch.nn as nn + +# Import the skrl components to build the RL system +from skrl.models.torch import Model, GaussianMixin, DeterministicMixin +from skrl.memories.torch import RandomMemory +from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG +from skrl.resources.schedulers.torch import KLAdaptiveRL +from skrl.resources.preprocessors.torch import RunningStandardScaler +from skrl.trainers.torch import SequentialTrainer +from skrl.utils.omniverse_isaacgym_utils import get_env_instance +from skrl.envs.torch import wrap_env +from skrl.utils import set_seed + + +# set the seed for reproducibility +set_seed(42) + + +# Define the models (stochastic and deterministic models) for the agent using helper mixin. 
+# - Policy: takes as input the environment's observation/state and returns an action +# - Value: takes the state as input and provides a value to guide the policy +class Policy(GaussianMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + clip_log_std=True, min_log_std=-20, max_log_std=2): + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) + + self.net = nn.Sequential(nn.Linear(self.num_observations, 256), + nn.ELU(), + nn.Linear(256, 128), + nn.ELU(), + nn.Linear(128, 64), + nn.ELU(), + nn.Linear(64, self.num_actions)) + self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + def compute(self, inputs, role): + return self.net(inputs["states"]), self.log_std_parameter, {} + +class Value(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) + + self.net = nn.Sequential(nn.Linear(self.num_observations, 256), + nn.ELU(), + nn.Linear(256, 128), + nn.ELU(), + nn.Linear(128, 64), + nn.ELU(), + nn.Linear(64, 1)) + + def compute(self, inputs, role): + return self.net(inputs["states"]), {} + + +# instance VecEnvBase and setup task +headless = True # set headless to False for rendering +env = get_env_instance(headless=headless) + +from omniisaacgymenvs.utils.config_utils.sim_config import SimConfig +from reaching_iiwa_omniverse_isaacgym_env import ReachingIiwaTask, TASK_CFG + +TASK_CFG["headless"] = headless +TASK_CFG["task"]["env"]["numEnvs"] = 1024 +TASK_CFG["task"]["env"]["controlSpace"] = "joint" # "joint" or "cartesian" + +sim_config = SimConfig(TASK_CFG) +task = ReachingIiwaTask(name="ReachingIiwa", sim_config=sim_config, env=env) +env.set_task(task=task, sim_params=sim_config.get_physics_params(), backend="torch", init_sim=True) + +# wrap the environment +env = wrap_env(env, "omniverse-isaacgym") + +device = env.device + + +# Instantiate a RandomMemory as rollout buffer (any memory can be used for this) +memory = RandomMemory(memory_size=16, num_envs=env.num_envs, device=device) + + +# Instantiate the agent's models (function approximators). +# PPO requires 2 models, visit its documentation for more details +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models +models_ppo = {} +models_ppo["policy"] = Policy(env.observation_space, env.action_space, device) +models_ppo["value"] = Value(env.observation_space, env.action_space, device) + + +# Configure and instantiate the agent. 
+# Only modify some of the default configuration, visit its documentation to see all the options +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters +cfg_ppo = PPO_DEFAULT_CONFIG.copy() +cfg_ppo["rollouts"] = 16 +cfg_ppo["learning_epochs"] = 8 +cfg_ppo["mini_batches"] = 8 +cfg_ppo["discount_factor"] = 0.99 +cfg_ppo["lambda"] = 0.95 +cfg_ppo["learning_rate"] = 5e-4 +cfg_ppo["learning_rate_scheduler"] = KLAdaptiveRL +cfg_ppo["learning_rate_scheduler_kwargs"] = {"kl_threshold": 0.008} +cfg_ppo["random_timesteps"] = 0 +cfg_ppo["learning_starts"] = 0 +cfg_ppo["grad_norm_clip"] = 1.0 +cfg_ppo["ratio_clip"] = 0.2 +cfg_ppo["value_clip"] = 0.2 +cfg_ppo["clip_predicted_values"] = True +cfg_ppo["entropy_loss_scale"] = 0.0 +cfg_ppo["value_loss_scale"] = 2.0 +cfg_ppo["kl_threshold"] = 0 +cfg_ppo["state_preprocessor"] = RunningStandardScaler +cfg_ppo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device} +cfg_ppo["value_preprocessor"] = RunningStandardScaler +cfg_ppo["value_preprocessor_kwargs"] = {"size": 1, "device": device} +# logging to TensorBoard and write checkpoints each 32 and 250 timesteps respectively +cfg_ppo["experiment"]["write_interval"] = 32 +cfg_ppo["experiment"]["checkpoint_interval"] = 250 + +agent = PPO(models=models_ppo, + memory=memory, + cfg=cfg_ppo, + observation_space=env.observation_space, + action_space=env.action_space, + device=device) + + +# Configure and instantiate the RL trainer +cfg_trainer = {"timesteps": 5000, "headless": True} +trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent) + +# start training +trainer.train() diff --git a/docs/source/examples/real_world/kuka_lbr_iiwa/reaching_iiwa_real_env.py b/docs/source/examples/real_world/kuka_lbr_iiwa/reaching_iiwa_real_env.py new file mode 100644 index 00000000..4e5a8e9e --- /dev/null +++ b/docs/source/examples/real_world/kuka_lbr_iiwa/reaching_iiwa_real_env.py @@ -0,0 +1,145 @@ +import time +import numpy as np +import gymnasium as gym + +import libiiwa + + +class ReachingIiwa(gym.Env): + def __init__(self, control_space="joint"): + + self.control_space = control_space # joint or cartesian + + # spaces + self.observation_space = gym.spaces.Box(low=-1000, high=1000, shape=(18,), dtype=np.float32) + if self.control_space == "joint": + self.action_space = gym.spaces.Box(low=-1, high=1, shape=(7,), dtype=np.float32) + elif self.control_space == "cartesian": + self.action_space = gym.spaces.Box(low=-1, high=1, shape=(3,), dtype=np.float32) + else: + raise ValueError("Invalid control space:", self.control_space) + + # init iiwa + print("Connecting to robot...") + + self.robot = libiiwa.LibIiwa() + self.robot.set_control_interface(libiiwa.ControlInterface.CONTROL_INTERFACE_SERVO) + + self.robot.set_desired_joint_velocity_rel(0.5) + self.robot.set_desired_joint_acceleration_rel(0.5) + self.robot.set_desired_joint_jerk_rel(0.5) + + self.robot.set_desired_cartesian_velocity(10) + self.robot.set_desired_cartesian_acceleration(10) + self.robot.set_desired_cartesian_jerk(10) + + print("Robot connected") + + self.motion = None + self.motion_thread = None + + self.dt = 1 / 120.0 + self.action_scale = 2.5 + self.dof_vel_scale = 0.1 + self.max_episode_length = 100 + self.robot_dof_speed_scales = 1 + self.target_pos = np.array([0.65, 0.2, 0.2]) + self.robot_default_dof_pos = np.radians([0, 0, 0, -90, 0, 90, 0]) + self.robot_dof_lower_limits = np.array([-2.9671, -2.0944, -2.9671, -2.0944, -2.9671, -2.0944, -3.0543]) + self.robot_dof_upper_limits = 
np.array([ 2.9671, 2.0944, 2.9671, 2.0944, 2.9671, 2.0944, 3.0543]) + + self.progress_buf = 1 + self.obs_buf = np.zeros((18,), dtype=np.float32) + + def _get_observation_reward_done(self): + # get robot state + robot_state = self.robot.get_state(refresh=True) + + # observation + robot_dof_pos = robot_state["joint_position"] + robot_dof_vel = robot_state["joint_velocity"] + end_effector_pos = robot_state["cartesian_position"] + + dof_pos_scaled = 2.0 * (robot_dof_pos - self.robot_dof_lower_limits) / (self.robot_dof_upper_limits - self.robot_dof_lower_limits) - 1.0 + dof_vel_scaled = robot_dof_vel * self.dof_vel_scale + + self.obs_buf[0] = self.progress_buf / float(self.max_episode_length) + self.obs_buf[1:8] = dof_pos_scaled + self.obs_buf[8:15] = dof_vel_scaled + self.obs_buf[15:18] = self.target_pos + + # reward + distance = np.linalg.norm(end_effector_pos - self.target_pos) + reward = -distance + + # done + done = self.progress_buf >= self.max_episode_length - 1 + done = done or distance <= 0.075 + + print("Distance:", distance) + if done: + print("Target or Maximum episode length reached") + time.sleep(1) + + return self.obs_buf, reward, done + + def reset(self): + print("Reseting...") + + # go to 1) safe position, 2) random position + self.robot.command_joint_position(self.robot_default_dof_pos) + time.sleep(3) + dof_pos = self.robot_default_dof_pos + 0.25 * (np.random.rand(7) - 0.5) + self.robot.command_joint_position(dof_pos) + time.sleep(1) + + # get target position from prompt + while True: + try: + print("Enter target position (X, Y, Z) in meters") + raw = input("or press [Enter] key for a random target position: ") + if raw: + self.target_pos = np.array([float(p) for p in raw.replace(' ', '').split(',')]) + else: + noise = (2 * np.random.rand(3) - 1) * np.array([0.1, 0.2, 0.2]) + self.target_pos = np.array([0.6, 0.0, 0.4]) + noise + print("Target position:", self.target_pos) + break + except ValueError: + print("Invalid input. Try something like: 0.65, 0.0, 0.4") + + input("Press [Enter] to continue") + + self.progress_buf = 0 + observation, reward, done = self._get_observation_reward_done() + + return observation, {} + + def step(self, action): + self.progress_buf += 1 + + # get robot state + robot_state = self.robot.get_state(refresh=True) + + # control space + # joint + if self.control_space == "joint": + dof_pos = robot_state["joint_position"] + (self.robot_dof_speed_scales * self.dt * action * self.action_scale) + self.robot.command_joint_position(dof_pos) + # cartesian + elif self.control_space == "cartesian": + end_effector_pos = robot_state["cartesian_position"] + action / 100.0 + self.robot.command_cartesian_pose(end_effector_pos) + + # the use of time.sleep is for simplicity. 
It does not guarantee control at a specific frequency + time.sleep(1 / 30.0) + + observation, reward, terminated = self._get_observation_reward_done() + + return observation, reward, terminated, False, {} + + def render(self, *args, **kwargs): + pass + + def close(self): + pass diff --git a/docs/source/examples/real_world/kuka_lbr_iiwa/reaching_iiwa_real_ros2_env.py b/docs/source/examples/real_world/kuka_lbr_iiwa/reaching_iiwa_real_ros2_env.py new file mode 100644 index 00000000..e3598e30 --- /dev/null +++ b/docs/source/examples/real_world/kuka_lbr_iiwa/reaching_iiwa_real_ros2_env.py @@ -0,0 +1,218 @@ +import time +import numpy as np +import gymnasium as gym + +import rclpy +from rclpy.node import Node +from rclpy.qos import QoSPresetProfiles +import sensor_msgs.msg +import geometry_msgs.msg + +import libiiwa_msgs.srv + + +class ReachingIiwa(gym.Env): + def __init__(self, control_space="joint"): + + self.control_space = control_space # joint or cartesian + + # spaces + self.observation_space = gym.spaces.Box(low=-1000, high=1000, shape=(18,), dtype=np.float32) + if self.control_space == "joint": + self.action_space = gym.spaces.Box(low=-1, high=1, shape=(7,), dtype=np.float32) + elif self.control_space == "cartesian": + self.action_space = gym.spaces.Box(low=-1, high=1, shape=(3,), dtype=np.float32) + else: + raise ValueError("Invalid control space:", self.control_space) + + # initialize the ROS node + rclpy.init() + self.node = Node(self.__class__.__name__) + + import threading + threading.Thread(target=self._spin).start() + + # create publishers + + self.pub_command_joint = self.node.create_publisher(sensor_msgs.msg.JointState, '/iiwa/command/joint', QoSPresetProfiles.SYSTEM_DEFAULT.value) + self.pub_command_cartesian = self.node.create_publisher(geometry_msgs.msg.Pose, '/iiwa/command/cartesian', QoSPresetProfiles.SYSTEM_DEFAULT.value) + + # keep compatibility with libiiwa Python API + self.robot_state = {"joint_position": np.zeros((7,)), + "joint_velocity": np.zeros((7,)), + "cartesian_position": np.zeros((3,))} + + # create subscribers + self.node.create_subscription(msg_type=sensor_msgs.msg.JointState, + topic='/iiwa/state/joint_states', + callback=self._callback_joint_states, + qos_profile=QoSPresetProfiles.SYSTEM_DEFAULT.value) + self.node.create_subscription(msg_type=geometry_msgs.msg.Pose, + topic='/iiwa/state/end_effector_pose', + callback=self._callback_end_effector_pose, + qos_profile=QoSPresetProfiles.SYSTEM_DEFAULT.value) + + # service clients + client_control_interface = self.node.create_client(libiiwa_msgs.srv.SetString, '/iiwa/set_control_interface') + client_control_interface.wait_for_service() + request = libiiwa_msgs.srv.SetString.Request() + request.data = "SERVO" # or "servo" + client_control_interface.call(request) + + client_joint_velocity_rel = self.node.create_client(libiiwa_msgs.srv.SetNumber, '/iiwa/set_desired_joint_velocity_rel') + client_joint_acceleration_rel = self.node.create_client(libiiwa_msgs.srv.SetNumber, '/iiwa/set_desired_joint_acceleration_rel') + client_joint_jerk_rel = self.node.create_client(libiiwa_msgs.srv.SetNumber, '/iiwa/set_desired_joint_jerk_rel') + + client_cartesian_velocity = self.node.create_client(libiiwa_msgs.srv.SetNumber, '/iiwa/set_desired_cartesian_velocity') + client_cartesian_acceleration = self.node.create_client(libiiwa_msgs.srv.SetNumber, '/iiwa/set_desired_cartesian_acceleration') + client_cartesian_jerk = self.node.create_client(libiiwa_msgs.srv.SetNumber, '/iiwa/set_desired_cartesian_jerk') + + 
client_joint_velocity_rel.wait_for_service() + client_joint_acceleration_rel.wait_for_service() + client_joint_jerk_rel.wait_for_service() + + client_cartesian_velocity.wait_for_service() + client_cartesian_acceleration.wait_for_service() + client_cartesian_jerk.wait_for_service() + + request = libiiwa_msgs.srv.SetNumber.Request() + + request.data = 0.5 + client_joint_velocity_rel.call(request) + client_joint_acceleration_rel.call(request) + client_joint_jerk_rel.call(request) + + request.data = 10.0 + client_cartesian_velocity.call(request) + client_cartesian_acceleration.call(request) + client_cartesian_jerk.call(request) + + print("Robot connected") + + self.motion = None + self.motion_thread = None + + self.dt = 1 / 120.0 + self.action_scale = 2.5 + self.dof_vel_scale = 0.1 + self.max_episode_length = 100 + self.robot_dof_speed_scales = 1 + self.target_pos = np.array([0.65, 0.2, 0.2]) + self.robot_default_dof_pos = np.radians([0, 0, 0, -90, 0, 90, 0]) + self.robot_dof_lower_limits = np.array([-2.9671, -2.0944, -2.9671, -2.0944, -2.9671, -2.0944, -3.0543]) + self.robot_dof_upper_limits = np.array([ 2.9671, 2.0944, 2.9671, 2.0944, 2.9671, 2.0944, 3.0543]) + + self.progress_buf = 1 + self.obs_buf = np.zeros((18,), dtype=np.float32) + + def _spin(self): + rclpy.spin(self.node) + + def _callback_joint_states(self, msg): + self.robot_state["joint_position"] = np.array(msg.position) + self.robot_state["joint_velocity"] = np.array(msg.velocity) + + def _callback_end_effector_pose(self, msg): + positon = msg.position + self.robot_state["cartesian_position"] = np.array([positon.x, positon.y, positon.z]) + + def _get_observation_reward_done(self): + # observation + robot_dof_pos = self.robot_state["joint_position"] + robot_dof_vel = self.robot_state["joint_velocity"] + end_effector_pos = self.robot_state["cartesian_position"] + + dof_pos_scaled = 2.0 * (robot_dof_pos - self.robot_dof_lower_limits) / (self.robot_dof_upper_limits - self.robot_dof_lower_limits) - 1.0 + dof_vel_scaled = robot_dof_vel * self.dof_vel_scale + + self.obs_buf[0] = self.progress_buf / float(self.max_episode_length) + self.obs_buf[1:8] = dof_pos_scaled + self.obs_buf[8:15] = dof_vel_scaled + self.obs_buf[15:18] = self.target_pos + + # reward + distance = np.linalg.norm(end_effector_pos - self.target_pos) + reward = -distance + + # done + done = self.progress_buf >= self.max_episode_length - 1 + done = done or distance <= 0.075 + + print("Distance:", distance) + if done: + print("Target or Maximum episode length reached") + time.sleep(1) + + return self.obs_buf, reward, done + + def reset(self): + print("Reseting...") + + # go to 1) safe position, 2) random position + msg = sensor_msgs.msg.JointState() + msg.position = self.robot_default_dof_pos.tolist() + self.pub_command_joint.publish(msg) + time.sleep(3) + msg.position = (self.robot_default_dof_pos + 0.25 * (np.random.rand(7) - 0.5)).tolist() + self.pub_command_joint.publish(msg) + time.sleep(1) + + # get target position from prompt + while True: + try: + print("Enter target position (X, Y, Z) in meters") + raw = input("or press [Enter] key for a random target position: ") + if raw: + self.target_pos = np.array([float(p) for p in raw.replace(' ', '').split(',')]) + else: + noise = (2 * np.random.rand(3) - 1) * np.array([0.1, 0.2, 0.2]) + self.target_pos = np.array([0.6, 0.0, 0.4]) + noise + print("Target position:", self.target_pos) + break + except ValueError: + print("Invalid input. 
Try something like: 0.65, 0.0, 0.4") + + input("Press [Enter] to continue") + + self.progress_buf = 0 + observation, reward, done = self._get_observation_reward_done() + + return observation, {} + + def step(self, action): + self.progress_buf += 1 + + # control space + # joint + if self.control_space == "joint": + joint_positions = self.robot_state["joint_position"] + (self.robot_dof_speed_scales * self.dt * action * self.action_scale) + msg = sensor_msgs.msg.JointState() + msg.position = joint_positions.tolist() + self.pub_command_joint.publish(msg) + # cartesian + elif self.control_space == "cartesian": + end_effector_pos = self.robot_state["cartesian_position"] + action / 100.0 + msg = geometry_msgs.msg.Pose() + msg.position.x = end_effector_pos[0] + msg.position.y = end_effector_pos[1] + msg.position.z = end_effector_pos[2] + msg.orientation.x = np.nan + msg.orientation.y = np.nan + msg.orientation.z = np.nan + msg.orientation.w = np.nan + self.pub_command_cartesian.publish(msg) + + # the use of time.sleep is for simplicity. It does not guarantee control at a specific frequency + time.sleep(1 / 30.0) + + observation, reward, terminated = self._get_observation_reward_done() + + return observation, reward, terminated, False, {} + + def render(self, *args, **kwargs): + pass + + def close(self): + # shutdown the node + self.node.destroy_node() + rclpy.shutdown() diff --git a/docs/source/examples/real_world/kuka_lbr_iiwa/reaching_iiwa_real_ros_env.py b/docs/source/examples/real_world/kuka_lbr_iiwa/reaching_iiwa_real_ros_env.py new file mode 100644 index 00000000..a6df08e6 --- /dev/null +++ b/docs/source/examples/real_world/kuka_lbr_iiwa/reaching_iiwa_real_ros_env.py @@ -0,0 +1,193 @@ +import time +import numpy as np +import gymnasium as gym + +import rospy +import sensor_msgs.msg +import geometry_msgs.msg + +import libiiwa_msgs.srv + + +class ReachingIiwa(gym.Env): + def __init__(self, control_space="joint"): + + self.control_space = control_space # joint or cartesian + + # spaces + self.observation_space = gym.spaces.Box(low=-1000, high=1000, shape=(18,), dtype=np.float32) + if self.control_space == "joint": + self.action_space = gym.spaces.Box(low=-1, high=1, shape=(7,), dtype=np.float32) + elif self.control_space == "cartesian": + self.action_space = gym.spaces.Box(low=-1, high=1, shape=(3,), dtype=np.float32) + else: + raise ValueError("Invalid control space:", self.control_space) + + # create publishers + self.pub_command_joint = rospy.Publisher('/iiwa/command/joint', sensor_msgs.msg.JointState, queue_size=1) + self.pub_command_cartesian = rospy.Publisher('/iiwa/command/cartesian', geometry_msgs.msg.Pose, queue_size=1) + + # keep compatibility with libiiwa Python API + self.robot_state = {"joint_position": np.zeros((7,)), + "joint_velocity": np.zeros((7,)), + "cartesian_position": np.zeros((3,))} + + # create subscribers + rospy.Subscriber('/iiwa/state/joint_states', sensor_msgs.msg.JointState, self._callback_joint_states) + rospy.Subscriber('/iiwa/state/end_effector_pose', geometry_msgs.msg.Pose, self._callback_end_effector_pose) + + # create service clients + rospy.wait_for_service('/iiwa/set_control_interface') + + proxy = rospy.ServiceProxy('/iiwa/set_control_interface', libiiwa_msgs.srv.SetString) + proxy("SERVO") # or "servo" + + rospy.wait_for_service('/iiwa/set_desired_joint_velocity_rel') + rospy.wait_for_service('/iiwa/set_desired_joint_acceleration_rel') + rospy.wait_for_service('/iiwa/set_desired_joint_jerk_rel') + + proxy = 
rospy.ServiceProxy('/iiwa/set_desired_joint_velocity_rel', libiiwa_msgs.srv.SetNumber) + proxy(0.5) + proxy = rospy.ServiceProxy('/iiwa/set_desired_joint_acceleration_rel', libiiwa_msgs.srv.SetNumber) + proxy(0.5) + proxy = rospy.ServiceProxy('/iiwa/set_desired_joint_jerk_rel', libiiwa_msgs.srv.SetNumber) + proxy(0.5) + + rospy.wait_for_service('/iiwa/set_desired_cartesian_velocity') + rospy.wait_for_service('/iiwa/set_desired_cartesian_acceleration') + rospy.wait_for_service('/iiwa/set_desired_cartesian_jerk') + + proxy = rospy.ServiceProxy('/iiwa/set_desired_cartesian_velocity', libiiwa_msgs.srv.SetNumber) + proxy(10.0) + proxy = rospy.ServiceProxy('/iiwa/set_desired_cartesian_acceleration', libiiwa_msgs.srv.SetNumber) + proxy(10.0) + proxy = rospy.ServiceProxy('/iiwa/set_desired_cartesian_jerk', libiiwa_msgs.srv.SetNumber) + proxy(10.0) + + # initialize the ROS node + rospy.init_node(self.__class__.__name__) + + print("Robot connected") + + self.motion = None + self.motion_thread = None + + self.dt = 1 / 120.0 + self.action_scale = 2.5 + self.dof_vel_scale = 0.1 + self.max_episode_length = 100 + self.robot_dof_speed_scales = 1 + self.target_pos = np.array([0.65, 0.2, 0.2]) + self.robot_default_dof_pos = np.radians([0, 0, 0, -90, 0, 90, 0]) + self.robot_dof_lower_limits = np.array([-2.9671, -2.0944, -2.9671, -2.0944, -2.9671, -2.0944, -3.0543]) + self.robot_dof_upper_limits = np.array([ 2.9671, 2.0944, 2.9671, 2.0944, 2.9671, 2.0944, 3.0543]) + + self.progress_buf = 1 + self.obs_buf = np.zeros((18,), dtype=np.float32) + + def _callback_joint_states(self, msg): + self.robot_state["joint_position"] = np.array(msg.position) + self.robot_state["joint_velocity"] = np.array(msg.velocity) + + def _callback_end_effector_pose(self, msg): + positon = msg.position + self.robot_state["cartesian_position"] = np.array([positon.x, positon.y, positon.z]) + + def _get_observation_reward_done(self): + # observation + robot_dof_pos = self.robot_state["joint_position"] + robot_dof_vel = self.robot_state["joint_velocity"] + end_effector_pos = self.robot_state["cartesian_position"] + + dof_pos_scaled = 2.0 * (robot_dof_pos - self.robot_dof_lower_limits) / (self.robot_dof_upper_limits - self.robot_dof_lower_limits) - 1.0 + dof_vel_scaled = robot_dof_vel * self.dof_vel_scale + + self.obs_buf[0] = self.progress_buf / float(self.max_episode_length) + self.obs_buf[1:8] = dof_pos_scaled + self.obs_buf[8:15] = dof_vel_scaled + self.obs_buf[15:18] = self.target_pos + + # reward + distance = np.linalg.norm(end_effector_pos - self.target_pos) + reward = -distance + + # done + done = self.progress_buf >= self.max_episode_length - 1 + done = done or distance <= 0.075 + + print("Distance:", distance) + if done: + print("Target or Maximum episode length reached") + time.sleep(1) + + return self.obs_buf, reward, done + + def reset(self): + print("Reseting...") + + # go to 1) safe position, 2) random position + msg = sensor_msgs.msg.JointState() + msg.position = self.robot_default_dof_pos.tolist() + self.pub_command_joint.publish(msg) + time.sleep(3) + msg.position = (self.robot_default_dof_pos + 0.25 * (np.random.rand(7) - 0.5)).tolist() + self.pub_command_joint.publish(msg) + time.sleep(1) + + # get target position from prompt + while True: + try: + print("Enter target position (X, Y, Z) in meters") + raw = input("or press [Enter] key for a random target position: ") + if raw: + self.target_pos = np.array([float(p) for p in raw.replace(' ', '').split(',')]) + else: + noise = (2 * np.random.rand(3) - 1) * np.array([0.1, 
0.2, 0.2]) + self.target_pos = np.array([0.6, 0.0, 0.4]) + noise + print("Target position:", self.target_pos) + break + except ValueError: + print("Invalid input. Try something like: 0.65, 0.0, 0.4") + + input("Press [Enter] to continue") + + self.progress_buf = 0 + observation, reward, done = self._get_observation_reward_done() + + return observation, {} + + def step(self, action): + self.progress_buf += 1 + + # control space + # joint + if self.control_space == "joint": + joint_positions = self.robot_state["joint_position"] + (self.robot_dof_speed_scales * self.dt * action * self.action_scale) + msg = sensor_msgs.msg.JointState() + msg.position = joint_positions.tolist() + self.pub_command_joint.publish(msg) + # cartesian + elif self.control_space == "cartesian": + end_effector_pos = self.robot_state["cartesian_position"] + action / 100.0 + msg = geometry_msgs.msg.Pose() + msg.position.x = end_effector_pos[0] + msg.position.y = end_effector_pos[1] + msg.position.z = end_effector_pos[2] + msg.orientation.x = np.nan + msg.orientation.y = np.nan + msg.orientation.z = np.nan + msg.orientation.w = np.nan + self.pub_command_cartesian.publish(msg) + + # the use of time.sleep is for simplicity. It does not guarantee control at a specific frequency + time.sleep(1 / 30.0) + + observation, reward, terminated = self._get_observation_reward_done() + + return observation, reward, terminated, False, {} + + def render(self, *args, **kwargs): + pass + + def close(self): + pass diff --git a/docs/source/examples/real_world/kuka_lbr_iiwa/reaching_iiwa_real_ros_ros2_skrl_eval.py b/docs/source/examples/real_world/kuka_lbr_iiwa/reaching_iiwa_real_ros_ros2_skrl_eval.py new file mode 100644 index 00000000..b3c424f6 --- /dev/null +++ b/docs/source/examples/real_world/kuka_lbr_iiwa/reaching_iiwa_real_ros_ros2_skrl_eval.py @@ -0,0 +1,96 @@ +import torch +import torch.nn as nn + +# Import the skrl components to build the RL system +from skrl.models.torch import Model, GaussianMixin +from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG +from skrl.resources.preprocessors.torch import RunningStandardScaler +from skrl.trainers.torch import SequentialTrainer +from skrl.envs.torch import wrap_env + + +# Define only the policy for evaluation +class Policy(GaussianMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + clip_log_std=True, min_log_std=-20, max_log_std=2): + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) + + self.net = nn.Sequential(nn.Linear(self.num_observations, 256), + nn.ELU(), + nn.Linear(256, 128), + nn.ELU(), + nn.Linear(128, 64), + nn.ELU(), + nn.Linear(64, self.num_actions)) + self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + def compute(self, inputs, role): + return self.net(inputs["states"]), self.log_std_parameter, {} + + +# Load the environment according to the ROS version +def get_active_ros_version(): + import os + if os.environ.get("ROS_DISTRO"): + return "ROS2" if os.environ.get("AMENT_PREFIX_PATH") else "ROS" + return "" + +active_ros_version = get_active_ros_version() + +if active_ros_version == "ROS": + from reaching_iiwa_real_ros_env import ReachingIiwa +elif active_ros_version == "ROS2": + from reaching_iiwa_real_ros2_env import ReachingIiwa +else: + print("No active ROS version found") + exit() + +control_space = "joint" # joint or cartesian +env = ReachingIiwa(control_space=control_space) + +# wrap the environment 
+env = wrap_env(env) + +device = env.device + + +# Instantiate the agent's policy. +# PPO requires 2 models, visit its documentation for more details +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models +models_ppo = {} +models_ppo["policy"] = Policy(env.observation_space, env.action_space, device) + + +# Configure and instantiate the agent. +# Only modify some of the default configuration, visit its documentation to see all the options +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters +cfg_ppo = PPO_DEFAULT_CONFIG.copy() +cfg_ppo["random_timesteps"] = 0 +cfg_ppo["learning_starts"] = 0 +cfg_ppo["state_preprocessor"] = RunningStandardScaler +cfg_ppo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device} +# logging to TensorBoard each 32 timesteps an ignore checkpoints +cfg_ppo["experiment"]["write_interval"] = 32 +cfg_ppo["experiment"]["checkpoint_interval"] = 0 + +agent = PPO(models=models_ppo, + memory=None, + cfg=cfg_ppo, + observation_space=env.observation_space, + action_space=env.action_space, + device=device) + +# load checkpoints +if control_space == "joint": + agent.load("./agent_joint.pt") +elif control_space == "cartesian": + agent.load("./agent_cartesian.pt") + + +# Configure and instantiate the RL trainer +cfg_trainer = {"timesteps": 1000, "headless": True} +trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent) + +# start evaluation +trainer.eval() diff --git a/docs/source/examples/real_world/kuka_lbr_iiwa/reaching_iiwa_real_skrl_eval.py b/docs/source/examples/real_world/kuka_lbr_iiwa/reaching_iiwa_real_skrl_eval.py new file mode 100644 index 00000000..87b43765 --- /dev/null +++ b/docs/source/examples/real_world/kuka_lbr_iiwa/reaching_iiwa_real_skrl_eval.py @@ -0,0 +1,82 @@ +import torch +import torch.nn as nn + +# Import the skrl components to build the RL system +from skrl.models.torch import Model, GaussianMixin +from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG +from skrl.resources.preprocessors.torch import RunningStandardScaler +from skrl.trainers.torch import SequentialTrainer +from skrl.envs.torch import wrap_env + + +# Define only the policy for evaluation +class Policy(GaussianMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + clip_log_std=True, min_log_std=-20, max_log_std=2): + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) + + self.net = nn.Sequential(nn.Linear(self.num_observations, 256), + nn.ELU(), + nn.Linear(256, 128), + nn.ELU(), + nn.Linear(128, 64), + nn.ELU(), + nn.Linear(64, self.num_actions)) + self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + def compute(self, inputs, role): + return self.net(inputs["states"]), self.log_std_parameter, {} + + +# Load the environment +from reaching_iiwa_real_env import ReachingIiwa + +control_space = "joint" # joint or cartesian +env = ReachingIiwa(control_space=control_space) + +# wrap the environment +env = wrap_env(env) + +device = env.device + + +# Instantiate the agent's policy. +# PPO requires 2 models, visit its documentation for more details +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models +models_ppo = {} +models_ppo["policy"] = Policy(env.observation_space, env.action_space, device) + + +# Configure and instantiate the agent. 
+# Only modify some of the default configuration, visit its documentation to see all the options +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters +cfg_ppo = PPO_DEFAULT_CONFIG.copy() +cfg_ppo["random_timesteps"] = 0 +cfg_ppo["learning_starts"] = 0 +cfg_ppo["state_preprocessor"] = RunningStandardScaler +cfg_ppo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device} +# logging to TensorBoard each 32 timesteps an ignore checkpoints +cfg_ppo["experiment"]["write_interval"] = 32 +cfg_ppo["experiment"]["checkpoint_interval"] = 0 + +agent = PPO(models=models_ppo, + memory=None, + cfg=cfg_ppo, + observation_space=env.observation_space, + action_space=env.action_space, + device=device) + +# load checkpoints +if control_space == "joint": + agent.load("./agent_joint.pt") +elif control_space == "cartesian": + agent.load("./agent_cartesian.pt") + + +# Configure and instantiate the RL trainer +cfg_trainer = {"timesteps": 1000, "headless": True} +trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent) + +# start evaluation +trainer.eval() diff --git a/docs/source/examples/robosuite/td3_robosuite_two_arm_lift.py b/docs/source/examples/robosuite/td3_robosuite_two_arm_lift.py new file mode 100644 index 00000000..69a5f344 --- /dev/null +++ b/docs/source/examples/robosuite/td3_robosuite_two_arm_lift.py @@ -0,0 +1,117 @@ +import robosuite +from robosuite.controllers import load_controller_config + +import torch +import torch.nn as nn +import torch.nn.functional as F + +# Import the skrl components to build the RL system +from skrl.models.torch import Model, DeterministicMixin +from skrl.memories.torch import RandomMemory +from skrl.agents.torch.td3 import TD3, TD3_DEFAULT_CONFIG +from skrl.resources.noises.torch import GaussianNoise +from skrl.trainers.torch import SequentialTrainer +from skrl.envs.torch import wrap_env + + +# Define the models (deterministic models) for the TD3 agent using mixins +# and programming with two approaches (torch functional and torch.nn.Sequential class). 
+# - Actor (policy): takes as input the environment's observation/state and returns an action +# - Critic: takes the state and action as input and provides a value to guide the policy +class DeterministicActor(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) + + self.linear_layer_1 = nn.Linear(self.num_observations, 400) + self.linear_layer_2 = nn.Linear(400, 300) + self.action_layer = nn.Linear(300, self.num_actions) + + def compute(self, inputs, role): + x = F.relu(self.linear_layer_1(inputs["states"])) + x = F.relu(self.linear_layer_2(x)) + return torch.tanh(self.action_layer(x)), {} + +class DeterministicCritic(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) + + self.net = nn.Sequential(nn.Linear(self.num_observations + self.num_actions, 400), + nn.ReLU(), + nn.Linear(400, 300), + nn.ReLU(), + nn.Linear(300, 1)) + + def compute(self, inputs, role): + return self.net(torch.cat([inputs["states"], inputs["taken_actions"]], dim=1)), {} + + +# Load and wrap the DeepMind robosuite environment +controller_config = load_controller_config(default_controller="OSC_POSE") +env = robosuite.make("TwoArmLift", + robots=["Sawyer", "Panda"], # load a Sawyer robot and a Panda robot + gripper_types="default", # use default grippers per robot arm + controller_configs=controller_config, # each arm is controlled using OSC + env_configuration="single-arm-opposed", # (two-arm envs only) arms face each other + has_renderer=True, # on-screen rendering + render_camera="frontview", # visualize the "frontview" camera + has_offscreen_renderer=False, # no off-screen rendering + control_freq=20, # 20 hz control for applied actions + horizon=200, # each episode terminates after 200 steps + use_object_obs=True, # provide object observations to agent + use_camera_obs=False, # don't provide image observations to agent + reward_shaping=True) # use a dense reward signal for learning +env = wrap_env(env) + +device = env.device + + +# Instantiate a RandomMemory (without replacement) as experience replay memory +memory = RandomMemory(memory_size=25000, num_envs=env.num_envs, device=device, replacement=False) + + +# Instantiate the agent's models (function approximators). +# TD3 requires 6 models, visit its documentation for more details +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.td3.html#spaces-and-models +models = {} +models["policy"] = DeterministicActor(env.observation_space, env.action_space, device) +models["target_policy"] = DeterministicActor(env.observation_space, env.action_space, device) +models["critic_1"] = DeterministicCritic(env.observation_space, env.action_space, device) +models["critic_2"] = DeterministicCritic(env.observation_space, env.action_space, device) +models["target_critic_1"] = DeterministicCritic(env.observation_space, env.action_space, device) +models["target_critic_2"] = DeterministicCritic(env.observation_space, env.action_space, device) + +# Initialize the models' parameters (weights and biases) using a Gaussian distribution +for model in models.values(): + model.init_parameters(method_name="normal_", mean=0.0, std=0.1) + + +# Configure and instantiate the agent. 
+# Only modify some of the default configuration, visit its documentation to see all the options +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.td3.html#configuration-and-hyperparameters +cfg_agent = TD3_DEFAULT_CONFIG.copy() +cfg_agent["exploration"]["noise"] = GaussianNoise(0, 0.1, device=device) +cfg_agent["smooth_regularization_noise"] = GaussianNoise(0, 0.2, device=device) +cfg_agent["smooth_regularization_clip"] = 0.5 +cfg_agent["batch_size"] = 100 +cfg_agent["random_timesteps"] = 100 +cfg_agent["learning_starts"] = 100 +# logging to TensorBoard and write checkpoints each 1000 and 5000 timesteps respectively +cfg_agent["experiment"]["write_interval"] = 1000 +cfg_agent["experiment"]["checkpoint_interval"] = 5000 + +agent = TD3(models=models, + memory=memory, + cfg=cfg_agent, + observation_space=env.observation_space, + action_space=env.action_space, + device=device) + + +# Configure and instantiate the RL trainer +cfg_trainer = {"timesteps": 50000, "headless": False} +trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent) + +# start training +trainer.train() diff --git a/docs/source/examples/shimmy/ddpg_openai_gym_compatibility_pendulum.py b/docs/source/examples/shimmy/ddpg_openai_gym_compatibility_pendulum.py new file mode 100644 index 00000000..dc72a81c --- /dev/null +++ b/docs/source/examples/shimmy/ddpg_openai_gym_compatibility_pendulum.py @@ -0,0 +1,98 @@ +import gymnasium as gym + +import torch +import torch.nn as nn +import torch.nn.functional as F + +# Import the skrl components to build the RL system +from skrl.models.torch import Model, DeterministicMixin +from skrl.memories.torch import RandomMemory +from skrl.agents.torch.ddpg import DDPG, DDPG_DEFAULT_CONFIG +from skrl.resources.noises.torch import OrnsteinUhlenbeckNoise +from skrl.trainers.torch import SequentialTrainer +from skrl.envs.torch import wrap_env + + +# Define the models (deterministic models) for the DDPG agent using mixin +# - Actor (policy): takes as input the environment's observation/state and returns an action +# - Critic: takes the state and action as input and provides a value to guide the policy +class DeterministicActor(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) + + self.linear_layer_1 = nn.Linear(self.num_observations, 400) + self.linear_layer_2 = nn.Linear(400, 300) + self.action_layer = nn.Linear(300, self.num_actions) + + def compute(self, inputs, role): + x = F.relu(self.linear_layer_1(inputs["states"])) + x = F.relu(self.linear_layer_2(x)) + return 2 * torch.tanh(self.action_layer(x)), {} # Pendulum-v1 action_space is -2 to 2 + +class DeterministicCritic(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) + + self.linear_layer_1 = nn.Linear(self.num_observations + self.num_actions, 400) + self.linear_layer_2 = nn.Linear(400, 300) + self.linear_layer_3 = nn.Linear(300, 1) + + def compute(self, inputs, role): + x = F.relu(self.linear_layer_1(torch.cat([inputs["states"], inputs["taken_actions"]], dim=1))) + x = F.relu(self.linear_layer_2(x)) + return self.linear_layer_3(x), {} + + +# Load and wrap the Gymnasium environment. 
+env = gym.make("GymV26Environment-v0", env_id="Pendulum-v1") +env = wrap_env(env) + +device = env.device + + +# Instantiate a RandomMemory (without replacement) as experience replay memory +memory = RandomMemory(memory_size=15000, num_envs=env.num_envs, device=device, replacement=False) + + +# Instantiate the agent's models (function approximators). +# DDPG requires 4 models, visit its documentation for more details +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ddpg.html#spaces-and-models +models_ddpg = {} +models_ddpg["policy"] = DeterministicActor(env.observation_space, env.action_space, device) +models_ddpg["target_policy"] = DeterministicActor(env.observation_space, env.action_space, device) +models_ddpg["critic"] = DeterministicCritic(env.observation_space, env.action_space, device) +models_ddpg["target_critic"] = DeterministicCritic(env.observation_space, env.action_space, device) + +# Initialize the models' parameters (weights and biases) using a Gaussian distribution +for model in models_ddpg.values(): + model.init_parameters(method_name="normal_", mean=0.0, std=0.1) + + +# Configure and instantiate the agent. +# Only modify some of the default configuration, visit its documentation to see all the options +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ddpg.html#configuration-and-hyperparameters +cfg_ddpg = DDPG_DEFAULT_CONFIG.copy() +cfg_ddpg["exploration"]["noise"] = OrnsteinUhlenbeckNoise(theta=0.15, sigma=0.1, base_scale=1.0, device=device) +cfg_ddpg["batch_size"] = 100 +cfg_ddpg["random_timesteps"] = 100 +cfg_ddpg["learning_starts"] = 100 +# logging to TensorBoard and write checkpoints each 300 and 1500 timesteps respectively +cfg_ddpg["experiment"]["write_interval"] = 300 +cfg_ddpg["experiment"]["checkpoint_interval"] = 1500 + +agent_ddpg = DDPG(models=models_ddpg, + memory=memory, + cfg=cfg_ddpg, + observation_space=env.observation_space, + action_space=env.action_space, + device=device) + + +# Configure and instantiate the RL trainer +cfg_trainer = {"timesteps": 15000, "headless": True} +trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent_ddpg) + +# start training +trainer.train() diff --git a/docs/source/examples/shimmy/dqn_shimmy_atari_pong.py b/docs/source/examples/shimmy/dqn_shimmy_atari_pong.py new file mode 100644 index 00000000..553ed5e6 --- /dev/null +++ b/docs/source/examples/shimmy/dqn_shimmy_atari_pong.py @@ -0,0 +1,78 @@ +import gymnasium as gym + +import torch +import torch.nn as nn + +# Import the skrl components to build the RL system +from skrl.models.torch import Model, DeterministicMixin +from skrl.memories.torch import RandomMemory +from skrl.agents.torch.dqn import DQN, DQN_DEFAULT_CONFIG +from skrl.trainers.torch import SequentialTrainer +from skrl.envs.torch import wrap_env + + +# Define the model (deterministic models) for the DQN agent using mixin +class QNetwork(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) + + self.net = nn.Sequential(nn.Linear(self.num_observations, 64), + nn.ReLU(), + nn.Linear(64, 64), + nn.ReLU(), + nn.Linear(64, self.num_actions)) + + def compute(self, inputs, role): + return self.net(inputs["states"]), {} + + +# Load and wrap the environment +env = gym.make("ALE/Pong-v5") +env = wrap_env(env) + +device = env.device + + +# Instantiate a RandomMemory (without replacement) as experience replay memory +memory = 
RandomMemory(memory_size=15000, num_envs=env.num_envs, device=device, replacement=False) + + +# Instantiate the agent's models (function approximators). +# DQN requires 2 models, visit its documentation for more details +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.dqn.html#spaces-and-models +models = {} +models["q_network"] = QNetwork(env.observation_space, env.action_space, device) +models["target_q_network"] = QNetwork(env.observation_space, env.action_space, device) + +# # Initialize the models' parameters (weights and biases) using a Gaussian distribution +for model in models.values(): + model.init_parameters(method_name="normal_", mean=0.0, std=0.1) + + +# Configure and instantiate the agent. +# Only modify some of the default configuration, visit its documentation to see all the options +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.dqn.html#configuration-and-hyperparameters +cfg_agent = DQN_DEFAULT_CONFIG.copy() +cfg_agent["learning_starts"] = 100 +cfg_agent["exploration"]["initial_epsilon"] = 1.0 +cfg_agent["exploration"]["final_epsilon"] = 0.04 +cfg_agent["exploration"]["timesteps"] = 1500 +# logging to TensorBoard and write checkpoints each 1000 and 5000 timesteps respectively +cfg_agent["experiment"]["write_interval"] = 1000 +cfg_agent["experiment"]["checkpoint_interval"] = 5000 + +agent_dqn = DQN(models=models, + memory=memory, + cfg=cfg_agent, + observation_space=env.observation_space, + action_space=env.action_space, + device=device) + + +# Configure and instantiate the RL trainer +cfg_trainer = {"timesteps": 50000, "headless": True} +trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent_dqn) + +# start training +trainer.train() diff --git a/docs/source/examples/shimmy/sac_shimmy_dm_control_acrobot_swingup_sparse.py b/docs/source/examples/shimmy/sac_shimmy_dm_control_acrobot_swingup_sparse.py new file mode 100644 index 00000000..cf1a2f17 --- /dev/null +++ b/docs/source/examples/shimmy/sac_shimmy_dm_control_acrobot_swingup_sparse.py @@ -0,0 +1,100 @@ +import gymnasium as gym + +import torch +import torch.nn as nn +import torch.nn.functional as F + +# Import the skrl components to build the RL system +from skrl.models.torch import Model, DeterministicMixin +from skrl.memories.torch import RandomMemory +from skrl.agents.torch.ddpg import DDPG, DDPG_DEFAULT_CONFIG +from skrl.resources.noises.torch import OrnsteinUhlenbeckNoise +from skrl.trainers.torch import SequentialTrainer +from skrl.envs.torch import wrap_env + + +# Define the models (deterministic models) for the DDPG agent using mixin +# - Actor (policy): takes as input the environment's observation/state and returns an action +# - Critic: takes the state and action as input and provides a value to guide the policy +class Actor(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) + + self.linear_layer_1 = nn.Linear(self.num_observations, 400) + self.linear_layer_2 = nn.Linear(400, 300) + self.action_layer = nn.Linear(300, self.num_actions) + + def compute(self, inputs, role): + x = F.relu(self.linear_layer_1(inputs["states"])) + x = F.relu(self.linear_layer_2(x)) + # Pendulum-v1 action_space is -2 to 2 + return 2 * torch.tanh(self.action_layer(x)), {} + +class Critic(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False): + Model.__init__(self, 
observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) + + self.linear_layer_1 = nn.Linear(self.num_observations + self.num_actions, 400) + self.linear_layer_2 = nn.Linear(400, 300) + self.linear_layer_3 = nn.Linear(300, 1) + + def compute(self, inputs, role): + x = F.relu(self.linear_layer_1(torch.cat([inputs["states"], inputs["taken_actions"]], dim=1))) + x = F.relu(self.linear_layer_2(x)) + return self.linear_layer_3(x), {} + + +# Load and wrap the environment +env = gym.make("dm_control/acrobot-swingup_sparse-v0") +env = wrap_env(env) + +device = env.device + + +# Instantiate a RandomMemory (without replacement) as experience replay memory +memory = RandomMemory(memory_size=20000, num_envs=env.num_envs, device=device, replacement=False) + + +# Instantiate the agent's models (function approximators). +# DDPG requires 4 models, visit its documentation for more details +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ddpg.html#spaces-and-models +models_ddpg = {} +models_ddpg["policy"] = Actor(env.observation_space, env.action_space, device) +models_ddpg["target_policy"] = Actor(env.observation_space, env.action_space, device) +models_ddpg["critic"] = Critic(env.observation_space, env.action_space, device) +models_ddpg["target_critic"] = Critic(env.observation_space, env.action_space, device) + +# Initialize the models' parameters (weights and biases) using a Gaussian distribution +for model in models_ddpg.values(): + model.init_parameters(method_name="normal_", mean=0.0, std=0.1) + + +# Configure and instantiate the agent. +# Only modify some of the default configuration, visit its documentation to see all the options +# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ddpg.html#configuration-and-hyperparameters +cfg_ddpg = DDPG_DEFAULT_CONFIG.copy() +cfg_ddpg["exploration"]["noise"] = OrnsteinUhlenbeckNoise(theta=0.15, sigma=0.1, base_scale=1.0, device=device) +cfg_ddpg["discount_factor"] = 0.98 +cfg_ddpg["batch_size"] = 100 +cfg_ddpg["random_timesteps"] = 1000 +cfg_ddpg["learning_starts"] = 1000 +# logging to TensorBoard and write checkpoints each 75 and 750 timesteps respectively +cfg_ddpg["experiment"]["write_interval"] = 75 +cfg_ddpg["experiment"]["checkpoint_interval"] = 750 + +agent_ddpg = DDPG(models=models_ddpg, + memory=memory, + cfg=cfg_ddpg, + observation_space=env.observation_space, + action_space=env.action_space, + device=device) + + +# Configure and instantiate the RL trainer +cfg_trainer = {"timesteps": 15000, "headless": True} +trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent_ddpg) + +# start training +trainer.train() diff --git a/docs/source/examples/utils/tensorboard_file_iterator.py b/docs/source/examples/utils/tensorboard_file_iterator.py index e8ba7179..01ffb04c 100644 --- a/docs/source/examples/utils/tensorboard_file_iterator.py +++ b/docs/source/examples/utils/tensorboard_file_iterator.py @@ -8,7 +8,7 @@ rewards = [] # load the Tensorboard files and iterate over them (tag: "Reward / Total reward (mean)") -tensorboard_iterator = postprocessing.TensorboardFileIterator("runs/*/events.out.tfevents.*", +tensorboard_iterator = postprocessing.TensorboardFileIterator("runs/*/events.out.tfevents.*", tags=["Reward / Total reward (mean)"]) for dirname, data in tensorboard_iterator: rewards.append(data["Reward / Total reward (mean)"]) diff --git a/docs/source/index.rst b/docs/source/index.rst index 52ca475d..b948cdca 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -1,21 +1,21 @@ SKRL 
- Reinforcement Learning library (|version|) ================================================= -**skrl** is an open-source modular library for Reinforcement Learning written in Python (using `PyTorch `_) and designed with a focus on readability, simplicity, and transparency of algorithm implementation. In addition to supporting the `OpenAI Gym `_ and `DeepMind `_ environment interfaces, it allows loading and configuring `NVIDIA Isaac Gym `_ and `NVIDIA Omniverse Isaac Gym `_ environments, enabling agents' simultaneous training by scopes (subsets of environments among all available environments), which may or may not share resources, in the same run +**skrl** is an open-source modular library for Reinforcement Learning written in Python (using `PyTorch `_) and designed with a focus on readability, simplicity, and transparency of algorithm implementation. In addition to supporting the OpenAI `Gym `_ / Farama `Gymnasium `_, `DeepMind `_ and other environment interfaces, it allows loading and configuring `NVIDIA Isaac Gym `_ and `NVIDIA Omniverse Isaac Gym `_ environments, enabling agents' simultaneous training by scopes (subsets of environments among all available environments), which may or may not share resources, in the same run **Main features:** * Clean code * Modularity and reusability * Documented library, code and implementations - * Support for OpenAI Gym (single and vectorized), DeepMind, NVIDIA Isaac Gym (preview 2, 3 and 4) and NVIDIA Omniverse Isaac Gym environments - * Simultaneous learning by scopes in OpenAI Gym (vectorized), NVIDIA Isaac Gym and NVIDIA Omniverse Isaac Gym + * Support for Gym/Gymnasium (single and vectorized), DeepMind, NVIDIA Isaac Gym (preview 2, 3 and 4), NVIDIA Omniverse Isaac Gym environments, among others + * Simultaneous learning by scopes in Gym/Gymnasium (vectorized), NVIDIA Isaac Gym and NVIDIA Omniverse Isaac Gym .. warning:: **skrl** is under **active continuous development**. Make sure you always have the latest version. Visit the `develop `_ branch or its `documentation `_ to access the latest updates to be released. 
-| **GitHub repository:** https://github.com/Toni-SM/skrl -| **Questions or discussions:** https://github.com/Toni-SM/skrl/discussions +| **GitHub repository:** https://github.com/Toni-SM/skrl +| **Questions or discussions:** https://github.com/Toni-SM/skrl/discussions **Citing skrl:** To cite this library (created at `Mondragon Unibertsitatea `_) use the following reference to its `article `_: *"skrl: Modular and Flexible Library for Reinforcement Learning"* @@ -90,9 +90,9 @@ Agents Environments ^^^^^^^^^^^^ - Definition of the Isaac Gym (preview 2, 3 and 4) and Omniverse Isaac Gym environment loaders, and wrappers for the OpenAI Gym, DeepMind, Isaac Gym and Omniverse Isaac Gym environments + Definition of the Isaac Gym (preview 2, 3 and 4) and Omniverse Isaac Gym environment loaders, and wrappers for the Gym/Gymnasium, DeepMind, Isaac Gym, Omniverse Isaac Gym environments, among others - * :doc:`Wrapping ` **OpenAI Gym**, **DeepMind**, **Isaac Gym** and **Omniverse Isaac Gym** environments + * :doc:`Wrapping ` **Gym/Gymnasium**, **DeepMind**, **Isaac Gym**, **Omniverse Isaac Gym** environments, among others * Loading :doc:`Isaac Gym environments ` * Loading :doc:`Omniverse Isaac Gym environments ` @@ -157,7 +157,7 @@ Trainers :maxdepth: 1 :caption: Trainers :hidden: - + modules/skrl.trainers.base_class modules/skrl.trainers.sequential modules/skrl.trainers.parallel @@ -185,7 +185,7 @@ Resources :maxdepth: 2 :caption: Resources :hidden: - + modules/skrl.resources.noises modules/skrl.resources.schedulers modules/skrl.resources.preprocessors @@ -205,7 +205,7 @@ Utils :maxdepth: 1 :caption: Utils :hidden: - + modules/skrl.utils.utilities modules/skrl.utils.model_instantiators modules/skrl.utils.postprocessing diff --git a/docs/source/intro/data.rst b/docs/source/intro/data.rst index eb09f06c..1c740916 100644 --- a/docs/source/intro/data.rst +++ b/docs/source/intro/data.rst @@ -6,19 +6,19 @@ Saving, loading and logging Tracking metrics (TensorBoard) ------------------------------ +`TensorBoard `_ is used for tracking and visualizing metrics and scalars (coefficients, losses, etc.). The tracking and writing of metrics and scalars is the responsibility of the agents (**can be customized independently for each agent using its configuration dictionary**). + Configuration ^^^^^^^^^^^^^ -`TensorBoard `_ is used for tracking and visualizing metrics and scalars (coefficients, losses, etc.). The tracking and writing of metrics and scalars is the responsibility of the agents (**can be customized independently for each agent using its configuration dictionary**). - Each agent offers the following parameters under the :literal:`"experiment"` key: .. code-block:: python - :emphasize-lines: 5,6,7 + :emphasize-lines: 5-7 DEFAULT_CONFIG = { ... - + "experiment": { "directory": "", # experiment's parent directory "experiment_name": "", # experiment name @@ -26,6 +26,9 @@ Each agent offers the following parameters under the :literal:`"experiment"` key "checkpoint_interval": 1000, # interval for checkpoints (timesteps) "store_separately": False, # whether to store checkpoints separately + + "wandb": False, # whether to use Weights & Biases + "wandb_kwargs": {} # wandb kwargs (see https://docs.wandb.ai/ref/python/init) } } @@ -128,6 +131,49 @@ Tracking custom metrics/scales ---------------- +Tracking metrics (Weights and Biases) +------------------------------------- + +`Weights & Biases `_ is also supported for tracking and visualizing metrics and scalars. 
Its configuration is the responsibility of the agents (**can be customized independently for each agent using its configuration dictionary**). + +Follow the steps described in the Weights & Biases documentation (`Set up wandb `_) to log in to the :literal:`wandb` library on the current machine. + +Configuration +^^^^^^^^^^^^^ + +Each agent offers the following parameters under the :literal:`"experiment"` key. Visit the Weights & Biases documentation for more details about the configuration parameters. + +.. code-block:: python + :emphasize-lines: 12-13 + + DEFAULT_CONFIG = { + ... + + "experiment": { + "directory": "", # experiment's parent directory + "experiment_name": "", # experiment name + "write_interval": 250, # TensorBoard writing interval (timesteps) + + "checkpoint_interval": 1000, # interval for checkpoints (timesteps) + "store_separately": False, # whether to store checkpoints separately + + "wandb": False, # whether to use Weights & Biases + "wandb_kwargs": {} # wandb kwargs (see https://docs.wandb.ai/ref/python/init) + } + } + +* **wandb**: whether to enable support for Weights & Biases. + +* **wandb_kwargs**: keyword argument dictionary used to parameterize the `wandb.init `_ function. If no values are provided for the following parameters, the following values will be set for them: + + * :literal:`"name"`: will be set to the name of the experiment directory. + + * :literal:`"sync_tensorboard"`: will be set to :literal:`True`. + + * :literal:`"config"`: will be updated with the configuration dictionaries of both the agent (and its models) and the trainer. The update will be done even if a value has been set for the parameter. + +---------------- + Checkpoints ----------- @@ -143,7 +189,7 @@ The checkpoint management, as in the previous case, is the responsibility of the DEFAULT_CONFIG = { ...
- + "experiment": { "directory": "", # experiment's parent directory "experiment_name": "", # experiment name @@ -151,6 +197,9 @@ The checkpoint management, as in the previous case, is the responsibility of the "checkpoint_interval": 1000, # interval for checkpoints (timesteps) "store_separately": False, # whether to store checkpoints separately + + "wandb": False, # whether to use Weights & Biases + "wandb_kwargs": {} # wandb kwargs (see https://docs.wandb.ai/ref/python/init) } } @@ -214,8 +263,8 @@ The following code snippets show how to load the checkpoints through the instant nn.ReLU(), nn.Linear(32, self.num_actions)) - def compute(self, states, taken_actions, role): - return self.net(states) + def compute(self, inputs, role): + return self.net(inputs["states"]), {} # Instantiate the model policy = Policy(env.observation_space, env.action_space, env.device, clip_actions=True) @@ -273,8 +322,8 @@ The following code snippets show how to migrate checkpoints from other libraries nn.ReLU(), nn.Linear(32, self.num_actions)) - def compute(self, states, taken_actions, role): - return self.net(states) + def compute(self, inputs, role): + return self.net(inputs["states"]), {} # Instantiate the model policy = Policy(env.observation_space, env.action_space, env.device, clip_actions=True) @@ -305,9 +354,9 @@ Memories can be automatically exported to files at each filling cycle (before da from skrl.memories.torch import RandomMemory # Instantiate a memory and enable its export - memory = RandomMemory(memory_size=16, - num_envs=env.num_envs, - device=device, + memory = RandomMemory(memory_size=16, + num_envs=env.num_envs, + device=device, export=True, export_format="pt", export_directory="./memories") diff --git a/docs/source/intro/examples.rst b/docs/source/intro/examples.rst index c15e6c92..48d31708 100644 --- a/docs/source/intro/examples.rst +++ b/docs/source/intro/examples.rst @@ -3,8 +3,30 @@ Examples ======== +In this section, you will find a variety of examples that demonstrate how to use this library to solve reinforcement learning tasks. With the knowledge and skills you gain from trying these examples, you will be well on your way to using this library to solve your reinforcement learning problems + .. contents:: Table of Contents - :depth: 1 + :depth: 2 + :local: + :backlinks: none + +.. raw:: html + +
+ +.. note:: + + It is recommended to use the Table of Contents in the sidebar or in this section to improve the browsing experience + +.. raw:: html + +

+ +Gym/Gymnasium +------------- + +.. contents:: + :depth: 2 :local: :backlinks: none @@ -12,15 +34,15 @@ Examples
-Learning in an OpenAI Gym environment -------------------------------------- +Gym/Gymnasium environments +^^^^^^^^^^^^^^^^^^^^^^^^^^ -These examples perform the training of one agent in an OpenAI Gym environment (**one agent, one environment**) +These examples perform the training of one agent in a Gym/Gymnasium environment (**one agent, one environment**) .. image:: ../_static/imgs/example_gym.png :width: 100% :align: center - :alt: OpenAI Gym environments + :alt: Gym/Gymnasium environments .. raw:: html @@ -28,130 +50,189 @@ These examples perform the training of one agent in an OpenAI Gym environment (* The following components or practices are exemplified (highlighted): - - Load and wrap an OpenAI Gym environment: **Pendulum (DDPG)**, **CartPole (CEM)** + - Load and wrap a Gym environment: **Pendulum (DDPG)**, **CartPole (CEM)** + - Recurrent neural network models (RNN, GRU, LSTM): **PendulumNoVel (DDPG)** - Instantiate models using the model instantiation utility: **CartPole (DQN)** - Create a tabular model (:math:`\epsilon`-greedy policy): **Taxi (SARSA)**, **FrozenLake (Q-Learning)** - Load a checkpoint during evaluation: **Pendulum (DDPG)**, **CartPole (CEM)**, **CartPole (DQN)**, **Taxi (SARSA)**, **FrozenLake (Q-Learning)** +**Benchmark results** are listed in `Benchmark results #32 (Gym/Gymnasium) `_ + .. tabs:: - + .. tab:: Pendulum (DDPG) .. tabs:: - + .. group-tab:: Training - :download:`gym_pendulum_ddpg.py <../examples/gym/gym_pendulum_ddpg.py>` + | :download:`ddpg_gym_pendulum.py <../examples/gym/ddpg_gym_pendulum.py>` + | :download:`ddpg_gymnasium_pendulum.py <../examples/gymnasium/ddpg_gymnasium_pendulum.py>` - .. literalinclude:: ../examples/gym/gym_pendulum_ddpg.py + .. literalinclude:: ../examples/gym/ddpg_gym_pendulum.py :language: python - :emphasize-lines: 1, 13, 50-56 + :emphasize-lines: 1, 13, 51-57 .. group-tab:: Evaluation - - :download:`gym_pendulum_ddpg_eval.py <../examples/gym/gym_pendulum_ddpg_eval.py>` + + | :download:`ddpg_gym_pendulum_eval.py <../examples/gym/ddpg_gym_pendulum_eval.py>` + | :download:`ddpg_gymnasium_pendulum_eval.py <../examples/gymnasium/ddpg_gymnasium_pendulum_eval.py>` **Note:** It is necessary to adjust the checkpoint path according to the directories generated by the new experiments **Note:** Warnings such as :literal:`[skrl:WARNING] Cannot load the module. The agent doesn't have such an instance` can be ignored without problems. The reason for this is that during the evaluation, not all components such as optimizers or other models apart from the policy are defined - .. literalinclude:: ../examples/gym/gym_pendulum_ddpg_eval.py + .. literalinclude:: ../examples/gym/ddpg_gym_pendulum_eval.py :language: python :emphasize-lines: 67 + .. tab:: PendulumNoVel (DDPG) + + .. note:: + + The examples use a wrapper around the original environment to mask the velocity in the observation. The intention is to make the MDP partially observable and to show the capabilities of recurrent neural networks + + More examples with other algorithms can be found in the repository documentation `example folder `_ and in the benchmark results indicated above + + .. tabs:: + + .. tab:: RNN + + .. tabs:: + + .. group-tab:: Training + + | :download:`ddpg_gym_pendulumnovel_rnn.py <../examples/gym/ddpg_gym_pendulumnovel_rnn.py>` + + .. literalinclude:: ../examples/gym/ddpg_gym_pendulumnovel_rnn.py + :language: python + :emphasize-lines: 31-34, 40-43, 50-77, 86, 99-102, 108-111, 118-141, 149 + + .. tab:: GRU + + .. tabs:: + + .. 
group-tab:: Training + + | :download:`ddpg_gym_pendulumnovel_gru.py <../examples/gym/ddpg_gym_pendulumnovel_gru.py>` + + .. literalinclude:: ../examples/gym/ddpg_gym_pendulumnovel_gru.py + :language: python + :emphasize-lines: 31-34, 40-43, 50-77, 86, 99-102, 108-111, 118-141, 149 + + .. tab:: LSTM + + .. tabs:: + + .. group-tab:: Training + + | :download:`ddpg_gym_pendulumnovel_lstm.py <../examples/gym/ddpg_gym_pendulumnovel_lstm.py>` + + .. literalinclude:: ../examples/gym/ddpg_gym_pendulumnovel_lstm.py + :language: python + :emphasize-lines: 31-34, 40-44, 51-82, 91, 104-107, 113-117, 127-151, 159 + .. tab:: CartPole (CEM) .. tabs:: .. group-tab:: Training - :download:`gym_cartpole_cem.py <../examples/gym/gym_cartpole_cem.py>` + | :download:`cem_gym_cartpole.py <../examples/gym/cem_gym_cartpole.py>` + | :download:`cem_gymnasium_cartpole.py <../examples/gymnasium/cem_gymnasium_cartpole.py>` - .. literalinclude:: ../examples/gym/gym_cartpole_cem.py + .. literalinclude:: ../examples/gym/cem_gym_cartpole.py :language: python :emphasize-lines: 1, 11, 33-39 .. group-tab:: Evaluation - :download:`gym_cartpole_cem_eval.py <../examples/gym/gym_cartpole_cem_eval.py>` + | :download:`cem_gym_cartpole_eval.py <../examples/gym/cem_gym_cartpole_eval.py>` + | :download:`cem_gymnasium_cartpole_eval.py <../examples/gymnasium/cem_gymnasium_cartpole_eval.py>` **Note:** It is necessary to adjust the checkpoint path according to the directories generated by the new experiments **Note:** Warnings such as :literal:`[skrl:WARNING] Cannot load the module. The agent doesn't have such an instance` can be ignored without problems. The reason for this is that during the evaluation, not all components such as optimizers or other models apart from the policy are defined - .. literalinclude:: ../examples/gym/gym_cartpole_cem_eval.py + .. literalinclude:: ../examples/gym/cem_gym_cartpole_eval.py :language: python :emphasize-lines: 68 .. tab:: CartPole (DQN) .. tabs:: - + .. group-tab:: Training - - :download:`gym_cartpole_dqn.py <../examples/gym/gym_cartpole_dqn.py>` - .. literalinclude:: ../examples/gym/gym_cartpole_dqn.py + | :download:`dqn_gym_cartpole.py <../examples/gym/dqn_gym_cartpole.py>` + | :download:`dqn_gymnasium_cartpole.py <../examples/gymnasium/dqn_gymnasium_cartpole.py>` + + .. literalinclude:: ../examples/gym/dqn_gym_cartpole.py :language: python :emphasize-lines: 4, 31-51 - + .. group-tab:: Evaluation - - :download:`gym_cartpole_dqn_eval.py <../examples/gym/gym_cartpole_dqn_eval.py>` - + + | :download:`dqn_gym_cartpole_eval.py <../examples/gym/dqn_gym_cartpole_eval.py>` + | :download:`dqn_gymnasium_cartpole_eval.py <../examples/gymnasium/dqn_gymnasium_cartpole_eval.py>` + **Note:** It is necessary to adjust the checkpoint path according to the directories generated by the new experiments **Note:** Warnings such as :literal:`[skrl:WARNING] Cannot load the module. The agent doesn't have such an instance` can be ignored without problems. The reason for this is that during the evaluation, not all components such as optimizers or other models apart from the policy are defined - .. literalinclude:: ../examples/gym/gym_cartpole_dqn_eval.py + .. literalinclude:: ../examples/gym/dqn_gym_cartpole_eval.py :language: python :emphasize-lines: 56 - + .. tab:: Taxi (SARSA) .. tabs:: - + .. group-tab:: Training - - :download:`gym_taxi_sarsa.py <../examples/gym/gym_taxi_sarsa.py>` - .. 
literalinclude:: ../examples/gym/gym_taxi_sarsa.py + | :download:`sarsa_gym_taxi.py <../examples/gym/sarsa_gym_taxi.py>` + | :download:`sarsa_gymnasium_taxi.py <../examples/gymnasium/sarsa_gymnasium_taxi.py>` + + .. literalinclude:: ../examples/gym/sarsa_gym_taxi.py :language: python :emphasize-lines: 6, 13-30 - + .. group-tab:: Evaluation - - :download:`gym_taxi_sarsa_eval.py <../examples/gym/gym_taxi_sarsa_eval.py>` - + + | :download:`sarsa_gym_taxi_eval.py <../examples/gym/sarsa_gym_taxi_eval.py>` + | :download:`sarsa_gymnasium_taxi_eval.py <../examples/gymnasium/sarsa_gymnasium_taxi_eval.py>` + **Note:** It is necessary to adjust the checkpoint path according to the directories generated by the new experiments **Note:** Warnings such as :literal:`[skrl:WARNING] Cannot load the module. The agent doesn't have such an instance` can be ignored without problems. The reason for this is that during the evaluation, not all components such as optimizers or other models apart from the policy are defined - .. literalinclude:: ../examples/gym/gym_taxi_sarsa_eval.py + .. literalinclude:: ../examples/gym/sarsa_gym_taxi_eval.py :language: python :emphasize-lines: 70 - + .. tab:: FrozenLake (Q-learning) .. tabs:: - + .. group-tab:: Training - - :download:`gym_frozen_lake_q_learning.py <../examples/gym/gym_frozen_lake_q_learning.py>` - .. literalinclude:: ../examples/gym/gym_frozen_lake_q_learning.py + | :download:`q_learning_gym_frozen_lake.py <../examples/gym/q_learning_gym_frozen_lake.py>` + | :download:`q_learning_gymnasium_frozen_lake.py <../examples/gymnasium/q_learning_gymnasium_frozen_lake.py>` + + .. literalinclude:: ../examples/gym/q_learning_gym_frozen_lake.py :language: python :emphasize-lines: 6, 13-30 - + .. group-tab:: Evaluation - - :download:`gym_frozen_lake_q_learning_eval.py <../examples/gym/gym_frozen_lake_q_learning_eval.py>` - + + | :download:`q_learning_gym_frozen_lake_eval.py <../examples/gym/q_learning_gym_frozen_lake_eval.py>` + | :download:`q_learning_gymnasium_frozen_lake_eval.py <../examples/gymnasium/q_learning_gymnasium_frozen_lake_eval.py>` + **Note:** It is necessary to adjust the checkpoint path according to the directories generated by the new experiments **Note:** Warnings such as :literal:`[skrl:WARNING] Cannot load the module. The agent doesn't have such an instance` can be ignored without problems. The reason for this is that during the evaluation, not all components such as optimizers or other models apart from the policy are defined - .. literalinclude:: ../examples/gym/gym_frozen_lake_q_learning_eval.py + .. literalinclude:: ../examples/gym/q_learning_gym_frozen_lake_eval.py :language: python :emphasize-lines: 70 @@ -159,62 +240,66 @@ The following components or practices are exemplified (highlighted):
-Learning in an OpenAI Gym vectorized environment ------------------------------------------------- +Gym/Gymnasium vectorized environments +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -These examples perform the training of one agent in an OpenAI Gym vectorized environment (**one agent, multiple independent copies of the same environment in parallel**) +These examples perform the training of one agent in a Gym/Gymnasium vectorized environment (**one agent, multiple independent copies of the same environment in parallel**) The following components or practices are exemplified (highlighted): - - Load and wrap an OpenAI Gym vectorized environment: **Pendulum (DDPG)**, **CartPole (DQN)**, **Taxi (SARSA)**, **FrozenLake (Q-Learning)** + - Load and wrap a Gym vectorized environment: **Pendulum (DDPG)**, **CartPole (DQN)**, **Taxi (SARSA)**, **FrozenLake (Q-Learning)** .. tabs:: - + .. tab:: Pendulum (DDPG) .. tabs:: - + .. group-tab:: Training - :download:`gym_vector_pendulum_ddpg.py <../examples/gym/gym_vector_pendulum_ddpg.py>` + | :download:`ddpg_gym_pendulum_vector.py <../examples/gym/ddpg_gym_pendulum_vector.py>` + | :download:`ddpg_gymnasium_pendulum_vector.py <../examples/gymnasium/ddpg_gymnasium_pendulum_vector.py>` - .. literalinclude:: ../examples/gym/gym_vector_pendulum_ddpg.py + .. literalinclude:: ../examples/gym/ddpg_gym_pendulum_vector.py :language: python :emphasize-lines: 1, 13, 50-56 .. tab:: CartPole (DQN) .. tabs:: - + .. group-tab:: Training - - :download:`gym_vector_cartpole_dqn.py <../examples/gym/gym_vector_cartpole_dqn.py>` - .. literalinclude:: ../examples/gym/gym_vector_cartpole_dqn.py + | :download:`dqn_gym_cartpole_vector.py <../examples/gym/dqn_gym_cartpole_vector.py>` + | :download:`dqn_gymnasium_cartpole_vector.py <../examples/gymnasium/dqn_gymnasium_cartpole_vector.py>` + + .. literalinclude:: ../examples/gym/dqn_gym_cartpole_vector.py :language: python :emphasize-lines: 1, 8, 13-19 - + .. tab:: Taxi (SARSA) .. tabs:: - + .. group-tab:: Training - - :download:`gym_vector_taxi_sarsa.py <../examples/gym/gym_vector_taxi_sarsa.py>` - .. literalinclude:: ../examples/gym/gym_vector_taxi_sarsa.py + | :download:`sarsa_gym_taxi_vector.py <../examples/gym/sarsa_gym_taxi_vector.py>` + | :download:`sarsa_gymnasium_taxi_vector.py <../examples/gymnasium/sarsa_gymnasium_taxi_vector.py>` + + .. literalinclude:: ../examples/gym/sarsa_gym_taxi_vector.py :language: python :emphasize-lines: 1, 9, 35-41 - + .. tab:: FrozenLake (Q-learning) .. tabs:: - + .. group-tab:: Training - - :download:`gym_vector_frozen_lake_q_learning.py <../examples/gym/gym_vector_frozen_lake_q_learning.py>` - .. literalinclude:: ../examples/gym/gym_vector_frozen_lake_q_learning.py + | :download:`q_learning_gym_frozen_lake_vector.py <../examples/gym/q_learning_gym_frozen_lake_vector.py>` + | :download:`q_learning_gymnasium_frozen_lake_vector.py <../examples/gymnasium/q_learning_gymnasium_frozen_lake_vector.py>` + + .. literalinclude:: ../examples/gym/q_learning_gym_frozen_lake_vector.py :language: python :emphasize-lines: 1, 9, 35-41 @@ -222,10 +307,79 @@ The following components or practices are exemplified (highlighted):
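For reference, the vectorized examples above build and wrap the environment along these lines (a minimal sketch; the environment id and the number of copies are illustrative):

.. code-block:: python

    import gym

    from skrl.envs.torch import wrap_env

    # one agent interacting with several independent copies of the same environment
    env = gym.vector.make("Pendulum-v1", num_envs=10, asynchronous=False)
    env = wrap_env(env)  # or wrap_env(env, wrapper="gym")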
-Learning in a DeepMind environment ----------------------------------- +Farama Shimmy (converted environments) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The following examples show the training in several popular environments (Atari, DeepMind Control and OpenAI Gym) that have been converted to the Gymnasium API using the `Shimmy `_ (API conversion tool) package + +.. image:: ../_static/imgs/example_shimmy.png + :width: 100% + :align: center + :alt: Shimmy (converted environments) + +.. note:: + + From **skrl**, no extra implementation is necessary, since it fully supports Gymnasium API + +.. note:: + + Because the Gymnasium API requires that the rendering mode be specified during the initialization of the environment, it is not enough to set the :literal:`headless` option in the trainer configuration to render the environment. In this case, it is necessary to call the :literal:`gymnasium.make` function using :literal:`render_mode="human"` or any other supported option + +.. tabs:: + + .. tab:: Atari: Pong (DQN) + + .. tabs:: + + .. group-tab:: Training + + | :download:`dqn_shimmy_atari_pong.py <../examples/shimmy/dqn_shimmy_atari_pong.py>` + + .. literalinclude:: ../examples/shimmy/dqn_shimmy_atari_pong.py + :language: python + + .. tab:: DeepMind: Acrobot (SAC) + + .. tabs:: + + .. group-tab:: Training + + | :download:`sac_shimmy_dm_control_acrobot_swingup_sparse.py <../examples/shimmy/sac_shimmy_dm_control_acrobot_swingup_sparse.py>` + + .. literalinclude:: ../examples/shimmy/sac_shimmy_dm_control_acrobot_swingup_sparse.py + :language: python + + .. tab:: Gym compatibility (DDPG) + + .. tabs:: + + .. group-tab:: Training + + | :download:`ddpg_openai_gym_compatibility_pendulum.py <../examples/shimmy/ddpg_openai_gym_compatibility_pendulum.py>` + + .. literalinclude:: ../examples/shimmy/ddpg_openai_gym_compatibility_pendulum.py + :language: python + +.. raw:: html + +
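As noted above, rendering for the Shimmy-converted (Gymnasium API) environments is selected when the environment is created, not through the trainer's :literal:`headless` option. A minimal sketch (the environment id is illustrative):

.. code-block:: python

    import gymnasium as gym

    from skrl.envs.torch import wrap_env

    # the rendering mode must be requested at environment creation time
    env = gym.make("ALE/Pong-v5", render_mode="human")
    env = wrap_env(env)  # or wrap_env(env, wrapper="gymnasium")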

+ +Other supported APIs +-------------------- + +.. contents:: + :depth: 2 + :local: + :backlinks: none + +.. raw:: html + +
+ +DeepMind environments +^^^^^^^^^^^^^^^^^^^^^ -These examples perform the training of one agent in an DeepMind environment (**one agent, one environment**) +These examples perform the training of one agent in a DeepMind environment (**one agent, one environment**) .. image:: ../_static/imgs/example_deepmind.png :width: 100% @@ -242,11 +396,11 @@ The following components or practices are exemplified (highlighted): - Map the observation/state space (flat tensor) to the original environment space to be used by the model: **reach_site_vision (SAC)** .. tabs:: - + .. tab:: suite:cartpole (DDPG) .. tabs:: - + .. group-tab:: Training :download:`dm_suite_cartpole_swingup_ddpg.py <../examples/deepmind/dm_suite_cartpole_swingup_ddpg.py>` @@ -254,25 +408,73 @@ The following components or practices are exemplified (highlighted): .. literalinclude:: ../examples/deepmind/dm_suite_cartpole_swingup_ddpg.py :language: python :emphasize-lines: 1, 13, 50-51 - + .. tab:: manipulation:reach_site_vision (SAC) .. tabs:: - + .. group-tab:: Training :download:`dm_manipulation_stack_sac.py <../examples/deepmind/dm_manipulation_stack_sac.py>` .. literalinclude:: ../examples/deepmind/dm_manipulation_stack_sac.py :language: python - :emphasize-lines: 67, 80, 83-84, 112, 115, 118-119 + :emphasize-lines: 69, 82, 85-86, 118, 121, 124-125 .. raw:: html
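Loading and wrapping a DeepMind environment, as done in the examples above, reduces to a few lines. A minimal sketch, assuming :literal:`dm_control` is installed (the domain and task names are illustrative):

.. code-block:: python

    from dm_control import suite

    from skrl.envs.torch import wrap_env

    # load a DeepMind control suite environment and expose it through the common interface
    env = suite.load(domain_name="cartpole", task_name="swingup")
    env = wrap_env(env)  # or wrap_env(env, wrapper="dm")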
-Learning in an Isaac Gym environment ------------------------------------- +Robosuite environments +^^^^^^^^^^^^^^^^^^^^^^ + +These examples perform the training of one agent in a robosuite environment (**one agent, one environment**) + +.. image:: ../_static/imgs/example_robosuite.png + :width: 50% + :align: center + :alt: robosuite environments + +.. raw:: html + +
+ +The following components or practices are exemplified (highlighted): + + - Load and wrap a robosuite environment: **TwoArmLift (TD3)** + +.. tabs:: + + .. tab:: robosuite:TwoArmLift (TD3) + + .. tabs:: + + .. group-tab:: Training + + :download:`td3_robosuite_two_arm_lift.py <../examples/robosuite/td3_robosuite_two_arm_lift.py>` (not tuned) + + .. literalinclude:: ../examples/robosuite/td3_robosuite_two_arm_lift.py + :language: python + :emphasize-lines: 1-2, 51-65 + +.. raw:: html + +
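The robosuite example above first creates the task through the robosuite API and then wraps it for skrl. A minimal sketch, assuming robosuite is installed (the robots, controller and keyword arguments shown are an illustrative subset):

.. code-block:: python

    import robosuite
    from robosuite.controllers import load_controller_config

    from skrl.envs.torch import wrap_env

    # create a two-arm task driven by an operational space controller
    controller_config = load_controller_config(default_controller="OSC_POSE")
    env = robosuite.make("TwoArmLift",
                         robots=["Sawyer", "Panda"],
                         controller_configs=controller_config,
                         has_renderer=False,
                         has_offscreen_renderer=False,
                         use_camera_obs=False)
    env = wrap_env(env)  # or wrap_env(env, wrapper="robosuite")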

+ +Isaac Gym preview +----------------- + +.. contents:: + :depth: 2 + :local: + :backlinks: none + +.. raw:: html + +
+ +Isaac Gym environments +^^^^^^^^^^^^^^^^^^^^^^ These examples perform the training of an agent in the `Isaac Gym environments `_ (**one agent, multiple environments**) @@ -326,20 +528,20 @@ The PPO agent configuration is mapped, as far as possible, from the rl_games' A2 # trainer timesteps = horizon_length * max_epochs -**Benchmark results** for Isaac Gym are listed in `Benchmark results #32 `_. +**Benchmark results** are listed in `Benchmark results #32 (NVIDIA Isaac Gym) `_ .. note:: Isaac Gym environments implement a functionality to get their configuration from the command line. Because of this feature, setting the :literal:`headless` option from the trainer configuration will not work. In this case, it is necessary to invoke the scripts as follows: :literal:`python script.py headless=True` for Isaac Gym environments (preview 3 and preview 4) or :literal:`python script.py --headless` for Isaac Gym environments (preview 2) .. tabs:: - + .. tab:: Isaac Gym environments (training) .. tabs:: - + .. tab:: AllegroHand - + :download:`ppo_allegro_hand.py <../examples/isaacgym/ppo_allegro_hand.py>` .. literalinclude:: ../examples/isaacgym/ppo_allegro_hand.py @@ -347,7 +549,7 @@ The PPO agent configuration is mapped, as far as possible, from the rl_games' A2 :emphasize-lines: 2, 19, 56-62 .. tab:: Ant - + :download:`ppo_ant.py <../examples/isaacgym/ppo_ant.py>` .. literalinclude:: ../examples/isaacgym/ppo_ant.py @@ -355,7 +557,7 @@ The PPO agent configuration is mapped, as far as possible, from the rl_games' A2 :emphasize-lines: 13-14, 56-57 .. tab:: Anymal - + :download:`ppo_anymal.py <../examples/isaacgym/ppo_anymal.py>` .. literalinclude:: ../examples/isaacgym/ppo_anymal.py @@ -363,7 +565,7 @@ The PPO agent configuration is mapped, as far as possible, from the rl_games' A2 :emphasize-lines: 13-14, 56-57 .. tab:: AnymalTerrain - + :download:`ppo_anymal_terrain.py <../examples/isaacgym/ppo_anymal_terrain.py>` .. literalinclude:: ../examples/isaacgym/ppo_anymal_terrain.py @@ -371,7 +573,7 @@ The PPO agent configuration is mapped, as far as possible, from the rl_games' A2 :emphasize-lines: 11, 101-104 .. tab:: BallBalance - + :download:`ppo_ball_balance.py <../examples/isaacgym/ppo_ball_balance.py>` .. literalinclude:: ../examples/isaacgym/ppo_ball_balance.py @@ -379,7 +581,7 @@ The PPO agent configuration is mapped, as far as possible, from the rl_games' A2 :emphasize-lines: 11, 96-99 .. tab:: Cartpole - + :download:`ppo_cartpole.py <../examples/isaacgym/ppo_cartpole.py>` .. literalinclude:: ../examples/isaacgym/ppo_cartpole.py @@ -387,7 +589,7 @@ The PPO agent configuration is mapped, as far as possible, from the rl_games' A2 :emphasize-lines: 15, 19 .. tab:: Cartpole (TRPO) - + :download:`trpo_cartpole.py <../examples/isaacgym/trpo_cartpole.py>` .. literalinclude:: ../examples/isaacgym/trpo_cartpole.py @@ -395,7 +597,7 @@ The PPO agent configuration is mapped, as far as possible, from the rl_games' A2 :emphasize-lines: 14, 18 .. tab:: FrankaCabinet - + :download:`ppo_franka_cabinet.py <../examples/isaacgym/ppo_franka_cabinet.py>` .. literalinclude:: ../examples/isaacgym/ppo_franka_cabinet.py @@ -403,7 +605,7 @@ The PPO agent configuration is mapped, as far as possible, from the rl_games' A2 :emphasize-lines: 10, 84-85 .. tab:: Humanoid - + :download:`ppo_humanoid.py <../examples/isaacgym/ppo_humanoid.py>` .. literalinclude:: ../examples/isaacgym/ppo_humanoid.py @@ -411,7 +613,7 @@ The PPO agent configuration is mapped, as far as possible, from the rl_games' A2 :emphasize-lines: 10, 84-85 .. 
tab:: Humanoid (AMP) - + :download:`amp_humanoid.py <../examples/isaacgym/amp_humanoid.py>` .. literalinclude:: ../examples/isaacgym/amp_humanoid.py @@ -419,7 +621,7 @@ The PPO agent configuration is mapped, as far as possible, from the rl_games' A2 :emphasize-lines: 89, 124, 135, 138-139 .. tab:: Ingenuity - + :download:`ppo_ingenuity.py <../examples/isaacgym/ppo_ingenuity.py>` .. literalinclude:: ../examples/isaacgym/ppo_ingenuity.py @@ -427,7 +629,7 @@ The PPO agent configuration is mapped, as far as possible, from the rl_games' A2 :emphasize-lines: 2, 19, 56-62 .. tab:: Quadcopter - + :download:`ppo_quadcopter.py <../examples/isaacgym/ppo_quadcopter.py>` .. literalinclude:: ../examples/isaacgym/ppo_quadcopter.py @@ -435,7 +637,7 @@ The PPO agent configuration is mapped, as far as possible, from the rl_games' A2 :emphasize-lines: 95 .. tab:: ShadowHand - + :download:`ppo_shadow_hand.py <../examples/isaacgym/ppo_shadow_hand.py>` .. literalinclude:: ../examples/isaacgym/ppo_shadow_hand.py @@ -443,7 +645,7 @@ The PPO agent configuration is mapped, as far as possible, from the rl_games' A2 :emphasize-lines: 97 .. tab:: Trifinger - + :download:`ppo_trifinger.py <../examples/isaacgym/ppo_trifinger.py>` .. literalinclude:: ../examples/isaacgym/ppo_trifinger.py @@ -453,11 +655,11 @@ The PPO agent configuration is mapped, as far as possible, from the rl_games' A2 .. tab:: Isaac Gym environments (evaluation) .. tabs:: - + .. tab:: Cartpole - + :download:`ppo_cartpole_eval.py <../examples/isaacgym/ppo_cartpole_eval.py>` - + **Note:** It is necessary to adjust the checkpoint path according to the directories generated by the new experiments **Note:** Warnings such as :literal:`[skrl:WARNING] Cannot load the module. The agent doesn't have such an instance` can be ignored without problems. The reason for this is that during the evaluation, not all components such as optimizers or other models apart from the policy are defined @@ -470,8 +672,8 @@ The PPO agent configuration is mapped, as far as possible, from the rl_games' A2
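Each Isaac Gym training script above obtains the environment through one of skrl's loaders before wrapping it. A minimal sketch for preview 4 (the task name is illustrative; remember that :literal:`headless` is passed on the command line as described in the note above):

.. code-block:: python

    from skrl.envs.torch import wrap_env
    from skrl.envs.torch import load_isaacgym_env_preview4

    # load the Isaac Gym environment (its configuration is taken from the command line)
    env = load_isaacgym_env_preview4(task_name="Cartpole")
    env = wrap_env(env)  # or wrap_env(env, wrapper="isaacgym-preview4")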
-Learning by scopes in an Isaac Gym environment ----------------------------------------------- +Isaac Gym environments (learning by scopes) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ These examples perform the training of 3 agents by scopes in Isaac Gym's Cartpole environment in the same run (**multiple agents and environments**) @@ -500,15 +702,15 @@ The following components or practices are exemplified (highlighted): .. note:: Isaac Gym environments implement a functionality to get their configuration from the command line. Because of this feature, setting the :literal:`headless` option from the trainer configuration will not work. In this case, it is necessary to invoke the scripts as follows: :literal:`python script.py headless=True` for Isaac Gym environments (preview 3 and preview 4) or :literal:`python script.py --headless` for Isaac Gym environments (preview 2) - + .. tabs:: - + .. tab:: Shared memory .. tabs:: - + .. tab:: Sequential training - + :download:`isaacgym_sequential_shared_memory.py <../examples/isaacgym/isaacgym_sequential_shared_memory.py>` .. literalinclude:: ../examples/isaacgym/isaacgym_sequential_shared_memory.py @@ -516,9 +718,9 @@ The following components or practices are exemplified (highlighted): :emphasize-lines: 75, 149, 156, 163, 174-175 .. tab:: Sequential evaluation - + :download:`isaacgym_sequential_shared_memory_eval.py <../examples/isaacgym/isaacgym_sequential_shared_memory_eval.py>` - + **Note:** It is necessary to adjust the checkpoint path according to the directories generated by the new experiments **Note:** Warnings such as :literal:`[skrl:WARNING] Cannot load the module. The agent doesn't have such an instance` can be ignored without problems. The reason for this is that during the evaluation, not all components such as optimizers or other models apart from the policy are defined @@ -530,9 +732,9 @@ The following components or practices are exemplified (highlighted): .. tab:: No shared memory .. tabs:: - + .. tab:: Sequential training - + :download:`isaacgym_sequential_no_shared_memory.py <../examples/isaacgym/isaacgym_sequential_no_shared_memory.py>` .. literalinclude:: ../examples/isaacgym/isaacgym_sequential_no_shared_memory.py @@ -540,7 +742,7 @@ The following components or practices are exemplified (highlighted): :emphasize-lines: 75-77, 151, 158, 165, 176-177 .. tab:: Parallel training - + :download:`isaacgym_parallel_no_shared_memory.py <../examples/isaacgym/isaacgym_parallel_no_shared_memory.py>` .. literalinclude:: ../examples/isaacgym/isaacgym_parallel_no_shared_memory.py @@ -548,9 +750,9 @@ The following components or practices are exemplified (highlighted): :emphasize-lines: 13, 67, 176-179 .. tab:: Sequential eval... - + :download:`isaacgym_sequential_no_shared_memory_eval.py <../examples/isaacgym/isaacgym_sequential_no_shared_memory_eval.py>` - + **Note:** It is necessary to adjust the checkpoint path according to the directories generated by the new experiments **Note:** Warnings such as :literal:`[skrl:WARNING] Cannot load the module. The agent doesn't have such an instance` can be ignored without problems. The reason for this is that during the evaluation, not all components such as optimizers or other models apart from the policy are defined @@ -560,9 +762,9 @@ The following components or practices are exemplified (highlighted): :emphasize-lines: 113-115, 126 .. tab:: Parallel eval... 
- + :download:`isaacgym_parallel_no_shared_memory_eval.py <../examples/isaacgym/isaacgym_parallel_no_shared_memory_eval.py>` - + **Note:** It is necessary to adjust the checkpoint path according to the directories generated by the new experiments **Note:** Warnings such as :literal:`[skrl:WARNING] Cannot load the module. The agent doesn't have such an instance` can be ignored without problems. The reason for this is that during the evaluation, not all components such as optimizers or other models apart from the policy are defined @@ -571,12 +773,24 @@ The following components or practices are exemplified (highlighted): :language: python :emphasize-lines: 115-117, 128 +.. raw:: html + +
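In the learning-by-scopes examples above, the agents and the slice of environments (scope) assigned to each of them are handed to the trainer together. A minimal sketch of that wiring (the agent names and scope sizes are illustrative, and :literal:`env` and the agents are assumed to be created as in the scripts above):

.. code-block:: python

    from skrl.trainers.torch import SequentialTrainer

    # train 3 agents in the same run, each one owning a scope (number of sub-environments)
    cfg_trainer = {"timesteps": 8000, "headless": True}
    trainer = SequentialTrainer(cfg=cfg_trainer,
                                env=env,
                                agents=[agent_ddpg, agent_td3, agent_sac],
                                agents_scope=[100, 200, 212])
    trainer.train()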

+ +Omniverse Isaac Gym +------------------- + +.. contents:: + :depth: 2 + :local: + :backlinks: none + .. raw:: html
-Learning in an Omniverse Isaac Gym environment ----------------------------------------------- +Omniverse Isaac Gym environments +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ These examples perform the training of an agent in the `Omniverse Isaac Gym environments `_ (**one agent, multiple environments**) @@ -628,7 +842,7 @@ The PPO agent configuration is mapped, as far as possible, from the rl_games' A2 # trainer timesteps = horizon_length * max_epochs -**Benchmark results** for Omniverse Isaac Gym are listed in `Benchmark results #32 `_. +**Benchmark results** are listed in `Benchmark results #32 (NVIDIA Omniverse Isaac Gym) `_ .. note:: @@ -641,15 +855,15 @@ The PPO agent configuration is mapped, as far as possible, from the rl_games' A2 .. tabs:: .. tab:: AllegroHand - + :download:`ppo_allegro_hand.py <../examples/omniisaacgym/ppo_allegro_hand.py>` .. literalinclude:: ../examples/omniisaacgym/ppo_allegro_hand.py :language: python :emphasize-lines: 11-12, 54-55 - + .. tab:: Ant - + :download:`ppo_ant.py <../examples/omniisaacgym/ppo_ant.py>` .. literalinclude:: ../examples/omniisaacgym/ppo_ant.py @@ -657,15 +871,15 @@ The PPO agent configuration is mapped, as far as possible, from the rl_games' A2 :emphasize-lines: 11-12, 54-55 .. tab:: Ant (multi-threaded) - + :download:`ppo_ant_mt.py <../examples/omniisaacgym/ppo_ant_mt.py>` .. literalinclude:: ../examples/omniisaacgym/ppo_ant_mt.py :language: python :emphasize-lines: 1, 13-14, 56-57, 117, 121 - + .. tab:: Anymal - + :download:`ppo_anymal.py <../examples/omniisaacgym/ppo_anymal.py>` .. literalinclude:: ../examples/omniisaacgym/ppo_anymal.py @@ -673,7 +887,7 @@ The PPO agent configuration is mapped, as far as possible, from the rl_games' A2 :emphasize-lines: 11-12, 54-55 .. tab:: AnymalTerrain - + :download:`ppo_anymal_terrain.py <../examples/omniisaacgym/ppo_anymal_terrain.py>` .. literalinclude:: ../examples/omniisaacgym/ppo_anymal_terrain.py @@ -689,7 +903,7 @@ The PPO agent configuration is mapped, as far as possible, from the rl_games' A2 :emphasize-lines: 9, 94-97 .. tab:: Cartpole - + :download:`ppo_cartpole.py <../examples/omniisaacgym/ppo_cartpole.py>` .. literalinclude:: ../examples/omniisaacgym/ppo_cartpole.py @@ -697,7 +911,7 @@ The PPO agent configuration is mapped, as far as possible, from the rl_games' A2 :emphasize-lines: 13, 17 .. tab:: Cartpole (multi-threaded) - + :download:`ppo_cartpole_mt.py <../examples/omniisaacgym/ppo_cartpole_mt.py>` .. literalinclude:: ../examples/omniisaacgym/ppo_cartpole_mt.py @@ -705,7 +919,7 @@ The PPO agent configuration is mapped, as far as possible, from the rl_games' A2 :emphasize-lines: 1, 13-14, 54-55, 115, 119 .. tab:: Crazyflie - + :download:`ppo_crazy_flie.py <../examples/omniisaacgym/ppo_crazy_flie.py>` .. literalinclude:: ../examples/omniisaacgym/ppo_crazy_flie.py @@ -713,7 +927,7 @@ The PPO agent configuration is mapped, as far as possible, from the rl_games' A2 :emphasize-lines: 13, 17 .. tab:: FrankaCabinet - + :download:`ppo_franka_cabinet.py <../examples/omniisaacgym/ppo_franka_cabinet.py>` .. literalinclude:: ../examples/omniisaacgym/ppo_franka_cabinet.py @@ -721,15 +935,15 @@ The PPO agent configuration is mapped, as far as possible, from the rl_games' A2 :emphasize-lines: 8, 82-83 .. tab:: Humanoid - + :download:`ppo_humanoid.py <../examples/omniisaacgym/ppo_humanoid.py>` .. literalinclude:: ../examples/omniisaacgym/ppo_humanoid.py :language: python :emphasize-lines: 8, 82-83 - + .. tab:: Ingenuity - + :download:`ppo_ingenuity.py <../examples/omniisaacgym/ppo_ingenuity.py>` .. 
literalinclude:: ../examples/omniisaacgym/ppo_ingenuity.py @@ -737,7 +951,7 @@ The PPO agent configuration is mapped, as far as possible, from the rl_games' A2 :emphasize-lines: 93 .. tab:: Quadcopter - + :download:`ppo_quadcopter.py <../examples/omniisaacgym/ppo_quadcopter.py>` .. literalinclude:: ../examples/omniisaacgym/ppo_quadcopter.py @@ -745,7 +959,7 @@ The PPO agent configuration is mapped, as far as possible, from the rl_games' A2 :emphasize-lines: 93 .. tab:: ShadowHand - + :download:`ppo_shadow_hand.py <../examples/omniisaacgym/ppo_shadow_hand.py>` .. literalinclude:: ../examples/omniisaacgym/ppo_shadow_hand.py @@ -756,21 +970,21 @@ The PPO agent configuration is mapped, as far as possible, from the rl_games' A2
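As with Isaac Gym, the Omniverse Isaac Gym scripts above rely on a skrl loader before wrapping the environment. A minimal sketch (the task name is illustrative):

.. code-block:: python

    from skrl.envs.torch import wrap_env
    from skrl.envs.torch import load_omniverse_isaacgym_env

    # load the Omniverse Isaac Gym environment (its configuration is taken from the command line)
    env = load_omniverse_isaacgym_env(task_name="Cartpole")
    env = wrap_env(env)  # or wrap_env(env, wrapper="omniverse-isaacgym")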
-Learning in an Omniverse Isaac Sim environment ----------------------------------------------- +Omniverse Isaac Sim (single environment) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -These examples show how to train an agent in an Omniverse Isaac Sim environment that is implemented using the OpenAI Gym interface (**one agent, one environment**) +These examples show how to train an agent in an Omniverse Isaac Sim environment that is implemented using the Gym interface (**one agent, one environment**) .. tabs:: - .. tab:: Isaac Sim 2022.1.X (Cartpole) + .. tab:: Isaac Sim 2022.X.X (Cartpole) This example performs the training of an agent in the Isaac Sim's Cartpole environment described in the `Creating New RL Environment `_ tutorial Use the steps described below to setup and launch the experiment after follow the tutorial .. code-block:: bash - + # download the sample code from GitHub in the directory containing the cartpole_task.py script wget https://raw.githubusercontent.com/Toni-SM/skrl/main/docs/source/examples/isaacsim/cartpole_example_skrl.py @@ -787,7 +1001,7 @@ These examples show how to train an agent in an Omniverse Isaac Sim environment :language: python .. tab:: Isaac Sim 2021.2.1 (JetBot) - + This example performs the training of an agent in the Isaac Sim's JetBot environment. The following components or practices are exemplified (highlighted): - Define and instantiate Convolutional Neural Networks (CNN) to learn from 128 X 128 RGB images @@ -797,12 +1011,12 @@ These examples show how to train an agent in an Omniverse Isaac Sim environment .. tabs:: .. tab:: Local workstation (setup) - + .. code-block:: bash # create a working directory and change to it - mkdir ~/.local/share/ov/pkg/isaac_sim-2021.2.1/standalone_examples/api/omni.isaac.jetbot/skrl_example - cd ~/.local/share/ov/pkg/isaac_sim-2021.2.1/standalone_examples/api/omni.isaac.jetbot/skrl_example + mkdir ~/.local/share/ov/pkg/isaac_sim-2021.2.1/standalone_examples/api/omni.isaac.jetbot/skrl_example + cd ~/.local/share/ov/pkg/isaac_sim-2021.2.1/standalone_examples/api/omni.isaac.jetbot/skrl_example # install the skrl library in editable mode from the working directory ~/.local/share/ov/pkg/isaac_sim-2021.2.1/python.sh -m pip install -e git+https://github.com/Toni-SM/skrl.git#egg=skrl @@ -821,7 +1035,7 @@ These examples show how to train an agent in an Omniverse Isaac Sim environment .. code-block:: bash # create a working directory and change to it - mkdir /isaac-sim/standalone_examples/api/omni.isaac.jetbot/skrl_example + mkdir /isaac-sim/standalone_examples/api/omni.isaac.jetbot/skrl_example cd /isaac-sim/standalone_examples/api/omni.isaac.jetbot/skrl_example # install the skrl library in editable mode from the working directory @@ -835,7 +1049,7 @@ These examples show how to train an agent in an Omniverse Isaac Sim environment # run the experiment /isaac-sim/python.sh isaacsim_jetbot_ppo.py - + .. raw:: html
@@ -846,30 +1060,39 @@ These examples show how to train an agent in an Omniverse Isaac Sim environment :language: python :emphasize-lines: 24-39, 45, 53-68, 73 +.. raw:: html + +

+ Real-world examples ------------------- +.. contents:: + :depth: 2 + :local: + :backlinks: none + These examples show basic real-world use cases to guide and support advanced RL implementations .. tabs:: .. tab:: Franka Emika Panda - **3D reaching task (Franka's gripper must reach a certain target point in space)**. The training was done in Omniverse Isaac Gym. The real robot control is performed through the Python API of a modified version of frankx (see `frankx's pull request #44 `_), a high-level motion library around libfranka. Training and evaluation is performed for both Cartesian and joint control space + **3D reaching task (Franka's gripper must reach a certain target point in space)**. The training was done in Omniverse Isaac Gym. The real robot control is performed through the Python API of a modified version of *frankx* (see `frankx's pull request #44 `_), a high-level motion library around *libfranka*. Training and evaluation is performed for both Cartesian and joint control space .. raw:: html
- + **Implementation** (see details in the table below): * The observation space is composed of the episode's normalized progress, the robot joints' normalized positions (:math:`q`) in the interval -1 to 1, the robot joints' velocities (:math:`\dot{q}`) affected by a random uniform scale for generalization, and the target's position in space (:math:`target_{_{XYZ}}`) with respect to the robot's base - + * The action space, bounded in the range -1 to 1, consists of the following. For the joint control it's robot joints' position scaled change. For the Cartesian control it's the end-effector's position (:math:`ee_{_{XYZ}}`) scaled change. The end-effector position frame corresponds to the point where the left finger connects to the gripper base in simulation, whereas in the real world it corresponds to the end of the fingers. The gripper fingers remain closed all the time in both cases - + * The instantaneous reward is the negative value of the Euclidean distance (:math:`\text{d}`) between the robot end-effector and the target point position. The episode terminates when this distance is less than 0.035 meters in simulation (0.075 meters in real-world) or when the defined maximum timestep is reached - * The target position lies within a rectangular cuboid of dimensions 0.5 x 0.5 x 0.2 meters centered at 0.5, 0.0, 0.2 meters with respect to the robot's base. The robot joints' positions are drawn from an initial configuration [0º, -45º, 0º, -135º, 0º, 90º, 45º] modified with uniform random values between -7º and 7º approximately + * The target position lies within a rectangular cuboid of dimensions 0.5 x 0.5 x 0.2 meters centered at (0.5, 0.0, 0.2) meters with respect to the robot's base. The robot joints' positions are drawn from an initial configuration [0º, -45º, 0º, -135º, 0º, 90º, 45º] modified with uniform random values between -7º and 7º approximately .. list-table:: :header-rows: 1 @@ -878,7 +1101,7 @@ These examples show basic real-world use cases to guide and support advanced RL - Formula / value - Size * - Observation space - - :math:`\dfrac{t}{t_{max}},\; 2 \dfrac{q - q_{min}}{q_{max} - q_{min}} - 1,\; 0.1\,\dot{q}\,U(0.5,1.5),\; target_{_{XYZ}}` + - :math:`\dfrac{t}{t_{max}},\; 2 \dfrac{q - q_{min}}{q_{max} - q_{min}} - 1,\; 0.1\,\dot{q}\,U(0.5,1.5),\; target_{_{XYZ}}` - 18 * - Action space (joint) - :math:`\dfrac{2.5}{120} \, \Delta q` @@ -888,13 +1111,13 @@ These examples show basic real-world use cases to guide and support advanced RL - 3 * - Reward - :math:`-\text{d}(ee_{_{XYZ}},\; target_{_{XYZ}})` - - + - * - Episode termination - - :math:`\text{d}(ee_{_{XYZ}},\; target_{_{XYZ}}) \le 0.035 \quad` or :math:`\quad t \ge t_{max} - 1` - - + - :math:`\text{d}(ee_{_{XYZ}},\; target_{_{XYZ}}) \le 0.035 \quad` or :math:`\quad t \ge t_{max} - 1` + - * - Maximum timesteps (:math:`t_{max}`) - 100 - - + - .. raw:: html @@ -926,7 +1149,7 @@ These examples show basic real-world use cases to guide and support advanced RL **Prerequisites:** - A physical Franka robot with `Franka Control Interface (FCI) `_ is required. Additionally, the frankx library must be available in the python environment (see `frankx's pull request #44 `_ for the RL-compatible version installation) + A physical Franka Emika Panda robot with `Franka Control Interface (FCI) `_ is required. 
Additionally, the *frankx* library must be available in the python environment (see `frankx's pull request #44 `_ for the RL-compatible version installation) **Files** @@ -959,7 +1182,7 @@ These examples show basic real-world use cases to guide and support advanced RL .. raw:: html .. raw:: html @@ -1053,27 +1276,238 @@ These examples show basic real-world use cases to guide and support advanced RL TASK_CFG["env"]["controlSpace"] = "joint" # "joint" or "cartesian" + .. tab:: Kuka LBR iiwa + + **3D reaching task (iiwa's end-effector must reach a certain target point in space)**. The training was done in Omniverse Isaac Gym. The real robot control is performed through the Python, ROS and ROS2 APIs of `libiiwa `_, a scalable multi-control framework for the KUKA LBR Iiwa robots. Training and evaluation is performed for both Cartesian and joint control space + + .. raw:: html + +
+ + **Implementation** (see details in the table below): + + * The observation space is composed of the episode's normalized progress, the robot joints' normalized positions (:math:`q`) in the interval -1 to 1, the robot joints' velocities (:math:`\dot{q}`) affected by a random uniform scale for generalization, and the target's position in space (:math:`target_{_{XYZ}}`) with respect to the robot's base + + * The action space, bounded in the range -1 to 1, consists of the following. For the joint control it's robot joints' position scaled change. For the Cartesian control it's the end-effector's position (:math:`ee_{_{XYZ}}`) scaled change + + * The instantaneous reward is the negative value of the Euclidean distance (:math:`\text{d}`) between the robot end-effector and the target point position. The episode terminates when this distance is less than 0.035 meters in simulation (0.075 meters in real-world) or when the defined maximum timestep is reached + + * The target position lies within a rectangular cuboid of dimensions 0.2 x 0.4 x 0.4 meters centered at (0.6, 0.0, 0.4) meters with respect to the robot's base. The robot joints' positions are drawn from an initial configuration [0º, 0º, 0º, -90º, 0º, 90º, 0º] modified with uniform random values between -7º and 7º approximately + + .. list-table:: + :header-rows: 1 + + * - Variable + - Formula / value + - Size + * - Observation space + - :math:`\dfrac{t}{t_{max}},\; 2 \dfrac{q - q_{min}}{q_{max} - q_{min}} - 1,\; 0.1\,\dot{q}\,U(0.5,1.5),\; target_{_{XYZ}}` + - 18 + * - Action space (joint) + - :math:`\dfrac{2.5}{120} \, \Delta q` + - 7 + * - Action space (Cartesian) + - :math:`\dfrac{1}{100} \, \Delta ee_{_{XYZ}}` + - 3 + * - Reward + - :math:`-\text{d}(ee_{_{XYZ}},\; target_{_{XYZ}})` + - + * - Episode termination + - :math:`\text{d}(ee_{_{XYZ}},\; target_{_{XYZ}}) \le 0.035 \quad` or :math:`\quad t \ge t_{max} - 1` + - + * - Maximum timesteps (:math:`t_{max}`) + - 100 + - + + .. raw:: html + +
+ + **Workflows** + + .. tabs:: + + .. tab:: Real-world + + .. warning:: + + Make sure you have the smartHMI on hand in case something goes wrong in the run. **Control via RL can be dangerous and unsafe for both the operator and the robot** + + .. raw:: html + + + + **Prerequisites:** + + A physical Kuka LBR iiwa robot is required. Additionally, the *libiiwa* library must be installed (visit the `libiiwa `_ documentation for installation details) + + **Files** + + * Environment: :download:`reaching_iiwa_real_env.py <../examples/real_world/kuka_lbr_iiwa/reaching_iiwa_real_env.py>` + * Evaluation script: :download:`reaching_iiwa_real_skrl_eval.py <../examples/real_world/kuka_lbr_iiwa/reaching_iiwa_real_skrl_eval.py>` + * Checkpoints (:literal:`agent_joint.pt`, :literal:`agent_cartesian.pt`): :download:`trained_checkpoints.zip ` + + **Evaluation:** + + .. code-block:: bash + + python3 reaching_iiwa_real_skrl_eval.py + + **Main environment configuration:** + + The control space (Cartesian or joint) can be specified in the environment class constructor (from :literal:`reaching_iiwa_real_skrl_eval.py`) as follow: + + .. code-block:: python + + control_space = "joint" # joint or cartesian + + .. tab:: Real-world (ROS/ROS2) + + .. warning:: + + Make sure you have the smartHMI on hand in case something goes wrong in the run. **Control via RL can be dangerous and unsafe for both the operator and the robot** + + .. raw:: html + + + + **Prerequisites:** + + A physical Kuka LBR iiwa robot is required. Additionally, the *libiiwa* library must be installed (visit the `libiiwa `_ documentation for installation details) and a Robot Operating System (ROS or ROS2) distribution must be available + + **Files** + + * Environment (ROS): :download:`reaching_iiwa_real_ros_env.py <../examples/real_world/kuka_lbr_iiwa/reaching_iiwa_real_ros_env.py>` + * Environment (ROS2): :download:`reaching_iiwa_real_ros2_env.py <../examples/real_world/kuka_lbr_iiwa/reaching_iiwa_real_ros2_env.py>` + * Evaluation script: :download:`reaching_iiwa_real_ros_ros2_skrl_eval.py <../examples/real_world/kuka_lbr_iiwa/reaching_iiwa_real_ros_ros2_skrl_eval.py>` + * Checkpoints (:literal:`agent_joint.pt`, :literal:`agent_cartesian.pt`): :download:`trained_checkpoints.zip ` + + .. note:: + + Source the ROS/ROS2 distribution and the ROS/ROS workspace containing the libiiwa packages before executing the scripts + + **Evaluation:** + + .. note:: + + The environment (:literal:`reaching_iiwa_real_ros_env.py` or :literal:`reaching_iiwa_real_ros2_env.py`) to be loaded will be automatically selected based on the sourced ROS distribution (ROS or ROS2) at script execution + + .. code-block:: bash + + python3 reaching_iiwa_real_ros_ros2_skrl_eval.py + + **Main environment configuration:** + + The control space (Cartesian or joint) can be specified in the environment class constructor (from :literal:`reaching_iiwa_real_ros_ros2_skrl_eval.py`) as follow: + + .. code-block:: python + + control_space = "joint" # joint or cartesian + + .. tab:: Simulation (Omniverse Isaac Gym) + + .. raw:: html + + + + .. raw:: html + + + + | + + **Prerequisites:** + + All installation steps described in Omniverse Isaac Gym's `Overview & Getting Started `_ section must be fulfilled (especially the subsection 1.3. 
Installing Examples Repository) + + **Files** (the implementation is self-contained so no specific location is required): + + * Environment: :download:`reaching_iiwa_omniverse_isaacgym_env.py <../examples/real_world/kuka_lbr_iiwa/reaching_iiwa_omniverse_isaacgym_env.py>` + * Training script: :download:`reaching_iiwa_omniverse_isaacgym_skrl_train.py <../examples/real_world/kuka_lbr_iiwa/reaching_iiwa_omniverse_isaacgym_skrl_train.py>` + * Evaluation script: :download:`reaching_iiwa_omniverse_isaacgym_skrl_eval.py <../examples/real_world/kuka_lbr_iiwa/reaching_iiwa_omniverse_isaacgym_skrl_eval.py>` + * Checkpoints (:literal:`agent_joint.pt`, :literal:`agent_cartesian.pt`): :download:`trained_checkpoints.zip ` + * Simulation files: (.usd assets and robot class): :download:`simulation_files.zip ` + + + Simulation files must be structured as follows: + + .. code-block:: + + + ├── agent_cartesian.pt + ├── agent_joint.pt + ├── assets + │ ├── iiwa14_instanceable_meshes.usd + │ └── iiwa14.usd + ├── reaching_iiwa_omniverse_isaacgym_env.py + ├── reaching_iiwa_omniverse_isaacgym_skrl_eval.py + ├── reaching_iiwa_omniverse_isaacgym_skrl_train.py + ├── robots + │ ├── iiwa14.py + │ └── __init__.py + + **Training and evaluation:** + + .. code-block:: bash + + # training (local workstation) + ~/.local/share/ov/pkg/isaac_sim-*/python.sh reaching_iiwa_omniverse_isaacgym_skrl_train.py + + # training (docker container) + /isaac-sim/python.sh reaching_iiwa_omniverse_isaacgym_skrl_train.py + + .. code-block:: bash + + # evaluation (local workstation) + ~/.local/share/ov/pkg/isaac_sim-*/python.sh reaching_iiwa_omniverse_isaacgym_skrl_eval.py + + # evaluation (docker container) + /isaac-sim/python.sh reaching_iiwa_omniverse_isaacgym_skrl_eval.py + + **Main environment configuration:** + + The control space (Cartesian or joint) can be specified in the task configuration dictionary (from :literal:`reaching_iiwa_omniverse_isaacgym_skrl_train.py`) as follow: + + .. code-block:: python + + TASK_CFG["task"]["env"]["controlSpace"] = "joint" # "joint" or "cartesian" + +.. raw:: html + +
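Across the real-world workflows above, the evaluation scripts share the same pattern: build the environment and the agent, load one of the provided checkpoints, and run the trainer in evaluation mode. A minimal sketch (the checkpoint name and number of timesteps are illustrative, and :literal:`env` and :literal:`agent` are assumed to be created as in the corresponding script):

.. code-block:: python

    from skrl.trainers.torch import SequentialTrainer

    # load one of the provided checkpoints (joint or Cartesian control space)
    agent.load("./agent_cartesian.pt")

    # run the evaluation
    cfg_trainer = {"timesteps": 1000, "headless": True}
    trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent)
    trainer.eval()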

+ .. _library_utilities: Library utilities (skrl.utils module) ------------------------------------- +.. contents:: + :depth: 2 + :local: + :backlinks: none + This example shows how to use the library utilities to carry out the post-processing of files and data generated by the experiments .. tabs:: - + .. tab:: Tensorboard files - + .. image:: ../_static/imgs/utils_tensorboard_file_iterator.svg :width: 100% :alt: Tensorboard file iterator - + .. raw:: html

Example of a figure, generated by the code, showing the total reward (left) and the mean and standard deviation (right) of all experiments located in the runs folder - + :download:`tensorboard_file_iterator.py <../examples/utils/tensorboard_file_iterator.py>` **Note:** The code will load all the Tensorboard files of the experiments located in the :literal:`runs` folder. It is necessary to adjust the iterator's parameters for other paths diff --git a/docs/source/intro/getting_started.rst b/docs/source/intro/getting_started.rst index 112a80ef..4849f068 100644 --- a/docs/source/intro/getting_started.rst +++ b/docs/source/intro/getting_started.rst @@ -1,6 +1,11 @@ Getting Started =============== +In this section, you will learn how to use the various components of the **skrl** library to create reinforcement learning tasks. Whether you are a beginner or an experienced researcher, we hope this section will provide you with a solid foundation to build upon. We recommend visiting the :ref:`Examples ` to see how the components can be integrated and applied in practice. Let's get started! + +Reinforcement Learning schema +----------------------------- + **Reinforcement Learning (RL)** is a Machine Learning sub-field for decision making that allows an agent to learn from its interaction with the environment as shown in the following schema: .. image:: ../_static/imgs/rl_schema.svg @@ -14,14 +19,14 @@ Getting Started At each step (also called timestep) of interaction with the environment, the agent sees an observation :math:`o_t` of the complete description of the state :math:`s_t \in S` of the environment. Then, it decides which action :math:`a_t \in A` to take from the action space using a policy. The environment, which changes in response to the agent's action (or by itself), returns a reward signal :math:`r_t = R(s_t, a_t, s_{t+1})` as a measure of how good or bad the action was that moved it to its new state :math:`s_{t+1}`. The agent aims to maximize the cumulative reward (discounted or not by a factor :math:`\gamma \in (0,1]`) by adjusting the policy's behaviour via some optimization algorithm. -**From this schema, this section is intended to guide in the creation of a RL system using skrl**. Visit the :ref:`Examples ` section for training and evaluation demonstrations with different environment interfaces and highlighted practices, among others. +**From this schema, this section is intended to guide in the creation of a RL system using skrl** 1. Environments --------------- -The environment plays a fundamental role in the definition of the RL schema. For example, the selection of the agent depends strongly on the observation and action space nature. There are several interfaces to interact with the environments such as OpenAI Gym or DeepMind. However, each of them has a different API and work with non-compatible data types. +The environment plays a fundamental role in the definition of the RL schema. For example, the selection of the agent depends strongly on the observation and action space nature. There are several interfaces to interact with the environments such as OpenAI Gym / Farama Gymnasium or DeepMind. However, each of them has a different API and work with non-compatible data types. -skrl offers a function to **wrap environments** based on the OpenAI Gym, DeepMind, Isaac Gym and Omniverse Isaac Gym interfaces (the last two have slight differences with OpenAI Gym) and offer, for library components, a common interface (based on OpenAI Gym) as shown in the following figure. 
Refer to the :doc:`Wrapping <../modules/skrl.envs.wrapping>` section for more information. +skrl offers a function to **wrap environments** based on the Gym/Gymnasium, DeepMind, Isaac Gym and Omniverse Isaac Gym interfaces (the last two have slight differences with Gym) and offer, for library components, a common interface (based on Gym/Gymnasium) as shown in the following figure. Refer to the :doc:`Wrapping <../modules/skrl.envs.wrapping>` section for more information. .. image:: ../_static/imgs/wrapping.svg :width: 100% @@ -69,7 +74,7 @@ Within the methods and properties defined in the wrapped environment, the observ .. tabs:: .. tab:: Preview 4 (isaacgymenvs.make) - + .. code-block:: python import isaacgymenvs @@ -78,9 +83,9 @@ Within the methods and properties defined in the wrapped environment, the observ from skrl.envs.torch import wrap_env # create/load the environment using the easy-to-use API from NVIDIA - env = isaacgymenvs.make(seed=0, - task="Cartpole", - num_envs=512, + env = isaacgymenvs.make(seed=0, + task="Cartpole", + num_envs=512, sim_device="cuda:0", rl_device="cuda:0", graphics_device_id=0, @@ -90,7 +95,7 @@ Within the methods and properties defined in the wrapped environment, the observ env = wrap_env(env) # or 'env = wrap_env(env, wrapper="isaacgym-preview4")' .. tab:: Preview 4 - + .. code-block:: python # import the environment wrapper and loader @@ -104,7 +109,7 @@ Within the methods and properties defined in the wrapped environment, the observ env = wrap_env(env) # or 'env = wrap_env(env, wrapper="isaacgym-preview4")' .. tab:: Preview 3 - + .. code-block:: python # import the environment wrapper and loader @@ -118,7 +123,7 @@ Within the methods and properties defined in the wrapped environment, the observ env = wrap_env(env) # or 'env = wrap_env(env, wrapper="isaacgym-preview3")' .. tab:: Preview 2 - + .. code-block:: python # import the environment wrapper and loader @@ -131,42 +136,80 @@ Within the methods and properties defined in the wrapped environment, the observ # wrap the environment env = wrap_env(env) # or 'env = wrap_env(env, wrapper="isaacgym-preview2")' - .. tab:: OpenAI Gym - + .. tab:: Gym / Gymnasium + .. tabs:: - .. tab:: Single environment + .. tab:: Gym - .. code-block:: python + .. tabs:: - # import the environment wrapper and gym - from skrl.envs.torch import wrap_env - import gym + .. tab:: Single environment - # load environment - env = gym.make('Pendulum-v1') + .. code-block:: python - # wrap the environment - env = wrap_env(env) # or 'env = wrap_env(env, wrapper="gym")' + # import the environment wrapper and gym + from skrl.envs.torch import wrap_env + import gym - .. tab:: Vectorized environment + # load environment + env = gym.make('Pendulum-v1') - Visit the OpenAI Gym documentation (`Vector API `_) for more information about the creation and usage of vectorized environments. + # wrap the environment + env = wrap_env(env) # or 'env = wrap_env(env, wrapper="gym")' - .. code-block:: python + .. tab:: Vectorized environment - # import the environment wrapper and gym - from skrl.envs.torch import wrap_env - import gym + Visit the OpenAI Gym documentation (`Vector `__) for more information about the creation and usage of vectorized environments. - # load a vectorized environment - env = gym.vector.make("Pendulum-v1", num_envs=10, asynchronous=False) + .. 
code-block:: python - # wrap the environment - env = wrap_env(env) # or 'env = wrap_env(env, wrapper="gym")' + # import the environment wrapper and gym + from skrl.envs.torch import wrap_env + import gym + + # load a vectorized environment + env = gym.vector.make("Pendulum-v1", num_envs=10, asynchronous=False) + + # wrap the environment + env = wrap_env(env) # or 'env = wrap_env(env, wrapper="gym")' + + .. tab:: Gymnasium + + .. tabs:: + + .. tab:: Single environment + + .. code-block:: python + + # import the environment wrapper and gymnasium + from skrl.envs.torch import wrap_env + import gymnasium as gym + + # load environment + env = gym.make('Pendulum-v1') + + # wrap the environment + env = wrap_env(env) # or 'env = wrap_env(env, wrapper="gymnasium")' + + .. tab:: Vectorized environment + + Visit the Gymnasium documentation (`Vector `__) for more information about the creation and usage of vectorized environments. + + .. code-block:: python + + # import the environment wrapper and gymnasium + from skrl.envs.torch import wrap_env + import gymnasium as gym + + # load a vectorized environment + env = gym.vector.make("Pendulum-v1", num_envs=10, asynchronous=False) + + # wrap the environment + env = wrap_env(env) # or 'env = wrap_env(env, wrapper="gymnasium")' .. tab:: DeepMind - + .. code-block:: python # import the environment wrapper and the deepmind suite @@ -243,8 +286,8 @@ The following code snippets show how to define a model, based on the concept of nn.ELU(), nn.Linear(32, self.num_actions)) - def compute(self, states, taken_actions, role): - return self.net(states) + def compute(self, inputs, role): + return self.net(inputs["states"]), {} .. tab:: Gaussian @@ -262,23 +305,23 @@ The following code snippets show how to define a model, based on the concept of import torch import torch.nn as nn from skrl.models.torch import Model, GaussianMixin - + # define the model class Policy(GaussianMixin, Model): - def __init__(self, observation_space, action_space, device="cuda:0", + def __init__(self, observation_space, action_space, device="cuda:0", clip_actions=False, clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"): Model.__init__(self, observation_space, action_space, device) GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction) - + self.net = nn.Sequential(nn.Linear(self.num_observations, 32), nn.ELU(), nn.Linear(32, 32), nn.ELU(), nn.Linear(32, self.num_actions)) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - - def compute(self, states, taken_actions, role): - return self.net(states), self.log_std_parameter + + def compute(self, inputs, role): + return self.net(inputs["states"]), self.log_std_parameter, {} .. 
tab:: Multivariate Gaussian @@ -297,22 +340,22 @@ The following code snippets show how to define a model, based on the concept of import torch.nn as nn from skrl.models.torch import Model, MultivariateGaussianMixin - # define the model + # define the model class Policy(MultivariateGaussianMixin, Model): - def __init__(self, observation_space, action_space, device="cuda:0", + def __init__(self, observation_space, action_space, device="cuda:0", clip_actions=False, clip_log_std=True, min_log_std=-20, max_log_std=2): Model.__init__(self, observation_space, action_space, device) MultivariateGaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) - + self.net = nn.Sequential(nn.Linear(self.num_observations, 32), nn.ELU(), nn.Linear(32, 32), nn.ELU(), nn.Linear(32, self.num_actions)) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - - def compute(self, states, taken_actions, role): - return self.net(states), self.log_std_parameter + + def compute(self, inputs, role): + return self.net(inputs["states"]), self.log_std_parameter, {} .. tab:: Deterministic @@ -330,21 +373,21 @@ The following code snippets show how to define a model, based on the concept of import torch import torch.nn as nn from skrl.models.torch import Model, DeterministicMixin - + # define the model class Policy(DeterministicMixin, Model): def __init__(self, observation_space, action_space, device="cuda:0", clip_actions=False): Model.__init__(self, observation_space, action_space, device) DeterministicMixin.__init__(self, clip_actions) - + self.net = nn.Sequential(nn.Linear(self.num_observations, 32), nn.ELU(), nn.Linear(32, 32), nn.ELU(), nn.Linear(32, self.num_actions)) - - def compute(self, states, taken_actions, role): - return self.net(states) + + def compute(self, inputs, role): + return self.net(inputs["states"]), {} .. tab:: Tabular @@ -359,12 +402,13 @@ The following code snippets show how to define a model, based on the concept of Model.__init__(self, observation_space, action_space, device) TabularMixin.__init__(self, num_envs) - self.table = torch.ones((num_envs, self.num_observations, self.num_actions), + self.table = torch.ones((num_envs, self.num_observations, self.num_actions), dtype=torch.float32, device=self.device) - def compute(self, states, taken_actions, role): - actions = torch.argmax(self.table[torch.arange(self.num_envs).view(-1, 1), states], + def compute(self, inputs, role): + actions = torch.argmax(self.table[torch.arange(self.num_envs).view(-1, 1), inputs["states"]], dim=-1, keepdim=True).view(-1,1) + return actions, {} Models must be collected in a dictionary and passed to the agent constructor during its instantiation under the argument :literal:`models`. The dictionary keys are specific to each agent. Visit their respective documentation for more details (under *Spaces and models* section). For example, the PPO agent requires the policy and value models as shown below: @@ -379,7 +423,7 @@ Models can be saved and loaded to and from the file system. However, the recomme 4. Noises --------- -Noise plays a fundamental role in the exploration stage, especially in agents of a deterministic nature, such as DDPG or TD3, for example. +Noise plays a fundamental role in the exploration stage, especially in agents of a deterministic nature, such as DDPG or TD3, for example. skrl provides, as part of its resources, **classes for instantiating noises** as shown in the following code snippets. Refer to :ref:`Noises ` documentation for more information. 
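For instance, the noise classes can be instantiated as follows (a minimal sketch; the parameter values are illustrative):

.. code-block:: python

    from skrl.resources.noises.torch import GaussianNoise, OrnsteinUhlenbeckNoise

    # additive Gaussian exploration noise
    gaussian_noise = GaussianNoise(mean=0, std=0.2, device="cuda:0")

    # Ornstein-Uhlenbeck (temporally correlated) exploration noise, typical for DDPG/TD3
    ou_noise = OrnsteinUhlenbeckNoise(theta=0.15, sigma=0.2, base_scale=1.0, device="cuda:0")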
diff --git a/docs/source/intro/installation.rst b/docs/source/intro/installation.rst index cd5e6bdb..f7fbbd7c 100644 --- a/docs/source/intro/installation.rst +++ b/docs/source/intro/installation.rst @@ -4,13 +4,13 @@ Installation .. raw:: html
- + Prerequisites ------------- **skrl** requires Python 3.6 or higher and the following libraries (they will be installed automatically): - * `gym `_ + * `gym `_ / `gymnasium `_ * `tqdm `_ * `packaging `_ * `torch `_ 1.8.0 or higher @@ -29,7 +29,7 @@ Python Package Index (PyPI) To install **skrl** with pip, execute: .. code-block:: bash - + pip install skrl GitHub repository @@ -38,20 +38,20 @@ GitHub repository Clone or download the library from its GitHub repository (https://github.com/Toni-SM/skrl) .. code-block:: bash - + git clone https://github.com/Toni-SM/skrl.git cd skrl * **Install in editable/development mode** (links the package to its original location allowing any modifications to be reflected directly in its Python environment) .. code-block:: bash - + pip install -e . * **Install in the current Python site-packages directory** (modifications to the code downloaded from GitHub will not be reflected in your Python environment) .. code-block:: bash - + pip install . .. raw:: html @@ -77,7 +77,7 @@ Known issues See PyTorch issue `#80831 `_ .. code-block:: text - + AttributeError: 'Adam' object has no attribute '_warned_capturable_if_run_uncaptured' Changelog diff --git a/docs/source/modules/skrl.agents.a2c.rst b/docs/source/modules/skrl.agents.a2c.rst index bcbf2a0e..5dba97d5 100644 --- a/docs/source/modules/skrl.agents.a2c.rst +++ b/docs/source/modules/skrl.agents.a2c.rst @@ -62,7 +62,7 @@ Algorithm implementation | :green:`# optimization step` | reset :math:`\text{optimizer}_{\theta, \phi}` | :math:`\nabla_{\theta, \, \phi} (L_{\pi_\theta} + {L}_{entropy} + L_{V_\phi})` -| :math:`\text{clip}(\lVert \nabla_{\theta, \, \phi} \rVert)` with :guilabel:`grad_norm_clip` +| :math:`\text{clip}(\lVert \nabla_{\theta, \, \phi} \rVert)` with :guilabel:`grad_norm_clip` | step :math:`\text{optimizer}_{\theta, \phi}` | :green:`# update learning rate` | **IF** there is a :guilabel:`learning_rate_scheduler` **THEN** @@ -76,18 +76,18 @@ Configuration and hyperparameters .. literalinclude:: ../../../skrl/agents/torch/a2c/a2c.py :language: python - :lines: 17-50 + :lines: 17-53 :linenos: Spaces and models ^^^^^^^^^^^^^^^^^ -The implementation supports the following `Gym spaces `_ +The implementation supports the following `Gym spaces `_ / `Gymnasium spaces `_ .. list-table:: :header-rows: 1 - * - Gym spaces + * - Gym/Gymnasium spaces - .. centered:: Observation - .. centered:: Action * - Discrete @@ -124,6 +124,18 @@ The implementation uses 1 stochastic (discrete or continuous) and 1 deterministi - 1 - :ref:`Deterministic ` +Support for advanced features is described in the next table + +.. list-table:: + :header-rows: 1 + + * - Feature + - Support and remarks + * - Shared model + - for Policy and Value + * - RNN support + - RNN, LSTM, GRU and any other variant + API ^^^ @@ -132,5 +144,5 @@ API :show-inheritance: :private-members: _update :members: - + .. 
automethod:: __init__ diff --git a/docs/source/modules/skrl.agents.amp.rst b/docs/source/modules/skrl.agents.amp.rst index 2c337b77..66a11244 100644 --- a/docs/source/modules/skrl.agents.amp.rst +++ b/docs/source/modules/skrl.agents.amp.rst @@ -71,7 +71,7 @@ Algorithm implementation | :math:`{logit}_{_{AMP}}^{^B} \leftarrow D_\psi(s_{_{AMP}}^{^B}) \qquad` with :math:`s_{_{AMP}}^{^B}` of size :guilabel:`discriminator_batch_size` | :math:`{logit}_{_{AMP}}^{^M} \leftarrow D_\psi(s_{_{AMP}}^{^M}) \qquad` with :math:`s_{_{AMP}}^{^M}` of size :guilabel:`discriminator_batch_size` | :green:`# discriminator prediction loss` -| :math:`L_{D_\psi} \leftarrow \dfrac{1}{2}(BCE({logit}_{_{AMP}}` ++ :math:`{logit}_{_{AMP}}^{^B}, \, 0) + BCE({logit}_{_{AMP}}^{^M}, \, 1))` +| :math:`L_{D_\psi} \leftarrow \dfrac{1}{2}(BCE({logit}_{_{AMP}}` ++ :math:`{logit}_{_{AMP}}^{^B}, \, 0) + BCE({logit}_{_{AMP}}^{^M}, \, 1))` | with :math:`\; BCE(x,y)=-\frac{1}{N} \sum_{i=1}^N [y \; log(\hat{y}) + (1-y) \, log(1-\hat{y})] \;` and :math:`\; \hat{y} = \dfrac{1}{1 + e^{-x}}` | :green:`# discriminator logit regularization` | :math:`L_{D_\psi} \leftarrow L_{D_\psi} +` :guilabel:`discriminator_logit_regularization_scale` :math:`\sum_{i=1}^N \text{flatten}(\psi_w[-1])^2` @@ -82,7 +82,7 @@ Algorithm implementation | :green:`# optimization step` | reset :math:`\text{optimizer}_{\theta, \phi, \psi}` | :math:`\nabla_{\theta, \, \phi, \, \psi} (L^{clip}_{\pi_\theta} + {L}_{entropy} + L_{V_\phi} + L_{D_\psi})` -| :math:`\text{clip}(\lVert \nabla_{\theta, \, \phi, \, \psi} \rVert)` with :guilabel:`grad_norm_clip` +| :math:`\text{clip}(\lVert \nabla_{\theta, \, \phi, \, \psi} \rVert)` with :guilabel:`grad_norm_clip` | step :math:`\text{optimizer}_{\theta, \phi, \psi}` | :green:`# update learning rate` | **IF** there is a :guilabel:`learning_rate_scheduler` **THEN** @@ -97,18 +97,18 @@ Configuration and hyperparameters .. literalinclude:: ../../../skrl/agents/torch/amp/amp.py :language: python - :lines: 18-68 + :lines: 18-71 :linenos: Spaces and models ^^^^^^^^^^^^^^^^^ -The implementation supports the following `Gym spaces `_ +The implementation supports the following `Gym spaces `_ / `Gymnasium spaces `_ .. list-table:: :header-rows: 1 - * - Gym spaces + * - Gym/Gymnasium spaces - .. centered:: AMP observation - .. centered:: Observation - .. centered:: Action @@ -155,6 +155,18 @@ The implementation uses 1 stochastic (continuous) and 2 deterministic function a - 1 - :ref:`Deterministic ` +Support for advanced features is described in the next table + +.. list-table:: + :header-rows: 1 + + * - Feature + - Support and remarks + * - Shared model + - \- + * - RNN support + - \- + API ^^^ @@ -163,5 +175,5 @@ API :show-inheritance: :private-members: _update :members: - + .. automethod:: __init__ diff --git a/docs/source/modules/skrl.agents.base_class.rst b/docs/source/modules/skrl.agents.base_class.rst index 4fea0caf..32764bbe 100644 --- a/docs/source/modules/skrl.agents.base_class.rst +++ b/docs/source/modules/skrl.agents.base_class.rst @@ -11,7 +11,7 @@ Basic inheritance usage ^^^^^^^^^^^^^^^^^^^^^^^ .. tabs:: - + .. tab:: Inheritance .. literalinclude:: ../snippets/agent.py @@ -26,6 +26,6 @@ API :inherited-members: :private-members: _update, _empty_preprocessor, _get_internal_value :members: - + .. automethod:: __init__ .. 
automethod:: __str__ diff --git a/docs/source/modules/skrl.agents.cem.rst b/docs/source/modules/skrl.agents.cem.rst index 0a774635..948ea1c5 100644 --- a/docs/source/modules/skrl.agents.cem.rst +++ b/docs/source/modules/skrl.agents.cem.rst @@ -4,6 +4,11 @@ Cross-Entropy Method (CEM) Algorithm implementation ^^^^^^^^^^^^^^^^^^^^^^^^ +| Main notation/symbols: +| - policy function approximator (:math:`\pi_\theta`) +| - states (:math:`s`), actions (:math:`a`), rewards (:math:`r`), next states (:math:`s'`), dones (:math:`d`) +| - loss (:math:`L`) + **Decision making** (:literal:`act(...)`) | :math:`a \leftarrow \pi_\theta(s)` @@ -13,17 +18,22 @@ Algorithm implementation | :green:`# sample all memory` | :math:`s, a, r, s', d \leftarrow` states, actions, rewards, next_states, dones | :green:`# compute discounted return threshold` -| :math:`[G] \leftarrow \sum_{t=0}^{E-1} \gamma^{t} r_t` for each episode -| :math:`G_{_{bound}} \leftarrow q_{th_{percentile}}([G])` +| :math:`[G] \leftarrow \sum_{t=0}^{E-1}` :guilabel:`discount_factor`:math:`^{t} \, r_t` for each episode +| :math:`G_{_{bound}} \leftarrow q_{th_{quantile}}([G])` at the given :guilabel:`percentile` | :green:`# get elite states and actions` | :math:`s_{_{elite}} \leftarrow s[G \geq G_{_{bound}}]` | :math:`a_{_{elite}} \leftarrow a[G \geq G_{_{bound}}]` | :green:`# compute scores for the elite states` | :math:`scores \leftarrow \theta(s_{_{elite}})` | :green:`# compute policy loss` -| :math:`{Loss}_{policy} \leftarrow -\sum_{i=1}^{N} a_{_{elite}} \log(scores)` -| :green:`# optimize policy` -| :math:`\nabla_{\theta} {Loss}_{policy}` +| :math:`L_{\pi_\theta} \leftarrow -\sum_{i=1}^{N} a_{_{elite}} \log(scores)` +| :green:`# optimization step` +| reset :math:`\text{optimizer}_\theta` +| :math:`\nabla_{\theta} L_{\pi_\theta}` +| step :math:`\text{optimizer}_\theta` +| :green:`# update learning rate` +| **IF** there is a :guilabel:`learning_rate_scheduler` **THEN** +| step :math:`\text{scheduler}_\theta (\text{optimizer}_\theta)` Configuration and hyperparameters ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -32,18 +42,18 @@ Configuration and hyperparameters .. literalinclude:: ../../../skrl/agents/torch/cem/cem.py :language: python - :lines: 15-41 + :lines: 15-44 :linenos: Spaces and models ^^^^^^^^^^^^^^^^^ -The implementation supports the following `Gym spaces `_ +The implementation supports the following `Gym spaces `_ / `Gymnasium spaces `_ .. list-table:: :header-rows: 1 - * - Gym spaces + * - Gym/Gymnasium spaces - .. centered:: Observation - .. centered:: Action * - Discrete @@ -74,6 +84,16 @@ The implementation uses 1 discrete function approximator. This function approxim - action - :ref:`Categorical ` +Support for advanced features is described in the next table + +.. list-table:: + :header-rows: 1 + + * - Feature + - Support and remarks + * - RNN support + - \- + API ^^^ @@ -82,5 +102,5 @@ API :show-inheritance: :private-members: _update :members: - + .. 
automethod:: __init__ diff --git a/docs/source/modules/skrl.agents.ddpg.rst b/docs/source/modules/skrl.agents.ddpg.rst index f7547713..7c5da694 100644 --- a/docs/source/modules/skrl.agents.ddpg.rst +++ b/docs/source/modules/skrl.agents.ddpg.rst @@ -36,6 +36,7 @@ Algorithm implementation | :green:`# optimization step (critic)` | reset :math:`\text{optimizer}_\phi` | :math:`\nabla_{\phi} L_{Q_\phi}` +| :math:`\text{clip}(\lVert \nabla_{\phi} \rVert)` with :guilabel:`grad_norm_clip` | step :math:`\text{optimizer}_\phi` | :green:`# compute policy (actor) loss` | :math:`a \leftarrow \mu_\theta(s)` @@ -44,6 +45,7 @@ Algorithm implementation | :green:`# optimization step (policy)` | reset :math:`\text{optimizer}_\theta` | :math:`\nabla_{\theta} L_{\mu_\theta}` +| :math:`\text{clip}(\lVert \nabla_{\theta} \rVert)` with :guilabel:`grad_norm_clip` | step :math:`\text{optimizer}_\theta` | :green:`# update target networks` | :math:`\theta_{target} \leftarrow` :guilabel:`polyak` :math:`\theta + (1 \;-` :guilabel:`polyak` :math:`) \theta_{target}` @@ -60,18 +62,18 @@ Configuration and hyperparameters .. literalinclude:: ../../../skrl/agents/torch/ddpg/ddpg.py :language: python - :lines: 15-50 + :lines: 16-56 :linenos: Spaces and models ^^^^^^^^^^^^^^^^^ -The implementation supports the following `Gym spaces `_ +The implementation supports the following `Gym spaces `_ / `Gymnasium spaces `_ .. list-table:: :header-rows: 1 - * - Gym spaces + * - Gym/Gymnasium spaces - .. centered:: Observation - .. centered:: Action * - Discrete @@ -120,6 +122,18 @@ The implementation uses 4 deterministic function approximators. These function a - 1 - :ref:`Deterministic ` +Support for advanced features is described in the next table + +.. list-table:: + :header-rows: 1 + + * - Feature + - Support and remarks + * - Shared model + - \- + * - RNN support + - RNN, LSTM, GRU and any other variant + API ^^^ @@ -128,5 +142,5 @@ API :show-inheritance: :private-members: _update :members: - + .. 
automethod:: __init__ diff --git a/docs/source/modules/skrl.agents.ddqn.rst b/docs/source/modules/skrl.agents.ddqn.rst index 4af5ea0f..77eb36c4 100644 --- a/docs/source/modules/skrl.agents.ddqn.rst +++ b/docs/source/modules/skrl.agents.ddqn.rst @@ -16,13 +16,13 @@ Algorithm implementation **Learning algorithm** (:literal:`_update(...)`) | :green:`# sample a batch from memory` -| :math:`s, a, r, s', d \leftarrow` states, actions, rewards, next_states, dones +| [:math:`s, a, r, s', d`] :math:`\leftarrow` states, actions, rewards, next_states, dones of size :guilabel:`batch_size` | :green:`# gradient steps` -| **FOR** each gradient step **DO** +| **FOR** each gradient step up to :guilabel:`gradient_steps` **DO** | :green:`# compute target values` | :math:`Q' \leftarrow Q_{\phi_{target}}(s')` | :math:`Q_{_{target}} \leftarrow Q'[\underset{a}{\arg\max} \; Q_\phi(s')] \qquad` :gray:`# the only difference with DQN` -| :math:`y \leftarrow r + \gamma \; \neg d \; Q_{_{target}}` +| :math:`y \leftarrow r \;+` :guilabel:`discount_factor` :math:`\neg d \; Q_{_{target}}` | :green:`# compute Q-network loss` | :math:`Q \leftarrow Q_\phi(s)[a]` | :math:`{Loss}_{Q_\phi} \leftarrow \frac{1}{N} \sum_{i=1}^N (Q - y)^2` @@ -30,7 +30,10 @@ Algorithm implementation | :math:`\nabla_{\phi} {Loss}_{Q_\phi}` | :green:`# update target network` | **IF** it's time to update target network **THEN** -| :math:`\phi_{target} \leftarrow \tau \; \phi + (1 - \tau) \phi_{target}` +| :math:`\phi_{target} \leftarrow` :guilabel:`polyak` :math:`\phi + (1 \;-` :guilabel:`polyak` :math:`) \phi_{target}` +| :green:`# update learning rate` +| **IF** there is a :guilabel:`learning_rate_scheduler` **THEN** +| step :math:`\text{scheduler}_\phi (\text{optimizer}_\phi)` Configuration and hyperparameters ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -39,18 +42,18 @@ Configuration and hyperparameters .. literalinclude:: ../../../skrl/agents/torch/dqn/ddqn.py :language: python - :lines: 16-52 + :lines: 16-55 :linenos: Spaces and models ^^^^^^^^^^^^^^^^^ -The implementation supports the following `Gym spaces `_ +The implementation supports the following `Gym spaces `_ / `Gymnasium spaces `_ .. list-table:: :header-rows: 1 - * - Gym spaces + * - Gym/Gymnasium spaces - .. centered:: Observation - .. centered:: Action * - Discrete @@ -87,6 +90,18 @@ The implementation uses 2 deterministic function approximators. These function a - action - :ref:`Deterministic ` +Support for advanced features is described in the next table + +.. list-table:: + :header-rows: 1 + + * - Feature + - Support and remarks + * - Shared model + - \- + * - RNN support + - \- + API ^^^ @@ -95,5 +110,5 @@ API :show-inheritance: :private-members: _update :members: - + .. 
automethod:: __init__ diff --git a/docs/source/modules/skrl.agents.dqn.rst b/docs/source/modules/skrl.agents.dqn.rst index 200baf9a..991733c5 100644 --- a/docs/source/modules/skrl.agents.dqn.rst +++ b/docs/source/modules/skrl.agents.dqn.rst @@ -1,7 +1,7 @@ Deep Q-Network (DQN) ==================== -DQN is a **model-free**, **off-policy** algorithm that trains a control policies directly from high-dimensional sensory using a deep function approximator to represent the Q-value function +DQN is a **model-free**, **off-policy** algorithm that trains a control policies directly from high-dimensional sensory using a deep function approximator to represent the Q-value function Paper: `Playing Atari with Deep Reinforcement Learning `_ @@ -16,13 +16,13 @@ Algorithm implementation **Learning algorithm** (:literal:`_update(...)`) | :green:`# sample a batch from memory` -| :math:`s, a, r, s', d \leftarrow` states, actions, rewards, next_states, dones +| [:math:`s, a, r, s', d`] :math:`\leftarrow` states, actions, rewards, next_states, dones of size :guilabel:`batch_size` | :green:`# gradient steps` -| **FOR** each gradient step **DO** +| **FOR** each gradient step up to :guilabel:`gradient_steps` **DO** | :green:`# compute target values` | :math:`Q' \leftarrow Q_{\phi_{target}}(s')` | :math:`Q_{_{target}} \leftarrow \underset{a}{\max} \; Q' \qquad` :gray:`# the only difference with DDQN` -| :math:`y \leftarrow r + \gamma \; \neg d \; Q_{_{target}}` +| :math:`y \leftarrow r \;+` :guilabel:`discount_factor` :math:`\neg d \; Q_{_{target}}` | :green:`# compute Q-network loss` | :math:`Q \leftarrow Q_\phi(s)[a]` | :math:`{Loss}_{Q_\phi} \leftarrow \frac{1}{N} \sum_{i=1}^N (Q - y)^2` @@ -30,7 +30,10 @@ Algorithm implementation | :math:`\nabla_{\phi} {Loss}_{Q_\phi}` | :green:`# update target network` | **IF** it's time to update target network **THEN** -| :math:`\phi_{target} \leftarrow \tau \; \phi + (1 - \tau) \phi_{target}` +| :math:`\phi_{target} \leftarrow` :guilabel:`polyak` :math:`\phi + (1 \;-` :guilabel:`polyak` :math:`) \phi_{target}` +| :green:`# update learning rate` +| **IF** there is a :guilabel:`learning_rate_scheduler` **THEN** +| step :math:`\text{scheduler}_\phi (\text{optimizer}_\phi)` Configuration and hyperparameters ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -39,18 +42,18 @@ Configuration and hyperparameters .. literalinclude:: ../../../skrl/agents/torch/dqn/dqn.py :language: python - :lines: 16-52 + :lines: 16-55 :linenos: Spaces and models ^^^^^^^^^^^^^^^^^ -The implementation supports the following `Gym spaces `_ +The implementation supports the following `Gym spaces `_ / `Gymnasium spaces `_ .. list-table:: :header-rows: 1 - * - Gym spaces + * - Gym/Gymnasium spaces - .. centered:: Observation - .. centered:: Action * - Discrete @@ -87,6 +90,18 @@ The implementation uses 2 deterministic function approximators. These function a - action - :ref:`Deterministic ` +Support for advanced features is described in the next table + +.. list-table:: + :header-rows: 1 + + * - Feature + - Support and remarks + * - Shared model + - \- + * - RNN support + - \- + API ^^^ @@ -95,5 +110,5 @@ API :show-inheritance: :private-members: _update :members: - + .. 
automethod:: __init__ diff --git a/docs/source/modules/skrl.agents.ppo.rst b/docs/source/modules/skrl.agents.ppo.rst index 11fdb703..a87ca455 100644 --- a/docs/source/modules/skrl.agents.ppo.rst +++ b/docs/source/modules/skrl.agents.ppo.rst @@ -9,7 +9,7 @@ Algorithm ^^^^^^^^^ | For each iteration do: -| :math:`\bullet \;` Collect, in a rollout memory, a set of states :math:`s`, actions :math:`a`, rewards :math:`r`, dones :math:`d`, log probabilities :math:`logp` and values :math:`V` on policy using :math:`\pi_\theta` and :math:`V_\phi` +| :math:`\bullet \;` Collect, in a rollout memory, a set of states :math:`s`, actions :math:`a`, rewards :math:`r`, dones :math:`d`, log probabilities :math:`logp` and values :math:`V` on policy using :math:`\pi_\theta` and :math:`V_\phi` | :math:`\bullet \;` Estimate returns :math:`R` and advantages :math:`A` using Generalized Advantage Estimation (GAE(:math:`\lambda`)) from the collected data [:math:`r, d, V`] | :math:`\bullet \;` Compute the entropy loss :math:`{L}_{entropy}` | :math:`\bullet \;` Compute the clipped surrogate objective (policy loss) with :math:`ratio` as the probability ratio between the action under the current policy and the action under the previous policy: :math:`L^{clip}_{\pi_\theta} = \mathbb{E}[\min(A \; ratio, A \; \text{clip}(ratio, 1-c, 1+c))]` @@ -79,7 +79,7 @@ Algorithm implementation | :green:`# optimization step` | reset :math:`\text{optimizer}_{\theta, \phi}` | :math:`\nabla_{\theta, \, \phi} (L^{clip}_{\pi_\theta} + {L}_{entropy} + L_{V_\phi})` -| :math:`\text{clip}(\lVert \nabla_{\theta, \, \phi} \rVert)` with :guilabel:`grad_norm_clip` +| :math:`\text{clip}(\lVert \nabla_{\theta, \, \phi} \rVert)` with :guilabel:`grad_norm_clip` | step :math:`\text{optimizer}_{\theta, \phi}` | :green:`# update learning rate` | **IF** there is a :guilabel:`learning_rate_scheduler` **THEN** @@ -92,18 +92,18 @@ Configuration and hyperparameters .. literalinclude:: ../../../skrl/agents/torch/ppo/ppo.py :language: python - :lines: 18-58 + :lines: 18-61 :linenos: Spaces and models ^^^^^^^^^^^^^^^^^ -The implementation supports the following `Gym spaces `_ +The implementation supports the following `Gym spaces `_ / `Gymnasium spaces `_ .. list-table:: :header-rows: 1 - * - Gym spaces + * - Gym/Gymnasium spaces - .. centered:: Observation - .. centered:: Action * - Discrete @@ -140,6 +140,18 @@ The implementation uses 1 stochastic (discrete or continuous) and 1 deterministi - 1 - :ref:`Deterministic ` +Support for advanced features is described in the next table + +.. list-table:: + :header-rows: 1 + + * - Feature + - Support and remarks + * - Shared model + - for Policy and Value + * - RNN support + - RNN, LSTM, GRU and any other variant + API ^^^ @@ -148,5 +160,5 @@ API :show-inheritance: :private-members: _update :members: - + .. automethod:: __init__ diff --git a/docs/source/modules/skrl.agents.q_learning.rst b/docs/source/modules/skrl.agents.q_learning.rst index 2ea947f6..4c01c747 100644 --- a/docs/source/modules/skrl.agents.q_learning.rst +++ b/docs/source/modules/skrl.agents.q_learning.rst @@ -30,18 +30,18 @@ Configuration and hyperparameters .. literalinclude:: ../../../skrl/agents/torch/q_learning/q_learning.py :language: python - :lines: 14-32 + :lines: 14-35 :linenos: Spaces and models ^^^^^^^^^^^^^^^^^ -The implementation supports the following `Gym spaces `_ +The implementation supports the following `Gym spaces `_ / `Gymnasium spaces `_ .. list-table:: :header-rows: 1 - * - Gym spaces + * - Gym/Gymnasium spaces - .. 
centered:: Observation - .. centered:: Action * - Discrete @@ -80,5 +80,5 @@ API :show-inheritance: :private-members: _update :members: - + .. automethod:: __init__ diff --git a/docs/source/modules/skrl.agents.sac.rst b/docs/source/modules/skrl.agents.sac.rst index 0af74b70..b81af374 100644 --- a/docs/source/modules/skrl.agents.sac.rst +++ b/docs/source/modules/skrl.agents.sac.rst @@ -32,6 +32,7 @@ Algorithm implementation | :green:`# optimization step (critic)` | reset :math:`\text{optimizer}_\phi` | :math:`\nabla_{\phi} L_{Q_\phi}` +| :math:`\text{clip}(\lVert \nabla_{\phi} \rVert)` with :guilabel:`grad_norm_clip` | step :math:`\text{optimizer}_\phi` | :green:`# compute policy (actor) loss` | :math:`a,\; logp \leftarrow \pi_\theta(s)` @@ -41,6 +42,7 @@ Algorithm implementation | :green:`# optimization step (policy)` | reset :math:`\text{optimizer}_\theta` | :math:`\nabla_{\theta} L_{\pi_\theta}` +| :math:`\text{clip}(\lVert \nabla_{\theta} \rVert)` with :guilabel:`grad_norm_clip` | step :math:`\text{optimizer}_\theta` | :green:`# entropy learning` | **IF** :guilabel:`learn_entropy` is enabled **THEN** @@ -67,18 +69,18 @@ Configuration and hyperparameters .. literalinclude:: ../../../skrl/agents/torch/sac/sac.py :language: python - :lines: 17-50 + :lines: 18-56 :linenos: Spaces and models ^^^^^^^^^^^^^^^^^ -The implementation supports the following `Gym spaces `_ +The implementation supports the following `Gym spaces `_ / `Gymnasium spaces `_ .. list-table:: :header-rows: 1 - * - Gym spaces + * - Gym/Gymnasium spaces - .. centered:: Observation - .. centered:: Action * - Discrete @@ -133,6 +135,18 @@ The implementation uses 1 stochastic and 4 deterministic function approximators. - 1 - :ref:`Deterministic ` +Support for advanced features is described in the next table + +.. list-table:: + :header-rows: 1 + + * - Feature + - Support and remarks + * - Shared model + - \- + * - RNN support + - RNN, LSTM, GRU and any other variant + API ^^^ @@ -141,5 +155,5 @@ API :show-inheritance: :private-members: _update :members: - + .. automethod:: __init__ diff --git a/docs/source/modules/skrl.agents.sarsa.rst b/docs/source/modules/skrl.agents.sarsa.rst index 9bb4f23f..c880cb7c 100644 --- a/docs/source/modules/skrl.agents.sarsa.rst +++ b/docs/source/modules/skrl.agents.sarsa.rst @@ -29,18 +29,18 @@ Configuration and hyperparameters .. literalinclude:: ../../../skrl/agents/torch/sarsa/sarsa.py :language: python - :lines: 14-32 + :lines: 14-35 :linenos: Spaces and models ^^^^^^^^^^^^^^^^^ -The implementation supports the following `Gym spaces `_ +The implementation supports the following `Gym spaces `_ / `Gymnasium spaces `_ .. list-table:: :header-rows: 1 - * - Gym spaces + * - Gym/Gymnasium spaces - .. centered:: Observation - .. centered:: Action * - Discrete @@ -79,5 +79,5 @@ API :show-inheritance: :private-members: _update :members: - + .. 
automethod:: __init__ diff --git a/docs/source/modules/skrl.agents.td3.rst b/docs/source/modules/skrl.agents.td3.rst index eb30fb80..772a292c 100644 --- a/docs/source/modules/skrl.agents.td3.rst +++ b/docs/source/modules/skrl.agents.td3.rst @@ -1,7 +1,7 @@ Twin-Delayed DDPG (TD3) ======================= -TD3 is a **model-free**, **deterministic** **off-policy** **actor-critic** algorithm (based on DDPG) that relies on double Q-learning, target policy smoothing and delayed policy updates to address the problems introduced by overestimation bias in actor-critic algorithms +TD3 is a **model-free**, **deterministic** **off-policy** **actor-critic** algorithm (based on DDPG) that relies on double Q-learning, target policy smoothing and delayed policy updates to address the problems introduced by overestimation bias in actor-critic algorithms Paper: `Addressing Function Approximation Error in Actor-Critic Methods `_ @@ -28,7 +28,7 @@ Algorithm implementation | **FOR** each gradient step up to :guilabel:`gradient_steps` **DO** | :green:`# target policy smoothing` | :math:`a' \leftarrow \mu_{\theta_{target}}(s')` -| :math:`noise \leftarrow \text{clip}(` :guilabel:`smooth_regularization_noise` :math:`, -c, c) \qquad` with :math:`c` as :guilabel:`smooth_regularization_clip` +| :math:`noise \leftarrow \text{clip}(` :guilabel:`smooth_regularization_noise` :math:`, -c, c) \qquad` with :math:`c` as :guilabel:`smooth_regularization_clip` | :math:`a' \leftarrow a' + noise` | :math:`a' \leftarrow \text{clip}(a', {a'}_{Low}, {a'}_{High})` | :green:`# compute target values` @@ -43,6 +43,7 @@ Algorithm implementation | :green:`# optimization step (critic)` | reset :math:`\text{optimizer}_\phi` | :math:`\nabla_{\phi} L_{Q_\phi}` +| :math:`\text{clip}(\lVert \nabla_{\phi} \rVert)` with :guilabel:`grad_norm_clip` | step :math:`\text{optimizer}_\phi` | :green:`# delayed update` | **IF** it's time for the :guilabel:`policy_delay` update **THEN** @@ -53,6 +54,7 @@ Algorithm implementation | :green:`# optimization step (policy)` | reset :math:`\text{optimizer}_\theta` | :math:`\nabla_{\theta} L_{\mu_\theta}` +| :math:`\text{clip}(\lVert \nabla_{\theta} \rVert)` with :guilabel:`grad_norm_clip` | step :math:`\text{optimizer}_\theta` | :green:`# update target networks` | :math:`\theta_{target} \leftarrow` :guilabel:`polyak` :math:`\theta + (1 \;-` :guilabel:`polyak` :math:`) \theta_{target}` @@ -70,18 +72,18 @@ Configuration and hyperparameters .. literalinclude:: ../../../skrl/agents/torch/td3/td3.py :language: python - :lines: 16-55 + :lines: 17-61 :linenos: Spaces and models ^^^^^^^^^^^^^^^^^ -The implementation supports the following `Gym spaces `_ +The implementation supports the following `Gym spaces `_ / `Gymnasium spaces `_ .. list-table:: :header-rows: 1 - * - Gym spaces + * - Gym/Gymnasium spaces - .. centered:: Observation - .. centered:: Action * - Discrete @@ -142,6 +144,18 @@ The implementation uses 6 deterministic function approximators. These function a - 1 - :ref:`Deterministic ` +Support for advanced features is described in the next table + +.. list-table:: + :header-rows: 1 + + * - Feature + - Support and remarks + * - Shared model + - \- + * - RNN support + - RNN, LSTM, GRU and any other variant + API ^^^ @@ -150,5 +164,5 @@ API :show-inheritance: :private-members: _update :members: - + .. 
automethod:: __init__ diff --git a/docs/source/modules/skrl.agents.trpo.rst b/docs/source/modules/skrl.agents.trpo.rst index bd2b0ad6..dcc4ed00 100644 --- a/docs/source/modules/skrl.agents.trpo.rst +++ b/docs/source/modules/skrl.agents.trpo.rst @@ -9,7 +9,7 @@ Algorithm ^^^^^^^^^ | For each iteration do -| :math:`\bullet \;` Collect, in a rollout memory, a set of states :math:`s`, actions :math:`a`, rewards :math:`r`, dones :math:`d`, log probabilities :math:`logp` and values :math:`V` on policy using :math:`\pi_\theta` and :math:`V_\phi` +| :math:`\bullet \;` Collect, in a rollout memory, a set of states :math:`s`, actions :math:`a`, rewards :math:`r`, dones :math:`d`, log probabilities :math:`logp` and values :math:`V` on policy using :math:`\pi_\theta` and :math:`V_\phi` | :math:`\bullet \;` Estimate returns :math:`R` and advantages :math:`A` using Generalized Advantage Estimation (GAE(:math:`\lambda`)) from the collected data [:math:`r, d, V`] | :math:`\bullet \;` Compute the surrogate objective (policy loss) gradient :math:`g` and the Hessian :math:`H` of :math:`KL` divergence with respect to the policy parameters :math:`\theta` | :math:`\bullet \;` Compute the search direction :math:`\; x \approx H^{-1}g \;` using the conjugate gradient method @@ -79,45 +79,47 @@ Algorithm implementation | :green:`# compute returns and advantages` | :math:`V_{_{last}}' \leftarrow V_\phi(s')` | :math:`R, A \leftarrow f_{GAE}(r, d, V, V_{_{last}}')` +| :green:`# sample all from memory` +| [[:math:`s, a, logp, A`]] :math:`\leftarrow` states, actions, log_prob, advantages +| :green:`# compute policy loss gradient` +| :math:`L_{\pi_\theta} \leftarrow f_{Loss}(\pi_\theta, s, a, logp, A)` +| :math:`g \leftarrow \nabla_{\theta} L_{\pi_\theta}` +| :math:`g_{_{flat}} \leftarrow \text{flatten}(g)` +| :green:`# compute the search direction using the conjugate gradient algorithm` +| :math:`search_{direction} \leftarrow f_{CG}(\pi_\theta, s, g_{_{flat}})` +| :green:`# compute step size and full step` +| :math:`xHx \leftarrow search_{direction} \; f_{Ax}(\pi_\theta, s, search_{direction})` +| :math:`step_{size} \leftarrow \sqrt{\dfrac{2 \, \delta}{xHx}} \qquad` with :math:`\; \delta` as :guilabel:`max_kl_divergence` +| :math:`\beta \leftarrow step_{size} \; search_{direction}` +| :green:`# backtracking line search` +| :math:`flag_{restore} \leftarrow \text{True}` +| :math:`\pi_{\theta_{backup}} \leftarrow \pi_\theta` +| :math:`\theta \leftarrow \text{get_parameters}(\pi_\theta)` +| :math:`I_{expected} \leftarrow g_{_{flat}} \; \beta` +| **FOR** :math:`\alpha \leftarrow (0.5` :guilabel:`step_fraction` :math:`)^i \;` with :math:`i = 0, 1, 2, ...` up to :guilabel:`max_backtrack_steps` **DO** +| :math:`\theta_{new} \leftarrow \theta + \alpha \; \beta` +| :math:`\pi_\theta \leftarrow \text{set_parameters}(\theta_{new})` +| :math:`I_{expected} \leftarrow \alpha \; I_{expected}` +| :math:`kl \leftarrow f_{KL}(\pi_{\theta_{backup}}, \pi_\theta, s)` +| :math:`L \leftarrow f_{Loss}(\pi_\theta, s, a, logp, A)` +| **IF** :math:`kl < \delta` **AND** :math:`\dfrac{L - L_{\pi_\theta}}{I_{expected}} >` :guilabel:`accept_ratio` **THEN** +| :math:`flag_{restore} \leftarrow \text{False}` +| **BREAK LOOP** +| **IF** :math:`flag_{restore}` **THEN** +| :math:`\pi_\theta \leftarrow \pi_{\theta_{backup}}` | :green:`# sample mini-batches from memory` -| [[:math:`s, a, logp, R, A`]] :math:`\leftarrow` states, actions, log_prob, returns, advantages +| [[:math:`s, R`]] :math:`\leftarrow` states, returns | :green:`# learning epochs` | **FOR** 
each learning epoch up to :guilabel:`learning_epochs` **DO** | :green:`# mini-batches loop` -| **FOR** each mini-batch [:math:`s, a, logp, R, A`] up to :guilabel:`mini_batches` **DO** -| :green:`# compute policy loss gradient` -| :math:`L_{\pi_\theta} \leftarrow f_{Loss}(\pi_\theta, s, a, logp, A)` -| :math:`g \leftarrow \nabla_{\theta} L_{\pi_\theta}` -| :math:`g_{_{flat}} \leftarrow \text{flatten}(g)` -| :green:`# compute the search direction using the conjugate gradient algorithm` -| :math:`search_{direction} \leftarrow f_{CG}(\pi_\theta, s, g_{_{flat}})` -| :green:`# compute step size and full step` -| :math:`xHx \leftarrow search_{direction} \; f_{Ax}(\pi_\theta, s, search_{direction})` -| :math:`step_{size} \leftarrow \sqrt{\dfrac{2 \, \delta}{xHx}} \qquad` with :math:`\; \delta` as :guilabel:`max_kl_divergence` -| :math:`\beta \leftarrow step_{size} \; search_{direction}` -| :green:`# backtracking line search` -| :math:`flag_{restore} \leftarrow \text{True}` -| :math:`\pi_{\theta_{backup}} \leftarrow \pi_\theta` -| :math:`\theta \leftarrow \text{get_parameters}(\pi_\theta)` -| :math:`I_{expected} \leftarrow g_{_{flat}} \; \beta` -| **FOR** :math:`\alpha \leftarrow (0.5` :guilabel:`step_fraction` :math:`)^i \;` with :math:`i = 0, 1, 2, ...` up to :guilabel:`max_backtrack_steps` **DO** -| :math:`\theta_{new} \leftarrow \theta + \alpha \; \beta` -| :math:`\pi_\theta \leftarrow \text{set_parameters}(\theta_{new})` -| :math:`I_{expected} \leftarrow \alpha \; I_{expected}` -| :math:`kl \leftarrow f_{KL}(\pi_{\theta_{backup}}, \pi_\theta, s)` -| :math:`L \leftarrow f_{Loss}(\pi_\theta, s, a, logp, A)` -| **IF** :math:`kl < \delta` **AND** :math:`\dfrac{L - L_{\pi_\theta}}{I_{expected}} >` :guilabel:`accept_ratio` **THEN** -| :math:`flag_{restore} \leftarrow \text{False}` -| **BREAK LOOP** -| **IF** :math:`flag_{restore}` **THEN** -| :math:`\pi_\theta \leftarrow \pi_{\theta_{backup}}` +| **FOR** each mini-batch [:math:`s, R`] up to :guilabel:`mini_batches` **DO** | :green:`# compute value loss` | :math:`V' \leftarrow V_\phi(s)` | :math:`L_{V_\phi} \leftarrow` :guilabel:`value_loss_scale` :math:`\frac{1}{N} \sum_{i=1}^N (R - V')^2` | :green:`# optimization step (value)` | reset :math:`\text{optimizer}_\phi` | :math:`\nabla_{\phi} L_{V_\phi}` -| :math:`\text{clip}(\lVert \nabla_{\phi} \rVert)` with :guilabel:`grad_norm_clip` +| :math:`\text{clip}(\lVert \nabla_{\phi} \rVert)` with :guilabel:`grad_norm_clip` | step :math:`\text{optimizer}_\phi` | :green:`# update learning rate` | **IF** there is a :guilabel:`learning_rate_scheduler` **THEN** @@ -130,18 +132,18 @@ Configuration and hyperparameters .. literalinclude:: ../../../skrl/agents/torch/trpo/trpo.py :language: python - :lines: 18-58 + :lines: 18-61 :linenos: Spaces and models ^^^^^^^^^^^^^^^^^ -The implementation supports the following `Gym spaces `_ +The implementation supports the following `Gym spaces `_ / `Gymnasium spaces `_ .. list-table:: :header-rows: 1 - * - Gym spaces + * - Gym/Gymnasium spaces - .. centered:: Observation - .. centered:: Action * - Discrete @@ -178,6 +180,18 @@ The implementation uses 1 stochastic and 1 deterministic function approximator. - 1 - :ref:`Deterministic ` +Support for advanced features is described in the next table + +.. list-table:: + :header-rows: 1 + + * - Feature + - Support and remarks + * - Shared model + - \- + * - RNN support + - RNN, LSTM, GRU and any other variant + API ^^^ @@ -186,5 +200,5 @@ API :show-inheritance: :private-members: _update :members: - + .. 
automethod:: __init__ diff --git a/docs/source/modules/skrl.envs.isaac_gym.rst b/docs/source/modules/skrl.envs.isaac_gym.rst index 6a386f24..a99c2ef9 100644 --- a/docs/source/modules/skrl.envs.isaac_gym.rst +++ b/docs/source/modules/skrl.envs.isaac_gym.rst @@ -26,9 +26,9 @@ With the release of Isaac Gym (preview 4), NVIDIA developers provide an easy-to- import isaacgymenvs - env = isaacgymenvs.make(seed=0, - task="Cartpole", - num_envs=2000, + env = isaacgymenvs.make(seed=0, + task="Cartpole", + num_envs=2000, sim_device="cuda:0", rl_device="cuda:0", graphics_device_id=0, @@ -61,7 +61,7 @@ Basic usage env = load_isaacgym_env_preview4(task_name="Cartpole") .. tab:: Command line arguments (priority) - + .. code-block:: python :linenos: @@ -118,7 +118,7 @@ Basic usage env = load_isaacgym_env_preview3(task_name="Cartpole") .. tab:: Command line arguments (priority) - + .. code-block:: python :linenos: @@ -171,7 +171,7 @@ Basic usage env = load_isaacgym_env_preview2(task_name="Cartpole") .. tab:: Command line arguments (priority) - + .. code-block:: python :linenos: diff --git a/docs/source/modules/skrl.envs.omniverse_isaac_gym.rst b/docs/source/modules/skrl.envs.omniverse_isaac_gym.rst index 4a6b3b34..c1b13a56 100644 --- a/docs/source/modules/skrl.envs.omniverse_isaac_gym.rst +++ b/docs/source/modules/skrl.envs.omniverse_isaac_gym.rst @@ -53,7 +53,7 @@ In this approach, the RL algorithm maintains the main execution loop env = load_omniverse_isaacgym_env(task_name="Cartpole") .. tab:: Command line arguments (priority) - + .. code-block:: python :linenos: @@ -98,7 +98,7 @@ In this approach, the RL algorithm is executed on a secondary thread while the s env.run() .. tab:: Command line arguments (priority) - + .. code-block:: python :linenos: diff --git a/docs/source/modules/skrl.envs.wrapping.rst b/docs/source/modules/skrl.envs.wrapping.rst index 3f0a7be5..12a28131 100644 --- a/docs/source/modules/skrl.envs.wrapping.rst +++ b/docs/source/modules/skrl.envs.wrapping.rst @@ -3,8 +3,9 @@ Wrapping This library works with a common API to interact with the following RL environments: -* `OpenAI Gym `_ (single and vectorized environments) +* OpenAI `Gym `_ / Farama `Gymnasium `_ (single and vectorized environments) * `DeepMind `_ +* `robosuite `_ * `NVIDIA Isaac Gym `_ (preview 2, 3 and 4) * `NVIDIA Omniverse Isaac Gym `_ @@ -63,7 +64,7 @@ Basic usage .. tabs:: .. tab:: Preview 4 (isaacgymenvs.make) - + .. code-block:: python :linenos: @@ -73,9 +74,9 @@ Basic usage from skrl.envs.torch import wrap_env # create/load the environment using the easy-to-use API from NVIDIA - env = isaacgymenvs.make(seed=0, - task="Cartpole", - num_envs=512, + env = isaacgymenvs.make(seed=0, + task="Cartpole", + num_envs=512, sim_device="cuda:0", rl_device="cuda:0", graphics_device_id=0, @@ -85,7 +86,7 @@ Basic usage env = wrap_env(env) # or 'env = wrap_env(env, wrapper="isaacgym-preview4")' .. tab:: Preview 4 - + .. code-block:: python :linenos: @@ -100,7 +101,7 @@ Basic usage env = wrap_env(env) # or 'env = wrap_env(env, wrapper="isaacgym-preview4")' .. tab:: Preview 3 - + .. code-block:: python :linenos: @@ -115,7 +116,7 @@ Basic usage env = wrap_env(env) # or 'env = wrap_env(env, wrapper="isaacgym-preview3")' .. tab:: Preview 2 - + .. code-block:: python :linenos: @@ -129,44 +130,84 @@ Basic usage # wrap the environment env = wrap_env(env) # or 'env = wrap_env(env, wrapper="isaacgym-preview2")' - .. tab:: OpenAI Gym - + .. tab:: Gym / Gymnasium + .. tabs:: - .. tab:: Single environment + .. tab:: Gym - .. 
code-block:: python - :linenos: + .. tabs:: - # import the environment wrapper and gym - from skrl.envs.torch import wrap_env - import gym + .. tab:: Single environment - # load environment - env = gym.make('Pendulum-v1') + .. code-block:: python + :linenos: - # wrap the environment - env = wrap_env(env) # or 'env = wrap_env(env, wrapper="gym")' + # import the environment wrapper and gym + from skrl.envs.torch import wrap_env + import gym - .. tab:: Vectorized environment + # load environment + env = gym.make('Pendulum-v1') - Visit the OpenAI Gym documentation (`Vector API `_) for more information about the creation and usage of vectorized environments + # wrap the environment + env = wrap_env(env) # or 'env = wrap_env(env, wrapper="gym")' - .. code-block:: python - :linenos: + .. tab:: Vectorized environment - # import the environment wrapper and gym - from skrl.envs.torch import wrap_env - import gym + Visit the Gym documentation (`Vector `__) for more information about the creation and usage of vectorized environments - # load a vectorized environment - env = gym.vector.make("Pendulum-v1", num_envs=10, asynchronous=False) + .. code-block:: python + :linenos: - # wrap the environment - env = wrap_env(env) # or 'env = wrap_env(env, wrapper="gym")' + # import the environment wrapper and gym + from skrl.envs.torch import wrap_env + import gym + + # load a vectorized environment + env = gym.vector.make("Pendulum-v1", num_envs=10, asynchronous=False) + + # wrap the environment + env = wrap_env(env) # or 'env = wrap_env(env, wrapper="gym")' + + .. tab:: Gymnasium + + .. tabs:: + + .. tab:: Single environment + + .. code-block:: python + :linenos: + + # import the environment wrapper and gymnasium + from skrl.envs.torch import wrap_env + import gymnasium as gym + + # load environment + env = gym.make('Pendulum-v1') + + # wrap the environment + env = wrap_env(env) # or 'env = wrap_env(env, wrapper="gymnasium")' + + .. tab:: Vectorized environment + + Visit the Gymnasium documentation (`Vector `__) for more information about the creation and usage of vectorized environments + + .. code-block:: python + :linenos: + + # import the environment wrapper and gymnasium + from skrl.envs.torch import wrap_env + import gymnasium as gym + + # load a vectorized environment + env = gym.vector.make("Pendulum-v1", num_envs=10, asynchronous=False) + + # wrap the environment + env = wrap_env(env) # or 'env = wrap_env(env, wrapper="gymnasium")' .. tab:: DeepMind - + .. code-block:: python :linenos: @@ -180,6 +221,35 @@ Basic usage # wrap the environment env = wrap_env(env) # or 'env = wrap_env(env, wrapper="dm")' + .. tab:: robosuite + + .. 
code-block:: python + :linenos: + + # import the environment wrapper and robosuite + from skrl.envs.torch import wrap_env + import robosuite + from robosuite.controllers import load_controller_config + + # load environment + controller_config = load_controller_config(default_controller="OSC_POSE") + env = robosuite.make("TwoArmLift", + robots=["Sawyer", "Panda"], # load a Sawyer robot and a Panda robot + gripper_types="default", # use default grippers per robot arm + controller_configs=controller_config, # each arm is controlled using OSC + env_configuration="single-arm-opposed", # (two-arm envs only) arms face each other + has_renderer=True, # on-screen rendering + render_camera="frontview", # visualize the "frontview" camera + has_offscreen_renderer=False, # no off-screen rendering + control_freq=20, # 20 hz control for applied actions + horizon=200, # each episode terminates after 200 steps + use_object_obs=True, # provide object observations to agent + use_camera_obs=False, # don't provide image observations to agent + reward_shaping=True) # use a dense reward signal for learning + + # wrap the environment + env = wrap_env(env) # or 'env = wrap_env(env, wrapper="robosuite")' + .. raw:: html
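+ Whatever the original environment type, the wrapped environment exposes the library's common, PyTorch-based API.
+ As a quick orientation, a minimal interaction loop could look like the following sketch. It is a sketch only: it
+ assumes a continuous (Box) action space and the Gymnasium-style ``reset`` / ``step`` return values adopted in this
+ release; refer to the wrapper classes in the Internal API section below for the exact signatures.
+
+ .. code-block:: python
+     :linenos:
+
+     import torch
+
+     # env: any environment previously wrapped with wrap_env(...) as shown in the tabs above
+     states, infos = env.reset()   # states is a torch.Tensor allocated on env.device
+
+     for _ in range(100):
+         # random actions, only to illustrate the data flow
+         # (a trained agent would compute them from the states tensor instead)
+         actions = 2 * torch.rand((env.num_envs, env.action_space.shape[0]), device=env.device) - 1
+
+         # all returned values are torch tensors shaped (num_envs, ...)
+         states, rewards, terminated, truncated, infos = env.step(actions)
+
+     env.close()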
@@ -200,41 +270,48 @@ Internal API :undoc-members: :show-inheritance: :members: - + .. automethod:: __init__ .. py:property:: device The device used by the environment - If the wrapped environment does not have the ``device`` property, the value of this property will be ``"cuda:0"`` or ``"cpu"`` depending on the device availability + If the wrapped environment does not have the ``device`` property, the value of this property will be ``"cuda:0"`` or ``"cpu"`` depending on the device availability .. autoclass:: skrl.envs.torch.wrappers.OmniverseIsaacGymWrapper :undoc-members: :show-inheritance: :members: - + .. automethod:: __init__ .. autoclass:: skrl.envs.torch.wrappers.IsaacGymPreview3Wrapper :undoc-members: :show-inheritance: :members: - + .. automethod:: __init__ .. autoclass:: skrl.envs.torch.wrappers.IsaacGymPreview2Wrapper :undoc-members: :show-inheritance: :members: - + .. automethod:: __init__ .. autoclass:: skrl.envs.torch.wrappers.GymWrapper :undoc-members: :show-inheritance: :members: - + + .. automethod:: __init__ + +.. autoclass:: skrl.envs.torch.wrappers.GymnasiumWrapper + :undoc-members: + :show-inheritance: + :members: + .. automethod:: __init__ .. autoclass:: skrl.envs.torch.wrappers.DeepMindWrapper @@ -242,5 +319,13 @@ Internal API :show-inheritance: :private-members: _spec_to_space, _observation_to_tensor, _tensor_to_action :members: - + + .. automethod:: __init__ + +.. autoclass:: skrl.envs.torch.wrappers.RobosuiteWrapper + :undoc-members: + :show-inheritance: + :private-members: _spec_to_space, _observation_to_tensor, _tensor_to_action + :members: + .. automethod:: __init__ diff --git a/docs/source/modules/skrl.memories.base_class.rst b/docs/source/modules/skrl.memories.base_class.rst index 101e98dc..99177175 100644 --- a/docs/source/modules/skrl.memories.base_class.rst +++ b/docs/source/modules/skrl.memories.base_class.rst @@ -11,13 +11,13 @@ Basic inheritance usage ^^^^^^^^^^^^^^^^^^^^^^^ .. tabs:: - + .. tab:: Inheritance .. literalinclude:: ../snippets/memory.py :language: python :linenos: - + API ^^^ @@ -25,6 +25,6 @@ API :undoc-members: :show-inheritance: :members: - + .. automethod:: __init__ .. automethod:: __len__ diff --git a/docs/source/modules/skrl.memories.random.rst b/docs/source/modules/skrl.memories.random.rst index 36842732..cce722d0 100644 --- a/docs/source/modules/skrl.memories.random.rst +++ b/docs/source/modules/skrl.memories.random.rst @@ -12,7 +12,7 @@ Basic usage # create a random memory object memory = RandomMemory(memory_size=1000, num_envs=1, replacement=False) - + # create tensors in memory memory.create_tensor(name="states", size=(64, 64, 3), dtype=torch.float32) memory.create_tensor(name="actions", size=(4,1), dtype=torch.float32) @@ -39,6 +39,6 @@ API :show-inheritance: :inherited-members: :members: - + .. automethod:: __init__ .. automethod:: __len__ diff --git a/docs/source/modules/skrl.models.base_class.rst b/docs/source/modules/skrl.models.base_class.rst index 8422448e..d4cbc0cf 100644 --- a/docs/source/modules/skrl.models.base_class.rst +++ b/docs/source/modules/skrl.models.base_class.rst @@ -36,7 +36,7 @@ API :show-inheritance: :private-members: _get_space_size :members: - + .. automethod:: __init__ .. 
py:property:: device diff --git a/docs/source/modules/skrl.models.categorical.rst b/docs/source/modules/skrl.models.categorical.rst index 6dc185c2..f9fbad84 100644 --- a/docs/source/modules/skrl.models.categorical.rst +++ b/docs/source/modules/skrl.models.categorical.rst @@ -38,23 +38,238 @@ Concept Basic usage ----------- +* Multi-Layer Perceptron (**MLP**) +* Convolutional Neural Network (**CNN**) +* Recurrent Neural Network (**RNN**) +* Gated Recurrent Unit RNN (**GRU**) +* Long Short-Term Memory RNN (**LSTM**) + .. tabs:: - - .. tab:: Multi-Layer Perceptron (MLP) - - .. literalinclude:: ../snippets/categorical_model.py - :language: python - :linenos: - :start-after: [start-mlp] - :end-before: [end-mlp] - - .. tab:: Convolutional Neural Network (CNN) - - .. literalinclude:: ../snippets/categorical_model.py - :language: python - :linenos: - :start-after: [start-cnn] - :end-before: [end-cnn] + + .. tab:: MLP + + .. image:: ../_static/imgs/model_categorical_mlp.svg + :width: 40% + :align: center + + .. raw:: html + +
+ + .. tabs:: + + .. group-tab:: nn.Sequential + + .. literalinclude:: ../snippets/categorical_model.py + :language: python + :linenos: + :start-after: [start-mlp-sequential] + :end-before: [end-mlp-sequential] + + .. group-tab:: nn.functional + + .. literalinclude:: ../snippets/categorical_model.py + :language: python + :linenos: + :start-after: [start-mlp-functional] + :end-before: [end-mlp-functional] + + .. tab:: CNN + + .. image:: ../_static/imgs/model_categorical_cnn.svg + :width: 100% + :align: center + + .. raw:: html + +
+ + .. tabs:: + + .. group-tab:: nn.Sequential + + .. literalinclude:: ../snippets/categorical_model.py + :language: python + :linenos: + :start-after: [start-cnn-sequential] + :end-before: [end-cnn-sequential] + + .. group-tab:: nn.functional + + .. literalinclude:: ../snippets/categorical_model.py + :language: python + :linenos: + :start-after: [start-cnn-functional] + :end-before: [end-cnn-functional] + + .. tab:: RNN + + .. image:: ../_static/imgs/model_categorical_rnn.svg + :width: 90% + :align: center + + where: + + .. math:: + \begin{aligned} + N ={} & \text{batch size} \\ + L ={} & \text{sequence length} \\ + D ={} & 2 \text{ if bidirectional=True otherwise } 1 \\ + H_{in} ={} & \text{input_size} \\ + H_{out} ={} & \text{hidden_size} + \end{aligned} + + .. raw:: html + +
+
+        The following points are relevant in the definition of recurrent models:
+
+        * The ``.get_specification()`` method must be overwritten to return, under a dictionary key ``"rnn"``, a sub-dictionary that includes the sequence length (under key ``"sequence_length"``) as a number and a list of the dimensions (under key ``"sizes"``) of each initial hidden state
+
+        * The ``.compute()`` method's ``inputs`` parameter will have, at least, the following items in the dictionary:
+
+          * ``"states"``: state of the environment used to make the decision
+          * ``"taken_actions"``: actions taken by the policy for the given states, if applicable
+          * ``"terminated"``: episode termination status for sampled environment transitions. This key is only defined during the training process
+          * ``"rnn"``: list of initial hidden states ordered according to the model specification
+
+        * The ``.compute()`` method must include, under the ``"rnn"`` key of the returned dictionary, a list of each final hidden state
+
+        .. raw:: html
+
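+        As a quick reference, a minimal ``.get_specification()`` consistent with the points above could look like the
+        following sketch. It assumes a unidirectional RNN (:math:`D = 1`) and that the number of layers, the hidden size,
+        the sequence length and the number of environments are stored as attributes in the model's ``__init__``; the
+        complete definitions are available in the snippets below.
+
+        .. code-block:: python
+
+            def get_specification(self):
+                # batch size (N) is the number of environments during rollout
+                return {"rnn": {"sequence_length": self.sequence_length,
+                                "sizes": [(self.num_layers, self.num_envs, self.hidden_size)]}}  # hidden states (D * num_layers, N, Hout)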
+ + .. tabs:: + + .. group-tab:: nn.Sequential + + .. literalinclude:: ../snippets/categorical_model.py + :language: python + :linenos: + :start-after: [start-rnn-sequential] + :end-before: [end-rnn-sequential] + + .. group-tab:: nn.functional + + .. literalinclude:: ../snippets/categorical_model.py + :language: python + :linenos: + :start-after: [start-rnn-functional] + :end-before: [end-rnn-functional] + + .. tab:: GRU + + .. image:: ../_static/imgs/model_categorical_rnn.svg + :width: 90% + :align: center + + where: + + .. math:: + \begin{aligned} + N ={} & \text{batch size} \\ + L ={} & \text{sequence length} \\ + D ={} & 2 \text{ if bidirectional=True otherwise } 1 \\ + H_{in} ={} & \text{input_size} \\ + H_{out} ={} & \text{hidden_size} + \end{aligned} + + .. raw:: html + +
+ + The following points are relevant in the definition of recurrent models: + + * The ``.get_specification()`` method must be overwritten to return, under a dictionary key ``"rnn"``, a sub-dictionary that includes the sequence length (under key ``"sequence_length"``) as a number and a list of the dimensions (under key ``"sizes"``) of each initial hidden state + + * The ``.compute()`` method's ``inputs`` parameter will have, at least, the following items in the dictionary: + + * ``"states"``: state of the environment used to make the decision + * ``"taken_actions"``: actions taken by the policy for the given states, if applicable + * ``"terminated"``: episode termination status for sampled environment transitions. This key is only defined during the training process + * ``"rnn"``: list of initial hidden states ordered according to the model specification + + * The ``.compute()`` method must inlcude, under the ``"rnn"`` key of the returned dictionary, a list of each final hidden state + + .. raw:: html + +
+ + .. tabs:: + + .. group-tab:: nn.Sequential + + .. literalinclude:: ../snippets/categorical_model.py + :language: python + :linenos: + :start-after: [start-gru-sequential] + :end-before: [end-gru-sequential] + + .. group-tab:: nn.functional + + .. literalinclude:: ../snippets/categorical_model.py + :language: python + :linenos: + :start-after: [start-gru-functional] + :end-before: [end-gru-functional] + + .. tab:: LSTM + + .. image:: ../_static/imgs/model_categorical_rnn.svg + :width: 90% + :align: center + + where: + + .. math:: + \begin{aligned} + N ={} & \text{batch size} \\ + L ={} & \text{sequence length} \\ + D ={} & 2 \text{ if bidirectional=True otherwise } 1 \\ + H_{in} ={} & \text{input_size} \\ + H_{cell} ={} & \text{hidden_size} \\ + H_{out} ={} & \text{proj_size if } \text{proj_size}>0 \text{ otherwise hidden_size} \\ + \end{aligned} + + .. raw:: html + +
+
+        The following points are relevant in the definition of recurrent models:
+
+        * The ``.get_specification()`` method must be overwritten to return, under a dictionary key ``"rnn"``, a sub-dictionary that includes the sequence length (under key ``"sequence_length"``) as a number and a list of the dimensions (under key ``"sizes"``) of each initial hidden/cell states
+
+        * The ``.compute()`` method's ``inputs`` parameter will have, at least, the following items in the dictionary:
+
+          * ``"states"``: state of the environment used to make the decision
+          * ``"taken_actions"``: actions taken by the policy for the given states, if applicable
+          * ``"terminated"``: episode termination status for sampled environment transitions. This key is only defined during the training process
+          * ``"rnn"``: list of initial hidden/cell states ordered according to the model specification
+
+        * The ``.compute()`` method must include, under the ``"rnn"`` key of the returned dictionary, a list of each final hidden/cell states
+
+        .. raw:: html
+
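+        For the LSTM case, the specification lists two sizes: one for the hidden states and one for the cell states.
+        A minimal sketch, under the same assumptions as the RNN case above (unidirectional layers, hyperparameters
+        stored as attributes in the model's ``__init__``):
+
+        .. code-block:: python
+
+            def get_specification(self):
+                # batch size (N) is the number of environments during rollout
+                return {"rnn": {"sequence_length": self.sequence_length,
+                                "sizes": [(self.num_layers, self.num_envs, self.hidden_size),    # hidden states (D * num_layers, N, Hout)
+                                          (self.num_layers, self.num_envs, self.hidden_size)]}}  # cell states (D * num_layers, N, Hcell)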
+ + .. tabs:: + + .. group-tab:: nn.Sequential + + .. literalinclude:: ../snippets/categorical_model.py + :language: python + :linenos: + :start-after: [start-lstm-sequential] + :end-before: [end-lstm-sequential] + + .. group-tab:: nn.functional + + .. literalinclude:: ../snippets/categorical_model.py + :language: python + :linenos: + :start-after: [start-lstm-functional] + :end-before: [end-lstm-functional] API --- diff --git a/docs/source/modules/skrl.models.deterministic.rst b/docs/source/modules/skrl.models.deterministic.rst index bccbf334..e4abb854 100644 --- a/docs/source/modules/skrl.models.deterministic.rst +++ b/docs/source/modules/skrl.models.deterministic.rst @@ -38,23 +38,238 @@ Concept Basic usage ----------- +* Multi-Layer Perceptron (**MLP**) +* Convolutional Neural Network (**CNN**) +* Recurrent Neural Network (**RNN**) +* Gated Recurrent Unit RNN (**GRU**) +* Long Short-Term Memory RNN (**LSTM**) + .. tabs:: - - .. tab:: Multi-Layer Perceptron (MLP) - - .. literalinclude:: ../snippets/deterministic_model.py - :language: python - :linenos: - :start-after: [start-mlp] - :end-before: [end-mlp] - - .. tab:: Convolutional Neural Network (CNN) - - .. literalinclude:: ../snippets/deterministic_model.py - :language: python - :linenos: - :start-after: [start-cnn] - :end-before: [end-cnn] + + .. tab:: MLP + + .. image:: ../_static/imgs/model_deterministic_mlp.svg + :width: 35% + :align: center + + .. raw:: html + +
+ + .. tabs:: + + .. group-tab:: nn.Sequential + + .. literalinclude:: ../snippets/deterministic_model.py + :language: python + :linenos: + :start-after: [start-mlp-sequential] + :end-before: [end-mlp-sequential] + + .. group-tab:: nn.functional + + .. literalinclude:: ../snippets/deterministic_model.py + :language: python + :linenos: + :start-after: [start-mlp-functional] + :end-before: [end-mlp-functional] + + .. tab:: CNN + + .. image:: ../_static/imgs/model_deterministic_cnn.svg + :width: 100% + :align: center + + .. raw:: html + +
+ + .. tabs:: + + .. group-tab:: nn.Sequential + + .. literalinclude:: ../snippets/deterministic_model.py + :language: python + :linenos: + :start-after: [start-cnn-sequential] + :end-before: [end-cnn-sequential] + + .. group-tab:: nn.functional + + .. literalinclude:: ../snippets/deterministic_model.py + :language: python + :linenos: + :start-after: [start-cnn-functional] + :end-before: [end-cnn-functional] + + .. tab:: RNN + + .. image:: ../_static/imgs/model_deterministic_rnn.svg + :width: 90% + :align: center + + where: + + .. math:: + \begin{aligned} + N ={} & \text{batch size} \\ + L ={} & \text{sequence length} \\ + D ={} & 2 \text{ if bidirectional=True otherwise } 1 \\ + H_{in} ={} & \text{input_size} \\ + H_{out} ={} & \text{hidden_size} + \end{aligned} + + .. raw:: html + +
+ + The following points are relevant in the definition of recurrent models: + + * The ``.get_specification()`` method must be overwritten to return, under a dictionary key ``"rnn"``, a sub-dictionary that includes the sequence length (under key ``"sequence_length"``) as a number and a list of the dimensions (under key ``"sizes"``) of each initial hidden state + + * The ``.compute()`` method's ``inputs`` parameter will have, at least, the following items in the dictionary: + + * ``"states"``: state of the environment used to make the decision + * ``"taken_actions"``: actions taken by the policy for the given states, if applicable + * ``"terminated"``: episode termination status for sampled environment transitions. This key is only defined during the training process + * ``"rnn"``: list of initial hidden states ordered according to the model specification + + * The ``.compute()`` method must inlcude, under the ``"rnn"`` key of the returned dictionary, a list of each final hidden state + + .. raw:: html + +
+ + .. tabs:: + + .. group-tab:: nn.Sequential + + .. literalinclude:: ../snippets/deterministic_model.py + :language: python + :linenos: + :start-after: [start-rnn-sequential] + :end-before: [end-rnn-sequential] + + .. group-tab:: nn.functional + + .. literalinclude:: ../snippets/deterministic_model.py + :language: python + :linenos: + :start-after: [start-rnn-functional] + :end-before: [end-rnn-functional] + + .. tab:: GRU + + .. image:: ../_static/imgs/model_deterministic_rnn.svg + :width: 90% + :align: center + + where: + + .. math:: + \begin{aligned} + N ={} & \text{batch size} \\ + L ={} & \text{sequence length} \\ + D ={} & 2 \text{ if bidirectional=True otherwise } 1 \\ + H_{in} ={} & \text{input_size} \\ + H_{out} ={} & \text{hidden_size} + \end{aligned} + + .. raw:: html + +
+ + The following points are relevant in the definition of recurrent models: + + * The ``.get_specification()`` method must be overwritten to return, under a dictionary key ``"rnn"``, a sub-dictionary that includes the sequence length (under key ``"sequence_length"``) as a number and a list of the dimensions (under key ``"sizes"``) of each initial hidden state + + * The ``.compute()`` method's ``inputs`` parameter will have, at least, the following items in the dictionary: + + * ``"states"``: state of the environment used to make the decision + * ``"taken_actions"``: actions taken by the policy for the given states, if applicable + * ``"terminated"``: episode termination status for sampled environment transitions. This key is only defined during the training process + * ``"rnn"``: list of initial hidden states ordered according to the model specification + + * The ``.compute()`` method must inlcude, under the ``"rnn"`` key of the returned dictionary, a list of each final hidden state + + .. raw:: html + +
+ + .. tabs:: + + .. group-tab:: nn.Sequential + + .. literalinclude:: ../snippets/deterministic_model.py + :language: python + :linenos: + :start-after: [start-gru-sequential] + :end-before: [end-gru-sequential] + + .. group-tab:: nn.functional + + .. literalinclude:: ../snippets/deterministic_model.py + :language: python + :linenos: + :start-after: [start-gru-functional] + :end-before: [end-gru-functional] + + .. tab:: LSTM + + .. image:: ../_static/imgs/model_deterministic_rnn.svg + :width: 90% + :align: center + + where: + + .. math:: + \begin{aligned} + N ={} & \text{batch size} \\ + L ={} & \text{sequence length} \\ + D ={} & 2 \text{ if bidirectional=True otherwise } 1 \\ + H_{in} ={} & \text{input_size} \\ + H_{cell} ={} & \text{hidden_size} \\ + H_{out} ={} & \text{proj_size if } \text{proj_size}>0 \text{ otherwise hidden_size} \\ + \end{aligned} + + .. raw:: html + +
+ + The following points are relevant in the definition of recurrent models: + + * The ``.get_specification()`` method must be overwritten to return, under a dictionary key ``"rnn"``, a sub-dictionary that includes the sequence length (under key ``"sequence_length"``) as a number and a list of the dimensions (under key ``"sizes"``) of each initial hidden/cell states + + * The ``.compute()`` method's ``inputs`` parameter will have, at least, the following items in the dictionary: + + * ``"states"``: state of the environment used to make the decision + * ``"taken_actions"``: actions taken by the policy for the given states, if applicable + * ``"terminated"``: episode termination status for sampled environment transitions. This key is only defined during the training process + * ``"rnn"``: list of initial hidden/cell states ordered according to the model specification + + * The ``.compute()`` method must inlcude, under the ``"rnn"`` key of the returned dictionary, a list of each final hidden/cell states + + .. raw:: html + +
+ + .. tabs:: + + .. group-tab:: nn.Sequential + + .. literalinclude:: ../snippets/deterministic_model.py + :language: python + :linenos: + :start-after: [start-lstm-sequential] + :end-before: [end-lstm-sequential] + + .. group-tab:: nn.functional + + .. literalinclude:: ../snippets/deterministic_model.py + :language: python + :linenos: + :start-after: [start-lstm-functional] + :end-before: [end-lstm-functional] API --- diff --git a/docs/source/modules/skrl.models.gaussian.rst b/docs/source/modules/skrl.models.gaussian.rst index 0b7dd56a..36f78e83 100644 --- a/docs/source/modules/skrl.models.gaussian.rst +++ b/docs/source/modules/skrl.models.gaussian.rst @@ -13,7 +13,7 @@ skrl provides a Python mixin (:literal:`GaussianMixin`) to assist in the creatio :emphasize-lines: 1 class GaussianModel(GaussianMixin, Model): - def __init__(self, observation_space, action_space, device="cuda:0", + def __init__(self, observation_space, action_space, device="cuda:0", clip_actions=False, clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"): Model.__init__(self, observation_space, action_space, device) GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction) @@ -24,7 +24,7 @@ skrl provides a Python mixin (:literal:`GaussianMixin`) to assist in the creatio :emphasize-lines: 4-5 class GaussianModel(GaussianMixin, Model): - def __init__(self, observation_space, action_space, device="cuda:0", + def __init__(self, observation_space, action_space, device="cuda:0", clip_actions=False, clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"): Model.__init__(self, observation_space, action_space, device) GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction) @@ -40,23 +40,238 @@ Concept Basic usage ----------- +* Multi-Layer Perceptron (**MLP**) +* Convolutional Neural Network (**CNN**) +* Recurrent Neural Network (**RNN**) +* Gated Recurrent Unit RNN (**GRU**) +* Long Short-Term Memory RNN (**LSTM**) + .. tabs:: - - .. tab:: Multi-Layer Perceptron (MLP) - - .. literalinclude:: ../snippets/gaussian_model.py - :language: python - :linenos: - :start-after: [start-mlp] - :end-before: [end-mlp] - - .. tab:: Convolutional Neural Network (CNN) - - .. literalinclude:: ../snippets/gaussian_model.py - :language: python - :linenos: - :start-after: [start-cnn] - :end-before: [end-cnn] + + .. tab:: MLP + + .. image:: ../_static/imgs/model_gaussian_mlp.svg + :width: 42% + :align: center + + .. raw:: html + +
+ + .. tabs:: + + .. group-tab:: nn.Sequential + + .. literalinclude:: ../snippets/gaussian_model.py + :language: python + :linenos: + :start-after: [start-mlp-sequential] + :end-before: [end-mlp-sequential] + + .. group-tab:: nn.functional + + .. literalinclude:: ../snippets/gaussian_model.py + :language: python + :linenos: + :start-after: [start-mlp-functional] + :end-before: [end-mlp-functional] + + .. tab:: CNN + + .. image:: ../_static/imgs/model_gaussian_cnn.svg + :width: 100% + :align: center + + .. raw:: html + +
+ + .. tabs:: + + .. group-tab:: nn.Sequential + + .. literalinclude:: ../snippets/gaussian_model.py + :language: python + :linenos: + :start-after: [start-cnn-sequential] + :end-before: [end-cnn-sequential] + + .. group-tab:: nn.functional + + .. literalinclude:: ../snippets/gaussian_model.py + :language: python + :linenos: + :start-after: [start-cnn-functional] + :end-before: [end-cnn-functional] + + .. tab:: RNN + + .. image:: ../_static/imgs/model_gaussian_rnn.svg + :width: 90% + :align: center + + where: + + .. math:: + \begin{aligned} + N ={} & \text{batch size} \\ + L ={} & \text{sequence length} \\ + D ={} & 2 \text{ if bidirectional=True otherwise } 1 \\ + H_{in} ={} & \text{input_size} \\ + H_{out} ={} & \text{hidden_size} + \end{aligned} + + .. raw:: html + +
+ + The following points are relevant in the definition of recurrent models: + + * The ``.get_specification()`` method must be overwritten to return, under a dictionary key ``"rnn"``, a sub-dictionary that includes the sequence length (under key ``"sequence_length"``) as a number and a list of the dimensions (under key ``"sizes"``) of each initial hidden state + + * The ``.compute()`` method's ``inputs`` parameter will have, at least, the following items in the dictionary: + + * ``"states"``: state of the environment used to make the decision + * ``"taken_actions"``: actions taken by the policy for the given states, if applicable + * ``"terminated"``: episode termination status for sampled environment transitions. This key is only defined during the training process + * ``"rnn"``: list of initial hidden states ordered according to the model specification + + * The ``.compute()`` method must inlcude, under the ``"rnn"`` key of the returned dictionary, a list of each final hidden state + + .. raw:: html + +
+ + .. tabs:: + + .. group-tab:: nn.Sequential + + .. literalinclude:: ../snippets/gaussian_model.py + :language: python + :linenos: + :start-after: [start-rnn-sequential] + :end-before: [end-rnn-sequential] + + .. group-tab:: nn.functional + + .. literalinclude:: ../snippets/gaussian_model.py + :language: python + :linenos: + :start-after: [start-rnn-functional] + :end-before: [end-rnn-functional] + + .. tab:: GRU + + .. image:: ../_static/imgs/model_gaussian_rnn.svg + :width: 90% + :align: center + + where: + + .. math:: + \begin{aligned} + N ={} & \text{batch size} \\ + L ={} & \text{sequence length} \\ + D ={} & 2 \text{ if bidirectional=True otherwise } 1 \\ + H_{in} ={} & \text{input_size} \\ + H_{out} ={} & \text{hidden_size} + \end{aligned} + + .. raw:: html + +
+ + The following points are relevant in the definition of recurrent models: + + * The ``.get_specification()`` method must be overridden to return, under a dictionary key ``"rnn"``, a sub-dictionary that includes the sequence length (under key ``"sequence_length"``) as a number and a list of the dimensions (under key ``"sizes"``) of each initial hidden state + + * The ``.compute()`` method's ``inputs`` parameter will have, at least, the following items in the dictionary: + + * ``"states"``: state of the environment used to make the decision + * ``"taken_actions"``: actions taken by the policy for the given states, if applicable + * ``"terminated"``: episode termination status for sampled environment transitions. This key is only defined during the training process + * ``"rnn"``: list of initial hidden states ordered according to the model specification + + * The ``.compute()`` method must include, under the ``"rnn"`` key of the returned dictionary, a list of each final hidden state + + .. raw:: html + +
+ + .. tabs:: + + .. group-tab:: nn.Sequential + + .. literalinclude:: ../snippets/gaussian_model.py + :language: python + :linenos: + :start-after: [start-gru-sequential] + :end-before: [end-gru-sequential] + + .. group-tab:: nn.functional + + .. literalinclude:: ../snippets/gaussian_model.py + :language: python + :linenos: + :start-after: [start-gru-functional] + :end-before: [end-gru-functional] + + .. tab:: LSTM + + .. image:: ../_static/imgs/model_gaussian_rnn.svg + :width: 90% + :align: center + + where: + + .. math:: + \begin{aligned} + N ={} & \text{batch size} \\ + L ={} & \text{sequence length} \\ + D ={} & 2 \text{ if bidirectional=True otherwise } 1 \\ + H_{in} ={} & \text{input_size} \\ + H_{cell} ={} & \text{hidden_size} \\ + H_{out} ={} & \text{proj_size if } \text{proj_size}>0 \text{ otherwise hidden_size} \\ + \end{aligned} + + .. raw:: html + +
+ + The following points are relevant in the definition of recurrent models: + + * The ``.get_specification()`` method must be overridden to return, under a dictionary key ``"rnn"``, a sub-dictionary that includes the sequence length (under key ``"sequence_length"``) as a number and a list of the dimensions (under key ``"sizes"``) of each initial hidden/cell state + + * The ``.compute()`` method's ``inputs`` parameter will have, at least, the following items in the dictionary: + + * ``"states"``: state of the environment used to make the decision + * ``"taken_actions"``: actions taken by the policy for the given states, if applicable + * ``"terminated"``: episode termination status for sampled environment transitions. This key is only defined during the training process + * ``"rnn"``: list of initial hidden/cell states ordered according to the model specification + + * The ``.compute()`` method must include, under the ``"rnn"`` key of the returned dictionary, a list of the final hidden/cell states + + .. raw:: html + +
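For the LSTM case, the only structural difference with respect to the RNN tab is that both the hidden and the cell states take part in the contract. A compact illustration of the expected dictionaries, using arbitrary example dimensions (not taken from the snippets), could look as follows; the complete implementations are shown in the tabs below.

.. code-block:: python

    # arbitrary example dimensions (D = 1, i.e. a unidirectional LSTM)
    num_layers, num_envs, hidden_size, sequence_length = 1, 4, 64, 10

    # what .get_specification() is expected to return: two entries under "sizes",
    # one for the initial hidden states and one for the initial cell states
    specification = {"rnn": {"sequence_length": sequence_length,
                             "sizes": [(num_layers, num_envs, hidden_size),    # hidden states (D * num_layers, N, Hout)
                                       (num_layers, num_envs, hidden_size)]}}  # cell states (D * num_layers, N, Hcell)

    # .compute() receives both states in inputs["rnn"] (hidden states at index 0,
    # cell states at index 1) and must return the final states under the same key
    # of its output dictionary, e.g. {"rnn": [hidden_states, cell_states]}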
+ + .. tabs:: + + .. group-tab:: nn.Sequential + + .. literalinclude:: ../snippets/gaussian_model.py + :language: python + :linenos: + :start-after: [start-lstm-sequential] + :end-before: [end-lstm-sequential] + + .. group-tab:: nn.functional + + .. literalinclude:: ../snippets/gaussian_model.py + :language: python + :linenos: + :start-after: [start-lstm-functional] + :end-before: [end-lstm-functional] API --- diff --git a/docs/source/modules/skrl.models.multivariate_gaussian.rst b/docs/source/modules/skrl.models.multivariate_gaussian.rst index 13661349..d499af25 100644 --- a/docs/source/modules/skrl.models.multivariate_gaussian.rst +++ b/docs/source/modules/skrl.models.multivariate_gaussian.rst @@ -13,7 +13,7 @@ skrl provides a Python mixin (:literal:`MultivariateGaussianMixin`) to assist in :emphasize-lines: 1 class MultivariateGaussianModel(MultivariateGaussianMixin, Model): - def __init__(self, observation_space, action_space, device="cuda:0", + def __init__(self, observation_space, action_space, device="cuda:0", clip_actions=False, clip_log_std=True, min_log_std=-20, max_log_std=2): Model.__init__(self, observation_space, action_space, device) MultivariateGaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) @@ -24,7 +24,7 @@ skrl provides a Python mixin (:literal:`MultivariateGaussianMixin`) to assist in :emphasize-lines: 4-5 class MultivariateGaussianModel(MultivariateGaussianMixin, Model): - def __init__(self, observation_space, action_space, device="cuda:0", + def __init__(self, observation_space, action_space, device="cuda:0", clip_actions=False, clip_log_std=True, min_log_std=-20, max_log_std=2): Model.__init__(self, observation_space, action_space, device) MultivariateGaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) @@ -40,23 +40,238 @@ Concept Basic usage ----------- +* Multi-Layer Perceptron (**MLP**) +* Convolutional Neural Network (**CNN**) +* Recurrent Neural Network (**RNN**) +* Gated Recurrent Unit RNN (**GRU**) +* Long Short-Term Memory RNN (**LSTM**) + .. tabs:: - - .. tab:: Multi-Layer Perceptron (MLP) - - .. literalinclude:: ../snippets/multivariate_gaussian_model.py - :language: python - :linenos: - :start-after: [start-mlp] - :end-before: [end-mlp] - - .. tab:: Convolutional Neural Network (CNN) - - .. literalinclude:: ../snippets/multivariate_gaussian_model.py - :language: python - :linenos: - :start-after: [start-cnn] - :end-before: [end-cnn] + + .. tab:: MLP + + .. image:: ../_static/imgs/model_gaussian_mlp.svg + :width: 42% + :align: center + + .. raw:: html + +
+ + .. tabs:: + + .. group-tab:: nn.Sequential + + .. literalinclude:: ../snippets/multivariate_gaussian_model.py + :language: python + :linenos: + :start-after: [start-mlp-sequential] + :end-before: [end-mlp-sequential] + + .. group-tab:: nn.functional + + .. literalinclude:: ../snippets/multivariate_gaussian_model.py + :language: python + :linenos: + :start-after: [start-mlp-functional] + :end-before: [end-mlp-functional] + + .. tab:: CNN + + .. image:: ../_static/imgs/model_gaussian_cnn.svg + :width: 100% + :align: center + + .. raw:: html + +
+ + .. tabs:: + + .. group-tab:: nn.Sequential + + .. literalinclude:: ../snippets/multivariate_gaussian_model.py + :language: python + :linenos: + :start-after: [start-cnn-sequential] + :end-before: [end-cnn-sequential] + + .. group-tab:: nn.functional + + .. literalinclude:: ../snippets/multivariate_gaussian_model.py + :language: python + :linenos: + :start-after: [start-cnn-functional] + :end-before: [end-cnn-functional] + + .. tab:: RNN + + .. image:: ../_static/imgs/model_gaussian_rnn.svg + :width: 90% + :align: center + + where: + + .. math:: + \begin{aligned} + N ={} & \text{batch size} \\ + L ={} & \text{sequence length} \\ + D ={} & 2 \text{ if bidirectional=True otherwise } 1 \\ + H_{in} ={} & \text{input_size} \\ + H_{out} ={} & \text{hidden_size} + \end{aligned} + + .. raw:: html + +
+ + The following points are relevant in the definition of recurrent models: + + * The ``.get_specification()`` method must be overridden to return, under a dictionary key ``"rnn"``, a sub-dictionary that includes the sequence length (under key ``"sequence_length"``) as a number and a list of the dimensions (under key ``"sizes"``) of each initial hidden state + + * The ``.compute()`` method's ``inputs`` parameter will have, at least, the following items in the dictionary: + + * ``"states"``: state of the environment used to make the decision + * ``"taken_actions"``: actions taken by the policy for the given states, if applicable + * ``"terminated"``: episode termination status for sampled environment transitions. This key is only defined during the training process + * ``"rnn"``: list of initial hidden states ordered according to the model specification + + * The ``.compute()`` method must include, under the ``"rnn"`` key of the returned dictionary, a list of each final hidden state + + .. raw:: html + +
+ + .. tabs:: + + .. group-tab:: nn.Sequential + + .. literalinclude:: ../snippets/multivariate_gaussian_model.py + :language: python + :linenos: + :start-after: [start-rnn-sequential] + :end-before: [end-rnn-sequential] + + .. group-tab:: nn.functional + + .. literalinclude:: ../snippets/multivariate_gaussian_model.py + :language: python + :linenos: + :start-after: [start-rnn-functional] + :end-before: [end-rnn-functional] + + .. tab:: GRU + + .. image:: ../_static/imgs/model_gaussian_rnn.svg + :width: 90% + :align: center + + where: + + .. math:: + \begin{aligned} + N ={} & \text{batch size} \\ + L ={} & \text{sequence length} \\ + D ={} & 2 \text{ if bidirectional=True otherwise } 1 \\ + H_{in} ={} & \text{input_size} \\ + H_{out} ={} & \text{hidden_size} + \end{aligned} + + .. raw:: html + +
+ + The following points are relevant in the definition of recurrent models: + + * The ``.get_specification()`` method must be overridden to return, under a dictionary key ``"rnn"``, a sub-dictionary that includes the sequence length (under key ``"sequence_length"``) as a number and a list of the dimensions (under key ``"sizes"``) of each initial hidden state + + * The ``.compute()`` method's ``inputs`` parameter will have, at least, the following items in the dictionary: + + * ``"states"``: state of the environment used to make the decision + * ``"taken_actions"``: actions taken by the policy for the given states, if applicable + * ``"terminated"``: episode termination status for sampled environment transitions. This key is only defined during the training process + * ``"rnn"``: list of initial hidden states ordered according to the model specification + + * The ``.compute()`` method must include, under the ``"rnn"`` key of the returned dictionary, a list of each final hidden state + + .. raw:: html + +
+ + .. tabs:: + + .. group-tab:: nn.Sequential + + .. literalinclude:: ../snippets/multivariate_gaussian_model.py + :language: python + :linenos: + :start-after: [start-gru-sequential] + :end-before: [end-gru-sequential] + + .. group-tab:: nn.functional + + .. literalinclude:: ../snippets/multivariate_gaussian_model.py + :language: python + :linenos: + :start-after: [start-gru-functional] + :end-before: [end-gru-functional] + + .. tab:: LSTM + + .. image:: ../_static/imgs/model_gaussian_rnn.svg + :width: 90% + :align: center + + where: + + .. math:: + \begin{aligned} + N ={} & \text{batch size} \\ + L ={} & \text{sequence length} \\ + D ={} & 2 \text{ if bidirectional=True otherwise } 1 \\ + H_{in} ={} & \text{input_size} \\ + H_{cell} ={} & \text{hidden_size} \\ + H_{out} ={} & \text{proj_size if } \text{proj_size}>0 \text{ otherwise hidden_size} \\ + \end{aligned} + + .. raw:: html + +
+ + The following points are relevant in the definition of recurrent models: + + * The ``.get_specification()`` method must be overridden to return, under a dictionary key ``"rnn"``, a sub-dictionary that includes the sequence length (under key ``"sequence_length"``) as a number and a list of the dimensions (under key ``"sizes"``) of each initial hidden/cell state + + * The ``.compute()`` method's ``inputs`` parameter will have, at least, the following items in the dictionary: + + * ``"states"``: state of the environment used to make the decision + * ``"taken_actions"``: actions taken by the policy for the given states, if applicable + * ``"terminated"``: episode termination status for sampled environment transitions. This key is only defined during the training process + * ``"rnn"``: list of initial hidden/cell states ordered according to the model specification + + * The ``.compute()`` method must include, under the ``"rnn"`` key of the returned dictionary, a list of the final hidden/cell states + + .. raw:: html + +
+ + .. tabs:: + + .. group-tab:: nn.Sequential + + .. literalinclude:: ../snippets/multivariate_gaussian_model.py + :language: python + :linenos: + :start-after: [start-lstm-sequential] + :end-before: [end-lstm-sequential] + + .. group-tab:: nn.functional + + .. literalinclude:: ../snippets/multivariate_gaussian_model.py + :language: python + :linenos: + :start-after: [start-lstm-functional] + :end-before: [end-lstm-functional] API --- diff --git a/docs/source/modules/skrl.models.tabular.rst b/docs/source/modules/skrl.models.tabular.rst index 1190c245..54ab3452 100644 --- a/docs/source/modules/skrl.models.tabular.rst +++ b/docs/source/modules/skrl.models.tabular.rst @@ -31,7 +31,7 @@ Basic usage ----------- .. tabs:: - + .. tab:: :math:`\epsilon`-greedy policy .. literalinclude:: ../snippets/tabular_model.py diff --git a/docs/source/modules/skrl.resources.noises.rst b/docs/source/modules/skrl.resources.noises.rst index e25c87c4..c4e74014 100644 --- a/docs/source/modules/skrl.resources.noises.rst +++ b/docs/source/modules/skrl.resources.noises.rst @@ -16,7 +16,7 @@ Basic usage The noise usage is defined in each agent's configuration dictionary. A noise instance is set under the :literal:`"noise"` sub-key. The following examples show how to set the noise for an agent: .. tabs:: - + .. tab:: Gaussian noise .. image:: ../_static/imgs/noise_gaussian.png @@ -73,7 +73,7 @@ API :inherited-members: :private-members: _update :members: - + .. automethod:: __init__ .. raw:: html @@ -94,7 +94,7 @@ API :inherited-members: :private-members: _update :members: - + .. automethod:: __init__ .. raw:: html @@ -116,7 +116,7 @@ Basic inheritance usage ^^^^^^^^^^^^^^^^^^^^^^^ .. tabs:: - + .. tab:: Inheritance .. literalinclude:: ../snippets/noise.py @@ -132,5 +132,5 @@ API :inherited-members: :private-members: _update :members: - + .. automethod:: __init__ diff --git a/docs/source/modules/skrl.resources.preprocessors.rst b/docs/source/modules/skrl.resources.preprocessors.rst index 21c44a10..ae237b71 100644 --- a/docs/source/modules/skrl.resources.preprocessors.rst +++ b/docs/source/modules/skrl.resources.preprocessors.rst @@ -10,7 +10,7 @@ Preprocessors Basic usage ----------- -The preprocessors usage is defined in each agent's configuration dictionary. +The preprocessors usage is defined in each agent's configuration dictionary. The preprocessor class is set under the :literal:`"_preprocessor"` key and its arguments are set under the :literal:`"_preprocessor_kwargs"` key as a keyword argument dictionary. The following examples show how to set the preprocessors for an agent: @@ -48,11 +48,11 @@ Algorithm implementation **Standardization by centering and scaling** -| :math:`\text{clip}((x - \bar{x}_t) / (\sqrt{\sigma^2} \;+` :guilabel:`epsilon` :math:`), -c, c) \qquad` with :math:`c` as :guilabel:`clip_threshold` +| :math:`\text{clip}((x - \bar{x}_t) / (\sqrt{\sigma^2} \;+` :guilabel:`epsilon` :math:`), -c, c) \qquad` with :math:`c` as :guilabel:`clip_threshold` **Scale back the data to the original representation (inverse transform)** -| :math:`\sqrt{\sigma^2_t} \; \text{clip}(x, -c, c) + \bar{x}_t \qquad` with :math:`c` as :guilabel:`clip_threshold` +| :math:`\sqrt{\sigma^2_t} \; \text{clip}(x, -c, c) + \bar{x}_t \qquad` with :math:`c` as :guilabel:`clip_threshold` **Update the running mean and variance** (See `parallel algorithm `_) @@ -69,5 +69,5 @@ API .. autoclass:: skrl.resources.preprocessors.torch.running_standard_scaler.RunningStandardScaler :members: - + .. 
automethod:: __init__ diff --git a/docs/source/modules/skrl.resources.schedulers.rst b/docs/source/modules/skrl.resources.schedulers.rst index a879dae6..a3a76071 100644 --- a/docs/source/modules/skrl.resources.schedulers.rst +++ b/docs/source/modules/skrl.resources.schedulers.rst @@ -15,7 +15,7 @@ Basic usage The learning rate scheduler usage is defined in each agent's configuration dictionary. The scheduler class is set under the :literal:`"learning_rate_scheduler"` key and its arguments are set under the :literal:`"learning_rate_scheduler_kwargs"` key as a keyword argument dictionary, without specifying the optimizer (first argument). The following examples show how to set the scheduler for an agent: .. tabs:: - + .. tab:: PyTorch scheduler .. code-block:: python @@ -66,5 +66,5 @@ API :show-inheritance: :inherited-members: :members: - + .. automethod:: __init__ diff --git a/docs/source/modules/skrl.trainers.base_class.rst b/docs/source/modules/skrl.trainers.base_class.rst index 7b76d574..660c4b6b 100644 --- a/docs/source/modules/skrl.trainers.base_class.rst +++ b/docs/source/modules/skrl.trainers.base_class.rst @@ -11,7 +11,7 @@ Basic inheritance usage ^^^^^^^^^^^^^^^^^^^^^^^ .. tabs:: - + .. tab:: Inheritance .. literalinclude:: ../snippets/trainer.py @@ -29,6 +29,6 @@ API :inherited-members: :private-members: _setup_agents :members: - + .. automethod:: __init__ .. automethod:: __str__ diff --git a/docs/source/modules/skrl.trainers.manual.rst b/docs/source/modules/skrl.trainers.manual.rst index 61c43c86..320827d0 100644 --- a/docs/source/modules/skrl.trainers.manual.rst +++ b/docs/source/modules/skrl.trainers.manual.rst @@ -13,7 +13,7 @@ Basic usage ^^^^^^^^^^^ .. tabs:: - + .. tab:: Snippet .. literalinclude:: ../snippets/trainer.py @@ -29,7 +29,7 @@ Configuration .. literalinclude:: ../../../skrl/trainers/torch/manual.py :language: python - :lines: 14-17 + :lines: 14-18 :linenos: API diff --git a/docs/source/modules/skrl.trainers.parallel.rst b/docs/source/modules/skrl.trainers.parallel.rst index 4e3751e3..843a489e 100644 --- a/docs/source/modules/skrl.trainers.parallel.rst +++ b/docs/source/modules/skrl.trainers.parallel.rst @@ -21,7 +21,7 @@ Basic usage At the moment, only simultaneous training and evaluation of agents with local memory (no memory sharing) is implemented .. tabs:: - + .. tab:: Snippet .. literalinclude:: ../snippets/trainer.py @@ -37,7 +37,7 @@ Configuration .. literalinclude:: ../../../skrl/trainers/torch/parallel.py :language: python - :lines: 15-18 + :lines: 15-19 :linenos: API diff --git a/docs/source/modules/skrl.trainers.sequential.rst b/docs/source/modules/skrl.trainers.sequential.rst index 9866a420..3807ea3e 100644 --- a/docs/source/modules/skrl.trainers.sequential.rst +++ b/docs/source/modules/skrl.trainers.sequential.rst @@ -13,7 +13,7 @@ Basic usage ^^^^^^^^^^^ .. tabs:: - + .. tab:: Snippet .. literalinclude:: ../snippets/trainer.py @@ -29,7 +29,7 @@ Configuration .. 
literalinclude:: ../../../skrl/trainers/torch/sequential.py :language: python - :lines: 14-17 + :lines: 14-18 :linenos: API diff --git a/docs/source/modules/skrl.utils.isaacgym_utils.rst b/docs/source/modules/skrl.utils.isaacgym_utils.rst index 6ccd74cc..6df0454f 100644 --- a/docs/source/modules/skrl.utils.isaacgym_utils.rst +++ b/docs/source/modules/skrl.utils.isaacgym_utils.rst @@ -39,7 +39,7 @@ API Web viewer for development without X server ------------------------------------------- -This library provides an API for instantiating a lightweight web viewer useful, mostly, for designing Isaac Gym environments in remote workstations or docker containers without X server +This library provides an API for instantiating a lightweight web viewer useful, mostly, for designing Isaac Gym environments in remote workstations or docker containers without X server Gestures and actions ^^^^^^^^^^^^^^^^^^^^ @@ -85,7 +85,7 @@ Basic usage ^^^^^^^^^^^ .. tabs:: - + .. tab:: Snippet .. literalinclude:: ../snippets/isaacgym_utils.py @@ -102,5 +102,5 @@ API :inherited-members: :private-members: _route_index, _route_stream, _route_input_event, _stream :members: - + .. automethod:: __init__ diff --git a/docs/source/modules/skrl.utils.model_instantiators.rst b/docs/source/modules/skrl.utils.model_instantiators.rst index 536f4d04..56c62e90 100644 --- a/docs/source/modules/skrl.utils.model_instantiators.rst +++ b/docs/source/modules/skrl.utils.model_instantiators.rst @@ -10,26 +10,26 @@ API ^^^ .. autoclass:: skrl.utils.model_instantiators.Shape - + .. py:property:: ONE Flag to indicate that the model's input/output has shape (1,) - - This flag is useful for the definition of critic models, where the critic's output is a scalar + + This flag is useful for the definition of critic models, where the critic's output is a scalar .. py:property:: STATES Flag to indicate that the model's input/output is the state (observation) space of the environment It is an alias for :py:attr:`OBSERVATIONS` - + .. py:property:: OBSERVATIONS Flag to indicate that the model's input/output is the observation space of the environment - + .. py:property:: ACTIONS Flag to indicate that the model's input/output is the action space of the environment - + .. py:property:: STATES_ACTIONS Flag to indicate that the model's input/output is the combination (concatenation) of the state (observation) and action spaces of the environment diff --git a/docs/source/modules/skrl.utils.omniverse_isaacgym_utils.rst b/docs/source/modules/skrl.utils.omniverse_isaacgym_utils.rst index 8b4102e5..b3ea9380 100644 --- a/docs/source/modules/skrl.utils.omniverse_isaacgym_utils.rst +++ b/docs/source/modules/skrl.utils.omniverse_isaacgym_utils.rst @@ -42,4 +42,4 @@ OmniIsaacGymEnvs-like environment instance API """ -.. autofunction:: skrl.utils.omniverse_isaacgym_utils.get_env_instance \ No newline at end of file +.. autofunction:: skrl.utils.omniverse_isaacgym_utils.get_env_instance diff --git a/docs/source/modules/skrl.utils.postprocessing.rst b/docs/source/modules/skrl.utils.postprocessing.rst index 98341e1a..d4914d82 100644 --- a/docs/source/modules/skrl.utils.postprocessing.rst +++ b/docs/source/modules/skrl.utils.postprocessing.rst @@ -10,7 +10,7 @@ Basic usage ^^^^^^^^^^^ .. tabs:: - + .. tab:: PyTorch (.pt) .. literalinclude:: ../snippets/utils_postprocessing.py @@ -47,7 +47,7 @@ API :inherited-members: :private-members: _format_numpy, _format_torch, _format_csv :members: - + .. automethod:: __init__ .. automethod:: __iter__ .. 
automethod:: __next__ @@ -70,7 +70,7 @@ Basic usage ^^^^^^^^^^^ .. tabs:: - + .. tab:: Tensorboard (events.out.tfevents.*) .. literalinclude:: ../snippets/utils_postprocessing.py @@ -88,7 +88,7 @@ API :show-inheritance: :inherited-members: :members: - + .. automethod:: __init__ .. automethod:: __iter__ - .. automethod:: __next__ \ No newline at end of file + .. automethod:: __next__ diff --git a/docs/source/snippets/agent.py b/docs/source/snippets/agent.py index 1f27f789..2b16d5e9 100644 --- a/docs/source/snippets/agent.py +++ b/docs/source/snippets/agent.py @@ -1,4 +1,4 @@ -from typing import Union, Tuple, Dict, Any +from typing import Union, Tuple, Dict, Any, Optional import gym @@ -18,18 +18,21 @@ "checkpoint_interval": 1000, # interval for checkpoints (timesteps) "store_separately": False, # whether to store checkpoints separately + + "wandb": False, # whether to use Weights & Biases + "wandb_kwargs": {} # wandb kwargs (see https://docs.wandb.ai/ref/python/init) } } class CUSTOM(Agent): - def __init__(self, - models: Dict[str, Model], - memory: Union[Memory, None] = None, - observation_space: Union[int, Tuple[int], gym.Space, None] = None, - action_space: Union[int, Tuple[int], gym.Space, None] = None, - device: Union[str, torch.device] = "cuda:0", - cfg: dict = {}) -> None: + def __init__(self, + models: Dict[str, Model], + memory: Optional[Memory] = None, + observation_space: Optional[Union[int, Tuple[int], gym.Space]] = None, + action_space: Optional[Union[int, Tuple[int], gym.Space]] = None, + device: Union[str, torch.device] = "cuda:0", + cfg: Optional[dict] = None) -> None: """ :param models: Models used by the agent :type models: dictionary of skrl.models.torch.Model @@ -44,12 +47,12 @@ def __init__(self, :param cfg: Configuration dictionary :type cfg: dict """ - CUSTOM_DEFAULT_CONFIG.update(cfg) - super().__init__(models=models, - memory=memory, - observation_space=observation_space, - action_space=action_space, - device=device, + CUSTOM_DEFAULT_CONFIG.update(cfg if cfg is not None else {}) + super().__init__(models=models, + memory=memory, + observation_space=observation_space, + action_space=action_space, + device=device, cfg=CUSTOM_DEFAULT_CONFIG) # ===================================================================== # - get and process models from self.models @@ -59,10 +62,10 @@ def __init__(self, # - set up preprocessors # ===================================================================== - def init(self) -> None: + def init(self, trainer_cfg: Optional[Dict[str, Any]] = None) -> None: """Initialize the agent """ - super().init() + super().init(trainer_cfg=trainer_cfg) self.set_mode("eval") # ================================================================= # - create tensors in memory if required @@ -89,17 +92,17 @@ def act(self, states: torch.Tensor, timestep: int, timesteps: int) -> torch.Tens # sample and return agent's actions # ====================================== - def record_transition(self, - states: torch.Tensor, - actions: torch.Tensor, - rewards: torch.Tensor, - next_states: torch.Tensor, - dones: torch.Tensor, - infos: Any, - timestep: int, + def record_transition(self, + states: torch.Tensor, + actions: torch.Tensor, + rewards: torch.Tensor, + next_states: torch.Tensor, + dones: torch.Tensor, + infos: Any, + timestep: int, timesteps: int) -> None: """Record an environment transition in memory - + :param states: Observations/states of the environment used to make the decision :type states: torch.Tensor :param actions: Actions taken by the agent diff --git 
a/docs/source/snippets/categorical_model.py b/docs/source/snippets/categorical_model.py index 66fcc47b..d54ff91e 100644 --- a/docs/source/snippets/categorical_model.py +++ b/docs/source/snippets/categorical_model.py @@ -1,4 +1,35 @@ -# [start-mlp] +# [start-mlp-sequential] +import torch +import torch.nn as nn + +from skrl.models.torch import Model, CategoricalMixin + + +# define the model +class MLP(CategoricalMixin, Model): + def __init__(self, observation_space, action_space, device, unnormalized_log_prob=True): + Model.__init__(self, observation_space, action_space, device) + CategoricalMixin.__init__(self, unnormalized_log_prob) + + self.net = nn.Sequential(nn.Linear(self.num_observations, 64), + nn.ReLU(), + nn.Linear(64, 32), + nn.ReLU(), + nn.Linear(32, self.num_actions)) + + def compute(self, inputs, role): + return self.net(inputs["states"]), {} + + +# instantiate the model (assumes there is a wrapped environment: env) +policy = MLP(observation_space=env.observation_space, + action_space=env.action_space, + device=env.device, + unnormalized_log_prob=True) +# [end-mlp-sequential] + +# [start-mlp-functional] +import torch import torch.nn as nn import torch.nn.functional as F @@ -11,26 +42,29 @@ def __init__(self, observation_space, action_space, device, unnormalized_log_pro Model.__init__(self, observation_space, action_space, device) CategoricalMixin.__init__(self, unnormalized_log_prob) - self.linear_layer_1 = nn.Linear(self.num_observations, 64) - self.linear_layer_2 = nn.Linear(64, 32) - self.output_layer = nn.Linear(32, self.num_actions) + self.fc1 = nn.Linear(self.num_observations, 64) + self.fc2 = nn.Linear(64, 32) + self.logits = nn.Linear(32, self.num_actions) - def compute(self, states, taken_actions, role): - x = F.relu(self.linear_layer_1(states)) - x = F.relu(self.linear_layer_2(x)) - return self.output_layer(x) + def compute(self, inputs, role): + x = self.fc1(inputs["states"]) + x = F.relu(x) + x = self.fc2(x) + x = F.relu(x) + return self.logits(x), {} # instantiate the model (assumes there is a wrapped environment: env) -policy = MLP(observation_space=env.observation_space, - action_space=env.action_space, - device=env.device, +policy = MLP(observation_space=env.observation_space, + action_space=env.action_space, + device=env.device, unnormalized_log_prob=True) -# [end-mlp] +# [end-mlp-functional] # ============================================================================= -# [start-cnn] +# [start-cnn-sequential] +import torch import torch.nn as nn from skrl.models.torch import Model, CategoricalMixin @@ -49,7 +83,7 @@ def __init__(self, observation_space, action_space, device, unnormalized_log_pro nn.Conv2d(64, 64, kernel_size=3, stride=1), nn.ReLU(), nn.Flatten(), - nn.Linear(9216, 512), + nn.Linear(1024, 512), nn.ReLU(), nn.Linear(512, 16), nn.Tanh(), @@ -59,14 +93,604 @@ def __init__(self, observation_space, action_space, device, unnormalized_log_pro nn.Tanh(), nn.Linear(32, self.num_actions)) - def compute(self, states, taken_actions, role): - # permute (samples, width, height, channels) -> (samples, channels, width, height) - return self.net(states.permute(0, 3, 1, 2)) + def compute(self, inputs, role): + # permute (samples, width * height * channels) -> (samples, channels, width, height) + return self.net(inputs["states"].view(-1, *self.observation_space.shape).permute(0, 3, 1, 2)), {} # instantiate the model (assumes there is a wrapped environment: env) -policy = CNN(observation_space=env.observation_space, - action_space=env.action_space, - 
device=env.device, +policy = CNN(observation_space=env.observation_space, + action_space=env.action_space, + device=env.device, unnormalized_log_prob=True) -# [end-cnn] +# [end-cnn-sequential] + +# [start-cnn-functional] +import torch +import torch.nn as nn +import torch.nn.functional as F + +from skrl.models.torch import Model, CategoricalMixin + + +# define the model +class CNN(CategoricalMixin, Model): + def __init__(self, observation_space, action_space, device, unnormalized_log_prob=True): + Model.__init__(self, observation_space, action_space, device) + CategoricalMixin.__init__(self, unnormalized_log_prob) + + self.conv1 = nn.Conv2d(3, 32, kernel_size=8, stride=4) + self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2) + self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1) + self.fc1 = nn.Linear(1024, 512) + self.fc2 = nn.Linear(512, 16) + self.fc3 = nn.Linear(16, 64) + self.fc4 = nn.Linear(64, 32) + self.fc5 = nn.Linear(32, self.num_actions) + + def compute(self, inputs, role): + # permute (samples, width * height * channels) -> (samples, channels, width, height) + x = inputs["states"].view(-1, *self.observation_space.shape).permute(0, 3, 1, 2) + x = self.conv1(x) + x = F.relu(x) + x = self.conv2(x) + x = F.relu(x) + x = self.conv3(x) + x = F.relu(x) + x = torch.flatten(x, start_dim=1) + x = self.fc1(x) + x = F.relu(x) + x = self.fc2(x) + x = torch.tanh(x) + x = self.fc3(x) + x = torch.tanh(x) + x = self.fc4(x) + x = torch.tanh(x) + x = self.fc5(x) + return x, {} + + +# instantiate the model (assumes there is a wrapped environment: env) +policy = CNN(observation_space=env.observation_space, + action_space=env.action_space, + device=env.device, + unnormalized_log_prob=True) +# [end-cnn-functional] + +# ============================================================================= + +# [start-rnn-sequential] +import torch +import torch.nn as nn + +from skrl.models.torch import Model, CategoricalMixin + + +# define the model +class RNN(CategoricalMixin, Model): + def __init__(self, observation_space, action_space, device, unnormalized_log_prob=True, + num_envs=1, num_layers=1, hidden_size=64, sequence_length=10): + Model.__init__(self, observation_space, action_space, device) + CategoricalMixin.__init__(self, unnormalized_log_prob) + + self.num_envs = num_envs + self.num_layers = num_layers + self.hidden_size = hidden_size # Hout + self.sequence_length = sequence_length + + self.rnn = nn.RNN(input_size=self.num_observations, + hidden_size=self.hidden_size, + num_layers=self.num_layers, + batch_first=True) # batch_first -> (batch, sequence, features) + + self.net = nn.Sequential(nn.Linear(self.hidden_size, 64), + nn.ReLU(), + nn.Linear(64, 32), + nn.ReLU(), + nn.Linear(32, self.num_actions)) + + def get_specification(self): + # batch size (N) is the number of envs during rollout + return {"rnn": {"sequence_length": self.sequence_length, + "sizes": [(self.num_layers, self.num_envs, self.hidden_size)]}} # hidden states (D ∗ num_layers, N, Hout) + + def compute(self, inputs, role): + states = inputs["states"] + terminated = inputs.get("terminated", None) + hidden_states = inputs["rnn"][0] + + # training + if self.training: + rnn_input = states.view(-1, self.sequence_length, states.shape[-1]) # (N, L, Hin): N=batch_size, L=sequence_length + hidden_states = hidden_states.view(self.num_layers, -1, self.sequence_length, hidden_states.shape[-1]) # (D * num_layers, N, L, Hout) + # get the hidden states corresponding to the initial sequence + hidden_states = 
hidden_states[:,:,0,:].contiguous() # (D * num_layers, N, Hout) + + # reset the RNN state in the middle of a sequence + if terminated is not None and torch.any(terminated): + rnn_outputs = [] + terminated = terminated.view(-1, self.sequence_length) + indexes = [0] + (terminated[:,:-1].any(dim=0).nonzero(as_tuple=True)[0] + 1).tolist() + [self.sequence_length] + + for i in range(len(indexes) - 1): + i0, i1 = indexes[i], indexes[i + 1] + rnn_output, hidden_states = self.rnn(rnn_input[:,i0:i1,:], hidden_states) + hidden_states[:, (terminated[:,i1-1]), :] = 0 + rnn_outputs.append(rnn_output) + + rnn_output = torch.cat(rnn_outputs, dim=1) + # no need to reset the RNN state in the sequence + else: + rnn_output, hidden_states = self.rnn(rnn_input, hidden_states) + # rollout + else: + rnn_input = states.view(-1, 1, states.shape[-1]) # (N, L, Hin): N=num_envs, L=1 + rnn_output, hidden_states = self.rnn(rnn_input, hidden_states) + + # flatten the RNN output + rnn_output = torch.flatten(rnn_output, start_dim=0, end_dim=1) # (N, L, D ∗ Hout) -> (N * L, D ∗ Hout) + + return self.net(rnn_output), {"rnn": [hidden_states]} + + +# instantiate the model (assumes there is a wrapped environment: env) +policy = RNN(observation_space=env.observation_space, + action_space=env.action_space, + device=env.device, + unnormalized_log_prob=True, + num_envs=env.num_envs, + num_layers=1, + hidden_size=64, + sequence_length=10) +# [end-rnn-sequential] + +# [start-rnn-functional] +import torch +import torch.nn as nn +import torch.nn.functional as F + +from skrl.models.torch import Model, CategoricalMixin + + +# define the model +class RNN(CategoricalMixin, Model): + def __init__(self, observation_space, action_space, device, unnormalized_log_prob=True, + num_envs=1, num_layers=1, hidden_size=64, sequence_length=10): + Model.__init__(self, observation_space, action_space, device) + CategoricalMixin.__init__(self, unnormalized_log_prob) + + self.num_envs = num_envs + self.num_layers = num_layers + self.hidden_size = hidden_size # Hout + self.sequence_length = sequence_length + + self.rnn = nn.RNN(input_size=self.num_observations, + hidden_size=self.hidden_size, + num_layers=self.num_layers, + batch_first=True) # batch_first -> (batch, sequence, features) + + self.fc1 = nn.Linear(self.hidden_size, 64) + self.fc2 = nn.Linear(64, 32) + self.logits = nn.Linear(32, self.num_actions) + + def get_specification(self): + # batch size (N) is the number of envs during rollout + return {"rnn": {"sequence_length": self.sequence_length, + "sizes": [(self.num_layers, self.num_envs, self.hidden_size)]}} # hidden states (D ∗ num_layers, N, Hout) + + def compute(self, inputs, role): + states = inputs["states"] + terminated = inputs.get("terminated", None) + hidden_states = inputs["rnn"][0] + + # training + if self.training: + rnn_input = states.view(-1, self.sequence_length, states.shape[-1]) # (N, L, Hin): N=batch_size, L=sequence_length + hidden_states = hidden_states.view(self.num_layers, -1, self.sequence_length, hidden_states.shape[-1]) # (D * num_layers, N, L, Hout) + # get the hidden states corresponding to the initial sequence + hidden_states = hidden_states[:,:,0,:].contiguous() # (D * num_layers, N, Hout) + + # reset the RNN state in the middle of a sequence + if terminated is not None and torch.any(terminated): + rnn_outputs = [] + terminated = terminated.view(-1, self.sequence_length) + indexes = [0] + (terminated[:,:-1].any(dim=0).nonzero(as_tuple=True)[0] + 1).tolist() + [self.sequence_length] + + for i in range(len(indexes) - 
1): + i0, i1 = indexes[i], indexes[i + 1] + rnn_output, hidden_states = self.rnn(rnn_input[:,i0:i1,:], hidden_states) + hidden_states[:, (terminated[:,i1-1]), :] = 0 + rnn_outputs.append(rnn_output) + + rnn_output = torch.cat(rnn_outputs, dim=1) + # no need to reset the RNN state in the sequence + else: + rnn_output, hidden_states = self.rnn(rnn_input, hidden_states) + # rollout + else: + rnn_input = states.view(-1, 1, states.shape[-1]) # (N, L, Hin): N=num_envs, L=1 + rnn_output, hidden_states = self.rnn(rnn_input, hidden_states) + + # flatten the RNN output + rnn_output = torch.flatten(rnn_output, start_dim=0, end_dim=1) # (N, L, D ∗ Hout) -> (N * L, D ∗ Hout) + + x = self.fc1(rnn_output) + x = F.relu(x) + x = self.fc2(x) + x = F.relu(x) + + return self.logits(x), {"rnn": [hidden_states]} + + +# instantiate the model (assumes there is a wrapped environment: env) +policy = RNN(observation_space=env.observation_space, + action_space=env.action_space, + device=env.device, + unnormalized_log_prob=True, + num_envs=env.num_envs, + num_layers=1, + hidden_size=64, + sequence_length=10) +# [end-rnn-functional] + +# ============================================================================= + +# [start-gru-sequential] +import torch +import torch.nn as nn + +from skrl.models.torch import Model, CategoricalMixin + + +# define the model +class GRU(CategoricalMixin, Model): + def __init__(self, observation_space, action_space, device, unnormalized_log_prob=True, + num_envs=1, num_layers=1, hidden_size=64, sequence_length=10): + Model.__init__(self, observation_space, action_space, device) + CategoricalMixin.__init__(self, unnormalized_log_prob) + + self.num_envs = num_envs + self.num_layers = num_layers + self.hidden_size = hidden_size # Hout + self.sequence_length = sequence_length + + self.gru = nn.GRU(input_size=self.num_observations, + hidden_size=self.hidden_size, + num_layers=self.num_layers, + batch_first=True) # batch_first -> (batch, sequence, features) + + self.net = nn.Sequential(nn.Linear(self.hidden_size, 64), + nn.ReLU(), + nn.Linear(64, 32), + nn.ReLU(), + nn.Linear(32, self.num_actions)) + + def get_specification(self): + # batch size (N) is the number of envs during rollout + return {"rnn": {"sequence_length": self.sequence_length, + "sizes": [(self.num_layers, self.num_envs, self.hidden_size)]}} # hidden states (D ∗ num_layers, N, Hout) + + def compute(self, inputs, role): + states = inputs["states"] + terminated = inputs.get("terminated", None) + hidden_states = inputs["rnn"][0] + + # training + if self.training: + rnn_input = states.view(-1, self.sequence_length, states.shape[-1]) # (N, L, Hin): N=batch_size, L=sequence_length + hidden_states = hidden_states.view(self.num_layers, -1, self.sequence_length, hidden_states.shape[-1]) # (D * num_layers, N, L, Hout) + # get the hidden states corresponding to the initial sequence + hidden_states = hidden_states[:,:,0,:].contiguous() # (D * num_layers, N, Hout) + + # reset the RNN state in the middle of a sequence + if terminated is not None and torch.any(terminated): + rnn_outputs = [] + terminated = terminated.view(-1, self.sequence_length) + indexes = [0] + (terminated[:,:-1].any(dim=0).nonzero(as_tuple=True)[0] + 1).tolist() + [self.sequence_length] + + for i in range(len(indexes) - 1): + i0, i1 = indexes[i], indexes[i + 1] + rnn_output, hidden_states = self.gru(rnn_input[:,i0:i1,:], hidden_states) + hidden_states[:, (terminated[:,i1-1]), :] = 0 + rnn_outputs.append(rnn_output) + + rnn_output = torch.cat(rnn_outputs, dim=1) + # no 
need to reset the RNN state in the sequence + else: + rnn_output, hidden_states = self.gru(rnn_input, hidden_states) + # rollout + else: + rnn_input = states.view(-1, 1, states.shape[-1]) # (N, L, Hin): N=num_envs, L=1 + rnn_output, hidden_states = self.gru(rnn_input, hidden_states) + + # flatten the RNN output + rnn_output = torch.flatten(rnn_output, start_dim=0, end_dim=1) # (N, L, D ∗ Hout) -> (N * L, D ∗ Hout) + + return self.net(rnn_output), {"rnn": [hidden_states]} + + +# instantiate the model (assumes there is a wrapped environment: env) +policy = GRU(observation_space=env.observation_space, + action_space=env.action_space, + device=env.device, + unnormalized_log_prob=True, + num_envs=env.num_envs, + num_layers=1, + hidden_size=64, + sequence_length=10) +# [end-gru-sequential] + +# [start-gru-functional] +import torch +import torch.nn as nn +import torch.nn.functional as F + +from skrl.models.torch import Model, CategoricalMixin + + +# define the model +class GRU(CategoricalMixin, Model): + def __init__(self, observation_space, action_space, device, unnormalized_log_prob=True, + num_envs=1, num_layers=1, hidden_size=64, sequence_length=10): + Model.__init__(self, observation_space, action_space, device) + CategoricalMixin.__init__(self, unnormalized_log_prob) + + self.num_envs = num_envs + self.num_layers = num_layers + self.hidden_size = hidden_size # Hout + self.sequence_length = sequence_length + + self.gru = nn.GRU(input_size=self.num_observations, + hidden_size=self.hidden_size, + num_layers=self.num_layers, + batch_first=True) # batch_first -> (batch, sequence, features) + + self.fc1 = nn.Linear(self.hidden_size, 64) + self.fc2 = nn.Linear(64, 32) + self.logits = nn.Linear(32, self.num_actions) + + def get_specification(self): + # batch size (N) is the number of envs during rollout + return {"rnn": {"sequence_length": self.sequence_length, + "sizes": [(self.num_layers, self.num_envs, self.hidden_size)]}} # hidden states (D ∗ num_layers, N, Hout) + + def compute(self, inputs, role): + states = inputs["states"] + terminated = inputs.get("terminated", None) + hidden_states = inputs["rnn"][0] + + # training + if self.training: + rnn_input = states.view(-1, self.sequence_length, states.shape[-1]) # (N, L, Hin): N=batch_size, L=sequence_length + hidden_states = hidden_states.view(self.num_layers, -1, self.sequence_length, hidden_states.shape[-1]) # (D * num_layers, N, L, Hout) + # get the hidden states corresponding to the initial sequence + hidden_states = hidden_states[:,:,0,:].contiguous() # (D * num_layers, N, Hout) + + # reset the RNN state in the middle of a sequence + if terminated is not None and torch.any(terminated): + rnn_outputs = [] + terminated = terminated.view(-1, self.sequence_length) + indexes = [0] + (terminated[:,:-1].any(dim=0).nonzero(as_tuple=True)[0] + 1).tolist() + [self.sequence_length] + + for i in range(len(indexes) - 1): + i0, i1 = indexes[i], indexes[i + 1] + rnn_output, hidden_states = self.gru(rnn_input[:,i0:i1,:], hidden_states) + hidden_states[:, (terminated[:,i1-1]), :] = 0 + rnn_outputs.append(rnn_output) + + rnn_output = torch.cat(rnn_outputs, dim=1) + # no need to reset the RNN state in the sequence + else: + rnn_output, hidden_states = self.gru(rnn_input, hidden_states) + # rollout + else: + rnn_input = states.view(-1, 1, states.shape[-1]) # (N, L, Hin): N=num_envs, L=1 + rnn_output, hidden_states = self.gru(rnn_input, hidden_states) + + # flatten the RNN output + rnn_output = torch.flatten(rnn_output, start_dim=0, end_dim=1) # (N, L, D ∗ Hout) 
-> (N * L, D ∗ Hout) + + x = self.fc1(rnn_output) + x = F.relu(x) + x = self.fc2(x) + x = F.relu(x) + + return self.logits(x), {"rnn": [hidden_states]} + + +# instantiate the model (assumes there is a wrapped environment: env) +policy = GRU(observation_space=env.observation_space, + action_space=env.action_space, + device=env.device, + unnormalized_log_prob=True, + num_envs=env.num_envs, + num_layers=1, + hidden_size=64, + sequence_length=10) +# [end-gru-functional] + +# ============================================================================= + +# [start-lstm-sequential] +import torch +import torch.nn as nn + +from skrl.models.torch import Model, CategoricalMixin + + +# define the model +class LSTM(CategoricalMixin, Model): + def __init__(self, observation_space, action_space, device, unnormalized_log_prob=True, + num_envs=1, num_layers=1, hidden_size=64, sequence_length=10): + Model.__init__(self, observation_space, action_space, device) + CategoricalMixin.__init__(self, unnormalized_log_prob) + + self.num_envs = num_envs + self.num_layers = num_layers + self.hidden_size = hidden_size # Hcell (Hout is Hcell because proj_size = 0) + self.sequence_length = sequence_length + + self.lstm = nn.LSTM(input_size=self.num_observations, + hidden_size=self.hidden_size, + num_layers=self.num_layers, + batch_first=True) # batch_first -> (batch, sequence, features) + + self.net = nn.Sequential(nn.Linear(self.hidden_size, 64), + nn.ReLU(), + nn.Linear(64, 32), + nn.ReLU(), + nn.Linear(32, self.num_actions)) + + def get_specification(self): + # batch size (N) is the number of envs during rollout + return {"rnn": {"sequence_length": self.sequence_length, + "sizes": [(self.num_layers, self.num_envs, self.hidden_size), # hidden states (D ∗ num_layers, N, Hout) + (self.num_layers, self.num_envs, self.hidden_size)]}} # cell states (D ∗ num_layers, N, Hcell) + + def compute(self, inputs, role): + states = inputs["states"] + terminated = inputs.get("terminated", None) + hidden_states, cell_states = inputs["rnn"][0], inputs["rnn"][1] + + # training + if self.training: + rnn_input = states.view(-1, self.sequence_length, states.shape[-1]) # (N, L, Hin): N=batch_size, L=sequence_length + hidden_states = hidden_states.view(self.num_layers, -1, self.sequence_length, hidden_states.shape[-1]) # (D * num_layers, N, L, Hout) + cell_states = cell_states.view(self.num_layers, -1, self.sequence_length, cell_states.shape[-1]) # (D * num_layers, N, L, Hcell) + # get the hidden/cell states corresponding to the initial sequence + hidden_states = hidden_states[:,:,0,:].contiguous() # (D * num_layers, N, Hout) + cell_states = cell_states[:,:,0,:].contiguous() # (D * num_layers, N, Hcell) + + # reset the RNN state in the middle of a sequence + if terminated is not None and torch.any(terminated): + rnn_outputs = [] + terminated = terminated.view(-1, self.sequence_length) + indexes = [0] + (terminated[:,:-1].any(dim=0).nonzero(as_tuple=True)[0] + 1).tolist() + [self.sequence_length] + + for i in range(len(indexes) - 1): + i0, i1 = indexes[i], indexes[i + 1] + rnn_output, (hidden_states, cell_states) = self.lstm(rnn_input[:,i0:i1,:], (hidden_states, cell_states)) + hidden_states[:, (terminated[:,i1-1]), :] = 0 + cell_states[:, (terminated[:,i1-1]), :] = 0 + rnn_outputs.append(rnn_output) + + rnn_states = (hidden_states, cell_states) + rnn_output = torch.cat(rnn_outputs, dim=1) + # no need to reset the RNN state in the sequence + else: + rnn_output, rnn_states = self.lstm(rnn_input, (hidden_states, cell_states)) + # rollout + 
else: + rnn_input = states.view(-1, 1, states.shape[-1]) # (N, L, Hin): N=num_envs, L=1 + rnn_output, rnn_states = self.lstm(rnn_input, (hidden_states, cell_states)) + + # flatten the RNN output + rnn_output = torch.flatten(rnn_output, start_dim=0, end_dim=1) # (N, L, D ∗ Hout) -> (N * L, D ∗ Hout) + + return self.net(rnn_output), {"rnn": [rnn_states[0], rnn_states[1]]} + + +# instantiate the model (assumes there is a wrapped environment: env) +policy = LSTM(observation_space=env.observation_space, + action_space=env.action_space, + device=env.device, + unnormalized_log_prob=True, + num_envs=env.num_envs, + num_layers=1, + hidden_size=64, + sequence_length=10) +# [end-lstm-sequential] + +# [start-lstm-functional] +import torch +import torch.nn as nn +import torch.nn.functional as F + +from skrl.models.torch import Model, CategoricalMixin + + +# define the model +class LSTM(CategoricalMixin, Model): + def __init__(self, observation_space, action_space, device, unnormalized_log_prob=True, + num_envs=1, num_layers=1, hidden_size=64, sequence_length=10): + Model.__init__(self, observation_space, action_space, device) + CategoricalMixin.__init__(self, unnormalized_log_prob) + + self.num_envs = num_envs + self.num_layers = num_layers + self.hidden_size = hidden_size # Hcell (Hout is Hcell because proj_size = 0) + self.sequence_length = sequence_length + + self.lstm = nn.LSTM(input_size=self.num_observations, + hidden_size=self.hidden_size, + num_layers=self.num_layers, + batch_first=True) # batch_first -> (batch, sequence, features) + + self.fc1 = nn.Linear(self.hidden_size, 64) + self.fc2 = nn.Linear(64, 32) + self.logits = nn.Linear(32, self.num_actions) + + def get_specification(self): + # batch size (N) is the number of envs during rollout + return {"rnn": {"sequence_length": self.sequence_length, + "sizes": [(self.num_layers, self.num_envs, self.hidden_size), # hidden states (D ∗ num_layers, N, Hout) + (self.num_layers, self.num_envs, self.hidden_size)]}} # cell states (D ∗ num_layers, N, Hcell) + + def compute(self, inputs, role): + states = inputs["states"] + terminated = inputs.get("terminated", None) + hidden_states, cell_states = inputs["rnn"][0], inputs["rnn"][1] + + # training + if self.training: + rnn_input = states.view(-1, self.sequence_length, states.shape[-1]) # (N, L, Hin): N=batch_size, L=sequence_length + hidden_states = hidden_states.view(self.num_layers, -1, self.sequence_length, hidden_states.shape[-1]) # (D * num_layers, N, L, Hout) + cell_states = cell_states.view(self.num_layers, -1, self.sequence_length, cell_states.shape[-1]) # (D * num_layers, N, L, Hcell) + # get the hidden/cell states corresponding to the initial sequence + hidden_states = hidden_states[:,:,0,:].contiguous() # (D * num_layers, N, Hout) + cell_states = cell_states[:,:,0,:].contiguous() # (D * num_layers, N, Hcell) + + # reset the RNN state in the middle of a sequence + if terminated is not None and torch.any(terminated): + rnn_outputs = [] + terminated = terminated.view(-1, self.sequence_length) + indexes = [0] + (terminated[:,:-1].any(dim=0).nonzero(as_tuple=True)[0] + 1).tolist() + [self.sequence_length] + + for i in range(len(indexes) - 1): + i0, i1 = indexes[i], indexes[i + 1] + rnn_output, (hidden_states, cell_states) = self.lstm(rnn_input[:,i0:i1,:], (hidden_states, cell_states)) + hidden_states[:, (terminated[:,i1-1]), :] = 0 + cell_states[:, (terminated[:,i1-1]), :] = 0 + rnn_outputs.append(rnn_output) + + rnn_states = (hidden_states, cell_states) + rnn_output = torch.cat(rnn_outputs, dim=1) 
+ # no need to reset the RNN state in the sequence + else: + rnn_output, rnn_states = self.lstm(rnn_input, (hidden_states, cell_states)) + # rollout + else: + rnn_input = states.view(-1, 1, states.shape[-1]) # (N, L, Hin): N=num_envs, L=1 + rnn_output, rnn_states = self.lstm(rnn_input, (hidden_states, cell_states)) + + # flatten the RNN output + rnn_output = torch.flatten(rnn_output, start_dim=0, end_dim=1) # (N, L, D ∗ Hout) -> (N * L, D ∗ Hout) + + x = self.fc1(rnn_output) + x = F.relu(x) + x = self.fc2(x) + x = F.relu(x) + + return self.logits(x), {"rnn": [rnn_states[0], rnn_states[1]]} + + +# instantiate the model (assumes there is a wrapped environment: env) +policy = LSTM(observation_space=env.observation_space, + action_space=env.action_space, + device=env.device, + unnormalized_log_prob=True, + num_envs=env.num_envs, + num_layers=1, + hidden_size=64, + sequence_length=10) +# [end-lstm-functional] diff --git a/docs/source/snippets/deterministic_model.py b/docs/source/snippets/deterministic_model.py index d735ef63..5026367d 100644 --- a/docs/source/snippets/deterministic_model.py +++ b/docs/source/snippets/deterministic_model.py @@ -1,4 +1,4 @@ -# [start-mlp] +# [start-mlp-sequential] import torch import torch.nn as nn @@ -17,20 +17,53 @@ def __init__(self, observation_space, action_space, device, clip_actions=False): nn.ReLU(), nn.Linear(32, 1)) - def compute(self, states, taken_actions, role): - return self.net(torch.cat([states, taken_actions], dim=1)) + def compute(self, inputs, role): + return self.net(torch.cat([inputs["states"], inputs["taken_actions"]], dim=1)), {} # instantiate the model (assumes there is a wrapped environment: env) -policy = MLP(observation_space=env.observation_space, - action_space=env.action_space, - device=env.device, +critic = MLP(observation_space=env.observation_space, + action_space=env.action_space, + device=env.device, clip_actions=False) -# [end-mlp] +# [end-mlp-sequential] + +# [start-mlp-functional] +import torch +import torch.nn as nn +import torch.nn.functional as F + +from skrl.models.torch import Model, DeterministicMixin + + +# define the model +class MLP(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) + + self.fc1 = nn.Linear(self.num_observations + self.num_actions, 64) + self.fc2 = nn.Linear(64, 32) + self.fc3 = nn.Linear(32, 1) + + def compute(self, inputs, role): + x = self.fc1(torch.cat([inputs["states"], inputs["taken_actions"]], dim=1)) + x = F.relu(x) + x = self.fc2(x) + x = F.relu(x) + return self.fc3(x), {} + + +# instantiate the model (assumes there is a wrapped environment: env) +critic = MLP(observation_space=env.observation_space, + action_space=env.action_space, + device=env.device, + clip_actions=False) +# [end-mlp-functional] # ============================================================================= -# [start-cnn] +# [start-cnn-sequential] import torch import torch.nn as nn @@ -43,32 +76,605 @@ def __init__(self, observation_space, action_space, device, clip_actions=False): Model.__init__(self, observation_space, action_space, device) DeterministicMixin.__init__(self, clip_actions) - self.features_extractor = nn.Sequential(nn.Conv2d(3, 32, kernel_size=8, stride=3), + self.features_extractor = nn.Sequential(nn.Conv2d(3, 32, kernel_size=8, stride=4), nn.ReLU(), nn.Conv2d(32, 64, kernel_size=4, stride=2), nn.ReLU(), - nn.Conv2d(64, 64, 
kernel_size=2, stride=1), + nn.Conv2d(64, 64, kernel_size=3, stride=1), nn.ReLU(), nn.Flatten(), - nn.Linear(3136, 512), + nn.Linear(1024, 512), nn.ReLU(), nn.Linear(512, 16), nn.Tanh()) - self.net = nn.Sequential(nn.Linear(16 + self.num_actions, 32), + + self.net = nn.Sequential(nn.Linear(16 + self.num_actions, 64), nn.Tanh(), - nn.Linear(32, 32), + nn.Linear(64, 32), nn.Tanh(), nn.Linear(32, 1)) - def compute(self, states, taken_actions, role): - # permute (samples, width, height, channels) -> (samples, channels, width, height) - x = self.features_extractor(states.permute(0, 3, 1, 2)) - return self.net(torch.cat([x, taken_actions], dim=1)) + def compute(self, inputs, role): + # permute (samples, width * height * channels) -> (samples, channels, width, height) + x = self.features_extractor(inputs["states"].view(-1, *self.observation_space.shape).permute(0, 3, 1, 2)) + return self.net(torch.cat([x, inputs["taken_actions"]], dim=1)), {} # instantiate the model (assumes there is a wrapped environment: env) -policy = CNN(observation_space=env.observation_space, - action_space=env.action_space, - device=env.device, +critic = CNN(observation_space=env.observation_space, + action_space=env.action_space, + device=env.device, clip_actions=False) -# [end-cnn] +# [end-cnn-sequential] + +# [start-cnn-functional] +import torch +import torch.nn as nn +import torch.nn.functional as F + +from skrl.models.torch import Model, DeterministicMixin + + +# define the model +class CNN(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) + + self.conv1 = nn.Conv2d(3, 32, kernel_size=8, stride=4) + self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2) + self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1) + self.fc1 = nn.Linear(1024, 512) + self.fc2 = nn.Linear(512, 16) + self.fc3 = nn.Linear(16 + self.num_actions, 64) + self.fc4 = nn.Linear(64, 32) + self.fc5 = nn.Linear(32, 1) + + def compute(self, inputs, role): + # permute (samples, width * height * channels) -> (samples, channels, width, height) + x = inputs["states"].view(-1, *self.observation_space.shape).permute(0, 3, 1, 2) + x = self.conv1(x) + x = F.relu(x) + x = self.conv2(x) + x = F.relu(x) + x = self.conv3(x) + x = F.relu(x) + x = torch.flatten(x, start_dim=1) + x = self.fc1(x) + x = F.relu(x) + x = self.fc2(x) + x = torch.tanh(x) + x = self.fc3(torch.cat([x, inputs["taken_actions"]], dim=1)) + x = torch.tanh(x) + x = self.fc4(x) + x = torch.tanh(x) + x = self.fc5(x) + return x, {} + + +# instantiate the model (assumes there is a wrapped environment: env) +critic = CNN(observation_space=env.observation_space, + action_space=env.action_space, + device=env.device, + clip_actions=False) +# [end-cnn-functional] + +# ============================================================================= + +# [start-rnn-sequential] +import torch +import torch.nn as nn + +from skrl.models.torch import Model, DeterministicMixin + + +# define the model +class RNN(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + num_envs=1, num_layers=1, hidden_size=64, sequence_length=10): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) + + self.num_envs = num_envs + self.num_layers = num_layers + self.hidden_size = hidden_size # Hout + self.sequence_length = sequence_length + + 
self.rnn = nn.RNN(input_size=self.num_observations, + hidden_size=self.hidden_size, + num_layers=self.num_layers, + batch_first=True) # batch_first -> (batch, sequence, features) + + self.net = nn.Sequential(nn.Linear(self.hidden_size + self.num_actions, 64), + nn.ReLU(), + nn.Linear(64, 32), + nn.ReLU(), + nn.Linear(32, 1)) + + def get_specification(self): + # batch size (N) is the number of envs during rollout + return {"rnn": {"sequence_length": self.sequence_length, + "sizes": [(self.num_layers, self.num_envs, self.hidden_size)]}} # hidden states (D ∗ num_layers, N, Hout) + + def compute(self, inputs, role): + states = inputs["states"] + terminated = inputs.get("terminated", None) + hidden_states = inputs["rnn"][0] + + # critic models are only used during training + rnn_input = states.view(-1, self.sequence_length, states.shape[-1]) # (N, L, Hin): N=batch_size, L=sequence_length + + hidden_states = hidden_states.view(self.num_layers, -1, self.sequence_length, hidden_states.shape[-1]) # (D * num_layers, N, L, Hout) + # get the hidden states corresponding to the initial sequence + sequence_index = 1 if role == "target_critic" else 0 # target networks act on the next state of the environment + hidden_states = hidden_states[:,:,sequence_index,:].contiguous() # (D * num_layers, N, Hout) + + # reset the RNN state in the middle of a sequence + if terminated is not None and torch.any(terminated): + rnn_outputs = [] + terminated = terminated.view(-1, self.sequence_length) + indexes = [0] + (terminated[:,:-1].any(dim=0).nonzero(as_tuple=True)[0] + 1).tolist() + [self.sequence_length] + + for i in range(len(indexes) - 1): + i0, i1 = indexes[i], indexes[i + 1] + rnn_output, hidden_states = self.rnn(rnn_input[:,i0:i1,:], hidden_states) + hidden_states[:, (terminated[:,i1-1]), :] = 0 + rnn_outputs.append(rnn_output) + + rnn_output = torch.cat(rnn_outputs, dim=1) + # no need to reset the RNN state in the sequence + else: + rnn_output, hidden_states = self.rnn(rnn_input, hidden_states) + + # flatten the RNN output + rnn_output = torch.flatten(rnn_output, start_dim=0, end_dim=1) # (N, L, D ∗ Hout) -> (N * L, D ∗ Hout) + + return self.net(torch.cat([rnn_output, inputs["taken_actions"]], dim=1)), {"rnn": [hidden_states]} + + +# instantiate the model (assumes there is a wrapped environment: env) +critic = RNN(observation_space=env.observation_space, + action_space=env.action_space, + device=env.device, + clip_actions=False, + num_envs=env.num_envs, + num_layers=1, + hidden_size=64, + sequence_length=10) +# [end-rnn-sequential] + +# [start-rnn-functional] +import torch +import torch.nn as nn +import torch.nn.functional as F + +from skrl.models.torch import Model, DeterministicMixin + + +# define the model +class RNN(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + num_envs=1, num_layers=1, hidden_size=64, sequence_length=10): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) + + self.num_envs = num_envs + self.num_layers = num_layers + self.hidden_size = hidden_size # Hout + self.sequence_length = sequence_length + + self.rnn = nn.RNN(input_size=self.num_observations, + hidden_size=self.hidden_size, + num_layers=self.num_layers, + batch_first=True) # batch_first -> (batch, sequence, features) + + self.fc1 = nn.Linear(self.hidden_size + self.num_actions, 64) + self.fc2 = nn.Linear(64, 32) + self.fc3 = nn.Linear(32, 1) + + def get_specification(self): + # batch size (N) is the number 
of envs during rollout + return {"rnn": {"sequence_length": self.sequence_length, + "sizes": [(self.num_layers, self.num_envs, self.hidden_size)]}} # hidden states (D ∗ num_layers, N, Hout) + + def compute(self, inputs, role): + states = inputs["states"] + terminated = inputs.get("terminated", None) + hidden_states = inputs["rnn"][0] + + # critic models are only used during training + rnn_input = states.view(-1, self.sequence_length, states.shape[-1]) # (N, L, Hin): N=batch_size, L=sequence_length + + hidden_states = hidden_states.view(self.num_layers, -1, self.sequence_length, hidden_states.shape[-1]) # (D * num_layers, N, L, Hout) + # get the hidden states corresponding to the initial sequence + sequence_index = 1 if role == "target_critic" else 0 # target networks act on the next state of the environment + hidden_states = hidden_states[:,:,sequence_index,:].contiguous() # (D * num_layers, N, Hout) + + # reset the RNN state in the middle of a sequence + if terminated is not None and torch.any(terminated): + rnn_outputs = [] + terminated = terminated.view(-1, self.sequence_length) + indexes = [0] + (terminated[:,:-1].any(dim=0).nonzero(as_tuple=True)[0] + 1).tolist() + [self.sequence_length] + + for i in range(len(indexes) - 1): + i0, i1 = indexes[i], indexes[i + 1] + rnn_output, hidden_states = self.rnn(rnn_input[:,i0:i1,:], hidden_states) + hidden_states[:, (terminated[:,i1-1]), :] = 0 + rnn_outputs.append(rnn_output) + + rnn_output = torch.cat(rnn_outputs, dim=1) + # no need to reset the RNN state in the sequence + else: + rnn_output, hidden_states = self.rnn(rnn_input, hidden_states) + + # flatten the RNN output + rnn_output = torch.flatten(rnn_output, start_dim=0, end_dim=1) # (N, L, D ∗ Hout) -> (N * L, D ∗ Hout) + + x = self.fc1(torch.cat([rnn_output, inputs["taken_actions"]], dim=1)) + x = F.relu(x) + x = self.fc2(x) + x = F.relu(x) + + return self.fc3(x), {"rnn": [hidden_states]} + + +# instantiate the model (assumes there is a wrapped environment: env) +critic = RNN(observation_space=env.observation_space, + action_space=env.action_space, + device=env.device, + clip_actions=False, + num_envs=env.num_envs, + num_layers=1, + hidden_size=64, + sequence_length=10) +# [end-rnn-functional] + +# ============================================================================= + +# [start-gru-sequential] +import torch +import torch.nn as nn + +from skrl.models.torch import Model, DeterministicMixin + + +# define the model +class GRU(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + num_envs=1, num_layers=1, hidden_size=64, sequence_length=10): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) + + self.num_envs = num_envs + self.num_layers = num_layers + self.hidden_size = hidden_size # Hout + self.sequence_length = sequence_length + + self.gru = nn.GRU(input_size=self.num_observations, + hidden_size=self.hidden_size, + num_layers=self.num_layers, + batch_first=True) # batch_first -> (batch, sequence, features) + + self.net = nn.Sequential(nn.Linear(self.hidden_size + self.num_actions, 64), + nn.ReLU(), + nn.Linear(64, 32), + nn.ReLU(), + nn.Linear(32, 1)) + + def get_specification(self): + # batch size (N) is the number of envs during rollout + return {"rnn": {"sequence_length": self.sequence_length, + "sizes": [(self.num_layers, self.num_envs, self.hidden_size)]}} # hidden states (D ∗ num_layers, N, Hout) + + def compute(self, inputs, role): + states = 
inputs["states"] + terminated = inputs.get("terminated", None) + hidden_states = inputs["rnn"][0] + + # critic models are only used during training + rnn_input = states.view(-1, self.sequence_length, states.shape[-1]) # (N, L, Hin): N=batch_size, L=sequence_length + + hidden_states = hidden_states.view(self.num_layers, -1, self.sequence_length, hidden_states.shape[-1]) # (D * num_layers, N, L, Hout) + # get the hidden states corresponding to the initial sequence + sequence_index = 1 if role == "target_critic" else 0 # target networks act on the next state of the environment + hidden_states = hidden_states[:,:,sequence_index,:].contiguous() # (D * num_layers, N, Hout) + + # reset the RNN state in the middle of a sequence + if terminated is not None and torch.any(terminated): + rnn_outputs = [] + terminated = terminated.view(-1, self.sequence_length) + indexes = [0] + (terminated[:,:-1].any(dim=0).nonzero(as_tuple=True)[0] + 1).tolist() + [self.sequence_length] + + for i in range(len(indexes) - 1): + i0, i1 = indexes[i], indexes[i + 1] + rnn_output, hidden_states = self.gru(rnn_input[:,i0:i1,:], hidden_states) + hidden_states[:, (terminated[:,i1-1]), :] = 0 + rnn_outputs.append(rnn_output) + + rnn_output = torch.cat(rnn_outputs, dim=1) + # no need to reset the RNN state in the sequence + else: + rnn_output, hidden_states = self.gru(rnn_input, hidden_states) + + # flatten the RNN output + rnn_output = torch.flatten(rnn_output, start_dim=0, end_dim=1) # (N, L, D ∗ Hout) -> (N * L, D ∗ Hout) + + return self.net(torch.cat([rnn_output, inputs["taken_actions"]], dim=1)), {"rnn": [hidden_states]} + + +# instantiate the model (assumes there is a wrapped environment: env) +critic = GRU(observation_space=env.observation_space, + action_space=env.action_space, + device=env.device, + clip_actions=False, + num_envs=env.num_envs, + num_layers=1, + hidden_size=64, + sequence_length=10) +# [end-gru-sequential] + +# [start-gru-functional] +import torch +import torch.nn as nn +import torch.nn.functional as F + +from skrl.models.torch import Model, DeterministicMixin + + +# define the model +class GRU(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + num_envs=1, num_layers=1, hidden_size=64, sequence_length=10): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) + + self.num_envs = num_envs + self.num_layers = num_layers + self.hidden_size = hidden_size # Hout + self.sequence_length = sequence_length + + self.gru = nn.GRU(input_size=self.num_observations, + hidden_size=self.hidden_size, + num_layers=self.num_layers, + batch_first=True) # batch_first -> (batch, sequence, features) + + self.fc1 = nn.Linear(self.hidden_size + self.num_actions, 64) + self.fc2 = nn.Linear(64, 32) + self.fc3 = nn.Linear(32, 1) + + def get_specification(self): + # batch size (N) is the number of envs during rollout + return {"rnn": {"sequence_length": self.sequence_length, + "sizes": [(self.num_layers, self.num_envs, self.hidden_size)]}} # hidden states (D ∗ num_layers, N, Hout) + + def compute(self, inputs, role): + states = inputs["states"] + terminated = inputs.get("terminated", None) + hidden_states = inputs["rnn"][0] + + # critic models are only used during training + rnn_input = states.view(-1, self.sequence_length, states.shape[-1]) # (N, L, Hin): N=batch_size, L=sequence_length + + hidden_states = hidden_states.view(self.num_layers, -1, self.sequence_length, hidden_states.shape[-1]) # (D * num_layers, 
N, L, Hout) + # get the hidden states corresponding to the initial sequence + sequence_index = 1 if role == "target_critic" else 0 # target networks act on the next state of the environment + hidden_states = hidden_states[:,:,sequence_index,:].contiguous() # (D * num_layers, N, Hout) + + # reset the RNN state in the middle of a sequence + if terminated is not None and torch.any(terminated): + rnn_outputs = [] + terminated = terminated.view(-1, self.sequence_length) + indexes = [0] + (terminated[:,:-1].any(dim=0).nonzero(as_tuple=True)[0] + 1).tolist() + [self.sequence_length] + + for i in range(len(indexes) - 1): + i0, i1 = indexes[i], indexes[i + 1] + rnn_output, hidden_states = self.gru(rnn_input[:,i0:i1,:], hidden_states) + hidden_states[:, (terminated[:,i1-1]), :] = 0 + rnn_outputs.append(rnn_output) + + rnn_output = torch.cat(rnn_outputs, dim=1) + # no need to reset the RNN state in the sequence + else: + rnn_output, hidden_states = self.gru(rnn_input, hidden_states) + + # flatten the RNN output + rnn_output = torch.flatten(rnn_output, start_dim=0, end_dim=1) # (N, L, D ∗ Hout) -> (N * L, D ∗ Hout) + + x = self.fc1(torch.cat([rnn_output, inputs["taken_actions"]], dim=1)) + x = F.relu(x) + x = self.fc2(x) + x = F.relu(x) + + return self.fc3(x), {"rnn": [hidden_states]} + + +# instantiate the model (assumes there is a wrapped environment: env) +critic = GRU(observation_space=env.observation_space, + action_space=env.action_space, + device=env.device, + clip_actions=False, + num_envs=env.num_envs, + num_layers=1, + hidden_size=64, + sequence_length=10) +# [end-gru-functional] + +# ============================================================================= + +# [start-lstm-sequential] +import torch +import torch.nn as nn + +from skrl.models.torch import Model, DeterministicMixin + + +# define the model +class LSTM(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + num_envs=1, num_layers=1, hidden_size=64, sequence_length=10): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) + + self.num_envs = num_envs + self.num_layers = num_layers + self.hidden_size = hidden_size # Hcell (Hout is Hcell because proj_size = 0) + self.sequence_length = sequence_length + + self.lstm = nn.LSTM(input_size=self.num_observations, + hidden_size=self.hidden_size, + num_layers=self.num_layers, + batch_first=True) # batch_first -> (batch, sequence, features) + + self.net = nn.Sequential(nn.Linear(self.hidden_size + self.num_actions, 64), + nn.ReLU(), + nn.Linear(64, 32), + nn.ReLU(), + nn.Linear(32, 1)) + + def get_specification(self): + # batch size (N) is the number of envs during rollout + return {"rnn": {"sequence_length": self.sequence_length, + "sizes": [(self.num_layers, self.num_envs, self.hidden_size), # hidden states (D ∗ num_layers, N, Hout) + (self.num_layers, self.num_envs, self.hidden_size)]}} # cell states (D ∗ num_layers, N, Hcell) + + def compute(self, inputs, role): + states = inputs["states"] + terminated = inputs.get("terminated", None) + hidden_states, cell_states = inputs["rnn"][0], inputs["rnn"][1] + + # critic models are only used during training + rnn_input = states.view(-1, self.sequence_length, states.shape[-1]) # (N, L, Hin): N=batch_size, L=sequence_length + + hidden_states = hidden_states.view(self.num_layers, -1, self.sequence_length, hidden_states.shape[-1]) # (D * num_layers, N, L, Hout) + cell_states = cell_states.view(self.num_layers, -1, 
self.sequence_length, cell_states.shape[-1]) # (D * num_layers, N, L, Hcell) + # get the hidden/cell states corresponding to the initial sequence + sequence_index = 1 if role == "target_critic" else 0 # target networks act on the next state of the environment + hidden_states = hidden_states[:,:,sequence_index,:].contiguous() # (D * num_layers, N, Hout) + cell_states = cell_states[:,:,sequence_index,:].contiguous() # (D * num_layers, N, Hcell) + + # reset the RNN state in the middle of a sequence + if terminated is not None and torch.any(terminated): + rnn_outputs = [] + terminated = terminated.view(-1, self.sequence_length) + indexes = [0] + (terminated[:,:-1].any(dim=0).nonzero(as_tuple=True)[0] + 1).tolist() + [self.sequence_length] + + for i in range(len(indexes) - 1): + i0, i1 = indexes[i], indexes[i + 1] + rnn_output, (hidden_states, cell_states) = self.lstm(rnn_input[:,i0:i1,:], (hidden_states, cell_states)) + hidden_states[:, (terminated[:,i1-1]), :] = 0 + cell_states[:, (terminated[:,i1-1]), :] = 0 + rnn_outputs.append(rnn_output) + + rnn_states = (hidden_states, cell_states) + rnn_output = torch.cat(rnn_outputs, dim=1) + # no need to reset the RNN state in the sequence + else: + rnn_output, rnn_states = self.lstm(rnn_input, (hidden_states, cell_states)) + + # flatten the RNN output + rnn_output = torch.flatten(rnn_output, start_dim=0, end_dim=1) # (N, L, D ∗ Hout) -> (N * L, D ∗ Hout) + + return self.net(torch.cat([rnn_output, inputs["taken_actions"]], dim=1)), {"rnn": [rnn_states[0], rnn_states[1]]} + + +# instantiate the model (assumes there is a wrapped environment: env) +critic = LSTM(observation_space=env.observation_space, + action_space=env.action_space, + device=env.device, + clip_actions=False, + num_envs=env.num_envs, + num_layers=1, + hidden_size=64, + sequence_length=10) +# [end-lstm-sequential] + +# [start-lstm-functional] +import torch +import torch.nn as nn +import torch.nn.functional as F + +from skrl.models.torch import Model, DeterministicMixin + + +# define the model +class LSTM(DeterministicMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + num_envs=1, num_layers=1, hidden_size=64, sequence_length=10): + Model.__init__(self, observation_space, action_space, device) + DeterministicMixin.__init__(self, clip_actions) + + self.num_envs = num_envs + self.num_layers = num_layers + self.hidden_size = hidden_size # Hcell (Hout is Hcell because proj_size = 0) + self.sequence_length = sequence_length + + self.lstm = nn.LSTM(input_size=self.num_observations, + hidden_size=self.hidden_size, + num_layers=self.num_layers, + batch_first=True) # batch_first -> (batch, sequence, features) + + self.fc1 = nn.Linear(self.hidden_size + self.num_actions, 64) + self.fc2 = nn.Linear(64, 32) + self.fc3 = nn.Linear(32, 1) + + def get_specification(self): + # batch size (N) is the number of envs during rollout + return {"rnn": {"sequence_length": self.sequence_length, + "sizes": [(self.num_layers, self.num_envs, self.hidden_size), # hidden states (D ∗ num_layers, N, Hout) + (self.num_layers, self.num_envs, self.hidden_size)]}} # cell states (D ∗ num_layers, N, Hcell) + + def compute(self, inputs, role): + states = inputs["states"] + terminated = inputs.get("terminated", None) + hidden_states, cell_states = inputs["rnn"][0], inputs["rnn"][1] + + # critic models are only used during training + rnn_input = states.view(-1, self.sequence_length, states.shape[-1]) # (N, L, Hin): N=batch_size, L=sequence_length + + hidden_states = 
hidden_states.view(self.num_layers, -1, self.sequence_length, hidden_states.shape[-1]) # (D * num_layers, N, L, Hout) + cell_states = cell_states.view(self.num_layers, -1, self.sequence_length, cell_states.shape[-1]) # (D * num_layers, N, L, Hcell) + # get the hidden/cell states corresponding to the initial sequence + sequence_index = 1 if role == "target_critic" else 0 # target networks act on the next state of the environment + hidden_states = hidden_states[:,:,sequence_index,:].contiguous() # (D * num_layers, N, Hout) + cell_states = cell_states[:,:,sequence_index,:].contiguous() # (D * num_layers, N, Hcell) + + # reset the RNN state in the middle of a sequence + if terminated is not None and torch.any(terminated): + rnn_outputs = [] + terminated = terminated.view(-1, self.sequence_length) + indexes = [0] + (terminated[:,:-1].any(dim=0).nonzero(as_tuple=True)[0] + 1).tolist() + [self.sequence_length] + + for i in range(len(indexes) - 1): + i0, i1 = indexes[i], indexes[i + 1] + rnn_output, (hidden_states, cell_states) = self.lstm(rnn_input[:,i0:i1,:], (hidden_states, cell_states)) + hidden_states[:, (terminated[:,i1-1]), :] = 0 + cell_states[:, (terminated[:,i1-1]), :] = 0 + rnn_outputs.append(rnn_output) + + rnn_states = (hidden_states, cell_states) + rnn_output = torch.cat(rnn_outputs, dim=1) + # no need to reset the RNN state in the sequence + else: + rnn_output, rnn_states = self.lstm(rnn_input, (hidden_states, cell_states)) + + # flatten the RNN output + rnn_output = torch.flatten(rnn_output, start_dim=0, end_dim=1) # (N, L, D ∗ Hout) -> (N * L, D ∗ Hout) + + x = self.fc1(torch.cat([rnn_output, inputs["taken_actions"]], dim=1)) + x = F.relu(x) + x = self.fc2(x) + x = F.relu(x) + + return self.fc3(x), {"rnn": [rnn_states[0], rnn_states[1]]} + + +# instantiate the model (assumes there is a wrapped environment: env) +critic = LSTM(observation_space=env.observation_space, + action_space=env.action_space, + device=env.device, + clip_actions=False, + num_envs=env.num_envs, + num_layers=1, + hidden_size=64, + sequence_length=10) +# [end-lstm-functional] diff --git a/docs/source/snippets/gaussian_model.py b/docs/source/snippets/gaussian_model.py index 50ef6afe..02d95fc3 100644 --- a/docs/source/snippets/gaussian_model.py +++ b/docs/source/snippets/gaussian_model.py @@ -1,4 +1,42 @@ -# [start-mlp] +# [start-mlp-sequential] +import torch +import torch.nn as nn + +from skrl.models.torch import Model, GaussianMixin + + +# define the model +class MLP(GaussianMixin, Model): + def __init__(self, observation_space, action_space, device, + clip_actions=False, clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"): + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction) + + self.net = nn.Sequential(nn.Linear(self.num_observations, 64), + nn.ReLU(), + nn.Linear(64, 32), + nn.ReLU(), + nn.Linear(32, self.num_actions), + nn.Tanh()) + + self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + def compute(self, inputs, role): + return self.net(inputs["states"]), self.log_std_parameter, {} + + +# instantiate the model (assumes there is a wrapped environment: env) +policy = MLP(observation_space=env.observation_space, + action_space=env.action_space, + device=env.device, + clip_actions=True, + clip_log_std=True, + min_log_std=-20, + max_log_std=2, + reduction="sum") +# [end-mlp-sequential] + +# [start-mlp-functional] import torch import torch.nn as nn import 
torch.nn.functional as F @@ -8,83 +46,729 @@ # define the model class MLP(GaussianMixin, Model): - def __init__(self, observation_space, action_space, device, + def __init__(self, observation_space, action_space, device, clip_actions=False, clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"): Model.__init__(self, observation_space, action_space, device) GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction) - self.linear_layer_1 = nn.Linear(self.num_observations, 128) - self.linear_layer_2 = nn.Linear(128, 64) - self.linear_layer_3 = nn.Linear(64, 32) - self.mean_action_layer = nn.Linear(32, self.num_actions) + self.fc1 = nn.Linear(self.num_observations, 64) + self.fc2 = nn.Linear(64, 32) + self.fc3 = nn.Linear(32, self.num_actions) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - def compute(self, states, taken_actions, role): - x = F.relu(self.linear_layer_1(states)) - x = F.relu(self.linear_layer_2(x)) - x = F.relu(self.linear_layer_3(x)) - return torch.tanh(self.mean_action_layer(x)), self.log_std_parameter + def compute(self, inputs, role): + x = self.fc1(inputs["states"]) + x = F.relu(x) + x = self.fc2(x) + x = F.relu(x) + x = self.fc3(x) + return torch.tanh(x), self.log_std_parameter, {} + # instantiate the model (assumes there is a wrapped environment: env) -policy = MLP(observation_space=env.observation_space, - action_space=env.action_space, - device=env.device, +policy = MLP(observation_space=env.observation_space, + action_space=env.action_space, + device=env.device, clip_actions=True, clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum") -# [end-mlp] +# [end-mlp-functional] # ============================================================================= -# [start-cnn] +# [start-cnn-sequential] import torch import torch.nn as nn -import torch.nn.functional as F from skrl.models.torch import Model, GaussianMixin # define the model class CNN(GaussianMixin, Model): - def __init__(self, observation_space, action_space, device, + def __init__(self, observation_space, action_space, device, clip_actions=False, clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"): Model.__init__(self, observation_space, action_space, device) GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction) - self.net = nn.Sequential(nn.Conv2d(1, 64, kernel_size=4, stride=2), + self.net = nn.Sequential(nn.Conv2d(3, 32, kernel_size=8, stride=4), nn.ReLU(), - nn.Conv2d(64, 32, kernel_size=4, stride=2), + nn.Conv2d(32, 64, kernel_size=4, stride=2), nn.ReLU(), - nn.Conv2d(32, 16, kernel_size=2, stride=2), - nn.ReLU(), - nn.Conv2d(16, 8, kernel_size=2, stride=2), + nn.Conv2d(64, 64, kernel_size=3, stride=1), nn.ReLU(), nn.Flatten(), - nn.Linear(1800, 256), + nn.Linear(1024, 512), nn.ReLU(), - nn.Linear(256, 16), + nn.Linear(512, 16), + nn.Tanh(), + nn.Linear(16, 64), nn.Tanh(), - nn.Linear(16, 32), + nn.Linear(64, 32), nn.Tanh(), nn.Linear(32, self.num_actions)) - + self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - def compute(self, states, taken_actions, role): - # permute (samples, width, height, channels) -> (samples, channels, width, height) - return self.net(states.permute(0, 3, 1, 2)), self.log_std_parameter + def compute(self, inputs, role): + # permute (samples, width * height * channels) -> (samples, channels, width, height) + return self.net(inputs["states"].view(-1, *self.observation_space.shape).permute(0, 3, 1, 2)), self.log_std_parameter, {} # 
instantiate the model (assumes there is a wrapped environment: env) -policy = CNN(observation_space=env.observation_space, - action_space=env.action_space, - device=env.device, +policy = CNN(observation_space=env.observation_space, + action_space=env.action_space, + device=env.device, clip_actions=True, clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum") -# [end-cnn] +# [end-cnn-sequential] + +# [start-cnn-functional] +import torch +import torch.nn as nn +import torch.nn.functional as F + +from skrl.models.torch import Model, GaussianMixin + + +# define the model +class CNN(GaussianMixin, Model): + def __init__(self, observation_space, action_space, device, + clip_actions=False, clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"): + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction) + + self.conv1 = nn.Conv2d(3, 32, kernel_size=8, stride=4) + self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2) + self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1) + self.fc1 = nn.Linear(1024, 512) + self.fc2 = nn.Linear(512, 16) + self.fc3 = nn.Linear(16, 64) + self.fc4 = nn.Linear(64, 32) + self.fc5 = nn.Linear(32, self.num_actions) + + self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + def compute(self, inputs, role): + # permute (samples, width * height * channels) -> (samples, channels, width, height) + x = inputs["states"].view(-1, *self.observation_space.shape).permute(0, 3, 1, 2) + x = self.conv1(x) + x = F.relu(x) + x = self.conv2(x) + x = F.relu(x) + x = self.conv3(x) + x = F.relu(x) + x = torch.flatten(x, start_dim=1) + x = self.fc1(x) + x = F.relu(x) + x = self.fc2(x) + x = torch.tanh(x) + x = self.fc3(x) + x = torch.tanh(x) + x = self.fc4(x) + x = torch.tanh(x) + x = self.fc5(x) + return x, self.log_std_parameter, {} + + +# instantiate the model (assumes there is a wrapped environment: env) +policy = CNN(observation_space=env.observation_space, + action_space=env.action_space, + device=env.device, + clip_actions=True, + clip_log_std=True, + min_log_std=-20, + max_log_std=2, + reduction="sum") +# [end-cnn-functional] + +# ============================================================================= + +# [start-rnn-sequential] +import torch +import torch.nn as nn + +from skrl.models.torch import Model, GaussianMixin + + +# define the model +class RNN(GaussianMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum", + num_envs=1, num_layers=1, hidden_size=64, sequence_length=10): + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction) + + self.num_envs = num_envs + self.num_layers = num_layers + self.hidden_size = hidden_size # Hout + self.sequence_length = sequence_length + + self.rnn = nn.RNN(input_size=self.num_observations, + hidden_size=self.hidden_size, + num_layers=self.num_layers, + batch_first=True) # batch_first -> (batch, sequence, features) + + self.net = nn.Sequential(nn.Linear(self.hidden_size, 64), + nn.ReLU(), + nn.Linear(64, 32), + nn.ReLU(), + nn.Linear(32, self.num_actions), + nn.Tanh()) + + self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + def get_specification(self): + # batch size (N) is the number of envs during rollout + return {"rnn": {"sequence_length": self.sequence_length, 
+ "sizes": [(self.num_layers, self.num_envs, self.hidden_size)]}} # hidden states (D ∗ num_layers, N, Hout) + + def compute(self, inputs, role): + states = inputs["states"] + terminated = inputs.get("terminated", None) + hidden_states = inputs["rnn"][0] + + # training + if self.training: + rnn_input = states.view(-1, self.sequence_length, states.shape[-1]) # (N, L, Hin): N=batch_size, L=sequence_length + hidden_states = hidden_states.view(self.num_layers, -1, self.sequence_length, hidden_states.shape[-1]) # (D * num_layers, N, L, Hout) + # get the hidden states corresponding to the initial sequence + hidden_states = hidden_states[:,:,0,:].contiguous() # (D * num_layers, N, Hout) + + # reset the RNN state in the middle of a sequence + if terminated is not None and torch.any(terminated): + rnn_outputs = [] + terminated = terminated.view(-1, self.sequence_length) + indexes = [0] + (terminated[:,:-1].any(dim=0).nonzero(as_tuple=True)[0] + 1).tolist() + [self.sequence_length] + + for i in range(len(indexes) - 1): + i0, i1 = indexes[i], indexes[i + 1] + rnn_output, hidden_states = self.rnn(rnn_input[:,i0:i1,:], hidden_states) + hidden_states[:, (terminated[:,i1-1]), :] = 0 + rnn_outputs.append(rnn_output) + + rnn_output = torch.cat(rnn_outputs, dim=1) + # no need to reset the RNN state in the sequence + else: + rnn_output, hidden_states = self.rnn(rnn_input, hidden_states) + # rollout + else: + rnn_input = states.view(-1, 1, states.shape[-1]) # (N, L, Hin): N=num_envs, L=1 + rnn_output, hidden_states = self.rnn(rnn_input, hidden_states) + + # flatten the RNN output + rnn_output = torch.flatten(rnn_output, start_dim=0, end_dim=1) # (N, L, D ∗ Hout) -> (N * L, D ∗ Hout) + + return self.net(rnn_output), self.log_std_parameter, {"rnn": [hidden_states]} + + +# instantiate the model (assumes there is a wrapped environment: env) +policy = RNN(observation_space=env.observation_space, + action_space=env.action_space, + device=env.device, + clip_actions=True, + clip_log_std=True, + min_log_std=-20, + max_log_std=2, + reduction="sum", + num_envs=env.num_envs, + num_layers=1, + hidden_size=64, + sequence_length=10) +# [end-rnn-sequential] + +# [start-rnn-functional] +import torch +import torch.nn as nn +import torch.nn.functional as F + +from skrl.models.torch import Model, GaussianMixin + + +# define the model +class RNN(GaussianMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum", + num_envs=1, num_layers=1, hidden_size=64, sequence_length=10): + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction) + + self.num_envs = num_envs + self.num_layers = num_layers + self.hidden_size = hidden_size # Hout + self.sequence_length = sequence_length + + self.rnn = nn.RNN(input_size=self.num_observations, + hidden_size=self.hidden_size, + num_layers=self.num_layers, + batch_first=True) # batch_first -> (batch, sequence, features) + + self.fc1 = nn.Linear(self.hidden_size, 64) + self.fc2 = nn.Linear(64, 32) + self.fc3 = nn.Linear(32, self.num_actions) + + self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + def get_specification(self): + # batch size (N) is the number of envs during rollout + return {"rnn": {"sequence_length": self.sequence_length, + "sizes": [(self.num_layers, self.num_envs, self.hidden_size)]}} # hidden states (D ∗ num_layers, N, Hout) + + def compute(self, inputs, 
role): + states = inputs["states"] + terminated = inputs.get("terminated", None) + hidden_states = inputs["rnn"][0] + + # training + if self.training: + rnn_input = states.view(-1, self.sequence_length, states.shape[-1]) # (N, L, Hin): N=batch_size, L=sequence_length + hidden_states = hidden_states.view(self.num_layers, -1, self.sequence_length, hidden_states.shape[-1]) # (D * num_layers, N, L, Hout) + # get the hidden states corresponding to the initial sequence + hidden_states = hidden_states[:,:,0,:].contiguous() # (D * num_layers, N, Hout) + + # reset the RNN state in the middle of a sequence + if terminated is not None and torch.any(terminated): + rnn_outputs = [] + terminated = terminated.view(-1, self.sequence_length) + indexes = [0] + (terminated[:,:-1].any(dim=0).nonzero(as_tuple=True)[0] + 1).tolist() + [self.sequence_length] + + for i in range(len(indexes) - 1): + i0, i1 = indexes[i], indexes[i + 1] + rnn_output, hidden_states = self.rnn(rnn_input[:,i0:i1,:], hidden_states) + hidden_states[:, (terminated[:,i1-1]), :] = 0 + rnn_outputs.append(rnn_output) + + rnn_output = torch.cat(rnn_outputs, dim=1) + # no need to reset the RNN state in the sequence + else: + rnn_output, hidden_states = self.rnn(rnn_input, hidden_states) + # rollout + else: + rnn_input = states.view(-1, 1, states.shape[-1]) # (N, L, Hin): N=num_envs, L=1 + rnn_output, hidden_states = self.rnn(rnn_input, hidden_states) + + # flatten the RNN output + rnn_output = torch.flatten(rnn_output, start_dim=0, end_dim=1) # (N, L, D ∗ Hout) -> (N * L, D ∗ Hout) + + x = self.fc1(rnn_output) + x = F.relu(x) + x = self.fc2(x) + x = F.relu(x) + x = self.fc3(x) + + return torch.tanh(x), self.log_std_parameter, {"rnn": [hidden_states]} + + +# instantiate the model (assumes there is a wrapped environment: env) +policy = RNN(observation_space=env.observation_space, + action_space=env.action_space, + device=env.device, + clip_actions=True, + clip_log_std=True, + min_log_std=-20, + max_log_std=2, + reduction="sum", + num_envs=env.num_envs, + num_layers=1, + hidden_size=64, + sequence_length=10) +# [end-rnn-functional] + +# ============================================================================= + +# [start-gru-sequential] +import torch +import torch.nn as nn + +from skrl.models.torch import Model, GaussianMixin + + +# define the model +class GRU(GaussianMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum", + num_envs=1, num_layers=1, hidden_size=64, sequence_length=10): + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction) + + self.num_envs = num_envs + self.num_layers = num_layers + self.hidden_size = hidden_size # Hout + self.sequence_length = sequence_length + + self.gru = nn.GRU(input_size=self.num_observations, + hidden_size=self.hidden_size, + num_layers=self.num_layers, + batch_first=True) # batch_first -> (batch, sequence, features) + + self.net = nn.Sequential(nn.Linear(self.hidden_size, 64), + nn.ReLU(), + nn.Linear(64, 32), + nn.ReLU(), + nn.Linear(32, self.num_actions), + nn.Tanh()) + + self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + def get_specification(self): + # batch size (N) is the number of envs during rollout + return {"rnn": {"sequence_length": self.sequence_length, + "sizes": [(self.num_layers, self.num_envs, self.hidden_size)]}} # hidden states (D ∗ num_layers, N, 
Hout) + + def compute(self, inputs, role): + states = inputs["states"] + terminated = inputs.get("terminated", None) + hidden_states = inputs["rnn"][0] + + # training + if self.training: + rnn_input = states.view(-1, self.sequence_length, states.shape[-1]) # (N, L, Hin): N=batch_size, L=sequence_length + hidden_states = hidden_states.view(self.num_layers, -1, self.sequence_length, hidden_states.shape[-1]) # (D * num_layers, N, L, Hout) + # get the hidden states corresponding to the initial sequence + hidden_states = hidden_states[:,:,0,:].contiguous() # (D * num_layers, N, Hout) + + # reset the RNN state in the middle of a sequence + if terminated is not None and torch.any(terminated): + rnn_outputs = [] + terminated = terminated.view(-1, self.sequence_length) + indexes = [0] + (terminated[:,:-1].any(dim=0).nonzero(as_tuple=True)[0] + 1).tolist() + [self.sequence_length] + + for i in range(len(indexes) - 1): + i0, i1 = indexes[i], indexes[i + 1] + rnn_output, hidden_states = self.gru(rnn_input[:,i0:i1,:], hidden_states) + hidden_states[:, (terminated[:,i1-1]), :] = 0 + rnn_outputs.append(rnn_output) + + rnn_output = torch.cat(rnn_outputs, dim=1) + # no need to reset the RNN state in the sequence + else: + rnn_output, hidden_states = self.gru(rnn_input, hidden_states) + # rollout + else: + rnn_input = states.view(-1, 1, states.shape[-1]) # (N, L, Hin): N=num_envs, L=1 + rnn_output, hidden_states = self.gru(rnn_input, hidden_states) + + # flatten the RNN output + rnn_output = torch.flatten(rnn_output, start_dim=0, end_dim=1) # (N, L, D ∗ Hout) -> (N * L, D ∗ Hout) + + return self.net(rnn_output), self.log_std_parameter, {"rnn": [hidden_states]} + + +# instantiate the model (assumes there is a wrapped environment: env) +policy = GRU(observation_space=env.observation_space, + action_space=env.action_space, + device=env.device, + clip_actions=True, + clip_log_std=True, + min_log_std=-20, + max_log_std=2, + reduction="sum", + num_envs=env.num_envs, + num_layers=1, + hidden_size=64, + sequence_length=10) +# [end-gru-sequential] + +# [start-gru-functional] +import torch +import torch.nn as nn +import torch.nn.functional as F + +from skrl.models.torch import Model, GaussianMixin + + +# define the model +class GRU(GaussianMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum", + num_envs=1, num_layers=1, hidden_size=64, sequence_length=10): + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction) + + self.num_envs = num_envs + self.num_layers = num_layers + self.hidden_size = hidden_size # Hout + self.sequence_length = sequence_length + + self.gru = nn.GRU(input_size=self.num_observations, + hidden_size=self.hidden_size, + num_layers=self.num_layers, + batch_first=True) # batch_first -> (batch, sequence, features) + + self.fc1 = nn.Linear(self.hidden_size, 64) + self.fc2 = nn.Linear(64, 32) + self.fc3 = nn.Linear(32, self.num_actions) + + self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + def get_specification(self): + # batch size (N) is the number of envs during rollout + return {"rnn": {"sequence_length": self.sequence_length, + "sizes": [(self.num_layers, self.num_envs, self.hidden_size)]}} # hidden states (D ∗ num_layers, N, Hout) + + def compute(self, inputs, role): + states = inputs["states"] + terminated = inputs.get("terminated", None) + hidden_states = 
inputs["rnn"][0] + + # training + if self.training: + rnn_input = states.view(-1, self.sequence_length, states.shape[-1]) # (N, L, Hin): N=batch_size, L=sequence_length + hidden_states = hidden_states.view(self.num_layers, -1, self.sequence_length, hidden_states.shape[-1]) # (D * num_layers, N, L, Hout) + # get the hidden states corresponding to the initial sequence + hidden_states = hidden_states[:,:,0,:].contiguous() # (D * num_layers, N, Hout) + + # reset the RNN state in the middle of a sequence + if terminated is not None and torch.any(terminated): + rnn_outputs = [] + terminated = terminated.view(-1, self.sequence_length) + indexes = [0] + (terminated[:,:-1].any(dim=0).nonzero(as_tuple=True)[0] + 1).tolist() + [self.sequence_length] + + for i in range(len(indexes) - 1): + i0, i1 = indexes[i], indexes[i + 1] + rnn_output, hidden_states = self.gru(rnn_input[:,i0:i1,:], hidden_states) + hidden_states[:, (terminated[:,i1-1]), :] = 0 + rnn_outputs.append(rnn_output) + + rnn_output = torch.cat(rnn_outputs, dim=1) + # no need to reset the RNN state in the sequence + else: + rnn_output, hidden_states = self.gru(rnn_input, hidden_states) + # rollout + else: + rnn_input = states.view(-1, 1, states.shape[-1]) # (N, L, Hin): N=num_envs, L=1 + rnn_output, hidden_states = self.gru(rnn_input, hidden_states) + + # flatten the RNN output + rnn_output = torch.flatten(rnn_output, start_dim=0, end_dim=1) # (N, L, D ∗ Hout) -> (N * L, D ∗ Hout) + + x = self.fc1(rnn_output) + x = F.relu(x) + x = self.fc2(x) + x = F.relu(x) + x = self.fc3(x) + + return torch.tanh(x), self.log_std_parameter, {"rnn": [hidden_states]} + + +# instantiate the model (assumes there is a wrapped environment: env) +policy = GRU(observation_space=env.observation_space, + action_space=env.action_space, + device=env.device, + clip_actions=True, + clip_log_std=True, + min_log_std=-20, + max_log_std=2, + reduction="sum", + num_envs=env.num_envs, + num_layers=1, + hidden_size=64, + sequence_length=10) +# [end-gru-functional] + +# ============================================================================= + +# [start-lstm-sequential] +import torch +import torch.nn as nn + +from skrl.models.torch import Model, GaussianMixin + + +# define the model +class LSTM(GaussianMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum", + num_envs=1, num_layers=1, hidden_size=64, sequence_length=10): + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction) + + self.num_envs = num_envs + self.num_layers = num_layers + self.hidden_size = hidden_size # Hcell (Hout is Hcell because proj_size = 0) + self.sequence_length = sequence_length + + self.lstm = nn.LSTM(input_size=self.num_observations, + hidden_size=self.hidden_size, + num_layers=self.num_layers, + batch_first=True) # batch_first -> (batch, sequence, features) + + self.net = nn.Sequential(nn.Linear(self.hidden_size, 64), + nn.ReLU(), + nn.Linear(64, 32), + nn.ReLU(), + nn.Linear(32, self.num_actions), + nn.Tanh()) + + self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + def get_specification(self): + # batch size (N) is the number of envs during rollout + return {"rnn": {"sequence_length": self.sequence_length, + "sizes": [(self.num_layers, self.num_envs, self.hidden_size), # hidden states (D ∗ num_layers, N, Hout) + (self.num_layers, self.num_envs, 
self.hidden_size)]}} # cell states (D ∗ num_layers, N, Hcell) + + def compute(self, inputs, role): + states = inputs["states"] + terminated = inputs.get("terminated", None) + hidden_states, cell_states = inputs["rnn"][0], inputs["rnn"][1] + + # training + if self.training: + rnn_input = states.view(-1, self.sequence_length, states.shape[-1]) # (N, L, Hin): N=batch_size, L=sequence_length + hidden_states = hidden_states.view(self.num_layers, -1, self.sequence_length, hidden_states.shape[-1]) # (D * num_layers, N, L, Hout) + cell_states = cell_states.view(self.num_layers, -1, self.sequence_length, cell_states.shape[-1]) # (D * num_layers, N, L, Hcell) + # get the hidden/cell states corresponding to the initial sequence + hidden_states = hidden_states[:,:,0,:].contiguous() # (D * num_layers, N, Hout) + cell_states = cell_states[:,:,0,:].contiguous() # (D * num_layers, N, Hcell) + + # reset the RNN state in the middle of a sequence + if terminated is not None and torch.any(terminated): + rnn_outputs = [] + terminated = terminated.view(-1, self.sequence_length) + indexes = [0] + (terminated[:,:-1].any(dim=0).nonzero(as_tuple=True)[0] + 1).tolist() + [self.sequence_length] + + for i in range(len(indexes) - 1): + i0, i1 = indexes[i], indexes[i + 1] + rnn_output, (hidden_states, cell_states) = self.lstm(rnn_input[:,i0:i1,:], (hidden_states, cell_states)) + hidden_states[:, (terminated[:,i1-1]), :] = 0 + cell_states[:, (terminated[:,i1-1]), :] = 0 + rnn_outputs.append(rnn_output) + + rnn_states = (hidden_states, cell_states) + rnn_output = torch.cat(rnn_outputs, dim=1) + # no need to reset the RNN state in the sequence + else: + rnn_output, rnn_states = self.lstm(rnn_input, (hidden_states, cell_states)) + # rollout + else: + rnn_input = states.view(-1, 1, states.shape[-1]) # (N, L, Hin): N=num_envs, L=1 + rnn_output, rnn_states = self.lstm(rnn_input, (hidden_states, cell_states)) + + # flatten the RNN output + rnn_output = torch.flatten(rnn_output, start_dim=0, end_dim=1) # (N, L, D ∗ Hout) -> (N * L, D ∗ Hout) + + return self.net(rnn_output), self.log_std_parameter, {"rnn": [rnn_states[0], rnn_states[1]]} + + +# instantiate the model (assumes there is a wrapped environment: env) +policy = LSTM(observation_space=env.observation_space, + action_space=env.action_space, + device=env.device, + clip_actions=True, + clip_log_std=True, + min_log_std=-20, + max_log_std=2, + reduction="sum", + num_envs=env.num_envs, + num_layers=1, + hidden_size=64, + sequence_length=10) +# [end-lstm-sequential] + +# [start-lstm-functional] +import torch +import torch.nn as nn +import torch.nn.functional as F + +from skrl.models.torch import Model, GaussianMixin + + +# define the model +class LSTM(GaussianMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum", + num_envs=1, num_layers=1, hidden_size=64, sequence_length=10): + Model.__init__(self, observation_space, action_space, device) + GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction) + + self.num_envs = num_envs + self.num_layers = num_layers + self.hidden_size = hidden_size # Hcell (Hout is Hcell because proj_size = 0) + self.sequence_length = sequence_length + + self.lstm = nn.LSTM(input_size=self.num_observations, + hidden_size=self.hidden_size, + num_layers=self.num_layers, + batch_first=True) # batch_first -> (batch, sequence, features) + + self.fc1 = nn.Linear(self.hidden_size, 64) + self.fc2 = nn.Linear(64, 32) 
+ self.fc3 = nn.Linear(32, self.num_actions) + + self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + def get_specification(self): + # batch size (N) is the number of envs during rollout + return {"rnn": {"sequence_length": self.sequence_length, + "sizes": [(self.num_layers, self.num_envs, self.hidden_size), # hidden states (D ∗ num_layers, N, Hout) + (self.num_layers, self.num_envs, self.hidden_size)]}} # cell states (D ∗ num_layers, N, Hcell) + + def compute(self, inputs, role): + states = inputs["states"] + terminated = inputs.get("terminated", None) + hidden_states, cell_states = inputs["rnn"][0], inputs["rnn"][1] + + # training + if self.training: + rnn_input = states.view(-1, self.sequence_length, states.shape[-1]) # (N, L, Hin): N=batch_size, L=sequence_length + hidden_states = hidden_states.view(self.num_layers, -1, self.sequence_length, hidden_states.shape[-1]) # (D * num_layers, N, L, Hout) + cell_states = cell_states.view(self.num_layers, -1, self.sequence_length, cell_states.shape[-1]) # (D * num_layers, N, L, Hcell) + # get the hidden/cell states corresponding to the initial sequence + hidden_states = hidden_states[:,:,0,:].contiguous() # (D * num_layers, N, Hout) + cell_states = cell_states[:,:,0,:].contiguous() # (D * num_layers, N, Hcell) + + # reset the RNN state in the middle of a sequence + if terminated is not None and torch.any(terminated): + rnn_outputs = [] + terminated = terminated.view(-1, self.sequence_length) + indexes = [0] + (terminated[:,:-1].any(dim=0).nonzero(as_tuple=True)[0] + 1).tolist() + [self.sequence_length] + + for i in range(len(indexes) - 1): + i0, i1 = indexes[i], indexes[i + 1] + rnn_output, (hidden_states, cell_states) = self.lstm(rnn_input[:,i0:i1,:], (hidden_states, cell_states)) + hidden_states[:, (terminated[:,i1-1]), :] = 0 + cell_states[:, (terminated[:,i1-1]), :] = 0 + rnn_outputs.append(rnn_output) + + rnn_states = (hidden_states, cell_states) + rnn_output = torch.cat(rnn_outputs, dim=1) + # no need to reset the RNN state in the sequence + else: + rnn_output, rnn_states = self.lstm(rnn_input, (hidden_states, cell_states)) + # rollout + else: + rnn_input = states.view(-1, 1, states.shape[-1]) # (N, L, Hin): N=num_envs, L=1 + rnn_output, rnn_states = self.lstm(rnn_input, (hidden_states, cell_states)) + + # flatten the RNN output + rnn_output = torch.flatten(rnn_output, start_dim=0, end_dim=1) # (N, L, D ∗ Hout) -> (N * L, D ∗ Hout) + + x = self.fc1(rnn_output) + x = F.relu(x) + x = self.fc2(x) + x = F.relu(x) + x = self.fc3(x) + + return torch.tanh(x), self.log_std_parameter, {"rnn": [rnn_states[0], rnn_states[1]]} + + +# instantiate the model (assumes there is a wrapped environment: env) +policy = LSTM(observation_space=env.observation_space, + action_space=env.action_space, + device=env.device, + clip_actions=True, + clip_log_std=True, + min_log_std=-20, + max_log_std=2, + reduction="sum", + num_envs=env.num_envs, + num_layers=1, + hidden_size=64, + sequence_length=10) +# [end-lstm-functional] diff --git a/docs/source/snippets/isaacgym_utils.py b/docs/source/snippets/isaacgym_utils.py index 14b04cde..b5dd68b7 100644 --- a/docs/source/snippets/isaacgym_utils.py +++ b/docs/source/snippets/isaacgym_utils.py @@ -48,7 +48,7 @@ cam_props.width, cam_props.height = 300, 300 cam_handle = gym.create_camera_sensor(env, cam_props) gym.set_camera_location(cam_handle, env, gymapi.Vec3(1, 1, 1), gymapi.Vec3(0, 0, 0)) - + envs.append(env) cameras.append(cam_handle) @@ -62,7 +62,7 @@ gym.simulate(sim) # render the scene - 
web_viewer.render(fetch_results=True, - step_graphics=True, - render_all_camera_sensors=True, + web_viewer.render(fetch_results=True, + step_graphics=True, + render_all_camera_sensors=True, wait_for_page_load=True) diff --git a/docs/source/snippets/memory.py b/docs/source/snippets/memory.py index 42bbab11..5fec7f78 100644 --- a/docs/source/snippets/memory.py +++ b/docs/source/snippets/memory.py @@ -32,6 +32,6 @@ def sample(self, names: Tuple[str], batch_size: int, mini_batches: int = 1) -> L :rtype: list of torch.Tensor list """ # ================================ - # - sample a batch from memory. + # - sample a batch from memory. # It is possible to generate only the sampling indexes and call self.sample_by_index(...) # ================================ diff --git a/docs/source/snippets/model_mixin.py b/docs/source/snippets/model_mixin.py index 85c0af9e..06962241 100644 --- a/docs/source/snippets/model_mixin.py +++ b/docs/source/snippets/model_mixin.py @@ -1,6 +1,5 @@ # [start-model] -from typing import Optional, Union, Sequence - +from typing import Union, Mapping, Sequence, Tuple, Any import gym import torch @@ -9,9 +8,9 @@ class CustomModel(Model): - def __init__(self, - observation_space: Union[int, Sequence[int], gym.Space], - action_space: Union[int, Sequence[int], gym.Space], + def __init__(self, + observation_space: Union[int, Sequence[int], gym.Space], + action_space: Union[int, Sequence[int], gym.Space], device: Union[str, torch.device] = "cuda:0") -> None: """ :param observation_space: Observation/state space or shape. @@ -24,27 +23,24 @@ def __init__(self, :type device: str or torch.device, optional """ super().__init__(observation_space, action_space, device) - - def act(self, - states: torch.Tensor, - taken_actions: Optional[torch.Tensor] = None, - role: str = "") -> Sequence[torch.Tensor]: + + def act(self, + inputs: Mapping[str, Union[torch.Tensor, Any]], + role: str = "") -> Tuple[torch.Tensor, Union[torch.Tensor, None], Mapping[str, Union[torch.Tensor, Any]]]: """Act according to the specified behavior - :param states: Observation/state of the environment used to make the decision - :type states: torch.Tensor - :param taken_actions: Actions taken by a policy to the given states (default: ``None``). - The use of these actions only makes sense in critical models, e.g. - :type taken_actions: torch.Tensor, optional + :param inputs: Model inputs. The most common keys are: + + - ``"states"``: state of the environment used to make the decision + - ``"taken_actions"``: actions taken by the policy for the given states + :type inputs: dict where the values are typically torch.Tensor :param role: Role play by the model (default: ``""``) :type role: str, optional - :raises NotImplementedError: Child class must implement this method - - :return: Action to be taken by the agent given the state of the environment. - The typical sequence's components are the actions, the log of the probability density function and mean actions. - Deterministic agents must ignore the last two components and return empty tensors or None for them - :rtype: sequence of torch.Tensor + :return: Model output. The first component is the action to be taken by the agent. + The second component is the log of the probability density function for stochastic models + or None for deterministic models. 
The third component is a dictionary containing extra output values + :rtype: tuple of torch.Tensor, torch.Tensor or None, and dictionary """ # ============================== # - act in response to the state @@ -54,7 +50,7 @@ def act(self, # ============================================================================= # [start-mixin] -from typing import Optional, Sequence +from typing import Union, Mapping, Sequence, Tuple, Any import gym @@ -74,26 +70,23 @@ def __init__(self, clip_actions: bool = False, role: str = "") -> None: self._custom_clip_actions = {} self._custom_clip_actions[role] - def act(self, - states: torch.Tensor, - taken_actions: Optional[torch.Tensor] = None, - role: str = "") -> Sequence[torch.Tensor]: + def act(self, + inputs: Mapping[str, Union[torch.Tensor, Any]], + role: str = "") -> Tuple[torch.Tensor, Union[torch.Tensor, None], Mapping[str, Union[torch.Tensor, Any]]]: """Act according to the specified behavior - :param states: Observation/state of the environment used to make the decision - :type states: torch.Tensor - :param taken_actions: Actions taken by a policy to the given states (default: ``None``). - The use of these actions only makes sense in critical models, e.g. - :type taken_actions: torch.Tensor, optional + :param inputs: Model inputs. The most common keys are: + + - ``"states"``: state of the environment used to make the decision + - ``"taken_actions"``: actions taken by the policy for the given states + :type inputs: dict where the values are typically torch.Tensor :param role: Role play by the model (default: ``""``) :type role: str, optional - :raises NotImplementedError: Child class must implement this method - - :return: Action to be taken by the agent given the state of the environment. - The typical sequence's components are the actions, the log of the probability density function and mean actions. - Deterministic agents must ignore the last two components and return empty tensors or None for them - :rtype: sequence of torch.Tensor + :return: Model output. The first component is the action to be taken by the agent. + The second component is the log of the probability density function for stochastic models + or None for deterministic models. The third component is a dictionary containing extra output values + :rtype: tuple of torch.Tensor, torch.Tensor or None, and dictionary """ # ============================== # - act in response to the state @@ -101,4 +94,4 @@ def act(self, # e.g. 
retrieve clip actions according to role clip_actions = self._custom_clip_actions[role] if role in self._custom_clip_actions else self._custom_clip_actions[""] -# [end-mixin] \ No newline at end of file +# [end-mixin] diff --git a/docs/source/snippets/multivariate_gaussian_model.py b/docs/source/snippets/multivariate_gaussian_model.py index e53f7fe1..e39d95c0 100644 --- a/docs/source/snippets/multivariate_gaussian_model.py +++ b/docs/source/snippets/multivariate_gaussian_model.py @@ -1,4 +1,41 @@ -# [start-mlp] +# [start-mlp-sequential] +import torch +import torch.nn as nn + +from skrl.models.torch import Model, MultivariateGaussianMixin + + +# define the model +class MLP(MultivariateGaussianMixin, Model): + def __init__(self, observation_space, action_space, device, + clip_actions=False, clip_log_std=True, min_log_std=-20, max_log_std=2): + Model.__init__(self, observation_space, action_space, device) + MultivariateGaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) + + self.net = nn.Sequential(nn.Linear(self.num_observations, 64), + nn.ReLU(), + nn.Linear(64, 32), + nn.ReLU(), + nn.Linear(32, self.num_actions), + nn.Tanh()) + + self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + def compute(self, inputs, role): + return self.net(inputs["states"]), self.log_std_parameter, {} + + +# instantiate the model (assumes there is a wrapped environment: env) +policy = MLP(observation_space=env.observation_space, + action_space=env.action_space, + device=env.device, + clip_actions=True, + clip_log_std=True, + min_log_std=-20, + max_log_std=2) +# [end-mlp-sequential] + +# [start-mlp-functional] import torch import torch.nn as nn import torch.nn.functional as F @@ -8,81 +45,720 @@ # define the model class MLP(MultivariateGaussianMixin, Model): - def __init__(self, observation_space, action_space, device, + def __init__(self, observation_space, action_space, device, clip_actions=False, clip_log_std=True, min_log_std=-20, max_log_std=2): Model.__init__(self, observation_space, action_space, device) MultivariateGaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) - self.linear_layer_1 = nn.Linear(self.num_observations, 128) - self.linear_layer_2 = nn.Linear(128, 64) - self.linear_layer_3 = nn.Linear(64, 32) - self.mean_action_layer = nn.Linear(32, self.num_actions) + self.fc1 = nn.Linear(self.num_observations, 64) + self.fc2 = nn.Linear(64, 32) + self.fc3 = nn.Linear(32, self.num_actions) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - def compute(self, states, taken_actions, role): - x = F.relu(self.linear_layer_1(states)) - x = F.relu(self.linear_layer_2(x)) - x = F.relu(self.linear_layer_3(x)) - return torch.tanh(self.mean_action_layer(x)), self.log_std_parameter + def compute(self, inputs, role): + x = self.fc1(inputs["states"]) + x = F.relu(x) + x = self.fc2(x) + x = F.relu(x) + x = self.fc3(x) + return torch.tanh(x), self.log_std_parameter, {} + # instantiate the model (assumes there is a wrapped environment: env) -policy = MLP(observation_space=env.observation_space, - action_space=env.action_space, - device=env.device, +policy = MLP(observation_space=env.observation_space, + action_space=env.action_space, + device=env.device, clip_actions=True, clip_log_std=True, min_log_std=-20, max_log_std=2) -# [end-mlp] +# [end-mlp-functional] # ============================================================================= -# [start-cnn] +# [start-cnn-sequential] import torch import torch.nn as nn -import 
torch.nn.functional as F from skrl.models.torch import Model, MultivariateGaussianMixin # define the model class CNN(MultivariateGaussianMixin, Model): - def __init__(self, observation_space, action_space, device, + def __init__(self, observation_space, action_space, device, clip_actions=False, clip_log_std=True, min_log_std=-20, max_log_std=2): Model.__init__(self, observation_space, action_space, device) MultivariateGaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) - self.net = nn.Sequential(nn.Conv2d(1, 64, kernel_size=4, stride=2), + self.net = nn.Sequential(nn.Conv2d(3, 32, kernel_size=8, stride=4), nn.ReLU(), - nn.Conv2d(64, 32, kernel_size=4, stride=2), + nn.Conv2d(32, 64, kernel_size=4, stride=2), nn.ReLU(), - nn.Conv2d(32, 16, kernel_size=2, stride=2), - nn.ReLU(), - nn.Conv2d(16, 8, kernel_size=2, stride=2), + nn.Conv2d(64, 64, kernel_size=3, stride=1), nn.ReLU(), nn.Flatten(), - nn.Linear(1800, 256), + nn.Linear(1024, 512), nn.ReLU(), - nn.Linear(256, 16), + nn.Linear(512, 16), + nn.Tanh(), + nn.Linear(16, 64), nn.Tanh(), - nn.Linear(16, 32), + nn.Linear(64, 32), nn.Tanh(), nn.Linear(32, self.num_actions)) - + self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) - def compute(self, states, taken_actions, role): - # permute (samples, width, height, channels) -> (samples, channels, width, height) - return self.net(states.permute(0, 3, 1, 2)), self.log_std_parameter + def compute(self, inputs, role): + # permute (samples, width * height * channels) -> (samples, channels, width, height) + return self.net(inputs["states"].view(-1, *self.observation_space.shape).permute(0, 3, 1, 2)), self.log_std_parameter, {} # instantiate the model (assumes there is a wrapped environment: env) -policy = CNN(observation_space=env.observation_space, - action_space=env.action_space, - device=env.device, +policy = CNN(observation_space=env.observation_space, + action_space=env.action_space, + device=env.device, clip_actions=True, clip_log_std=True, min_log_std=-20, max_log_std=2) -# [end-cnn] +# [end-cnn-sequential] + +# [start-cnn-functional] +import torch +import torch.nn as nn +import torch.nn.functional as F + +from skrl.models.torch import Model, MultivariateGaussianMixin + + +# define the model +class CNN(MultivariateGaussianMixin, Model): + def __init__(self, observation_space, action_space, device, + clip_actions=False, clip_log_std=True, min_log_std=-20, max_log_std=2): + Model.__init__(self, observation_space, action_space, device) + MultivariateGaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) + + self.conv1 = nn.Conv2d(3, 32, kernel_size=8, stride=4) + self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2) + self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1) + self.fc1 = nn.Linear(1024, 512) + self.fc2 = nn.Linear(512, 16) + self.fc3 = nn.Linear(16, 64) + self.fc4 = nn.Linear(64, 32) + self.fc5 = nn.Linear(32, self.num_actions) + + self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + def compute(self, inputs, role): + # permute (samples, width * height * channels) -> (samples, channels, width, height) + x = inputs["states"].view(-1, *self.observation_space.shape).permute(0, 3, 1, 2) + x = self.conv1(x) + x = F.relu(x) + x = self.conv2(x) + x = F.relu(x) + x = self.conv3(x) + x = F.relu(x) + x = torch.flatten(x, start_dim=1) + x = self.fc1(x) + x = F.relu(x) + x = self.fc2(x) + x = torch.tanh(x) + x = self.fc3(x) + x = torch.tanh(x) + x = self.fc4(x) + x = torch.tanh(x) + x = 
self.fc5(x) + return x, self.log_std_parameter, {} + + +# instantiate the model (assumes there is a wrapped environment: env) +policy = CNN(observation_space=env.observation_space, + action_space=env.action_space, + device=env.device, + clip_actions=True, + clip_log_std=True, + min_log_std=-20, + max_log_std=2) +# [end-cnn-functional] + +# ============================================================================= + +# [start-rnn-sequential] +import torch +import torch.nn as nn + +from skrl.models.torch import Model, MultivariateGaussianMixin + + +# define the model +class RNN(MultivariateGaussianMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + clip_log_std=True, min_log_std=-20, max_log_std=2, + num_envs=1, num_layers=1, hidden_size=64, sequence_length=10): + Model.__init__(self, observation_space, action_space, device) + MultivariateGaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) + + self.num_envs = num_envs + self.num_layers = num_layers + self.hidden_size = hidden_size # Hout + self.sequence_length = sequence_length + + self.rnn = nn.RNN(input_size=self.num_observations, + hidden_size=self.hidden_size, + num_layers=self.num_layers, + batch_first=True) # batch_first -> (batch, sequence, features) + + self.net = nn.Sequential(nn.Linear(self.hidden_size, 64), + nn.ReLU(), + nn.Linear(64, 32), + nn.ReLU(), + nn.Linear(32, self.num_actions), + nn.Tanh()) + + self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + def get_specification(self): + # batch size (N) is the number of envs during rollout + return {"rnn": {"sequence_length": self.sequence_length, + "sizes": [(self.num_layers, self.num_envs, self.hidden_size)]}} # hidden states (D ∗ num_layers, N, Hout) + + def compute(self, inputs, role): + states = inputs["states"] + terminated = inputs.get("terminated", None) + hidden_states = inputs["rnn"][0] + + # training + if self.training: + rnn_input = states.view(-1, self.sequence_length, states.shape[-1]) # (N, L, Hin): N=batch_size, L=sequence_length + hidden_states = hidden_states.view(self.num_layers, -1, self.sequence_length, hidden_states.shape[-1]) # (D * num_layers, N, L, Hout) + # get the hidden states corresponding to the initial sequence + hidden_states = hidden_states[:,:,0,:].contiguous() # (D * num_layers, N, Hout) + + # reset the RNN state in the middle of a sequence + if terminated is not None and torch.any(terminated): + rnn_outputs = [] + terminated = terminated.view(-1, self.sequence_length) + indexes = [0] + (terminated[:,:-1].any(dim=0).nonzero(as_tuple=True)[0] + 1).tolist() + [self.sequence_length] + + for i in range(len(indexes) - 1): + i0, i1 = indexes[i], indexes[i + 1] + rnn_output, hidden_states = self.rnn(rnn_input[:,i0:i1,:], hidden_states) + hidden_states[:, (terminated[:,i1-1]), :] = 0 + rnn_outputs.append(rnn_output) + + rnn_output = torch.cat(rnn_outputs, dim=1) + # no need to reset the RNN state in the sequence + else: + rnn_output, hidden_states = self.rnn(rnn_input, hidden_states) + # rollout + else: + rnn_input = states.view(-1, 1, states.shape[-1]) # (N, L, Hin): N=num_envs, L=1 + rnn_output, hidden_states = self.rnn(rnn_input, hidden_states) + + # flatten the RNN output + rnn_output = torch.flatten(rnn_output, start_dim=0, end_dim=1) # (N, L, D ∗ Hout) -> (N * L, D ∗ Hout) + + return self.net(rnn_output), self.log_std_parameter, {"rnn": [hidden_states]} + + +# instantiate the model (assumes there is a wrapped environment: env) +policy = 
RNN(observation_space=env.observation_space, + action_space=env.action_space, + device=env.device, + clip_actions=True, + clip_log_std=True, + min_log_std=-20, + max_log_std=2, + num_envs=env.num_envs, + num_layers=1, + hidden_size=64, + sequence_length=10) +# [end-rnn-sequential] + +# [start-rnn-functional] +import torch +import torch.nn as nn +import torch.nn.functional as F + +from skrl.models.torch import Model, MultivariateGaussianMixin + + +# define the model +class RNN(MultivariateGaussianMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + clip_log_std=True, min_log_std=-20, max_log_std=2, + num_envs=1, num_layers=1, hidden_size=64, sequence_length=10): + Model.__init__(self, observation_space, action_space, device) + MultivariateGaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) + + self.num_envs = num_envs + self.num_layers = num_layers + self.hidden_size = hidden_size # Hout + self.sequence_length = sequence_length + + self.rnn = nn.RNN(input_size=self.num_observations, + hidden_size=self.hidden_size, + num_layers=self.num_layers, + batch_first=True) # batch_first -> (batch, sequence, features) + + self.fc1 = nn.Linear(self.hidden_size, 64) + self.fc2 = nn.Linear(64, 32) + self.fc3 = nn.Linear(32, self.num_actions) + + self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + def get_specification(self): + # batch size (N) is the number of envs during rollout + return {"rnn": {"sequence_length": self.sequence_length, + "sizes": [(self.num_layers, self.num_envs, self.hidden_size)]}} # hidden states (D ∗ num_layers, N, Hout) + + def compute(self, inputs, role): + states = inputs["states"] + terminated = inputs.get("terminated", None) + hidden_states = inputs["rnn"][0] + + # training + if self.training: + rnn_input = states.view(-1, self.sequence_length, states.shape[-1]) # (N, L, Hin): N=batch_size, L=sequence_length + hidden_states = hidden_states.view(self.num_layers, -1, self.sequence_length, hidden_states.shape[-1]) # (D * num_layers, N, L, Hout) + # get the hidden states corresponding to the initial sequence + hidden_states = hidden_states[:,:,0,:].contiguous() # (D * num_layers, N, Hout) + + # reset the RNN state in the middle of a sequence + if terminated is not None and torch.any(terminated): + rnn_outputs = [] + terminated = terminated.view(-1, self.sequence_length) + indexes = [0] + (terminated[:,:-1].any(dim=0).nonzero(as_tuple=True)[0] + 1).tolist() + [self.sequence_length] + + for i in range(len(indexes) - 1): + i0, i1 = indexes[i], indexes[i + 1] + rnn_output, hidden_states = self.rnn(rnn_input[:,i0:i1,:], hidden_states) + hidden_states[:, (terminated[:,i1-1]), :] = 0 + rnn_outputs.append(rnn_output) + + rnn_output = torch.cat(rnn_outputs, dim=1) + # no need to reset the RNN state in the sequence + else: + rnn_output, hidden_states = self.rnn(rnn_input, hidden_states) + # rollout + else: + rnn_input = states.view(-1, 1, states.shape[-1]) # (N, L, Hin): N=num_envs, L=1 + rnn_output, hidden_states = self.rnn(rnn_input, hidden_states) + + # flatten the RNN output + rnn_output = torch.flatten(rnn_output, start_dim=0, end_dim=1) # (N, L, D ∗ Hout) -> (N * L, D ∗ Hout) + + x = self.fc1(rnn_output) + x = F.relu(x) + x = self.fc2(x) + x = F.relu(x) + x = self.fc3(x) + + return torch.tanh(x), self.log_std_parameter, {"rnn": [hidden_states]} + + +# instantiate the model (assumes there is a wrapped environment: env) +policy = RNN(observation_space=env.observation_space, + 
action_space=env.action_space, + device=env.device, + clip_actions=True, + clip_log_std=True, + min_log_std=-20, + max_log_std=2, + num_envs=env.num_envs, + num_layers=1, + hidden_size=64, + sequence_length=10) +# [end-rnn-functional] + +# ============================================================================= + +# [start-gru-sequential] +import torch +import torch.nn as nn + +from skrl.models.torch import Model, MultivariateGaussianMixin + + +# define the model +class GRU(MultivariateGaussianMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + clip_log_std=True, min_log_std=-20, max_log_std=2, + num_envs=1, num_layers=1, hidden_size=64, sequence_length=10): + Model.__init__(self, observation_space, action_space, device) + MultivariateGaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) + + self.num_envs = num_envs + self.num_layers = num_layers + self.hidden_size = hidden_size # Hout + self.sequence_length = sequence_length + + self.gru = nn.GRU(input_size=self.num_observations, + hidden_size=self.hidden_size, + num_layers=self.num_layers, + batch_first=True) # batch_first -> (batch, sequence, features) + + self.net = nn.Sequential(nn.Linear(self.hidden_size, 64), + nn.ReLU(), + nn.Linear(64, 32), + nn.ReLU(), + nn.Linear(32, self.num_actions), + nn.Tanh()) + + self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + def get_specification(self): + # batch size (N) is the number of envs during rollout + return {"rnn": {"sequence_length": self.sequence_length, + "sizes": [(self.num_layers, self.num_envs, self.hidden_size)]}} # hidden states (D ∗ num_layers, N, Hout) + + def compute(self, inputs, role): + states = inputs["states"] + terminated = inputs.get("terminated", None) + hidden_states = inputs["rnn"][0] + + # training + if self.training: + rnn_input = states.view(-1, self.sequence_length, states.shape[-1]) # (N, L, Hin): N=batch_size, L=sequence_length + hidden_states = hidden_states.view(self.num_layers, -1, self.sequence_length, hidden_states.shape[-1]) # (D * num_layers, N, L, Hout) + # get the hidden states corresponding to the initial sequence + hidden_states = hidden_states[:,:,0,:].contiguous() # (D * num_layers, N, Hout) + + # reset the RNN state in the middle of a sequence + if terminated is not None and torch.any(terminated): + rnn_outputs = [] + terminated = terminated.view(-1, self.sequence_length) + indexes = [0] + (terminated[:,:-1].any(dim=0).nonzero(as_tuple=True)[0] + 1).tolist() + [self.sequence_length] + + for i in range(len(indexes) - 1): + i0, i1 = indexes[i], indexes[i + 1] + rnn_output, hidden_states = self.gru(rnn_input[:,i0:i1,:], hidden_states) + hidden_states[:, (terminated[:,i1-1]), :] = 0 + rnn_outputs.append(rnn_output) + + rnn_output = torch.cat(rnn_outputs, dim=1) + # no need to reset the RNN state in the sequence + else: + rnn_output, hidden_states = self.gru(rnn_input, hidden_states) + # rollout + else: + rnn_input = states.view(-1, 1, states.shape[-1]) # (N, L, Hin): N=num_envs, L=1 + rnn_output, hidden_states = self.gru(rnn_input, hidden_states) + + # flatten the RNN output + rnn_output = torch.flatten(rnn_output, start_dim=0, end_dim=1) # (N, L, D ∗ Hout) -> (N * L, D ∗ Hout) + + return self.net(rnn_output), self.log_std_parameter, {"rnn": [hidden_states]} + + +# instantiate the model (assumes there is a wrapped environment: env) +policy = GRU(observation_space=env.observation_space, + action_space=env.action_space, + device=env.device, + 
clip_actions=True, + clip_log_std=True, + min_log_std=-20, + max_log_std=2, + num_envs=env.num_envs, + num_layers=1, + hidden_size=64, + sequence_length=10) +# [end-gru-sequential] + +# [start-gru-functional] +import torch +import torch.nn as nn +import torch.nn.functional as F + +from skrl.models.torch import Model, MultivariateGaussianMixin + + +# define the model +class GRU(MultivariateGaussianMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + clip_log_std=True, min_log_std=-20, max_log_std=2, + num_envs=1, num_layers=1, hidden_size=64, sequence_length=10): + Model.__init__(self, observation_space, action_space, device) + MultivariateGaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) + + self.num_envs = num_envs + self.num_layers = num_layers + self.hidden_size = hidden_size # Hout + self.sequence_length = sequence_length + + self.gru = nn.GRU(input_size=self.num_observations, + hidden_size=self.hidden_size, + num_layers=self.num_layers, + batch_first=True) # batch_first -> (batch, sequence, features) + + self.fc1 = nn.Linear(self.hidden_size, 64) + self.fc2 = nn.Linear(64, 32) + self.fc3 = nn.Linear(32, self.num_actions) + + self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + def get_specification(self): + # batch size (N) is the number of envs during rollout + return {"rnn": {"sequence_length": self.sequence_length, + "sizes": [(self.num_layers, self.num_envs, self.hidden_size)]}} # hidden states (D ∗ num_layers, N, Hout) + + def compute(self, inputs, role): + states = inputs["states"] + terminated = inputs.get("terminated", None) + hidden_states = inputs["rnn"][0] + + # training + if self.training: + rnn_input = states.view(-1, self.sequence_length, states.shape[-1]) # (N, L, Hin): N=batch_size, L=sequence_length + hidden_states = hidden_states.view(self.num_layers, -1, self.sequence_length, hidden_states.shape[-1]) # (D * num_layers, N, L, Hout) + # get the hidden states corresponding to the initial sequence + hidden_states = hidden_states[:,:,0,:].contiguous() # (D * num_layers, N, Hout) + + # reset the RNN state in the middle of a sequence + if terminated is not None and torch.any(terminated): + rnn_outputs = [] + terminated = terminated.view(-1, self.sequence_length) + indexes = [0] + (terminated[:,:-1].any(dim=0).nonzero(as_tuple=True)[0] + 1).tolist() + [self.sequence_length] + + for i in range(len(indexes) - 1): + i0, i1 = indexes[i], indexes[i + 1] + rnn_output, hidden_states = self.gru(rnn_input[:,i0:i1,:], hidden_states) + hidden_states[:, (terminated[:,i1-1]), :] = 0 + rnn_outputs.append(rnn_output) + + rnn_output = torch.cat(rnn_outputs, dim=1) + # no need to reset the RNN state in the sequence + else: + rnn_output, hidden_states = self.gru(rnn_input, hidden_states) + # rollout + else: + rnn_input = states.view(-1, 1, states.shape[-1]) # (N, L, Hin): N=num_envs, L=1 + rnn_output, hidden_states = self.gru(rnn_input, hidden_states) + + # flatten the RNN output + rnn_output = torch.flatten(rnn_output, start_dim=0, end_dim=1) # (N, L, D ∗ Hout) -> (N * L, D ∗ Hout) + + x = self.fc1(rnn_output) + x = F.relu(x) + x = self.fc2(x) + x = F.relu(x) + x = self.fc3(x) + + return torch.tanh(x), self.log_std_parameter, {"rnn": [hidden_states]} + + +# instantiate the model (assumes there is a wrapped environment: env) +policy = GRU(observation_space=env.observation_space, + action_space=env.action_space, + device=env.device, + clip_actions=True, + clip_log_std=True, + min_log_std=-20, + 
max_log_std=2, + num_envs=env.num_envs, + num_layers=1, + hidden_size=64, + sequence_length=10) +# [end-gru-functional] + +# ============================================================================= + +# [start-lstm-sequential] +import torch +import torch.nn as nn + +from skrl.models.torch import Model, MultivariateGaussianMixin + + +# define the model +class LSTM(MultivariateGaussianMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + clip_log_std=True, min_log_std=-20, max_log_std=2, + num_envs=1, num_layers=1, hidden_size=64, sequence_length=10): + Model.__init__(self, observation_space, action_space, device) + MultivariateGaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) + + self.num_envs = num_envs + self.num_layers = num_layers + self.hidden_size = hidden_size # Hcell (Hout is Hcell because proj_size = 0) + self.sequence_length = sequence_length + + self.lstm = nn.LSTM(input_size=self.num_observations, + hidden_size=self.hidden_size, + num_layers=self.num_layers, + batch_first=True) # batch_first -> (batch, sequence, features) + + self.net = nn.Sequential(nn.Linear(self.hidden_size, 64), + nn.ReLU(), + nn.Linear(64, 32), + nn.ReLU(), + nn.Linear(32, self.num_actions), + nn.Tanh()) + + self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + def get_specification(self): + # batch size (N) is the number of envs during rollout + return {"rnn": {"sequence_length": self.sequence_length, + "sizes": [(self.num_layers, self.num_envs, self.hidden_size), # hidden states (D ∗ num_layers, N, Hout) + (self.num_layers, self.num_envs, self.hidden_size)]}} # cell states (D ∗ num_layers, N, Hcell) + + def compute(self, inputs, role): + states = inputs["states"] + terminated = inputs.get("terminated", None) + hidden_states, cell_states = inputs["rnn"][0], inputs["rnn"][1] + + # training + if self.training: + rnn_input = states.view(-1, self.sequence_length, states.shape[-1]) # (N, L, Hin): N=batch_size, L=sequence_length + hidden_states = hidden_states.view(self.num_layers, -1, self.sequence_length, hidden_states.shape[-1]) # (D * num_layers, N, L, Hout) + cell_states = cell_states.view(self.num_layers, -1, self.sequence_length, cell_states.shape[-1]) # (D * num_layers, N, L, Hcell) + # get the hidden/cell states corresponding to the initial sequence + hidden_states = hidden_states[:,:,0,:].contiguous() # (D * num_layers, N, Hout) + cell_states = cell_states[:,:,0,:].contiguous() # (D * num_layers, N, Hcell) + + # reset the RNN state in the middle of a sequence + if terminated is not None and torch.any(terminated): + rnn_outputs = [] + terminated = terminated.view(-1, self.sequence_length) + indexes = [0] + (terminated[:,:-1].any(dim=0).nonzero(as_tuple=True)[0] + 1).tolist() + [self.sequence_length] + + for i in range(len(indexes) - 1): + i0, i1 = indexes[i], indexes[i + 1] + rnn_output, (hidden_states, cell_states) = self.lstm(rnn_input[:,i0:i1,:], (hidden_states, cell_states)) + hidden_states[:, (terminated[:,i1-1]), :] = 0 + cell_states[:, (terminated[:,i1-1]), :] = 0 + rnn_outputs.append(rnn_output) + + rnn_states = (hidden_states, cell_states) + rnn_output = torch.cat(rnn_outputs, dim=1) + # no need to reset the RNN state in the sequence + else: + rnn_output, rnn_states = self.lstm(rnn_input, (hidden_states, cell_states)) + # rollout + else: + rnn_input = states.view(-1, 1, states.shape[-1]) # (N, L, Hin): N=num_envs, L=1 + rnn_output, rnn_states = self.lstm(rnn_input, (hidden_states, cell_states)) 
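(Editor's aside, not part of the patch: in the rollout branch above each environment contributes a single step, so the shapes follow PyTorch's ``batch_first`` LSTM convention; the snippet resumes below by flattening this output for the MLP head. A minimal, standalone shape check with purely illustrative sizes:)

import torch
import torch.nn as nn

# assumed, illustrative sizes: 4 envs, 8 observations, hidden size 64, 1 layer
num_envs, obs_size, hidden_size, num_layers = 4, 8, 64, 1
lstm = nn.LSTM(input_size=obs_size, hidden_size=hidden_size,
               num_layers=num_layers, batch_first=True)

states = torch.randn(num_envs, obs_size)                        # one step per environment
rnn_input = states.view(-1, 1, states.shape[-1])                # (N, L=1, Hin)
hidden_states = torch.zeros(num_layers, num_envs, hidden_size)  # (D * num_layers, N, Hout)
cell_states = torch.zeros(num_layers, num_envs, hidden_size)    # (D * num_layers, N, Hcell)

rnn_output, (h_n, c_n) = lstm(rnn_input, (hidden_states, cell_states))
print(rnn_output.shape)                                         # torch.Size([4, 1, 64])
print(torch.flatten(rnn_output, start_dim=0, end_dim=1).shape)  # torch.Size([4, 64])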
+ + # flatten the RNN output + rnn_output = torch.flatten(rnn_output, start_dim=0, end_dim=1) # (N, L, D ∗ Hout) -> (N * L, D ∗ Hout) + + return self.net(rnn_output), self.log_std_parameter, {"rnn": [rnn_states[0], rnn_states[1]]} + + +# instantiate the model (assumes there is a wrapped environment: env) +policy = LSTM(observation_space=env.observation_space, + action_space=env.action_space, + device=env.device, + clip_actions=True, + clip_log_std=True, + min_log_std=-20, + max_log_std=2, + num_envs=env.num_envs, + num_layers=1, + hidden_size=64, + sequence_length=10) +# [end-lstm-sequential] + +# [start-lstm-functional] +import torch +import torch.nn as nn +import torch.nn.functional as F + +from skrl.models.torch import Model, MultivariateGaussianMixin + + +# define the model +class LSTM(MultivariateGaussianMixin, Model): + def __init__(self, observation_space, action_space, device, clip_actions=False, + clip_log_std=True, min_log_std=-20, max_log_std=2, + num_envs=1, num_layers=1, hidden_size=64, sequence_length=10): + Model.__init__(self, observation_space, action_space, device) + MultivariateGaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std) + + self.num_envs = num_envs + self.num_layers = num_layers + self.hidden_size = hidden_size # Hcell (Hout is Hcell because proj_size = 0) + self.sequence_length = sequence_length + + self.lstm = nn.LSTM(input_size=self.num_observations, + hidden_size=self.hidden_size, + num_layers=self.num_layers, + batch_first=True) # batch_first -> (batch, sequence, features) + + self.fc1 = nn.Linear(self.hidden_size, 64) + self.fc2 = nn.Linear(64, 32) + self.fc3 = nn.Linear(32, self.num_actions) + + self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) + + def get_specification(self): + # batch size (N) is the number of envs during rollout + return {"rnn": {"sequence_length": self.sequence_length, + "sizes": [(self.num_layers, self.num_envs, self.hidden_size), # hidden states (D ∗ num_layers, N, Hout) + (self.num_layers, self.num_envs, self.hidden_size)]}} # cell states (D ∗ num_layers, N, Hcell) + + def compute(self, inputs, role): + states = inputs["states"] + terminated = inputs.get("terminated", None) + hidden_states, cell_states = inputs["rnn"][0], inputs["rnn"][1] + + # training + if self.training: + rnn_input = states.view(-1, self.sequence_length, states.shape[-1]) # (N, L, Hin): N=batch_size, L=sequence_length + hidden_states = hidden_states.view(self.num_layers, -1, self.sequence_length, hidden_states.shape[-1]) # (D * num_layers, N, L, Hout) + cell_states = cell_states.view(self.num_layers, -1, self.sequence_length, cell_states.shape[-1]) # (D * num_layers, N, L, Hcell) + # get the hidden/cell states corresponding to the initial sequence + hidden_states = hidden_states[:,:,0,:].contiguous() # (D * num_layers, N, Hout) + cell_states = cell_states[:,:,0,:].contiguous() # (D * num_layers, N, Hcell) + + # reset the RNN state in the middle of a sequence + if terminated is not None and torch.any(terminated): + rnn_outputs = [] + terminated = terminated.view(-1, self.sequence_length) + indexes = [0] + (terminated[:,:-1].any(dim=0).nonzero(as_tuple=True)[0] + 1).tolist() + [self.sequence_length] + + for i in range(len(indexes) - 1): + i0, i1 = indexes[i], indexes[i + 1] + rnn_output, (hidden_states, cell_states) = self.lstm(rnn_input[:,i0:i1,:], (hidden_states, cell_states)) + hidden_states[:, (terminated[:,i1-1]), :] = 0 + cell_states[:, (terminated[:,i1-1]), :] = 0 + rnn_outputs.append(rnn_output) + + 
rnn_states = (hidden_states, cell_states) + rnn_output = torch.cat(rnn_outputs, dim=1) + # no need to reset the RNN state in the sequence + else: + rnn_output, rnn_states = self.lstm(rnn_input, (hidden_states, cell_states)) + # rollout + else: + rnn_input = states.view(-1, 1, states.shape[-1]) # (N, L, Hin): N=num_envs, L=1 + rnn_output, rnn_states = self.lstm(rnn_input, (hidden_states, cell_states)) + + # flatten the RNN output + rnn_output = torch.flatten(rnn_output, start_dim=0, end_dim=1) # (N, L, D ∗ Hout) -> (N * L, D ∗ Hout) + + x = self.fc1(rnn_output) + x = F.relu(x) + x = self.fc2(x) + x = F.relu(x) + x = self.fc3(x) + + return torch.tanh(x), self.log_std_parameter, {"rnn": [rnn_states[0], rnn_states[1]]} + + +# instantiate the model (assumes there is a wrapped environment: env) +policy = LSTM(observation_space=env.observation_space, + action_space=env.action_space, + device=env.device, + clip_actions=True, + clip_log_std=True, + min_log_std=-20, + max_log_std=2, + num_envs=env.num_envs, + num_layers=1, + hidden_size=64, + sequence_length=10) +# [end-lstm-functional] diff --git a/docs/source/snippets/noise.py b/docs/source/snippets/noise.py index df55430f..b0da68be 100644 --- a/docs/source/snippets/noise.py +++ b/docs/source/snippets/noise.py @@ -18,7 +18,7 @@ def sample(self, size: Union[Tuple[int], torch.Size]) -> torch.Tensor: :param size: Shape of the sampled tensor :type size: tuple or list of integers, or torch.Size - + :return: Sampled noise :rtype: torch.Tensor """ diff --git a/docs/source/snippets/shared_model.py b/docs/source/snippets/shared_model.py index 40182f61..bb784e5c 100644 --- a/docs/source/snippets/shared_model.py +++ b/docs/source/snippets/shared_model.py @@ -18,7 +18,7 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, nn.ELU(), nn.Linear(32, 32), nn.ELU()) - + # separated layers ("policy") self.mean_layer = nn.Linear(32, self.num_actions) self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) @@ -27,18 +27,18 @@ def __init__(self, observation_space, action_space, device, clip_actions=False, self.value_layer = nn.Linear(32, 1) # override the .act(...) 
method to disambiguate its call - def act(self, states, taken_actions, role): + def act(self, inputs, role): if role == "policy": - return GaussianMixin.act(self, states, taken_actions, role) + return GaussianMixin.act(self, inputs, role) elif role == "value": - return DeterministicMixin.act(self, states, taken_actions, role) + return DeterministicMixin.act(self, inputs, role) # forward the input to compute model output according to the specified role - def compute(self, states, taken_actions, role): + def compute(self, inputs, role): if role == "policy": - return self.mean_layer(self.net(states)), self.log_std_parameter + return self.mean_layer(self.net(inputs["states"])), self.log_std_parameter, {} elif role == "value": - return self.value_layer(self.net(states)) + return self.value_layer(self.net(inputs["states"])), {} # instantiate the shared model and pass the same instance to the other key diff --git a/docs/source/snippets/tabular_model.py b/docs/source/snippets/tabular_model.py index 4b94c69c..7305dd68 100644 --- a/docs/source/snippets/tabular_model.py +++ b/docs/source/snippets/tabular_model.py @@ -13,20 +13,21 @@ def __init__(self, observation_space, action_space, device, num_envs=1, epsilon= self.epsilon = epsilon self.q_table = torch.ones((num_envs, self.num_observations, self.num_actions), dtype=torch.float32) - def compute(self, states, taken_actions, role): - actions = torch.argmax(self.q_table[torch.arange(self.num_envs).view(-1, 1), states], + def compute(self, inputs, role): + states = inputs["states"] + actions = torch.argmax(self.q_table[torch.arange(self.num_envs).view(-1, 1), states], dim=-1, keepdim=True).view(-1,1) - + indexes = (torch.rand(states.shape[0], device=self.device) < self.epsilon).nonzero().view(-1) if indexes.numel(): actions[indexes] = torch.randint(self.num_actions, (indexes.numel(), 1), device=self.device) - return actions + return actions, {} # instantiate the model (assumes there is a wrapped environment: env) -policy = EpilonGreedyPolicy(observation_space=env.observation_space, - action_space=env.action_space, - device=env.device, +policy = EpilonGreedyPolicy(observation_space=env.observation_space, + action_space=env.action_space, + device=env.device, num_envs=env.num_envs, epsilon=0.15) # [end-epsilon-greedy] diff --git a/docs/source/snippets/trainer.py b/docs/source/snippets/trainer.py index 2e54ad44..d4e28a60 100644 --- a/docs/source/snippets/trainer.py +++ b/docs/source/snippets/trainer.py @@ -1,5 +1,5 @@ # [start-base] -from typing import Union, List +from typing import Union, List, Optional import copy @@ -10,17 +10,18 @@ CUSTOM_DEFAULT_CONFIG = { - "timesteps": 100000, # number of timesteps to train for - "headless": False, # whether to use headless mode (no rendering) + "timesteps": 100000, # number of timesteps to train for + "headless": False, # whether to use headless mode (no rendering) + "disable_progressbar": False, # whether to disable the progressbar. 
If None, disable on non-TTY } class CustomTrainer(Trainer): - def __init__(self, - env: Wrapper, - agents: Union[Agent, List[Agent], List[List[Agent]]], - agents_scope : List[int] = [], - cfg: dict = {}) -> None: + def __init__(self, + env: Wrapper, + agents: Union[Agent, List[Agent], List[List[Agent]]], + agents_scope: Optional[List[int]] = None, + cfg: Optional[dict] = None) -> None: """ :param env: Environment to train on :type env: skrl.env.torch.Wrapper @@ -32,7 +33,8 @@ def __init__(self, :type cfg: dict, optional """ _cfg = copy.deepcopy(CUSTOM_DEFAULT_CONFIG) - _cfg.update(cfg) + _cfg.update(cfg if cfg is not None else {}) + agents_scope = agents_scope if agents_scope is not None else [] super().__init__(env=env, agents=agents, agents_scope=agents_scope, cfg=_cfg) # ================================ @@ -123,4 +125,4 @@ def eval(self) -> None: # evaluate the agent(s) for timestep in range(cfg["timesteps"]): trainer.eval(timestep=timestep) -# [end-manual] \ No newline at end of file +# [end-manual] diff --git a/docs/source/snippets/utils_postprocessing.py b/docs/source/snippets/utils_postprocessing.py index 1a204c2a..477f459f 100644 --- a/docs/source/snippets/utils_postprocessing.py +++ b/docs/source/snippets/utils_postprocessing.py @@ -6,10 +6,10 @@ memory_iterator = postprocessing.MemoryFileIterator("memories/*.pt") for filename, data in memory_iterator: filename # str: basename of the current file - data # dict: keys are the names of the memory tensors in the file. + data # dict: keys are the names of the memory tensors in the file. # Tensor shapes are (memory size, number of envs, specific content size) - - # example of simple usage: + + # example of simple usage: # print the filenames of all memories and their tensor shapes print("\nfilename:", filename) print(" |-- states:", data['states'].shape) @@ -30,8 +30,8 @@ filename # str: basename of the current file data # dict: keys are the names of the memory arrays in the file. # Array shapes are (memory size, number of envs, specific content size) - - # example of simple usage: + + # example of simple usage: # print the filenames of all memories and their array shapes print("\nfilename:", filename) print(" |-- states:", data['states'].shape) @@ -51,10 +51,10 @@ for filename, data in memory_iterator: filename # str: basename of the current file data # dict: keys are the names of the memory list of lists extracted from the file. 
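(Editor's aside, not part of the patch: the MemoryFileIterator usage shown above also lends itself to quick aggregation of exported data. A minimal sketch, assuming the snippet's ``postprocessing`` module is ``skrl.utils.postprocessing`` and that each exported file contains a "rewards" tensor of shape (memory size, number of envs, 1); the hunk continues below with the remaining comment clean-up.)

from skrl.utils import postprocessing

# iterate exported PyTorch memory files and print a simple reward statistic
memory_iterator = postprocessing.MemoryFileIterator("memories/*.pt")
for filename, data in memory_iterator:
    rewards = data["rewards"]  # assumed key; torch.Tensor of shape (memory size, number of envs, 1)
    print(f"{filename}: mean reward = {rewards.mean().item():.3f}")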
- # List lengths are (memory size * number of envs) and + # List lengths are (memory size * number of envs) and # sublist lengths are (specific content size) - - # example of simple usage: + + # example of simple usage: # print the filenames of all memories and their list lengths print("\nfilename:", filename) print(" |-- states:", len(data['states'])) @@ -76,7 +76,7 @@ dirname # str: path of the directory (experiment name) containing the Tensorboard file data # dict: keys are the tags, values are lists of [step, value] pairs - # example of simple usage: + # example of simple usage: # print the directory name and the value length for the "Reward / Total reward (mean)" tag print("\ndirname:", dirname) for tag, values in data.items(): diff --git a/setup.py b/setup.py index bbf87912..07ddfffc 100644 --- a/setup.py +++ b/setup.py @@ -10,10 +10,12 @@ # dependencies INSTALL_REQUIRES = [ "gym", - "torch", + "gymnasium", + "torch>=1.8", "tensorboard", + "wandb", "tqdm", - "packaging", + "packaging" ] # installation @@ -24,7 +26,7 @@ description="Modular and flexible library for Reinforcement Learning", long_description=open(os.path.join(root_dir, "README.md")).read(), long_description_content_type="text/markdown", - keywords=["reinforcement learning", "machine learning", "rl", ""], + keywords=["reinforcement", "machine", "learning", "rl"], python_requires=">=3.6.*", install_requires=INSTALL_REQUIRES, url="https://github.com/Toni-SM/skrl", @@ -41,4 +43,10 @@ ], license="MIT", zip_safe=False, + project_urls={ + "Documentation": "https://skrl.readthedocs.io", + "Repository": "https://github.com/Toni-SM/skrl", + "Bug Tracker": "https://github.com/Toni-SM/skrl/issues", + "Discussions": "https://github.com/Toni-SM/skrl/discussions", + } ) diff --git a/skrl/agents/torch/__init__.py b/skrl/agents/torch/__init__.py index ebbc5c75..16fec55f 100644 --- a/skrl/agents/torch/__init__.py +++ b/skrl/agents/torch/__init__.py @@ -1 +1 @@ -from .base import Agent \ No newline at end of file +from skrl.agents.torch.base import Agent diff --git a/skrl/agents/torch/a2c/__init__.py b/skrl/agents/torch/a2c/__init__.py index e8cc76e2..d28e13b5 100644 --- a/skrl/agents/torch/a2c/__init__.py +++ b/skrl/agents/torch/a2c/__init__.py @@ -1 +1 @@ -from .a2c import A2C, A2C_DEFAULT_CONFIG \ No newline at end of file +from skrl.agents.torch.a2c.a2c import A2C, A2C_DEFAULT_CONFIG diff --git a/skrl/agents/torch/a2c/a2c.py b/skrl/agents/torch/a2c/a2c.py index 0693e0d5..ed9fe529 100644 --- a/skrl/agents/torch/a2c/a2c.py +++ b/skrl/agents/torch/a2c/a2c.py @@ -1,6 +1,6 @@ -from typing import Union, Tuple, Dict, Any +from typing import Union, Tuple, Dict, Any, Optional -import gym +import gym, gymnasium import copy import itertools @@ -8,19 +8,20 @@ import torch.nn as nn import torch.nn.functional as F -from ....memories.torch import Memory -from ....models.torch import Model +from skrl.memories.torch import Memory +from skrl.models.torch import Model +from skrl.resources.schedulers.torch import KLAdaptiveRL -from .. 
import Agent +from skrl.agents.torch import Agent A2C_DEFAULT_CONFIG = { "rollouts": 16, # number of rollouts before updating "mini_batches": 1, # number of mini batches to use for updating - + "discount_factor": 0.99, # discount factor (gamma) "lambda": 0.95, # TD(lambda) coefficient (lam) for computing returns and advantages - + "learning_rate": 1e-3, # learning rate "learning_rate_scheduler": None, # learning rate scheduler class (see torch.optim.lr_scheduler) "learning_rate_scheduler_kwargs": {}, # learning rate scheduler's kwargs (e.g. {"step_size": 1e-3}) @@ -46,33 +47,37 @@ "checkpoint_interval": 1000, # interval for checkpoints (timesteps) "store_separately": False, # whether to store checkpoints separately + + "wandb": False, # whether to use Weights & Biases + "wandb_kwargs": {} # wandb kwargs (see https://docs.wandb.ai/ref/python/init) } } class A2C(Agent): - def __init__(self, - models: Dict[str, Model], - memory: Union[Memory, Tuple[Memory], None] = None, - observation_space: Union[int, Tuple[int], gym.Space, None] = None, - action_space: Union[int, Tuple[int], gym.Space, None] = None, - device: Union[str, torch.device] = "cuda:0", - cfg: dict = {}) -> None: + def __init__(self, + models: Dict[str, Model], + memory: Optional[Union[Memory, Tuple[Memory]]] = None, + observation_space: Optional[Union[int, Tuple[int], gym.Space, gymnasium.Space]] = None, + action_space: Optional[Union[int, Tuple[int], gym.Space, gymnasium.Space]] = None, + device: Optional[Union[str, torch.device]] = None, + cfg: Optional[dict] = None) -> None: """Advantage Actor Critic (A2C) https://arxiv.org/abs/1602.01783 - + :param models: Models used by the agent :type models: dictionary of skrl.models.torch.Model :param memory: Memory to storage the transitions. - If it is a tuple, the first element will be used for training and + If it is a tuple, the first element will be used for training and for the rest only the environment transitions will be added :type memory: skrl.memory.torch.Memory, list of skrl.memory.torch.Memory or None :param observation_space: Observation/state space or shape (default: None) - :type observation_space: int, tuple or list of integers, gym.Space or None, optional + :type observation_space: int, tuple or list of integers, gym.Space, gymnasium.Space or None, optional :param action_space: Action space or shape (default: None) - :type action_space: int, tuple or list of integers, gym.Space or None, optional - :param device: Computing device (default: "cuda:0") + :type action_space: int, tuple or list of integers, gym.Space, gymnasium.Space or None, optional + :param device: Device on which a torch tensor is or will be allocated (default: ``None``). 
+ If None, the device will be either ``"cuda:0"`` if available or ``"cpu"`` :type device: str or torch.device, optional :param cfg: Configuration dictionary :type cfg: dict @@ -80,12 +85,12 @@ def __init__(self, :raises KeyError: If the models dictionary is missing a required key """ _cfg = copy.deepcopy(A2C_DEFAULT_CONFIG) - _cfg.update(cfg) - super().__init__(models=models, - memory=memory, - observation_space=observation_space, - action_space=action_space, - device=device, + _cfg.update(cfg if cfg is not None else {}) + super().__init__(models=models, + memory=memory, + observation_space=observation_space, + action_space=action_space, + device=device, cfg=_cfg) # models @@ -124,7 +129,7 @@ def __init__(self, if self.policy is self.value: self.optimizer = torch.optim.Adam(self.policy.parameters(), lr=self._learning_rate) else: - self.optimizer = torch.optim.Adam(itertools.chain(self.policy.parameters(), self.value.parameters()), + self.optimizer = torch.optim.Adam(itertools.chain(self.policy.parameters(), self.value.parameters()), lr=self._learning_rate) if self._learning_rate_scheduler is not None: self.scheduler = self._learning_rate_scheduler(self.optimizer, **self.cfg["learning_rate_scheduler_kwargs"]) @@ -144,25 +149,58 @@ def __init__(self, else: self._value_preprocessor = self._empty_preprocessor - def init(self) -> None: + def init(self, trainer_cfg: Optional[Dict[str, Any]] = None) -> None: """Initialize the agent """ - super().init() + super().init(trainer_cfg=trainer_cfg) self.set_mode("eval") - + # create tensors in memory if self.memory is not None: self.memory.create_tensor(name="states", size=self.observation_space, dtype=torch.float32) self.memory.create_tensor(name="actions", size=self.action_space, dtype=torch.float32) self.memory.create_tensor(name="rewards", size=1, dtype=torch.float32) - self.memory.create_tensor(name="dones", size=1, dtype=torch.bool) + self.memory.create_tensor(name="terminated", size=1, dtype=torch.bool) + self.memory.create_tensor(name="log_prob", size=1, dtype=torch.float32) self.memory.create_tensor(name="values", size=1, dtype=torch.float32) self.memory.create_tensor(name="returns", size=1, dtype=torch.float32) self.memory.create_tensor(name="advantages", size=1, dtype=torch.float32) - self.tensors_names = ["states", "actions", "rewards", "dones", "values", "returns", "advantages"] + self._tensors_names = ["states", "actions", "terminated", "log_prob", "returns", "advantages"] + + # RNN specifications + self._rnn = False # flag to indicate whether RNN is available + self._rnn_tensors_names = [] # used for sampling during training + self._rnn_final_states = {"policy": [], "value": []} + self._rnn_initial_states = {"policy": [], "value": []} + self._rnn_sequence_length = self.policy.get_specification().get("rnn", {}).get("sequence_length", 1) + + # policy + for i, size in enumerate(self.policy.get_specification().get("rnn", {}).get("sizes", [])): + self._rnn = True + # create tensors in memory + if self.memory is not None: + self.memory.create_tensor(name=f"rnn_policy_{i}", size=(size[0], size[2]), dtype=torch.float32, keep_dimensions=True) + self._rnn_tensors_names.append(f"rnn_policy_{i}") + # default RNN states + self._rnn_initial_states["policy"].append(torch.zeros(size, dtype=torch.float32, device=self.device)) + + # value + if self.value is not None: + if self.policy is self.value: + self._rnn_initial_states["value"] = self._rnn_initial_states["policy"] + else: + for i, size in enumerate(self.value.get_specification().get("rnn", 
{}).get("sizes", [])): + self._rnn = True + # create tensors in memory + if self.memory is not None: + self.memory.create_tensor(name=f"rnn_value_{i}", size=(size[0], size[2]), dtype=torch.float32, keep_dimensions=True) + self._rnn_tensors_names.append(f"rnn_value_{i}") + # default RNN states + self._rnn_initial_states["value"].append(torch.zeros(size, dtype=torch.float32, device=self.device)) # create temporary variables needed for storage and computation + self._current_log_prob = None self._current_next_states = None def act(self, states: torch.Tensor, timestep: int, timesteps: int) -> torch.Tensor: @@ -178,27 +216,34 @@ def act(self, states: torch.Tensor, timestep: int, timesteps: int) -> torch.Tens :return: Actions :rtype: torch.Tensor """ - states = self._state_preprocessor(states) + rnn = {"rnn": self._rnn_initial_states["policy"]} if self._rnn else {} # sample random actions - # TODO, check for stochasticity + # TODO: fix for stochasticity, rnn and log_prob if timestep < self._random_timesteps: - return self.policy.random_act(states, taken_actions=None, role="policy") + return self.policy.random_act({"states": self._state_preprocessor(states), **rnn}, role="policy") # sample stochastic actions - return self.policy.act(states, taken_actions=None, role="policy") - - def record_transition(self, - states: torch.Tensor, - actions: torch.Tensor, - rewards: torch.Tensor, - next_states: torch.Tensor, - dones: torch.Tensor, - infos: Any, - timestep: int, + actions, log_prob, outputs = self.policy.act({"states": self._state_preprocessor(states), **rnn}, role="policy") + self._current_log_prob = log_prob + + if self._rnn: + self._rnn_final_states["policy"] = outputs.get("rnn", []) + + return actions, log_prob, outputs + + def record_transition(self, + states: torch.Tensor, + actions: torch.Tensor, + rewards: torch.Tensor, + next_states: torch.Tensor, + terminated: torch.Tensor, + truncated: torch.Tensor, + infos: Any, + timestep: int, timesteps: int) -> None: """Record an environment transition in memory - + :param states: Observations/states of the environment used to make the decision :type states: torch.Tensor :param actions: Actions taken by the agent @@ -207,8 +252,10 @@ def record_transition(self, :type rewards: torch.Tensor :param next_states: Next observations/states of the environment :type next_states: torch.Tensor - :param dones: Signals to indicate that episodes have ended - :type dones: torch.Tensor + :param terminated: Signals to indicate that episodes have terminated + :type terminated: torch.Tensor + :param truncated: Signals to indicate that episodes have been truncated + :type truncated: torch.Tensor :param infos: Additional information about the environment :type infos: Any type supported by the environment :param timestep: Current timestep @@ -216,24 +263,48 @@ def record_transition(self, :param timesteps: Number of timesteps :type timesteps: int """ - super().record_transition(states, actions, rewards, next_states, dones, infos, timestep, timesteps) + super().record_transition(states, actions, rewards, next_states, terminated, truncated, infos, timestep, timesteps) - # reward shaping - if self._rewards_shaper is not None: - rewards = self._rewards_shaper(rewards, timestep, timesteps) + if self.memory is not None: + self._current_next_states = next_states - self._current_next_states = next_states + # reward shaping + if self._rewards_shaper is not None: + rewards = self._rewards_shaper(rewards, timestep, timesteps) - if self.memory is not None: - with torch.no_grad(): - 
values, _, _ = self.value.act(self._state_preprocessor(states), taken_actions=None, role="value") + # compute values + rnn = {"rnn": self._rnn_initial_states["value"]} if self._rnn else {} + values, _, outputs = self.value.act({"states": self._state_preprocessor(states), **rnn}, role="value") values = self._value_preprocessor(values, inverse=True) - self.memory.add_samples(states=states, actions=actions, rewards=rewards, next_states=next_states, dones=dones, - values=values) + # package RNN states + rnn_states = {} + if self._rnn: + rnn_states.update({f"rnn_policy_{i}": s.transpose(0, 1) for i, s in enumerate(self._rnn_initial_states["policy"])}) + if self.policy is not self.value: + rnn_states.update({f"rnn_value_{i}": s.transpose(0, 1) for i, s in enumerate(self._rnn_initial_states["value"])}) + + # storage transition in memory + self.memory.add_samples(states=states, actions=actions, rewards=rewards, next_states=next_states, + terminated=terminated, truncated=truncated, log_prob=self._current_log_prob, values=values, **rnn_states) for memory in self.secondary_memories: - memory.add_samples(states=states, actions=actions, rewards=rewards, next_states=next_states, dones=dones, - values=values) + memory.add_samples(states=states, actions=actions, rewards=rewards, next_states=next_states, + terminated=terminated, truncated=truncated, log_prob=self._current_log_prob, values=values, **rnn_states) + + # update RNN states + if self._rnn: + self._rnn_final_states["value"] = self._rnn_final_states["policy"] if self.policy is self.value else outputs.get("rnn", []) + + # reset states if the episodes have ended + finished_episodes = terminated.nonzero(as_tuple=False) + if finished_episodes.numel(): + for rnn_state in self._rnn_final_states["policy"]: + rnn_state[:, finished_episodes[:, 0]] = 0 + if self.policy is not self.value: + for rnn_state in self._rnn_final_states["value"]: + rnn_state[:, finished_episodes[:, 0]] = 0 + + self._rnn_initial_states = self._rnn_final_states def pre_interaction(self, timestep: int, timesteps: int) -> None: """Callback called before the interaction with the environment @@ -270,11 +341,11 @@ def _update(self, timestep: int, timesteps: int) -> None: :param timesteps: Number of timesteps :type timesteps: int """ - def compute_gae(rewards: torch.Tensor, - dones: torch.Tensor, - values: torch.Tensor, - next_values: torch.Tensor, - discount_factor: float = 0.99, + def compute_gae(rewards: torch.Tensor, + dones: torch.Tensor, + values: torch.Tensor, + next_values: torch.Tensor, + discount_factor: float = 0.99, lambda_coefficient: float = 0.95) -> torch.Tensor: """Compute the Generalized Advantage Estimator (GAE) @@ -313,12 +384,15 @@ def compute_gae(rewards: torch.Tensor, # compute returns and advantages with torch.no_grad(): - last_values, _, _ = self.value.act(self._state_preprocessor(self._current_next_states.float()), taken_actions=None, role="value") + self.value.train(False) + rnn = {"rnn": self._rnn_initial_states["value"]} if self._rnn else {} + last_values, _, _ = self.value.act({"states": self._state_preprocessor(self._current_next_states.float()), **rnn}, role="value") + self.value.train(True) last_values = self._value_preprocessor(last_values, inverse=True) values = self.memory.get_tensor_by_name("values") returns, advantages = compute_gae(rewards=self.memory.get_tensor_by_name("rewards"), - dones=self.memory.get_tensor_by_name("dones"), + dones=self.memory.get_tensor_by_name("terminated"), values=values, next_values=last_values, 
discount_factor=self._discount_factor, @@ -329,30 +403,51 @@ def compute_gae(rewards: torch.Tensor, self.memory.set_tensor_by_name("advantages", advantages) # sample mini-batches from memory - sampled_batches = self.memory.sample_all(names=self.tensors_names, mini_batches=self._mini_batches) + sampled_batches = self.memory.sample_all(names=self._tensors_names, mini_batches=self._mini_batches, sequence_length=self._rnn_sequence_length) + + rnn_policy, rnn_value = {}, {} + if self._rnn: + sampled_rnn_batches = self.memory.sample_all(names=self._rnn_tensors_names, mini_batches=self._mini_batches, sequence_length=self._rnn_sequence_length) cumulative_policy_loss = 0 cumulative_entropy_loss = 0 cumulative_value_loss = 0 + kl_divergences = [] + # mini-batches loop - for sampled_states, sampled_actions, _, _, _, sampled_returns, sampled_advantages in sampled_batches: + for i, (sampled_states, sampled_actions, sampled_dones, sampled_log_prob, sampled_returns, sampled_advantages) in enumerate(sampled_batches): + + if self._rnn: + if self.policy is self.value: + rnn_policy = {"rnn": [s.transpose(0, 1) for s in sampled_rnn_batches[i]], "terminated": sampled_dones} + rnn_value = rnn_policy + else: + rnn_policy = {"rnn": [s.transpose(0, 1) for s, n in zip(sampled_rnn_batches[i], self._rnn_tensors_names) if "policy" in n], "terminated": sampled_dones} + rnn_value = {"rnn": [s.transpose(0, 1) for s, n in zip(sampled_rnn_batches[i], self._rnn_tensors_names) if "value" in n], "terminated": sampled_dones} sampled_states = self._state_preprocessor(sampled_states, train=True) - _, next_log_prob, _ = self.policy.act(states=sampled_states, taken_actions=sampled_actions, role="policy") + _, next_log_prob, _ = self.policy.act({"states": sampled_states, "taken_actions": sampled_actions, **rnn_policy}, role="policy") + + # compute aproximate KL divergence for KLAdaptive learning rate scheduler + if isinstance(self.scheduler, KLAdaptiveRL): + with torch.no_grad(): + ratio = next_log_prob - sampled_log_prob + kl_divergence = ((torch.exp(ratio) - 1) - ratio).mean() + kl_divergences.append(kl_divergence) # compute entropy loss if self._entropy_loss_scale: entropy_loss = -self._entropy_loss_scale * self.policy.get_entropy(role="policy").mean() else: entropy_loss = 0 - + # compute policy loss policy_loss = -(sampled_advantages * next_log_prob).mean() # compute value loss - predicted_values, _, _ = self.value.act(states=sampled_states, taken_actions=None, role="value") + predicted_values, _, _ = self.value.act({"states": sampled_states, **rnn_value}, role="value") value_loss = F.mse_loss(sampled_returns, predicted_values) @@ -374,12 +469,15 @@ def compute_gae(rewards: torch.Tensor, # update learning rate if self._learning_rate_scheduler: - self.scheduler.step() + if isinstance(self.scheduler, KLAdaptiveRL): + self.scheduler.step(torch.tensor(kl_divergences).mean()) + else: + self.scheduler.step() # record data self.track_data("Loss / Policy loss", cumulative_policy_loss / len(sampled_batches)) self.track_data("Loss / Value loss", cumulative_value_loss / len(sampled_batches)) - + if self._entropy_loss_scale: self.track_data("Loss / Entropy loss", cumulative_entropy_loss / len(sampled_batches)) diff --git a/skrl/agents/torch/amp/__init__.py b/skrl/agents/torch/amp/__init__.py index 9a6ca76e..d985f553 100644 --- a/skrl/agents/torch/amp/__init__.py +++ b/skrl/agents/torch/amp/__init__.py @@ -1 +1 @@ -from .amp import AMP, AMP_DEFAULT_CONFIG \ No newline at end of file +from skrl.agents.torch.amp.amp import AMP, 
AMP_DEFAULT_CONFIG diff --git a/skrl/agents/torch/amp/amp.py b/skrl/agents/torch/amp/amp.py index 4af3af05..2c413dff 100644 --- a/skrl/agents/torch/amp/amp.py +++ b/skrl/agents/torch/amp/amp.py @@ -1,6 +1,6 @@ -from typing import Callable, Union, Tuple, Dict, Any +from typing import Callable, Union, Tuple, Dict, Any, Optional -import gym +import gym, gymnasium import math import copy import itertools @@ -9,20 +9,20 @@ import torch.nn as nn import torch.nn.functional as F -from ....memories.torch import Memory -from ....models.torch import Model +from skrl.memories.torch import Memory +from skrl.models.torch import Model -from .. import Agent +from skrl.agents.torch import Agent AMP_DEFAULT_CONFIG = { "rollouts": 16, # number of rollouts before updating "learning_epochs": 6, # number of learning epochs during each update "mini_batches": 2, # number of mini batches during each learning epoch - + "discount_factor": 0.99, # discount factor (gamma) "lambda": 0.95, # TD(lambda) coefficient (lam) for computing returns and advantages - + "learning_rate": 5e-5, # learning rate "learning_rate_scheduler": None, # learning rate scheduler class (see torch.optim.lr_scheduler) "learning_rate_scheduler_kwargs": {}, # learning rate scheduler's kwargs (e.g. {"step_size": 1e-3}) @@ -64,47 +64,51 @@ "checkpoint_interval": 1000, # interval for checkpoints (timesteps) "store_separately": False, # whether to store checkpoints separately + + "wandb": False, # whether to use Weights & Biases + "wandb_kwargs": {} # wandb kwargs (see https://docs.wandb.ai/ref/python/init) } } class AMP(Agent): - def __init__(self, - models: Dict[str, Model], - memory: Union[Memory, Tuple[Memory], None] = None, - observation_space: Union[int, Tuple[int], gym.Space, None] = None, - action_space: Union[int, Tuple[int], gym.Space, None] = None, - device: Union[str, torch.device] = "cuda:0", - cfg: dict = {}, - amp_observation_space: Union[int, Tuple[int], gym.Space, None] = None, - motion_dataset: Union[Memory, None] = None, - reply_buffer: Union[Memory, None] = None, - collect_reference_motions: Union[Callable[[int], torch.Tensor], None] = None, - collect_observation: Union[Callable[[], torch.Tensor], None] = None) -> None: + def __init__(self, + models: Dict[str, Model], + memory: Optional[Union[Memory, Tuple[Memory]]] = None, + observation_space: Optional[Union[int, Tuple[int], gym.Space, gymnasium.Space]] = None, + action_space: Optional[Union[int, Tuple[int], gym.Space, gymnasium.Space]] = None, + device: Optional[Union[str, torch.device]] = None, + cfg: Optional[dict] = None, + amp_observation_space: Optional[Union[int, Tuple[int], gym.Space, gymnasium.Space]] = None, + motion_dataset: Optional[Memory] = None, + reply_buffer: Optional[Memory] = None, + collect_reference_motions: Optional[Callable[[int], torch.Tensor]] = None, + collect_observation: Optional[Callable[[], torch.Tensor]] = None) -> None: """Adversarial Motion Priors (AMP) https://arxiv.org/abs/2104.02180 - + The implementation is adapted from the NVIDIA IsaacGymEnvs (https://github.com/NVIDIA-Omniverse/IsaacGymEnvs/blob/main/isaacgymenvs/learning/amp_continuous.py) :param models: Models used by the agent :type models: dictionary of skrl.models.torch.Model :param memory: Memory to storage the transitions. 
- If it is a tuple, the first element will be used for training and + If it is a tuple, the first element will be used for training and for the rest only the environment transitions will be added :type memory: skrl.memory.torch.Memory, list of skrl.memory.torch.Memory or None :param observation_space: Observation/state space or shape (default: None) - :type observation_space: int, tuple or list of integers, gym.Space or None, optional + :type observation_space: int, tuple or list of integers, gym.Space, gymnasium.Space or None, optional :param action_space: Action space or shape (default: None) - :type action_space: int, tuple or list of integers, gym.Space or None, optional - :param device: Computing device (default: "cuda:0") + :type action_space: int, tuple or list of integers, gym.Space, gymnasium.Space or None, optional + :param device: Device on which a torch tensor is or will be allocated (default: ``None``). + If None, the device will be either ``"cuda:0"`` if available or ``"cpu"`` :type device: str or torch.device, optional :param cfg: Configuration dictionary :type cfg: dict :param amp_observation_space: AMP observation/state space or shape (default: None) - :type amp_observation_space: int, tuple or list of integers, gym.Space or None - :param motion_dataset: Reference motion dataset: M (default: None) + :type amp_observation_space: int, tuple or list of integers, gym.Space, gymnasium.Space or None + :param motion_dataset: Reference motion dataset: M (default: None) :type motion_dataset: skrl.memory.torch.Memory or None :param reply_buffer: Reply buffer for preventing discriminator overfitting: B (default: None) :type reply_buffer: skrl.memory.torch.Memory or None @@ -116,12 +120,12 @@ def __init__(self, :raises KeyError: If the models dictionary is missing a required key """ _cfg = copy.deepcopy(AMP_DEFAULT_CONFIG) - _cfg.update(cfg) - super().__init__(models=models, - memory=memory, - observation_space=observation_space, - action_space=action_space, - device=device, + _cfg.update(cfg if cfg is not None else {}) + super().__init__(models=models, + memory=memory, + observation_space=observation_space, + action_space=action_space, + device=device, cfg=_cfg) self.amp_observation_space = amp_observation_space @@ -169,7 +173,7 @@ def __init__(self, self._learning_starts = self.cfg["learning_starts"] self._amp_batch_size = self.cfg["amp_batch_size"] - self._task_reward_weight = self.cfg["task_reward_weight"] + self._task_reward_weight = self.cfg["task_reward_weight"] self._style_reward_weight = self.cfg["style_reward_weight"] self._discriminator_batch_size = self.cfg["discriminator_batch_size"] @@ -182,9 +186,9 @@ def __init__(self, # set up optimizer and learning rate scheduler if self.policy is not None and self.value is not None and self.discriminator is not None: - self.optimizer = torch.optim.Adam(itertools.chain(self.policy.parameters(), + self.optimizer = torch.optim.Adam(itertools.chain(self.policy.parameters(), self.value.parameters(), - self.discriminator.parameters()), + self.discriminator.parameters()), lr=self._learning_rate) if self._learning_rate_scheduler is not None: self.scheduler = self._learning_rate_scheduler(self.optimizer, **self.cfg["learning_rate_scheduler_kwargs"]) @@ -210,19 +214,19 @@ def __init__(self, else: self._amp_state_preprocessor = self._empty_preprocessor - def init(self) -> None: + def init(self, trainer_cfg: Optional[Dict[str, Any]] = None) -> None: """Initialize the agent """ - super().init() + super().init(trainer_cfg=trainer_cfg) 
self.set_mode("eval") - + # create tensors in memory if self.memory is not None: self.memory.create_tensor(name="states", size=self.observation_space, dtype=torch.float32) self.memory.create_tensor(name="next_states", size=self.observation_space, dtype=torch.float32) self.memory.create_tensor(name="actions", size=self.action_space, dtype=torch.float32) self.memory.create_tensor(name="rewards", size=1, dtype=torch.float32) - self.memory.create_tensor(name="dones", size=1, dtype=torch.bool) + self.memory.create_tensor(name="terminated", size=1, dtype=torch.bool) self.memory.create_tensor(name="log_prob", size=1, dtype=torch.float32) self.memory.create_tensor(name="values", size=1, dtype=torch.float32) self.memory.create_tensor(name="returns", size=1, dtype=torch.float32) @@ -231,7 +235,7 @@ def init(self) -> None: self.memory.create_tensor(name="amp_states", size=self.amp_observation_space, dtype=torch.float32) self.memory.create_tensor(name="next_values", size=1, dtype=torch.float32) - self.tensors_names = ["states", "actions", "rewards", "next_states", "dones", \ + self.tensors_names = ["states", "actions", "rewards", "next_states", "terminated", \ "log_prob", "values", "returns", "advantages", "amp_states", "next_values"] # create tensors for motion dataset and reply buffer @@ -268,25 +272,26 @@ def act(self, states: torch.Tensor, timestep: int, timesteps: int) -> torch.Tens # sample random actions # TODO, check for stochasticity if timestep < self._random_timesteps: - return self.policy.random_act(states, taken_actions=None, role="policy") + return self.policy.random_act({"states": states}, role="policy") # sample stochastic actions - actions, log_prob, actions_mean = self.policy.act(states, taken_actions=None, role="policy") + actions, log_prob, outputs = self.policy.act({"states": states}, role="policy") self._current_log_prob = log_prob - return actions, log_prob, actions_mean - - def record_transition(self, - states: torch.Tensor, - actions: torch.Tensor, - rewards: torch.Tensor, - next_states: torch.Tensor, - dones: torch.Tensor, - infos: Any, - timestep: int, + return actions, log_prob, outputs + + def record_transition(self, + states: torch.Tensor, + actions: torch.Tensor, + rewards: torch.Tensor, + next_states: torch.Tensor, + terminated: torch.Tensor, + truncated: torch.Tensor, + infos: Any, + timestep: int, timesteps: int) -> None: """Record an environment transition in memory - + :param states: Observations/states of the environment used to make the decision :type states: torch.Tensor :param actions: Actions taken by the agent @@ -295,8 +300,10 @@ def record_transition(self, :type rewards: torch.Tensor :param next_states: Next observations/states of the environment :type next_states: torch.Tensor - :param dones: Signals to indicate that episodes have ended - :type dones: torch.Tensor + :param terminated: Signals to indicate that episodes have terminated + :type terminated: torch.Tensor + :param truncated: Signals to indicate that episodes have been truncated + :type truncated: torch.Tensor :param infos: Additional information about the environment :type infos: Any type supported by the environment :param timestep: Current timestep @@ -307,31 +314,31 @@ def record_transition(self, # use collected states if self._current_states is not None: states = self._current_states - - super().record_transition(states, actions, rewards, next_states, dones, infos, timestep, timesteps) - - # reward shaping - if self._rewards_shaper is not None: - rewards = self._rewards_shaper(rewards, 
timestep, timesteps) - amp_states = infos["amp_obs"] + super().record_transition(states, actions, rewards, next_states, terminated, truncated, infos, timestep, timesteps) if self.memory is not None: + amp_states = infos["amp_obs"] + + # reward shaping + if self._rewards_shaper is not None: + rewards = self._rewards_shaper(rewards, timestep, timesteps) + with torch.no_grad(): - values, _, _ = self.value.act(states=self._state_preprocessor(states), taken_actions=None, role="value") + values, _, _ = self.value.act({"states": self._state_preprocessor(states)}, role="value") values = self._value_preprocessor(values, inverse=True) with torch.no_grad(): - next_values, _, _ = self.value.act(states=self._state_preprocessor(next_states), taken_actions=None, role="value") + next_values, _, _ = self.value.act({"states": self._state_preprocessor(next_states)}, role="value") next_values = self._value_preprocessor(next_values, inverse=True) next_values *= infos['terminate'].view(-1, 1).logical_not() - self.memory.add_samples(states=states, actions=actions, rewards=rewards, next_states=next_states, dones=dones, + self.memory.add_samples(states=states, actions=actions, rewards=rewards, next_states=next_states, terminated=terminated, truncated=truncated, log_prob=self._current_log_prob, values=values, amp_states=amp_states, next_values=next_values) for memory in self.secondary_memories: - memory.add_samples(states=states, actions=actions, rewards=rewards, next_states=next_states, dones=dones, + memory.add_samples(states=states, actions=actions, rewards=rewards, next_states=next_states, terminated=terminated, truncated=truncated, log_prob=self._current_log_prob, values=values, amp_states=amp_states, next_values=next_values) - + def pre_interaction(self, timestep: int, timesteps: int) -> None: """Callback called before the interaction with the environment @@ -368,11 +375,11 @@ def _update(self, timestep: int, timesteps: int) -> None: :param timesteps: Number of timesteps :type timesteps: int """ - def compute_gae(rewards: torch.Tensor, - dones: torch.Tensor, - values: torch.Tensor, - next_values: torch.Tensor, - discount_factor: float = 0.99, + def compute_gae(rewards: torch.Tensor, + dones: torch.Tensor, + values: torch.Tensor, + next_values: torch.Tensor, + discount_factor: float = 0.99, lambda_coefficient: float = 0.95) -> torch.Tensor: """Compute the Generalized Advantage Estimator (GAE) @@ -416,17 +423,17 @@ def compute_gae(rewards: torch.Tensor, amp_states = self.memory.get_tensor_by_name("amp_states") with torch.no_grad(): - amp_logits, _, _ = self.discriminator.act(self._amp_state_preprocessor(amp_states), taken_actions=None, role="discriminator") + amp_logits, _, _ = self.discriminator.act({"states": self._amp_state_preprocessor(amp_states)}, role="discriminator") style_reward = -torch.log(torch.maximum(1 - 1 / (1 + torch.exp(-amp_logits)), torch.tensor(0.0001, device=self.device))) style_reward *= self._discriminator_reward_scale - + combined_rewards = self._task_reward_weight * rewards + self._style_reward_weight * style_reward # compute returns and advantages values = self.memory.get_tensor_by_name("values") next_values=self.memory.get_tensor_by_name("next_values") returns, advantages = compute_gae(rewards=combined_rewards, - dones=self.memory.get_tensor_by_name("dones"), + dones=self.memory.get_tensor_by_name("terminated"), values=values, next_values=next_values, discount_factor=self._discount_factor, @@ -462,28 +469,28 @@ def compute_gae(rewards: torch.Tensor, sampled_amp_states, _) in 
enumerate(sampled_batches): sampled_states = self._state_preprocessor(sampled_states, train=True) - - _, next_log_prob, _ = self.policy.act(states=sampled_states, taken_actions=sampled_actions, role="policy") + + _, next_log_prob, _ = self.policy.act({"states": sampled_states, "taken_actions": sampled_actions}, role="policy") # compute entropy loss if self._entropy_loss_scale: entropy_loss = -self._entropy_loss_scale * self.policy.get_entropy(role="policy").mean() else: entropy_loss = 0 - + # compute policy loss ratio = torch.exp(next_log_prob - sampled_log_prob) surrogate = sampled_advantages * ratio surrogate_clipped = sampled_advantages * torch.clip(ratio, 1.0 - self._ratio_clip, 1.0 + self._ratio_clip) - + policy_loss = -torch.min(surrogate, surrogate_clipped).mean() # compute value loss - predicted_values, _, _ = self.value.act(states=sampled_states, taken_actions=None, role="value") + predicted_values, _, _ = self.value.act({"states": sampled_states}, role="value") if self._clip_predicted_values: - predicted_values = sampled_values + torch.clip(predicted_values - sampled_values, - min=-self._value_clip, + predicted_values = sampled_values + torch.clip(predicted_values - sampled_values, + min=-self._value_clip, max=self._value_clip) value_loss = self._value_loss_scale * F.mse_loss(sampled_returns, predicted_values) @@ -500,9 +507,9 @@ def compute_gae(rewards: torch.Tensor, sampled_amp_motion_states = self._amp_state_preprocessor(sampled_motion_batches[batch_index][0], train=True) sampled_amp_motion_states.requires_grad_(True) - amp_logits, _, _ = self.discriminator.act(states=sampled_amp_states, taken_actions=None, role="discriminator") - amp_replay_logits, _, _ = self.discriminator.act(states=sampled_amp_replay_states, taken_actions=None, role="discriminator") - amp_motion_logits, _, _ = self.discriminator.act(states=sampled_amp_motion_states, taken_actions=None, role="discriminator") + amp_logits, _, _ = self.discriminator.act({"states": sampled_amp_states}, role="discriminator") + amp_replay_logits, _, _ = self.discriminator.act({"states": sampled_amp_replay_states}, role="discriminator") + amp_motion_logits, _, _ = self.discriminator.act({"states": sampled_amp_motion_states}, role="discriminator") amp_cat_logits = torch.cat([amp_logits, amp_replay_logits], dim=0) @@ -517,11 +524,11 @@ def compute_gae(rewards: torch.Tensor, # discriminator gradient penalty if self._discriminator_gradient_penalty_scale: - amp_motion_gradient = torch.autograd.grad(amp_motion_logits, - sampled_amp_motion_states, + amp_motion_gradient = torch.autograd.grad(amp_motion_logits, + sampled_amp_motion_states, grad_outputs=torch.ones_like(amp_motion_logits), - create_graph=True, - retain_graph=True, + create_graph=True, + retain_graph=True, only_inputs=True) gradient_penalty = torch.sum(torch.square(amp_motion_gradient[0]), dim=-1).mean() discriminator_loss += self._discriminator_gradient_penalty_scale * gradient_penalty @@ -539,8 +546,8 @@ def compute_gae(rewards: torch.Tensor, self.optimizer.zero_grad() (policy_loss + entropy_loss + value_loss + discriminator_loss).backward() if self._grad_norm_clip > 0: - nn.utils.clip_grad_norm_(itertools.chain(self.policy.parameters(), - self.value.parameters(), + nn.utils.clip_grad_norm_(itertools.chain(self.policy.parameters(), + self.value.parameters(), self.discriminator.parameters()), self._grad_norm_clip) self.optimizer.step() diff --git a/skrl/agents/torch/base.py b/skrl/agents/torch/base.py index 3483a994..7d0c4c00 100644 --- a/skrl/agents/torch/base.py +++ 
b/skrl/agents/torch/base.py @@ -1,7 +1,7 @@ -from typing import Union, Mapping, Tuple, Dict, Any +from typing import Union, Mapping, Tuple, Dict, Any, Optional import os -import gym +import gym, gymnasium import copy import datetime import collections @@ -11,31 +11,32 @@ from torch.utils.tensorboard import SummaryWriter from skrl import logger -from ...memories.torch import Memory -from ...models.torch import Model +from skrl.memories.torch import Memory +from skrl.models.torch import Model class Agent: - def __init__(self, - models: Dict[str, Model], - memory: Union[Memory, Tuple[Memory], None] = None, - observation_space: Union[int, Tuple[int], gym.Space, None] = None, - action_space: Union[int, Tuple[int], gym.Space, None] = None, - device: Union[str, torch.device] = "cuda:0", - cfg: dict = {}) -> None: + def __init__(self, + models: Dict[str, Model], + memory: Optional[Union[Memory, Tuple[Memory]]] = None, + observation_space: Optional[Union[int, Tuple[int], gym.Space, gymnasium.Space]] = None, + action_space: Optional[Union[int, Tuple[int], gym.Space, gymnasium.Space]] = None, + device: Optional[Union[str, torch.device]] = None, + cfg: Optional[dict] = None) -> None: """Base class that represent a RL agent :param models: Models used by the agent :type models: dictionary of skrl.models.torch.Model :param memory: Memory to storage the transitions. - If it is a tuple, the first element will be used for training and + If it is a tuple, the first element will be used for training and for the rest only the environment transitions will be added :type memory: skrl.memory.torch.Memory, list of skrl.memory.torch.Memory or None :param observation_space: Observation/state space or shape (default: None) - :type observation_space: int, tuple or list of integers, gym.Space or None, optional + :type observation_space: int, tuple or list of integers, gym.Space, gymnasium.Space or None, optional :param action_space: Action space or shape (default: None) - :type action_space: int, tuple or list of integers, gym.Space or None, optional - :param device: Computing device (default: "cuda:0") + :type action_space: int, tuple or list of integers, gym.Space, gymnasium.Space or None, optional + :param device: Device on which a torch tensor is or will be allocated (default: ``None``). 
+ If None, the device will be either ``"cuda:0"`` if available or ``"cpu"`` :type device: str or torch.device, optional :param cfg: Configuration dictionary :type cfg: dict @@ -43,8 +44,8 @@ def __init__(self, self.models = models self.observation_space = observation_space self.action_space = action_space - self.device = torch.device(device) - self.cfg = cfg + self.cfg = cfg if cfg is not None else {} + self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") if device is None else torch.device(device) if type(memory) is list: self.memory = memory[0] @@ -52,7 +53,7 @@ def __init__(self, else: self.memory = memory self.secondary_memories = [] - + # convert the models to their respective device for model in self.models.values(): if model is not None: @@ -66,12 +67,23 @@ def __init__(self, self._cumulative_rewards = None self._cumulative_timesteps = None + self.training = True + # checkpoint self.checkpoint_modules = {} self.checkpoint_interval = self.cfg.get("experiment", {}).get("checkpoint_interval", 1000) self.checkpoint_store_separately = self.cfg.get("experiment", {}).get("store_separately", False) self.checkpoint_best_modules = {"timestep": 0, "reward": -2 ** 31, "saved": False, "modules": {}} + # experiment directory + directory = self.cfg.get("experiment", {}).get("directory", "") + experiment_name = self.cfg.get("experiment", {}).get("experiment_name", "") + if not directory: + directory = os.path.join(os.getcwd(), "runs") + if not experiment_name: + experiment_name = "{}_{}".format(datetime.datetime.now().strftime("%y-%m-%d_%H-%M-%S-%f"), self.__class__.__name__) + self.experiment_dir = os.path.join(directory, experiment_name) + def __str__(self) -> str: """Generate a representation of the agent as string @@ -104,29 +116,42 @@ def _empty_preprocessor(self, _input: Any, *args, **kwargs) -> Any: def _get_internal_value(self, _module: Any) -> Any: """Get internal module/variable state/value - :param _input: Module or variable - :type _input: Any + :param _module: Module or variable + :type _module: Any :return: Module/variable state/value :rtype: Any """ return _module.state_dict() if hasattr(_module, "state_dict") else _module - def init(self) -> None: + def init(self, trainer_cfg: Optional[Dict[str, Any]] = None) -> None: """Initialize the agent This method should be called before the agent is used. 
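A minimal sketch of the device fallback introduced above: when `device` is None the agent now selects `"cuda:0"` if CUDA is available and `"cpu"` otherwise, instead of unconditionally defaulting to `"cuda:0"`.

    import torch
    from typing import Optional, Union

    def resolve_device(device: Optional[Union[str, torch.device]] = None) -> torch.device:
        # mirrors the expression used in Agent.__init__ above
        if device is None:
            return torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        return torch.device(device)

    print(resolve_device())        # cuda:0 on a CUDA machine, cpu otherwise
    print(resolve_device("cpu"))   # an explicit request is passed through unchanged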
- It will initialize the TensoBoard writer and checkpoint directory + It will initialize the TensoBoard writer (and optionally Weights & Biases) and create the checkpoints directory + + :param trainer_cfg: Trainer configuration + :type trainer_cfg: dict, optional """ - # experiment directory - directory = self.cfg.get("experiment", {}).get("directory", "") - experiment_name = self.cfg.get("experiment", {}).get("experiment_name", "") - if not directory: - directory = os.path.join(os.getcwd(), "runs") - if not experiment_name: - experiment_name = "{}_{}".format(datetime.datetime.now().strftime("%y-%m-%d_%H-%M-%S-%f"), self.__class__.__name__) - self.experiment_dir = os.path.join(directory, experiment_name) - + # setup Weights & Biases + if self.cfg.get("experiment", {}).get("wandb", False): + # save experiment config + trainer_cfg = trainer_cfg if trainer_cfg is not None else {} + try: + models_cfg = {k: v.net._modules for (k, v) in self.models.items()} + except AttributeError: + models_cfg = {k: v._modules for (k, v) in self.models.items()} + config={**self.cfg, **trainer_cfg, **models_cfg} + # set default values + wandb_kwargs = copy.deepcopy(self.cfg.get("experiment", {}).get("wandb_kwargs", {})) + wandb_kwargs.setdefault("name", os.path.split(self.experiment_dir)[-1]) + wandb_kwargs.setdefault("sync_tensorboard", True) + wandb_kwargs.setdefault("config", {}) + wandb_kwargs["config"].update(config) + # init Weights & Biases + import wandb + wandb.init(**wandb_kwargs) + # main entry to log data for consumption and visualization by TensorBoard self.writer = SummaryWriter(log_dir=self.experiment_dir) @@ -193,7 +218,7 @@ def write_checkpoint(self, timestep: int, timesteps: int) -> None: # separated modules if self.checkpoint_store_separately: for name, module in self.checkpoint_modules.items(): - torch.save(self.checkpoint_best_modules["modules"][name], + torch.save(self.checkpoint_best_modules["modules"][name], os.path.join(self.experiment_dir, "checkpoints", "best_{}.pt".format(name))) # whole agent else: @@ -203,9 +228,9 @@ def write_checkpoint(self, timestep: int, timesteps: int) -> None: torch.save(modules, os.path.join(self.experiment_dir, "checkpoints", "best_{}.pt".format("agent"))) self.checkpoint_best_modules["saved"] = True - def act(self, - states: torch.Tensor, - timestep: int, + def act(self, + states: torch.Tensor, + timestep: int, timesteps: int) -> torch.Tensor: """Process the environment's states to make a decision (actions) using the main policy @@ -223,20 +248,21 @@ def act(self, """ raise NotImplementedError - def record_transition(self, - states: torch.Tensor, - actions: torch.Tensor, - rewards: torch.Tensor, - next_states: torch.Tensor, - dones: torch.Tensor, - infos: Any, - timestep: int, + def record_transition(self, + states: torch.Tensor, + actions: torch.Tensor, + rewards: torch.Tensor, + next_states: torch.Tensor, + terminated: torch.Tensor, + truncated: torch.Tensor, + infos: Any, + timestep: int, timesteps: int) -> None: """Record an environment transition in memory (to be implemented by the inheriting classes) Inheriting classes must call this method to record episode information (rewards, timesteps, etc.). In addition to recording environment transition (such as states, rewards, etc.), agent information can be recorded. 
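The Weights & Biases integration added above is driven entirely by the agent's `experiment` configuration. A hedged usage sketch (the project name is a placeholder; `wandb_kwargs` is forwarded to `wandb.init`):

    import copy
    from skrl.agents.torch.ddpg import DDPG_DEFAULT_CONFIG  # export path per the ddpg/__init__.py hunk below

    cfg = copy.deepcopy(DDPG_DEFAULT_CONFIG)
    cfg["experiment"]["wandb"] = True                              # enable the integration
    cfg["experiment"]["wandb_kwargs"] = {"project": "my-project"}  # placeholder kwargs for wandb.init

    # Agent.init() then defaults the run name to the experiment directory name,
    # enables sync_tensorboard, merges the agent/trainer/model configurations
    # into wandb's config, and finally calls wandb.init(**wandb_kwargs).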
- + :param states: Observations/states of the environment used to make the decision :type states: torch.Tensor :param actions: Actions taken by the agent @@ -245,8 +271,10 @@ def record_transition(self, :type rewards: torch.Tensor :param next_states: Next observations/states of the environment :type next_states: torch.Tensor - :param dones: Signals to indicate that episodes have ended - :type dones: torch.Tensor + :param terminated: Signals to indicate that episodes have terminated + :type terminated: torch.Tensor + :param truncated: Signals to indicate that episodes have been truncated + :type truncated: torch.Tensor :param infos: Additional information about the environment :type infos: Any type supported by the environment :param timestep: Current timestep @@ -258,12 +286,12 @@ def record_transition(self, if self._cumulative_rewards is None: self._cumulative_rewards = torch.zeros_like(rewards, dtype=torch.float32) self._cumulative_timesteps = torch.zeros_like(rewards, dtype=torch.int32) - + self._cumulative_rewards.add_(rewards) self._cumulative_timesteps.add_(1) # check ended episodes - finished_episodes = dones.nonzero(as_tuple=False) + finished_episodes = (terminated + truncated).nonzero(as_tuple=False) if finished_episodes.numel(): # storage cumulative rewards and timesteps @@ -273,7 +301,7 @@ def record_transition(self, # reset the cumulative rewards and timesteps self._cumulative_rewards[finished_episodes] = 0 self._cumulative_timesteps[finished_episodes] = 0 - + # record data if self.write_interval > 0: self.tracking_data["Reward / Instantaneous reward (max)"].append(torch.max(rewards).item()) @@ -302,6 +330,17 @@ def set_mode(self, mode: str) -> None: if model is not None: model.set_mode(mode) + def set_running_mode(self, mode: str) -> None: + """Set the current running mode (training or evaluation) + + This method sets the value of the ``training`` property (boolean). + This property can be used to know if the agent is running in training or evaluation mode. 
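A small sketch of the `terminated`/`truncated` split adopted above: with the Gymnasium-style API an episode can end either by reaching a terminal state or by being cut off (e.g. a time limit), and the base agent now flags finished episodes with the element-wise OR of both signals rather than a single `dones` tensor.

    import torch

    terminated = torch.tensor([[True], [False], [False]])  # reached a terminal state
    truncated  = torch.tensor([[False], [True], [False]])  # cut off, e.g. by a time limit

    # same expression as in record_transition above: boolean addition acts as OR
    finished_episodes = (terminated + truncated).nonzero(as_tuple=False)
    print(finished_episodes[:, 0].tolist())  # [0, 1] -> environment indices whose episodes ended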
+ + :param mode: Mode: 'train' for training or 'eval' for evaluation + :type mode: str + """ + self.training = mode == "train" + def save(self, path: str) -> None: """Save the agent to the specified path @@ -565,9 +604,9 @@ def migrate(self, if module not in ["state_preprocessor", "value_preprocessor", "optimizer"] and hasattr(module, "migrate"): if verbose: logger.info("Model: {} ({})".format(name, type(module).__name__)) - status *= module.migrate(state_dict=checkpoint["model"], - name_map=name_map.get(name, {}), - auto_mapping=auto_mapping, + status *= module.migrate(state_dict=checkpoint["model"], + name_map=name_map.get(name, {}), + auto_mapping=auto_mapping, verbose=verbose) self.set_mode("eval") @@ -620,4 +659,4 @@ def _update(self, timestep: int, timesteps: int) -> None: :raises NotImplementedError: The method is not implemented by the inheriting classes """ - raise NotImplementedError \ No newline at end of file + raise NotImplementedError diff --git a/skrl/agents/torch/cem/__init__.py b/skrl/agents/torch/cem/__init__.py index dba62a05..89bcc376 100644 --- a/skrl/agents/torch/cem/__init__.py +++ b/skrl/agents/torch/cem/__init__.py @@ -1 +1 @@ -from .cem import CEM, CEM_DEFAULT_CONFIG \ No newline at end of file +from skrl.agents.torch.cem.cem import CEM, CEM_DEFAULT_CONFIG diff --git a/skrl/agents/torch/cem/cem.py b/skrl/agents/torch/cem/cem.py index 6168ed02..99c524b4 100644 --- a/skrl/agents/torch/cem/cem.py +++ b/skrl/agents/torch/cem/cem.py @@ -1,15 +1,15 @@ -from typing import Union, Tuple, Dict, Any +from typing import Union, Tuple, Dict, Any, Optional -import gym +import gym, gymnasium import copy import torch import torch.nn.functional as F -from ....memories.torch import Memory -from ....models.torch import Model +from skrl.memories.torch import Memory +from skrl.models.torch import Model -from .. import Agent +from skrl.agents.torch import Agent CEM_DEFAULT_CONFIG = { @@ -17,7 +17,7 @@ "percentile": 0.70, # percentile to compute the reward bound [0, 1] "discount_factor": 0.99, # discount factor (gamma) - + "learning_rate": 1e-2, # learning rate "learning_rate_scheduler": None, # learning rate scheduler class (see torch.optim.lr_scheduler) "learning_rate_scheduler_kwargs": {}, # learning rate scheduler's kwargs (e.g. {"step_size": 1e-3}) @@ -37,33 +37,37 @@ "checkpoint_interval": 1000, # interval for checkpoints (timesteps) "store_separately": False, # whether to store checkpoints separately + + "wandb": False, # whether to use Weights & Biases + "wandb_kwargs": {} # wandb kwargs (see https://docs.wandb.ai/ref/python/init) } } class CEM(Agent): - def __init__(self, - models: Dict[str, Model], - memory: Union[Memory, Tuple[Memory], None] = None, - observation_space: Union[int, Tuple[int], gym.Space, None] = None, - action_space: Union[int, Tuple[int], gym.Space, None] = None, - device: Union[str, torch.device] = "cuda:0", - cfg: dict = {}) -> None: + def __init__(self, + models: Dict[str, Model], + memory: Optional[Union[Memory, Tuple[Memory]]] = None, + observation_space: Optional[Union[int, Tuple[int], gym.Space, gymnasium.Space]] = None, + action_space: Optional[Union[int, Tuple[int], gym.Space, gymnasium.Space]] = None, + device: Optional[Union[str, torch.device]] = None, + cfg: Optional[dict] = None) -> None: """Cross-Entropy Method (CEM) https://ieeexplore.ieee.org/abstract/document/6796865/ - + :param models: Models used by the agent :type models: dictionary of skrl.models.torch.Model :param memory: Memory to storage the transitions. 
- If it is a tuple, the first element will be used for training and + If it is a tuple, the first element will be used for training and for the rest only the environment transitions will be added :type memory: skrl.memory.torch.Memory, list of skrl.memory.torch.Memory or None :param observation_space: Observation/state space or shape (default: None) - :type observation_space: int, tuple or list of integers, gym.Space or None, optional + :type observation_space: int, tuple or list of integers, gym.Space, gymnasium.Space or None, optional :param action_space: Action space or shape (default: None) - :type action_space: int, tuple or list of integers, gym.Space or None, optional - :param device: Computing device (default: "cuda:0") + :type action_space: int, tuple or list of integers, gym.Space, gymnasium.Space or None, optional + :param device: Device on which a torch tensor is or will be allocated (default: ``None``). + If None, the device will be either ``"cuda:0"`` if available or ``"cpu"`` :type device: str or torch.device, optional :param cfg: Configuration dictionary :type cfg: dict @@ -71,12 +75,12 @@ def __init__(self, :raises KeyError: If the models dictionary is missing a required key """ _cfg = copy.deepcopy(CEM_DEFAULT_CONFIG) - _cfg.update(cfg) - super().__init__(models=models, - memory=memory, - observation_space=observation_space, - action_space=action_space, - device=device, + _cfg.update(cfg if cfg is not None else {}) + super().__init__(models=models, + memory=memory, + observation_space=observation_space, + action_space=action_space, + device=device, cfg=_cfg) # models @@ -84,7 +88,7 @@ def __init__(self, # checkpoint models self.checkpoint_modules["policy"] = self.policy - + # configuration: self._rollouts = self.cfg["rollouts"] self._rollout = 0 @@ -99,7 +103,7 @@ def __init__(self, self._random_timesteps = self.cfg["random_timesteps"] self._learning_starts = self.cfg["learning_starts"] - + self._rewards_shaper = self.cfg["rewards_shaper"] self._episode_tracking = [] @@ -119,20 +123,20 @@ def __init__(self, else: self._state_preprocessor = self._empty_preprocessor - def init(self) -> None: + def init(self, trainer_cfg: Optional[Dict[str, Any]] = None) -> None: """Initialize the agent """ - super().init() - + super().init(trainer_cfg=trainer_cfg) + # create tensors in memory if self.memory is not None: self.memory.create_tensor(name="states", size=self.observation_space, dtype=torch.float32) self.memory.create_tensor(name="next_states", size=self.observation_space, dtype=torch.float32) self.memory.create_tensor(name="actions", size=self.action_space, dtype=torch.int64) self.memory.create_tensor(name="rewards", size=1, dtype=torch.float32) - self.memory.create_tensor(name="dones", size=1, dtype=torch.bool) + self.memory.create_tensor(name="terminated", size=1, dtype=torch.bool) - self.tensors_names = ["states", "actions", "rewards", "next_states", "dones"] + self.tensors_names = ["states", "actions", "rewards", "next_states", "terminated"] def act(self, states: torch.Tensor, timestep: int, timesteps: int) -> torch.Tensor: """Process the environment's states to make a decision (actions) using the main policy @@ -152,22 +156,23 @@ def act(self, states: torch.Tensor, timestep: int, timesteps: int) -> torch.Tens # sample random actions # TODO, check for stochasticity if timestep < self._random_timesteps: - return self.policy.random_act(states, taken_actions=None, role="policy") - - # sample stochastic actions - return self.policy.act(states, taken_actions=None, role="policy") - 
- def record_transition(self, - states: torch.Tensor, - actions: torch.Tensor, - rewards: torch.Tensor, - next_states: torch.Tensor, - dones: torch.Tensor, - infos: Any, - timestep: int, + return self.policy.random_act({"states": states}, role="policy") + + # sample stochastic actions + return self.policy.act({"states": states}, role="policy") + + def record_transition(self, + states: torch.Tensor, + actions: torch.Tensor, + rewards: torch.Tensor, + next_states: torch.Tensor, + terminated: torch.Tensor, + truncated: torch.Tensor, + infos: Any, + timestep: int, timesteps: int) -> None: """Record an environment transition in memory - + :param states: Observations/states of the environment used to make the decision :type states: torch.Tensor :param actions: Actions taken by the agent @@ -176,8 +181,10 @@ def record_transition(self, :type rewards: torch.Tensor :param next_states: Next observations/states of the environment :type next_states: torch.Tensor - :param dones: Signals to indicate that episodes have ended - :type dones: torch.Tensor + :param terminated: Signals to indicate that episodes have terminated + :type terminated: torch.Tensor + :param truncated: Signals to indicate that episodes have been truncated + :type truncated: torch.Tensor :param infos: Additional information about the environment :type infos: Any type supported by the environment :param timestep: Current timestep @@ -185,20 +192,22 @@ def record_transition(self, :param timesteps: Number of timesteps :type timesteps: int """ - super().record_transition(states, actions, rewards, next_states, dones, infos, timestep, timesteps) + super().record_transition(states, actions, rewards, next_states, terminated, truncated, infos, timestep, timesteps) # reward shaping if self._rewards_shaper is not None: rewards = self._rewards_shaper(rewards, timestep, timesteps) - + if self.memory is not None: - self.memory.add_samples(states=states, actions=actions, rewards=rewards, next_states=next_states, dones=dones) + self.memory.add_samples(states=states, actions=actions, rewards=rewards, next_states=next_states, + terminated=terminated, truncated=truncated) for memory in self.secondary_memories: - memory.add_samples(states=states, actions=actions, rewards=rewards, next_states=next_states, dones=dones) + memory.add_samples(states=states, actions=actions, rewards=rewards, next_states=next_states, + terminated=terminated, truncated=truncated) # track episodes internally if self._rollout: - indexes = torch.nonzero(dones) + indexes = torch.nonzero(terminated + truncated) if indexes.numel(): for i in indexes[:, 0]: self._episode_tracking[i.item()].append(self._rollout + 1) @@ -258,22 +267,23 @@ def _update(self, timestep: int, timesteps: int) -> None: if not len(returns): print("[WARNING] No returns to update. 
Consider increasing the number of rollouts") return - + returns = torch.tensor(returns) return_threshold = torch.quantile(returns, self._percentile, dim=-1) - + # get elite states and actions indexes = torch.nonzero(returns >= return_threshold) elite_states = torch.cat([sampled_states[limits[i][0]:limits[i][1]] for i in indexes[:, 0]], dim=0) elite_actions = torch.cat([sampled_actions[limits[i][0]:limits[i][1]] for i in indexes[:, 0]], dim=0) # compute scores for the elite states - scores = self.policy.act(elite_states, taken_actions=None, role="policy")[2] + _, _, outputs = self.policy.act({"states": elite_states}, role="policy") + scores = outputs["net_output"] # compute policy loss policy_loss = F.cross_entropy(scores, elite_actions.view(-1)) - # optimize policy + # optimization step self.optimizer.zero_grad() policy_loss.backward() self.optimizer.step() @@ -287,6 +297,6 @@ def _update(self, timestep: int, timesteps: int) -> None: self.track_data("Coefficient / Return threshold", return_threshold.item()) self.track_data("Coefficient / Mean discounted returns", torch.mean(returns).item()) - + if self._learning_rate_scheduler: self.track_data("Learning / Learning rate", self.scheduler.get_last_lr()[0]) diff --git a/skrl/agents/torch/ddpg/__init__.py b/skrl/agents/torch/ddpg/__init__.py index 84b4d294..27338168 100644 --- a/skrl/agents/torch/ddpg/__init__.py +++ b/skrl/agents/torch/ddpg/__init__.py @@ -1 +1 @@ -from .ddpg import DDPG, DDPG_DEFAULT_CONFIG \ No newline at end of file +from skrl.agents.torch.ddpg.ddpg import DDPG, DDPG_DEFAULT_CONFIG diff --git a/skrl/agents/torch/ddpg/ddpg.py b/skrl/agents/torch/ddpg/ddpg.py index dc7235ab..adf3fed0 100644 --- a/skrl/agents/torch/ddpg/ddpg.py +++ b/skrl/agents/torch/ddpg/ddpg.py @@ -1,24 +1,25 @@ -from typing import Union, Tuple, Dict, Any +from typing import Union, Tuple, Dict, Any, Optional -import gym +import gym, gymnasium import copy import torch +import torch.nn as nn import torch.nn.functional as F -from ....memories.torch import Memory -from ....models.torch import Model +from skrl.memories.torch import Memory +from skrl.models.torch import Model -from .. 
import Agent +from skrl.agents.torch import Agent DDPG_DEFAULT_CONFIG = { "gradient_steps": 1, # gradient steps "batch_size": 64, # training batch size - + "discount_factor": 0.99, # discount factor (gamma) "polyak": 0.005, # soft update hyperparameter (tau) - + "actor_learning_rate": 1e-3, # actor learning rate "critic_learning_rate": 1e-3, # critic learning rate "learning_rate_scheduler": None, # learning rate scheduler class (see torch.optim.lr_scheduler) @@ -30,6 +31,8 @@ "random_timesteps": 0, # random exploration steps "learning_starts": 0, # learning starts after this many steps + "grad_norm_clip": 0, # clipping coefficient for the norm of the gradients + "exploration": { "noise": None, # exploration noise "initial_scale": 1.0, # initial scale for the noise @@ -46,33 +49,37 @@ "checkpoint_interval": 1000, # interval for checkpoints (timesteps) "store_separately": False, # whether to store checkpoints separately + + "wandb": False, # whether to use Weights & Biases + "wandb_kwargs": {} # wandb kwargs (see https://docs.wandb.ai/ref/python/init) } } class DDPG(Agent): - def __init__(self, - models: Dict[str, Model], - memory: Union[Memory, Tuple[Memory], None] = None, - observation_space: Union[int, Tuple[int], gym.Space, None] = None, - action_space: Union[int, Tuple[int], gym.Space, None] = None, - device: Union[str, torch.device] = "cuda:0", - cfg: dict = {}) -> None: + def __init__(self, + models: Dict[str, Model], + memory: Optional[Union[Memory, Tuple[Memory]]] = None, + observation_space: Optional[Union[int, Tuple[int], gym.Space, gymnasium.Space]] = None, + action_space: Optional[Union[int, Tuple[int], gym.Space, gymnasium.Space]] = None, + device: Optional[Union[str, torch.device]] = None, + cfg: Optional[dict] = None) -> None: """Deep Deterministic Policy Gradient (DDPG) https://arxiv.org/abs/1509.02971 - + :param models: Models used by the agent :type models: dictionary of skrl.models.torch.Model :param memory: Memory to storage the transitions. - If it is a tuple, the first element will be used for training and + If it is a tuple, the first element will be used for training and for the rest only the environment transitions will be added :type memory: skrl.memory.torch.Memory, list of skrl.memory.torch.Memory or None :param observation_space: Observation/state space or shape (default: None) - :type observation_space: int, tuple or list of integers, gym.Space or None, optional + :type observation_space: int, tuple or list of integers, gym.Space, gymnasium.Space or None, optional :param action_space: Action space or shape (default: None) - :type action_space: int, tuple or list of integers, gym.Space or None, optional - :param device: Computing device (default: "cuda:0") + :type action_space: int, tuple or list of integers, gym.Space, gymnasium.Space or None, optional + :param device: Device on which a torch tensor is or will be allocated (default: ``None``). 
+ If None, the device will be either ``"cuda:0"`` if available or ``"cpu"`` :type device: str or torch.device, optional :param cfg: Configuration dictionary :type cfg: dict @@ -80,12 +87,12 @@ def __init__(self, :raises KeyError: If the models dictionary is missing a required key """ _cfg = copy.deepcopy(DDPG_DEFAULT_CONFIG) - _cfg.update(cfg) - super().__init__(models=models, - memory=memory, - observation_space=observation_space, - action_space=action_space, - device=device, + _cfg.update(cfg if cfg is not None else {}) + super().__init__(models=models, + memory=memory, + observation_space=observation_space, + action_space=action_space, + device=device, cfg=_cfg) # models @@ -99,7 +106,7 @@ def __init__(self, self.checkpoint_modules["target_policy"] = self.target_policy self.checkpoint_modules["critic"] = self.critic self.checkpoint_modules["target_critic"] = self.target_critic - + if self.target_policy is not None and self.target_critic is not None: # freeze target networks with respect to optimizers (update via .update_parameters()) self.target_policy.freeze_parameters(True) @@ -112,7 +119,7 @@ def __init__(self, # configuration self._gradient_steps = self.cfg["gradient_steps"] self._batch_size = self.cfg["batch_size"] - + self._discount_factor = self.cfg["discount_factor"] self._polyak = self.cfg["polyak"] @@ -121,17 +128,19 @@ def __init__(self, self._learning_rate_scheduler = self.cfg["learning_rate_scheduler"] self._state_preprocessor = self.cfg["state_preprocessor"] - + self._random_timesteps = self.cfg["random_timesteps"] self._learning_starts = self.cfg["learning_starts"] + self._grad_norm_clip = self.cfg["grad_norm_clip"] + self._exploration_noise = self.cfg["exploration"]["noise"] self._exploration_initial_scale = self.cfg["exploration"]["initial_scale"] self._exploration_final_scale = self.cfg["exploration"]["final_scale"] self._exploration_timesteps = self.cfg["exploration"]["timesteps"] self._rewards_shaper = self.cfg["rewards_shaper"] - + # set up optimizers and learning rate schedulers if self.policy is not None and self.critic is not None: self.policy_optimizer = torch.optim.Adam(self.policy.parameters(), lr=self._actor_learning_rate) @@ -150,20 +159,38 @@ def __init__(self, else: self._state_preprocessor = self._empty_preprocessor - def init(self) -> None: + def init(self, trainer_cfg: Optional[Dict[str, Any]] = None) -> None: """Initialize the agent """ - super().init() - + super().init(trainer_cfg=trainer_cfg) + self.set_mode("eval") + # create tensors in memory if self.memory is not None: self.memory.create_tensor(name="states", size=self.observation_space, dtype=torch.float32) self.memory.create_tensor(name="next_states", size=self.observation_space, dtype=torch.float32) self.memory.create_tensor(name="actions", size=self.action_space, dtype=torch.float32) self.memory.create_tensor(name="rewards", size=1, dtype=torch.float32) - self.memory.create_tensor(name="dones", size=1, dtype=torch.bool) - - self.tensors_names = ["states", "actions", "rewards", "next_states", "dones"] + self.memory.create_tensor(name="terminated", size=1, dtype=torch.bool) + + self._tensors_names = ["states", "actions", "rewards", "next_states", "terminated"] + + # RNN specifications + self._rnn = False # flag to indicate whether RNN is available + self._rnn_tensors_names = [] # used for sampling during training + self._rnn_final_states = {"policy": []} + self._rnn_initial_states = {"policy": []} + self._rnn_sequence_length = self.policy.get_specification().get("rnn", {}).get("sequence_length", 
1) + + # policy + for i, size in enumerate(self.policy.get_specification().get("rnn", {}).get("sizes", [])): + self._rnn = True + # create tensors in memory + if self.memory is not None: + self.memory.create_tensor(name=f"rnn_policy_{i}", size=(size[0], size[2]), dtype=torch.float32, keep_dimensions=True) + self._rnn_tensors_names.append(f"rnn_policy_{i}") + # default RNN states + self._rnn_initial_states["policy"].append(torch.zeros(size, dtype=torch.float32, device=self.device)) # clip noise bounds self.clip_actions_min = torch.tensor(self.action_space.low, device=self.device) @@ -185,25 +212,28 @@ def act(self, states: torch.Tensor, timestep: int, timesteps: int) -> torch.Tens :return: Actions :rtype: torch.Tensor """ - states = self._state_preprocessor(states) + rnn = {"rnn": self._rnn_initial_states["policy"]} if self._rnn else {} # sample random actions if timestep < self._random_timesteps: - return self.policy.random_act(states, taken_actions=None, role="policy") + return self.policy.random_act({"states": self._state_preprocessor(states), **rnn}, role="policy") # sample deterministic actions - actions = self.policy.act(states, taken_actions=None, role="policy") + actions, _, outputs = self.policy.act({"states": self._state_preprocessor(states), **rnn}, role="policy") + + if self._rnn: + self._rnn_final_states["policy"] = outputs.get("rnn", []) # add exloration noise if self._exploration_noise is not None: # sample noises - noises = self._exploration_noise.sample(actions[0].shape) - + noises = self._exploration_noise.sample(actions.shape) + # define exploration timesteps scale = self._exploration_final_scale if self._exploration_timesteps is None: self._exploration_timesteps = timesteps - + # apply exploration noise if timestep <= self._exploration_timesteps: scale = (1 - timestep / self._exploration_timesteps) \ @@ -212,38 +242,37 @@ def act(self, states: torch.Tensor, timestep: int, timesteps: int) -> torch.Tens noises.mul_(scale) # modify actions - actions[0].add_(noises) + actions.add_(noises) if self._backward_compatibility: - actions = (torch.max(torch.min(actions[0], self.clip_actions_max), self.clip_actions_min), - actions[1], - actions[2]) + actions = torch.max(torch.min(actions, self.clip_actions_max), self.clip_actions_min) else: - actions[0].clamp_(min=self.clip_actions_min, max=self.clip_actions_max) + actions.clamp_(min=self.clip_actions_min, max=self.clip_actions_max) # record noises self.track_data("Exploration / Exploration noise (max)", torch.max(noises).item()) self.track_data("Exploration / Exploration noise (min)", torch.min(noises).item()) self.track_data("Exploration / Exploration noise (mean)", torch.mean(noises).item()) - + else: # record noises self.track_data("Exploration / Exploration noise (max)", 0) self.track_data("Exploration / Exploration noise (min)", 0) self.track_data("Exploration / Exploration noise (mean)", 0) - - return actions - - def record_transition(self, - states: torch.Tensor, - actions: torch.Tensor, - rewards: torch.Tensor, - next_states: torch.Tensor, - dones: torch.Tensor, - infos: Any, - timestep: int, + + return actions, None, outputs + + def record_transition(self, + states: torch.Tensor, + actions: torch.Tensor, + rewards: torch.Tensor, + next_states: torch.Tensor, + terminated: torch.Tensor, + truncated: torch.Tensor, + infos: Any, + timestep: int, timesteps: int) -> None: """Record an environment transition in memory - + :param states: Observations/states of the environment used to make the decision :type states: torch.Tensor 
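A hedged sketch of the RNN specification this DDPG code reads from the policy. The shape convention is inferred from the hunk above, where each entry in `sizes` is used both as a full `torch.zeros(size)` initial state and as a per-sample `(size[0], size[2])` memory tensor, which is consistent with `(num_layers, num_envs, hidden_size)` for an LSTM's hidden and cell states; the numeric values below are illustrative only.

    num_layers, num_envs, hidden_size = 1, 64, 256  # assumed example values

    rnn_spec = {"rnn": {"sequence_length": 16,      # window used when sampling sequences from memory
                        "sizes": [(num_layers, num_envs, hidden_size),    # hidden states (h)
                                  (num_layers, num_envs, hidden_size)]}}  # cell states (c)

    # A recurrent policy would return a dictionary like this from get_specification()
    # so the agent can allocate the rnn_policy_{i} memory tensors created above and
    # zero the corresponding states whenever an episode terminates.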
:param actions: Actions taken by the agent @@ -252,8 +281,10 @@ def record_transition(self, :type rewards: torch.Tensor :param next_states: Next observations/states of the environment :type next_states: torch.Tensor - :param dones: Signals to indicate that episodes have ended - :type dones: torch.Tensor + :param terminated: Signals to indicate that episodes have terminated + :type terminated: torch.Tensor + :param truncated: Signals to indicate that episodes have been truncated + :type truncated: torch.Tensor :param infos: Additional information about the environment :type infos: Any type supported by the environment :param timestep: Current timestep @@ -261,16 +292,34 @@ def record_transition(self, :param timesteps: Number of timesteps :type timesteps: int """ - super().record_transition(states, actions, rewards, next_states, dones, infos, timestep, timesteps) + super().record_transition(states, actions, rewards, next_states, terminated, truncated, infos, timestep, timesteps) - # reward shaping - if self._rewards_shaper is not None: - rewards = self._rewards_shaper(rewards, timestep, timesteps) - if self.memory is not None: - self.memory.add_samples(states=states, actions=actions, rewards=rewards, next_states=next_states, dones=dones) + # reward shaping + if self._rewards_shaper is not None: + rewards = self._rewards_shaper(rewards, timestep, timesteps) + + # package RNN states + rnn_states = {} + if self._rnn: + rnn_states.update({f"rnn_policy_{i}": s.transpose(0, 1) for i, s in enumerate(self._rnn_initial_states["policy"])}) + + # storage transition in memory + self.memory.add_samples(states=states, actions=actions, rewards=rewards, next_states=next_states, + terminated=terminated, truncated=truncated, **rnn_states) for memory in self.secondary_memories: - memory.add_samples(states=states, actions=actions, rewards=rewards, next_states=next_states, dones=dones) + memory.add_samples(states=states, actions=actions, rewards=rewards, next_states=next_states, + terminated=terminated, truncated=truncated, **rnn_states) + + # update RNN states + if self._rnn: + # reset states if the episodes have ended + finished_episodes = terminated.nonzero(as_tuple=False) + if finished_episodes.numel(): + for rnn_state in self._rnn_final_states["policy"]: + rnn_state[:, finished_episodes[:, 0]] = 0 + + self._rnn_initial_states = self._rnn_final_states def pre_interaction(self, timestep: int, timesteps: int) -> None: """Callback called before the interaction with the environment @@ -291,7 +340,9 @@ def post_interaction(self, timestep: int, timesteps: int) -> None: :type timesteps: int """ if timestep >= self._learning_starts: + self.set_mode("train") self._update(timestep, timesteps) + self.set_mode("eval") # write tracking data and checkpoints super().post_interaction(timestep, timesteps) @@ -306,7 +357,12 @@ def _update(self, timestep: int, timesteps: int) -> None: """ # sample a batch from memory sampled_states, sampled_actions, sampled_rewards, sampled_next_states, sampled_dones = \ - self.memory.sample(names=self.tensors_names, batch_size=self._batch_size)[0] + self.memory.sample(names=self._tensors_names, batch_size=self._batch_size, sequence_length=self._rnn_sequence_length)[0] + + rnn_policy = {} + if self._rnn: + sampled_rnn = self.memory.sample_by_index(names=self._rnn_tensors_names, indexes=self.memory.get_sampling_indexes())[0] + rnn_policy = {"rnn": [s.transpose(0, 1) for s in sampled_rnn], "terminated": sampled_dones} # gradient steps for gradient_step in range(self._gradient_steps): @@ -316,30 
+372,34 @@ def _update(self, timestep: int, timesteps: int) -> None: # compute target values with torch.no_grad(): - next_actions, _, _ = self.target_policy.act(states=sampled_next_states, taken_actions=None, role="target_policy") - - target_q_values, _, _ = self.target_critic.act(states=sampled_next_states, taken_actions=next_actions, role="target_critic") + next_actions, _, _ = self.target_policy.act({"states": sampled_next_states, **rnn_policy}, role="target_policy") + + target_q_values, _, _ = self.target_critic.act({"states": sampled_next_states, "taken_actions": next_actions, **rnn_policy}, role="target_critic") target_values = sampled_rewards + self._discount_factor * sampled_dones.logical_not() * target_q_values # compute critic loss - critic_values, _, _ = self.critic.act(states=sampled_states, taken_actions=sampled_actions, role="critic") - + critic_values, _, _ = self.critic.act({"states": sampled_states, "taken_actions": sampled_actions, **rnn_policy}, role="critic") + critic_loss = F.mse_loss(critic_values, target_values) - + # optimization step (critic) self.critic_optimizer.zero_grad() critic_loss.backward() + if self._grad_norm_clip > 0: + nn.utils.clip_grad_norm_(self.critic.parameters(), self._grad_norm_clip) self.critic_optimizer.step() # compute policy (actor) loss - actions, _, _ = self.policy.act(states=sampled_states, taken_actions=None, role="policy") - critic_values, _, _ = self.critic.act(states=sampled_states, taken_actions=actions, role="critic") + actions, _, _ = self.policy.act({"states": sampled_states, **rnn_policy}, role="policy") + critic_values, _, _ = self.critic.act({"states": sampled_states, "taken_actions": actions, **rnn_policy}, role="critic") policy_loss = -critic_values.mean() # optimization step (policy) self.policy_optimizer.zero_grad() policy_loss.backward() + if self._grad_norm_clip > 0: + nn.utils.clip_grad_norm_(self.policy.parameters(), self._grad_norm_clip) self.policy_optimizer.step() # update target networks diff --git a/skrl/agents/torch/dqn/__init__.py b/skrl/agents/torch/dqn/__init__.py index a6b63418..6532813e 100644 --- a/skrl/agents/torch/dqn/__init__.py +++ b/skrl/agents/torch/dqn/__init__.py @@ -1,2 +1,2 @@ -from .dqn import DQN, DQN_DEFAULT_CONFIG -from .ddqn import DDQN, DDQN_DEFAULT_CONFIG \ No newline at end of file +from skrl.agents.torch.dqn.dqn import DQN, DQN_DEFAULT_CONFIG +from skrl.agents.torch.dqn.ddqn import DDQN, DDQN_DEFAULT_CONFIG diff --git a/skrl/agents/torch/dqn/ddqn.py b/skrl/agents/torch/dqn/ddqn.py index 6ad0901e..90c144ce 100644 --- a/skrl/agents/torch/dqn/ddqn.py +++ b/skrl/agents/torch/dqn/ddqn.py @@ -1,25 +1,25 @@ -from typing import Union, Tuple, Dict, Any +from typing import Union, Tuple, Dict, Any, Optional -import gym +import gym, gymnasium import copy import math import torch import torch.nn.functional as F -from ....memories.torch import Memory -from ....models.torch import Model +from skrl.memories.torch import Memory +from skrl.models.torch import Model -from .. import Agent +from skrl.agents.torch import Agent DDQN_DEFAULT_CONFIG = { "gradient_steps": 1, # gradient steps "batch_size": 64, # training batch size - + "discount_factor": 0.99, # discount factor (gamma) "polyak": 0.005, # soft update hyperparameter (tau) - + "learning_rate": 1e-3, # learning rate "learning_rate_scheduler": None, # learning rate scheduler class (see torch.optim.lr_scheduler) "learning_rate_scheduler_kwargs": {}, # learning rate scheduler's kwargs (e.g. 
{"step_size": 1e-3}) @@ -36,7 +36,7 @@ "exploration": { "initial_epsilon": 1.0, # initial epsilon for epsilon-greedy exploration "final_epsilon": 0.05, # final epsilon for epsilon-greedy exploration - "timesteps": 1000, # timesteps for epsilon-greedy decay + "timesteps": 1000, # timesteps for epsilon-greedy decay }, "rewards_shaper": None, # rewards shaping function: Callable(reward, timestep, timesteps) -> reward @@ -48,33 +48,37 @@ "checkpoint_interval": 1000, # interval for checkpoints (timesteps) "store_separately": False, # whether to store checkpoints separately + + "wandb": False, # whether to use Weights & Biases + "wandb_kwargs": {} # wandb kwargs (see https://docs.wandb.ai/ref/python/init) } } class DDQN(Agent): - def __init__(self, - models: Dict[str, Model], - memory: Union[Memory, Tuple[Memory], None] = None, - observation_space: Union[int, Tuple[int], gym.Space, None] = None, - action_space: Union[int, Tuple[int], gym.Space, None] = None, - device: Union[str, torch.device] = "cuda:0", - cfg: dict = {}) -> None: + def __init__(self, + models: Dict[str, Model], + memory: Optional[Union[Memory, Tuple[Memory]]] = None, + observation_space: Optional[Union[int, Tuple[int], gym.Space, gymnasium.Space]] = None, + action_space: Optional[Union[int, Tuple[int], gym.Space, gymnasium.Space]] = None, + device: Optional[Union[str, torch.device]] = None, + cfg: Optional[dict] = None) -> None: """Double Deep Q-Network (DDQN) https://ojs.aaai.org/index.php/AAAI/article/view/10295 - + :param models: Models used by the agent :type models: dictionary of skrl.models.torch.Model :param memory: Memory to storage the transitions. - If it is a tuple, the first element will be used for training and + If it is a tuple, the first element will be used for training and for the rest only the environment transitions will be added :type memory: skrl.memory.torch.Memory, list of skrl.memory.torch.Memory or None :param observation_space: Observation/state space or shape (default: None) - :type observation_space: int, tuple or list of integers, gym.Space or None, optional + :type observation_space: int, tuple or list of integers, gym.Space, gymnasium.Space or None, optional :param action_space: Action space or shape (default: None) - :type action_space: int, tuple or list of integers, gym.Space or None, optional - :param device: Computing device (default: "cuda:0") + :type action_space: int, tuple or list of integers, gym.Space, gymnasium.Space or None, optional + :param device: Device on which a torch tensor is or will be allocated (default: ``None``). 
+ If None, the device will be either ``"cuda:0"`` if available or ``"cpu"`` :type device: str or torch.device, optional :param cfg: Configuration dictionary :type cfg: dict @@ -82,12 +86,12 @@ def __init__(self, :raises KeyError: If the models dictionary is missing a required key """ _cfg = copy.deepcopy(DDQN_DEFAULT_CONFIG) - _cfg.update(cfg) - super().__init__(models=models, - memory=memory, - observation_space=observation_space, - action_space=action_space, - device=device, + _cfg.update(cfg if cfg is not None else {}) + super().__init__(models=models, + memory=memory, + observation_space=observation_space, + action_space=action_space, + device=device, cfg=_cfg) # models @@ -108,7 +112,7 @@ def __init__(self, # configuration self._gradient_steps = self.cfg["gradient_steps"] self._batch_size = self.cfg["batch_size"] - + self._discount_factor = self.cfg["discount_factor"] self._polyak = self.cfg["polyak"] @@ -116,7 +120,7 @@ def __init__(self, self._learning_rate_scheduler = self.cfg["learning_rate_scheduler"] self._state_preprocessor = self.cfg["state_preprocessor"] - + self._random_timesteps = self.cfg["random_timesteps"] self._learning_starts = self.cfg["learning_starts"] @@ -128,7 +132,7 @@ def __init__(self, self._exploration_timesteps = self.cfg["exploration"]["timesteps"] self._rewards_shaper = self.cfg["rewards_shaper"] - + # set up optimizer and learning rate scheduler if self.q_network is not None: self.optimizer = torch.optim.Adam(self.q_network.parameters(), lr=self._learning_rate) @@ -144,10 +148,10 @@ def __init__(self, else: self._state_preprocessor = self._empty_preprocessor - def init(self) -> None: + def init(self, trainer_cfg: Optional[Dict[str, Any]] = None) -> None: """Initialize the agent """ - super().init() + super().init(trainer_cfg=trainer_cfg) # create tensors in memory if self.memory is not None: @@ -155,9 +159,9 @@ def init(self) -> None: self.memory.create_tensor(name="next_states", size=self.observation_space, dtype=torch.float32) self.memory.create_tensor(name="actions", size=self.action_space, dtype=torch.int64) self.memory.create_tensor(name="rewards", size=1, dtype=torch.float32) - self.memory.create_tensor(name="dones", size=1, dtype=torch.bool) + self.memory.create_tensor(name="terminated", size=1, dtype=torch.bool) - self.tensors_names = ["states", "actions", "rewards", "next_states", "dones"] + self.tensors_names = ["states", "actions", "rewards", "next_states", "terminated"] def act(self, states: torch.Tensor, timestep: int, timesteps: int) -> torch.Tensor: """Process the environment's states to make a decision (actions) using the main policy @@ -175,10 +179,10 @@ def act(self, states: torch.Tensor, timestep: int, timesteps: int) -> torch.Tens states = self._state_preprocessor(states) if not self._exploration_timesteps: - return torch.argmax(self.q_network.act(states, taken_actions=None, role="q_network")[0], dim=1, keepdim=True), None, None - + return torch.argmax(self.q_network.act({"states": states}, role="q_network")[0], dim=1, keepdim=True), None, None + # sample random actions - actions = self.q_network.random_act(states, taken_actions=None, role="q_network")[0] + actions = self.q_network.random_act({"states": states}, role="q_network")[0] if timestep < self._random_timesteps: return actions, None, None @@ -188,24 +192,25 @@ def act(self, states: torch.Tensor, timestep: int, timesteps: int) -> torch.Tens indexes = (torch.rand(states.shape[0], device=self.device) >= epsilon).nonzero().view(-1) if indexes.numel(): - actions[indexes] = 
torch.argmax(self.q_network.act(states[indexes], taken_actions=None, role="q_network")[0], dim=1, keepdim=True) - + actions[indexes] = torch.argmax(self.q_network.act({"states": states[indexes]}, role="q_network")[0], dim=1, keepdim=True) + # record epsilon self.track_data("Exploration / Exploration epsilon", epsilon) - + return actions, None, None - def record_transition(self, - states: torch.Tensor, - actions: torch.Tensor, - rewards: torch.Tensor, - next_states: torch.Tensor, - dones: torch.Tensor, - infos: Any, - timestep: int, + def record_transition(self, + states: torch.Tensor, + actions: torch.Tensor, + rewards: torch.Tensor, + next_states: torch.Tensor, + terminated: torch.Tensor, + truncated: torch.Tensor, + infos: Any, + timestep: int, timesteps: int) -> None: """Record an environment transition in memory - + :param states: Observations/states of the environment used to make the decision :type states: torch.Tensor :param actions: Actions taken by the agent @@ -214,8 +219,10 @@ def record_transition(self, :type rewards: torch.Tensor :param next_states: Next observations/states of the environment :type next_states: torch.Tensor - :param dones: Signals to indicate that episodes have ended - :type dones: torch.Tensor + :param terminated: Signals to indicate that episodes have terminated + :type terminated: torch.Tensor + :param truncated: Signals to indicate that episodes have been truncated + :type truncated: torch.Tensor :param infos: Additional information about the environment :type infos: Any type supported by the environment :param timestep: Current timestep @@ -223,16 +230,18 @@ def record_transition(self, :param timesteps: Number of timesteps :type timesteps: int """ - super().record_transition(states, actions, rewards, next_states, dones, infos, timestep, timesteps) + super().record_transition(states, actions, rewards, next_states, terminated, truncated, infos, timestep, timesteps) - # reward shaping - if self._rewards_shaper is not None: - rewards = self._rewards_shaper(rewards, timestep, timesteps) - if self.memory is not None: - self.memory.add_samples(states=states, actions=actions, rewards=rewards, next_states=next_states, dones=dones) + # reward shaping + if self._rewards_shaper is not None: + rewards = self._rewards_shaper(rewards, timestep, timesteps) + + self.memory.add_samples(states=states, actions=actions, rewards=rewards, next_states=next_states, + terminated=terminated, truncated=truncated) for memory in self.secondary_memories: - memory.add_samples(states=states, actions=actions, rewards=rewards, next_states=next_states, dones=dones) + memory.add_samples(states=states, actions=actions, rewards=rewards, next_states=next_states, + terminated=terminated, truncated=truncated) def pre_interaction(self, timestep: int, timesteps: int) -> None: """Callback called before the interaction with the environment @@ -278,18 +287,17 @@ def _update(self, timestep: int, timesteps: int) -> None: # compute target values with torch.no_grad(): - next_q_values, _, _ = self.target_q_network.act(states=sampled_next_states, taken_actions=None, role="target_q_network") - - target_q_values = torch.gather(next_q_values, dim=1, index=torch.argmax(self.q_network.act(states=sampled_next_states, \ - taken_actions=None, role="q_network")[0], dim=1, keepdim=True)) + next_q_values, _, _ = self.target_q_network.act({"states": sampled_next_states}, role="target_q_network") + + target_q_values = torch.gather(next_q_values, dim=1, index=torch.argmax(self.q_network.act({"states": 
sampled_next_states}, \ + role="q_network")[0], dim=1, keepdim=True)) target_values = sampled_rewards + self._discount_factor * sampled_dones.logical_not() * target_q_values # compute Q-network loss - q_values = torch.gather(self.q_network.act(states=sampled_states, taken_actions=None, role="q_network")[0], - dim=1, index=sampled_actions.long()) + q_values = torch.gather(self.q_network.act({"states": sampled_states}, role="q_network")[0], dim=1, index=sampled_actions.long()) q_network_loss = F.mse_loss(q_values, target_values) - + # optimize Q-network self.optimizer.zero_grad() q_network_loss.backward() diff --git a/skrl/agents/torch/dqn/dqn.py b/skrl/agents/torch/dqn/dqn.py index 0a18ece2..d8c74333 100644 --- a/skrl/agents/torch/dqn/dqn.py +++ b/skrl/agents/torch/dqn/dqn.py @@ -1,25 +1,25 @@ -from typing import Union, Tuple, Dict, Any +from typing import Union, Tuple, Dict, Any, Optional -import gym +import gym, gymnasium import copy import math import torch import torch.nn.functional as F -from ....memories.torch import Memory -from ....models.torch import Model +from skrl.memories.torch import Memory +from skrl.models.torch import Model -from .. import Agent +from skrl.agents.torch import Agent DQN_DEFAULT_CONFIG = { "gradient_steps": 1, # gradient steps "batch_size": 64, # training batch size - + "discount_factor": 0.99, # discount factor (gamma) "polyak": 0.005, # soft update hyperparameter (tau) - + "learning_rate": 1e-3, # learning rate "learning_rate_scheduler": None, # learning rate scheduler class (see torch.optim.lr_scheduler) "learning_rate_scheduler_kwargs": {}, # learning rate scheduler's kwargs (e.g. {"step_size": 1e-3}) @@ -36,7 +36,7 @@ "exploration": { "initial_epsilon": 1.0, # initial epsilon for epsilon-greedy exploration "final_epsilon": 0.05, # final epsilon for epsilon-greedy exploration - "timesteps": 1000, # timesteps for epsilon-greedy decay + "timesteps": 1000, # timesteps for epsilon-greedy decay }, "rewards_shaper": None, # rewards shaping function: Callable(reward, timestep, timesteps) -> reward @@ -48,33 +48,37 @@ "checkpoint_interval": 1000, # interval for checkpoints (timesteps) "store_separately": False, # whether to store checkpoints separately + + "wandb": False, # whether to use Weights & Biases + "wandb_kwargs": {} # wandb kwargs (see https://docs.wandb.ai/ref/python/init) } } class DQN(Agent): - def __init__(self, - models: Dict[str, Model], - memory: Union[Memory, Tuple[Memory], None] = None, - observation_space: Union[int, Tuple[int], gym.Space, None] = None, - action_space: Union[int, Tuple[int], gym.Space, None] = None, - device: Union[str, torch.device] = "cuda:0", - cfg: dict = {}) -> None: + def __init__(self, + models: Dict[str, Model], + memory: Optional[Union[Memory, Tuple[Memory]]] = None, + observation_space: Optional[Union[int, Tuple[int], gym.Space, gymnasium.Space]] = None, + action_space: Optional[Union[int, Tuple[int], gym.Space, gymnasium.Space]] = None, + device: Optional[Union[str, torch.device]] = None, + cfg: Optional[dict] = None) -> None: """Deep Q-Network (DQN) https://arxiv.org/abs/1312.5602 - + :param models: Models used by the agent :type models: dictionary of skrl.models.torch.Model :param memory: Memory to storage the transitions. 
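Note on the DDQN `_update()` just above: it differs from the plain DQN version further down in `dqn.py` only in how the target is formed, with the online Q-network selecting the greedy next action and the target network evaluating it. A self-contained sketch of the two target computations (plain tensors with illustrative shapes, not the skrl models API):

```python
# Standalone sketch (plain tensors, not the skrl models API) of the two targets:
# vanilla DQN takes the max over the target network's Q-values; double DQN lets
# the online network pick the action and the target network evaluate it.
import torch

def dqn_target(next_q_target, rewards, terminated, gamma=0.99):
    max_q = torch.max(next_q_target, dim=1, keepdim=True)[0]
    return rewards + gamma * terminated.logical_not() * max_q

def ddqn_target(next_q_online, next_q_target, rewards, terminated, gamma=0.99):
    best_actions = torch.argmax(next_q_online, dim=1, keepdim=True)
    return rewards + gamma * terminated.logical_not() * torch.gather(next_q_target, dim=1, index=best_actions)

# toy batch: 4 transitions, 3 discrete actions
q_online, q_target = torch.rand(4, 3), torch.rand(4, 3)
rewards, terminated = torch.rand(4, 1), torch.zeros(4, 1, dtype=torch.bool)
print(dqn_target(q_target, rewards, terminated))
print(ddqn_target(q_online, q_target, rewards, terminated))
```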
- If it is a tuple, the first element will be used for training and + If it is a tuple, the first element will be used for training and for the rest only the environment transitions will be added :type memory: skrl.memory.torch.Memory, list of skrl.memory.torch.Memory or None :param observation_space: Observation/state space or shape (default: None) - :type observation_space: int, tuple or list of integers, gym.Space or None, optional + :type observation_space: int, tuple or list of integers, gym.Space, gymnasium.Space or None, optional :param action_space: Action space or shape (default: None) - :type action_space: int, tuple or list of integers, gym.Space or None, optional - :param device: Computing device (default: "cuda:0") + :type action_space: int, tuple or list of integers, gym.Space, gymnasium.Space or None, optional + :param device: Device on which a torch tensor is or will be allocated (default: ``None``). + If None, the device will be either ``"cuda:0"`` if available or ``"cpu"`` :type device: str or torch.device, optional :param cfg: Configuration dictionary :type cfg: dict @@ -82,12 +86,12 @@ def __init__(self, :raises KeyError: If the models dictionary is missing a required key """ _cfg = copy.deepcopy(DQN_DEFAULT_CONFIG) - _cfg.update(cfg) - super().__init__(models=models, - memory=memory, - observation_space=observation_space, - action_space=action_space, - device=device, + _cfg.update(cfg if cfg is not None else {}) + super().__init__(models=models, + memory=memory, + observation_space=observation_space, + action_space=action_space, + device=device, cfg=_cfg) # models @@ -97,7 +101,7 @@ def __init__(self, # checkpoint models self.checkpoint_modules["q_network"] = self.q_network self.checkpoint_modules["target_q_network"] = self.target_q_network - + if self.target_q_network is not None: # freeze target networks with respect to optimizers (update via .update_parameters()) self.target_q_network.freeze_parameters(True) @@ -108,13 +112,13 @@ def __init__(self, # configuration self._gradient_steps = self.cfg["gradient_steps"] self._batch_size = self.cfg["batch_size"] - + self._discount_factor = self.cfg["discount_factor"] self._polyak = self.cfg["polyak"] self._learning_rate = self.cfg["learning_rate"] self._learning_rate_scheduler = self.cfg["learning_rate_scheduler"] - + self._state_preprocessor = self.cfg["state_preprocessor"] self._random_timesteps = self.cfg["random_timesteps"] @@ -128,7 +132,7 @@ def __init__(self, self._exploration_timesteps = self.cfg["exploration"]["timesteps"] self._rewards_shaper = self.cfg["rewards_shaper"] - + # set up optimizer and learning rate scheduler if self.q_network is not None: self.optimizer = torch.optim.Adam(self.q_network.parameters(), lr=self._learning_rate) @@ -144,20 +148,20 @@ def __init__(self, else: self._state_preprocessor = self._empty_preprocessor - def init(self) -> None: + def init(self, trainer_cfg: Optional[Dict[str, Any]] = None) -> None: """Initialize the agent """ - super().init() - + super().init(trainer_cfg=trainer_cfg) + # create tensors in memory if self.memory is not None: self.memory.create_tensor(name="states", size=self.observation_space, dtype=torch.float32) self.memory.create_tensor(name="next_states", size=self.observation_space, dtype=torch.float32) self.memory.create_tensor(name="actions", size=self.action_space, dtype=torch.int64) self.memory.create_tensor(name="rewards", size=1, dtype=torch.float32) - self.memory.create_tensor(name="dones", size=1, dtype=torch.bool) + 
self.memory.create_tensor(name="terminated", size=1, dtype=torch.bool) - self.tensors_names = ["states", "actions", "rewards", "next_states", "dones"] + self.tensors_names = ["states", "actions", "rewards", "next_states", "terminated"] def act(self, states: torch.Tensor, timestep: int, timesteps: int) -> torch.Tensor: """Process the environment's states to make a decision (actions) using the main policy @@ -175,10 +179,10 @@ def act(self, states: torch.Tensor, timestep: int, timesteps: int) -> torch.Tens states = self._state_preprocessor(states) if not self._exploration_timesteps: - return torch.argmax(self.q_network.act(states, taken_actions=None, role="q_network")[0], dim=1, keepdim=True), None, None + return torch.argmax(self.q_network.act({"states": states}, role="q_network")[0], dim=1, keepdim=True), None, None # sample random actions - actions = self.q_network.random_act(states, taken_actions=None, role="q_network")[0] + actions = self.q_network.random_act({"states": states}, role="q_network")[0] if timestep < self._random_timesteps: return actions, None, None @@ -188,24 +192,25 @@ def act(self, states: torch.Tensor, timestep: int, timesteps: int) -> torch.Tens indexes = (torch.rand(states.shape[0], device=self.device) >= epsilon).nonzero().view(-1) if indexes.numel(): - actions[indexes] = torch.argmax(self.q_network.act(states[indexes], taken_actions=None, role="q_network")[0], dim=1, keepdim=True) - + actions[indexes] = torch.argmax(self.q_network.act({"states": states[indexes]}, role="q_network")[0], dim=1, keepdim=True) + # record epsilon self.track_data("Exploration / Exploration epsilon", epsilon) - + return actions, None, None - def record_transition(self, - states: torch.Tensor, - actions: torch.Tensor, - rewards: torch.Tensor, - next_states: torch.Tensor, - dones: torch.Tensor, - infos: Any, - timestep: int, + def record_transition(self, + states: torch.Tensor, + actions: torch.Tensor, + rewards: torch.Tensor, + next_states: torch.Tensor, + terminated: torch.Tensor, + truncated: torch.Tensor, + infos: Any, + timestep: int, timesteps: int) -> None: """Record an environment transition in memory - + :param states: Observations/states of the environment used to make the decision :type states: torch.Tensor :param actions: Actions taken by the agent @@ -214,8 +219,10 @@ def record_transition(self, :type rewards: torch.Tensor :param next_states: Next observations/states of the environment :type next_states: torch.Tensor - :param dones: Signals to indicate that episodes have ended - :type dones: torch.Tensor + :param terminated: Signals to indicate that episodes have terminated + :type terminated: torch.Tensor + :param truncated: Signals to indicate that episodes have been truncated + :type truncated: torch.Tensor :param infos: Additional information about the environment :type infos: Any type supported by the environment :param timestep: Current timestep @@ -223,16 +230,18 @@ def record_transition(self, :param timesteps: Number of timesteps :type timesteps: int """ - super().record_transition(states, actions, rewards, next_states, dones, infos, timestep, timesteps) + super().record_transition(states, actions, rewards, next_states, terminated, truncated, infos, timestep, timesteps) - # reward shaping - if self._rewards_shaper is not None: - rewards = self._rewards_shaper(rewards, timestep, timesteps) - if self.memory is not None: - self.memory.add_samples(states=states, actions=actions, rewards=rewards, next_states=next_states, dones=dones) + # reward shaping + if 
self._rewards_shaper is not None: + rewards = self._rewards_shaper(rewards, timestep, timesteps) + + self.memory.add_samples(states=states, actions=actions, rewards=rewards, next_states=next_states, + terminated=terminated, truncated=truncated) for memory in self.secondary_memories: - memory.add_samples(states=states, actions=actions, rewards=rewards, next_states=next_states, dones=dones) + memory.add_samples(states=states, actions=actions, rewards=rewards, next_states=next_states, + terminated=terminated, truncated=truncated) def pre_interaction(self, timestep: int, timesteps: int) -> None: """Callback called before the interaction with the environment @@ -278,17 +287,17 @@ def _update(self, timestep: int, timesteps: int) -> None: # compute target values with torch.no_grad(): - next_q_values, _, _ = self.target_q_network.act(states=sampled_next_states, taken_actions=None, role="target_q_network") - + next_q_values, _, _ = self.target_q_network.act({"states": sampled_next_states}, role="target_q_network") + target_q_values = torch.max(next_q_values, dim=-1, keepdim=True)[0] target_values = sampled_rewards + self._discount_factor * sampled_dones.logical_not() * target_q_values # compute Q-network loss - q_values = torch.gather(self.q_network.act(states=sampled_states, taken_actions=None, role="q_network")[0], + q_values = torch.gather(self.q_network.act({"states": sampled_states}, role="q_network")[0], dim=1, index=sampled_actions.long()) q_network_loss = F.mse_loss(q_values, target_values) - + # optimize Q-network self.optimizer.zero_grad() q_network_loss.backward() diff --git a/skrl/agents/torch/ppo/__init__.py b/skrl/agents/torch/ppo/__init__.py index e439116a..a422f115 100644 --- a/skrl/agents/torch/ppo/__init__.py +++ b/skrl/agents/torch/ppo/__init__.py @@ -1 +1 @@ -from .ppo import PPO, PPO_DEFAULT_CONFIG \ No newline at end of file +from skrl.agents.torch.ppo.ppo import PPO, PPO_DEFAULT_CONFIG diff --git a/skrl/agents/torch/ppo/ppo.py b/skrl/agents/torch/ppo/ppo.py index 2fa6081d..aba0feed 100644 --- a/skrl/agents/torch/ppo/ppo.py +++ b/skrl/agents/torch/ppo/ppo.py @@ -1,6 +1,6 @@ -from typing import Union, Tuple, Dict, Any +from typing import Union, Tuple, Dict, Any, Optional -import gym +import gym, gymnasium import copy import itertools @@ -8,21 +8,21 @@ import torch.nn as nn import torch.nn.functional as F -from ....memories.torch import Memory -from ....models.torch import Model -from ....resources.schedulers.torch import KLAdaptiveRL +from skrl.memories.torch import Memory +from skrl.models.torch import Model +from skrl.resources.schedulers.torch import KLAdaptiveRL -from .. import Agent +from skrl.agents.torch import Agent PPO_DEFAULT_CONFIG = { "rollouts": 16, # number of rollouts before updating "learning_epochs": 8, # number of learning epochs during each update "mini_batches": 2, # number of mini batches during each learning epoch - + "discount_factor": 0.99, # discount factor (gamma) "lambda": 0.95, # TD(lambda) coefficient (lam) for computing returns and advantages - + "learning_rate": 1e-3, # learning rate "learning_rate_scheduler": None, # learning rate scheduler class (see torch.optim.lr_scheduler) "learning_rate_scheduler_kwargs": {}, # learning rate scheduler's kwargs (e.g. 
{"step_size": 1e-3}) @@ -54,33 +54,37 @@ "checkpoint_interval": 1000, # interval for checkpoints (timesteps) "store_separately": False, # whether to store checkpoints separately + + "wandb": False, # whether to use Weights & Biases + "wandb_kwargs": {} # wandb kwargs (see https://docs.wandb.ai/ref/python/init) } } class PPO(Agent): - def __init__(self, - models: Dict[str, Model], - memory: Union[Memory, Tuple[Memory], None] = None, - observation_space: Union[int, Tuple[int], gym.Space, None] = None, - action_space: Union[int, Tuple[int], gym.Space, None] = None, - device: Union[str, torch.device] = "cuda:0", - cfg: dict = {}) -> None: + def __init__(self, + models: Dict[str, Model], + memory: Optional[Union[Memory, Tuple[Memory]]] = None, + observation_space: Optional[Union[int, Tuple[int], gym.Space, gymnasium.Space]] = None, + action_space: Optional[Union[int, Tuple[int], gym.Space, gymnasium.Space]] = None, + device: Optional[Union[str, torch.device]] = None, + cfg: Optional[dict] = None) -> None: """Proximal Policy Optimization (PPO) https://arxiv.org/abs/1707.06347 - + :param models: Models used by the agent :type models: dictionary of skrl.models.torch.Model :param memory: Memory to storage the transitions. - If it is a tuple, the first element will be used for training and + If it is a tuple, the first element will be used for training and for the rest only the environment transitions will be added :type memory: skrl.memory.torch.Memory, list of skrl.memory.torch.Memory or None :param observation_space: Observation/state space or shape (default: None) - :type observation_space: int, tuple or list of integers, gym.Space or None, optional + :type observation_space: int, tuple or list of integers, gym.Space, gymnasium.Space or None, optional :param action_space: Action space or shape (default: None) - :type action_space: int, tuple or list of integers, gym.Space or None, optional - :param device: Computing device (default: "cuda:0") + :type action_space: int, tuple or list of integers, gym.Space, gymnasium.Space or None, optional + :param device: Device on which a torch tensor is or will be allocated (default: ``None``). 
+ If None, the device will be either ``"cuda:0"`` if available or ``"cpu"`` :type device: str or torch.device, optional :param cfg: Configuration dictionary :type cfg: dict @@ -88,12 +92,12 @@ def __init__(self, :raises KeyError: If the models dictionary is missing a required key """ _cfg = copy.deepcopy(PPO_DEFAULT_CONFIG) - _cfg.update(cfg) - super().__init__(models=models, - memory=memory, - observation_space=observation_space, - action_space=action_space, - device=device, + _cfg.update(cfg if cfg is not None else {}) + super().__init__(models=models, + memory=memory, + observation_space=observation_space, + action_space=action_space, + device=device, cfg=_cfg) # models @@ -139,7 +143,7 @@ def __init__(self, if self.policy is self.value: self.optimizer = torch.optim.Adam(self.policy.parameters(), lr=self._learning_rate) else: - self.optimizer = torch.optim.Adam(itertools.chain(self.policy.parameters(), self.value.parameters()), + self.optimizer = torch.optim.Adam(itertools.chain(self.policy.parameters(), self.value.parameters()), lr=self._learning_rate) if self._learning_rate_scheduler is not None: self.scheduler = self._learning_rate_scheduler(self.optimizer, **self.cfg["learning_rate_scheduler_kwargs"]) @@ -159,24 +163,56 @@ def __init__(self, else: self._value_preprocessor = self._empty_preprocessor - def init(self) -> None: + def init(self, trainer_cfg: Optional[Dict[str, Any]] = None) -> None: """Initialize the agent """ - super().init() + super().init(trainer_cfg=trainer_cfg) self.set_mode("eval") - + # create tensors in memory if self.memory is not None: self.memory.create_tensor(name="states", size=self.observation_space, dtype=torch.float32) self.memory.create_tensor(name="actions", size=self.action_space, dtype=torch.float32) self.memory.create_tensor(name="rewards", size=1, dtype=torch.float32) - self.memory.create_tensor(name="dones", size=1, dtype=torch.bool) + self.memory.create_tensor(name="terminated", size=1, dtype=torch.bool) self.memory.create_tensor(name="log_prob", size=1, dtype=torch.float32) self.memory.create_tensor(name="values", size=1, dtype=torch.float32) self.memory.create_tensor(name="returns", size=1, dtype=torch.float32) self.memory.create_tensor(name="advantages", size=1, dtype=torch.float32) - self.tensors_names = ["states", "actions", "rewards", "dones", "log_prob", "values", "returns", "advantages"] + # tensors sampled during training + self._tensors_names = ["states", "actions", "terminated", "log_prob", "values", "returns", "advantages"] + + # RNN specifications + self._rnn = False # flag to indicate whether RNN is available + self._rnn_tensors_names = [] # used for sampling during training + self._rnn_final_states = {"policy": [], "value": []} + self._rnn_initial_states = {"policy": [], "value": []} + self._rnn_sequence_length = self.policy.get_specification().get("rnn", {}).get("sequence_length", 1) + + # policy + for i, size in enumerate(self.policy.get_specification().get("rnn", {}).get("sizes", [])): + self._rnn = True + # create tensors in memory + if self.memory is not None: + self.memory.create_tensor(name=f"rnn_policy_{i}", size=(size[0], size[2]), dtype=torch.float32, keep_dimensions=True) + self._rnn_tensors_names.append(f"rnn_policy_{i}") + # default RNN states + self._rnn_initial_states["policy"].append(torch.zeros(size, dtype=torch.float32, device=self.device)) + + # value + if self.value is not None: + if self.policy is self.value: + self._rnn_initial_states["value"] = self._rnn_initial_states["policy"] + else: + for i, size in 
enumerate(self.value.get_specification().get("rnn", {}).get("sizes", [])): + self._rnn = True + # create tensors in memory + if self.memory is not None: + self.memory.create_tensor(name=f"rnn_value_{i}", size=(size[0], size[2]), dtype=torch.float32, keep_dimensions=True) + self._rnn_tensors_names.append(f"rnn_value_{i}") + # default RNN states + self._rnn_initial_states["value"].append(torch.zeros(size, dtype=torch.float32, device=self.device)) # create temporary variables needed for storage and computation self._current_log_prob = None @@ -195,30 +231,34 @@ def act(self, states: torch.Tensor, timestep: int, timesteps: int) -> torch.Tens :return: Actions :rtype: torch.Tensor """ - states = self._state_preprocessor(states) + rnn = {"rnn": self._rnn_initial_states["policy"]} if self._rnn else {} # sample random actions - # TODO, check for stochasticity + # TODO: fix for stochasticity, rnn and log_prob if timestep < self._random_timesteps: - return self.policy.random_act(states, taken_actions=None, role="policy") + return self.policy.random_act({"states": self._state_preprocessor(states), **rnn}, role="policy") # sample stochastic actions - actions, log_prob, actions_mean = self.policy.act(states, taken_actions=None, role="policy") + actions, log_prob, outputs = self.policy.act({"states": self._state_preprocessor(states), **rnn}, role="policy") self._current_log_prob = log_prob - return actions, log_prob, actions_mean + if self._rnn: + self._rnn_final_states["policy"] = outputs.get("rnn", []) + + return actions, log_prob, outputs - def record_transition(self, - states: torch.Tensor, - actions: torch.Tensor, - rewards: torch.Tensor, - next_states: torch.Tensor, - dones: torch.Tensor, - infos: Any, - timestep: int, + def record_transition(self, + states: torch.Tensor, + actions: torch.Tensor, + rewards: torch.Tensor, + next_states: torch.Tensor, + terminated: torch.Tensor, + truncated: torch.Tensor, + infos: Any, + timestep: int, timesteps: int) -> None: """Record an environment transition in memory - + :param states: Observations/states of the environment used to make the decision :type states: torch.Tensor :param actions: Actions taken by the agent @@ -227,8 +267,10 @@ def record_transition(self, :type rewards: torch.Tensor :param next_states: Next observations/states of the environment :type next_states: torch.Tensor - :param dones: Signals to indicate that episodes have ended - :type dones: torch.Tensor + :param terminated: Signals to indicate that episodes have terminated + :type terminated: torch.Tensor + :param truncated: Signals to indicate that episodes have been truncated + :type truncated: torch.Tensor :param infos: Additional information about the environment :type infos: Any type supported by the environment :param timestep: Current timestep @@ -236,24 +278,48 @@ def record_transition(self, :param timesteps: Number of timesteps :type timesteps: int """ - super().record_transition(states, actions, rewards, next_states, dones, infos, timestep, timesteps) + super().record_transition(states, actions, rewards, next_states, terminated, truncated, infos, timestep, timesteps) - # reward shaping - if self._rewards_shaper is not None: - rewards = self._rewards_shaper(rewards, timestep, timesteps) + if self.memory is not None: + self._current_next_states = next_states - self._current_next_states = next_states + # reward shaping + if self._rewards_shaper is not None: + rewards = self._rewards_shaper(rewards, timestep, timesteps) - if self.memory is not None: - with torch.no_grad(): - values, 
_, _ = self.value.act(states=self._state_preprocessor(states), taken_actions=None, role="value") + # compute values + rnn = {"rnn": self._rnn_initial_states["value"]} if self._rnn else {} + values, _, outputs = self.value.act({"states": self._state_preprocessor(states), **rnn}, role="value") values = self._value_preprocessor(values, inverse=True) - self.memory.add_samples(states=states, actions=actions, rewards=rewards, next_states=next_states, dones=dones, - log_prob=self._current_log_prob, values=values) + # package RNN states + rnn_states = {} + if self._rnn: + rnn_states.update({f"rnn_policy_{i}": s.transpose(0, 1) for i, s in enumerate(self._rnn_initial_states["policy"])}) + if self.policy is not self.value: + rnn_states.update({f"rnn_value_{i}": s.transpose(0, 1) for i, s in enumerate(self._rnn_initial_states["value"])}) + + # storage transition in memory + self.memory.add_samples(states=states, actions=actions, rewards=rewards, next_states=next_states, + terminated=terminated, truncated=truncated, log_prob=self._current_log_prob, values=values, **rnn_states) for memory in self.secondary_memories: - memory.add_samples(states=states, actions=actions, rewards=rewards, next_states=next_states, dones=dones, - log_prob=self._current_log_prob, values=values) + memory.add_samples(states=states, actions=actions, rewards=rewards, next_states=next_states, + terminated=terminated, truncated=truncated, log_prob=self._current_log_prob, values=values, **rnn_states) + + # update RNN states + if self._rnn: + self._rnn_final_states["value"] = self._rnn_final_states["policy"] if self.policy is self.value else outputs.get("rnn", []) + + # reset states if the episodes have ended + finished_episodes = terminated.nonzero(as_tuple=False) + if finished_episodes.numel(): + for rnn_state in self._rnn_final_states["policy"]: + rnn_state[:, finished_episodes[:, 0]] = 0 + if self.policy is not self.value: + for rnn_state in self._rnn_final_states["value"]: + rnn_state[:, finished_episodes[:, 0]] = 0 + + self._rnn_initial_states = self._rnn_final_states def pre_interaction(self, timestep: int, timesteps: int) -> None: """Callback called before the interaction with the environment @@ -290,11 +356,11 @@ def _update(self, timestep: int, timesteps: int) -> None: :param timesteps: Number of timesteps :type timesteps: int """ - def compute_gae(rewards: torch.Tensor, - dones: torch.Tensor, - values: torch.Tensor, - next_values: torch.Tensor, - discount_factor: float = 0.99, + def compute_gae(rewards: torch.Tensor, + dones: torch.Tensor, + values: torch.Tensor, + next_values: torch.Tensor, + discount_factor: float = 0.99, lambda_coefficient: float = 0.95) -> torch.Tensor: """Compute the Generalized Advantage Estimator (GAE) @@ -330,15 +396,18 @@ def compute_gae(rewards: torch.Tensor, advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8) return returns, advantages - + # compute returns and advantages with torch.no_grad(): - last_values, _, _ = self.value.act(self._state_preprocessor(self._current_next_states.float()), taken_actions=None, role="value") + self.value.train(False) + rnn = {"rnn": self._rnn_initial_states["value"]} if self._rnn else {} + last_values, _, _ = self.value.act({"states": self._state_preprocessor(self._current_next_states.float()), **rnn}, role="value") + self.value.train(True) last_values = self._value_preprocessor(last_values, inverse=True) values = self.memory.get_tensor_by_name("values") returns, advantages = compute_gae(rewards=self.memory.get_tensor_by_name("rewards"), - 
dones=self.memory.get_tensor_by_name("dones"), + dones=self.memory.get_tensor_by_name("terminated"), values=values, next_values=last_values, discount_factor=self._discount_factor, @@ -349,7 +418,11 @@ def compute_gae(rewards: torch.Tensor, self.memory.set_tensor_by_name("advantages", advantages) # sample mini-batches from memory - sampled_batches = self.memory.sample_all(names=self.tensors_names, mini_batches=self._mini_batches) + sampled_batches = self.memory.sample_all(names=self._tensors_names, mini_batches=self._mini_batches, sequence_length=self._rnn_sequence_length) + + rnn_policy, rnn_value = {}, {} + if self._rnn: + sampled_rnn_batches = self.memory.sample_all(names=self._rnn_tensors_names, mini_batches=self._mini_batches, sequence_length=self._rnn_sequence_length) cumulative_policy_loss = 0 cumulative_entropy_loss = 0 @@ -360,12 +433,19 @@ def compute_gae(rewards: torch.Tensor, kl_divergences = [] # mini-batches loop - for sampled_states, sampled_actions, _, _, sampled_log_prob, sampled_values, sampled_returns, sampled_advantages \ - in sampled_batches: + for i, (sampled_states, sampled_actions, sampled_dones, sampled_log_prob, sampled_values, sampled_returns, sampled_advantages) in enumerate(sampled_batches): + + if self._rnn: + if self.policy is self.value: + rnn_policy = {"rnn": [s.transpose(0, 1) for s in sampled_rnn_batches[i]], "terminated": sampled_dones} + rnn_value = rnn_policy + else: + rnn_policy = {"rnn": [s.transpose(0, 1) for s, n in zip(sampled_rnn_batches[i], self._rnn_tensors_names) if "policy" in n], "terminated": sampled_dones} + rnn_value = {"rnn": [s.transpose(0, 1) for s, n in zip(sampled_rnn_batches[i], self._rnn_tensors_names) if "value" in n], "terminated": sampled_dones} sampled_states = self._state_preprocessor(sampled_states, train=not epoch) - - _, next_log_prob, _ = self.policy.act(states=sampled_states, taken_actions=sampled_actions, role="policy") + + _, next_log_prob, _ = self.policy.act({"states": sampled_states, "taken_actions": sampled_actions, **rnn_policy}, role="policy") # compute aproximate KL divergence with torch.no_grad(): @@ -382,20 +462,20 @@ def compute_gae(rewards: torch.Tensor, entropy_loss = -self._entropy_loss_scale * self.policy.get_entropy(role="policy").mean() else: entropy_loss = 0 - + # compute policy loss ratio = torch.exp(next_log_prob - sampled_log_prob) surrogate = sampled_advantages * ratio surrogate_clipped = sampled_advantages * torch.clip(ratio, 1.0 - self._ratio_clip, 1.0 + self._ratio_clip) - + policy_loss = -torch.min(surrogate, surrogate_clipped).mean() # compute value loss - predicted_values, _, _ = self.value.act(states=sampled_states, taken_actions=None, role="value") + predicted_values, _, _ = self.value.act({"states": sampled_states, **rnn_value}, role="value") if self._clip_predicted_values: - predicted_values = sampled_values + torch.clip(predicted_values - sampled_values, - min=-self._value_clip, + predicted_values = sampled_values + torch.clip(predicted_values - sampled_values, + min=-self._value_clip, max=self._value_clip) value_loss = self._value_loss_scale * F.mse_loss(sampled_returns, predicted_values) @@ -414,10 +494,10 @@ def compute_gae(rewards: torch.Tensor, cumulative_value_loss += value_loss.item() if self._entropy_loss_scale: cumulative_entropy_loss += entropy_loss.item() - + # update learning rate if self._learning_rate_scheduler: - if isinstance(self.scheduler, KLAdaptiveRL): + if isinstance(self.scheduler, KLAdaptiveRL): self.scheduler.step(torch.tensor(kl_divergences).mean()) else: 
self.scheduler.step() diff --git a/skrl/agents/torch/q_learning/__init__.py b/skrl/agents/torch/q_learning/__init__.py index dd2a6413..a2b43800 100644 --- a/skrl/agents/torch/q_learning/__init__.py +++ b/skrl/agents/torch/q_learning/__init__.py @@ -1 +1 @@ -from .q_learning import Q_LEARNING, Q_LEARNING_DEFAULT_CONFIG \ No newline at end of file +from skrl.agents.torch.q_learning.q_learning import Q_LEARNING, Q_LEARNING_DEFAULT_CONFIG diff --git a/skrl/agents/torch/q_learning/q_learning.py b/skrl/agents/torch/q_learning/q_learning.py index f13a0c4a..e6868334 100644 --- a/skrl/agents/torch/q_learning/q_learning.py +++ b/skrl/agents/torch/q_learning/q_learning.py @@ -1,14 +1,14 @@ -from typing import Union, Tuple, Dict, Any +from typing import Union, Tuple, Dict, Any, Optional -import gym +import gym, gymnasium import copy import torch -from ....memories.torch import Memory -from ....models.torch import Model +from skrl.memories.torch import Memory +from skrl.models.torch import Model -from .. import Agent +from skrl.agents.torch import Agent Q_LEARNING_DEFAULT_CONFIG = { @@ -28,33 +28,37 @@ "checkpoint_interval": 1000, # interval for checkpoints (timesteps) "store_separately": False, # whether to store checkpoints separately + + "wandb": False, # whether to use Weights & Biases + "wandb_kwargs": {} # wandb kwargs (see https://docs.wandb.ai/ref/python/init) } } class Q_LEARNING(Agent): - def __init__(self, - models: Dict[str, Model], - memory: Union[Memory, Tuple[Memory], None] = None, - observation_space: Union[int, Tuple[int], gym.Space, None] = None, - action_space: Union[int, Tuple[int], gym.Space, None] = None, - device: Union[str, torch.device] = "cuda:0", - cfg: dict = {}) -> None: + def __init__(self, + models: Dict[str, Model], + memory: Optional[Union[Memory, Tuple[Memory]]] = None, + observation_space: Optional[Union[int, Tuple[int], gym.Space, gymnasium.Space]] = None, + action_space: Optional[Union[int, Tuple[int], gym.Space, gymnasium.Space]] = None, + device: Optional[Union[str, torch.device]] = None, + cfg: Optional[dict] = None) -> None: """Q-learning https://www.academia.edu/3294050/Learning_from_delayed_rewards - + :param models: Models used by the agent :type models: dictionary of skrl.models.torch.Model :param memory: Memory to storage the transitions. - If it is a tuple, the first element will be used for training and + If it is a tuple, the first element will be used for training and for the rest only the environment transitions will be added :type memory: skrl.memory.torch.Memory, list of skrl.memory.torch.Memory or None :param observation_space: Observation/state space or shape (default: None) - :type observation_space: int, tuple or list of integers, gym.Space or None, optional + :type observation_space: int, tuple or list of integers, gym.Space, gymnasium.Space or None, optional :param action_space: Action space or shape (default: None) - :type action_space: int, tuple or list of integers, gym.Space or None, optional - :param device: Computing device (default: "cuda:0") + :type action_space: int, tuple or list of integers, gym.Space, gymnasium.Space or None, optional + :param device: Device on which a torch tensor is or will be allocated (default: ``None``). 
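PPO's `_update()` above now feeds the `terminated` tensor (rather than `dones`) into `compute_gae`. Since most of that helper's body falls outside the visible hunks, here is an illustrative reimplementation of the standard GAE recursion it performs, ending with the same advantage normalization shown in the diff (a sketch, not the library code verbatim):

```python
# Illustrative GAE recursion: deltas are accumulated backwards in time and
# masked by the episode-termination signal, returns are recovered from the
# raw advantages, and the advantages are normalized last.
import torch

def compute_gae(rewards, dones, values, next_values, discount_factor=0.99, lambda_coefficient=0.95):
    advantage = 0
    advantages = torch.zeros_like(rewards)
    not_dones = dones.logical_not()
    memory_size = rewards.shape[0]
    for i in reversed(range(memory_size)):
        next_value = values[i + 1] if i < memory_size - 1 else next_values
        advantage = rewards[i] - values[i] + discount_factor * not_dones[i] * (next_value + lambda_coefficient * advantage)
        advantages[i] = advantage
    returns = advantages + values
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
    return returns, advantages

# toy rollout of 16 steps for a single environment
returns, advantages = compute_gae(torch.rand(16, 1), torch.zeros(16, 1, dtype=torch.bool),
                                  torch.rand(16, 1), torch.rand(1))
```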
+ If None, the device will be either ``"cuda:0"`` if available or ``"cpu"`` :type device: str or torch.device, optional :param cfg: Configuration dictionary :type cfg: dict @@ -62,12 +66,12 @@ def __init__(self, :raises KeyError: If the models dictionary is missing a required key """ _cfg = copy.deepcopy(Q_LEARNING_DEFAULT_CONFIG) - _cfg.update(cfg) - super().__init__(models=models, - memory=memory, - observation_space=observation_space, - action_space=action_space, - device=device, + _cfg.update(cfg if cfg is not None else {}) + super().__init__(models=models, + memory=memory, + observation_space=observation_space, + action_space=action_space, + device=device, cfg=_cfg) # models @@ -75,10 +79,10 @@ def __init__(self, # checkpoint models self.checkpoint_modules["policy"] = self.policy - + # configuration self._discount_factor = self.cfg["discount_factor"] - + self._random_timesteps = self.cfg["random_timesteps"] self._learning_starts = self.cfg["learning_starts"] @@ -93,10 +97,10 @@ def __init__(self, self._current_next_states = None self._current_dones = None - def init(self) -> None: + def init(self, trainer_cfg: Optional[Dict[str, Any]] = None) -> None: """Initialize the agent """ - super().init() + super().init(trainer_cfg=trainer_cfg) def act(self, states: torch.Tensor, timestep: int, timesteps: int) -> torch.Tensor: """Process the environment's states to make a decision (actions) using the main policy @@ -113,22 +117,23 @@ def act(self, states: torch.Tensor, timestep: int, timesteps: int) -> torch.Tens """ # sample random actions if timestep < self._random_timesteps: - return self.policy.random_act(states, taken_actions=None, role="policy") + return self.policy.random_act({"states": states}, role="policy") # sample actions from policy - return self.policy.act(states, taken_actions=None, role="policy") - - def record_transition(self, - states: torch.Tensor, - actions: torch.Tensor, - rewards: torch.Tensor, - next_states: torch.Tensor, - dones: torch.Tensor, - infos: Any, - timestep: int, + return self.policy.act({"states": states}, role="policy") + + def record_transition(self, + states: torch.Tensor, + actions: torch.Tensor, + rewards: torch.Tensor, + next_states: torch.Tensor, + terminated: torch.Tensor, + truncated: torch.Tensor, + infos: Any, + timestep: int, timesteps: int) -> None: """Record an environment transition in memory - + :param states: Observations/states of the environment used to make the decision :type states: torch.Tensor :param actions: Actions taken by the agent @@ -137,8 +142,10 @@ def record_transition(self, :type rewards: torch.Tensor :param next_states: Next observations/states of the environment :type next_states: torch.Tensor - :param dones: Signals to indicate that episodes have ended - :type dones: torch.Tensor + :param terminated: Signals to indicate that episodes have terminated + :type terminated: torch.Tensor + :param truncated: Signals to indicate that episodes have been truncated + :type truncated: torch.Tensor :param infos: Additional information about the environment :type infos: Any type supported by the environment :param timestep: Current timestep @@ -146,7 +153,7 @@ def record_transition(self, :param timesteps: Number of timesteps :type timesteps: int """ - super().record_transition(states, actions, rewards, next_states, dones, infos, timestep, timesteps) + super().record_transition(states, actions, rewards, next_states, terminated, truncated, infos, timestep, timesteps) # reward shaping if self._rewards_shaper is not None: @@ -156,12 +163,14 
@@ def record_transition(self, self._current_actions = actions self._current_rewards = rewards self._current_next_states = next_states - self._current_dones = dones + self._current_dones = terminated + truncated if self.memory is not None: - self.memory.add_samples(states=states, actions=actions, rewards=rewards, next_states=next_states, dones=dones) + self.memory.add_samples(states=states, actions=actions, rewards=rewards, next_states=next_states, + terminated=terminated, truncated=truncated) for memory in self.secondary_memories: - memory.add_samples(states=states, actions=actions, rewards=rewards, next_states=next_states, dones=dones) + memory.add_samples(states=states, actions=actions, rewards=rewards, next_states=next_states, + terminated=terminated, truncated=truncated) def pre_interaction(self, timestep: int, timesteps: int) -> None: """Callback called before the interaction with the environment @@ -206,4 +215,3 @@ def _update(self, timestep: int, timesteps: int) -> None: * (self._current_rewards + self._discount_factor * self._current_dones.logical_not() \ * q_table[env_ids, self._current_next_states, next_actions] \ - q_table[env_ids, self._current_states, self._current_actions]) - \ No newline at end of file diff --git a/skrl/agents/torch/sac/__init__.py b/skrl/agents/torch/sac/__init__.py index 704d9b8d..5cd0d7cb 100644 --- a/skrl/agents/torch/sac/__init__.py +++ b/skrl/agents/torch/sac/__init__.py @@ -1 +1 @@ -from .sac import SAC, SAC_DEFAULT_CONFIG \ No newline at end of file +from skrl.agents.torch.sac.sac import SAC, SAC_DEFAULT_CONFIG diff --git a/skrl/agents/torch/sac/sac.py b/skrl/agents/torch/sac/sac.py index 26145f59..5fd955df 100644 --- a/skrl/agents/torch/sac/sac.py +++ b/skrl/agents/torch/sac/sac.py @@ -1,26 +1,27 @@ -from typing import Union, Tuple, Dict, Any +from typing import Union, Tuple, Dict, Any, Optional -import gym +import gym, gymnasium import copy import itertools import numpy as np import torch +import torch.nn as nn import torch.nn.functional as F -from ....memories.torch import Memory -from ....models.torch import Model +from skrl.memories.torch import Memory +from skrl.models.torch import Model -from .. 
import Agent +from skrl.agents.torch import Agent SAC_DEFAULT_CONFIG = { "gradient_steps": 1, # gradient steps "batch_size": 64, # training batch size - + "discount_factor": 0.99, # discount factor (gamma) "polyak": 0.005, # soft update hyperparameter (tau) - + "actor_learning_rate": 1e-3, # actor learning rate "critic_learning_rate": 1e-3, # critic learning rate "learning_rate_scheduler": None, # learning rate scheduler class (see torch.optim.lr_scheduler) @@ -32,6 +33,8 @@ "random_timesteps": 0, # random exploration steps "learning_starts": 0, # learning starts after this many steps + "grad_norm_clip": 0, # clipping coefficient for the norm of the gradients + "learn_entropy": True, # learn entropy "entropy_learning_rate": 1e-3, # entropy learning rate "initial_entropy_value": 0.2, # initial entropy value @@ -46,33 +49,37 @@ "checkpoint_interval": 1000, # interval for checkpoints (timesteps) "store_separately": False, # whether to store checkpoints separately + + "wandb": False, # whether to use Weights & Biases + "wandb_kwargs": {} # wandb kwargs (see https://docs.wandb.ai/ref/python/init) } } class SAC(Agent): - def __init__(self, - models: Dict[str, Model], - memory: Union[Memory, Tuple[Memory], None] = None, - observation_space: Union[int, Tuple[int], gym.Space, None] = None, - action_space: Union[int, Tuple[int], gym.Space, None] = None, - device: Union[str, torch.device] = "cuda:0", - cfg: dict = {}) -> None: + def __init__(self, + models: Dict[str, Model], + memory: Optional[Union[Memory, Tuple[Memory]]] = None, + observation_space: Optional[Union[int, Tuple[int], gym.Space, gymnasium.Space]] = None, + action_space: Optional[Union[int, Tuple[int], gym.Space, gymnasium.Space]] = None, + device: Optional[Union[str, torch.device]] = None, + cfg: Optional[dict] = None) -> None: """Soft Actor-Critic (SAC) https://arxiv.org/abs/1801.01290 - + :param models: Models used by the agent :type models: dictionary of skrl.models.torch.Model :param memory: Memory to storage the transitions. - If it is a tuple, the first element will be used for training and + If it is a tuple, the first element will be used for training and for the rest only the environment transitions will be added :type memory: skrl.memory.torch.Memory, list of skrl.memory.torch.Memory or None :param observation_space: Observation/state space or shape (default: None) - :type observation_space: int, tuple or list of integers, gym.Space or None, optional + :type observation_space: int, tuple or list of integers, gym.Space, gymnasium.Space or None, optional :param action_space: Action space or shape (default: None) - :type action_space: int, tuple or list of integers, gym.Space or None, optional - :param device: Computing device (default: "cuda:0") + :type action_space: int, tuple or list of integers, gym.Space, gymnasium.Space or None, optional + :param device: Device on which a torch tensor is or will be allocated (default: ``None``). 
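The new `grad_norm_clip` entry in `SAC_DEFAULT_CONFIG` (the changelog adds the same option to DDPG and TD3) gates an optional gradient-norm clipping step between `backward()` and the optimizer step, as seen later in `_update()`. A toy sketch of that pattern; the model, data and coefficient are placeholders:

```python
# Minimal sketch of the optimization step guarded by the ``grad_norm_clip``
# option: 0 (the default) disables clipping, any positive value clips the
# global gradient norm before the optimizer step.
import torch
import torch.nn as nn

model = nn.Linear(4, 2)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
grad_norm_clip = 0.5  # taken from the agent configuration in practice

loss = model(torch.rand(8, 4)).pow(2).mean()
optimizer.zero_grad()
loss.backward()
if grad_norm_clip > 0:
    nn.utils.clip_grad_norm_(model.parameters(), grad_norm_clip)
optimizer.step()
```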
+ If None, the device will be either ``"cuda:0"`` if available or ``"cpu"`` :type device: str or torch.device, optional :param cfg: Configuration dictionary :type cfg: dict @@ -80,12 +87,12 @@ def __init__(self, :raises KeyError: If the models dictionary is missing a required key """ _cfg = copy.deepcopy(SAC_DEFAULT_CONFIG) - _cfg.update(cfg) - super().__init__(models=models, - memory=memory, - observation_space=observation_space, - action_space=action_space, - device=device, + _cfg.update(cfg if cfg is not None else {}) + super().__init__(models=models, + memory=memory, + observation_space=observation_space, + action_space=action_space, + device=device, cfg=_cfg) # models @@ -117,16 +124,18 @@ def __init__(self, self._discount_factor = self.cfg["discount_factor"] self._polyak = self.cfg["polyak"] - + self._actor_learning_rate = self.cfg["actor_learning_rate"] self._critic_learning_rate = self.cfg["critic_learning_rate"] self._learning_rate_scheduler = self.cfg["learning_rate_scheduler"] - + self._state_preprocessor = self.cfg["state_preprocessor"] self._random_timesteps = self.cfg["random_timesteps"] self._learning_starts = self.cfg["learning_starts"] + self._grad_norm_clip = self.cfg["grad_norm_clip"] + self._entropy_learning_rate = self.cfg["entropy_learning_rate"] self._learn_entropy = self.cfg["learn_entropy"] self._entropy_coefficient = self.cfg["initial_entropy_value"] @@ -137,8 +146,13 @@ def __init__(self, if self._learn_entropy: self._target_entropy = self.cfg["target_entropy"] if self._target_entropy is None: - self._target_entropy = -np.prod(self.action_space.shape).astype(np.float32) - + if issubclass(type(self.action_space), gym.spaces.Box) or issubclass(type(self.action_space), gymnasium.spaces.Box): + self._target_entropy = -np.prod(self.action_space.shape).astype(np.float32) + elif issubclass(type(self.action_space), gym.spaces.Discrete) or issubclass(type(self.action_space), gymnasium.spaces.Discrete): + self._target_entropy = -self.action_space.n + else: + self._target_entropy = 0 + self.log_entropy_coefficient = torch.log(torch.ones(1, device=self.device) * self._entropy_coefficient).requires_grad_(True) self.entropy_optimizer = torch.optim.Adam([self.log_entropy_coefficient], lr=self._entropy_learning_rate) @@ -147,7 +161,7 @@ def __init__(self, # set up optimizers and learning rate schedulers if self.policy is not None and self.critic_1 is not None and self.critic_2 is not None: self.policy_optimizer = torch.optim.Adam(self.policy.parameters(), lr=self._actor_learning_rate) - self.critic_optimizer = torch.optim.Adam(itertools.chain(self.critic_1.parameters(), self.critic_2.parameters()), + self.critic_optimizer = torch.optim.Adam(itertools.chain(self.critic_1.parameters(), self.critic_2.parameters()), lr=self._critic_learning_rate) if self._learning_rate_scheduler is not None: self.policy_scheduler = self._learning_rate_scheduler(self.policy_optimizer, **self.cfg["learning_rate_scheduler_kwargs"]) @@ -163,10 +177,11 @@ def __init__(self, else: self._state_preprocessor = self._empty_preprocessor - def init(self) -> None: + def init(self, trainer_cfg: Optional[Dict[str, Any]] = None) -> None: """Initialize the agent """ - super().init() + super().init(trainer_cfg=trainer_cfg) + self.set_mode("eval") # create tensors in memory if self.memory is not None: @@ -174,9 +189,26 @@ def init(self) -> None: self.memory.create_tensor(name="next_states", size=self.observation_space, dtype=torch.float32) self.memory.create_tensor(name="actions", size=self.action_space, 
dtype=torch.float32) self.memory.create_tensor(name="rewards", size=1, dtype=torch.float32) - self.memory.create_tensor(name="dones", size=1, dtype=torch.bool) - - self.tensors_names = ["states", "actions", "rewards", "next_states", "dones"] + self.memory.create_tensor(name="terminated", size=1, dtype=torch.bool) + + self._tensors_names = ["states", "actions", "rewards", "next_states", "terminated"] + + # RNN specifications + self._rnn = False # flag to indicate whether RNN is available + self._rnn_tensors_names = [] # used for sampling during training + self._rnn_final_states = {"policy": []} + self._rnn_initial_states = {"policy": []} + self._rnn_sequence_length = self.policy.get_specification().get("rnn", {}).get("sequence_length", 1) + + # policy + for i, size in enumerate(self.policy.get_specification().get("rnn", {}).get("sizes", [])): + self._rnn = True + # create tensors in memory + if self.memory is not None: + self.memory.create_tensor(name=f"rnn_policy_{i}", size=(size[0], size[2]), dtype=torch.float32, keep_dimensions=True) + self._rnn_tensors_names.append(f"rnn_policy_{i}") + # default RNN states + self._rnn_initial_states["policy"].append(torch.zeros(size, dtype=torch.float32, device=self.device)) def act(self, states: torch.Tensor, timestep: int, timesteps: int) -> torch.Tensor: """Process the environment's states to make a decision (actions) using the main policy @@ -191,27 +223,33 @@ def act(self, states: torch.Tensor, timestep: int, timesteps: int) -> torch.Tens :return: Actions :rtype: torch.Tensor """ - states = self._state_preprocessor(states) + rnn = {"rnn": self._rnn_initial_states["policy"]} if self._rnn else {} # sample random actions # TODO, check for stochasticity if timestep < self._random_timesteps: - return self.policy.random_act(states, taken_actions=None, role="policy") + return self.policy.random_act({"states": self._state_preprocessor(states), **rnn}, role="policy") # sample stochastic actions - return self.policy.act(states, taken_actions=None, role="policy") - - def record_transition(self, - states: torch.Tensor, - actions: torch.Tensor, - rewards: torch.Tensor, - next_states: torch.Tensor, - dones: torch.Tensor, - infos: Any, - timestep: int, + actions, _, outputs = self.policy.act({"states": self._state_preprocessor(states), **rnn}, role="policy") + + if self._rnn: + self._rnn_final_states["policy"] = outputs.get("rnn", []) + + return actions, None, outputs + + def record_transition(self, + states: torch.Tensor, + actions: torch.Tensor, + rewards: torch.Tensor, + next_states: torch.Tensor, + terminated: torch.Tensor, + truncated: torch.Tensor, + infos: Any, + timestep: int, timesteps: int) -> None: """Record an environment transition in memory - + :param states: Observations/states of the environment used to make the decision :type states: torch.Tensor :param actions: Actions taken by the agent @@ -220,8 +258,10 @@ def record_transition(self, :type rewards: torch.Tensor :param next_states: Next observations/states of the environment :type next_states: torch.Tensor - :param dones: Signals to indicate that episodes have ended - :type dones: torch.Tensor + :param terminated: Signals to indicate that episodes have terminated + :type terminated: torch.Tensor + :param truncated: Signals to indicate that episodes have been truncated + :type truncated: torch.Tensor :param infos: Additional information about the environment :type infos: Any type supported by the environment :param timestep: Current timestep @@ -229,16 +269,34 @@ def record_transition(self, 
:param timesteps: Number of timesteps :type timesteps: int """ - super().record_transition(states, actions, rewards, next_states, dones, infos, timestep, timesteps) + super().record_transition(states, actions, rewards, next_states, terminated, truncated, infos, timestep, timesteps) - # reward shaping - if self._rewards_shaper is not None: - rewards = self._rewards_shaper(rewards, timestep, timesteps) - if self.memory is not None: - self.memory.add_samples(states=states, actions=actions, rewards=rewards, next_states=next_states, dones=dones) + # reward shaping + if self._rewards_shaper is not None: + rewards = self._rewards_shaper(rewards, timestep, timesteps) + + # package RNN states + rnn_states = {} + if self._rnn: + rnn_states.update({f"rnn_policy_{i}": s.transpose(0, 1) for i, s in enumerate(self._rnn_initial_states["policy"])}) + + # storage transition in memory + self.memory.add_samples(states=states, actions=actions, rewards=rewards, next_states=next_states, + terminated=terminated, truncated=truncated, **rnn_states) for memory in self.secondary_memories: - memory.add_samples(states=states, actions=actions, rewards=rewards, next_states=next_states, dones=dones) + memory.add_samples(states=states, actions=actions, rewards=rewards, next_states=next_states, + terminated=terminated, truncated=truncated, **rnn_states) + + # update RNN states + if self._rnn: + # reset states if the episodes have ended + finished_episodes = terminated.nonzero(as_tuple=False) + if finished_episodes.numel(): + for rnn_state in self._rnn_final_states["policy"]: + rnn_state[:, finished_episodes[:, 0]] = 0 + + self._rnn_initial_states = self._rnn_final_states def pre_interaction(self, timestep: int, timesteps: int) -> None: """Callback called before the interaction with the environment @@ -259,11 +317,13 @@ def post_interaction(self, timestep: int, timesteps: int) -> None: :type timesteps: int """ if timestep >= self._learning_starts: + self.set_mode("train") self._update(timestep, timesteps) - + self.set_mode("eval") + # write tracking data and checkpoints super().post_interaction(timestep, timesteps) - + def _update(self, timestep: int, timesteps: int) -> None: """Algorithm's main update step @@ -274,44 +334,53 @@ def _update(self, timestep: int, timesteps: int) -> None: """ # sample a batch from memory sampled_states, sampled_actions, sampled_rewards, sampled_next_states, sampled_dones = \ - self.memory.sample(names=self.tensors_names, batch_size=self._batch_size)[0] + self.memory.sample(names=self._tensors_names, batch_size=self._batch_size, sequence_length=self._rnn_sequence_length)[0] + + rnn_policy = {} + if self._rnn: + sampled_rnn = self.memory.sample_by_index(names=self._rnn_tensors_names, indexes=self.memory.get_sampling_indexes())[0] + rnn_policy = {"rnn": [s.transpose(0, 1) for s in sampled_rnn], "terminated": sampled_dones} # gradient steps for gradient_step in range(self._gradient_steps): - + sampled_states = self._state_preprocessor(sampled_states, train=not gradient_step) sampled_next_states = self._state_preprocessor(sampled_next_states) # compute target values with torch.no_grad(): - next_actions, next_log_prob, _ = self.policy.act(states=sampled_next_states, taken_actions=None, role="policy") + next_actions, next_log_prob, _ = self.policy.act({"states": sampled_next_states, **rnn_policy}, role="policy") - target_q1_values, _, _ = self.target_critic_1.act(states=sampled_next_states, taken_actions=next_actions, role="target_critic_1") - target_q2_values, _, _ = 
self.target_critic_2.act(states=sampled_next_states, taken_actions=next_actions, role="target_critic_2") + target_q1_values, _, _ = self.target_critic_1.act({"states": sampled_next_states, "taken_actions": next_actions, **rnn_policy}, role="target_critic_1") + target_q2_values, _, _ = self.target_critic_2.act({"states": sampled_next_states, "taken_actions": next_actions, **rnn_policy}, role="target_critic_2") target_q_values = torch.min(target_q1_values, target_q2_values) - self._entropy_coefficient * next_log_prob target_values = sampled_rewards + self._discount_factor * sampled_dones.logical_not() * target_q_values # compute critic loss - critic_1_values, _, _ = self.critic_1.act(states=sampled_states, taken_actions=sampled_actions, role="critic_1") - critic_2_values, _, _ = self.critic_2.act(states=sampled_states, taken_actions=sampled_actions, role="critic_2") - + critic_1_values, _, _ = self.critic_1.act({"states": sampled_states, "taken_actions": sampled_actions, **rnn_policy}, role="critic_1") + critic_2_values, _, _ = self.critic_2.act({"states": sampled_states, "taken_actions": sampled_actions, **rnn_policy}, role="critic_2") + critic_loss = (F.mse_loss(critic_1_values, target_values) + F.mse_loss(critic_2_values, target_values)) / 2 - + # optimization step (critic) self.critic_optimizer.zero_grad() critic_loss.backward() + if self._grad_norm_clip > 0: + nn.utils.clip_grad_norm_(itertools.chain(self.critic_1.parameters(), self.critic_2.parameters()), self._grad_norm_clip) self.critic_optimizer.step() # compute policy (actor) loss - actions, log_prob, _ = self.policy.act(states=sampled_states, taken_actions=None, role="policy") - critic_1_values, _, _ = self.critic_1.act(states=sampled_states, taken_actions=actions, role="critic_1") - critic_2_values, _, _ = self.critic_2.act(states=sampled_states, taken_actions=actions, role="critic_2") + actions, log_prob, _ = self.policy.act({"states": sampled_states, **rnn_policy}, role="policy") + critic_1_values, _, _ = self.critic_1.act({"states": sampled_states, "taken_actions": actions, **rnn_policy}, role="critic_1") + critic_2_values, _, _ = self.critic_2.act({"states": sampled_states, "taken_actions": actions, **rnn_policy}, role="critic_2") policy_loss = (self._entropy_coefficient * log_prob - torch.min(critic_1_values, critic_2_values)).mean() # optimization step (policy) self.policy_optimizer.zero_grad() policy_loss.backward() + if self._grad_norm_clip > 0: + nn.utils.clip_grad_norm_(self.policy.parameters(), self._grad_norm_clip) self.policy_optimizer.step() # entropy learning @@ -348,7 +417,7 @@ def _update(self, timestep: int, timesteps: int) -> None: self.track_data("Q-network / Q2 (max)", torch.max(critic_2_values).item()) self.track_data("Q-network / Q2 (min)", torch.min(critic_2_values).item()) self.track_data("Q-network / Q2 (mean)", torch.mean(critic_2_values).item()) - + self.track_data("Target / Target (max)", torch.max(target_values).item()) self.track_data("Target / Target (min)", torch.min(target_values).item()) self.track_data("Target / Target (mean)", torch.mean(target_values).item()) diff --git a/skrl/agents/torch/sarsa/__init__.py b/skrl/agents/torch/sarsa/__init__.py index c7c0a6e5..bfd794b7 100644 --- a/skrl/agents/torch/sarsa/__init__.py +++ b/skrl/agents/torch/sarsa/__init__.py @@ -1 +1 @@ -from .sarsa import SARSA, SARSA_DEFAULT_CONFIG \ No newline at end of file +from skrl.agents.torch.sarsa.sarsa import SARSA, SARSA_DEFAULT_CONFIG diff --git a/skrl/agents/torch/sarsa/sarsa.py 
b/skrl/agents/torch/sarsa/sarsa.py index fdad6030..6f1336e8 100644 --- a/skrl/agents/torch/sarsa/sarsa.py +++ b/skrl/agents/torch/sarsa/sarsa.py @@ -1,14 +1,14 @@ -from typing import Union, Tuple, Dict, Any +from typing import Union, Tuple, Dict, Any, Optional -import gym +import gym, gymnasium import copy import torch -from ....memories.torch import Memory -from ....models.torch import Model +from skrl.memories.torch import Memory +from skrl.models.torch import Model -from .. import Agent +from skrl.agents.torch import Agent SARSA_DEFAULT_CONFIG = { @@ -28,33 +28,37 @@ "checkpoint_interval": 1000, # interval for checkpoints (timesteps) "store_separately": False, # whether to store checkpoints separately + + "wandb": False, # whether to use Weights & Biases + "wandb_kwargs": {} # wandb kwargs (see https://docs.wandb.ai/ref/python/init) } } class SARSA(Agent): - def __init__(self, - models: Dict[str, Model], - memory: Union[Memory, Tuple[Memory], None] = None, - observation_space: Union[int, Tuple[int], gym.Space, None] = None, - action_space: Union[int, Tuple[int], gym.Space, None] = None, - device: Union[str, torch.device] = "cuda:0", - cfg: dict = {}) -> None: + def __init__(self, + models: Dict[str, Model], + memory: Optional[Union[Memory, Tuple[Memory]]] = None, + observation_space: Optional[Union[int, Tuple[int], gym.Space, gymnasium.Space]] = None, + action_space: Optional[Union[int, Tuple[int], gym.Space, gymnasium.Space]] = None, + device: Optional[Union[str, torch.device]] = None, + cfg: Optional[dict] = None) -> None: """State Action Reward State Action (SARSA) https://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.17.2539 - + :param models: Models used by the agent :type models: dictionary of skrl.models.torch.Model :param memory: Memory to storage the transitions. - If it is a tuple, the first element will be used for training and + If it is a tuple, the first element will be used for training and for the rest only the environment transitions will be added :type memory: skrl.memory.torch.Memory, list of skrl.memory.torch.Memory or None :param observation_space: Observation/state space or shape (default: None) - :type observation_space: int, tuple or list of integers, gym.Space or None, optional + :type observation_space: int, tuple or list of integers, gym.Space, gymnasium.Space or None, optional :param action_space: Action space or shape (default: None) - :type action_space: int, tuple or list of integers, gym.Space or None, optional - :param device: Computing device (default: "cuda:0") + :type action_space: int, tuple or list of integers, gym.Space, gymnasium.Space or None, optional + :param device: Device on which a torch tensor is or will be allocated (default: ``None``). 
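Looking back at the SAC `_update()` above: the target it bootstraps from is the clipped double-Q estimate minus the entropy term, masked by `terminated`. A plain-tensor sketch of that computation (shapes and the entropy coefficient are illustrative):

```python
# Plain-tensor sketch of the entropy-regularized SAC target: clipped double-Q
# minus the entropy term, bootstrapped only when the episode has not terminated.
import torch

def soft_target(q1, q2, next_log_prob, rewards, terminated, entropy_coefficient=0.2, gamma=0.99):
    next_q = torch.min(q1, q2) - entropy_coefficient * next_log_prob
    return rewards + gamma * terminated.logical_not() * next_q

q1, q2 = torch.rand(4, 1), torch.rand(4, 1)
rewards, terminated = torch.rand(4, 1), torch.zeros(4, 1, dtype=torch.bool)
print(soft_target(q1, q2, torch.randn(4, 1), rewards, terminated))
```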
+ If None, the device will be either ``"cuda:0"`` if available or ``"cpu"`` :type device: str or torch.device, optional :param cfg: Configuration dictionary :type cfg: dict @@ -62,12 +66,12 @@ def __init__(self, :raises KeyError: If the models dictionary is missing a required key """ _cfg = copy.deepcopy(SARSA_DEFAULT_CONFIG) - _cfg.update(cfg) - super().__init__(models=models, - memory=memory, - observation_space=observation_space, - action_space=action_space, - device=device, + _cfg.update(cfg if cfg is not None else {}) + super().__init__(models=models, + memory=memory, + observation_space=observation_space, + action_space=action_space, + device=device, cfg=_cfg) # models @@ -75,10 +79,10 @@ def __init__(self, # checkpoint models self.checkpoint_modules["policy"] = self.policy - + # configuration self._discount_factor = self.cfg["discount_factor"] - + self._random_timesteps = self.cfg["random_timesteps"] self._learning_starts = self.cfg["learning_starts"] @@ -93,10 +97,10 @@ def __init__(self, self._current_next_states = None self._current_dones = None - def init(self) -> None: + def init(self, trainer_cfg: Optional[Dict[str, Any]] = None) -> None: """Initialize the agent """ - super().init() + super().init(trainer_cfg=trainer_cfg) def act(self, states: torch.Tensor, timestep: int, timesteps: int) -> torch.Tensor: """Process the environment's states to make a decision (actions) using the main policy @@ -113,22 +117,23 @@ def act(self, states: torch.Tensor, timestep: int, timesteps: int) -> torch.Tens """ # sample random actions if timestep < self._random_timesteps: - return self.policy.random_act(states, taken_actions=None, role="policy") + return self.policy.random_act({"states": states}, role="policy") # sample actions from policy - return self.policy.act(states, taken_actions=None, role="policy") - - def record_transition(self, - states: torch.Tensor, - actions: torch.Tensor, - rewards: torch.Tensor, - next_states: torch.Tensor, - dones: torch.Tensor, - infos: Any, - timestep: int, + return self.policy.act({"states": states}, role="policy") + + def record_transition(self, + states: torch.Tensor, + actions: torch.Tensor, + rewards: torch.Tensor, + next_states: torch.Tensor, + terminated: torch.Tensor, + truncated: torch.Tensor, + infos: Any, + timestep: int, timesteps: int) -> None: """Record an environment transition in memory - + :param states: Observations/states of the environment used to make the decision :type states: torch.Tensor :param actions: Actions taken by the agent @@ -137,8 +142,10 @@ def record_transition(self, :type rewards: torch.Tensor :param next_states: Next observations/states of the environment :type next_states: torch.Tensor - :param dones: Signals to indicate that episodes have ended - :type dones: torch.Tensor + :param terminated: Signals to indicate that episodes have terminated + :type terminated: torch.Tensor + :param truncated: Signals to indicate that episodes have been truncated + :type truncated: torch.Tensor :param infos: Additional information about the environment :type infos: Any type supported by the environment :param timestep: Current timestep @@ -146,7 +153,7 @@ def record_transition(self, :param timesteps: Number of timesteps :type timesteps: int """ - super().record_transition(states, actions, rewards, next_states, dones, infos, timestep, timesteps) + super().record_transition(states, actions, rewards, next_states, terminated, truncated, infos, timestep, timesteps) # reward shaping if self._rewards_shaper is not None: @@ -156,12 +163,14 @@ 
def record_transition(self, self._current_actions = actions self._current_rewards = rewards self._current_next_states = next_states - self._current_dones = dones + self._current_dones = terminated + truncated if self.memory is not None: - self.memory.add_samples(states=states, actions=actions, rewards=rewards, next_states=next_states, dones=dones) + self.memory.add_samples(states=states, actions=actions, rewards=rewards, next_states=next_states, + terminated=terminated, truncated=truncated) for memory in self.secondary_memories: - memory.add_samples(states=states, actions=actions, rewards=rewards, next_states=next_states, dones=dones) + memory.add_samples(states=states, actions=actions, rewards=rewards, next_states=next_states, + terminated=terminated, truncated=truncated) def pre_interaction(self, timestep: int, timesteps: int) -> None: """Callback called before the interaction with the environment @@ -197,13 +206,12 @@ def _update(self, timestep: int, timesteps: int) -> None: """ q_table = self.policy.table() env_ids = torch.arange(self._current_rewards.shape[0]).view(-1, 1) - + # compute next actions - next_actions = self.policy.act(self._current_next_states, taken_actions=None, role="policy")[0] + next_actions = self.policy.act({"states": self._current_next_states}, role="policy")[0] # update Q-table q_table[env_ids, self._current_states, self._current_actions] += self._learning_rate \ * (self._current_rewards + self._discount_factor * self._current_dones.logical_not() \ * q_table[env_ids, self._current_next_states, next_actions] \ - q_table[env_ids, self._current_states, self._current_actions]) - \ No newline at end of file diff --git a/skrl/agents/torch/td3/__init__.py b/skrl/agents/torch/td3/__init__.py index 85f09b4f..69f07c66 100644 --- a/skrl/agents/torch/td3/__init__.py +++ b/skrl/agents/torch/td3/__init__.py @@ -1 +1 @@ -from .td3 import TD3, TD3_DEFAULT_CONFIG \ No newline at end of file +from skrl.agents.torch.td3.td3 import TD3, TD3_DEFAULT_CONFIG diff --git a/skrl/agents/torch/td3/td3.py b/skrl/agents/torch/td3/td3.py index ae62cbf9..57441eda 100644 --- a/skrl/agents/torch/td3/td3.py +++ b/skrl/agents/torch/td3/td3.py @@ -1,25 +1,26 @@ -from typing import Union, Tuple, Dict, Any +from typing import Union, Tuple, Dict, Any, Optional -import gym +import gym, gymnasium import copy import itertools import torch +import torch.nn as nn import torch.nn.functional as F -from ....memories.torch import Memory -from ....models.torch import Model +from skrl.memories.torch import Memory +from skrl.models.torch import Model -from .. 
import Agent +from skrl.agents.torch import Agent TD3_DEFAULT_CONFIG = { "gradient_steps": 1, # gradient steps "batch_size": 64, # training batch size - + "discount_factor": 0.99, # discount factor (gamma) "polyak": 0.005, # soft update hyperparameter (tau) - + "actor_learning_rate": 1e-3, # actor learning rate "critic_learning_rate": 1e-3, # critic learning rate "learning_rate_scheduler": None, # learning rate scheduler class (see torch.optim.lr_scheduler) @@ -31,11 +32,13 @@ "random_timesteps": 0, # random exploration steps "learning_starts": 0, # learning starts after this many steps + "grad_norm_clip": 0, # clipping coefficient for the norm of the gradients + "exploration": { "noise": None, # exploration noise - "initial_scale": 1.0, # initial scale for noise - "final_scale": 1e-3, # final scale for noise - "timesteps": None, # timesteps for noise decay + "initial_scale": 1.0, # initial scale for the noise + "final_scale": 1e-3, # final scale for the noise + "timesteps": None, # timesteps for the noise decay }, "policy_delay": 2, # policy delay update with respect to critic update @@ -51,33 +54,37 @@ "checkpoint_interval": 1000, # interval for checkpoints (timesteps) "store_separately": False, # whether to store checkpoints separately + + "wandb": False, # whether to use Weights & Biases + "wandb_kwargs": {} # wandb kwargs (see https://docs.wandb.ai/ref/python/init) } } class TD3(Agent): - def __init__(self, - models: Dict[str, Model], - memory: Union[Memory, Tuple[Memory], None] = None, - observation_space: Union[int, Tuple[int], gym.Space, None] = None, - action_space: Union[int, Tuple[int], gym.Space, None] = None, - device: Union[str, torch.device] = "cuda:0", - cfg: dict = {}) -> None: + def __init__(self, + models: Dict[str, Model], + memory: Optional[Union[Memory, Tuple[Memory]]] = None, + observation_space: Optional[Union[int, Tuple[int], gym.Space, gymnasium.Space]] = None, + action_space: Optional[Union[int, Tuple[int], gym.Space, gymnasium.Space]] = None, + device: Optional[Union[str, torch.device]] = None, + cfg: Optional[dict] = None) -> None: """Twin Delayed DDPG (TD3) https://arxiv.org/abs/1802.09477 - + :param models: Models used by the agent :type models: dictionary of skrl.models.torch.Model :param memory: Memory to storage the transitions. - If it is a tuple, the first element will be used for training and + If it is a tuple, the first element will be used for training and for the rest only the environment transitions will be added :type memory: skrl.memory.torch.Memory, list of skrl.memory.torch.Memory or None :param observation_space: Observation/state space or shape (default: None) - :type observation_space: int, tuple or list of integers, gym.Space or None, optional + :type observation_space: int, tuple or list of integers, gym.Space, gymnasium.Space or None, optional :param action_space: Action space or shape (default: None) - :type action_space: int, tuple or list of integers, gym.Space or None, optional - :param device: Computing device (default: "cuda:0") + :type action_space: int, tuple or list of integers, gym.Space, gymnasium.Space or None, optional + :param device: Device on which a torch tensor is or will be allocated (default: ``None``). 
+ If None, the device will be either ``"cuda:0"`` if available or ``"cpu"`` :type device: str or torch.device, optional :param cfg: Configuration dictionary :type cfg: dict @@ -85,12 +92,12 @@ def __init__(self, :raises KeyError: If the models dictionary is missing a required key """ _cfg = copy.deepcopy(TD3_DEFAULT_CONFIG) - _cfg.update(cfg) - super().__init__(models=models, - memory=memory, - observation_space=observation_space, - action_space=action_space, - device=device, + _cfg.update(cfg if cfg is not None else {}) + super().__init__(models=models, + memory=memory, + observation_space=observation_space, + action_space=action_space, + device=device, cfg=_cfg) # models @@ -100,7 +107,7 @@ def __init__(self, self.critic_2 = self.models.get("critic_2", None) self.target_critic_1 = self.models.get("target_critic_1", None) self.target_critic_2 = self.models.get("target_critic_2", None) - + # checkpoint models self.checkpoint_modules["policy"] = self.policy self.checkpoint_modules["target_policy"] = self.target_policy @@ -126,16 +133,18 @@ def __init__(self, self._discount_factor = self.cfg["discount_factor"] self._polyak = self.cfg["polyak"] - + self._actor_learning_rate = self.cfg["actor_learning_rate"] self._critic_learning_rate = self.cfg["critic_learning_rate"] self._learning_rate_scheduler = self.cfg["learning_rate_scheduler"] self._state_preprocessor = self.cfg["state_preprocessor"] - + self._random_timesteps = self.cfg["random_timesteps"] self._learning_starts = self.cfg["learning_starts"] + self._grad_norm_clip = self.cfg["grad_norm_clip"] + self._exploration_noise = self.cfg["exploration"]["noise"] self._exploration_initial_scale = self.cfg["exploration"]["initial_scale"] self._exploration_final_scale = self.cfg["exploration"]["final_scale"] @@ -152,7 +161,7 @@ def __init__(self, # set up optimizers and learning rate schedulers if self.policy is not None and self.critic_1 is not None and self.critic_2 is not None: self.policy_optimizer = torch.optim.Adam(self.policy.parameters(), lr=self._actor_learning_rate) - self.critic_optimizer = torch.optim.Adam(itertools.chain(self.critic_1.parameters(), self.critic_2.parameters()), + self.critic_optimizer = torch.optim.Adam(itertools.chain(self.critic_1.parameters(), self.critic_2.parameters()), lr=self._critic_learning_rate) if self._learning_rate_scheduler is not None: self.policy_scheduler = self._learning_rate_scheduler(self.policy_optimizer, **self.cfg["learning_rate_scheduler_kwargs"]) @@ -168,10 +177,11 @@ def __init__(self, else: self._state_preprocessor = self._empty_preprocessor - def init(self) -> None: + def init(self, trainer_cfg: Optional[Dict[str, Any]] = None) -> None: """Initialize the agent """ - super().init() + super().init(trainer_cfg=trainer_cfg) + self.set_mode("eval") # create tensors in memory if self.memory is not None: @@ -179,9 +189,26 @@ def init(self) -> None: self.memory.create_tensor(name="next_states", size=self.observation_space, dtype=torch.float32) self.memory.create_tensor(name="actions", size=self.action_space, dtype=torch.float32) self.memory.create_tensor(name="rewards", size=1, dtype=torch.float32) - self.memory.create_tensor(name="dones", size=1, dtype=torch.bool) - - self.tensors_names = ["states", "actions", "rewards", "next_states", "dones"] + self.memory.create_tensor(name="terminated", size=1, dtype=torch.bool) + + self._tensors_names = ["states", "actions", "rewards", "next_states", "terminated"] + + # RNN specifications + self._rnn = False # flag to indicate whether RNN is available + 
self._rnn_tensors_names = [] # used for sampling during training + self._rnn_final_states = {"policy": []} + self._rnn_initial_states = {"policy": []} + self._rnn_sequence_length = self.policy.get_specification().get("rnn", {}).get("sequence_length", 1) + + # policy + for i, size in enumerate(self.policy.get_specification().get("rnn", {}).get("sizes", [])): + self._rnn = True + # create tensors in memory + if self.memory is not None: + self.memory.create_tensor(name=f"rnn_policy_{i}", size=(size[0], size[2]), dtype=torch.float32, keep_dimensions=True) + self._rnn_tensors_names.append(f"rnn_policy_{i}") + # default RNN states + self._rnn_initial_states["policy"].append(torch.zeros(size, dtype=torch.float32, device=self.device)) # clip noise bounds self.clip_actions_min = torch.tensor(self.action_space.low, device=self.device) @@ -203,25 +230,28 @@ def act(self, states: torch.Tensor, timestep: int, timesteps: int) -> torch.Tens :return: Actions :rtype: torch.Tensor """ - states = self._state_preprocessor(states) + rnn = {"rnn": self._rnn_initial_states["policy"]} if self._rnn else {} # sample random actions if timestep < self._random_timesteps: - return self.policy.random_act(states, taken_actions=None, role="policy") + return self.policy.random_act({"states": self._state_preprocessor(states), **rnn}, role="policy") # sample deterministic actions - actions = self.policy.act(states, taken_actions=None, role="policy") - - # add noise + actions, _, outputs = self.policy.act({"states": self._state_preprocessor(states), **rnn}, role="policy") + + if self._rnn: + self._rnn_final_states["policy"] = outputs.get("rnn", []) + + # add exploration noise if self._exploration_noise is not None: # sample noises - noises = self._exploration_noise.sample(actions[0].shape) - + noises = self._exploration_noise.sample(actions.shape) + # define exploration timesteps scale = self._exploration_final_scale if self._exploration_timesteps is None: self._exploration_timesteps = timesteps - + # apply exploration noise if timestep <= self._exploration_timesteps: scale = (1 - timestep / self._exploration_timesteps) \ @@ -230,39 +260,37 @@ def act(self, states: torch.Tensor, timestep: int, timesteps: int) -> torch.Tens noises.mul_(scale) # modify actions - actions[0].add_(noises) - + actions.add_(noises) if self._backward_compatibility: - actions = (torch.max(torch.min(actions[0], self.clip_actions_max), self.clip_actions_min), - actions[1], - actions[2]) + actions = torch.max(torch.min(actions, self.clip_actions_max), self.clip_actions_min) else: - actions[0].clamp_(min=self.clip_actions_min, max=self.clip_actions_max) + actions.clamp_(min=self.clip_actions_min, max=self.clip_actions_max) # record noises self.track_data("Exploration / Exploration noise (max)", torch.max(noises).item()) self.track_data("Exploration / Exploration noise (min)", torch.min(noises).item()) self.track_data("Exploration / Exploration noise (mean)", torch.mean(noises).item()) - + else: # record noises self.track_data("Exploration / Exploration noise (max)", 0) self.track_data("Exploration / Exploration noise (min)", 0) self.track_data("Exploration / Exploration noise (mean)", 0) - return actions - - def record_transition(self, - states: torch.Tensor, - actions: torch.Tensor, - rewards: torch.Tensor, - next_states: torch.Tensor, - dones: torch.Tensor, - infos: Any, - timestep: int, + return actions, None, outputs + + def record_transition(self, + states: torch.Tensor, + actions: torch.Tensor, + rewards: torch.Tensor, + next_states: torch.Tensor, + 
terminated: torch.Tensor, + truncated: torch.Tensor, + infos: Any, + timestep: int, + timesteps: int) -> None: """Record an environment transition in memory - + :param states: Observations/states of the environment used to make the decision :type states: torch.Tensor :param actions: Actions taken by the agent @@ -271,8 +299,10 @@ def record_transition(self, :type rewards: torch.Tensor :param next_states: Next observations/states of the environment :type next_states: torch.Tensor - :param dones: Signals to indicate that episodes have ended - :type dones: torch.Tensor + :param terminated: Signals to indicate that episodes have terminated + :type terminated: torch.Tensor + :param truncated: Signals to indicate that episodes have been truncated + :type truncated: torch.Tensor :param infos: Additional information about the environment :type infos: Any type supported by the environment :param timestep: Current timestep @@ -280,16 +310,34 @@ def record_transition(self, :param timesteps: Number of timesteps :type timesteps: int """ - super().record_transition(states, actions, rewards, next_states, dones, infos, timestep, timesteps) + super().record_transition(states, actions, rewards, next_states, terminated, truncated, infos, timestep, timesteps) - # reward shaping - if self._rewards_shaper is not None: - rewards = self._rewards_shaper(rewards, timestep, timesteps) - if self.memory is not None: - self.memory.add_samples(states=states, actions=actions, rewards=rewards, next_states=next_states, dones=dones) + # reward shaping + if self._rewards_shaper is not None: + rewards = self._rewards_shaper(rewards, timestep, timesteps) + + # package RNN states + rnn_states = {} + if self._rnn: + rnn_states.update({f"rnn_policy_{i}": s.transpose(0, 1) for i, s in enumerate(self._rnn_initial_states["policy"])}) + + # store transition in memory + self.memory.add_samples(states=states, actions=actions, rewards=rewards, next_states=next_states, + terminated=terminated, truncated=truncated, **rnn_states) for memory in self.secondary_memories: - memory.add_samples(states=states, actions=actions, rewards=rewards, next_states=next_states, dones=dones) + memory.add_samples(states=states, actions=actions, rewards=rewards, next_states=next_states, + terminated=terminated, truncated=truncated, **rnn_states) + + # update RNN states + if self._rnn: + # reset states if the episodes have ended + finished_episodes = terminated.nonzero(as_tuple=False) + if finished_episodes.numel(): + for rnn_state in self._rnn_final_states["policy"]: + rnn_state[:, finished_episodes[:, 0]] = 0 + + self._rnn_initial_states = self._rnn_final_states def pre_interaction(self, timestep: int, timesteps: int) -> None: """Callback called before the interaction with the environment @@ -310,11 +358,13 @@ def post_interaction(self, timestep: int, timesteps: int) -> None: :type timesteps: int """ if timestep >= self._learning_starts: + self.set_mode("train") self._update(timestep, timesteps) - + self.set_mode("eval") + # write tracking data and checkpoints super().post_interaction(timestep, timesteps) - + def _update(self, timestep: int, timesteps: int) -> None: """Algorithm's main update step :param timestep: Current timestep :type timestep: int @@ -325,19 +375,24 @@ def _update(self, timestep: int, timesteps: int) -> None: """ # sample a batch from memory sampled_states, sampled_actions, sampled_rewards, sampled_next_states, sampled_dones = \ - self.memory.sample(names=self.tensors_names, batch_size=self._batch_size)[0] + self.memory.sample(names=self._tensors_names, batch_size=self._batch_size, 
sequence_length=self._rnn_sequence_length)[0] + + rnn_policy = {} + if self._rnn: + sampled_rnn = self.memory.sample_by_index(names=self._rnn_tensors_names, indexes=self.memory.get_sampling_indexes())[0] + rnn_policy = {"rnn": [s.transpose(0, 1) for s in sampled_rnn], "terminated": sampled_dones} # gradient steps for gradient_step in range(self._gradient_steps): sampled_states = self._state_preprocessor(sampled_states, train=not gradient_step) sampled_next_states = self._state_preprocessor(sampled_next_states) - + with torch.no_grad(): # target policy smoothing - next_actions, _, _ = self.target_policy.act(states=sampled_next_states, taken_actions=None, role="target_policy") - noises = torch.clamp(self._smooth_regularization_noise.sample(next_actions.shape), - min=-self._smooth_regularization_clip, + next_actions, _, _ = self.target_policy.act({"states": sampled_next_states, **rnn_policy}, role="target_policy") + noises = torch.clamp(self._smooth_regularization_noise.sample(next_actions.shape), + min=-self._smooth_regularization_clip, max=self._smooth_regularization_clip) next_actions.add_(noises) @@ -347,20 +402,22 @@ def _update(self, timestep: int, timesteps: int) -> None: next_actions.clamp_(min=self.clip_actions_min, max=self.clip_actions_max) # compute target values - target_q1_values, _, _ = self.target_critic_1.act(states=sampled_next_states, taken_actions=next_actions, role="target_critic_1") - target_q2_values, _, _ = self.target_critic_2.act(states=sampled_next_states, taken_actions=next_actions, role="target_critic_2") + target_q1_values, _, _ = self.target_critic_1.act({"states": sampled_next_states, "taken_actions": next_actions, **rnn_policy}, role="target_critic_1") + target_q2_values, _, _ = self.target_critic_2.act({"states": sampled_next_states, "taken_actions": next_actions, **rnn_policy}, role="target_critic_2") target_q_values = torch.min(target_q1_values, target_q2_values) target_values = sampled_rewards + self._discount_factor * sampled_dones.logical_not() * target_q_values # compute critic loss - critic_1_values, _, _ = self.critic_1.act(states=sampled_states, taken_actions=sampled_actions, role="critic_1") - critic_2_values, _, _ = self.critic_2.act(states=sampled_states, taken_actions=sampled_actions, role="critic_2") - + critic_1_values, _, _ = self.critic_1.act({"states": sampled_states, "taken_actions": sampled_actions, **rnn_policy}, role="critic_1") + critic_2_values, _, _ = self.critic_2.act({"states": sampled_states, "taken_actions": sampled_actions, **rnn_policy}, role="critic_2") + critic_loss = F.mse_loss(critic_1_values, target_values) + F.mse_loss(critic_2_values, target_values) - + # optimization step (critic) self.critic_optimizer.zero_grad() critic_loss.backward() + if self._grad_norm_clip > 0: + nn.utils.clip_grad_norm_(itertools.chain(self.critic_1.parameters(), self.critic_2.parameters()), self._grad_norm_clip) self.critic_optimizer.step() # delayed update @@ -368,14 +425,16 @@ def _update(self, timestep: int, timesteps: int) -> None: if not self._critic_update_counter % self._policy_delay: # compute policy (actor) loss - actions, _, _ = self.policy.act(states=sampled_states, taken_actions=None, role="policy") - critic_values, _, _ = self.critic_1.act(states=sampled_states, taken_actions=actions, role="critic_1") + actions, _, _ = self.policy.act({"states": sampled_states, **rnn_policy}, role="policy") + critic_values, _, _ = self.critic_1.act({"states": sampled_states, "taken_actions": actions, **rnn_policy}, role="critic_1") policy_loss = 
-critic_values.mean() # optimization step (policy) self.policy_optimizer.zero_grad() policy_loss.backward() + if self._grad_norm_clip > 0: + nn.utils.clip_grad_norm_(self.policy.parameters(), self._grad_norm_clip) self.policy_optimizer.step() # update target networks @@ -400,7 +459,7 @@ def _update(self, timestep: int, timesteps: int) -> None: self.track_data("Q-network / Q2 (max)", torch.max(critic_2_values).item()) self.track_data("Q-network / Q2 (min)", torch.min(critic_2_values).item()) self.track_data("Q-network / Q2 (mean)", torch.mean(critic_2_values).item()) - + self.track_data("Target / Target (max)", torch.max(target_values).item()) self.track_data("Target / Target (min)", torch.min(target_values).item()) self.track_data("Target / Target (mean)", torch.mean(target_values).item()) diff --git a/skrl/agents/torch/trpo/__init__.py b/skrl/agents/torch/trpo/__init__.py index 9fcdb9e3..c4dfd054 100644 --- a/skrl/agents/torch/trpo/__init__.py +++ b/skrl/agents/torch/trpo/__init__.py @@ -1 +1 @@ -from .trpo import TRPO, TRPO_DEFAULT_CONFIG \ No newline at end of file +from skrl.agents.torch.trpo.trpo import TRPO, TRPO_DEFAULT_CONFIG diff --git a/skrl/agents/torch/trpo/trpo.py b/skrl/agents/torch/trpo/trpo.py index fa9d65d0..1f90b864 100644 --- a/skrl/agents/torch/trpo/trpo.py +++ b/skrl/agents/torch/trpo/trpo.py @@ -1,6 +1,6 @@ -from typing import Union, Tuple, Dict, Any +from typing import Union, Tuple, Dict, Any, Optional -import gym +import gym, gymnasium import copy import torch @@ -9,20 +9,20 @@ from torch.nn.utils.convert_parameters import parameters_to_vector from torch.nn.utils.convert_parameters import vector_to_parameters -from ....memories.torch import Memory -from ....models.torch import Model +from skrl.memories.torch import Memory +from skrl.models.torch import Model -from .. import Agent +from skrl.agents.torch import Agent TRPO_DEFAULT_CONFIG = { "rollouts": 16, # number of rollouts before updating "learning_epochs": 8, # number of learning epochs during each update "mini_batches": 2, # number of mini batches during each learning epoch - + "discount_factor": 0.99, # discount factor (gamma) - "lambda": 0.99, # TD(lambda) coefficient (lam) for computing returns and advantages - + "lambda": 0.95, # TD(lambda) coefficient (lam) for computing returns and advantages + "value_learning_rate": 1e-3, # value learning rate "learning_rate_scheduler": None, # learning rate scheduler class (see torch.optim.lr_scheduler) "learning_rate_scheduler_kwargs": {}, # learning rate scheduler's kwargs (e.g. 
{"step_size": 1e-3}) @@ -54,33 +54,37 @@ "checkpoint_interval": 1000, # interval for checkpoints (timesteps) "store_separately": False, # whether to store checkpoints separately + + "wandb": False, # whether to use Weights & Biases + "wandb_kwargs": {} # wandb kwargs (see https://docs.wandb.ai/ref/python/init) } } class TRPO(Agent): - def __init__(self, - models: Dict[str, Model], - memory: Union[Memory, Tuple[Memory], None] = None, - observation_space: Union[int, Tuple[int], gym.Space, None] = None, - action_space: Union[int, Tuple[int], gym.Space, None] = None, - device: Union[str, torch.device] = "cuda:0", - cfg: dict = {}) -> None: + def __init__(self, + models: Dict[str, Model], + memory: Optional[Union[Memory, Tuple[Memory]]] = None, + observation_space: Optional[Union[int, Tuple[int], gym.Space, gymnasium.Space]] = None, + action_space: Optional[Union[int, Tuple[int], gym.Space, gymnasium.Space]] = None, + device: Optional[Union[str, torch.device]] = None, + cfg: Optional[dict] = None) -> None: """Trust Region Policy Optimization (TRPO) https://arxiv.org/abs/1502.05477 - + :param models: Models used by the agent :type models: dictionary of skrl.models.torch.Model :param memory: Memory to storage the transitions. - If it is a tuple, the first element will be used for training and + If it is a tuple, the first element will be used for training and for the rest only the environment transitions will be added :type memory: skrl.memory.torch.Memory, list of skrl.memory.torch.Memory or None :param observation_space: Observation/state space or shape (default: None) - :type observation_space: int, tuple or list of integers, gym.Space or None, optional + :type observation_space: int, tuple or list of integers, gym.Space, gymnasium.Space or None, optional :param action_space: Action space or shape (default: None) - :type action_space: int, tuple or list of integers, gym.Space or None, optional - :param device: Computing device (default: "cuda:0") + :type action_space: int, tuple or list of integers, gym.Space, gymnasium.Space or None, optional + :param device: Device on which a torch tensor is or will be allocated (default: ``None``). 
+ If None, the device will be either ``"cuda:0"`` if available or ``"cpu"`` :type device: str or torch.device, optional :param cfg: Configuration dictionary :type cfg: dict @@ -88,12 +92,12 @@ def __init__(self, :raises KeyError: If the models dictionary is missing a required key """ _cfg = copy.deepcopy(TRPO_DEFAULT_CONFIG) - _cfg.update(cfg) - super().__init__(models=models, - memory=memory, - observation_space=observation_space, - action_space=action_space, - device=device, + _cfg.update(cfg if cfg is not None else {}) + super().__init__(models=models, + memory=memory, + observation_space=observation_space, + action_space=action_space, + device=device, cfg=_cfg) # models @@ -157,23 +161,56 @@ def __init__(self, else: self._value_preprocessor = self._empty_preprocessor - def init(self) -> None: + def init(self, trainer_cfg: Optional[Dict[str, Any]] = None) -> None: """Initialize the agent """ - super().init() - + super().init(trainer_cfg=trainer_cfg) + self.set_mode("eval") + # create tensors in memory if self.memory is not None: self.memory.create_tensor(name="states", size=self.observation_space, dtype=torch.float32) self.memory.create_tensor(name="actions", size=self.action_space, dtype=torch.float32) self.memory.create_tensor(name="rewards", size=1, dtype=torch.float32) - self.memory.create_tensor(name="dones", size=1, dtype=torch.bool) + self.memory.create_tensor(name="terminated", size=1, dtype=torch.bool) self.memory.create_tensor(name="log_prob", size=1, dtype=torch.float32) self.memory.create_tensor(name="values", size=1, dtype=torch.float32) self.memory.create_tensor(name="returns", size=1, dtype=torch.float32) self.memory.create_tensor(name="advantages", size=1, dtype=torch.float32) - self.tensors_names = ["states", "actions", "log_prob", "returns", "advantages"] + self._tensors_names_policy = ["states", "actions", "terminated", "log_prob", "advantages"] + self._tensors_names_value = ["states", "terminated", "returns"] + + # RNN specifications + self._rnn = False # flag to indicate whether RNN is available + self._rnn_tensors_names = [] # used for sampling during training + self._rnn_final_states = {"policy": [], "value": []} + self._rnn_initial_states = {"policy": [], "value": []} + self._rnn_sequence_length = self.policy.get_specification().get("rnn", {}).get("sequence_length", 1) + + # policy + for i, size in enumerate(self.policy.get_specification().get("rnn", {}).get("sizes", [])): + self._rnn = True + # create tensors in memory + if self.memory is not None: + self.memory.create_tensor(name=f"rnn_policy_{i}", size=(size[0], size[2]), dtype=torch.float32, keep_dimensions=True) + self._rnn_tensors_names.append(f"rnn_policy_{i}") + # default RNN states + self._rnn_initial_states["policy"].append(torch.zeros(size, dtype=torch.float32, device=self.device)) + + # value + if self.value is not None: + if self.policy is self.value: + self._rnn_initial_states["value"] = self._rnn_initial_states["policy"] + else: + for i, size in enumerate(self.value.get_specification().get("rnn", {}).get("sizes", [])): + self._rnn = True + # create tensors in memory + if self.memory is not None: + self.memory.create_tensor(name=f"rnn_value_{i}", size=(size[0], size[2]), dtype=torch.float32, keep_dimensions=True) + self._rnn_tensors_names.append(f"rnn_value_{i}") + # default RNN states + self._rnn_initial_states["value"].append(torch.zeros(size, dtype=torch.float32, device=self.device)) # create temporary variables needed for storage and computation self._current_log_prob = None @@ -192,30 +229,34 
@@ def act(self, states: torch.Tensor, timestep: int, timesteps: int) -> torch.Tens :return: Actions :rtype: torch.Tensor """ - states = self._state_preprocessor(states) + rnn = {"rnn": self._rnn_initial_states["policy"]} if self._rnn else {} # sample random actions - # TODO, check for stochasticity + # TODO: fix for stochasticity, rnn and log_prob if timestep < self._random_timesteps: - return self.policy.random_act(states, taken_actions=None, role="policy") + return self.policy.random_act({"states": self._state_preprocessor(states), **rnn}, role="policy") # sample stochastic actions - actions, log_prob, actions_mean = self.policy.act(states, taken_actions=None, role="policy") + actions, log_prob, outputs = self.policy.act({"states": self._state_preprocessor(states), **rnn}, role="policy") self._current_log_prob = log_prob - return actions, log_prob, actions_mean + if self._rnn: + self._rnn_final_states["policy"] = outputs.get("rnn", []) - def record_transition(self, - states: torch.Tensor, - actions: torch.Tensor, - rewards: torch.Tensor, - next_states: torch.Tensor, - dones: torch.Tensor, - infos: Any, - timestep: int, + return actions, log_prob, outputs + + def record_transition(self, + states: torch.Tensor, + actions: torch.Tensor, + rewards: torch.Tensor, + next_states: torch.Tensor, + terminated: torch.Tensor, + truncated: torch.Tensor, + infos: Any, + timestep: int, timesteps: int) -> None: """Record an environment transition in memory - + :param states: Observations/states of the environment used to make the decision :type states: torch.Tensor :param actions: Actions taken by the agent @@ -224,8 +265,10 @@ def record_transition(self, :type rewards: torch.Tensor :param next_states: Next observations/states of the environment :type next_states: torch.Tensor - :param dones: Signals to indicate that episodes have ended - :type dones: torch.Tensor + :param terminated: Signals to indicate that episodes have terminated + :type terminated: torch.Tensor + :param truncated: Signals to indicate that episodes have been truncated + :type truncated: torch.Tensor :param infos: Additional information about the environment :type infos: Any type supported by the environment :param timestep: Current timestep @@ -233,24 +276,48 @@ def record_transition(self, :param timesteps: Number of timesteps :type timesteps: int """ - super().record_transition(states, actions, rewards, next_states, dones, infos, timestep, timesteps) + super().record_transition(states, actions, rewards, next_states, terminated, truncated, infos, timestep, timesteps) - # reward shaping - if self._rewards_shaper is not None: - rewards = self._rewards_shaper(rewards, timestep, timesteps) + if self.memory is not None: + self._current_next_states = next_states - self._current_next_states = next_states + # reward shaping + if self._rewards_shaper is not None: + rewards = self._rewards_shaper(rewards, timestep, timesteps) - if self.memory is not None: - with torch.no_grad(): - values, _, _ = self.value.act(states=self._state_preprocessor(states), taken_actions=None, role="value") + # compute values + rnn = {"rnn": self._rnn_initial_states["value"]} if self._rnn else {} + values, _, outputs = self.value.act({"states": self._state_preprocessor(states), **rnn}, role="value") values = self._value_preprocessor(values, inverse=True) - self.memory.add_samples(states=states, actions=actions, rewards=rewards, next_states=next_states, dones=dones, - log_prob=self._current_log_prob, values=values) + # package RNN states + rnn_states = {} + if 
self._rnn: + rnn_states.update({f"rnn_policy_{i}": s.transpose(0, 1) for i, s in enumerate(self._rnn_initial_states["policy"])}) + if self.policy is not self.value: + rnn_states.update({f"rnn_value_{i}": s.transpose(0, 1) for i, s in enumerate(self._rnn_initial_states["value"])}) + + # store transition in memory + self.memory.add_samples(states=states, actions=actions, rewards=rewards, next_states=next_states, + terminated=terminated, truncated=truncated, log_prob=self._current_log_prob, values=values, **rnn_states) for memory in self.secondary_memories: - memory.add_samples(states=states, actions=actions, rewards=rewards, next_states=next_states, dones=dones, - log_prob=self._current_log_prob, values=values) + memory.add_samples(states=states, actions=actions, rewards=rewards, next_states=next_states, + terminated=terminated, truncated=truncated, log_prob=self._current_log_prob, values=values, **rnn_states) + + # update RNN states + if self._rnn: + self._rnn_final_states["value"] = self._rnn_final_states["policy"] if self.policy is self.value else outputs.get("rnn", []) + + # reset states if the episodes have ended + finished_episodes = terminated.nonzero(as_tuple=False) + if finished_episodes.numel(): + for rnn_state in self._rnn_final_states["policy"]: + rnn_state[:, finished_episodes[:, 0]] = 0 + if self.policy is not self.value: + for rnn_state in self._rnn_final_states["value"]: + rnn_state[:, finished_episodes[:, 0]] = 0 + + self._rnn_initial_states = self._rnn_final_states def pre_interaction(self, timestep: int, timesteps: int) -> None: """Callback called before the interaction with the environment @@ -272,7 +339,9 @@ def post_interaction(self, timestep: int, timesteps: int) -> None: """ self._rollout += 1 if not self._rollout % self._rollouts and timestep >= self._learning_starts: + self.set_mode("train") self._update(timestep, timesteps) + self.set_mode("eval") # write tracking data and checkpoints super().post_interaction(timestep, timesteps) @@ -285,11 +354,11 @@ def _update(self, timestep: int, timesteps: int) -> None: :param timesteps: Number of timesteps :type timesteps: int """ - def compute_gae(rewards: torch.Tensor, - dones: torch.Tensor, - values: torch.Tensor, - next_values: torch.Tensor, - discount_factor: float = 0.99, + def compute_gae(rewards: torch.Tensor, + dones: torch.Tensor, + values: torch.Tensor, + next_values: torch.Tensor, + discount_factor: float = 0.99, lambda_coefficient: float = 0.95) -> torch.Tensor: """Compute the Generalized Advantage Estimator (GAE) @@ -326,10 +395,10 @@ def compute_gae(rewards: torch.Tensor, return returns, advantages - def surrogate_loss(policy: Model, - states: torch.Tensor, - actions: torch.Tensor, - log_prob: torch.Tensor, + def surrogate_loss(policy: Model, + states: torch.Tensor, + actions: torch.Tensor, + log_prob: torch.Tensor, advantages: torch.Tensor) -> torch.Tensor: """Compute the surrogate objective (policy loss) @@ -347,13 +416,13 @@ def surrogate_loss(policy: Model, :return: Surrogate loss :rtype: torch.Tensor """ - _, new_log_prob, _ = policy.act(states, taken_actions=actions, role="policy") + _, new_log_prob, _ = policy.act({"states": states, "taken_actions": actions, **rnn_policy}, role="policy") return (advantages * torch.exp(new_log_prob - log_prob.detach())).mean() - def conjugate_gradient(policy: Model, - states: torch.Tensor, - b: torch.Tensor, - num_iterations: float = 10, + def conjugate_gradient(policy: Model, + states: torch.Tensor, + b: torch.Tensor, + num_iterations: float = 10, residual_tolerance: 
float = 1e-10) -> torch.Tensor: """Conjugate gradient algorithm to solve Ax = b using the iterative method @@ -363,7 +432,7 @@ def conjugate_gradient(policy: Model, :type policy: Model :param states: States :type states: torch.Tensor - :param b: Vector b + :param b: Vector b :type b: torch.Tensor :param num_iterations: Number of iterations (default: 10) :type num_iterations: float, optional @@ -389,12 +458,12 @@ def conjugate_gradient(policy: Model, rr_old = rr_new return x - def fisher_vector_product(policy: Model, - states: torch.Tensor, - vector: torch.Tensor, + def fisher_vector_product(policy: Model, + states: torch.Tensor, + vector: torch.Tensor, damping: float = 0.1) -> torch.Tensor: """Compute the Fisher vector product (direct method) - + https://www.telesens.co/2018/06/09/efficiently-computing-the-fisher-vector-product-in-trpo/ :param policy: Policy @@ -431,25 +500,29 @@ def kl_divergence(policy_1: Model, policy_2: Model, states: torch.Tensor) -> tor :return: KL divergence :rtype: torch.Tensor """ - _, _, mu_1 = policy_1.act(states, taken_actions=None, role="policy") + mu_1 = policy_1.act({"states": states, **rnn_policy}, role="policy")[2]["mean_actions"] logstd_1 = policy_1.get_log_std(role="policy") mu_1, logstd_1 = mu_1.detach(), logstd_1.detach() - _, _, mu_2 = policy_2.act(states, taken_actions=None, role="policy") - logstd_2 = policy_2.get_log_std(role="policy") - + with torch.backends.cudnn.flags(enabled=not self._rnn): + mu_2 = policy_2.act({"states": states, **rnn_policy}, role="policy")[2]["mean_actions"] + logstd_2 = policy_2.get_log_std(role="policy") + kl = logstd_1 - logstd_2 + 0.5 * (torch.square(logstd_1.exp()) + torch.square(mu_1 - mu_2)) \ / torch.square(logstd_2.exp()) - 0.5 return torch.sum(kl, dim=-1).mean() # compute returns and advantages with torch.no_grad(): - last_values, _, _ = self.value.act(self._state_preprocessor(self._current_next_states.float()), taken_actions=None, role="value") + self.value.train(False) + rnn = {"rnn": self._rnn_initial_states["value"]} if self._rnn else {} + last_values, _, _ = self.value.act({"states": self._state_preprocessor(self._current_next_states.float()), **rnn}, role="value") + self.value.train(True) last_values = self._value_preprocessor(last_values, inverse=True) - + values = self.memory.get_tensor_by_name("values") returns, advantages = compute_gae(rewards=self.memory.get_tensor_by_name("rewards"), - dones=self.memory.get_tensor_by_name("dones"), + dones=self.memory.get_tensor_by_name("terminated"), values=values, next_values=last_values, discount_factor=self._discount_factor, @@ -459,59 +532,83 @@ def kl_divergence(policy_1: Model, policy_2: Model, states: torch.Tensor) -> tor self.memory.set_tensor_by_name("returns", self._value_preprocessor(returns, train=True)) self.memory.set_tensor_by_name("advantages", advantages) - # sample mini-batches from memory - sampled_batches = self.memory.sample_all(names=self.tensors_names, mini_batches=self._mini_batches) + # sample all from memory + sampled_states, sampled_actions, sampled_dones, sampled_log_prob, sampled_advantages \ + = self.memory.sample_all(names=self._tensors_names_policy, mini_batches=1, sequence_length=self._rnn_sequence_length)[0] + sampled_rnn_batches = self.memory.sample_all(names=self._rnn_tensors_names, mini_batches=1, sequence_length=self._rnn_sequence_length)[0] - cumulative_policy_loss = 0 - cumulative_value_loss = 0 + rnn_policy = {} - # learning epochs - for epoch in range(self._learning_epochs): - - # mini-batches loop - for sampled_states, 
sampled_actions, sampled_log_prob, sampled_returns, sampled_advantages in sampled_batches: + if self._rnn: + if self.policy is self.value: + rnn_policy = {"rnn": [s.transpose(0, 1) for s in sampled_rnn_batches], "terminated": sampled_dones} + else: + rnn_policy = {"rnn": [s.transpose(0, 1) for s, n in zip(sampled_rnn_batches, self._rnn_tensors_names) if "policy" in n], "terminated": sampled_dones} - sampled_states = self._state_preprocessor(sampled_states, train=not epoch) + sampled_states = self._state_preprocessor(sampled_states, train=True) + + # compute policy loss gradient + policy_loss = surrogate_loss(self.policy, sampled_states, sampled_actions, sampled_log_prob, sampled_advantages) + policy_loss_gradient = torch.autograd.grad(policy_loss, self.policy.parameters()) + flat_policy_loss_gradient = torch.cat([gradient.view(-1) for gradient in policy_loss_gradient]) + + # compute the search direction using the conjugate gradient algorithm + search_direction = conjugate_gradient(self.policy, sampled_states, flat_policy_loss_gradient.data, + num_iterations=self._conjugate_gradient_steps) + + # compute step size and full step + xHx = (search_direction * fisher_vector_product(self.policy, sampled_states, search_direction, self._damping)) \ + .sum(0, keepdim=True) + step_size = torch.sqrt(2 * self._max_kl_divergence / xHx)[0] + full_step = step_size * search_direction + + # backtracking line search + restore_policy_flag = True + self.backup_policy.update_parameters(self.policy) + params = parameters_to_vector(self.policy.parameters()) + + expected_improvement = (flat_policy_loss_gradient * full_step).sum(0, keepdim=True) - # compute policy loss gradient - policy_loss = surrogate_loss(self.policy, sampled_states, sampled_actions, sampled_log_prob, sampled_advantages) - policy_loss_gradient = torch.autograd.grad(policy_loss, self.policy.parameters()) - flat_policy_loss_gradient = torch.cat([gradient.view(-1) for gradient in policy_loss_gradient]) + for alpha in [self._step_fraction * 0.5 ** i for i in range(self._max_backtrack_steps)]: + new_params = params + alpha * full_step + vector_to_parameters(new_params, self.policy.parameters()) - # compute the search direction using the conjugate gradient algorithm - search_direction = conjugate_gradient(self.policy, sampled_states, flat_policy_loss_gradient.data, - num_iterations=self._conjugate_gradient_steps) + expected_improvement *= alpha + kl = kl_divergence(self.backup_policy, self.policy, sampled_states) + loss = surrogate_loss(self.policy, sampled_states, sampled_actions, sampled_log_prob, sampled_advantages) - # compute step size and full step - xHx = (search_direction * fisher_vector_product(self.policy, sampled_states, search_direction, self._damping)) \ - .sum(0, keepdim=True) - step_size = torch.sqrt(2 * self._max_kl_divergence / xHx)[0] - full_step = step_size * search_direction + if kl < self._max_kl_divergence and (loss - policy_loss) / expected_improvement > self._accept_ratio: + restore_policy_flag = False + break - # backtracking line search - restore_policy_flag = True - self.backup_policy.update_parameters(self.policy) - params = parameters_to_vector(self.policy.parameters()) + if restore_policy_flag: + self.policy.update_parameters(self.backup_policy) - expected_improvement = (flat_policy_loss_gradient * full_step).sum(0, keepdim=True) + # sample mini-batches from memory + sampled_batches = self.memory.sample_all(names=self._tensors_names_value, mini_batches=self._mini_batches, sequence_length=self._rnn_sequence_length) - for 
alpha in [self._step_fraction * 0.5 ** i for i in range(self._max_backtrack_steps)]: - new_params = params + alpha * full_step - vector_to_parameters(new_params, self.policy.parameters()) + rnn_value = {} + if self._rnn: + sampled_rnn_batches = self.memory.sample_all(names=self._rnn_tensors_names, mini_batches=self._mini_batches, sequence_length=self._rnn_sequence_length) - expected_improvement *= alpha - kl = kl_divergence(self.backup_policy, self.policy, sampled_states) - loss = surrogate_loss(self.policy, sampled_states, sampled_actions, sampled_log_prob, sampled_advantages) + cumulative_value_loss = 0 - if kl < self._max_kl_divergence and (loss - policy_loss) / expected_improvement > self._accept_ratio: - restore_policy_flag = False - break + # learning epochs + for epoch in range(self._learning_epochs): - if restore_policy_flag: - self.policy.update_parameters(self.backup_policy) + # mini-batches loop + for i, (sampled_states, sampled_dones, sampled_returns) in enumerate(sampled_batches): + + if self._rnn: + if self.policy is self.value: + rnn_value = {"rnn": [s.transpose(0, 1) for s in sampled_rnn_batches[i]], "terminated": sampled_dones} + else: + rnn_value = {"rnn": [s.transpose(0, 1) for s, n in zip(sampled_rnn_batches[i], self._rnn_tensors_names) if "value" in n], "terminated": sampled_dones} + + sampled_states = self._state_preprocessor(sampled_states, train=not epoch) # compute value loss - predicted_values, _, _ = self.value.act(sampled_states, taken_actions=None, role="value") + predicted_values, _, _ = self.value.act({"states": sampled_states, **rnn_value}, role="value") value_loss = self._value_loss_scale * F.mse_loss(sampled_returns, predicted_values) @@ -523,7 +620,6 @@ def kl_divergence(policy_1: Model, policy_2: Model, states: torch.Tensor) -> tor self.value_optimizer.step() # update cumulative losses - cumulative_policy_loss += policy_loss.item() cumulative_value_loss += value_loss.item() # update learning rate @@ -531,9 +627,9 @@ def kl_divergence(policy_1: Model, policy_2: Model, states: torch.Tensor) -> tor self.value_scheduler.step() # record data - self.track_data("Loss / Policy loss", cumulative_policy_loss / (self._learning_epochs * self._mini_batches)) + self.track_data("Loss / Policy loss", policy_loss.item()) self.track_data("Loss / Value loss", cumulative_value_loss / (self._learning_epochs * self._mini_batches)) - + self.track_data("Policy / Standard deviation", self.policy.distribution(role="policy").stddev.mean().item()) if self._learning_rate_scheduler: diff --git a/skrl/envs/torch/__init__.py b/skrl/envs/torch/__init__.py index a11c34c9..67e00a5a 100644 --- a/skrl/envs/torch/__init__.py +++ b/skrl/envs/torch/__init__.py @@ -1,7 +1,7 @@ -from .wrappers import wrap_env -from .wrappers import Wrapper +from skrl.envs.torch.wrappers import wrap_env +from skrl.envs.torch.wrappers import Wrapper -from .loaders import load_isaacgym_env_preview2 -from .loaders import load_isaacgym_env_preview3 -from .loaders import load_isaacgym_env_preview4 -from .loaders import load_omniverse_isaacgym_env +from skrl.envs.torch.loaders import load_isaacgym_env_preview2 +from skrl.envs.torch.loaders import load_isaacgym_env_preview3 +from skrl.envs.torch.loaders import load_isaacgym_env_preview4 +from skrl.envs.torch.loaders import load_omniverse_isaacgym_env diff --git a/skrl/envs/torch/loaders.py b/skrl/envs/torch/loaders.py index 234a9826..80eff009 100644 --- a/skrl/envs/torch/loaders.py +++ b/skrl/envs/torch/loaders.py @@ -3,9 +3,9 @@ import queue from contextlib import 
contextmanager -__all__ = ["load_isaacgym_env_preview2", - "load_isaacgym_env_preview3", - "load_isaacgym_env_preview4", +__all__ = ["load_isaacgym_env_preview2", + "load_isaacgym_env_preview3", + "load_isaacgym_env_preview4", "load_omniverse_isaacgym_env"] @@ -27,7 +27,7 @@ def cwd(new_path: str) -> None: def _omegaconf_to_dict(config) -> dict: """Convert OmegaConf config to dict - + :param config: The OmegaConf config :type config: OmegaConf.Config @@ -69,8 +69,8 @@ def load_isaacgym_env_preview2(task_name: str = "", isaacgymenvs_path: str = "", :type isaacgymenvs_path: str, optional :param show_cfg: Whether to print the configuration (default: True) :type show_cfg: bool, optional - - :raises ValueError: The task name has not been defined, + + :raises ValueError: The task name has not been defined, neither by the function parameter nor by the command line arguments :raises RuntimeError: The isaacgym package is not installed or the path is wrong @@ -99,7 +99,7 @@ def load_isaacgym_env_preview2(task_name: str = "", isaacgymenvs_path: str = "", sys.argv.append(task_name) else: raise ValueError("No task name defined. Set the task_name parameter or use --task as command line argument") - + # get isaacgym envs path from isaacgym package metadata if not isaacgymenvs_path: if not hasattr(isaacgym, "__path__"): @@ -129,7 +129,7 @@ def load_isaacgym_env_preview2(task_name: str = "", isaacgymenvs_path: str = "", if show_cfg: print("\nIsaac Gym environment ({})".format(args.task)) _print_cfg(vars(args)) - + # update task arguments args.cfg_train = os.path.join(path, args.cfg_train) args.cfg_env = os.path.join(path, args.cfg_env) @@ -139,12 +139,12 @@ def load_isaacgym_env_preview2(task_name: str = "", isaacgymenvs_path: str = "", cfg, cfg_train, _ = load_cfg(args) sim_params = parse_sim_params(args, cfg, cfg_train) task, env = parse_task(args, cfg, cfg_train, sim_params) - + return env def load_isaacgym_env_preview3(task_name: str = "", isaacgymenvs_path: str = "", show_cfg: bool = True): - """Load an Isaac Gym environment (preview 3) - + """Load an Isaac Gym environment (preview 3) + Isaac Gym benchmark environments: https://github.com/NVIDIA-Omniverse/IsaacGymEnvs :param task_name: The name of the task (default: ""). 
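As a usage sketch only (not part of the patch): the loaders above combine with the environment wrapper changes later in this diff roughly as follows. The task name "Cartpole" and the zero-action step are assumed examples, and attribute access on the wrapped environment is assumed to follow skrl's Wrapper interface.

    import torch
    from skrl.envs.torch import load_isaacgym_env_preview4, wrap_env

    # load an Isaac Gym (preview 4) task and wrap it for skrl
    env = load_isaacgym_env_preview4(task_name="Cartpole")
    env = wrap_env(env)

    # the wrapped environment follows the terminated/truncated convention adopted in this patch
    states, infos = env.reset()
    actions = torch.zeros((states.shape[0], *env.action_space.shape), device=states.device)
    states, rewards, terminated, truncated, infos = env.step(actions)
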
@@ -156,7 +156,7 @@ def load_isaacgym_env_preview3(task_name: str = "", isaacgymenvs_path: str = "", :type isaacgymenvs_path: str, optional :param show_cfg: Whether to print the configuration (default: True) :type show_cfg: bool, optional - + :raises ValueError: The task name has not been defined, neither by the function parameter nor by the command line arguments :raises RuntimeError: The isaacgymenvs package is not installed or the path is wrong @@ -171,7 +171,7 @@ def load_isaacgym_env_preview3(task_name: str = "", isaacgymenvs_path: str = "", import isaacgym import isaacgymenvs - + # check task from command line arguments defined = False for arg in sys.argv: @@ -233,12 +233,12 @@ def load_isaacgym_env_preview3(task_name: str = "", isaacgymenvs_path: str = "", sys.path.append(isaacgymenvs_path) from tasks import isaacgym_task_map try: - env = isaacgym_task_map[config.task.name](cfg=cfg, + env = isaacgym_task_map[config.task.name](cfg=cfg, sim_device=config.sim_device, graphics_device_id=config.graphics_device_id, headless=config.headless) except TypeError as e: - env = isaacgym_task_map[config.task.name](cfg=cfg, + env = isaacgym_task_map[config.task.name](cfg=cfg, rl_device=config.rl_device, sim_device=config.sim_device, graphics_device_id=config.graphics_device_id, @@ -249,8 +249,8 @@ def load_isaacgym_env_preview3(task_name: str = "", isaacgymenvs_path: str = "", return env def load_isaacgym_env_preview4(task_name: str = "", isaacgymenvs_path: str = "", show_cfg: bool = True): - """Load an Isaac Gym environment (preview 4) - + """Load an Isaac Gym environment (preview 4) + Isaac Gym benchmark environments: https://github.com/NVIDIA-Omniverse/IsaacGymEnvs :param task_name: The name of the task (default: ""). @@ -262,7 +262,7 @@ def load_isaacgym_env_preview4(task_name: str = "", isaacgymenvs_path: str = "", :type isaacgymenvs_path: str, optional :param show_cfg: Whether to print the configuration (default: True) :type show_cfg: bool, optional - + :raises ValueError: The task name has not been defined, neither by the function parameter nor by the command line arguments :raises RuntimeError: The isaacgymenvs package is not installed or the path is wrong @@ -271,10 +271,10 @@ def load_isaacgym_env_preview4(task_name: str = "", isaacgymenvs_path: str = "", """ return load_isaacgym_env_preview3(task_name, isaacgymenvs_path, show_cfg) -def load_omniverse_isaacgym_env(task_name: str = "", - omniisaacgymenvs_path: str = "", - show_cfg: bool = True, - multi_threaded: bool = False, +def load_omniverse_isaacgym_env(task_name: str = "", + omniisaacgymenvs_path: str = "", + show_cfg: bool = True, + multi_threaded: bool = False, timeout: int = 30): """Load an Omniverse Isaac Gym environment @@ -293,7 +293,7 @@ def load_omniverse_isaacgym_env(task_name: str = "", :type multi_threaded: bool, optional :param timeout: Seconds to wait for data when queue is empty in multi-threaded environment (default: 30) :type timeout: int, optional - + :raises ValueError: The task name has not been defined, neither by the function parameter nor by the command line arguments :raises RuntimeError: The omniisaacgymenvs package is not installed or the path is wrong @@ -306,12 +306,12 @@ def load_omniverse_isaacgym_env(task_name: str = "", from hydra._internal.utils import create_automatic_config_search_path, get_args_parser from omegaconf import OmegaConf - + from omni.isaac.gym.vec_env import VecEnvBase, VecEnvMT, TaskStopException from omni.isaac.gym.vec_env.vec_env_mt import TrainerMT import omniisaacgymenvs - + # 
check task from command line arguments defined = False for arg in sys.argv: @@ -350,7 +350,7 @@ def load_omniverse_isaacgym_env(task_name: str = "", hydra_object = Hydra.create_main_hydra2(task_name='load_omniisaacgymenv', config_search_path=search_path) config = hydra_object.compose_config(config_file, args.overrides, run_mode=RunMode.RUN) - cfg = {} + cfg = {} cfg["task"] = _omegaconf_to_dict(config.task) cfg["task_name"] = config.task_name cfg["experiment"] = config.experiment @@ -374,7 +374,7 @@ def load_omniverse_isaacgym_env(task_name: str = "", print("\nOmniverse Isaac Gym environment ({})".format(config.task.name)) _print_cfg(cfg) - # internal classes + # internal classes class _OmniIsaacGymVecEnv(VecEnvBase): def step(self, actions): actions = torch.clamp(actions, -self._task.clip_actions, self._task.clip_actions).to(self._task.device).clone() @@ -404,7 +404,7 @@ def stop(self): class _OmniIsaacGymVecEnvMT(VecEnvMT): def __init__(self, headless): super().__init__(headless) - + self.action_queue = queue.Queue(1) self.data_queue = queue.Queue(1) diff --git a/skrl/envs/torch/wrappers.py b/skrl/envs/torch/wrappers.py index 5f903e58..1bf3428a 100644 --- a/skrl/envs/torch/wrappers.py +++ b/skrl/envs/torch/wrappers.py @@ -1,6 +1,7 @@ from typing import Union, Tuple, Any, Optional import gym +import gymnasium import collections import numpy as np from packaging import version @@ -43,17 +44,17 @@ def __getattr__(self, key: str) -> Any: raise AttributeError("Wrapped environment ({}) does not have attribute '{}'" \ .format(self._env.__class__.__name__, key)) - def reset(self) -> torch.Tensor: + def reset(self) -> Tuple[torch.Tensor, Any]: """Reset the environment :raises NotImplementedError: Not implemented - :return: The state of the environment - :rtype: torch.Tensor + :return: Observation, info + :rtype: torch.Tensor and any other info """ raise NotImplementedError - def step(self, actions: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, Any]: + def step(self, actions: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Any]: """Perform a step in the environment :param actions: The actions to perform @@ -61,7 +62,7 @@ def step(self, actions: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch :raises NotImplementedError: Not implemented - :return: The state, the reward, the done flag, and the info + :return: Observation, reward, terminated, truncated, info :rtype: tuple of torch.Tensor and any other info """ raise NotImplementedError @@ -122,28 +123,29 @@ def __init__(self, env: Any) -> None: self._reset_once = True self._obs_buf = None - def step(self, actions: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, Any]: + def step(self, actions: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Any]: """Perform a step in the environment :param actions: The actions to perform :type actions: torch.Tensor - :return: The state, the reward, the done flag, and the info + :return: Observation, reward, terminated, truncated, info :rtype: tuple of torch.Tensor and any other info """ - self._obs_buf, rew_buf, reset_buf, info = self._env.step(actions) - return self._obs_buf, rew_buf.view(-1, 1), reset_buf.view(-1, 1), info + self._obs_buf, reward, terminated, info = self._env.step(actions) + truncated = torch.zeros_like(terminated) + return self._obs_buf, reward.view(-1, 1), terminated.view(-1, 1), truncated.view(-1, 1), info - def reset(self) -> torch.Tensor: + def reset(self) -> Tuple[torch.Tensor, Any]: 
"""Reset the environment - :return: The state of the environment - :rtype: torch.Tensor + :return: Observation, info + :rtype: torch.Tensor and any other info """ if self._reset_once: self._obs_buf = self._env.reset() self._reset_once = False - return self._obs_buf + return self._obs_buf, {} def render(self, *args, **kwargs) -> None: """Render the environment @@ -168,28 +170,29 @@ def __init__(self, env: Any) -> None: self._reset_once = True self._obs_dict = None - def step(self, actions: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, Any]: + def step(self, actions: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Any]: """Perform a step in the environment :param actions: The actions to perform :type actions: torch.Tensor - :return: The state, the reward, the done flag, and the info + :return: Observation, reward, terminated, truncated, info :rtype: tuple of torch.Tensor and any other info """ - self._obs_dict, rew_buf, reset_buf, info = self._env.step(actions) - return self._obs_dict["obs"], rew_buf.view(-1, 1), reset_buf.view(-1, 1), info + self._obs_dict, reward, terminated, info = self._env.step(actions) + truncated = torch.zeros_like(terminated) + return self._obs_dict["obs"], reward.view(-1, 1), terminated.view(-1, 1), truncated.view(-1, 1), info - def reset(self) -> torch.Tensor: + def reset(self) -> Tuple[torch.Tensor, Any]: """Reset the environment - :return: The state of the environment - :rtype: torch.Tensor + :return: Observation, info + :rtype: torch.Tensor and any other info """ if self._reset_once: self._obs_dict = self._env.reset() self._reset_once = False - return self._obs_dict["obs"] + return self._obs_dict["obs"], {} def render(self, *args, **kwargs) -> None: """Render the environment @@ -224,28 +227,29 @@ def run(self, trainer: Optional["omni.isaac.gym.vec_env.vec_env_mt.TrainerMT"] = """ self._env.run(trainer) - def step(self, actions: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, Any]: + def step(self, actions: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Any]: """Perform a step in the environment :param actions: The actions to perform :type actions: torch.Tensor - :return: The state, the reward, the done flag, and the info + :return: Observation, reward, terminated, truncated, info :rtype: tuple of torch.Tensor and any other info """ - self._obs_dict, rew_buf, reset_buf, info = self._env.step(actions) - return self._obs_dict["obs"], rew_buf.view(-1, 1), reset_buf.view(-1, 1), info + self._obs_dict, reward, terminated, info = self._env.step(actions) + truncated = torch.zeros_like(terminated) + return self._obs_dict["obs"], reward.view(-1, 1), terminated.view(-1, 1), truncated.view(-1, 1), info - def reset(self) -> torch.Tensor: + def reset(self) -> Tuple[torch.Tensor, Any]: """Reset the environment - :return: The state of the environment - :rtype: torch.Tensor + :return: Observation, info + :rtype: torch.Tensor and any other info """ if self._reset_once: self._obs_dict = self._env.reset() self._reset_once = False - return self._obs_dict["obs"] + return self._obs_dict["obs"], {} def render(self, *args, **kwargs) -> None: """Render the environment @@ -271,6 +275,9 @@ def __init__(self, env: Any) -> None: try: if isinstance(env, gym.vector.SyncVectorEnv) or isinstance(env, gym.vector.AsyncVectorEnv): self._vectorized = True + self._reset_once = True + self._obs_tensor = None + self._info_dict = None except Exception as e: print("[WARNING] Failed to check for a vectorized 
environment: {}".format(e)) @@ -304,7 +311,7 @@ def action_space(self) -> gym.Space: return self._env.single_action_space return self._env.action_space - def _observation_to_tensor(self, observation: Any, space: Union[gym.Space, None] = None) -> torch.Tensor: + def _observation_to_tensor(self, observation: Any, space: Optional[gym.Space] = None) -> torch.Tensor: """Convert the OpenAI Gym observation to a flat tensor :param observation: The OpenAI Gym observation to convert to a tensor @@ -362,40 +369,214 @@ def _tensor_to_action(self, actions: torch.Tensor) -> Any: return np.array(actions.cpu().numpy(), dtype=space.dtype).reshape(space.shape) raise ValueError("Action space type {} not supported. Please report this issue".format(type(space))) - def step(self, actions: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, Any]: + def step(self, actions: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Any]: """Perform a step in the environment :param actions: The actions to perform :type actions: torch.Tensor - :return: The state, the reward, the done flag, and the info + :return: Observation, reward, terminated, truncated, info :rtype: tuple of torch.Tensor and any other info """ if self._drepecated_api: - observation, reward, done, info = self._env.step(self._tensor_to_action(actions)) - else: - observation, reward, termination, truncation, info = self._env.step(self._tensor_to_action(actions)) - if type(termination) is bool: - done = termination or truncation + observation, reward, terminated, info = self._env.step(self._tensor_to_action(actions)) + # truncated: https://gymnasium.farama.org/tutorials/handling_time_limits + if type(info) is list: + truncated = np.array([d.get("TimeLimit.truncated", False) for d in info], dtype=terminated.dtype) + terminated *= np.logical_not(truncated) else: - done = np.logical_or(termination, truncation) + truncated = info.get("TimeLimit.truncated", False) + if truncated: + terminated = False + else: + observation, reward, terminated, truncated, info = self._env.step(self._tensor_to_action(actions)) + # convert response to torch - return self._observation_to_tensor(observation), \ - torch.tensor(reward, device=self.device, dtype=torch.float32).view(self.num_envs, -1), \ - torch.tensor(done, device=self.device, dtype=torch.bool).view(self.num_envs, -1), \ - info + observation = self._observation_to_tensor(observation) + reward = torch.tensor(reward, device=self.device, dtype=torch.float32).view(self.num_envs, -1) + terminated = torch.tensor(terminated, device=self.device, dtype=torch.bool).view(self.num_envs, -1) + truncated = torch.tensor(truncated, device=self.device, dtype=torch.bool).view(self.num_envs, -1) + + # save observation and info for vectorized envs + if self._vectorized: + self._obs_tensor = observation + self._info_dict = info + + return observation, reward, terminated, truncated, info - def reset(self) -> torch.Tensor: + def reset(self) -> Tuple[torch.Tensor, Any]: """Reset the environment - :return: The state of the environment - :rtype: torch.Tensor + :return: Observation, info + :rtype: torch.Tensor and any other info """ + # handle vectorized envs + if self._vectorized: + if not self._reset_once: + return self._obs_tensor, self._info_dict + self._reset_once = False + + # reset the env/envs if self._drepecated_api: observation = self._env.reset() + info = {} else: observation, info = self._env.reset() - return self._observation_to_tensor(observation) + return 
self._observation_to_tensor(observation), info + + def render(self, *args, **kwargs) -> None: + """Render the environment + """ + self._env.render(*args, **kwargs) + + def close(self) -> None: + """Close the environment + """ + self._env.close() + + +class GymnasiumWrapper(Wrapper): + def __init__(self, env: Any) -> None: + """Gymnasium environment wrapper + + :param env: The environment to wrap + :type env: Any supported Gymnasium environment + """ + super().__init__(env) + + self._vectorized = False + try: + if isinstance(env, gymnasium.vector.SyncVectorEnv) or isinstance(env, gymnasium.vector.AsyncVectorEnv): + self._vectorized = True + self._reset_once = True + self._obs_tensor = None + self._info_dict = None + except Exception as e: + print("[WARNING] Failed to check for a vectorized environment: {}".format(e)) + + @property + def state_space(self) -> gymnasium.Space: + """State space + + An alias for the ``observation_space`` property + """ + if self._vectorized: + return self._env.single_observation_space + return self._env.observation_space + + @property + def observation_space(self) -> gymnasium.Space: + """Observation space + """ + if self._vectorized: + return self._env.single_observation_space + return self._env.observation_space + + @property + def action_space(self) -> gymnasium.Space: + """Action space + """ + if self._vectorized: + return self._env.single_action_space + return self._env.action_space + + def _observation_to_tensor(self, observation: Any, space: Optional[gymnasium.Space] = None) -> torch.Tensor: + """Convert the Gymnasium observation to a flat tensor + + :param observation: The Gymnasium observation to convert to a tensor + :type observation: Any supported Gymnasium observation space + + :raises: ValueError if the observation space type is not supported + + :return: The observation as a flat tensor + :rtype: torch.Tensor + """ + observation_space = self._env.observation_space if self._vectorized else self.observation_space + space = space if space is not None else observation_space + + if self._vectorized and isinstance(space, gymnasium.spaces.MultiDiscrete): + return torch.tensor(observation, device=self.device, dtype=torch.int64).view(self.num_envs, -1) + elif isinstance(observation, int): + return torch.tensor(observation, device=self.device, dtype=torch.int64).view(self.num_envs, -1) + elif isinstance(observation, np.ndarray): + return torch.tensor(observation, device=self.device, dtype=torch.float32).view(self.num_envs, -1) + elif isinstance(space, gymnasium.spaces.Discrete): + return torch.tensor(observation, device=self.device, dtype=torch.float32).view(self.num_envs, -1) + elif isinstance(space, gymnasium.spaces.Box): + return torch.tensor(observation, device=self.device, dtype=torch.float32).view(self.num_envs, -1) + elif isinstance(space, gymnasium.spaces.Dict): + tmp = torch.cat([self._observation_to_tensor(observation[k], space[k]) \ + for k in sorted(space.keys())], dim=-1).view(self.num_envs, -1) + return tmp + else: + raise ValueError("Observation space type {} not supported. 
Please report this issue".format(type(space))) + + def _tensor_to_action(self, actions: torch.Tensor) -> Any: + """Convert the action to the Gymnasium expected format + + :param actions: The actions to perform + :type actions: torch.Tensor + + :raise ValueError: If the action space type is not supported + + :return: The action in the Gymnasium format + :rtype: Any supported Gymnasium action space + """ + space = self._env.action_space if self._vectorized else self.action_space + + if self._vectorized: + if isinstance(space, gymnasium.spaces.MultiDiscrete): + return np.array(actions.cpu().numpy(), dtype=space.dtype).reshape(space.shape) + elif isinstance(space, gymnasium.spaces.Tuple): + if isinstance(space[0], gymnasium.spaces.Box): + return np.array(actions.cpu().numpy(), dtype=space[0].dtype).reshape(space.shape) + elif isinstance(space[0], gymnasium.spaces.Discrete): + return np.array(actions.cpu().numpy(), dtype=space[0].dtype).reshape(-1) + if isinstance(space, gymnasium.spaces.Discrete): + return actions.item() + elif isinstance(space, gymnasium.spaces.Box): + return np.array(actions.cpu().numpy(), dtype=space.dtype).reshape(space.shape) + raise ValueError("Action space type {} not supported. Please report this issue".format(type(space))) + + def step(self, actions: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Any]: + """Perform a step in the environment + + :param actions: The actions to perform + :type actions: torch.Tensor + + :return: Observation, reward, terminated, truncated, info + :rtype: tuple of torch.Tensor and any other info + """ + observation, reward, terminated, truncated, info = self._env.step(self._tensor_to_action(actions)) + + # convert response to torch + observation = self._observation_to_tensor(observation) + reward = torch.tensor(reward, device=self.device, dtype=torch.float32).view(self.num_envs, -1) + terminated = torch.tensor(terminated, device=self.device, dtype=torch.bool).view(self.num_envs, -1) + truncated = torch.tensor(truncated, device=self.device, dtype=torch.bool).view(self.num_envs, -1) + + # save observation and info for vectorized envs + if self._vectorized: + self._obs_tensor = observation + self._info_dict = info + + return observation, reward, terminated, truncated, info + + def reset(self) -> Tuple[torch.Tensor, Any]: + """Reset the environment + + :return: Observation, info + :rtype: torch.Tensor and any other info + """ + # handle vectorized envs + if self._vectorized: + if not self._reset_once: + return self._obs_tensor, self._info_dict + self._reset_once = False + + # reset the env/envs + observation, info = self._env.reset() + return self._observation_to_tensor(observation), info def render(self, *args, **kwargs) -> None: """Render the environment @@ -472,7 +653,7 @@ def _spec_to_space(self, spec: Any) -> gym.Space: else: raise ValueError("Spec type {} not supported. Please report this issue".format(type(spec))) - def _observation_to_tensor(self, observation: Any, spec: Union[Any, None] = None) -> torch.Tensor: + def _observation_to_tensor(self, observation: Any, spec: Optional[Any] = None) -> torch.Tensor: """Convert the DeepMind observation to a flat tensor :param observation: The DeepMind observation to convert to a tensor @@ -515,36 +696,38 @@ def _tensor_to_action(self, actions: torch.Tensor) -> Any: else: raise ValueError("Action spec type {} not supported. 
Please report this issue".format(type(spec))) - def step(self, actions: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, Any]: + def step(self, actions: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Any]: """Perform a step in the environment :param actions: The actions to perform :type actions: torch.Tensor - :return: The state, the reward, the done flag, and the info + :return: Observation, reward, terminated, truncated, info :rtype: tuple of torch.Tensor and any other info """ timestep = self._env.step(self._tensor_to_action(actions)) observation = timestep.observation reward = timestep.reward if timestep.reward is not None else 0 - done = timestep.last() + terminated = timestep.last() + truncated = False info = {} # convert response to torch return self._observation_to_tensor(observation), \ torch.tensor(reward, device=self.device, dtype=torch.float32).view(self.num_envs, -1), \ - torch.tensor(done, device=self.device, dtype=torch.bool).view(self.num_envs, -1), \ + torch.tensor(terminated, device=self.device, dtype=torch.bool).view(self.num_envs, -1), \ + torch.tensor(truncated, device=self.device, dtype=torch.bool).view(self.num_envs, -1), \ info - def reset(self) -> torch.Tensor: + def reset(self) -> Tuple[torch.Tensor, Any]: """Reset the environment :return: The state of the environment :rtype: torch.Tensor """ timestep = self._env.reset() - return self._observation_to_tensor(timestep.observation) + return self._observation_to_tensor(timestep.observation), {} def render(self, *args, **kwargs) -> None: """Render the environment @@ -565,6 +748,144 @@ def close(self) -> None: self._env.close() +class RobosuiteWrapper(Wrapper): + def __init__(self, env: Any) -> None: + """Robosuite environment wrapper + + :param env: The environment to wrap + :type env: Any supported robosuite environment + """ + super().__init__(env) + + # observation and action spaces + self._observation_space = self._spec_to_space(self._env.observation_spec()) + self._action_space = self._spec_to_space(self._env.action_spec) + + @property + def state_space(self) -> gym.Space: + """State space + + An alias for the ``observation_space`` property + """ + return self._observation_space + + @property + def observation_space(self) -> gym.Space: + """Observation space + """ + return self._observation_space + + @property + def action_space(self) -> gym.Space: + """Action space + """ + return self._action_space + + def _spec_to_space(self, spec: Any) -> gym.Space: + """Convert the robosuite spec to a Gym space + + :param spec: The robosuite spec to convert + :type spec: Any supported robosuite spec + + :raises: ValueError if the spec type is not supported + + :return: The Gym space + :rtype: gym.Space + """ + if type(spec) is tuple: + return gym.spaces.Box(shape=spec[0].shape, + dtype=np.float32, + low=spec[0], + high=spec[1]) + elif isinstance(spec, np.ndarray): + return gym.spaces.Box(shape=spec.shape, + dtype=np.float32, + low=np.full(spec.shape, float("-inf")), + high=np.full(spec.shape, float("inf"))) + elif isinstance(spec, collections.OrderedDict): + return gym.spaces.Dict({k: self._spec_to_space(v) for k, v in spec.items()}) + else: + raise ValueError("Spec type {} not supported. 
Please report this issue".format(type(spec))) + + def _observation_to_tensor(self, observation: Any, spec: Optional[Any] = None) -> torch.Tensor: + """Convert the observation to a flat tensor + + :param observation: The observation to convert to a tensor + :type observation: Any supported observation + + :raises: ValueError if the observation spec type is not supported + + :return: The observation as a flat tensor + :rtype: torch.Tensor + """ + spec = spec if spec is not None else self._env.observation_spec() + + if isinstance(spec, np.ndarray): + return torch.tensor(observation, device=self.device, dtype=torch.float32).reshape(self.num_envs, -1) + elif isinstance(spec, collections.OrderedDict): + return torch.cat([self._observation_to_tensor(observation[k], spec[k]) \ + for k in sorted(spec.keys())], dim=-1).reshape(self.num_envs, -1) + else: + raise ValueError("Observation spec type {} not supported. Please report this issue".format(type(spec))) + + def _tensor_to_action(self, actions: torch.Tensor) -> Any: + """Convert the action to the robosuite expected format + + :param actions: The actions to perform + :type actions: torch.Tensor + + :raise ValueError: If the action space type is not supported + + :return: The action in the robosuite expected format + :rtype: Any supported robosuite action + """ + spec = self._env.action_spec + + if type(spec) is tuple: + return np.array(actions.cpu().numpy(), dtype=np.float32).reshape(spec[0].shape) + else: + raise ValueError("Action spec type {} not supported. Please report this issue".format(type(spec))) + + def step(self, actions: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Any]: + """Perform a step in the environment + + :param actions: The actions to perform + :type actions: torch.Tensor + + :return: Observation, reward, terminated, truncated, info + :rtype: tuple of torch.Tensor and any other info + """ + observation, reward, terminated, info = self._env.step(self._tensor_to_action(actions)) + truncated = False + info = {} + + # convert response to torch + return self._observation_to_tensor(observation), \ + torch.tensor(reward, device=self.device, dtype=torch.float32).view(self.num_envs, -1), \ + torch.tensor(terminated, device=self.device, dtype=torch.bool).view(self.num_envs, -1), \ + torch.tensor(truncated, device=self.device, dtype=torch.bool).view(self.num_envs, -1), \ + info + + def reset(self) -> Tuple[torch.Tensor, Any]: + """Reset the environment + + :return: The state of the environment + :rtype: torch.Tensor + """ + observation = self._env.reset() + return self._observation_to_tensor(observation), {} + + def render(self, *args, **kwargs) -> None: + """Render the environment + """ + self._env.render(*args, **kwargs) + + def close(self) -> None: + """Close the environment + """ + self._env.close() + + def wrap_env(env: Any, wrapper: str = "auto", verbose: bool = True) -> Wrapper: """Wrap an environment to use a common interface @@ -576,7 +897,7 @@ def wrap_env(env: Any, wrapper: str = "auto", verbose: bool = True) -> Wrapper: >>> env = wrap_env(env) :param env: The environment to be wrapped - :type env: gym.Env, dm_env.Environment or VecTask + :type env: gym.Env, gymnasium.Env, dm_env.Environment or VecTask :param wrapper: The type of wrapper to use (default: "auto"). If ``"auto"``, the wrapper will be automatically selected based on the environment class. 
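As a usage sketch for the new robosuite wrapper (the "Lift" task, the "Panda" robot and the robosuite.make keyword arguments are illustrative assumptions, not taken from this changeset), the new reset/step interface can be exercised roughly as follows:

import torch
import robosuite
from skrl.envs.torch import wrap_env

# create a raw robosuite environment (task/robot names and keyword arguments are assumptions)
env = robosuite.make("Lift", robots="Panda", has_renderer=False, use_camera_obs=False)
env = wrap_env(env, wrapper="robosuite")

# reset returns (observation, info); step returns (observation, reward, terminated, truncated, info)
states, infos = env.reset()
actions = torch.zeros((env.num_envs, env.action_space.shape[0]), device=env.device)
states, rewards, terminated, truncated, infos = env.step(actions)
env.close()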
The supported wrappers are described in the following table: @@ -590,8 +911,12 @@ def wrap_env(env: Any, wrapper: str = "auto", verbose: bool = True) -> Wrapper: +====================+=========================+ |OpenAI Gym |``"gym"`` | +--------------------+-------------------------+ + |Gymnasium |``"gymnasium"`` | + +--------------------+-------------------------+ |DeepMind |``"dm"`` | +--------------------+-------------------------+ + |Robosuite |``"robosuite"`` | + +--------------------+-------------------------+ |Isaac Gym preview 2 |``"isaacgym-preview2"`` | +--------------------+-------------------------+ |Isaac Gym preview 3 |``"isaacgym-preview3"`` | @@ -623,10 +948,18 @@ def wrap_env(env: Any, wrapper: str = "auto", verbose: bool = True) -> Wrapper: if verbose: logger.info("Environment wrapper: Gym") return GymWrapper(env) + elif isinstance(env, gymnasium.core.Env) or isinstance(env, gymnasium.core.Wrapper): + if verbose: + logger.info("Environment wrapper: Gymnasium") + return GymnasiumWrapper(env) elif "" in base_classes: if verbose: logger.info("Environment wrapper: DeepMind") return DeepMindWrapper(env) + elif "" in base_classes: if verbose: logger.info("Environment wrapper: Isaac Gym (preview 2)") @@ -638,10 +971,18 @@ def wrap_env(env: Any, wrapper: str = "auto", verbose: bool = True) -> Wrapper: if verbose: logger.info("Environment wrapper: Gym") return GymWrapper(env) + elif wrapper == "gymnasium": + if verbose: + logger.info("Environment wrapper: gymnasium") + return GymnasiumWrapper(env) elif wrapper == "dm": if verbose: logger.info("Environment wrapper: DeepMind") return DeepMindWrapper(env) + elif wrapper == "robosuite": + if verbose: + logger.info("Environment wrapper: Robosuite") + return RobosuiteWrapper(env) elif wrapper == "isaacgym-preview2": if verbose: logger.info("Environment wrapper: Isaac Gym (preview 2)") diff --git a/skrl/memories/torch/__init__.py b/skrl/memories/torch/__init__.py index cad85aaf..31491065 100644 --- a/skrl/memories/torch/__init__.py +++ b/skrl/memories/torch/__init__.py @@ -1,4 +1,3 @@ -from .base import Memory +from skrl.memories.torch.base import Memory -from .random import RandomMemory -from .prioritized import PrioritizedMemory \ No newline at end of file +from skrl.memories.torch.random import RandomMemory diff --git a/skrl/memories/torch/base.py b/skrl/memories/torch/base.py index aaa18ab5..b123fcc5 100644 --- a/skrl/memories/torch/base.py +++ b/skrl/memories/torch/base.py @@ -1,8 +1,9 @@ -from typing import Union, Tuple, List +from typing import Optional, Union, Tuple, List import os import csv import gym +import gymnasium import operator import datetime import functools @@ -13,23 +14,24 @@ class Memory: - def __init__(self, - memory_size: int, - num_envs: int = 1, - device: Union[str, torch.device] = "cuda:0", - export: bool = False, - export_format: str = "pt", + def __init__(self, + memory_size: int, + num_envs: int = 1, + device: Optional[Union[str, torch.device]] = None, + export: bool = False, + export_format: str = "pt", export_directory: str = "") -> None: """Base class representing a memory with circular buffers Buffers are torch tensors with shape (memory size, number of environments, data size). 
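For reference, a minimal sketch of the Gymnasium support wired in above (the "Pendulum-v1" environment id is only an example): wrap_env auto-detects gymnasium.core.Env instances, and wrapper="gymnasium" selects the wrapper explicitly.

import torch
import gymnasium
from skrl.envs.torch import wrap_env

env = wrap_env(gymnasium.make("Pendulum-v1"))   # auto-detected; wrap_env(env, wrapper="gymnasium") is equivalent

# reset returns (observation, info); step returns the terminated/truncated pair instead of a single done flag
states, infos = env.reset()
actions = torch.zeros((env.num_envs, env.action_space.shape[0]), device=env.device)
states, rewards, terminated, truncated, infos = env.step(actions)
env.close()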
Circular buffers are implemented with two integers: a memory index and an environment index - + :param memory_size: Maximum number of elements in the first dimension of each internal storage :type memory_size: int :param num_envs: Number of parallel environments (default: 1) :type num_envs: int, optional - :param device: Device on which a torch tensor is or will be allocated (default: "cuda:0") + :param device: Device on which a torch tensor is or will be allocated (default: ``None``). + If None, the device will be either ``"cuda:0"`` if available or ``"cpu"`` :type device: str or torch.device, optional :param export: Export the memory to a file (default: False). If True, the memory will be exported when the memory is filled @@ -45,7 +47,7 @@ def __init__(self, """ self.memory_size = memory_size self.num_envs = num_envs - self.device = torch.device(device) + self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") if device is None else torch.device(device) # internal variables self.filled = False @@ -54,6 +56,10 @@ def __init__(self, self.tensors = {} self.tensors_view = {} + self.tensors_keep_dimensions = {} + + self.sampling_indexes = None + self.all_sequence_indexes = np.concatenate([np.arange(i, memory_size * num_envs + i, num_envs) for i in range(num_envs)]) # exporting data self.export = export @@ -65,7 +71,7 @@ def __init__(self, def __len__(self) -> int: """Compute and return the current (valid) size of the memory - + The valid size is calculated as the ``memory_size * num_envs`` if the memory is full (filled). Otherwise, the ``memory_index * num_envs + env_index`` is returned @@ -73,28 +79,43 @@ def __len__(self) -> int: :rtype: int """ return self.memory_size * self.num_envs if self.filled else self.memory_index * self.num_envs + self.env_index - - def _get_space_size(self, space: Union[int, Tuple[int], gym.Space]) -> int: + + def _get_space_size(self, + space: Union[int, Tuple[int], gym.Space, gymnasium.Space], + keep_dimensions: bool = False) -> Union[Tuple, int]: """Get the size (number of elements) of a space :param space: Space or shape from which to obtain the number of elements - :type space: int, tuple or list of integers, or gym.Space + :type space: int, tuple or list of integers, gym.Space, or gymnasium.Space + :param keep_dimensions: Whether or not to keep the space dimensions (default: False) + :type keep_dimensions: bool :raises ValueError: If the space is not supported - :return: Size of the space data - :rtype: Space size (number of elements) + :return: Size of the space. 
If keep_dimensions is True, the space size will be a tuple + :rtype: int or tuple of int """ if type(space) in [int, float]: - return int(space) + return (int(space),) if keep_dimensions else int(space) elif type(space) in [tuple, list]: - return np.prod(space) + return tuple(space) if keep_dimensions else np.prod(space) elif issubclass(type(space), gym.Space): if issubclass(type(space), gym.spaces.Discrete): - return 1 + return (1,) if keep_dimensions else 1 elif issubclass(type(space), gym.spaces.Box): - return np.prod(space.shape) + return tuple(space.shape) if keep_dimensions else np.prod(space.shape) elif issubclass(type(space), gym.spaces.Dict): + if keep_dimensions: + raise ValueError("keep_dimensions=True cannot be used with Dict spaces") + return sum([self._get_space_size(space.spaces[key]) for key in space.spaces]) + elif issubclass(type(space), gymnasium.Space): + if issubclass(type(space), gymnasium.spaces.Discrete): + return (1,) if keep_dimensions else 1 + elif issubclass(type(space), gymnasium.spaces.Box): + return tuple(space.shape) if keep_dimensions else np.prod(space.shape) + elif issubclass(type(space), gymnasium.spaces.Dict): + if keep_dimensions: + raise ValueError("keep_dimensions=True cannot be used with Dict spaces") return sum([self._get_space_size(space.spaces[key]) for key in space.spaces]) raise ValueError("Space type {} not supported".format(type(space))) @@ -142,28 +163,34 @@ def set_tensor_by_name(self, name: str, tensor: torch.Tensor) -> None: with torch.no_grad(): self.tensors[name].copy_(tensor) - def create_tensor(self, name: str, size: Union[int, Tuple[int], gym.Space], dtype: Union[torch.dtype, None] = None) -> bool: + def create_tensor(self, + name: str, + size: Union[int, Tuple[int], gym.Space, gymnasium.Space], + dtype: Optional[torch.dtype] = None, + keep_dimensions: bool = False) -> bool: """Create a new internal tensor in memory - + The tensor will have a 3-components shape (memory size, number of environments, size). The internal representation will use _tensor_ as the name of the class property :param name: Tensor name (the name has to follow the python PEP 8 style) :type name: str :param size: Number of elements in the last dimension (effective data size). - The product of the elements will be computed for collections or gym spaces types - :type size: int, tuple or list of integers or gym.Space + The product of the elements will be computed for sequences or gym/gymnasium spaces + :type size: int, tuple or list of integers, gym.Space, or gymnasium.Space :param dtype: Data type (torch.dtype). 
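A short sketch of the extended create_tensor API (tensor names and sizes are illustrative): gymnasium spaces are now accepted alongside gym spaces, and keep_dimensions=True preserves the space shape instead of flattening it.

import torch
import gymnasium
from skrl.memories.torch import RandomMemory

# device defaults to "cuda:0" if available, otherwise "cpu"
memory = RandomMemory(memory_size=1000, num_envs=4)

observation_space = gymnasium.spaces.Box(low=-1, high=1, shape=(3, 64, 64))
memory.create_tensor(name="states", size=observation_space, keep_dimensions=True)  # shape (1000, 4, 3, 64, 64)
memory.create_tensor(name="actions", size=2, dtype=torch.float32)                  # flattened: (1000, 4, 2)
memory.create_tensor(name="rewards", size=1, dtype=torch.float32)
memory.create_tensor(name="terminated", size=1, dtype=torch.bool)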
If None, the global default torch data type will be used (default) :type dtype: torch.dtype or None, optional - + :param keep_dimensions: Whether or not to keep the dimensions defined through the size parameter (default: False) + :type keep_dimensions: bool + :raises ValueError: The tensor name exists already but the size or dtype are different :return: True if the tensor was created, otherwise False :rtype: bool """ # compute data size - size = self._get_space_size(size) + size = self._get_space_size(size, keep_dimensions) # check dtype and size if the tensor exists if name in self.tensors: tensor = self.tensors[name] @@ -172,10 +199,15 @@ def create_tensor(self, name: str, size: Union[int, Tuple[int], gym.Space], dtyp if dtype is not None and tensor.dtype != dtype: raise ValueError("The dtype of the tensor {} ({}) doesn't match the existing one ({})".format(name, dtype, tensor.dtype)) return False + # define tensor shape + tensor_shape = (self.memory_size, self.num_envs, *size) if keep_dimensions else (self.memory_size, self.num_envs, size) + view_shape = (-1, *size) if keep_dimensions else (-1, size) # create tensor (_tensor_) and add it to the internal storage - setattr(self, "_tensor_{}".format(name), torch.zeros((self.memory_size, self.num_envs, size), device=self.device, dtype=dtype)) + setattr(self, "_tensor_{}".format(name), torch.zeros(tensor_shape, device=self.device, dtype=dtype)) + # update internal variables self.tensors[name] = getattr(self, "_tensor_{}".format(name)) - self.tensors_view[name] = self.tensors[name].view(-1, self.tensors[name].size(-1)) + self.tensors_view[name] = self.tensors[name].view(*view_shape) + self.tensors_keep_dimensions[name] = keep_dimensions # fill the tensors (float tensors) with NaN for tensor in self.tensors.values(): if torch.is_floating_point(tensor): @@ -210,7 +242,7 @@ def add_samples(self, **tensors: torch.Tensor) -> None: - number of environments less than num_envs: Store the samples and increment the environment index (second index) by the number of the environments - + - number of environments equals num_envs: Store the samples and increment the memory index (first index) by one @@ -224,21 +256,21 @@ def add_samples(self, **tensors: torch.Tensor) -> None: raise ValueError("No samples to be recorded in memory. 
Pass samples as key-value arguments (where key is the tensor name)") # dimensions and shapes of the tensors (assume all tensors have the dimensions of the first tensor) - tmp = tensors[next(iter(tensors))] + tmp = tensors.get("states", tensors[next(iter(tensors))]) # ask for states first dim, shape = tmp.ndim, tmp.shape - # multi environment (number of environments less than num_envs) - if dim == 2 and shape[0] < self.num_envs: - for name, tensor in tensors.items(): - if name in self.tensors: - self.tensors[name][self.memory_index, self.env_index:self.env_index + tensor.shape[0]].copy_(tensor) - self.env_index += tensor.shape[0] # multi environment (number of environments equals num_envs) - elif dim == 2 and shape[0] == self.num_envs: + if dim == 2 and shape[0] == self.num_envs: for name, tensor in tensors.items(): if name in self.tensors: self.tensors[name][self.memory_index].copy_(tensor) self.memory_index += 1 + # multi environment (number of environments less than num_envs) + elif dim == 2 and shape[0] < self.num_envs: + for name, tensor in tensors.items(): + if name in self.tensors: + self.tensors[name][self.memory_index, self.env_index:self.env_index + tensor.shape[0]].copy_(tensor) + self.env_index += tensor.shape[0] # single environment - multi sample (number of environments greater than num_envs (num_envs = 1)) elif dim == 2 and self.num_envs == 1: for name, tensor in tensors.items(): @@ -273,7 +305,11 @@ def add_samples(self, **tensors: torch.Tensor) -> None: if self.export: self.save(directory=self.export_directory, format=self.export_format) - def sample(self, names: Tuple[str], batch_size: int, mini_batches: int = 1) -> List[List[torch.Tensor]]: + def sample(self, + names: Tuple[str], + batch_size: int, + mini_batches: int = 1, + sequence_length: int = 1) -> List[List[torch.Tensor]]: """Data sampling method to be implemented by the inheriting classes :param names: Tensors names from which to obtain the samples @@ -282,9 +318,11 @@ def sample(self, names: Tuple[str], batch_size: int, mini_batches: int = 1) -> L :type batch_size: int :param mini_batches: Number of mini-batches to sample (default: 1) :type mini_batches: int, optional - + :param sequence_length: Length of each sequence (default: 1) + :type sequence_length: int, optional + :raises NotImplementedError: The method has not been implemented - + :return: Sampled data from tensors sorted according to their position in the list of names. The sampled tensors will have the following shape: (batch size, data size) :rtype: list of torch.Tensor list @@ -310,29 +348,47 @@ def sample_by_index(self, names: Tuple[str], indexes: Union[tuple, np.ndarray, t return [[self.tensors_view[name][batch] for name in names] for batch in batches] return [[self.tensors_view[name][indexes] for name in names]] - def sample_all(self, names: Tuple[str], mini_batches: int = 1) -> List[List[torch.Tensor]]: + def sample_all(self, names: Tuple[str], mini_batches: int = 1, sequence_length: int = 1) -> List[List[torch.Tensor]]: """Sample all data from memory - + :param names: Tensors names from which to obtain the samples :type names: tuple or list of strings :param mini_batches: Number of mini-batches to sample (default: 1) :type mini_batches: int, optional + :param sequence_length: Length of each sequence (default: 1) + :type sequence_length: int, optional :return: Sampled data from memory. 
The sampled tensors will have the following shape: (memory size * number of environments, data size) :rtype: list of torch.Tensor list """ + # sequential order + if sequence_length > 1: + if mini_batches > 1: + batches = BatchSampler(self.all_sequence_indexes, batch_size=len(self.all_sequence_indexes) // mini_batches, drop_last=True) + return [[self.tensors_view[name][batch] for name in names] for batch in batches] + return [[self.tensors_view[name][self.all_sequence_indexes] for name in names]] + + # default order if mini_batches > 1: indexes = np.arange(self.memory_size * self.num_envs) batches = BatchSampler(indexes, batch_size=len(indexes) // mini_batches, drop_last=True) return [[self.tensors_view[name][batch] for name in names] for batch in batches] return [[self.tensors_view[name] for name in names]] - + + def get_sampling_indexes(self) -> Union[tuple, np.ndarray, torch.Tensor]: + """Get the last indexes used for sampling + + :return: Last sampling indexes + :rtype: tuple or list, numpy.ndarray or torch.Tensor + """ + return self.sampling_indexes + def save(self, directory: str = "", format: str = "pt") -> None: """Save the memory to a file Supported formats: - + - PyTorch (pt) - NumPy (npz) - Comma-separated values (csv) @@ -350,7 +406,7 @@ def save(self, directory: str = "", format: str = "pt") -> None: os.makedirs(os.path.join(directory, "memories"), exist_ok=True) memory_path = os.path.join(directory, "memories", \ "{}_memory_{}.{}".format(datetime.datetime.now().strftime("%y-%m-%d_%H-%M-%S-%f"), hex(id(self)), format)) - + # torch if format == "pt": torch.save({name: self.tensors[name] for name in self.get_tensor_names()}, memory_path) @@ -359,7 +415,7 @@ def save(self, directory: str = "", format: str = "pt") -> None: np.savez(memory_path, **{name: self.tensors[name].cpu().numpy() for name in self.get_tensor_names()}) # comma-separated values elif format == "csv": - # open csv writer + # open csv writer # TODO: support keeping the dimensions with open(memory_path, "a") as file: writer = csv.writer(file) names = self.get_tensor_names() @@ -391,18 +447,18 @@ def load(self, path: str) -> None: data = torch.load(path) for name in self.get_tensor_names(): setattr(self, "_tensor_{}".format(name), data[name]) - + # numpy elif path.endswith(".npz"): data = np.load(path) for name in data: setattr(self, "_tensor_{}".format(name), torch.tensor(data[name])) - + # comma-separated values elif path.endswith(".csv"): # TODO: load the memory from a csv pass - + # unsupported format else: raise ValueError("Unsupported format: {}".format(path)) diff --git a/skrl/memories/torch/prioritized.py b/skrl/memories/torch/prioritized.py deleted file mode 100644 index 926ab084..00000000 --- a/skrl/memories/torch/prioritized.py +++ /dev/null @@ -1,56 +0,0 @@ -from typing import Union, Tuple - -import numpy as np - -import torch - -from .base import Memory - - -class PrioritizedMemory(Memory): - def __init__(self, memory_size: int, num_envs: int = 1, device: Union[str, torch.device] = "cuda:0", preallocate: bool = True, alpha: float = 0.5, beta: float = 0.4, eps: float = 1e-6) -> None: - """Prioritized sampling memory - - Sample a batch from memory randomly - - :param memory_size: Maximum number of elements in the first dimension of each internal storage - :type memory_size: int - :param num_envs: Number of parallel environments (default: 1) - :type num_envs: int, optional - :param device: Device on which a torch tensor is or will be allocated (default: "cuda:0") - :type device: str or torch.device, 
optional - :param preallocate: If true, preallocate memory for efficient use (default: True) - :type preallocate: bool, optional - :param replacement: Flag to indicate whether the sample is with or without replacement (default: True). - Replacement implies that a value can be selected multiple times (the batch size is always guaranteed). - Sampling without replacement will return a batch of maximum memory size if the memory size is less than the requested batch size - :type replacement: bool, optional - :param alpha: Hyperparameter for prioritized sampling (default: 0.5) - :type alpha: float, optional - :param beta: Hyperparameter for prioritized sampling (default: 0.4) - :type beta: float, optional - :param eps: Hyperparameter for prioritized sampling (default: 1e-6) - :type eps: float, optional - """ - super().__init__(memory_size, num_envs, device, preallocate) - - self.alpha = alpha - self.beta = beta - self.eps = eps - - def sample(self, batch_size: int, names: Tuple[str]) -> Tuple[torch.Tensor]: - """Sample a batch from memory randomly - - :param batch_size: Number of element to sample - :type batch_size: int - :param names: Tensors names from which to obtain the samples - :type names: tuple or list of strings - - :return: Sampled data from tensors sorted according to their position in the list of names. - The sampled tensors will have the following shape: (batch size, data size) - :rtype: tuple of torch.Tensor - """ - # generate random indexes - indexes = np.random.choice(len(self), size=batch_size, replace=True) - - return self.sample_by_index(indexes=indexes, names=names) diff --git a/skrl/memories/torch/random.py b/skrl/memories/torch/random.py index 827d3b04..01817a28 100644 --- a/skrl/memories/torch/random.py +++ b/skrl/memories/torch/random.py @@ -1,18 +1,18 @@ -from typing import Union, Tuple, List +from typing import Optional, Union, Tuple, List import torch -from .base import Memory +from skrl.memories.torch import Memory class RandomMemory(Memory): - def __init__(self, - memory_size: int, - num_envs: int = 1, - device: Union[str, torch.device] = "cuda:0", - export: bool = False, - export_format: str = "pt", - export_directory: str = "", + def __init__(self, + memory_size: int, + num_envs: int = 1, + device: Optional[Union[str, torch.device]] = None, + export: bool = False, + export_format: str = "pt", + export_directory: str = "", replacement=True) -> None: """Random sampling memory @@ -22,7 +22,8 @@ def __init__(self, :type memory_size: int :param num_envs: Number of parallel environments (default: 1) :type num_envs: int, optional - :param device: Device on which a torch tensor is or will be allocated (default: "cuda:0") + :param device: Device on which a torch tensor is or will be allocated (default: ``None``). + If None, the device will be either ``"cuda:0"`` if available or ``"cpu"`` :type device: str or torch.device, optional :param export: Export the memory to a file (default: False). If True, the memory will be exported when the memory is filled @@ -33,7 +34,7 @@ def __init__(self, :param export_directory: Directory where the memory will be exported (default: ""). If empty, the agent's experiment directory will be used :type export_directory: str, optional - :param replacement: Flag to indicate whether the sample is with or without replacement (default: True). + :param replacement: Flag to indicate whether the sample is with or without replacement (default: True). Replacement implies that a value can be selected multiple times (the batch size is always guaranteed). 
Sampling without replacement will return a batch of maximum memory size if the memory size is less than the requested batch size :type replacement: bool, optional @@ -44,7 +45,11 @@ def __init__(self, self._replacement = replacement - def sample(self, names: Tuple[str], batch_size: int, mini_batches: int = 1) -> List[List[torch.Tensor]]: + def sample(self, + names: Tuple[str], + batch_size: int, + mini_batches: int = 1, + sequence_length: int = 1) -> List[List[torch.Tensor]]: """Sample a batch from memory randomly :param names: Tensors names from which to obtain the samples @@ -53,17 +58,30 @@ def sample(self, names: Tuple[str], batch_size: int, mini_batches: int = 1) -> L :type batch_size: int :param mini_batches: Number of mini-batches to sample (default: 1) :type mini_batches: int, optional + :param sequence_length: Length of each sequence (default: 1) + :type sequence_length: int, optional :return: Sampled data from tensors sorted according to their position in the list of names. The sampled tensors will have the following shape: (batch size, data size) :rtype: list of torch.Tensor list """ + # compute valid memory sizes + size = len(self) + if sequence_length > 1: + sequence_indexes = torch.arange(0, self.num_envs * sequence_length, self.num_envs) + size -= sequence_indexes[-1].item() + # generate random indexes if self._replacement: - indexes = torch.randint(0, len(self), (batch_size,), device=self.device) + indexes = torch.randint(0, size, (batch_size,)) else: - # details about the random sampling performance can be found here: + # details about the random sampling performance can be found here: # https://discuss.pytorch.org/t/torch-equivalent-of-numpy-random-choice/16146/19 - indexes = torch.randperm(len(self), dtype=torch.long, device=self.device)[:batch_size] + indexes = torch.randperm(size, dtype=torch.long)[:batch_size] + + # generate sequence indexes + if sequence_length > 1: + indexes = (sequence_indexes.repeat(indexes.shape[0], 1) + indexes.view(-1, 1)).view(-1) + self.sampling_indexes = indexes return self.sample_by_index(names=names, indexes=indexes, mini_batches=mini_batches) diff --git a/skrl/models/torch/__init__.py b/skrl/models/torch/__init__.py index 9a4e879c..158bb6c8 100644 --- a/skrl/models/torch/__init__.py +++ b/skrl/models/torch/__init__.py @@ -1,7 +1,7 @@ -from .base import Model +from skrl.models.torch.base import Model -from .tabular import TabularMixin -from .gaussian import GaussianMixin -from .categorical import CategoricalMixin -from .deterministic import DeterministicMixin -from .multivariate_gaussian import MultivariateGaussianMixin +from skrl.models.torch.tabular import TabularMixin +from skrl.models.torch.gaussian import GaussianMixin +from skrl.models.torch.categorical import CategoricalMixin +from skrl.models.torch.deterministic import DeterministicMixin +from skrl.models.torch.multivariate_gaussian import MultivariateGaussianMixin diff --git a/skrl/models/torch/base.py b/skrl/models/torch/base.py index 06cfadf6..8caa2480 100644 --- a/skrl/models/torch/base.py +++ b/skrl/models/torch/base.py @@ -1,6 +1,7 @@ -from typing import Optional, Union, Mapping, Sequence +from typing import Optional, Union, Mapping, Sequence, Tuple, Any import gym +import gymnasium import collections import numpy as np @@ -10,27 +11,28 @@ class Model(torch.nn.Module): - def __init__(self, - observation_space: Union[int, Sequence[int], gym.Space], - action_space: Union[int, Sequence[int], gym.Space], - device: Union[str, torch.device] = "cuda:0") -> None: + def __init__(self, 
+ observation_space: Union[int, Sequence[int], gym.Space, gymnasium.Space], + action_space: Union[int, Sequence[int], gym.Space, gymnasium.Space], + device: Optional[Union[str, torch.device]] = None) -> None: """Base class representing a function approximator The following properties are defined: - ``device`` (torch.device): Device to be used for the computations - - ``observation_space`` (int, sequence of int, gym.Space): Observation/state space - - ``action_space`` (int, sequence of int, gym.Space): Action space + - ``observation_space`` (int, sequence of int, gym.Space, gymnasium.Space): Observation/state space + - ``action_space`` (int, sequence of int, gym.Space, gymnasium.Space): Action space - ``num_observations`` (int): Number of elements in the observation/state space - ``num_actions`` (int): Number of elements in the action space - + :param observation_space: Observation/state space or shape. The ``num_observations`` property will contain the size of that space - :type observation_space: int, sequence of int, gym.Space + :type observation_space: int, sequence of int, gym.Space, gymnasium.Space :param action_space: Action space or shape. The ``num_actions`` property will contain the size of that space - :type action_space: int, sequence of int, gym.Space - :param device: Device on which a torch tensor is or will be allocated (default: ``"cuda:0"``) + :type action_space: int, sequence of int, gym.Space, gymnasium.Space + :param device: Device on which a torch tensor is or will be allocated (default: ``None``). + If None, the device will be either ``"cuda:0"`` if available or ``"cpu"`` :type device: str or torch.device, optional Custom models should override the ``act`` method:: @@ -45,14 +47,14 @@ def __init__(self, observation_space, action_space, device="cuda:0"): self.layer_1 = nn.Linear(self.num_observations, 64) self.layer_2 = nn.Linear(64, self.num_actions) - def act(self, states, taken_actions=None, role=""): - x = F.relu(self.layer_1(states)) + def act(self, inputs, role=""): + x = F.relu(self.layer_1(inputs["states"])) x = F.relu(self.layer_2(x)) - return x + return x, None, {} """ super(Model, self).__init__() - self.device = torch.device(device) + self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") if device is None else torch.device(device) self.observation_space = observation_space self.action_space = action_space @@ -61,14 +63,14 @@ def act(self, states, taken_actions=None, role=""): self._random_distribution = None - def _get_space_size(self, - space: Union[int, Sequence[int], gym.Space], + def _get_space_size(self, + space: Union[int, Sequence[int], gym.Space, gymnasium.Space], number_of_elements: bool = True) -> int: """Get the size (number of elements) of a space :param space: Space or shape from which to obtain the number of elements - :type space: int, sequence of int, or gym.Space - :param number_of_elements: Whether the number of elements occupied by the space is returned (default: ``True``). + :type space: int, sequence of int, gym.Space, or gymnasium.Space + :param number_of_elements: Whether the number of elements occupied by the space is returned (default: ``True``). If ``False``, the shape of the space is returned. It only affects Discrete spaces :type number_of_elements: bool, optional @@ -100,7 +102,7 @@ def _get_space_size(self, 1 # Dict space - >>> space = gym.spaces.Dict({'a': gym.spaces.Box(low=-1, high=1, shape=(2, 3)), + >>> space = gym.spaces.Dict({'a': gym.spaces.Box(low=-1, high=1, shape=(2, 3)), ... 
'b': gym.spaces.Discrete(4)}) >>> model._get_space_size(space) 10 @@ -122,27 +124,37 @@ def _get_space_size(self, size = np.prod(space.shape) elif issubclass(type(space), gym.spaces.Dict): size = sum([self._get_space_size(space.spaces[key], number_of_elements) for key in space.spaces]) + elif issubclass(type(space), gymnasium.Space): + if issubclass(type(space), gymnasium.spaces.Discrete): + if number_of_elements: + size = space.n + else: + size = 1 + elif issubclass(type(space), gymnasium.spaces.Box): + size = np.prod(space.shape) + elif issubclass(type(space), gymnasium.spaces.Dict): + size = sum([self._get_space_size(space.spaces[key], number_of_elements) for key in space.spaces]) if size is None: raise ValueError("Space type {} not supported".format(type(space))) return int(size) - def tensor_to_space(self, - tensor: torch.Tensor, - space: gym.Space, + def tensor_to_space(self, + tensor: torch.Tensor, + space: Union[gym.Space, gymnasium.Space], start: int = 0) -> Union[torch.Tensor, dict]: - """Map a flat tensor to a Gym space + """Map a flat tensor to a Gym/Gymnasium space The mapping is done in the following way: - Tensors belonging to Discrete spaces are returned without modification - - Tensors belonging to Box spaces are reshaped to the corresponding space shape + - Tensors belonging to Box spaces are reshaped to the corresponding space shape keeping the first dimension (number of samples) as they are - Tensors belonging to Dict spaces are mapped into a dictionary with the same keys as the original space :param tensor: Tensor to map from :type tensor: torch.Tensor :param space: Space to map the tensor to - :type space: gym.Space + :type space: gym.Space or gymnasium.Space :param start: Index of the first element of the tensor to map (default: ``0``) :type start: int, optional @@ -153,7 +165,7 @@ def tensor_to_space(self, Example:: - >>> space = gym.spaces.Dict({'a': gym.spaces.Box(low=-1, high=1, shape=(2, 3)), + >>> space = gym.spaces.Dict({'a': gym.spaces.Box(low=-1, high=1, shape=(2, 3)), ... 
'b': gym.spaces.Discrete(4)}) >>> tensor = torch.tensor([[-0.3, -0.2, -0.1, 0.1, 0.2, 0.3, 2]]) >>> @@ -162,56 +174,68 @@ def tensor_to_space(self, [ 0.1000, 0.2000, 0.3000]]]), 'b': tensor([[2.]])} """ - if issubclass(type(space), gym.spaces.Discrete): - return tensor - elif issubclass(type(space), gym.spaces.Box): - return tensor.view(tensor.shape[0], *space.shape) - elif issubclass(type(space), gym.spaces.Dict): - output = {} - for k in sorted(space.keys()): - end = start + self._get_space_size(space[k], number_of_elements=False) - output[k] = self.tensor_to_space(tensor[:, start:end], space[k], end) - start = end - return output + if issubclass(type(space), gym.Space): + if issubclass(type(space), gym.spaces.Discrete): + return tensor + elif issubclass(type(space), gym.spaces.Box): + return tensor.view(tensor.shape[0], *space.shape) + elif issubclass(type(space), gym.spaces.Dict): + output = {} + for k in sorted(space.keys()): + end = start + self._get_space_size(space[k], number_of_elements=False) + output[k] = self.tensor_to_space(tensor[:, start:end], space[k], end) + start = end + return output + else: + if issubclass(type(space), gymnasium.spaces.Discrete): + return tensor + elif issubclass(type(space), gymnasium.spaces.Box): + return tensor.view(tensor.shape[0], *space.shape) + elif issubclass(type(space), gymnasium.spaces.Dict): + output = {} + for k in sorted(space.keys()): + end = start + self._get_space_size(space[k], number_of_elements=False) + output[k] = self.tensor_to_space(tensor[:, start:end], space[k], end) + start = end + return output raise ValueError("Space type {} not supported".format(type(space))) - def random_act(self, - states: torch.Tensor, - taken_actions: Optional[torch.Tensor] = None, - role: str = "") -> Sequence[torch.Tensor]: + def random_act(self, + inputs: Mapping[str, Union[torch.Tensor, Any]], + role: str = "") -> Tuple[torch.Tensor, None, Mapping[str, Union[torch.Tensor, Any]]]: """Act randomly according to the action space - :param states: Observation/state of the environment used to get the shape of the action space - :type states: torch.Tensor - :param taken_actions: Actions taken by a policy to the given states (default: ``None``). - The use of these actions only makes sense in critical models, e.g. - :type taken_actions: torch.Tensor, optional + :param inputs: Model inputs. The most common keys are: + + - ``"states"``: state of the environment used to make the decision + - ``"taken_actions"``: actions taken by the policy for the given states + :type inputs: dict where the values are typically torch.Tensor :param role: Role play by the model (default: ``""``) :type role: str, optional :raises NotImplementedError: Unsupported action space - :return: Random actions to be taken by the agent - :rtype: sequence of torch.Tensor + :return: Model output. 
The first component is the action to be taken by the agent + :rtype: tuple of torch.Tensor, None, and dictionary """ # discrete action space (Discrete) - if issubclass(type(self.action_space), gym.spaces.Discrete): - return torch.randint(self.action_space.n, (states.shape[0], 1), device=self.device), None, None + if issubclass(type(self.action_space), gym.spaces.Discrete) or issubclass(type(self.action_space), gymnasium.spaces.Discrete): + return torch.randint(self.action_space.n, (inputs["states"].shape[0], 1), device=self.device), None, {} # continuous action space (Box) - elif issubclass(type(self.action_space), gym.spaces.Box): + elif issubclass(type(self.action_space), gym.spaces.Box) or issubclass(type(self.action_space), gymnasium.spaces.Box): if self._random_distribution is None: self._random_distribution = torch.distributions.uniform.Uniform( low=torch.tensor(self.action_space.low[0], device=self.device, dtype=torch.float32), high=torch.tensor(self.action_space.high[0], device=self.device, dtype=torch.float32)) - - return self._random_distribution.sample(sample_shape=(states.shape[0], self.num_actions)), None, None + + return self._random_distribution.sample(sample_shape=(inputs["states"].shape[0], self.num_actions)), None, {} else: raise NotImplementedError("Action space type ({}) not supported".format(type(self.action_space))) def init_parameters(self, method_name: str = "normal_", *args, **kwargs) -> None: """Initialize the model parameters according to the specified method name - Method names are from the `torch.nn.init `_ module. + Method names are from the `torch.nn.init `_ module. Allowed method names are *uniform_*, *normal_*, *constant_*, etc. :param method_name: `torch.nn.init `_ method name (default: ``"normal_"``) @@ -234,13 +258,13 @@ def init_parameters(self, method_name: str = "normal_", *args, **kwargs) -> None def init_weights(self, method_name: str = "orthogonal_", *args, **kwargs) -> None: """Initialize the model weights according to the specified method name - - Method names are from the `torch.nn.init `_ module. + + Method names are from the `torch.nn.init `_ module. Allowed method names are *uniform_*, *normal_*, *constant_*, etc. The following layers will be initialized: - torch.nn.Linear - + :param method_name: `torch.nn.init `_ method name (default: ``"orthogonal_"``) :type method_name: str, optional :param args: Positional arguments of the method to be called @@ -262,69 +286,139 @@ def _update_weights(module, method_name, args, kwargs): _update_weights(layer, method_name, args, kwargs) elif isinstance(layer, torch.nn.Linear): exec("torch.nn.init.{}(layer.weight, *args, **kwargs)".format(method_name)) - + _update_weights(self.children(), method_name, args, kwargs) - def forward(self): + def init_biases(self, method_name: str = "constant_", *args, **kwargs) -> None: + """Initialize the model biases according to the specified method name + + Method names are from the `torch.nn.init `_ module. + Allowed method names are *uniform_*, *normal_*, *constant_*, etc. 
+ + The following layers will be initialized: + - torch.nn.Linear + + :param method_name: `torch.nn.init `_ method name (default: ``"constant_"``) + :type method_name: str, optional + :param args: Positional arguments of the method to be called + :type args: tuple, optional + :param kwargs: Key-value arguments of the method to be called + :type kwargs: dict, optional + + Example:: + + # initialize all biases with a constant value (0) + >>> model.init_biases(method_name="constant_", val=0) + + # initialize all biases with normal distribution with mean 0 and standard deviation 0.25 + >>> model.init_biases(method_name="normal_", mean=0.0, std=0.25) + """ + def _update_biases(module, method_name, args, kwargs): + for layer in module: + if isinstance(layer, torch.nn.Sequential): + _update_biases(layer, method_name, args, kwargs) + elif isinstance(layer, torch.nn.Linear): + exec("torch.nn.init.{}(layer.bias, *args, **kwargs)".format(method_name)) + + _update_biases(self.children(), method_name, args, kwargs) + + def get_specification(self) -> Mapping[str, Any]: + """Returns the specification of the model + + The following keys are used by the agents for initialization: + + - ``"rnn"``: Recurrent Neural Network (RNN) specification for RNN, LSTM and GRU layers/cells + + - ``"sizes"``: List of RNN shapes (number of layers, number of environments, number of features in the RNN state). + There must be as many tuples as there are states in the recurrent layer/cell. E.g., LSTM has 2 states (hidden and cell). + + :return: Dictionary containing advanced specification of the model + :rtype: dict + + Example:: + + # model with a LSTM layer. + # - number of layers: 1 + # - number of environments: 4 + # - number of features in the RNN state: 64 + >>> model.get_specification() + {'rnn': {'sizes': [(1, 4, 64), (1, 4, 64)]}} + """ + return {} + + def forward(self, + inputs: Mapping[str, Union[torch.Tensor, Any]], + role: str = "") -> Tuple[torch.Tensor, Union[torch.Tensor, None], Mapping[str, Union[torch.Tensor, Any]]]: """Forward pass of the model - :raises NotImplementedError: Child class must ``.act()`` and ``.compute()`` methods + This method calls the ``.act()`` method and returns its outputs + + :param inputs: Model inputs. The most common keys are: + + - ``"states"``: state of the environment used to make the decision + - ``"taken_actions"``: actions taken by the policy for the given states + :type inputs: dict where the values are typically torch.Tensor + :param role: Role play by the model (default: ``""``) + :type role: str, optional + + :return: Model output. The first component is the action to be taken by the agent. + The second component is the log of the probability density function for stochastic models + or None for deterministic models. 
The third component is a dictionary containing extra output values + :rtype: tuple of torch.Tensor, torch.Tensor or None, and dictionary """ - raise NotImplementedError("Implement .act() and .compute() methods instead of this") + return self.act(inputs, role) - def compute(self, - states: torch.Tensor, - taken_actions: Optional[torch.Tensor] = None, - role: str = "") -> Union[torch.Tensor, Sequence[torch.Tensor]]: + def compute(self, + inputs: Mapping[str, Union[torch.Tensor, Any]], + role: str = "") -> Tuple[Union[torch.Tensor, Mapping[str, Union[torch.Tensor, Any]]]]: """Define the computation performed (to be implemented by the inheriting classes) by the models - :param states: Observation/state of the environment used to make the decision - :type states: torch.Tensor - :param taken_actions: Actions taken by a policy to the given states (default: ``None``). - The use of these actions only makes sense in critical models, e.g. - :type taken_actions: torch.Tensor, optional + :param inputs: Model inputs. The most common keys are: + + - ``"states"``: state of the environment used to make the decision + - ``"taken_actions"``: actions taken by the policy for the given states + :type inputs: dict where the values are typically torch.Tensor :param role: Role play by the model (default: ``""``) :type role: str, optional :raises NotImplementedError: Child class must implement this method - + :return: Computation performed by the models - :rtype: torch.Tensor or sequence of torch.Tensor + :rtype: tuple of torch.Tensor and dictionary """ raise NotImplementedError("The computation performed by the models (.compute()) is not implemented") - def act(self, - states: torch.Tensor, - taken_actions: Optional[torch.Tensor] = None, - role: str = "") -> Sequence[torch.Tensor]: + def act(self, + inputs: Mapping[str, Union[torch.Tensor, Any]], + role: str = "") -> Tuple[torch.Tensor, Union[torch.Tensor, None], Mapping[str, Union[torch.Tensor, Any]]]: """Act according to the specified behavior (to be implemented by the inheriting classes) Agents will call this method to obtain the decision to be taken given the state of the environment. This method is currently implemented by the helper models (**GaussianModel**, etc.). The classes that inherit from the latter must only implement the ``.compute()`` method - :param states: Observation/state of the environment used to make the decision - :type states: torch.Tensor - :param taken_actions: Actions taken by a policy to the given states (default: ``None``). - The use of these actions only makes sense in critical models, e.g. - :type taken_actions: torch.Tensor, optional + :param inputs: Model inputs. The most common keys are: + + - ``"states"``: state of the environment used to make the decision + - ``"taken_actions"``: actions taken by the policy for the given states + :type inputs: dict where the values are typically torch.Tensor :param role: Role play by the model (default: ``""``) :type role: str, optional :raises NotImplementedError: Child class must implement this method - - :return: Action to be taken by the agent given the state of the environment. - The typical sequence's components are the actions, the log of the probability density function and mean actions. - Deterministic agents must ignore the last two components and return empty tensors or None for them - :rtype: sequence of torch.Tensor + + :return: Model output. The first component is the action to be taken by the agent. 
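Putting the new dictionary-based convention together, the sketch below (an editor's illustration under the skrl >= 0.9.0 API, with arbitrary spaces and shapes) defines a deterministic critic whose ``compute()`` reads both ``"states"`` and ``"taken_actions"`` from the inputs dictionary and is queried through ``act()``::

    import gym
    import torch
    import torch.nn as nn

    from skrl.models.torch import Model, DeterministicMixin

    class Critic(DeterministicMixin, Model):
        def __init__(self, observation_space, action_space, device="cpu"):
            Model.__init__(self, observation_space, action_space, device)
            DeterministicMixin.__init__(self)
            self.net = nn.Sequential(nn.Linear(self.num_observations + self.num_actions, 32),
                                     nn.ELU(),
                                     nn.Linear(32, 1))

        def compute(self, inputs, role):
            # "taken_actions" replaces the old positional taken_actions argument
            return self.net(torch.cat([inputs["states"], inputs["taken_actions"]], dim=-1)), {}

    critic = Critic(gym.spaces.Box(low=-1, high=1, shape=(4,)),
                    gym.spaces.Box(low=-1, high=1, shape=(2,)))

    values, _, outputs = critic.act({"states": torch.rand(8, 4),
                                     "taken_actions": torch.rand(8, 2)})
    print(values.shape, outputs)  # torch.Size([8, 1]) {}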
+ The second component is the log of the probability density function for stochastic models + or None for deterministic models. The third component is a dictionary containing extra output values + :rtype: tuple of torch.Tensor, torch.Tensor or None, and dictionary """ logger.warning("Make sure to place Mixins before Model during model definition") raise NotImplementedError("The action to be taken by the agent (.act()) is not implemented") - + def set_mode(self, mode: str) -> None: """Set the model mode (training or evaluation) - :param mode: Mode: ``"train"`` for training or ``"eval"`` for evaluation. + :param mode: Mode: ``"train"`` for training or ``"eval"`` for evaluation. See `torch.nn.Module.train `_ :type mode: str @@ -339,7 +433,7 @@ def set_mode(self, mode: str) -> None: def save(self, path: str, state_dict: Optional[dict] = None) -> None: """Save the model to the specified path - + :param path: Path to save the model to :type path: str :param state_dict: State dictionary to save (default: ``None``). @@ -390,7 +484,7 @@ def migrate(self, The final storage device is determined by the constructor of the model Only one of ``state_dict`` or ``path`` can be specified. - The ``path`` parameter allows automatic loading the ``state_dict`` only from files generated + The ``path`` parameter allows automatic loading the ``state_dict`` only from files generated by the *rl_games* and *stable-baselines3* libraries at the moment For ambiguous models (where 2 or more parameters, for source or current model, have equal shape) @@ -580,13 +674,13 @@ def migrate(self, self.eval() return status - + def freeze_parameters(self, freeze: bool = True) -> None: """Freeze or unfreeze internal parameters - Freeze: disable gradient computation (``parameters.requires_grad = False``) - - Unfreeze: enable gradient computation (``parameters.requires_grad = True``) - + - Unfreeze: enable gradient computation (``parameters.requires_grad = True``) + :param freeze: Freeze the internal parameters if True, otherwise unfreeze them (default: ``True``) :type freeze: bool, optional diff --git a/skrl/models/torch/categorical.py b/skrl/models/torch/categorical.py index 19f94f10..a37fe4a1 100644 --- a/skrl/models/torch/categorical.py +++ b/skrl/models/torch/categorical.py @@ -1,4 +1,4 @@ -from typing import Optional, Sequence +from typing import Union, Mapping, Tuple, Any import torch from torch.distributions import Categorical @@ -9,8 +9,8 @@ def __init__(self, unnormalized_log_prob: bool = True, role: str = "") -> None: """Categorical mixin model (stochastic model) :param unnormalized_log_prob: Flag to indicate how to be interpreted the model's output (default: ``True``). - If True, the model's output is interpreted as unnormalized log probabilities - (it can be any real number), otherwise as normalized probabilities + If True, the model's output is interpreted as unnormalized log probabilities + (it can be any real number), otherwise as normalized probabilities (the output must be non-negative, finite and have a non-zero sum) :type unnormalized_log_prob: bool, optional :param role: Role play by the model (default: ``""``) @@ -22,7 +22,7 @@ def __init__(self, unnormalized_log_prob: bool = True, role: str = "") -> None: >>> import torch >>> import torch.nn as nn >>> from skrl.models.torch import Model, CategoricalMixin - >>> + >>> >>> class Policy(CategoricalMixin, Model): ... def __init__(self, observation_space, action_space, device="cuda:0", unnormalized_log_prob=True): ... 
Model.__init__(self, observation_space, action_space, device) @@ -34,13 +34,13 @@ def __init__(self, unnormalized_log_prob: bool = True, role: str = "") -> None: ... nn.ELU(), ... nn.Linear(32, self.num_actions)) ... - ... def compute(self, states, taken_actions, role): - ... return self.net(states) + ... def compute(self, inputs, role): + ... return self.net(inputs["states"]), {} ... >>> # given an observation_space: gym.spaces.Box with shape (4,) >>> # and an action_space: gym.spaces.Discrete with n = 2 >>> model = Policy(observation_space, action_space) - >>> + >>> >>> print(model) Policy( (net): Sequential( @@ -60,47 +60,48 @@ def __init__(self, unnormalized_log_prob: bool = True, role: str = "") -> None: self._c_distribution = {} self._c_distribution[role] = None - def act(self, - states: torch.Tensor, - taken_actions: Optional[torch.Tensor] = None, - role: str = "") -> Sequence[torch.Tensor]: + def act(self, + inputs: Mapping[str, Union[torch.Tensor, Any]], + role: str = "") -> Tuple[torch.Tensor, Union[torch.Tensor, None], Mapping[str, Union[torch.Tensor, Any]]]: """Act stochastically in response to the state of the environment - :param states: Observation/state of the environment used to make the decision - :type states: torch.Tensor - :param taken_actions: Actions taken by a policy to the given states (default: ``None``). - The use of these actions only makes sense in critical models, e.g. - :type taken_actions: torch.Tensor, optional + :param inputs: Model inputs. The most common keys are: + + - ``"states"``: state of the environment used to make the decision + - ``"taken_actions"``: actions taken by the policy for the given states + :type inputs: dict where the values are typically torch.Tensor :param role: Role play by the model (default: ``""``) :type role: str, optional - :return: Action to be taken by the agent given the state of the environment. - The sequence's components are the actions, the log of the probability density function and the model's output - :rtype: sequence of torch.Tensor + :return: Model output. The first component is the action to be taken by the agent. + The second component is the log of the probability density function. 
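For the categorical mixin updated here, a minimal sketch (assuming the skrl >= 0.9.0 API; spaces and shapes are illustrative) that samples discrete actions and then re-evaluates the log-probability of previously taken actions through the ``"taken_actions"`` key::

    import gym
    import torch
    import torch.nn as nn

    from skrl.models.torch import Model, CategoricalMixin

    class Policy(CategoricalMixin, Model):
        def __init__(self, observation_space, action_space, device="cpu", unnormalized_log_prob=True):
            Model.__init__(self, observation_space, action_space, device)
            CategoricalMixin.__init__(self, unnormalized_log_prob)
            self.net = nn.Sequential(nn.Linear(self.num_observations, 32),
                                     nn.ELU(),
                                     nn.Linear(32, self.num_actions))

        def compute(self, inputs, role):
            return self.net(inputs["states"]), {}

    policy = Policy(gym.spaces.Box(low=-1, high=1, shape=(4,)), gym.spaces.Discrete(2))

    states = torch.rand(16, 4)
    actions, log_prob, outputs = policy.act({"states": states})
    # re-evaluate the log-probability of the sampled actions (e.g. during a policy update)
    _, old_log_prob, _ = policy.act({"states": states, "taken_actions": actions})
    print(actions.shape, log_prob.shape, outputs["net_output"].shape)
    # torch.Size([16, 1]) torch.Size([16, 1]) torch.Size([16, 2])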
+ The third component is a dictionary containing the network output ``"net_output"`` + and extra output values + :rtype: tuple of torch.Tensor, torch.Tensor or None, and dictionary Example:: >>> # given a batch of sample states with shape (4096, 4) - >>> action, log_prob, net_output = model.act(states) - >>> print(action.shape, log_prob.shape, net_output.shape) + >>> actions, log_prob, outputs = model.act({"states": states}) + >>> print(actions.shape, log_prob.shape, outputs["net_output"].shape) torch.Size([4096, 1]) torch.Size([4096, 1]) torch.Size([4096, 2]) """ # map from states/observations to normalized probabilities or unnormalized log probabilities - output = self.compute(states.to(self.device), - taken_actions.to(self.device) if taken_actions is not None else taken_actions, role) + net_output, outputs = self.compute(inputs, role) # unnormalized log probabilities if self._c_unnormalized_log_prob[role] if role in self._c_unnormalized_log_prob else self._c_unnormalized_log_prob[""]: - self._c_distribution[role] = Categorical(logits=output) + self._c_distribution[role] = Categorical(logits=net_output) # normalized probabilities else: - self._c_distribution[role] = Categorical(probs=output) - + self._c_distribution[role] = Categorical(probs=net_output) + # actions and log of the probability density function actions = self._c_distribution[role].sample() - log_prob = self._c_distribution[role].log_prob(actions if taken_actions is None else taken_actions.view(-1)) + log_prob = self._c_distribution[role].log_prob(inputs.get("taken_actions", actions).view(-1)) - return actions.unsqueeze(-1), log_prob.unsqueeze(-1), output + outputs["net_output"] = net_output + return actions.unsqueeze(-1), log_prob.unsqueeze(-1), outputs def distribution(self, role: str = "") -> torch.distributions.Categorical: """Get the current distribution of the model @@ -116,4 +117,4 @@ def distribution(self, role: str = "") -> torch.distributions.Categorical: >>> print(distribution) Categorical(probs: torch.Size([4096, 2]), logits: torch.Size([4096, 2])) """ - return self._c_distribution if role in self._c_distribution else self._c_distribution[""] + return self._c_distribution[role] if role in self._c_distribution else self._c_distribution[""] diff --git a/skrl/models/torch/deterministic.py b/skrl/models/torch/deterministic.py index a90cfaff..7e686d1a 100644 --- a/skrl/models/torch/deterministic.py +++ b/skrl/models/torch/deterministic.py @@ -1,6 +1,7 @@ -from typing import Optional, Sequence +from typing import Union, Mapping, Tuple, Any import gym +import gymnasium import torch @@ -20,7 +21,7 @@ def __init__(self, clip_actions: bool = False, role: str = "") -> None: >>> import torch >>> import torch.nn as nn >>> from skrl.models.torch import Model, DeterministicMixin - >>> + >>> >>> class Value(DeterministicMixin, Model): ... def __init__(self, observation_space, action_space, device="cuda:0", clip_actions=False): ... Model.__init__(self, observation_space, action_space, device) @@ -32,13 +33,13 @@ def __init__(self, clip_actions: bool = False, role: str = "") -> None: ... nn.ELU(), ... nn.Linear(32, 1)) ... - ... def compute(self, states, taken_actions, role): - ... return self.net(states) + ... def compute(self, inputs, role): + ... return self.net(inputs["states"]), {} ... 
>>> # given an observation_space: gym.spaces.Box with shape (60,) >>> # and an action_space: gym.spaces.Box with shape (8,) >>> model = Value(observation_space, action_space) - >>> + >>> >>> print(model) Value( (net): Sequential( @@ -52,7 +53,8 @@ def __init__(self, clip_actions: bool = False, role: str = "") -> None: """ if not hasattr(self, "_d_clip_actions"): self._d_clip_actions = {} - self._d_clip_actions[role] = clip_actions and issubclass(type(self.action_space), gym.Space) + self._d_clip_actions[role] = clip_actions and (issubclass(type(self.action_space), gym.Space) or \ + issubclass(type(self.action_space), gymnasium.Space)) if self._d_clip_actions[role]: self.clip_actions_min = torch.tensor(self.action_space.low, device=self.device, dtype=torch.float32) @@ -60,42 +62,39 @@ def __init__(self, clip_actions: bool = False, role: str = "") -> None: # backward compatibility: torch < 1.9 clamp method does not support tensors self._backward_compatibility = tuple(map(int, (torch.__version__.split(".")[:2]))) < (1, 9) - - def act(self, - states: torch.Tensor, - taken_actions: Optional[torch.Tensor] = None, - role: str = "") -> Sequence[torch.Tensor]: + + def act(self, + inputs: Mapping[str, Union[torch.Tensor, Any]], + role: str = "") -> Tuple[torch.Tensor, Union[torch.Tensor, None], Mapping[str, Union[torch.Tensor, Any]]]: """Act deterministically in response to the state of the environment - :param states: Observation/state of the environment used to make the decision - :type states: torch.Tensor - :param taken_actions: Actions taken by a policy to the given states (default: ``None``). - The use of these actions only makes sense in critical models, e.g. - :type taken_actions: torch.Tensor, optional + :param inputs: Model inputs. The most common keys are: + + - ``"states"``: state of the environment used to make the decision + - ``"taken_actions"``: actions taken by the policy for the given states + :type inputs: dict where the values are typically torch.Tensor :param role: Role play by the model (default: ``""``) :type role: str, optional - :return: Action to be taken by the agent given the state of the environment. - The sequence's components are the computed actions and None for the last two components - :rtype: sequence of torch.Tensor + :return: Model output. The first component is the action to be taken by the agent. + The second component is ``None``. 
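Since action clipping now also recognizes Gymnasium spaces, a deterministic actor can be written directly against ``gymnasium``. An illustrative sketch, not part of the patch (the doubled network output only exists to show the clipping)::

    import gymnasium
    import torch
    import torch.nn as nn

    from skrl.models.torch import Model, DeterministicMixin

    class Actor(DeterministicMixin, Model):
        def __init__(self, observation_space, action_space, device="cpu", clip_actions=True):
            Model.__init__(self, observation_space, action_space, device)
            DeterministicMixin.__init__(self, clip_actions)
            self.net = nn.Sequential(nn.Linear(self.num_observations, 32),
                                     nn.ELU(),
                                     nn.Linear(32, self.num_actions))

        def compute(self, inputs, role):
            return 2 * self.net(inputs["states"]), {}  # may exceed the action bounds

    actor = Actor(gymnasium.spaces.Box(low=-1, high=1, shape=(3,)),
                  gymnasium.spaces.Box(low=-1, high=1, shape=(1,)))

    actions, _, _ = actor.act({"states": torch.rand(5, 3)})
    print(actions.min() >= -1, actions.max() <= 1)  # clipped to the Box bounds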
The third component is a dictionary containing extra output values + :rtype: tuple of torch.Tensor, torch.Tensor or None, and dictionary Example:: >>> # given a batch of sample states with shape (4096, 60) - >>> output = model.act(states) - >>> print(output[0].shape, output[1], output[2]) - torch.Size([4096, 1]) None None + >>> actions, _, outputs = model.act({"states": states}) + >>> print(actions.shape, outputs) + torch.Size([4096, 1]) {} """ # map from observations/states to actions - actions = self.compute(states.to(self.device), - taken_actions.to(self.device) if taken_actions is not None else taken_actions, role) + actions, outputs = self.compute(inputs, role) - # clip actions + # clip actions if self._d_clip_actions[role] if role in self._d_clip_actions else self._d_clip_actions[""]: if self._backward_compatibility: actions = torch.max(torch.min(actions, self.clip_actions_max), self.clip_actions_min) else: actions = torch.clamp(actions, min=self.clip_actions_min, max=self.clip_actions_max) - return actions, None, None - \ No newline at end of file + return actions, None, outputs diff --git a/skrl/models/torch/gaussian.py b/skrl/models/torch/gaussian.py index 53fa04b9..f903cc0d 100644 --- a/skrl/models/torch/gaussian.py +++ b/skrl/models/torch/gaussian.py @@ -1,16 +1,17 @@ -from typing import Optional, Sequence +from typing import Union, Mapping, Tuple, Any import gym +import gymnasium import torch from torch.distributions import Normal class GaussianMixin: - def __init__(self, - clip_actions: bool = False, - clip_log_std: bool = True, - min_log_std: float = -20, + def __init__(self, + clip_actions: bool = False, + clip_log_std: bool = True, + min_log_std: float = -20, max_log_std: float = 2, reduction: str = "sum", role: str = "") -> None: @@ -25,7 +26,7 @@ def __init__(self, :param max_log_std: Maximum value of the log standard deviation if ``clip_log_std`` is True (default: ``2``) :type max_log_std: float, optional :param reduction: Reduction method for returning the log probability density function: (default: ``"sum"``). - Supported values are ``"mean"``, ``"sum"``, ``"prod"`` and ``"none"``. If "``none"``, the log probability density + Supported values are ``"mean"``, ``"sum"``, ``"prod"`` and ``"none"``. If "``none"``, the log probability density function is returned as a tensor of shape ``(num_samples, num_actions)`` instead of ``(num_samples, 1)`` :type reduction: str, optional :param role: Role play by the model (default: ``""``) @@ -39,9 +40,9 @@ def __init__(self, >>> import torch >>> import torch.nn as nn >>> from skrl.models.torch import Model, GaussianMixin - >>> + >>> >>> class Policy(GaussianMixin, Model): - ... def __init__(self, observation_space, action_space, device="cuda:0", + ... def __init__(self, observation_space, action_space, device="cuda:0", ... clip_actions=False, clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"): ... Model.__init__(self, observation_space, action_space, device) ... GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction) @@ -53,13 +54,13 @@ def __init__(self, ... nn.Linear(32, self.num_actions)) ... self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) ... - ... def compute(self, states, taken_actions, role): - ... return self.net(states), self.log_std_parameter + ... def compute(self, inputs, role): + ... return self.net(inputs["states"]), self.log_std_parameter, {} ... 
>>> # given an observation_space: gym.spaces.Box with shape (60,) >>> # and an action_space: gym.spaces.Box with shape (8,) >>> model = Policy(observation_space, action_space) - >>> + >>> >>> print(model) Policy( (net): Sequential( @@ -73,12 +74,13 @@ def __init__(self, """ if not hasattr(self, "_g_clip_actions"): self._g_clip_actions = {} - self._g_clip_actions[role] = clip_actions and issubclass(type(self.action_space), gym.Space) + self._g_clip_actions[role] = clip_actions and (issubclass(type(self.action_space), gym.Space) or \ + issubclass(type(self.action_space), gymnasium.Space)) if self._g_clip_actions[role]: self.clip_actions_min = torch.tensor(self.action_space.low, device=self.device, dtype=torch.float32) self.clip_actions_max = torch.tensor(self.action_space.high, device=self.device, dtype=torch.float32) - + # backward compatibility: torch < 1.9 clamp method does not support tensors self._backward_compatibility = tuple(map(int, (torch.__version__.split(".")[:2]))) < (1, 9) @@ -101,7 +103,7 @@ def __init__(self, if not hasattr(self, "_g_distribution"): self._g_distribution = {} self._g_distribution[role] = None - + if reduction not in ["mean", "sum", "prod", "none"]: raise ValueError("reduction must be one of 'mean', 'sum', 'prod' or 'none'") if not hasattr(self, "_g_reduction"): @@ -109,46 +111,46 @@ def __init__(self, self._g_reduction[role] = torch.mean if reduction == "mean" else torch.sum if reduction == "sum" \ else torch.prod if reduction == "prod" else None - def act(self, - states: torch.Tensor, - taken_actions: Optional[torch.Tensor] = None, - role: str = "") -> Sequence[torch.Tensor]: + def act(self, + inputs: Mapping[str, Union[torch.Tensor, Any]], + role: str = "") -> Tuple[torch.Tensor, Union[torch.Tensor, None], Mapping[str, Union[torch.Tensor, Any]]]: """Act stochastically in response to the state of the environment - :param states: Observation/state of the environment used to make the decision - :type states: torch.Tensor - :param taken_actions: Actions taken by a policy to the given states (default: ``None``). - The use of these actions only makes sense in critical models, e.g. - :type taken_actions: torch.Tensor, optional + :param inputs: Model inputs. The most common keys are: + + - ``"states"``: state of the environment used to make the decision + - ``"taken_actions"``: actions taken by the policy for the given states + :type inputs: dict where the values are typically torch.Tensor :param role: Role play by the model (default: ``""``) :type role: str, optional - - :return: Action to be taken by the agent given the state of the environment. - The sequence's components are the actions, the log of the probability density function and mean actions - :rtype: sequence of torch.Tensor + + :return: Model output. The first component is the action to be taken by the agent. + The second component is the log of the probability density function. 
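A matching sketch for the Gaussian mixin (illustrative shapes, skrl >= 0.9.0 assumed), showing the mean actions that are now returned through the outputs dictionary::

    import gym
    import torch
    import torch.nn as nn

    from skrl.models.torch import Model, GaussianMixin

    class Policy(GaussianMixin, Model):
        def __init__(self, observation_space, action_space, device="cpu"):
            Model.__init__(self, observation_space, action_space, device)
            GaussianMixin.__init__(self, clip_actions=False, clip_log_std=True,
                                   min_log_std=-20, max_log_std=2, reduction="sum")
            self.net = nn.Sequential(nn.Linear(self.num_observations, 32),
                                     nn.ELU(),
                                     nn.Linear(32, self.num_actions))
            self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions))

        def compute(self, inputs, role):
            return self.net(inputs["states"]), self.log_std_parameter, {}

    policy = Policy(gym.spaces.Box(low=-1, high=1, shape=(8,)),
                    gym.spaces.Box(low=-1, high=1, shape=(2,)))

    actions, log_prob, outputs = policy.act({"states": torch.rand(32, 8)})
    print(actions.shape, log_prob.shape, outputs["mean_actions"].shape)
    # torch.Size([32, 2]) torch.Size([32, 1]) torch.Size([32, 2])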
+ The third component is a dictionary containing the mean actions ``"mean_actions"`` + and extra output values + :rtype: tuple of torch.Tensor, torch.Tensor or None, and dictionary Example:: >>> # given a batch of sample states with shape (4096, 60) - >>> action, log_prob, mean_action = model.act(states) - >>> print(action.shape, log_prob.shape, mean_action.shape) + >>> actions, log_prob, outputs = model.act({"states": states}) + >>> print(actions.shape, log_prob.shape, outputs["mean_actions"].shape) torch.Size([4096, 8]) torch.Size([4096, 1]) torch.Size([4096, 8]) """ # map from states/observations to mean actions and log standard deviations - actions_mean, log_std = self.compute(states.to(self.device), - taken_actions.to(self.device) if taken_actions is not None else taken_actions, role) + mean_actions, log_std, outputs = self.compute(inputs, role) # clamp log standard deviations if self._g_clip_log_std[role] if role in self._g_clip_log_std else self._g_clip_log_std[""]: - log_std = torch.clamp(log_std, + log_std = torch.clamp(log_std, self._g_log_std_min[role] if role in self._g_log_std_min else self._g_log_std_min[""], self._g_log_std_max[role] if role in self._g_log_std_max else self._g_log_std_max[""]) self._g_log_std[role] = log_std - self._g_num_samples[role] = actions_mean.shape[0] + self._g_num_samples[role] = mean_actions.shape[0] # distribution - self._g_distribution[role] = Normal(actions_mean, log_std.exp()) + self._g_distribution[role] = Normal(mean_actions, log_std.exp()) # sample using the reparameterization trick actions = self._g_distribution[role].rsample() @@ -159,16 +161,17 @@ def act(self, actions = torch.max(torch.min(actions, self.clip_actions_max), self.clip_actions_min) else: actions = torch.clamp(actions, min=self.clip_actions_min, max=self.clip_actions_max) - + # log of the probability density function - log_prob = self._g_distribution[role].log_prob(actions if taken_actions is None else taken_actions) + log_prob = self._g_distribution[role].log_prob(inputs.get("taken_actions", actions)) reduction = self._g_reduction[role] if role in self._g_reduction else self._g_reduction[""] if reduction is not None: log_prob = reduction(log_prob, dim=-1) if log_prob.dim() != actions.dim(): log_prob = log_prob.unsqueeze(-1) - return actions, log_prob, actions_mean + outputs["mean_actions"] = mean_actions + return actions, log_prob, outputs def get_entropy(self, role: str = "") -> torch.Tensor: """Compute and return the entropy of the model @@ -205,7 +208,7 @@ def get_log_std(self, role: str = "") -> torch.Tensor: """ return (self._g_log_std[role] if role in self._g_log_std else self._g_log_std[""]) \ .repeat(self._g_num_samples[role] if role in self._g_num_samples else self._g_num_samples[""], 1) - + def distribution(self, role: str = "") -> torch.distributions.Normal: """Get the current distribution of the model diff --git a/skrl/models/torch/multivariate_gaussian.py b/skrl/models/torch/multivariate_gaussian.py index 4ca63e26..5ddb23e7 100644 --- a/skrl/models/torch/multivariate_gaussian.py +++ b/skrl/models/torch/multivariate_gaussian.py @@ -1,16 +1,17 @@ -from typing import Optional, Sequence +from typing import Union, Mapping, Tuple, Any import gym +import gymnasium import torch from torch.distributions import MultivariateNormal class MultivariateGaussianMixin: - def __init__(self, - clip_actions: bool = False, - clip_log_std: bool = True, - min_log_std: float = -20, + def __init__(self, + clip_actions: bool = False, + clip_log_std: bool = True, + min_log_std: float = 
-20, max_log_std: float = 2, role: str = "") -> None: """Multivariate Gaussian mixin model (stochastic model) @@ -32,7 +33,7 @@ def __init__(self, >>> import torch >>> import torch.nn as nn >>> from skrl.models.torch import Model, MultivariateGaussianMixin - >>> + >>> >>> class Policy(MultivariateGaussianMixin, Model): ... def __init__(self, observation_space, action_space, device="cuda:0", ... clip_actions=False, clip_log_std=True, min_log_std=-20, max_log_std=2): @@ -46,13 +47,13 @@ def __init__(self, ... nn.Linear(32, self.num_actions)) ... self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions)) ... - ... def compute(self, states, taken_actions, role): - ... return self.net(states), self.log_std_parameter + ... def compute(self, inputs, role): + ... return self.net(inputs["states"]), self.log_std_parameter, {} ... >>> # given an observation_space: gym.spaces.Box with shape (60,) >>> # and an action_space: gym.spaces.Box with shape (8,) >>> model = Policy(observation_space, action_space) - >>> + >>> >>> print(model) Policy( (net): Sequential( @@ -66,12 +67,13 @@ def __init__(self, """ if not hasattr(self, "_mg_clip_actions"): self._mg_clip_actions = {} - self._mg_clip_actions[role] = clip_actions and issubclass(type(self.action_space), gym.Space) + self._mg_clip_actions[role] = clip_actions and (issubclass(type(self.action_space), gym.Space) or \ + issubclass(type(self.action_space), gymnasium.Space)) if self._mg_clip_actions[role]: self.clip_actions_min = torch.tensor(self.action_space.low, device=self.device, dtype=torch.float32) self.clip_actions_max = torch.tensor(self.action_space.high, device=self.device, dtype=torch.float32) - + # backward compatibility: torch < 1.9 clamp method does not support tensors self._backward_compatibility = tuple(map(int, (torch.__version__.split(".")[:2]))) < (1, 9) @@ -94,48 +96,48 @@ def __init__(self, if not hasattr(self, "_mg_distribution"): self._mg_distribution = {} self._mg_distribution[role] = None - - def act(self, - states: torch.Tensor, - taken_actions: Optional[torch.Tensor] = None, - role: str = "") -> Sequence[torch.Tensor]: + + def act(self, + inputs: Mapping[str, Union[torch.Tensor, Any]], + role: str = "") -> Tuple[torch.Tensor, Union[torch.Tensor, None], Mapping[str, Union[torch.Tensor, Any]]]: """Act stochastically in response to the state of the environment - :param states: Observation/state of the environment used to make the decision - :type states: torch.Tensor - :param taken_actions: Actions taken by a policy to the given states (default: ``None``). - The use of these actions only makes sense in critical models, e.g. - :type taken_actions: torch.Tensor, optional + :param inputs: Model inputs. The most common keys are: + + - ``"states"``: state of the environment used to make the decision + - ``"taken_actions"``: actions taken by the policy for the given states + :type inputs: dict where the values are typically torch.Tensor :param role: Role play by the model (default: ``""``) :type role: str, optional - - :return: Action to be taken by the agent given the state of the environment. - The sequence's components are the actions, the log of the probability density function and mean actions - :rtype: sequence of torch.Tensor + + :return: Model output. The first component is the action to be taken by the agent. + The second component is the log of the probability density function. 
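And for the multivariate Gaussian mixin, the same pattern plus the ``get_entropy()`` helper, which stays usable after an ``act()`` call (again an illustrative sketch with arbitrary shapes)::

    import gym
    import torch
    import torch.nn as nn

    from skrl.models.torch import Model, MultivariateGaussianMixin

    class Policy(MultivariateGaussianMixin, Model):
        def __init__(self, observation_space, action_space, device="cpu"):
            Model.__init__(self, observation_space, action_space, device)
            MultivariateGaussianMixin.__init__(self, clip_actions=False)
            self.net = nn.Sequential(nn.Linear(self.num_observations, 32),
                                     nn.ELU(),
                                     nn.Linear(32, self.num_actions))
            self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions))

        def compute(self, inputs, role):
            return self.net(inputs["states"]), self.log_std_parameter, {}

    policy = Policy(gym.spaces.Box(low=-1, high=1, shape=(8,)),
                    gym.spaces.Box(low=-1, high=1, shape=(2,)))

    actions, log_prob, outputs = policy.act({"states": torch.rand(32, 8)})
    print(actions.shape, log_prob.shape)  # torch.Size([32, 2]) torch.Size([32, 1])
    print(policy.get_entropy().shape)     # torch.Size([32])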
+ The third component is a dictionary containing the mean actions ``"mean_actions"`` + and extra output values + :rtype: tuple of torch.Tensor, torch.Tensor or None, and dictionary Example:: >>> # given a batch of sample states with shape (4096, 60) - >>> action, log_prob, mean_action = model.act(states) - >>> print(action.shape, log_prob.shape, mean_action.shape) + >>> actions, log_prob, outputs = model.act({"states": states}) + >>> print(actions.shape, log_prob.shape, outputs["mean_actions"].shape) torch.Size([4096, 8]) torch.Size([4096, 1]) torch.Size([4096, 8]) """ # map from states/observations to mean actions and log standard deviations - actions_mean, log_std = self.compute(states.to(self.device), - taken_actions.to(self.device) if taken_actions is not None else taken_actions, role) + mean_actions, log_std, outputs = self.compute(inputs, role) # clamp log standard deviations if self._mg_clip_log_std[role] if role in self._mg_clip_log_std else self._mg_clip_log_std[""]: - log_std = torch.clamp(log_std, + log_std = torch.clamp(log_std, self._mg_log_std_min[role] if role in self._mg_log_std_min else self._mg_log_std_min[""], self._mg_log_std_max[role] if role in self._mg_log_std_max else self._mg_log_std_max[""]) self._mg_log_std[role] = log_std - self._mg_num_samples[role] = actions_mean.shape[0] + self._mg_num_samples[role] = mean_actions.shape[0] # distribution covariance = torch.diag(log_std.exp() * log_std.exp()) - self._mg_distribution[role] = MultivariateNormal(actions_mean, scale_tril=covariance) + self._mg_distribution[role] = MultivariateNormal(mean_actions, scale_tril=covariance) # sample using the reparameterization trick actions = self._mg_distribution[role].rsample() @@ -146,13 +148,14 @@ def act(self, actions = torch.max(torch.min(actions, self.clip_actions_max), self.clip_actions_min) else: actions = torch.clamp(actions, min=self.clip_actions_min, max=self.clip_actions_max) - + # log of the probability density function - log_prob = self._mg_distribution[role].log_prob(actions if taken_actions is None else taken_actions) + log_prob = self._mg_distribution[role].log_prob(inputs.get("taken_actions", actions)) if log_prob.dim() != actions.dim(): log_prob = log_prob.unsqueeze(-1) - return actions, log_prob, actions_mean + outputs["mean_actions"] = mean_actions + return actions, log_prob, outputs def get_entropy(self, role: str = "") -> torch.Tensor: """Compute and return the entropy of the model diff --git a/skrl/models/torch/tabular.py b/skrl/models/torch/tabular.py index 8fe88279..4d1e994f 100644 --- a/skrl/models/torch/tabular.py +++ b/skrl/models/torch/tabular.py @@ -1,8 +1,8 @@ -from typing import Optional, Mapping, Sequence +from typing import Optional, Union, Mapping, Sequence, Tuple, Any import torch -from . import Model +from skrl.models.torch import Model class TabularMixin: @@ -19,23 +19,24 @@ def __init__(self, num_envs: int = 1, role: str = "") -> None: # define the model >>> import torch >>> from skrl.models.torch import Model, TabularMixin - >>> + >>> >>> class GreedyPolicy(TabularMixin, Model): ... def __init__(self, observation_space, action_space, device="cuda:0", num_envs=1): ... Model.__init__(self, observation_space, action_space, device) ... TabularMixin.__init__(self, num_envs) ... - ... self.table = torch.ones((num_envs, self.num_observations, self.num_actions), + ... self.table = torch.ones((num_envs, self.num_observations, self.num_actions), ... dtype=torch.float32, device=self.device) ... - ... def compute(self, states, taken_actions, role): - ... 
actions = torch.argmax(self.table[torch.arange(self.num_envs).view(-1, 1), states], + ... def compute(self, inputs, role): + ... actions = torch.argmax(self.table[torch.arange(self.num_envs).view(-1, 1), inputs["states"]], ... dim=-1, keepdim=True).view(-1,1) + ... return actions, {} ... >>> # given an observation_space: gym.spaces.Discrete with n=100 >>> # and an action_space: gym.spaces.Discrete with n=5 >>> model = GreedyPolicy(observation_space, action_space, num_envs=1) - >>> + >>> >>> print(model) GreedyPolicy( (table): Tensor(shape=[1, 100, 5]) @@ -69,35 +70,33 @@ def _get_tensor_names(self) -> Sequence[str]: tensors.append(attr) return sorted(tensors) - def act(self, - states: torch.Tensor, - taken_actions: Optional[torch.Tensor] = None, - role: str = "") -> Sequence[torch.Tensor]: + def act(self, + inputs: Mapping[str, Union[torch.Tensor, Any]], + role: str = "") -> Tuple[torch.Tensor, Union[torch.Tensor, None], Mapping[str, Union[torch.Tensor, Any]]]: """Act in response to the state of the environment - :param states: Observation/state of the environment used to make the decision - :type states: torch.Tensor - :param taken_actions: Actions taken by a policy to the given states (default: ``None``). - The use of these actions only makes sense in critical models, e.g. - :type taken_actions: torch.Tensor, optional + :param inputs: Model inputs. The most common keys are: + + - ``"states"``: state of the environment used to make the decision + - ``"taken_actions"``: actions taken by the policy for the given states + :type inputs: dict where the values are typically torch.Tensor :param role: Role play by the model (default: ``""``) :type role: str, optional - :return: Action to be taken by the agent given the state of the environment. - The sequence's components are the computed actions and None for the last two components - :rtype: sequence of torch.Tensor + :return: Model output. The first component is the action to be taken by the agent. + The second component is ``None``. 
The third component is a dictionary containing extra output values + :rtype: tuple of torch.Tensor, torch.Tensor or None, and dictionary Example:: >>> # given a batch of sample states with shape (1, 100) - >>> output = model.act(states) - >>> print(output[0], output[1], output[2]) - tensor([[3]], device='cuda:0') None None + >>> actions, _, outputs = model.act({"states": states}) + >>> print(actions[0], outputs) + tensor([[3]], device='cuda:0') {} """ - actions = self.compute(states.to(self.device), - taken_actions.to(self.device) if taken_actions is not None else taken_actions, role) - return actions, None, None - + actions, outputs = self.compute(inputs, role) + return actions, None, outputs + def table(self) -> torch.Tensor: """Return the Q-table @@ -143,12 +142,12 @@ def load_state_dict(self, state_dict: Mapping, strict: bool = True) -> None: :param state_dict: A dict containing parameters and persistent buffers :type state_dict: dict - :param strict: Whether to strictly enforce that the keys in state_dict match the keys + :param strict: Whether to strictly enforce that the keys in state_dict match the keys returned by this module's state_dict() function (default: ``True``) :type strict: bool, optional """ Model.load_state_dict(self, state_dict, strict=False) - + for name, tensor in state_dict.items(): if hasattr(self, name) and isinstance(getattr(self, name), torch.Tensor): _tensor = getattr(self, name) @@ -163,7 +162,7 @@ def load_state_dict(self, state_dict: Mapping, strict: bool = True) -> None: def save(self, path: str, state_dict: Optional[dict] = None) -> None: """Save the model to the specified path - + :param path: Path to save the model to :type path: str :param state_dict: State dictionary to save (default: ``None``). diff --git a/skrl/resources/noises/torch/__init__.py b/skrl/resources/noises/torch/__init__.py index 6fe4734f..b751123d 100644 --- a/skrl/resources/noises/torch/__init__.py +++ b/skrl/resources/noises/torch/__init__.py @@ -1,4 +1,4 @@ -from .base import Noise +from skrl.resources.noises.torch.base import Noise -from .gaussian import GaussianNoise -from .ornstein_uhlenbeck import OrnsteinUhlenbeckNoise +from skrl.resources.noises.torch.gaussian import GaussianNoise +from skrl.resources.noises.torch.ornstein_uhlenbeck import OrnsteinUhlenbeckNoise diff --git a/skrl/resources/noises/torch/base.py b/skrl/resources/noises/torch/base.py index dac95119..6ff37b3a 100644 --- a/skrl/resources/noises/torch/base.py +++ b/skrl/resources/noises/torch/base.py @@ -1,16 +1,29 @@ -from typing import Union, Tuple +from typing import Optional, Union, Tuple import torch class Noise(): - def __init__(self, device: Union[str, torch.device] = "cuda:0") -> None: + def __init__(self, device: Optional[Union[str, torch.device]] = None) -> None: """Base class representing a noise - :param device: Device on which a torch tensor is or will be allocated (default: "cuda:0") + :param device: Device on which a torch tensor is or will be allocated (default: ``None``). 
+ If None, the device will be either ``"cuda:0"`` if available or ``"cpu"`` :type device: str or torch.device, optional + + Custom noises should override the ``sample`` method:: + + import torch + from skrl.resources.noises.torch import Noise + + class CustomNoise(Noise): + def __init__(self, device=None): + super().__init__(device) + + def sample(self, size): + return torch.rand(size, device=self.device) """ - self.device = torch.device(device) + self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") if device is None else torch.device(device) def sample_like(self, tensor: torch.Tensor) -> torch.Tensor: """Sample a noise with the same size (shape) as the input tensor @@ -19,9 +32,17 @@ def sample_like(self, tensor: torch.Tensor) -> torch.Tensor: :param tensor: Input tensor used to determine output tensor size (shape) :type tensor: torch.Tensor - + :return: Sampled noise :rtype: torch.Tensor + + Example:: + + >>> x = torch.rand(3, 2, device="cuda:0") + >>> noise.sample_like(x) + tensor([[-0.0423, -0.1325], + [-0.0639, -0.0957], + [-0.1367, 0.1031]], device='cuda:0') """ return self.sample(tensor.size()) @@ -30,10 +51,10 @@ def sample(self, size: Union[Tuple[int], torch.Size]) -> torch.Tensor: :param size: Shape of the sampled tensor :type size: tuple or list of integers, or torch.Size - + :raises NotImplementedError: The method is not implemented by the inheriting classes :return: Sampled noise :rtype: torch.Tensor """ - raise NotImplementedError("The sampling method (.sample()) is not implemented") \ No newline at end of file + raise NotImplementedError("The sampling method (.sample()) is not implemented") diff --git a/skrl/resources/noises/torch/gaussian.py b/skrl/resources/noises/torch/gaussian.py index 443741f4..f3ddfa94 100644 --- a/skrl/resources/noises/torch/gaussian.py +++ b/skrl/resources/noises/torch/gaussian.py @@ -1,34 +1,52 @@ -from typing import Union, Tuple +from typing import Optional, Union, Tuple import torch from torch.distributions import Normal -from . import Noise +from skrl.resources.noises.torch import Noise class GaussianNoise(Noise): - def __init__(self, mean: float, std: float, device: Union[str, torch.device] = "cuda:0") -> None: + def __init__(self, mean: float, std: float, device: Optional[Union[str, torch.device]] = None) -> None: """Class representing a Gaussian noise :param mean: Mean of the normal distribution :type mean: float :param std: Standard deviation of the normal distribution :type std: float - :param device: Device on which a torch tensor is or will be allocated (default: "cuda:0") + :param device: Device on which a torch tensor is or will be allocated (default: ``None``). 
+ If None, the device will be either ``"cuda:0"`` if available or ``"cpu"`` :type device: str or torch.device, optional + + Example:: + + >>> noise = GaussianNoise(mean=0, std=1) """ super().__init__(device) self.distribution = Normal(loc=torch.tensor(mean, device=self.device, dtype=torch.float32), scale=torch.tensor(std, device=self.device, dtype=torch.float32)) - + def sample(self, size: Union[Tuple[int], torch.Size]) -> torch.Tensor: """Sample a Gaussian noise :param size: Shape of the sampled tensor :type size: tuple or list of integers, or torch.Size - + :return: Sampled noise :rtype: torch.Tensor + + Example:: + + >>> noise.sample((3, 2)) + tensor([[-0.4901, 1.3357], + [-1.2141, 0.3323], + [-0.0889, -1.1651]], device='cuda:0') + + >>> x = torch.rand(3, 2, device="cuda:0") + >>> noise.sample(x.shape) + tensor([[0.5398, 1.2009], + [0.0307, 1.3065], + [0.2082, 0.6116]], device='cuda:0') """ return self.distribution.sample(size) diff --git a/skrl/resources/noises/torch/ornstein_uhlenbeck.py b/skrl/resources/noises/torch/ornstein_uhlenbeck.py index eeb0973c..d133aa76 100644 --- a/skrl/resources/noises/torch/ornstein_uhlenbeck.py +++ b/skrl/resources/noises/torch/ornstein_uhlenbeck.py @@ -1,19 +1,19 @@ -from typing import Union, Tuple +from typing import Optional, Union, Tuple import torch from torch.distributions import Normal -from . import Noise +from skrl.resources.noises.torch import Noise class OrnsteinUhlenbeckNoise(Noise): - def __init__(self, - theta: float, - sigma: float, - base_scale: float, - mean: float = 0, - std: float = 1, - device: Union[str, torch.device] = "cuda:0") -> None: + def __init__(self, + theta: float, + sigma: float, + base_scale: float, + mean: float = 0, + std: float = 1, + device: Optional[Union[str, torch.device]] = None) -> None: """Class representing an Ornstein-Uhlenbeck noise :param theta: Factor to apply to current internal state @@ -22,12 +22,17 @@ def __init__(self, :type sigma: float :param base_scale: Factor to apply to returned noise :type base_scale: float - :param mean: Mean of the normal distribution (default: 0.0) + :param mean: Mean of the normal distribution (default: ``0.0``) :type mean: float, optional - :param std: Standard deviation of the normal distribution (default: 1.0) + :param std: Standard deviation of the normal distribution (default: ``1.0``) :type std: float, optional - :param device: Device on which a torch tensor is or will be allocated (default: "cuda:0") + :param device: Device on which a torch tensor is or will be allocated (default: ``None``). 
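To illustrate the noise classes and the new ``device=None`` default (resolved to ``"cuda:0"`` when available, otherwise ``"cpu"``), a short usage sketch; ``device="cpu"`` is passed explicitly only so the snippet runs anywhere::

    import torch

    from skrl.resources.noises.torch import GaussianNoise, OrnsteinUhlenbeckNoise

    gaussian_noise = GaussianNoise(mean=0.0, std=0.2, device="cpu")
    ou_noise = OrnsteinUhlenbeckNoise(theta=0.15, sigma=0.2, base_scale=1.0, device="cpu")

    actions = torch.zeros(4, 2)
    print((actions + gaussian_noise.sample_like(actions)).shape)  # torch.Size([4, 2])
    print(ou_noise.sample((4, 2)).shape)                          # torch.Size([4, 2])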
+ If None, the device will be either ``"cuda:0"`` if available or ``"cpu"`` :type device: str or torch.device, optional + + Example:: + + >>> noise = OrnsteinUhlenbeckNoise(theta=0.1, sigma=0.2, base_scale=0.5) """ super().__init__(device) @@ -38,18 +43,31 @@ def __init__(self, self.distribution = Normal(loc=torch.tensor(mean, device=self.device, dtype=torch.float32), scale=torch.tensor(std, device=self.device, dtype=torch.float32)) - + def sample(self, size: Union[Tuple[int], torch.Size]) -> torch.Tensor: """Sample an Ornstein-Uhlenbeck noise :param size: Shape of the sampled tensor :type size: tuple or list of integers, or torch.Size - + :return: Sampled noise :rtype: torch.Tensor + + Example:: + + >>> noise.sample((3, 2)) + tensor([[-0.0452, 0.0162], + [ 0.0649, -0.0708], + [-0.0211, 0.0066]], device='cuda:0') + + >>> x = torch.rand(3, 2, device="cuda:0") + >>> noise.sample(x.shape) + tensor([[-0.0540, 0.0461], + [ 0.1117, -0.1157], + [-0.0074, 0.0420]], device='cuda:0') """ if isinstance(self.state, torch.Tensor) and self.state.size() != torch.Size(size): self.state = 0 self.state += -self.state * self.theta + self.sigma * self.distribution.sample(size) - + return self.base_scale * self.state diff --git a/skrl/resources/preprocessors/torch/__init__.py b/skrl/resources/preprocessors/torch/__init__.py index e0714537..793d27cb 100644 --- a/skrl/resources/preprocessors/torch/__init__.py +++ b/skrl/resources/preprocessors/torch/__init__.py @@ -1 +1 @@ -from .running_standard_scaler import RunningStandardScaler +from skrl.resources.preprocessors.torch.running_standard_scaler import RunningStandardScaler diff --git a/skrl/resources/preprocessors/torch/running_standard_scaler.py b/skrl/resources/preprocessors/torch/running_standard_scaler.py index 12c8fe47..a03f4ed3 100644 --- a/skrl/resources/preprocessors/torch/running_standard_scaler.py +++ b/skrl/resources/preprocessors/torch/running_standard_scaler.py @@ -1,6 +1,7 @@ -from typing import Union, Tuple +from typing import Optional, Union, Tuple import gym +import gymnasium import numpy as np import torch @@ -8,47 +9,52 @@ class RunningStandardScaler(nn.Module): - def __init__(self, - size: Union[int, Tuple[int], gym.Space], - epsilon: float = 1e-8, + def __init__(self, + size: Union[int, Tuple[int], gym.Space, gymnasium.Space], + epsilon: float = 1e-8, clip_threshold: float = 5.0, - device: Union[str, torch.device] = "cuda:0") -> None: + device: Optional[Union[str, torch.device]] = None) -> None: """Standardize the input data by removing the mean and scaling by the standard deviation - The implementation is adapted from the rl_games library + The implementation is adapted from the rl_games library (https://github.com/Denys88/rl_games/blob/master/rl_games/algos_torch/running_mean_std.py) Example:: >>> running_standard_scaler = RunningStandardScaler(size=2) - >>> data = ... 
# tensor of shape (N, 2) + >>> data = torch.rand(3, 2) # tensor of shape (N, 2) >>> running_standard_scaler(data) + tensor([[0.1954, 0.3356], + [0.9719, 0.4163], + [0.8540, 0.1982]]) :param size: Size of the input space - :type size: int, tuple or list of integers, or gym.Space - :param epsilon: Small number to avoid division by zero (default: 1e-8) + :type size: int, tuple or list of integers, gym.Space, or gymnasium.Space + :param epsilon: Small number to avoid division by zero (default: ``1e-8``) :type epsilon: float - :param clip_threshold: Threshold to clip the data (default: 5.0) + :param clip_threshold: Threshold to clip the data (default: ``5.0``) :type clip_threshold: float - :param device: Device on which a torch tensor is or will be allocated (default: "cuda:0") + :param device: Device on which a torch tensor is or will be allocated (default: ``None``). + If None, the device will be either ``"cuda:0"`` if available or ``"cpu"`` :type device: str or torch.device, optional """ super().__init__() self.epsilon = epsilon self.clip_threshold = clip_threshold + self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") if device is None else torch.device(device) size = self._get_space_size(size) - - self.register_buffer("running_mean", torch.zeros(size, dtype = torch.float64, device=device)) - self.register_buffer("running_variance", torch.ones(size, dtype = torch.float64, device=device)) - self.register_buffer("current_count", torch.ones((), dtype = torch.float64, device=device)) - def _get_space_size(self, space: Union[int, Tuple[int], gym.Space]) -> int: + self.register_buffer("running_mean", torch.zeros(size, dtype = torch.float64, device=self.device)) + self.register_buffer("running_variance", torch.ones(size, dtype = torch.float64, device=self.device)) + self.register_buffer("current_count", torch.ones((), dtype = torch.float64, device=self.device)) + + def _get_space_size(self, space: Union[int, Tuple[int], gym.Space, gymnasium.Space]) -> int: """Get the size (number of elements) of a space :param space: Space or shape from which to obtain the number of elements - :type space: int, tuple or list of integers, or gym.Space + :type space: int, tuple or list of integers, gym.Space, or gymnasium.Space :raises ValueError: If the space is not supported @@ -66,10 +72,17 @@ def _get_space_size(self, space: Union[int, Tuple[int], gym.Space]) -> int: return np.prod(space.shape) elif issubclass(type(space), gym.spaces.Dict): return sum([self._get_space_size(space.spaces[key]) for key in space.spaces]) + elif issubclass(type(space), gymnasium.Space): + if issubclass(type(space), gymnasium.spaces.Discrete): + return 1 + elif issubclass(type(space), gymnasium.spaces.Box): + return np.prod(space.shape) + elif issubclass(type(space), gymnasium.spaces.Dict): + return sum([self._get_space_size(space.spaces[key]) for key in space.spaces]) raise ValueError("Space type {} not supported".format(type(space))) def _parallel_variance(self, input_mean: torch.Tensor, input_var: torch.Tensor, input_count: int) -> None: - """Update internal variables using the parallel algorithm for computing variance + """Update internal variables using the parallel algorithm for computing variance https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm @@ -95,9 +108,9 @@ def _compute(self, x: torch.Tensor, train: bool = False, inverse: bool = False) :param x: Input tensor :type x: torch.Tensor - :param train: Whether to train the standardizer (default: False) + :param train: 
Whether to train the standardizer (default: ``False``) :type train: bool, optional - :param inverse: Whether to inverse the standardizer to scale back the data (default: False) + :param inverse: Whether to inverse the standardizer to scale back the data (default: ``False``) :type inverse: bool, optional """ if train: @@ -112,7 +125,7 @@ def _compute(self, x: torch.Tensor, train: bool = False, inverse: bool = False) * torch.clamp(x, min=-self.clip_threshold, max=self.clip_threshold) + self.running_mean.float() # standardization by centering and scaling else: - return torch.clamp((x - self.running_mean.float()) / (torch.sqrt(self.running_variance.float()) + self.epsilon), + return torch.clamp((x - self.running_mean.float()) / (torch.sqrt(self.running_variance.float()) + self.epsilon), min=-self.clip_threshold, max=self.clip_threshold) def forward(self, x: torch.Tensor, train: bool = False, inverse: bool = False, no_grad: bool = True) -> torch.Tensor: @@ -125,7 +138,7 @@ def forward(self, x: torch.Tensor, train: bool = False, inverse: bool = False, n tensor([[0.6933, 0.1905], [0.3806, 0.3162], [0.1140, 0.0272]], device='cuda:0') - + >>> running_standard_scaler(x, train=True) tensor([[ 0.8681, -0.6731], [ 0.0560, -0.3684], @@ -138,15 +151,15 @@ def forward(self, x: torch.Tensor, train: bool = False, inverse: bool = False, n :param x: Input tensor :type x: torch.Tensor - :param train: Whether to train the standardizer (default: False) + :param train: Whether to train the standardizer (default: ``False``) :type train: bool, optional - :param inverse: Whether to inverse the standardizer to scale back the data (default: False) + :param inverse: Whether to inverse the standardizer to scale back the data (default: ``False``) :type inverse: bool, optional - :param no_grad: Whether to disable the gradient computation (default: True) + :param no_grad: Whether to disable the gradient computation (default: ``True``) :type no_grad: bool, optional """ if no_grad: with torch.no_grad(): return self._compute(x, train, inverse) else: - return self._compute(x, train, inverse) \ No newline at end of file + return self._compute(x, train, inverse) diff --git a/skrl/resources/schedulers/torch/__init__.py b/skrl/resources/schedulers/torch/__init__.py index bae19571..bf481d83 100644 --- a/skrl/resources/schedulers/torch/__init__.py +++ b/skrl/resources/schedulers/torch/__init__.py @@ -1 +1 @@ -from .kl_adaptive import KLAdaptiveRL +from skrl.resources.schedulers.torch.kl_adaptive import KLAdaptiveRL diff --git a/skrl/resources/schedulers/torch/kl_adaptive.py b/skrl/resources/schedulers/torch/kl_adaptive.py index 6f7035f2..4e942b09 100644 --- a/skrl/resources/schedulers/torch/kl_adaptive.py +++ b/skrl/resources/schedulers/torch/kl_adaptive.py @@ -1,28 +1,28 @@ -from typing import Union +from typing import Union, Optional import torch from torch.optim.lr_scheduler import _LRScheduler class KLAdaptiveRL(_LRScheduler): - def __init__(self, - optimizer: torch.optim.Optimizer, - kl_threshold: float = 0.008, - min_lr: float = 1e-6, + def __init__(self, + optimizer: torch.optim.Optimizer, + kl_threshold: float = 0.008, + min_lr: float = 1e-6, max_lr: float = 1e-2, kl_factor: float = 2, lr_factor: float = 1.5, - last_epoch: int = -1, + last_epoch: int = -1, verbose: bool = False) -> None: """Adaptive KL scheduler - + Adjusts the learning rate according to the KL divergence. 
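A compact sketch of driving this scheduler with a measured KL divergence; the linear layer and optimizer are placeholders standing in for a policy and its optimizer, and the threshold and learning-rate values are illustrative::

    import torch

    from skrl.resources.schedulers.torch import KLAdaptiveRL

    policy = torch.nn.Linear(4, 2)                              # placeholder parameters
    optimizer = torch.optim.Adam(policy.parameters(), lr=1e-3)
    scheduler = KLAdaptiveRL(optimizer, kl_threshold=0.008)

    scheduler.step(0.02)    # measured KL above kl_threshold * kl_factor: the lr is reduced
    print(optimizer.param_groups[0]["lr"])                      # ~6.7e-4 (1e-3 / lr_factor)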
- The implementation is adapted from the rl_games library + The implementation is adapted from the rl_games library (https://github.com/Denys88/rl_games/blob/master/rl_games/common/schedulers.py) .. note:: - This scheduler is only available for PPO at the moment. + This scheduler is only available for PPO at the moment. Applying it to other agents will not change the learning rate Example:: @@ -36,19 +36,19 @@ def __init__(self, :param optimizer: Wrapped optimizer :type optimizer: torch.optim.Optimizer - :param kl_threshold: Threshold for KL divergence (default: 0.008) + :param kl_threshold: Threshold for KL divergence (default: ``0.008``) :type kl_threshold: float, optional - :param min_lr: Lower bound for learning rate (default: 1e-6) + :param min_lr: Lower bound for learning rate (default: ``1e-6``) :type min_lr: float, optional - :param max_lr: Upper bound for learning rate (default: 1e-2) + :param max_lr: Upper bound for learning rate (default: ``1e-2``) :type max_lr: float, optional - :param kl_factor: The number used to modify the KL divergence threshold (default: 2) + :param kl_factor: The number used to modify the KL divergence threshold (default: ``2``) :type kl_factor: float, optional - :param lr_factor: The number used to modify the learning rate (default: 1.5) + :param lr_factor: The number used to modify the learning rate (default: ``1.5``) :type lr_factor: float, optional - :param last_epoch: The index of last epoch (default: -1) + :param last_epoch: The index of last epoch (default: ``-1``) :type last_epoch: int, optional - :param verbose: Verbose mode (default: False) + :param verbose: Verbose mode (default: ``False``) :type verbose: bool, optional """ super().__init__(optimizer, last_epoch, verbose) @@ -61,10 +61,10 @@ def __init__(self, self._last_lr = [group['lr'] for group in self.optimizer.param_groups] - def step(self, kl: Union[torch.Tensor, float, None] = None, epoch: Union[int, None] = None) -> None: + def step(self, kl: Optional[Union[torch.Tensor, float]] = None, epoch: Optional[int] = None) -> None: """ Step scheduler - + Example:: >>> kl = torch.distributions.kl_divergence(p, q) @@ -88,5 +88,5 @@ def step(self, kl: Union[torch.Tensor, float, None] = None, epoch: Union[int, No group['lr'] = max(group['lr'] / self._lr_factor, self.min_lr) elif kl < self.kl_threshold / self._kl_factor: group['lr'] = min(group['lr'] * self._lr_factor, self.max_lr) - + self._last_lr = [group['lr'] for group in self.optimizer.param_groups] diff --git a/skrl/trainers/torch/__init__.py b/skrl/trainers/torch/__init__.py index 1077b277..6723541f 100644 --- a/skrl/trainers/torch/__init__.py +++ b/skrl/trainers/torch/__init__.py @@ -1,6 +1,6 @@ -from .base import Trainer -from .base import generate_equally_spaced_scopes +from skrl.trainers.torch.base import Trainer +from skrl.trainers.torch.base import generate_equally_spaced_scopes -from .sequential import SequentialTrainer -from .parallel import ParallelTrainer -from .manual import ManualTrainer +from skrl.trainers.torch.sequential import SequentialTrainer +from skrl.trainers.torch.parallel import ParallelTrainer +from skrl.trainers.torch.manual import ManualTrainer diff --git a/skrl/trainers/torch/base.py b/skrl/trainers/torch/base.py index 7565b090..32944d3e 100644 --- a/skrl/trainers/torch/base.py +++ b/skrl/trainers/torch/base.py @@ -1,11 +1,11 @@ -from typing import Union, List +from typing import Union, List, Optional import tqdm import torch -from ...envs.torch import Wrapper -from ...agents.torch import Agent +from 
skrl.envs.torch import Wrapper +from skrl.agents.torch import Agent def generate_equally_spaced_scopes(num_envs: int, num_agents: int) -> List[int]: @@ -30,12 +30,12 @@ def generate_equally_spaced_scopes(num_envs: int, num_agents: int) -> List[int]: return scopes -class Trainer(): +class Trainer: def __init__(self, env: Wrapper, agents: Union[Agent, List[Agent]], - agents_scope : List[int] = [], - cfg: dict = {}) -> None: + agents_scope: Optional[List[int]] = None, + cfg: Optional[dict] = None) -> None: """Base class for trainers :param env: Environment to train on @@ -47,14 +47,15 @@ def __init__(self, :param cfg: Configuration dictionary (default: {}) :type cfg: dict, optional """ - self.cfg = cfg + self.cfg = cfg if cfg is not None else {} self.env = env self.agents = agents - self.agents_scope = agents_scope + self.agents_scope = agents_scope if agents_scope is not None else [] # get configuration - self.timesteps = self.cfg.get('timesteps', 0) + self.timesteps = self.cfg.get("timesteps", 0) self.headless = self.cfg.get("headless", False) + self.disable_progressbar = self.cfg.get("disable_progressbar", False) self.initial_timestep = 0 @@ -135,14 +136,6 @@ def eval(self) -> None: """ raise NotImplementedError - def start(self) -> None: - """Start training - - This method is deprecated in favour of the '.train()' method - """ - # TODO: remove this method in future versions - print("[WARNING] Trainer.start() method is deprecated in favour of the '.train()' method") - def single_agent_train(self) -> None: """Train a single agent @@ -159,19 +152,19 @@ def single_agent_train(self) -> None: assert self.num_agents == 1, "This method is only valid for a single agent" # reset env - states = self.env.reset() + states, infos = self.env.reset() - for timestep in tqdm.tqdm(range(self.initial_timestep, self.timesteps)): + for timestep in tqdm.tqdm(range(self.initial_timestep, self.timesteps), disable=self.disable_progressbar): # pre-interaction self.agents.pre_interaction(timestep=timestep, timesteps=self.timesteps) # compute actions with torch.no_grad(): - actions, _, _ = self.agents.act(states, timestep=timestep, timesteps=self.timesteps) + actions = self.agents.act(states, timestep=timestep, timesteps=self.timesteps)[0] # step the environments - next_states, rewards, dones, infos = self.env.step(actions) + next_states, rewards, terminated, truncated, infos = self.env.step(actions) # render scene if not self.headless: @@ -183,7 +176,8 @@ def single_agent_train(self) -> None: actions=actions, rewards=rewards, next_states=next_states, - dones=dones, + terminated=terminated, + truncated=truncated, infos=infos, timestep=timestep, timesteps=self.timesteps) @@ -193,8 +187,8 @@ def single_agent_train(self) -> None: # reset environments with torch.no_grad(): - if dones.any(): - states = self.env.reset() + if terminated.any() or truncated.any(): + states, infos = self.env.reset() else: states.copy_(next_states) @@ -214,16 +208,16 @@ def single_agent_eval(self) -> None: assert self.num_agents == 1, "This method is only valid for a single agent" # reset env - states = self.env.reset() + states, infos = self.env.reset() - for timestep in tqdm.tqdm(range(self.initial_timestep, self.timesteps)): + for timestep in tqdm.tqdm(range(self.initial_timestep, self.timesteps), disable=self.disable_progressbar): # compute actions with torch.no_grad(): - actions, _, _ = self.agents.act(states, timestep=timestep, timesteps=self.timesteps) + actions = self.agents.act(states, timestep=timestep, timesteps=self.timesteps)[0] # 
step the environments - next_states, rewards, dones, infos = self.env.step(actions) + next_states, rewards, terminated, truncated, infos = self.env.step(actions) # render scene if not self.headless: @@ -231,19 +225,20 @@ def single_agent_eval(self) -> None: with torch.no_grad(): # write data to TensorBoard - super(type(self.agents), self.agents).record_transition(states=states, - actions=actions, - rewards=rewards, - next_states=next_states, - dones=dones, - infos=infos, - timestep=timestep, - timesteps=self.timesteps) + self.agents.record_transition(states=states, + actions=actions, + rewards=rewards, + next_states=next_states, + terminated=terminated, + truncated=truncated, + infos=infos, + timestep=timestep, + timesteps=self.timesteps) super(type(self.agents), self.agents).post_interaction(timestep=timestep, timesteps=self.timesteps) # reset environments - if dones.any(): - states = self.env.reset() + if terminated.any() or truncated.any(): + states, infos = self.env.reset() else: states.copy_(next_states) diff --git a/skrl/trainers/torch/manual.py b/skrl/trainers/torch/manual.py index d82fb26e..8b3af490 100644 --- a/skrl/trainers/torch/manual.py +++ b/skrl/trainers/torch/manual.py @@ -5,15 +5,16 @@ import torch -from ...envs.torch import Wrapper -from ...agents.torch import Agent +from skrl.envs.torch import Wrapper +from skrl.agents.torch import Agent -from . import Trainer +from skrl.trainers.torch import Trainer MANUAL_TRAINER_DEFAULT_CONFIG = { - "timesteps": 100000, # number of timesteps to train for - "headless": False, # whether to use headless mode (no rendering) + "timesteps": 100000, # number of timesteps to train for + "headless": False, # whether to use headless mode (no rendering) + "disable_progressbar": False, # whether to disable the progressbar. 
If None, disable on non-TTY } @@ -21,8 +22,8 @@ class ManualTrainer(Trainer): def __init__(self, env: Wrapper, agents: Union[Agent, List[Agent]], - agents_scope : List[int] = [], - cfg: dict = {}) -> None: + agents_scope: Optional[List[int]] = None, + cfg: Optional[dict] = None) -> None: """Manual trainer Train agents by manually controlling the training/evaluation loop @@ -38,15 +39,16 @@ def __init__(self, :type cfg: dict, optional """ _cfg = copy.deepcopy(MANUAL_TRAINER_DEFAULT_CONFIG) - _cfg.update(cfg) + _cfg.update(cfg if cfg is not None else {}) + agents_scope = agents_scope if agents_scope is not None else [] super().__init__(env=env, agents=agents, agents_scope=agents_scope, cfg=_cfg) # init agents if self.num_agents > 1: for agent in self.agents: - agent.init() + agent.init(trainer_cfg=self.cfg) else: - self.agents.init() + self.agents.init(trainer_cfg=self.cfg) self._progress = None @@ -74,12 +76,19 @@ def train(self, timestep: int, timesteps: Optional[int] = None) -> None: timesteps = self.timesteps if timesteps is None else timesteps if self._progress is None: - self._progress = tqdm.tqdm(total=timesteps) + self._progress = tqdm.tqdm(total=timesteps, disable=self.disable_progressbar) self._progress.update(n=1) + # set running mode + if self.num_agents > 1: + for agent in self.agents: + agent.set_running_mode("train") + else: + self.agents.set_running_mode("train") + # reset env if self.states is None: - self.states = self.env.reset() + self.states, infos = self.env.reset() if self.num_agents == 1: # pre-interaction @@ -87,7 +96,7 @@ def train(self, timestep: int, timesteps: Optional[int] = None) -> None: # compute actions with torch.no_grad(): - actions, _, _ = self.agents.act(self.states, timestep=timestep, timesteps=timesteps) + actions = self.agents.act(self.states, timestep=timestep, timesteps=timesteps)[0] else: # pre-interaction @@ -100,7 +109,7 @@ def train(self, timestep: int, timesteps: Optional[int] = None) -> None: for agent, scope in zip(self.agents, self.agents_scope)]) # step the environments - next_states, rewards, dones, infos = self.env.step(actions) + next_states, rewards, terminated, truncated, infos = self.env.step(actions) # render scene if not self.headless: @@ -113,7 +122,8 @@ def train(self, timestep: int, timesteps: Optional[int] = None) -> None: actions=actions, rewards=rewards, next_states=next_states, - dones=dones, + terminated=terminated, + truncated=truncated, infos=infos, timestep=timestep, timesteps=timesteps) @@ -129,7 +139,8 @@ def train(self, timestep: int, timesteps: Optional[int] = None) -> None: actions=actions[scope[0]:scope[1]], rewards=rewards[scope[0]:scope[1]], next_states=next_states[scope[0]:scope[1]], - dones=dones[scope[0]:scope[1]], + terminated=terminated[scope[0]:scope[1]], + truncated=truncated[scope[0]:scope[1]], infos=infos, timestep=timestep, timesteps=timesteps) @@ -140,8 +151,8 @@ def train(self, timestep: int, timesteps: Optional[int] = None) -> None: # reset environments with torch.no_grad(): - if dones.any(): - self.states = self.env.reset() + if terminated.any() or truncated.any(): + self.states, infos = self.env.reset() else: self.states.copy_(next_states) @@ -165,17 +176,24 @@ def eval(self, timestep: int, timesteps: Optional[int] = None) -> None: timesteps = self.timesteps if timesteps is None else timesteps if self._progress is None: - self._progress = tqdm.tqdm(total=timesteps) + self._progress = tqdm.tqdm(total=timesteps, disable=self.disable_progressbar) self._progress.update(n=1) + # set running mode + if 
self.num_agents > 1: + for agent in self.agents: + agent.set_running_mode("eval") + else: + self.agents.set_running_mode("eval") + # reset env if self.states is None: - self.states = self.env.reset() - + self.states, infos = self.env.reset() + with torch.no_grad(): if self.num_agents == 1: # compute actions - actions, _, _ = self.agents.act(self.states, timestep=timestep, timesteps=timesteps) + actions = self.agents.act(self.states, timestep=timestep, timesteps=timesteps)[0] else: # compute actions @@ -183,7 +201,7 @@ def eval(self, timestep: int, timesteps: Optional[int] = None) -> None: for agent, scope in zip(self.agents, self.agents_scope)]) # step the environments - next_states, rewards, dones, infos = self.env.step(actions) + next_states, rewards, terminated, truncated, infos = self.env.step(actions) # render scene if not self.headless: @@ -192,31 +210,33 @@ def eval(self, timestep: int, timesteps: Optional[int] = None) -> None: with torch.no_grad(): if self.num_agents == 1: # write data to TensorBoard - super(type(self.agents), self.agents).record_transition(states=self.states, - actions=actions, - rewards=rewards, - next_states=next_states, - dones=dones, - infos=infos, - timestep=timestep, - timesteps=timesteps) + self.agents.record_transition(states=self.states, + actions=actions, + rewards=rewards, + next_states=next_states, + terminated=terminated, + truncated=truncated, + infos=infos, + timestep=timestep, + timesteps=timesteps) super(type(self.agents), self.agents).post_interaction(timestep=timestep, timesteps=timesteps) else: # write data to TensorBoard for agent, scope in zip(self.agents, self.agents_scope): - super(type(agent), agent).record_transition(states=self.states[scope[0]:scope[1]], - actions=actions[scope[0]:scope[1]], - rewards=rewards[scope[0]:scope[1]], - next_states=next_states[scope[0]:scope[1]], - dones=dones[scope[0]:scope[1]], - infos=infos, - timestep=timestep, - timesteps=timesteps) + agent.record_transition(states=self.states[scope[0]:scope[1]], + actions=actions[scope[0]:scope[1]], + rewards=rewards[scope[0]:scope[1]], + next_states=next_states[scope[0]:scope[1]], + terminated=terminated[scope[0]:scope[1]], + truncated=truncated[scope[0]:scope[1]], + infos=infos, + timestep=timestep, + timesteps=timesteps) super(type(agent), agent).post_interaction(timestep=timestep, timesteps=timesteps) # reset environments - if dones.any(): - self.states = self.env.reset() + if terminated.any() or truncated.any(): + self.states, infos = self.env.reset() else: self.states.copy_(next_states) diff --git a/skrl/trainers/torch/parallel.py b/skrl/trainers/torch/parallel.py index c02957a7..342e32d6 100644 --- a/skrl/trainers/torch/parallel.py +++ b/skrl/trainers/torch/parallel.py @@ -1,4 +1,4 @@ -from typing import Union, List +from typing import Union, List, Optional import copy import tqdm @@ -6,15 +6,16 @@ import torch import torch.multiprocessing as mp -from ...envs.torch import Wrapper -from ...agents.torch import Agent +from skrl.envs.torch import Wrapper +from skrl.agents.torch import Agent -from . import Trainer +from skrl.trainers.torch import Trainer PARALLEL_TRAINER_DEFAULT_CONFIG = { - "timesteps": 100000, # number of timesteps to train for - "headless": False, # whether to use headless mode (no rendering) + "timesteps": 100000, # number of timesteps to train for + "headless": False, # whether to use headless mode (no rendering) + "disable_progressbar": False, # whether to disable the progressbar. 
If None, disable on non-TTY } @@ -25,6 +26,7 @@ def fn_processor(process_index, *args): queue = args[1][process_index] barrier = args[2] scope = args[3][process_index] + trainer_cfg = args[4] agent = None _states = None @@ -44,7 +46,7 @@ def fn_processor(process_index, *args): # initialize agent elif task == 'init': agent = queue.get() - agent.init() + agent.init(trainer_cfg=trainer_cfg) print("[INFO] Processor {}: init agent {} with scope {}".format(process_index, type(agent).__name__, scope)) barrier.wait() @@ -70,7 +72,8 @@ def fn_processor(process_index, *args): actions=_actions, rewards=queue.get()[scope[0]:scope[1]], next_states=queue.get()[scope[0]:scope[1]], - dones=queue.get()[scope[0]:scope[1]], + terminated=queue.get()[scope[0]:scope[1]], + truncated=queue.get()[scope[0]:scope[1]], infos=queue.get(), timestep=msg['timestep'], timesteps=msg['timesteps']) @@ -84,14 +87,15 @@ def fn_processor(process_index, *args): # write data to TensorBoard (evaluation) elif task == "eval-record_transition-post_interaction": with torch.no_grad(): - super(type(agent), agent).record_transition(states=_states, - actions=_actions, - rewards=queue.get()[scope[0]:scope[1]], - next_states=queue.get()[scope[0]:scope[1]], - dones=queue.get()[scope[0]:scope[1]], - infos=queue.get(), - timestep=msg['timestep'], - timesteps=msg['timesteps']) + agent.record_transition(states=_states, + actions=_actions, + rewards=queue.get()[scope[0]:scope[1]], + next_states=queue.get()[scope[0]:scope[1]], + terminated=queue.get()[scope[0]:scope[1]], + truncated=queue.get()[scope[0]:scope[1]], + infos=queue.get(), + timestep=msg['timestep'], + timesteps=msg['timesteps']) super(type(agent), agent).post_interaction(timestep=msg['timestep'], timesteps=msg['timesteps']) barrier.wait() @@ -100,8 +104,8 @@ class ParallelTrainer(Trainer): def __init__(self, env: Wrapper, agents: Union[Agent, List[Agent]], - agents_scope : List[int] = [], - cfg: dict = {}) -> None: + agents_scope: Optional[List[int]] = None, + cfg: Optional[dict] = None) -> None: """Parallel trainer Train agents in parallel using multiple processes @@ -117,7 +121,8 @@ def __init__(self, :type cfg: dict, optional """ _cfg = copy.deepcopy(PARALLEL_TRAINER_DEFAULT_CONFIG) - _cfg.update(cfg) + _cfg.update(cfg if cfg is not None else {}) + agents_scope = agents_scope if agents_scope is not None else [] super().__init__(env=env, agents=agents, agents_scope=agents_scope, cfg=_cfg) mp.set_start_method(method='spawn', force=True) @@ -135,9 +140,16 @@ def train(self) -> None: - Post-interaction (in parallel) - Reset environments """ + # set running mode + if self.num_agents > 1: + for agent in self.agents: + agent.set_running_mode("train") + else: + self.agents.set_running_mode("train") + # single agent if self.num_agents == 1: - self.agents.init() + self.agents.init(trainer_cfg=self.cfg) self.single_agent_train() return @@ -167,7 +179,7 @@ def train(self) -> None: # spawn and wait for all processes to start for i in range(self.num_agents): process = mp.Process(target=fn_processor, - args=(i, consumer_pipes, queues, barrier, self.agents_scope), + args=(i, consumer_pipes, queues, barrier, self.agents_scope, self.cfg), daemon=True) processes.append(process) process.start() @@ -180,11 +192,11 @@ def train(self) -> None: barrier.wait() # reset env - states = self.env.reset() + states, infos = self.env.reset() if not states.is_cuda: states.share_memory_() - for timestep in tqdm.tqdm(range(self.initial_timestep, self.timesteps)): + for timestep in 
tqdm.tqdm(range(self.initial_timestep, self.timesteps), disable=self.disable_progressbar): # pre-interaction for pipe in producer_pipes: @@ -201,7 +213,7 @@ def train(self) -> None: actions = torch.vstack([queue.get() for queue in queues]) # step the environments - next_states, rewards, dones, infos = self.env.step(actions) + next_states, rewards, terminated, truncated, infos = self.env.step(actions) # render scene if not self.headless: @@ -213,14 +225,17 @@ def train(self) -> None: rewards.share_memory_() if not next_states.is_cuda: next_states.share_memory_() - if not dones.is_cuda: - dones.share_memory_() + if not terminated.is_cuda: + terminated.share_memory_() + if not truncated.is_cuda: + truncated.share_memory_() for pipe, queue in zip(producer_pipes, queues): pipe.send({"task": "record_transition", "timestep": timestep, "timesteps": self.timesteps}) queue.put(rewards) queue.put(next_states) - queue.put(dones) + queue.put(terminated) + queue.put(truncated) queue.put(infos) barrier.wait() @@ -231,8 +246,8 @@ def train(self) -> None: # reset environments with torch.no_grad(): - if dones.any(): - states = self.env.reset() + if terminated.any() or truncated.any(): + states, infos = self.env.reset() if not states.is_cuda: states.share_memory_() else: @@ -259,9 +274,16 @@ def eval(self) -> None: - Render scene - Reset environments """ + # set running mode + if self.num_agents > 1: + for agent in self.agents: + agent.set_running_mode("eval") + else: + self.agents.set_running_mode("eval") + # single agent if self.num_agents == 1: - self.agents.init() + self.agents.init(trainer_cfg=self.cfg) self.single_agent_eval() return @@ -292,7 +314,7 @@ def eval(self) -> None: # spawn and wait for all processes to start for i in range(self.num_agents): process = mp.Process(target=fn_processor, - args=(i, consumer_pipes, queues, barrier, self.agents_scope), + args=(i, consumer_pipes, queues, barrier, self.agents_scope, self.cfg), daemon=True) processes.append(process) process.start() @@ -305,11 +327,11 @@ def eval(self) -> None: barrier.wait() # reset env - states = self.env.reset() + states, infos = self.env.reset() if not states.is_cuda: states.share_memory_() - for timestep in tqdm.tqdm(range(self.initial_timestep, self.timesteps)): + for timestep in tqdm.tqdm(range(self.initial_timestep, self.timesteps), disable=self.disable_progressbar): # compute actions with torch.no_grad(): @@ -321,7 +343,7 @@ def eval(self) -> None: actions = torch.vstack([queue.get() for queue in queues]) # step the environments - next_states, rewards, dones, infos = self.env.step(actions) + next_states, rewards, terminated, truncated, infos = self.env.step(actions) # render scene if not self.headless: @@ -333,8 +355,10 @@ def eval(self) -> None: rewards.share_memory_() if not next_states.is_cuda: next_states.share_memory_() - if not dones.is_cuda: - dones.share_memory_() + if not terminated.is_cuda: + terminated.share_memory_() + if not truncated.is_cuda: + truncated.share_memory_() for pipe, queue in zip(producer_pipes, queues): pipe.send({"task": "eval-record_transition-post_interaction", @@ -342,13 +366,14 @@ def eval(self) -> None: "timesteps": self.timesteps}) queue.put(rewards) queue.put(next_states) - queue.put(dones) + queue.put(terminated) + queue.put(truncated) queue.put(infos) barrier.wait() # reset environments - if dones.any(): - states = self.env.reset() + if terminated.any() or truncated.any(): + states, infos = self.env.reset() if not states.is_cuda: states.share_memory_() else: diff --git 
a/skrl/trainers/torch/sequential.py b/skrl/trainers/torch/sequential.py index 910b6079..aacb6e8a 100644 --- a/skrl/trainers/torch/sequential.py +++ b/skrl/trainers/torch/sequential.py @@ -1,19 +1,20 @@ -from typing import Union, List +from typing import Union, List, Optional import copy import tqdm import torch -from ...envs.torch import Wrapper -from ...agents.torch import Agent +from skrl.envs.torch import Wrapper +from skrl.agents.torch import Agent -from . import Trainer +from skrl.trainers.torch import Trainer SEQUENTIAL_TRAINER_DEFAULT_CONFIG = { - "timesteps": 100000, # number of timesteps to train for - "headless": False, # whether to use headless mode (no rendering) + "timesteps": 100000, # number of timesteps to train for + "headless": False, # whether to use headless mode (no rendering) + "disable_progressbar": False, # whether to disable the progressbar. If None, disable on non-TTY } @@ -21,8 +22,8 @@ class SequentialTrainer(Trainer): def __init__(self, env: Wrapper, agents: Union[Agent, List[Agent]], - agents_scope : List[int] = [], - cfg: dict = {}) -> None: + agents_scope: Optional[List[int]] = None, + cfg: Optional[dict] = None) -> None: """Sequential trainer Train agents sequentially (i.e., one after the other in each interaction with the environment) @@ -38,15 +39,16 @@ def __init__(self, :type cfg: dict, optional """ _cfg = copy.deepcopy(SEQUENTIAL_TRAINER_DEFAULT_CONFIG) - _cfg.update(cfg) + _cfg.update(cfg if cfg is not None else {}) + agents_scope = agents_scope if agents_scope is not None else [] super().__init__(env=env, agents=agents, agents_scope=agents_scope, cfg=_cfg) # init agents if self.num_agents > 1: for agent in self.agents: - agent.init() + agent.init(trainer_cfg=self.cfg) else: - self.agents.init() + self.agents.init(trainer_cfg=self.cfg) def train(self) -> None: """Train the agents sequentially @@ -61,15 +63,22 @@ def train(self) -> None: - Post-interaction (sequentially) - Reset environments """ + # set running mode + if self.num_agents > 1: + for agent in self.agents: + agent.set_running_mode("train") + else: + self.agents.set_running_mode("train") + # single agent if self.num_agents == 1: self.single_agent_train() return # reset env - states = self.env.reset() + states, infos = self.env.reset() - for timestep in tqdm.tqdm(range(self.initial_timestep, self.timesteps)): + for timestep in tqdm.tqdm(range(self.initial_timestep, self.timesteps), disable=self.disable_progressbar): # pre-interaction for agent in self.agents: @@ -81,7 +90,7 @@ def train(self) -> None: for agent, scope in zip(self.agents, self.agents_scope)]) # step the environments - next_states, rewards, dones, infos = self.env.step(actions) + next_states, rewards, terminated, truncated, infos = self.env.step(actions) # render scene if not self.headless: @@ -94,7 +103,8 @@ def train(self) -> None: actions=actions[scope[0]:scope[1]], rewards=rewards[scope[0]:scope[1]], next_states=next_states[scope[0]:scope[1]], - dones=dones[scope[0]:scope[1]], + terminated=terminated[scope[0]:scope[1]], + truncated=truncated[scope[0]:scope[1]], infos=infos, timestep=timestep, timesteps=self.timesteps) @@ -105,8 +115,8 @@ def train(self) -> None: # reset environments with torch.no_grad(): - if dones.any(): - states = self.env.reset() + if terminated.any() or truncated.any(): + states, infos = self.env.reset() else: states.copy_(next_states) @@ -123,15 +133,22 @@ def eval(self) -> None: - Render scene - Reset environments """ + # set running mode + if self.num_agents > 1: + for agent in self.agents: + 
agent.set_running_mode("eval") + else: + self.agents.set_running_mode("eval") + # single agent if self.num_agents == 1: self.single_agent_eval() return # reset env - states = self.env.reset() + states, infos = self.env.reset() - for timestep in tqdm.tqdm(range(self.initial_timestep, self.timesteps)): + for timestep in tqdm.tqdm(range(self.initial_timestep, self.timesteps), disable=self.disable_progressbar): # compute actions with torch.no_grad(): @@ -139,7 +156,7 @@ def eval(self) -> None: for agent, scope in zip(self.agents, self.agents_scope)]) # step the environments - next_states, rewards, dones, infos = self.env.step(actions) + next_states, rewards, terminated, truncated, infos = self.env.step(actions) # render scene if not self.headless: @@ -148,29 +165,22 @@ def eval(self) -> None: with torch.no_grad(): # write data to TensorBoard for agent, scope in zip(self.agents, self.agents_scope): - super(type(agent), agent).record_transition(states=states[scope[0]:scope[1]], - actions=actions[scope[0]:scope[1]], - rewards=rewards[scope[0]:scope[1]], - next_states=next_states[scope[0]:scope[1]], - dones=dones[scope[0]:scope[1]], - infos=infos, - timestep=timestep, - timesteps=self.timesteps) + agent.record_transition(states=states[scope[0]:scope[1]], + actions=actions[scope[0]:scope[1]], + rewards=rewards[scope[0]:scope[1]], + next_states=next_states[scope[0]:scope[1]], + terminated=terminated[scope[0]:scope[1]], + truncated=truncated[scope[0]:scope[1]], + infos=infos, + timestep=timestep, + timesteps=self.timesteps) super(type(agent), agent).post_interaction(timestep=timestep, timesteps=self.timesteps) # reset environments - if dones.any(): - states = self.env.reset() + if terminated.any() or truncated.any(): + states, infos = self.env.reset() else: states.copy_(next_states) # close the environment self.env.close() - - def start(self) -> None: - """Start training - - This method is deprecated in favour of the '.train()' method - """ - super().start() - self.train() diff --git a/skrl/utils/__init__.py b/skrl/utils/__init__.py index aaef12ab..1689e22c 100644 --- a/skrl/utils/__init__.py +++ b/skrl/utils/__init__.py @@ -14,8 +14,8 @@ def set_seed(seed: Optional[int] = None, deterministic: bool = False) -> int: """ Set the seed for the random number generators - Due to NumPy's legacy seeding constraint the seed must be between 0 and 2**32 - 1. - Otherwise a NumPy exception (``ValueError: Seed must be between 0 and 2**32 - 1``) will be raised + Due to NumPy's legacy seeding constraint the seed must be between 0 and 2**32 - 1. + Otherwise a NumPy exception (``ValueError: Seed must be between 0 and 2**32 - 1``) will be raised Modified packages: @@ -49,7 +49,7 @@ def set_seed(seed: Optional[int] = None, deterministic: bool = False) -> int: :param seed: The seed to set. Is None, a random seed will be generated (default: ``None``) :type seed: int, optional :param deterministic: Whether PyTorch is configured to use deterministic algorithms (default: ``False``). - The following environment variables should be established for CUDA 10.1 (``CUDA_LAUNCH_BLOCKING=1``) + The following environment variables should be established for CUDA 10.1 (``CUDA_LAUNCH_BLOCKING=1``) and for CUDA 10.2 or later (``CUBLAS_WORKSPACE_CONFIG=:16:8`` or ``CUBLAS_WORKSPACE_CONFIG=:4096:2``). 
See PyTorch `Reproducibility `_ for details :type deterministic: bool, optional diff --git a/skrl/utils/control.py b/skrl/utils/control.py index d04a1508..d12591a8 100644 --- a/skrl/utils/control.py +++ b/skrl/utils/control.py @@ -2,8 +2,8 @@ import torch -def ik(jacobian_end_effector, - current_position, current_orientation, +def ik(jacobian_end_effector, + current_position, current_orientation, goal_position, goal_orientation, damping_factor=0.05): """ @@ -22,15 +22,15 @@ def ik(jacobian_end_effector, lmbda = torch.eye(6).to(jacobian_end_effector.device) * (damping_factor ** 2) return (transpose @ torch.inverse(jacobian_end_effector @ transpose + lmbda) @ dpose) -def osc(jacobian_end_effector, mass_matrix, - current_position, current_orientation, +def osc(jacobian_end_effector, mass_matrix, + current_position, current_orientation, goal_position, goal_orientation, current_dof_velocities, kp=5, kv=2): """ https://studywolf.wordpress.com/2013/09/17/robot-control-4-operation-space-control/ """ - + mass_matrix_end_effector = torch.inverse(jacobian_end_effector @ torch.inverse(mass_matrix) @ torch.transpose(jacobian_end_effector, 1, 2)) # compute position and orientation error @@ -41,4 +41,3 @@ def osc(jacobian_end_effector, mass_matrix, dpose = torch.cat([position_error, orientation_error], -1) return torch.transpose(jacobian_end_effector, 1, 2) @ mass_matrix_end_effector @ (kp * dpose).unsqueeze(-1) - kv * mass_matrix @ current_dof_velocities - \ No newline at end of file diff --git a/skrl/utils/isaacgym_utils.py b/skrl/utils/isaacgym_utils.py index eb7aa2b6..928d3636 100644 --- a/skrl/utils/isaacgym_utils.py +++ b/skrl/utils/isaacgym_utils.py @@ -1,4 +1,4 @@ -from typing import Union, List +from typing import List, Optional import math import logging @@ -69,9 +69,9 @@ def _route_index(self) -> 'flask.Response': @@ -114,7 +114,7 @@ def _route_index(self) -> 'flask.Response': if(event.keyCode != 18) sendInputRequest({key: event.keyCode}); }, false); - + canvas.addEventListener('mousemove', function(event){ if(event.buttons){ let data = {dx: event.movementX, dy: event.movementY}; @@ -132,7 +132,7 @@ def _route_index(self) -> 'flask.Response': } }, false); - canvas.addEventListener('wheel', function(event){ + canvas.addEventListener('wheel', function(event){ sendInputRequest({mouse: "wheel", dz: Math.sign(event.deltaY)}); }, false); } @@ -145,12 +145,12 @@ def _route_index(self) -> 'flask.Response': def _route_stream(self) -> 'flask.Response': """Stream the image to the web page - + :return: Flask response :rtype: flask.Response """ return flask.Response(self._stream(), mimetype='multipart/x-mixed-replace; boundary=frame') - + def _route_input_event(self) -> 'flask.Response': """Handle keyboard and mouse input @@ -182,22 +182,22 @@ def p_target(p, q, a=0, b=0, c=1, d=0): t = -(a * p[0] + b * p[1] + c * p[2] + d) / denominator return [p[0] + t * (p1[0] - p[0]), p[1] + t * (p1[1] - p[1]), p[2] + t * (p1[2] - p[2])] return v - + # get keyboard and mouse inputs data = flask.request.get_json() key, mouse = data.get("key", None), data.get("mouse", None) dx, dy, dz = data.get("dx", None), data.get("dy", None), data.get("dz", None) - transform = self._gym.get_camera_transform(self._sim, + transform = self._gym.get_camera_transform(self._sim, self._envs[self._camera_id], self._cameras[self._camera_id]) # zoom in/out if mouse == "wheel": # compute zoom vector - vector = qv_mult([transform.r.w, transform.r.x, transform.r.y, transform.r.z], + vector = qv_mult([transform.r.w, transform.r.x, transform.r.y, 
transform.r.z], [-0.025 * dz, 0, 0]) - + # update transform transform.p.x += vector[0] transform.p.y += vector[1] @@ -214,7 +214,7 @@ def p_target(p, q, a=0, b=0, c=1, d=0): q = q_mult(q, q_from_angle_axis(dy, [1, 0, 0])) # apply rotation - t = p_target([transform.p.x, transform.p.y, transform.p.z], + t = p_target([transform.p.x, transform.p.y, transform.p.z], [transform.r.w, transform.r.x, transform.r.y, transform.r.z]) p = qv_mult(q, [transform.p.x - t[0], transform.p.y - t[1], transform.p.z - t[2]]) q = q_mult(q, [transform.r.w, transform.r.x, transform.r.y, transform.r.z]) @@ -240,7 +240,7 @@ def p_target(p, q, a=0, b=0, c=1, d=0): # update transform transform.r.w, transform.r.x, transform.r.y, transform.r.z = q - + # walk camera elif mouse == "middle": # compute displacement @@ -264,12 +264,12 @@ def p_target(p, q, a=0, b=0, c=1, d=0): elif self._camera_type == gymapi.IMAGE_DEPTH: self._camera_type = gymapi.IMAGE_COLOR return flask.Response(status=200) - + else: return flask.Response(status=200) - self._gym.set_camera_transform(self._cameras[self._camera_id], - self._envs[self._camera_id], + self._gym.set_camera_transform(self._cameras[self._camera_id], + self._envs[self._camera_id], transform) return flask.Response(status=200) @@ -310,9 +310,9 @@ def setup(self, gym: 'isaacgym.gymapi.Gym', sim: 'isaacgym.gymapi.Sim', envs: Li self._envs = envs self._cameras = cameras - def render(self, - fetch_results: bool = True, - step_graphics: bool = True, + def render(self, + fetch_results: bool = True, + step_graphics: bool = True, render_all_camera_sensors: bool = True, wait_for_page_load: bool = True) -> None: """Render and get the image from the current camera @@ -320,11 +320,11 @@ def render(self, This function must be called after the simulation is stepped (post_physics_step). The following Isaac Gym functions are called before get the image. 
Their calling can be skipped by setting the corresponding argument to False - + - fetch_results - step_graphics - render_all_camera_sensors - + :param fetch_results: Call Gym.fetch_results method (default: True) :type fetch_results: bool :param step_graphics: Call Gym.step_graphics method (default: True) @@ -357,33 +357,33 @@ def render(self, self._gym.step_graphics(self._sim) if render_all_camera_sensors: self._gym.render_all_camera_sensors(self._sim) - + # get image - image = self._gym.get_camera_image(self._sim, + image = self._gym.get_camera_image(self._sim, self._envs[self._camera_id], - self._cameras[self._camera_id], + self._cameras[self._camera_id], self._camera_type) if self._camera_type == gymapi.IMAGE_COLOR: self._image = image.reshape(image.shape[0], -1, 4)[..., :3] elif self._camera_type == gymapi.IMAGE_DEPTH: - self._image = -image.reshape(image.shape[0], -1) + self._image = -image.reshape(image.shape[0], -1) minimum = 0 if np.isinf(np.min(self._image)) else np.min(self._image) maximum = 5 if np.isinf(np.max(self._image)) else np.max(self._image) self._image = np.clip(1 - (self._image - minimum) / (maximum - minimum), 0, 1) self._image = np.uint8(255 * self._image) else: raise ValueError("Unsupported camera type") - + # notify stream thread self._event_stream.set() self._notified = True -def ik(jacobian_end_effector: torch.Tensor, +def ik(jacobian_end_effector: torch.Tensor, current_position: torch.Tensor, current_orientation: torch.Tensor, goal_position: torch.Tensor, - goal_orientation: Union[torch.Tensor, None] = None, + goal_orientation: Optional[torch.Tensor] = None, damping_factor: float = 0.05, squeeze_output: bool = True) -> torch.Tensor: """ @@ -431,18 +431,18 @@ def print_arguments(args): print(" |-- {}: {}".format(a, args.__getattribute__(a))) def print_asset_options(asset_options: 'isaacgym.gymapi.AssetOptions', asset_name: str = ""): - attrs = ["angular_damping", "armature", "collapse_fixed_joints", "convex_decomposition_from_submeshes", - "default_dof_drive_mode", "density", "disable_gravity", "fix_base_link", "flip_visual_attachments", - "linear_damping", "max_angular_velocity", "max_linear_velocity", "mesh_normal_mode", "min_particle_mass", - "override_com", "override_inertia", "replace_cylinder_with_capsule", "tendon_limit_stiffness", "thickness", + attrs = ["angular_damping", "armature", "collapse_fixed_joints", "convex_decomposition_from_submeshes", + "default_dof_drive_mode", "density", "disable_gravity", "fix_base_link", "flip_visual_attachments", + "linear_damping", "max_angular_velocity", "max_linear_velocity", "mesh_normal_mode", "min_particle_mass", + "override_com", "override_inertia", "replace_cylinder_with_capsule", "tendon_limit_stiffness", "thickness", "use_mesh_materials", "use_physx_armature", "vhacd_enabled"] # vhacd_params print("\nAsset options{}".format(" ({})".format(asset_name) if asset_name else "")) for attr in attrs: print(" |-- {}: {}".format(attr, getattr(asset_options, attr) if hasattr(asset_options, attr) else "--")) # vhacd attributes if attr == "vhacd_enabled" and hasattr(asset_options, attr) and getattr(asset_options, attr): - vhacd_attrs = ["alpha", "beta", "concavity", "convex_hull_approximation", "convex_hull_downsampling", - "max_convex_hulls", "max_num_vertices_per_ch", "min_volume_per_ch", "mode", "ocl_acceleration", + vhacd_attrs = ["alpha", "beta", "concavity", "convex_hull_approximation", "convex_hull_downsampling", + "max_convex_hulls", "max_num_vertices_per_ch", "min_volume_per_ch", "mode", "ocl_acceleration", "pca", 
"plane_downsampling", "project_hull_vertices", "resolution"] print(" |-- vhacd_params:") for vhacd_attr in vhacd_attrs: diff --git a/skrl/utils/model_instantiators.py b/skrl/utils/model_instantiators.py index 86b64294..8de39032 100644 --- a/skrl/utils/model_instantiators.py +++ b/skrl/utils/model_instantiators.py @@ -1,16 +1,17 @@ -from typing import Union, Tuple +from typing import Union, Tuple, Optional import gym +import gymnasium from enum import Enum import torch import torch.nn as nn -from ..models.torch import Model -from ..models.torch import GaussianMixin -from ..models.torch import CategoricalMixin -from ..models.torch import DeterministicMixin -from ..models.torch import MultivariateGaussianMixin +from skrl.models.torch import Model +from skrl.models.torch import GaussianMixin +from skrl.models.torch import CategoricalMixin +from skrl.models.torch import DeterministicMixin +from skrl.models.torch import MultivariateGaussianMixin __all__ = ["categorical_model", "deterministic_model", "gaussian_model", "multivariate_gaussian_model", "Shape"] @@ -82,18 +83,18 @@ def _get_num_units_by_shape(model: Model, shape: Shape) -> int: :rtype: int """ num_units = {Shape.ONE: 1, - Shape.STATES: model.num_observations, + Shape.STATES: model.num_observations, Shape.ACTIONS: model.num_actions, Shape.STATES_ACTIONS: model.num_observations + model.num_actions} return num_units[shape] -def _generate_sequential(model: Model, - input_shape: Shape = Shape.STATES, - hiddens: list = [256, 256], - hidden_activation: list = ["relu", "relu"], - output_shape: Shape = Shape.ACTIONS, - output_activation: Union[str, None] = "tanh", - output_scale: int = None) -> nn.Sequential: +def _generate_sequential(model: Model, + input_shape: Shape = Shape.STATES, + hiddens: list = [256, 256], + hidden_activation: list = ["relu", "relu"], + output_shape: Shape = Shape.ACTIONS, + output_activation: Union[str, None] = "tanh", + output_scale: Optional[int] = None) -> nn.Sequential: """Generate a sequential model :param model: model to generate sequential model for @@ -127,31 +128,32 @@ def _generate_sequential(model: Model, output_layer = [nn.Linear(hiddens[-1], _get_num_units_by_shape(model, output_shape))] if output_activation is not None: output_layer.append(_get_activation_function(output_activation)) - + return nn.Sequential(*input_layer, *hidden_layers, *output_layer) -def gaussian_model(observation_space: Union[int, Tuple[int], gym.Space, None] = None, - action_space: Union[int, Tuple[int], gym.Space, None] = None, - device: Union[str, torch.device] = "cuda:0", - clip_actions: bool = False, - clip_log_std: bool = True, - min_log_std: float = -20, - max_log_std: float = 2, - input_shape: Shape = Shape.STATES, - hiddens: list = [256, 256], - hidden_activation: list = ["relu", "relu"], - output_shape: Shape = Shape.ACTIONS, - output_activation: Union[str, None] = "tanh", - output_scale: float = 1.0) -> Model: +def gaussian_model(observation_space: Optional[Union[int, Tuple[int], gym.Space, gymnasium.Space]] = None, + action_space: Optional[Union[int, Tuple[int], gym.Space, gymnasium.Space]] = None, + device: Optional[Union[str, torch.device]] = None, + clip_actions: bool = False, + clip_log_std: bool = True, + min_log_std: float = -20, + max_log_std: float = 2, + input_shape: Shape = Shape.STATES, + hiddens: list = [256, 256], + hidden_activation: list = ["relu", "relu"], + output_shape: Shape = Shape.ACTIONS, + output_activation: Optional[str] = "tanh", + output_scale: float = 1.0) -> Model: """Instantiate a Gaussian 
model :param observation_space: Observation/state space or shape (default: None). If it is not None, the num_observations property will contain the size of that space - :type observation_space: int, tuple or list of integers, gym.Space or None, optional + :type observation_space: int, tuple or list of integers, gym.Space, gymnasium.Space or None, optional :param action_space: Action space or shape (default: None). If it is not None, the num_actions property will contain the size of that space - :type action_space: int, tuple or list of integers, gym.Space or None, optional - :param device: Device on which the model will be trained (default: "cuda:0") + :type action_space: int, tuple or list of integers, gym.Space, gymnasium.Space or None, optional + :param device: Device on which a torch tensor is or will be allocated (default: ``None``). + If None, the device will be either ``"cuda:0"`` if available or ``"cpu"`` :type device: str or torch.device, optional :param clip_actions: Flag to indicate whether the actions should be clipped (default: False) :type clip_actions: bool, optional @@ -195,55 +197,56 @@ def __init__(self, observation_space, action_space, device, clip_actions, output_activation=metadata["output_activation"], output_scale=metadata["output_scale"]) self.log_std_parameter = nn.Parameter(torch.zeros(_get_num_units_by_shape(self, metadata["output_shape"]))) - - def compute(self, states, taken_actions=None, role=""): + + def compute(self, inputs, role=""): if self.instantiator_input_type == 0: - output = self.net(states) + output = self.net(inputs["states"]) elif self.instantiator_input_type == -1: - output = self.net(taken_actions) + output = self.net(inputs["taken_actions"]) elif self.instantiator_input_type == -2: - output = self.net(torch.cat((states, taken_actions), dim=1)) + output = self.net(torch.cat((inputs["states"], inputs["taken_actions"]), dim=1)) - return output * self.instantiator_output_scale, self.log_std_parameter + return output * self.instantiator_output_scale, self.log_std_parameter, {} - metadata = {"input_shape": input_shape, - "hiddens": hiddens, - "hidden_activation": hidden_activation, - "output_shape": output_shape, - "output_activation": output_activation, + metadata = {"input_shape": input_shape, + "hiddens": hiddens, + "hidden_activation": hidden_activation, + "output_shape": output_shape, + "output_activation": output_activation, "output_scale": output_scale} return GaussianModel(observation_space=observation_space, - action_space=action_space, - device=device, - clip_actions=clip_actions, - clip_log_std=clip_log_std, + action_space=action_space, + device=device, + clip_actions=clip_actions, + clip_log_std=clip_log_std, min_log_std=min_log_std, max_log_std=max_log_std, metadata=metadata) - -def multivariate_gaussian_model(observation_space: Union[int, Tuple[int], gym.Space, None] = None, - action_space: Union[int, Tuple[int], gym.Space, None] = None, - device: Union[str, torch.device] = "cuda:0", - clip_actions: bool = False, - clip_log_std: bool = True, - min_log_std: float = -20, - max_log_std: float = 2, - input_shape: Shape = Shape.STATES, - hiddens: list = [256, 256], - hidden_activation: list = ["relu", "relu"], - output_shape: Shape = Shape.ACTIONS, - output_activation: Union[str, None] = "tanh", - output_scale: float = 1.0) -> Model: + +def multivariate_gaussian_model(observation_space: Optional[Union[int, Tuple[int], gym.Space, gymnasium.Space]] = None, + action_space: Optional[Union[int, Tuple[int], gym.Space, gymnasium.Space]] = None, + 
device: Optional[Union[str, torch.device]] = None, + clip_actions: bool = False, + clip_log_std: bool = True, + min_log_std: float = -20, + max_log_std: float = 2, + input_shape: Shape = Shape.STATES, + hiddens: list = [256, 256], + hidden_activation: list = ["relu", "relu"], + output_shape: Shape = Shape.ACTIONS, + output_activation: Optional[str] = "tanh", + output_scale: float = 1.0) -> Model: """Instantiate a multivariate Gaussian model :param observation_space: Observation/state space or shape (default: None). If it is not None, the num_observations property will contain the size of that space - :type observation_space: int, tuple or list of integers, gym.Space or None, optional + :type observation_space: int, tuple or list of integers, gym.Space, gymnasium.Space or None, optional :param action_space: Action space or shape (default: None). If it is not None, the num_actions property will contain the size of that space - :type action_space: int, tuple or list of integers, gym.Space or None, optional - :param device: Device on which the model will be trained (default: "cuda:0") + :type action_space: int, tuple or list of integers, gym.Space, gymnasium.Space or None, optional + :param device: Device on which a torch tensor is or will be allocated (default: ``None``). + If None, the device will be either ``"cuda:0"`` if available or ``"cpu"`` :type device: str or torch.device, optional :param clip_actions: Flag to indicate whether the actions should be clipped (default: False) :type clip_actions: bool, optional @@ -287,52 +290,53 @@ def __init__(self, observation_space, action_space, device, clip_actions, output_activation=metadata["output_activation"], output_scale=metadata["output_scale"]) self.log_std_parameter = nn.Parameter(torch.zeros(_get_num_units_by_shape(self, metadata["output_shape"]))) - - def compute(self, states, taken_actions=None, role=""): + + def compute(self, inputs, role=""): if self.instantiator_input_type == 0: - output = self.net(states) + output = self.net(inputs["states"]) elif self.instantiator_input_type == -1: - output = self.net(taken_actions) + output = self.net(inputs["taken_actions"]) elif self.instantiator_input_type == -2: - output = self.net(torch.cat((states, taken_actions), dim=1)) + output = self.net(torch.cat((inputs["states"], inputs["taken_actions"]), dim=1)) - return output * self.instantiator_output_scale, self.log_std_parameter + return output * self.instantiator_output_scale, self.log_std_parameter, {} - metadata = {"input_shape": input_shape, - "hiddens": hiddens, - "hidden_activation": hidden_activation, - "output_shape": output_shape, - "output_activation": output_activation, + metadata = {"input_shape": input_shape, + "hiddens": hiddens, + "hidden_activation": hidden_activation, + "output_shape": output_shape, + "output_activation": output_activation, "output_scale": output_scale} return MultivariateGaussianModel(observation_space=observation_space, - action_space=action_space, - device=device, - clip_actions=clip_actions, - clip_log_std=clip_log_std, + action_space=action_space, + device=device, + clip_actions=clip_actions, + clip_log_std=clip_log_std, min_log_std=min_log_std, max_log_std=max_log_std, metadata=metadata) -def deterministic_model(observation_space: Union[int, Tuple[int], gym.Space, None] = None, - action_space: Union[int, Tuple[int], gym.Space, None] = None, - device: Union[str, torch.device] = "cuda:0", - clip_actions: bool = False, - input_shape: Shape = Shape.STATES, - hiddens: list = [256, 256], - hidden_activation: list 
= ["relu", "relu"], - output_shape: Shape = Shape.ACTIONS, - output_activation: Union[str, None] = "tanh", +def deterministic_model(observation_space: Optional[Union[int, Tuple[int], gym.Space, gymnasium.Space]] = None, + action_space: Optional[Union[int, Tuple[int], gym.Space, gymnasium.Space]] = None, + device: Optional[Union[str, torch.device]] = None, + clip_actions: bool = False, + input_shape: Shape = Shape.STATES, + hiddens: list = [256, 256], + hidden_activation: list = ["relu", "relu"], + output_shape: Shape = Shape.ACTIONS, + output_activation: Optional[str] = "tanh", output_scale: float = 1.0) -> Model: """Instantiate a deterministic model :param observation_space: Observation/state space or shape (default: None). If it is not None, the num_observations property will contain the size of that space - :type observation_space: int, tuple or list of integers, gym.Space or None, optional + :type observation_space: int, tuple or list of integers, gym.Space, gymnasium.Space or None, optional :param action_space: Action space or shape (default: None). If it is not None, the num_actions property will contain the size of that space - :type action_space: int, tuple or list of integers, gym.Space or None, optional - :param device: Device on which a torch tensor is or will be allocated (default: "cuda:0") + :type action_space: int, tuple or list of integers, gym.Space, gymnasium.Space or None, optional + :param device: Device on which a torch tensor is or will be allocated (default: ``None``). + If None, the device will be either ``"cuda:0"`` if available or ``"cpu"`` :type device: str or torch.device, optional :param clip_actions: Flag to indicate whether the actions should be clipped to the action space (default: False) :type clip_actions: bool, optional @@ -368,52 +372,53 @@ def __init__(self, observation_space, action_space, device, clip_actions, metada output_shape=metadata["output_shape"], output_activation=metadata["output_activation"], output_scale=metadata["output_scale"]) - - def compute(self, states, taken_actions=None, role=""): + + def compute(self, inputs, role=""): if self.instantiator_input_type == 0: - output = self.net(states) + output = self.net(inputs["states"]) elif self.instantiator_input_type == -1: - output = self.net(taken_actions) + output = self.net(inputs["taken_actions"]) elif self.instantiator_input_type == -2: - output = self.net(torch.cat((states, taken_actions), dim=1)) + output = self.net(torch.cat((inputs["states"], inputs["taken_actions"]), dim=1)) - return output * self.instantiator_output_scale + return output * self.instantiator_output_scale, {} - metadata = {"input_shape": input_shape, - "hiddens": hiddens, - "hidden_activation": hidden_activation, - "output_shape": output_shape, - "output_activation": output_activation, + metadata = {"input_shape": input_shape, + "hiddens": hiddens, + "hidden_activation": hidden_activation, + "output_shape": output_shape, + "output_activation": output_activation, "output_scale": output_scale} return DeterministicModel(observation_space=observation_space, - action_space=action_space, - device=device, - clip_actions=clip_actions, + action_space=action_space, + device=device, + clip_actions=clip_actions, metadata=metadata) -def categorical_model(observation_space: Union[int, Tuple[int], gym.Space, None] = None, - action_space: Union[int, Tuple[int], gym.Space, None] = None, - device: Union[str, torch.device] = "cuda:0", - unnormalized_log_prob: bool = False, - input_shape: Shape = Shape.STATES, - hiddens: list = [256, 
256], - hidden_activation: list = ["relu", "relu"], - output_shape: Shape = Shape.ACTIONS, - output_activation: Union[str, None] = None) -> Model: +def categorical_model(observation_space: Optional[Union[int, Tuple[int], gym.Space, gymnasium.Space]] = None, + action_space: Optional[Union[int, Tuple[int], gym.Space, gymnasium.Space]] = None, + device: Optional[Union[str, torch.device]] = None, + unnormalized_log_prob: bool = False, + input_shape: Shape = Shape.STATES, + hiddens: list = [256, 256], + hidden_activation: list = ["relu", "relu"], + output_shape: Shape = Shape.ACTIONS, + output_activation: Optional[str] = None) -> Model: """Instantiate a categorical model :param observation_space: Observation/state space or shape (default: None). If it is not None, the num_observations property will contain the size of that space - :type observation_space: int, tuple or list of integers, gym.Space or None, optional + :type observation_space: int, tuple or list of integers, gym.Space, gymnasium.Space or None, optional :param action_space: Action space or shape (default: None). If it is not None, the num_actions property will contain the size of that space - :type action_space: int, tuple or list of integers, gym.Space or None, optional - :param device: Device on which a torch tensor is or will be allocated (default: "cuda:0") + :type action_space: int, tuple or list of integers, gym.Space, gymnasium.Space or None, optional + :param device: Device on which a torch tensor is or will be allocated (default: ``None``). + If None, the device will be either ``"cuda:0"`` if available or ``"cpu"`` :type device: str or torch.device, optional :param unnormalized_log_prob: Flag to indicate how to be interpreted the model's output (default: True). - If True, the model's output is interpreted as unnormalized log probabilities - (it can be any real number), otherwise as normalized probabilities + If True, the model's output is interpreted as unnormalized log probabilities + (it can be any real number), otherwise as normalized probabilities (the output must be non-negative, finite and have a non-zero sum) :type unnormalized_log_prob: bool, optional :param input_shape: Shape of the input (default: Shape.STATES) @@ -443,25 +448,25 @@ def __init__(self, observation_space, action_space, device, unnormalized_log_pro hidden_activation=metadata["hidden_activation"], output_shape=metadata["output_shape"], output_activation=metadata["output_activation"]) - - def compute(self, states, taken_actions=None, role=""): + + def compute(self, inputs, role=""): if self.instantiator_input_type == 0: - output = self.net(states) + output = self.net(inputs["states"]) elif self.instantiator_input_type == -1: - output = self.net(taken_actions) + output = self.net(inputs["taken_actions"]) elif self.instantiator_input_type == -2: - output = self.net(torch.cat((states, taken_actions), dim=1)) + output = self.net(torch.cat((inputs["states"], inputs["taken_actions"]), dim=1)) - return output + return output, {} - metadata = {"input_shape": input_shape, - "hiddens": hiddens, - "hidden_activation": hidden_activation, - "output_shape": output_shape, + metadata = {"input_shape": input_shape, + "hiddens": hiddens, + "hidden_activation": hidden_activation, + "output_shape": output_shape, "output_activation": output_activation} return CategoricalModel(observation_space=observation_space, - action_space=action_space, - device=device, - unnormalized_log_prob=unnormalized_log_prob, + action_space=action_space, + device=device, + 
unnormalized_log_prob=unnormalized_log_prob, metadata=metadata) diff --git a/skrl/utils/omniverse_isaacgym_utils.py b/skrl/utils/omniverse_isaacgym_utils.py index 42d5da67..5a8f4f7e 100644 --- a/skrl/utils/omniverse_isaacgym_utils.py +++ b/skrl/utils/omniverse_isaacgym_utils.py @@ -54,7 +54,7 @@ def _torch_quat_conjugate(a): # wxyz a = a.reshape(-1, 4) return torch.cat((a[:, :1], -a[:, 1:]), dim=-1).view(shape) -def ik(jacobian_end_effector: torch.Tensor, +def ik(jacobian_end_effector: torch.Tensor, current_position: torch.Tensor, current_orientation: torch.Tensor, goal_position: torch.Tensor, @@ -139,50 +139,50 @@ def get_env_instance(headless: bool = True, multi_threaded: bool = False) -> "om # parse sim configuration from omniisaacgymenvs.utils.config_utils.sim_config import SimConfig - sim_config = SimConfig({"test": False, - "device_id": 0, + sim_config = SimConfig({"test": False, + "device_id": 0, "headless": True, - "sim_device": "gpu", - "task": {"name": "CustomTask", - "physics_engine": "physx", - "env": {"numEnvs": 512, - "envSpacing": 1.5, - "enableDebugVis": False, - "clipObservations": 1000.0, - "clipActions": 1.0, - "controlFrequencyInv": 4}, + "sim_device": "gpu", + "task": {"name": "CustomTask", + "physics_engine": "physx", + "env": {"numEnvs": 512, + "envSpacing": 1.5, + "enableDebugVis": False, + "clipObservations": 1000.0, + "clipActions": 1.0, + "controlFrequencyInv": 4}, "sim": {"dt": 0.0083, # 1 / 120 - "use_gpu_pipeline": True, - "gravity": [0.0, 0.0, -9.81], - "add_ground_plane": True, - "use_flatcache": True, - "enable_scene_query_support": False, - "enable_cameras": False, - "default_physics_material": {"static_friction": 1.0, - "dynamic_friction": 1.0, - "restitution": 0.0}, - "physx": {"worker_thread_count": 4, - "solver_type": 1, - "use_gpu": True, - "solver_position_iteration_count": 4, - "solver_velocity_iteration_count": 1, - "contact_offset": 0.005, - "rest_offset": 0.0, - "bounce_threshold_velocity": 0.2, - "friction_offset_threshold": 0.04, - "friction_correlation_distance": 0.025, - "enable_sleeping": True, - "enable_stabilization": True, - "max_depenetration_velocity": 1000.0, - "gpu_max_rigid_contact_count": 524288, - "gpu_max_rigid_patch_count": 33554432, - "gpu_found_lost_pairs_capacity": 524288, - "gpu_found_lost_aggregate_pairs_capacity": 262144, - "gpu_total_aggregate_pairs_capacity": 1048576, - "gpu_max_soft_body_contacts": 1048576, - "gpu_max_particle_contacts": 1048576, - "gpu_heap_capacity": 33554432, - "gpu_temp_buffer_capacity": 16777216, + "use_gpu_pipeline": True, + "gravity": [0.0, 0.0, -9.81], + "add_ground_plane": True, + "use_flatcache": True, + "enable_scene_query_support": False, + "enable_cameras": False, + "default_physics_material": {"static_friction": 1.0, + "dynamic_friction": 1.0, + "restitution": 0.0}, + "physx": {"worker_thread_count": 4, + "solver_type": 1, + "use_gpu": True, + "solver_position_iteration_count": 4, + "solver_velocity_iteration_count": 1, + "contact_offset": 0.005, + "rest_offset": 0.0, + "bounce_threshold_velocity": 0.2, + "friction_offset_threshold": 0.04, + "friction_correlation_distance": 0.025, + "enable_sleeping": True, + "enable_stabilization": True, + "max_depenetration_velocity": 1000.0, + "gpu_max_rigid_contact_count": 524288, + "gpu_max_rigid_patch_count": 33554432, + "gpu_found_lost_pairs_capacity": 524288, + "gpu_found_lost_aggregate_pairs_capacity": 262144, + "gpu_total_aggregate_pairs_capacity": 1048576, + "gpu_max_soft_body_contacts": 1048576, + "gpu_max_particle_contacts": 1048576, + 
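# --- Illustrative usage sketch (not part of the diff above) ----------------------
# A minimal sketch of the updated model instantiators: they now also accept
# gymnasium spaces, and `device` defaults to None ("cuda:0" if available, else
# "cpu"). The observation/action spaces below are made up purely for illustration.
import gymnasium
from skrl.utils.model_instantiators import gaussian_model, deterministic_model, Shape

observation_space = gymnasium.spaces.Box(low=-1.0, high=1.0, shape=(4,))
action_space = gymnasium.spaces.Box(low=-1.0, high=1.0, shape=(1,))

policy = gaussian_model(observation_space=observation_space,
                        action_space=action_space,
                        device=None,                        # resolved automatically
                        input_shape=Shape.STATES,
                        hiddens=[256, 256],
                        hidden_activation=["relu", "relu"],
                        output_shape=Shape.ACTIONS,
                        output_activation="tanh")

critic = deterministic_model(observation_space=observation_space,
                             action_space=action_space,
                             input_shape=Shape.STATES_ACTIONS,  # concatenated states and actions
                             output_shape=Shape.ONE,
                             output_activation=None)
# The instantiated models follow the new call convention: compute() takes a dict of
# inputs ({"states": ..., "taken_actions": ...}) and returns a tuple whose last
# element is a dict of extra output values.
# ----------------------------------------------------------------------------------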
"gpu_heap_capacity": 33554432, + "gpu_temp_buffer_capacity": 16777216, "gpu_max_num_partitions": 8}}}}) # import and setup custom task @@ -225,7 +225,7 @@ def stop(self): class _OmniIsaacGymVecEnvMT(VecEnvMT): def __init__(self, headless): super().__init__(headless) - + self.action_queue = queue.Queue(1) self.data_queue = queue.Queue(1) diff --git a/skrl/utils/postprocessing.py b/skrl/utils/postprocessing.py index 7192ee66..f35f48fc 100644 --- a/skrl/utils/postprocessing.py +++ b/skrl/utils/postprocessing.py @@ -12,9 +12,9 @@ class MemoryFileIterator(): def __init__(self, pathname: str) -> None: """Python iterator for loading data from exported memories - + The iterator will load the next memory file in the list of path names. - The output of the iterator is a tuple of the filename and the memory data + The output of the iterator is a tuple of the filename and the memory data where the memory data is a dictionary of torch.Tensor (PyTorch), numpy.ndarray (NumPy) or lists (CSV) depending on the format and the keys of the dictionary are the names of the variables @@ -31,7 +31,7 @@ def __init__(self, pathname: str) -> None: - Comma-separated values: (memory_size * num_envs, data_size) :param pathname: String containing a path specification for the exported memories. - Python `glob `_ method + Python `glob `_ method is used to find all files matching the path specification :type pathname: str """ @@ -50,7 +50,7 @@ def __next__(self) -> Tuple[str, dict]: """ if self.n >= len(self.file_paths): raise StopIteration - + if self.file_paths[self.n].endswith(".pt"): return self._format_torch() elif self.file_paths[self.n].endswith(".npz"): @@ -62,7 +62,7 @@ def __next__(self) -> Tuple[str, dict]: def _format_numpy(self) -> Tuple[str, dict]: """Load numpy array from file - + :return: Tuple of file name and data :rtype: tuple """ @@ -94,7 +94,7 @@ def _format_csv(self) -> Tuple[str, dict]: with open(self.file_paths[self.n], 'r') as f: reader = csv.reader(f) - + # parse header try: header = next(reader, None) @@ -123,13 +123,13 @@ def _format_csv(self) -> Tuple[str, dict]: class TensorboardFileIterator(): def __init__(self, pathname: str, tags: Union[str, List[str]]) -> None: """Python iterator for loading data from Tensorboard files - + The iterator will load the next Tensorboard file in the list of path names. The iterator's output is a tuple of the directory name and the Tensorboard variables selected by the tags. The Tensorboard data is returned as a dictionary with the tag as the key and a list of steps and values as the value :param pathname: String containing a path specification for the Tensorboard files. 
- Python `glob `_ method + Python `glob `_ method is used to find all files matching the path specification :type pathname: str :param tags: String or list of strings containing the tags of the variables to load @@ -150,7 +150,7 @@ def __next__(self) -> Tuple[str, dict]: :rtype: tuple """ from tensorflow.python.summary.summary_iterator import summary_iterator - + if self.n >= len(self.file_paths): raise StopIteration diff --git a/skrl/version.txt b/skrl/version.txt index 8adc70fd..ac39a106 100644 --- a/skrl/version.txt +++ b/skrl/version.txt @@ -1 +1 @@ -0.8.0 \ No newline at end of file +0.9.0 diff --git a/tests/test_examples_deepmind.py b/tests/test_examples_deepmind.py new file mode 100644 index 00000000..1e946da4 --- /dev/null +++ b/tests/test_examples_deepmind.py @@ -0,0 +1,24 @@ +import pytest +import warnings +import hypothesis +import hypothesis.strategies as st + +import os +import subprocess + +EXAMPLE_DIR = "deepmind" +SCRIPTS = ["dm_suite_cartpole_swingup_ddpg.py", + "dm_manipulation_stack_sac.py", ""] +EXAMPLES_DIR = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "docs", "source", "examples")) +COMMANDS = [f"python {os.path.join(EXAMPLES_DIR, EXAMPLE_DIR, script)}" for script in SCRIPTS] + + +@pytest.mark.parametrize("command", COMMANDS) +def test_scripts(capsys, command): + try: + import gym + except ImportError as e: + warnings.warn(f"\n\nUnable to import dm_control environments ({e}).\nThis test will be skipped\n") + return + + subprocess.run(command, shell=True, check=True) diff --git a/tests/test_examples_gym.py b/tests/test_examples_gym.py new file mode 100644 index 00000000..50dcb759 --- /dev/null +++ b/tests/test_examples_gym.py @@ -0,0 +1,26 @@ +import pytest +import warnings +import hypothesis +import hypothesis.strategies as st + +import os +import subprocess + +EXAMPLE_DIR = "gym" +SCRIPTS = ["ddpg_gym_pendulum.py", + "cem_gym_cartpole.py", + "dqn_gym_cartpole.py", + "q_learning_gym_frozen_lake.py"] +EXAMPLES_DIR = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "docs", "source", "examples")) +COMMANDS = [f"python {os.path.join(EXAMPLES_DIR, EXAMPLE_DIR, script)}" for script in SCRIPTS] + + +@pytest.mark.parametrize("command", COMMANDS) +def test_scripts(capsys, command): + try: + import gym + except ImportError as e: + warnings.warn(f"\n\nUnable to import gym ({e}).\nThis test will be skipped\n") + return + + subprocess.run(command, shell=True, check=True) diff --git a/tests/test_examples_gymnasium.py b/tests/test_examples_gymnasium.py new file mode 100644 index 00000000..fcacc402 --- /dev/null +++ b/tests/test_examples_gymnasium.py @@ -0,0 +1,26 @@ +import pytest +import warnings +import hypothesis +import hypothesis.strategies as st + +import os +import subprocess + +EXAMPLE_DIR = "gymnasium" +SCRIPTS = ["ddpg_gymnasium_pendulum.py", + "cem_gymnasium_cartpole.py", + "dqn_gymnasium_cartpole.py", + "q_learning_gymnasium_frozen_lake.py"] +EXAMPLES_DIR = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "docs", "source", "examples")) +COMMANDS = [f"python {os.path.join(EXAMPLES_DIR, EXAMPLE_DIR, script)}" for script in SCRIPTS] + + +@pytest.mark.parametrize("command", COMMANDS) +def test_scripts(capsys, command): + try: + import gymnasium + except ImportError as e: + warnings.warn(f"\n\nUnable to import gymnasium ({e}).\nThis test will be skipped\n") + return + + subprocess.run(command, shell=True, check=True) diff --git a/tests/test_examples_isaacgym.py 
new file mode 100644
index 00000000..408cfc24
--- /dev/null
+++ b/tests/test_examples_isaacgym.py
@@ -0,0 +1,24 @@
+import pytest
+import warnings
+import hypothesis
+import hypothesis.strategies as st
+
+import os
+import subprocess
+
+EXAMPLE_DIR = "isaacgym"
+SCRIPTS = ["ppo_cartpole.py",
+           "trpo_cartpole.py"]
+EXAMPLES_DIR = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "docs", "source", "examples"))
+COMMANDS = [f"python {os.path.join(EXAMPLES_DIR, EXAMPLE_DIR, script)} headless=True num_envs=64" for script in SCRIPTS]
+
+
+@pytest.mark.parametrize("command", COMMANDS)
+def test_scripts(capsys, command):
+    try:
+        import isaacgymenvs
+    except ImportError as e:
+        warnings.warn(f"\n\nUnable to import isaacgymenvs ({e}).\nThis test will be skipped\n")
+        return
+
+    subprocess.run(command, shell=True, check=True)
diff --git a/tests/test_examples_isaacsim.py b/tests/test_examples_isaacsim.py
new file mode 100644
index 00000000..0215597e
--- /dev/null
+++ b/tests/test_examples_isaacsim.py
@@ -0,0 +1,27 @@
+import pytest
+import warnings
+import hypothesis
+import hypothesis.strategies as st
+
+import os
+import subprocess
+
+# See the following link for Omniverse Isaac Sim Python environment
+# https://docs.omniverse.nvidia.com/app_isaacsim/app_isaacsim/install_python.html
+PYTHON_ENVIRONMENT = "./python.sh"
+
+EXAMPLE_DIR = "isaacsim"
+SCRIPTS = ["cartpole_example_skrl.py"]
+EXAMPLES_DIR = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "docs", "source", "examples"))
+COMMANDS = [f"{PYTHON_ENVIRONMENT} {os.path.join(EXAMPLES_DIR, EXAMPLE_DIR, script)}" for script in SCRIPTS]
+
+
+@pytest.mark.parametrize("command", COMMANDS)
+def test_scripts(capsys, command):
+    try:
+        from omni.isaac.kit import SimulationApp
+    except ImportError as e:
+        warnings.warn(f"\n\nUnable to import SimulationApp ({e}).\nThis test will be skipped\n")
+        return
+
+    subprocess.run(command, shell=True, check=True)
diff --git a/tests/test_examples_omniisaacgym.py b/tests/test_examples_omniisaacgym.py
new file mode 100644
index 00000000..cb2571c7
--- /dev/null
+++ b/tests/test_examples_omniisaacgym.py
@@ -0,0 +1,27 @@
+import pytest
+import warnings
+import hypothesis
+import hypothesis.strategies as st
+
+import os
+import subprocess
+
+# See the following link for Omniverse Isaac Sim Python environment
+# https://docs.omniverse.nvidia.com/app_isaacsim/app_isaacsim/install_python.html
+PYTHON_ENVIRONMENT = "./python.sh"
+
+EXAMPLE_DIR = "omniisaacgym"
+SCRIPTS = ["ppo_cartpole.py"]
+EXAMPLES_DIR = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "docs", "source", "examples"))
+COMMANDS = [f"{PYTHON_ENVIRONMENT} {os.path.join(EXAMPLES_DIR, EXAMPLE_DIR, script)} headless=True num_envs=64" for script in SCRIPTS]
+
+
+@pytest.mark.parametrize("command", COMMANDS)
+def test_scripts(capsys, command):
+    try:
+        import omniisaacgymenvs
+    except ImportError as e:
+        warnings.warn(f"\n\nUnable to import omniisaacgymenvs ({e}).\nThis test will be skipped\n")
+        return
+
+    subprocess.run(command, shell=True, check=True)
diff --git a/tests/test_examples_robosuite.py b/tests/test_examples_robosuite.py
new file mode 100644
index 00000000..a66ffbd3
--- /dev/null
+++ b/tests/test_examples_robosuite.py
@@ -0,0 +1,23 @@
+import pytest
+import warnings
+import hypothesis
+import hypothesis.strategies as st
+
+import os
+import subprocess
+
+EXAMPLE_DIR = "robosuite"
+SCRIPTS = []
+EXAMPLES_DIR = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "docs", "source", "examples"))
+COMMANDS = [f"python {os.path.join(EXAMPLES_DIR, EXAMPLE_DIR, script)}" for script in SCRIPTS]
+
+
+@pytest.mark.parametrize("command", COMMANDS)
+def test_scripts(capsys, command):
+    try:
+        import gym
+    except ImportError as e:
+        warnings.warn(f"\n\nUnable to import gym ({e}).\nThis test will be skipped\n")
+        return
+
+    subprocess.run(command, shell=True, check=True)
diff --git a/tests/test_examples_shimmy.py b/tests/test_examples_shimmy.py
new file mode 100644
index 00000000..4ed69cfd
--- /dev/null
+++ b/tests/test_examples_shimmy.py
@@ -0,0 +1,25 @@
+import pytest
+import warnings
+import hypothesis
+import hypothesis.strategies as st
+
+import os
+import subprocess
+
+EXAMPLE_DIR = "shimmy"
+SCRIPTS = ["dqn_shimmy_atari_pong.py",
+           "sac_shimmy_dm_control_acrobot_swingup_sparse.py",
+           "ddpg_openai_gym_compatibility_pendulum.py"]
+EXAMPLES_DIR = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "docs", "source", "examples"))
+COMMANDS = [f"python {os.path.join(EXAMPLES_DIR, EXAMPLE_DIR, script)}" for script in SCRIPTS]
+
+
+@pytest.mark.parametrize("command", COMMANDS)
+def test_scripts(capsys, command):
+    try:
+        import shimmy
+    except ImportError as e:
+        warnings.warn(f"\n\nUnable to import shimmy ({e}).\nThis test will be skipped\n")
+        return
+
+    subprocess.run(command, shell=True, check=True)
diff --git a/tests/test_memories.py b/tests/test_memories.py
new file mode 100644
index 00000000..99204cf8
--- /dev/null
+++ b/tests/test_memories.py
@@ -0,0 +1,71 @@
+import pytest
+import warnings
+import hypothesis
+import hypothesis.strategies as st
+
+import string
+
+import torch
+
+from skrl.memories.torch import Memory
+from skrl.memories.torch import RandomMemory
+
+
+@pytest.fixture
+def classes_and_kwargs():
+    return [(RandomMemory, {})]
+
+
+@pytest.mark.parametrize("device", [None, "cpu", "cuda:0"])
+def test_device(capsys, classes_and_kwargs, device):
+    _device = torch.device(device) if device is not None else torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+
+    for klass, kwargs in classes_and_kwargs:
+        try:
+            memory: Memory = klass(memory_size=1, device=device, **kwargs)
+        except (RuntimeError, AssertionError) as e:
+            with capsys.disabled():
+                print(e)
+            warnings.warn(f"Invalid device: {device}. This test will be skipped")
+            continue
+
+        assert memory.device == _device # defined device
+
+@hypothesis.given(names=st.sets(st.text(alphabet=string.ascii_letters + string.digits + "_", min_size=1, max_size=10), min_size=1, max_size=10))
+@hypothesis.settings(suppress_health_check=[hypothesis.HealthCheck.function_scoped_fixture], deadline=None)
+def test_create_tensors(capsys, classes_and_kwargs, names):
+    for klass, kwargs in classes_and_kwargs:
+        memory: Memory = klass(memory_size=1, **kwargs)
+
+        for name in names:
+            memory.create_tensor(name=name, size=1, dtype=torch.float32)
+
+        assert memory.get_tensor_names() == sorted(names)
+
+@hypothesis.given(memory_size=st.integers(min_value=1, max_value=100),
+                  num_envs=st.integers(min_value=1, max_value=10),
+                  num_samples=st.integers(min_value=1, max_value=500))
+@hypothesis.settings(suppress_health_check=[hypothesis.HealthCheck.function_scoped_fixture], deadline=None)
+def test_add_samples(capsys, classes_and_kwargs, memory_size, num_envs, num_samples):
+    for klass, kwargs in classes_and_kwargs:
+        memory: Memory = klass(memory_size=memory_size, num_envs=num_envs, **kwargs)
+
+        memory.create_tensor(name="tensor_1", size=1, dtype=torch.float32)
+        memory.create_tensor(name="tensor_2", size=2, dtype=torch.float32)
+
+        # memory_index
+        for _ in range(num_samples):
+            memory.add_samples(tensor_1=torch.zeros((num_envs, 1)))
+
+        assert memory.memory_index == num_samples % memory_size
+        assert memory.filled == (num_samples >= memory_size)
+
+        memory.reset()
+
+        # memory_index, env_index
+        for _ in range(num_samples):
+            memory.add_samples(tensor_2=torch.zeros((2,)))
+
+        assert memory.memory_index == (num_samples // num_envs) % memory_size
+        assert memory.env_index == num_samples % num_envs
+        assert memory.filled == (num_samples >= memory_size * num_envs)
diff --git a/tests/test_model_instantiators.py b/tests/test_model_instantiators.py
new file mode 100644
index 00000000..923e09d2
--- /dev/null
+++ b/tests/test_model_instantiators.py
@@ -0,0 +1,24 @@
+import pytest
+import warnings
+import hypothesis
+import hypothesis.strategies as st
+
+import torch
+
+from skrl.models.torch import Model
+
+from skrl.utils.model_instantiators import Shape
+from skrl.utils.model_instantiators import categorical_model
+from skrl.utils.model_instantiators import deterministic_model
+from skrl.utils.model_instantiators import gaussian_model
+from skrl.utils.model_instantiators import multivariate_gaussian_model
+
+
+@pytest.fixture
+def classes_and_kwargs():
+    return []
+
+
+@pytest.mark.parametrize("device", [None, "cpu", "cuda:0"])
+def test_device(capsys, classes_and_kwargs, device):
+    _device = torch.device(device) if device is not None else torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
diff --git a/tests/test_noises_gaussian.py b/tests/test_noises_gaussian.py
deleted file mode 100644
index f2dfc0b2..00000000
--- a/tests/test_noises_gaussian.py
+++ /dev/null
@@ -1,78 +0,0 @@
-import unittest
-import math
-
-import torch
-
-from skrl.resources.noises.torch import GaussianNoise
-
-
-class TestCase(unittest.TestCase):
-    def setUp(self):
-        self.devices = ['cpu', 'cuda:0']
-
-        self.sizes = [(1000, 2), [2000, 10, 1], torch.Size([3000])]
-        self.means = (10 * (torch.rand(len(self.sizes)) + 0.5) * torch.sign(torch.rand(len(self.sizes)) - 0.5)).tolist()
-        self.stds = (10 * (torch.rand(len(self.sizes)) + 0.1)).tolist() # positive non-zero values
-
-    def tearDown(self):
-        pass
-
-    def test_devices(self):
-        for device in self.devices:
-            noise = GaussianNoise(mean=0, std=1.0, device=device)
-            self.assertEqual(noise.device, torch.device(device))
-
-    def test_method_sample(self):
-        for mean, std in zip(self.means, self.stds):
-            # create noise
-            noise = GaussianNoise(mean=mean, std=std, device='cpu')
-            # iterate over all sizes
-            for size in self.sizes:
-                # iterate 10 times
-                for i in range(10):
-                    # sample noise
-                    output = noise.sample(size)
-                    # check output
-                    _mean = output.mean().item()
-                    _std = output.std().item()
-                    self.assertTrue(math.isclose(_mean, mean, rel_tol=abs(mean) * 0.25))
-                    self.assertTrue(math.isclose(_std, std, rel_tol=std * 0.25))
-                    # check shape
-                    self.assertEqual(output.size(), torch.Size(size))
-
-    def test_method_sample_like(self):
-        for mean, std in zip(self.means, self.stds):
-            # create noise
-            noise = GaussianNoise(mean=mean, std=std, device='cpu')
-            # iterate over all sizes
-            for size in self.sizes:
-                # create tensor
-                tensor = torch.rand(size)
-                # iterate 10 times
-                for i in range(10):
-                    # sample noise
-                    output = noise.sample_like(tensor)
-                    # check output
-                    _mean = output.mean().item()
-                    _std = output.std().item()
-                    self.assertTrue(math.isclose(_mean, mean, rel_tol=abs(mean) * 0.25))
-                    self.assertTrue(math.isclose(_std, std, rel_tol=std * 0.25))
-                    # check shape
-                    self.assertEqual(output.size(), torch.Size(size))
-
-
-if __name__ == '__main__':
-    import sys
-
-    if not sys.argv[-1] == '--debug':
-        raise RuntimeError('Test can only be runned manually with --debug flag')
-
-    test = TestCase()
-    test.setUp()
-    for method in dir(test):
-        if method.startswith('test_'):
-            print('Running test: {}'.format(method))
-            getattr(test, method)()
-    test.tearDown()
-
-    print('All tests passed.')
diff --git a/tests/test_noises_ornstein_uhlenbeck.py b/tests/test_noises_ornstein_uhlenbeck.py
deleted file mode 100644
index 6c7241b4..00000000
--- a/tests/test_noises_ornstein_uhlenbeck.py
+++ /dev/null
@@ -1,69 +0,0 @@
-import unittest
-import math
-
-import torch
-
-from skrl.resources.noises.torch import OrnsteinUhlenbeckNoise
-
-
-class TestCase(unittest.TestCase):
-    def setUp(self):
-        self.devices = ['cpu', 'cuda:0']
-
-        self.sizes = [(1000, 2), [2000, 10, 1], torch.Size([3000])]
-        self.thetas = (10 * (torch.rand(len(self.sizes)) + 0.5)).tolist() # positive non-zero values
-        self.sigmas = (10 * (torch.rand(len(self.sizes)) + 0.5)).tolist() # positive non-zero values
-        self.base_scales = (10 * (torch.rand(len(self.sizes)) + 0.5)).tolist() # positive non-zero values
-
-    def tearDown(self):
-        pass
-
-    def test_devices(self):
-        for device in self.devices:
-            noise = OrnsteinUhlenbeckNoise(theta=0.15, sigma=0.1, base_scale=1.0, device=device)
-            self.assertEqual(noise.device, torch.device(device))
-
-    def test_method_sample(self):
-        for theta, sigma, base_scale in zip(self.thetas, self.sigmas, self.base_scales):
-            # create noise
-            noise = OrnsteinUhlenbeckNoise(theta=theta, sigma=sigma, base_scale=base_scale, device='cpu')
-            # iterate over all sizes
-            for size in self.sizes:
-                # iterate 10 times
-                for i in range(10):
-                    # sample noise
-                    output = noise.sample(size)
-                    # check shape
-                    self.assertEqual(output.size(), torch.Size(size))
-
-    def test_method_sample_like(self):
-        for theta, sigma, base_scale in zip(self.thetas, self.sigmas, self.base_scales):
-            # create noise
-            noise = OrnsteinUhlenbeckNoise(theta=theta, sigma=sigma, base_scale=base_scale, device='cpu')
-            # iterate over all sizes
-            for size in self.sizes:
-                # create tensor
-                tensor = torch.rand(size)
-                # iterate 10 times
-                for i in range(10):
-                    # sample noise
-                    output = noise.sample_like(tensor)
-                    # check shape
-                    self.assertEqual(output.size(), torch.Size(size))
-
-
-if __name__ == '__main__':
-    import sys
-
-    if not sys.argv[-1] == '--debug':
-        raise RuntimeError('Test can only be runned manually with --debug flag')
-
-    test = TestCase()
-    test.setUp()
-    for method in dir(test):
-        if method.startswith('test_'):
-            print('Running test: {}'.format(method))
-            getattr(test, method)()
-    test.tearDown()
-
-    print('All tests passed.')
diff --git a/tests/test_resources_noises.py b/tests/test_resources_noises.py
new file mode 100644
index 00000000..f9dcedba
--- /dev/null
+++ b/tests/test_resources_noises.py
@@ -0,0 +1,49 @@
+import pytest
+import warnings
+import hypothesis
+import hypothesis.strategies as st
+
+import torch
+
+from skrl.resources.noises.torch import Noise
+from skrl.resources.noises.torch import GaussianNoise
+from skrl.resources.noises.torch import OrnsteinUhlenbeckNoise
+
+
+@pytest.fixture
+def classes_and_kwargs():
+    return [(GaussianNoise, {"mean": 0, "std": 1}),
+            (OrnsteinUhlenbeckNoise, {"theta": 0.1, "sigma": 0.2, "base_scale": 0.3})]
+
+
+@pytest.mark.parametrize("device", [None, "cpu", "cuda:0"])
+def test_device(capsys, classes_and_kwargs, device):
+    _device = torch.device(device) if device is not None else torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+
+    for klass, kwargs in classes_and_kwargs:
+        try:
+            noise: Noise = klass(device=device, **kwargs)
+        except (RuntimeError, AssertionError) as e:
+            with capsys.disabled():
+                print(e)
+            warnings.warn(f"Invalid device: {device}. This test will be skipped")
+            continue
+
+        output = noise.sample((1,))
+        assert noise.device == _device # defined device
+        assert output.device == _device # runtime device
+
+@hypothesis.given(size=st.lists(st.integers(min_value=1, max_value=10), max_size=5))
+@hypothesis.settings(suppress_health_check=[hypothesis.HealthCheck.function_scoped_fixture], deadline=None)
+def test_sample(capsys, classes_and_kwargs, size):
+    for klass, kwargs in classes_and_kwargs:
+        noise: Noise = klass(**kwargs)
+
+        # sample
+        output = noise.sample(size)
+        assert output.size() == torch.Size(size)
+
+        # sample like
+        tensor = torch.rand(size, device="cpu")
+        output = noise.sample_like(tensor)
+        assert output.size() == torch.Size(size)
diff --git a/tests/test_resources_preprocessors.py b/tests/test_resources_preprocessors.py
new file mode 100644
index 00000000..1c94d139
--- /dev/null
+++ b/tests/test_resources_preprocessors.py
@@ -0,0 +1,45 @@
+import pytest
+import warnings
+import hypothesis
+import hypothesis.strategies as st
+
+import gym
+import gymnasium
+
+import torch
+
+from skrl.resources.preprocessors.torch import RunningStandardScaler
+
+
+@pytest.fixture
+def classes_and_kwargs():
+    return [(RunningStandardScaler, {"size": 1})]
+
+
+@pytest.mark.parametrize("device", [None, "cpu", "cuda:0"])
+def test_device(capsys, classes_and_kwargs, device):
+    _device = torch.device(device) if device is not None else torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+
+    for klass, kwargs in classes_and_kwargs:
+        try:
+            preprocessor = klass(device=device, **kwargs)
+        except (RuntimeError, AssertionError) as e:
+            with capsys.disabled():
+                print(e)
+            warnings.warn(f"Invalid device: {device}. This test will be skipped")
+            continue
+
+        assert preprocessor.device == _device # defined device
+        assert preprocessor(torch.ones(kwargs["size"], device=_device)).device == _device # runtime device
+
+@pytest.mark.parametrize("space_and_size", [(gym.spaces.Box(low=-1, high=1, shape=(2, 3)), 6),
+                                            (gymnasium.spaces.Box(low=-1, high=1, shape=(2, 3)), 6),
+                                            (gym.spaces.Discrete(n=3), 1),
+                                            (gymnasium.spaces.Discrete(n=3), 1)])
+def test_forward(capsys, classes_and_kwargs, space_and_size):
+    for klass, kwargs in classes_and_kwargs:
+        space, size = space_and_size
+        preprocessor = klass(size=space, device="cpu")
+
+        output = preprocessor(torch.rand((10, size), device="cpu"))
+        assert output.shape == torch.Size((10, size))
diff --git a/tests/test_resources_schedulers.py b/tests/test_resources_schedulers.py
new file mode 100644
index 00000000..9f2840bb
--- /dev/null
+++ b/tests/test_resources_schedulers.py
@@ -0,0 +1,22 @@
+import pytest
+import warnings
+import hypothesis
+import hypothesis.strategies as st
+
+import torch
+
+from skrl.resources.schedulers.torch import KLAdaptiveRL
+
+
+@pytest.fixture
+def classes_and_kwargs():
+    return [(KLAdaptiveRL, {})]
+
+
+@pytest.mark.parametrize("optimizer", [torch.optim.Adam([torch.ones((1,))], lr=0.1),
+                                       torch.optim.SGD([torch.ones((1,))], lr=0.1)])
+def test_step(capsys, classes_and_kwargs, optimizer):
+    for klass, kwargs in classes_and_kwargs:
+        scheduler = klass(optimizer, **kwargs)
+
+        scheduler.step(0.0)