SMODICE Algorithm #1583

Open · wants to merge 5 commits into base: pytorch
1 change: 1 addition & 0 deletions README.md
@@ -35,6 +35,7 @@ Read the ALF documentation [here](https://alf.readthedocs.io/).
|[MuZero](alf/algorithms/muzero_algorithm.py)|Model-based RL|Schrittwieser et al. "Mastering Atari, Go, Chess and Shogi by Planning with a Learned Model" [arXiv:1911.08265](https://arxiv.org/abs/1911.08265)|
|[BC](alf/algorithms/bc_algorithm.py)|Offline RL|Pomerleau "ALVINN: An Autonomous Land Vehicle in a Neural Network" [NeurIPS 1988](https://papers.nips.cc/paper/1988/hash/812b4ba287f5ee0bc9d43bbf5bbe87fb-Abstract.html) <br> Bain et al. "A framework for behavioural cloning" [Machine Intelligence 1999](http://www.cse.unsw.edu.au/~claude/papers/MI15.pdf)|
|[Causal BC](alf/algorithms/causal_bc_algorithm.py)|Offline RL|Swamy et al. "Causal Imitation Learning under Temporally Correlated Noise" [ICML2022](https://proceedings.mlr.press/v162/swamy22a/swamy22a.pdf)|
|[SMODICE](alf/algorithms/smodice_algorithm.py)|Offline RL|Ma et al. "Versatile Offline Imitation Learning via State Occupancy Matching" [ICML2022](https://arxiv.org/abs/2202.02433)|
|[IQL](alf/algorithms/iql_algorithm.py)|Offline RL|Kostrikov, et al. "Offline Reinforcement Learning with Implicit Q-Learning" [arXiv:2110.06169](https://arxiv.org/abs/2110.06169)|
|[SEditor](alf/algorithms/seditor_algorithm.py)|Offline/Safe RL|Yu et al. "Towards Safe Reinforcement Learning with a Safety Editor Policy" [NeurIPS 2022](https://proceedings.neurips.cc/paper_files/paper/2022/file/11afefdd848d1bc9ac9f1604d9f45817-Paper-Conference.pdf)|
|[MERLIN](alf/algorithms/merlin_algorithm.py)|Unsupervised learning|Wayne et al. "Unsupervised Predictive Memory in a Goal-Directed Agent" [arXiv:1803.10760](https://arxiv.org/abs/1803.10760)|
5 changes: 4 additions & 1 deletion alf/algorithms/algorithm.py
@@ -2132,7 +2132,10 @@ def _hybrid_update(self, experience, batch_info, offline_experience,
        else:
            loss_info = offline_loss_info

-       params = self._backward_and_gradient_update(loss_info.loss * weight)
+       params, gns = self._backward_and_gradient_update(
+           loss_info.loss * weight)
+
+       loss_info = loss_info._replace(gns=gns)

        if self._RL_train:
            # for now, there is no need to do a hybrid after update
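The hunk above makes `_backward_and_gradient_update` return a gradient statistic (presumably a gradient-noise-scale estimate) alongside the updated parameters, and attaches it to the existing `LossInfo` via `_replace`. A minimal sketch of that pattern, with a hypothetical stand-in for ALF's `LossInfo` (only the `gns` field mirrors the diff; everything else is illustrative):

```python
from collections import namedtuple

# Illustrative stand-in for ALF's LossInfo namedtuple.
LossInfo = namedtuple('LossInfo', ['loss', 'extra', 'gns'], defaults=(None, None, None))


def backward_and_gradient_update(loss):
    # Stand-in for Algorithm._backward_and_gradient_update: after this PR it
    # returns the updated parameters together with a gradient statistic.
    params, gns = [], 0.0
    return params, gns


loss_info = LossInfo(loss=1.23)
params, gns = backward_and_gradient_update(loss_info.loss)
# _replace returns a new namedtuple with only the named field changed, which is
# how the hunk above attaches the statistic to the existing loss_info.
loss_info = loss_info._replace(gns=gns)
```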
418 changes: 418 additions & 0 deletions alf/algorithms/smodice_algorithm.py

Large diffs are not rendered by default.
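The new `alf/algorithms/smodice_algorithm.py` (418 lines) is the core of this PR, but its diff is not rendered. For orientation, below is a minimal sketch of the SMODICE losses in the KL form described by Ma et al. (arXiv:2202.02433): a discriminator trained to separate expert from offline states defines a reward r(s) = log D(s) − log(1 − D(s)), the value network minimizes the dual objective (1 − γ) E[V(s0)] + log E[exp(r(s) + γV(s′) − V(s))] over the offline data, and the actor is updated by behavior cloning weighted by a softmax of that advantage. All names and shapes here are illustrative assumptions, not the ALF implementation (which, per the example configs below, uses a state-action `CriticNetwork` as the discriminator and may differ in other details).

```python
import math

import torch
import torch.nn.functional as F

GAMMA = 0.99  # discount; illustrative value


def discriminator_loss(discriminator, expert_obs, offline_obs):
    # Train D(s) (a logit-producing network) toward 1 on expert states and
    # 0 on offline states.
    expert_logits = discriminator(expert_obs).squeeze(-1)
    offline_logits = discriminator(offline_obs).squeeze(-1)
    return (F.binary_cross_entropy_with_logits(
        expert_logits, torch.ones_like(expert_logits)) +
            F.binary_cross_entropy_with_logits(
                offline_logits, torch.zeros_like(offline_logits)))


def reward(discriminator, obs):
    # r(s) = log D(s) - log(1 - D(s)), an estimate of log(d_expert(s) / d_offline(s)).
    logits = discriminator(obs).squeeze(-1)
    return F.logsigmoid(logits) - F.logsigmoid(-logits)


def value_loss(value_net, discriminator, init_obs, obs, next_obs):
    # (1 - gamma) * E_{s0}[V(s0)] + log E_{(s, s')}[exp(r(s) + gamma V(s') - V(s))]
    r = reward(discriminator, obs).detach()
    adv = r + GAMMA * value_net(next_obs).squeeze(-1) - value_net(obs).squeeze(-1)
    log_mean_exp = torch.logsumexp(adv, dim=0) - math.log(adv.numel())
    return (1 - GAMMA) * value_net(init_obs).mean() + log_mean_exp


def actor_loss(actor, value_net, discriminator, obs, action, next_obs):
    # Weighted behavior cloning: weights are a softmax of the (detached)
    # advantage over the batch; actor(obs) is assumed to return a
    # torch.distributions object.
    with torch.no_grad():
        adv = (reward(discriminator, obs)
               + GAMMA * value_net(next_obs).squeeze(-1)
               - value_net(obs).squeeze(-1))
        w = torch.softmax(adv, dim=0)
    return -(w * actor(obs).log_prob(action)).sum()
```

Each of these three losses would be driven by its own optimizer, which is consistent with the separate actor/value/discriminator optimizers configured in the example conf files below.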

5 changes: 5 additions & 0 deletions alf/bin/train_play_test.py
@@ -697,6 +697,11 @@ def test_causal_bc_pendulum(self):
            conf_file='./hybrid_rl/causal_bc_pendulum_conf.py',
            extra_train_params=OFF_POLICY_TRAIN_PARAMS)

+   def test_smodice_pendulum(self):
+       self._test(
+           conf_file='./smodice_pendulum_conf.py',
+           extra_train_params=OFF_POLICY_TRAIN_PARAMS)
+
    def test_iql_pendulum(self):
        self._test(
            conf_file='./hybrid_rl/iql_pendulum_conf.py',
1 change: 1 addition & 0 deletions alf/environments/make_penv.py
@@ -30,6 +30,7 @@ def gen_penv():
cmd = (f"g++ -O3 -Wall -shared -std=c++17 -fPIC -fvisibility=hidden "
f"`{python} -m pybind11 --includes` parallel_environment.cpp "
f"-o _penv`{python}-config --extension-suffix` -lrt")

ret = os.system(cmd)
assert ret == 0, "Fail to execute " + cmd

2 changes: 1 addition & 1 deletion alf/examples/data_collection_carla_conf.py
@@ -27,7 +27,7 @@
# This is an example config file for data collection in CARLA.

# the desired replay buffer size for collection
-# 100 is just an example. Should set it to he actual desired size.
+# 100 is just an example. Should set it to the actual desired size.
replay_buffer_length = 100

# the desired environment for data collection
77 changes: 77 additions & 0 deletions alf/examples/smodice_bipedal_walker_conf.py
@@ -0,0 +1,77 @@
# Copyright (c) 2022 Horizon Robotics and ALF Contributors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from functools import partial
import torch

import alf
from alf.algorithms.agent import Agent
from alf.algorithms.smodice_algorithm import SmodiceAlgorithm
from alf.utils import math_ops

from alf.examples import sac_bipedal_walker_conf

# default params
lr = 1e-4
encoding_dim = 256
fc_layers_params = (encoding_dim, ) * 2
activation = torch.relu_

offline_buffer_length = None
offline_buffer_dir = [
"/home/haichaozhang/data/DATA/sac_bipedal_baseline/train/algorithm/ckpt-80000-replay_buffer"
]

alf.config('Agent', rl_algorithm_cls=SmodiceAlgorithm, optimizer=None)

proj_net = partial(
alf.networks.StableNormalProjectionNetwork,
state_dependent_std=True,
squash_mean=False,
scale_distribution=True,
min_std=1e-3,
max_std=10)

actor_network_cls = partial(
alf.networks.ActorDistributionNetwork,
fc_layer_params=fc_layers_params,
activation=activation,
continuous_projection_net_ctor=proj_net)

v_network_cls = partial(
alf.networks.ValueNetwork,
fc_layer_params=fc_layers_params,
activation=activation)

action_spec = alf.get_action_spec()
discriminator_network_cls = partial(
alf.networks.CriticNetwork, joint_fc_layer_params=fc_layers_params)

alf.config(
'SmodiceAlgorithm',
actor_network_cls=actor_network_cls,
v_network_cls=v_network_cls,
discriminator_network_cls=discriminator_network_cls,
actor_optimizer=alf.optimizers.Adam(lr=lr),
# add weight decay to the v_net, following the SMODICE paper
value_optimizer=alf.optimizers.Adam(lr=lr, weight_decay=1e-4),
discriminator_optimizer=alf.optimizers.Adam(lr=lr),
gradient_penalty_weight=0.1,
)

# training config
alf.config(
"TrainerConfig",
offline_buffer_dir=offline_buffer_dir,
offline_buffer_length=offline_buffer_length)
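The `gradient_penalty_weight=0.1` setting above points to a gradient penalty on the discriminator, which SMODICE uses to regularize its training. Below is a minimal sketch of the usual interpolation-based penalty (WGAN-GP style); it assumes a state-only discriminator for brevity, whereas the `CriticNetwork` configured here takes (observation, action) pairs, and the exact penalty form in `smodice_algorithm.py` may differ (e.g., penalizing ‖∇‖² rather than (‖∇‖ − 1)²).

```python
import torch


def discriminator_gradient_penalty(discriminator, expert_obs, offline_obs,
                                   weight=0.1):
    # Evaluate the discriminator on random interpolations of expert and
    # offline states and penalize the gradient norm there; `weight` plays
    # the role of gradient_penalty_weight in the config above.
    alpha = torch.rand(expert_obs.size(0), 1)
    mixed = alpha * expert_obs + (1 - alpha) * offline_obs
    mixed.requires_grad_(True)
    out = discriminator(mixed).sum()
    grad, = torch.autograd.grad(out, mixed, create_graph=True)
    return weight * ((grad.norm(2, dim=-1) - 1.0)**2).mean()
```

In training, this term would simply be added to the binary cross-entropy discriminator loss sketched earlier.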
103 changes: 103 additions & 0 deletions alf/examples/smodice_pendulum_conf.py
@@ -0,0 +1,103 @@
# Copyright (c) 2022 Horizon Robotics and ALF Contributors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from functools import partial
import torch

import alf
from alf.algorithms.agent import Agent
from alf.algorithms.smodice_algorithm import SmodiceAlgorithm
from alf.utils import math_ops

# default params
lr = 1e-4
encoding_dim = 256
fc_layers_params = (encoding_dim, ) * 2
activation = torch.relu_

offline_buffer_length = None
offline_buffer_dir = [
"./hybrid_rl/replay_buffer_data/pendulum_replay_buffer_from_sac_10k"
]

env_name = "Pendulum-v0"

alf.config(
"create_environment", env_name=env_name, num_parallel_environments=1)

alf.config('Agent', rl_algorithm_cls=SmodiceAlgorithm)

alf.config(
'TrainerConfig',
algorithm_ctor=Agent,
whole_replay_buffer_training=False,
clear_replay_buffer=False)

proj_net = partial(
alf.networks.StableNormalProjectionNetwork,
state_dependent_std=True,
squash_mean=False,
scale_distribution=True,
min_std=1e-3,
max_std=10)

actor_network_cls = partial(
alf.networks.ActorDistributionNetwork,
fc_layer_params=fc_layers_params,
activation=activation,
continuous_projection_net_ctor=proj_net)

v_network_cls = partial(
alf.networks.ValueNetwork,
fc_layer_params=fc_layers_params,
activation=activation)

action_spec = alf.get_action_spec()
discriminator_network_cls = partial(
alf.networks.CriticNetwork, joint_fc_layer_params=fc_layers_params)

alf.config(
'SmodiceAlgorithm',
actor_network_cls=actor_network_cls,
v_network_cls=v_network_cls,
discriminator_network_cls=discriminator_network_cls,
actor_optimizer=alf.optimizers.Adam(lr=lr),
# add weight decay to the v_net, following the SMODICE paper
value_optimizer=alf.optimizers.Adam(lr=lr, weight_decay=1e-4),
discriminator_optimizer=alf.optimizers.Adam(lr=lr),
gradient_penalty_weight=0.1,
)

num_iterations = 1000000

# training config
alf.config(
"TrainerConfig",
initial_collect_steps=1000,
num_updates_per_train_iter=1,
num_iterations=num_iterations,
# setting rl_train_after_update_steps larger than num_iterations would
# disable RL training; it is 0 here so that RL and offline training run jointly
rl_train_after_update_steps=0,
mini_batch_size=256,
mini_batch_length=2,
unroll_length=1,
offline_buffer_dir=offline_buffer_dir,
offline_buffer_length=offline_buffer_length,
num_checkpoints=1,
debug_summaries=True,
evaluate=True,
eval_interval=1000,
num_eval_episodes=3,
)
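A note on the `TrainerConfig` above: `mini_batch_length=2` makes each sampled mini-batch contain two consecutive time steps, which is what allows forming the (s, a, s′) transitions consumed by the value backup and the weighted-BC update sketched earlier. The layout below ([batch, time, ...]) is an assumption about how the replay buffer delivers experience, for illustration only.

```python
import torch

# Illustrative shapes: Pendulum-v0 has a 3-dim observation and a 1-dim action.
batch_size, obs_dim, act_dim = 256, 3, 1
observation = torch.randn(batch_size, 2, obs_dim)  # time axis == mini_batch_length
action = torch.randn(batch_size, 2, act_dim)

obs = observation[:, 0]       # s_t
act = action[:, 0]            # a_t
next_obs = observation[:, 1]  # s_{t+1}
# (obs, act, next_obs) is exactly what the value and weighted-BC losses above consume.
```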