SMODICE Algorithm #1583

Open · wants to merge 5 commits into base: pytorch
1 change: 1 addition & 0 deletions README.md
@@ -35,6 +35,7 @@ Read the ALF documentation [here](https://alf.readthedocs.io/).
|[MuZero](alf/algorithms/muzero_algorithm.py)|Model-based RL|Schrittwieser et al. "Mastering Atari, Go, Chess and Shogi by Planning with a Learned Model" [arXiv:1911.08265](https://arxiv.org/abs/1911.08265)|
|[BC](alf/algorithms/bc_algorithm.py)|Offline RL|Pomerleau "ALVINN: An Autonomous Land Vehicle in a Neural Network" [NeurIPS 1988](https://papers.nips.cc/paper/1988/hash/812b4ba287f5ee0bc9d43bbf5bbe87fb-Abstract.html) <br> Bain et al. "A framework for behavioural cloning" [Machine Intelligence 1999](http://www.cse.unsw.edu.au/~claude/papers/MI15.pdf)|
|[Causal BC](alf/algorithms/causal_bc_algorithm.py)|Offline RL|Swamy et al. "Causal Imitation Learning under Temporally Correlated Noise" [ICML2022](https://proceedings.mlr.press/v162/swamy22a/swamy22a.pdf)|
|[SMODICE](alf/algorithms/smodice_algorithm.py)|Offline RL|Ma et al. "Versatile Offline Imitation Learning via State Occupancy Matching" [ICML2022](https://arxiv.org/abs/2202.02433)|
|[IQL](alf/algorithms/iql_algorithm.py)|Offline RL|Kostrikov, et al. "Offline Reinforcement Learning with Implicit Q-Learning" [arXiv:2110.06169](https://arxiv.org/abs/2110.06169)|
|[SEditor](alf/algorithms/seditor_algorithm.py)|Offline/Safe RL|Yu et al. "Towards Safe Reinforcement Learning with a Safety Editor Policy" [NeurIPS 2022](https://proceedings.neurips.cc/paper_files/paper/2022/file/11afefdd848d1bc9ac9f1604d9f45817-Paper-Conference.pdf)|
|[MERLIN](alf/algorithms/merlin_algorithm.py)|Unsupervised learning|Wayne et al. "Unsupervised Predictive Memory in a Goal-Directed Agent" [arXiv:1803.10760](https://arxiv.org/abs/1803.10760)|
5 changes: 4 additions & 1 deletion alf/algorithms/algorithm.py
@@ -2132,7 +2132,10 @@ def _hybrid_update(self, experience, batch_info, offline_experience,
        else:
            loss_info = offline_loss_info

-       params = self._backward_and_gradient_update(loss_info.loss * weight)
+       params, gns = self._backward_and_gradient_update(
+           loss_info.loss * weight)
+
+       loss_info = loss_info._replace(gns=gns)

        if self._RL_train:
            # for now, there is no need to do a hybrid after update
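The hunk above makes `_backward_and_gradient_update` return a gradient statistic (presumably a gradient-noise-scale estimate) alongside the updated parameters, and attaches it to the existing `LossInfo` via `_replace`. A minimal sketch of that pattern, with a hypothetical stand-in for ALF's `LossInfo` (only the `gns` field mirrors the diff; everything else is illustrative):

```python
from collections import namedtuple

# Illustrative stand-in for ALF's LossInfo namedtuple.
LossInfo = namedtuple('LossInfo', ['loss', 'extra', 'gns'], defaults=(None, None, None))


def backward_and_gradient_update(loss):
    # Stand-in for Algorithm._backward_and_gradient_update: after this PR it
    # returns the updated parameters together with a gradient statistic.
    params, gns = [], 0.0
    return params, gns


loss_info = LossInfo(loss=1.23)
params, gns = backward_and_gradient_update(loss_info.loss)
# _replace returns a new namedtuple with only the named field changed, which is
# how the hunk above attaches the statistic to the existing loss_info.
loss_info = loss_info._replace(gns=gns)
```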
418 changes: 418 additions & 0 deletions alf/algorithms/smodice_algorithm.py

Large diffs are not rendered by default.
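The new `alf/algorithms/smodice_algorithm.py` (418 lines) is the core of this PR, but its diff is not rendered. For orientation, below is a minimal sketch of the SMODICE losses in the KL form described by Ma et al. (arXiv:2202.02433): a discriminator trained to separate expert from offline states defines a reward r(s) = log D(s) − log(1 − D(s)), the value network minimizes the dual objective (1 − γ) E[V(s0)] + log E[exp(r(s) + γV(s′) − V(s))] over the offline data, and the actor is updated by behavior cloning weighted by a softmax of that advantage. All names and shapes here are illustrative assumptions, not the ALF implementation (which, per the example configs below, uses a state-action `CriticNetwork` as the discriminator and may differ in other details).

```python
import math

import torch
import torch.nn.functional as F

GAMMA = 0.99  # discount; illustrative value


def discriminator_loss(discriminator, expert_obs, offline_obs):
    # Train D(s) (a logit-producing network) toward 1 on expert states and
    # 0 on offline states.
    expert_logits = discriminator(expert_obs).squeeze(-1)
    offline_logits = discriminator(offline_obs).squeeze(-1)
    return (F.binary_cross_entropy_with_logits(
        expert_logits, torch.ones_like(expert_logits)) +
            F.binary_cross_entropy_with_logits(
                offline_logits, torch.zeros_like(offline_logits)))


def reward(discriminator, obs):
    # r(s) = log D(s) - log(1 - D(s)), an estimate of log(d_expert(s) / d_offline(s)).
    logits = discriminator(obs).squeeze(-1)
    return F.logsigmoid(logits) - F.logsigmoid(-logits)


def value_loss(value_net, discriminator, init_obs, obs, next_obs):
    # (1 - gamma) * E_{s0}[V(s0)] + log E_{(s, s')}[exp(r(s) + gamma V(s') - V(s))]
    r = reward(discriminator, obs).detach()
    adv = r + GAMMA * value_net(next_obs).squeeze(-1) - value_net(obs).squeeze(-1)
    log_mean_exp = torch.logsumexp(adv, dim=0) - math.log(adv.numel())
    return (1 - GAMMA) * value_net(init_obs).mean() + log_mean_exp


def actor_loss(actor, value_net, discriminator, obs, action, next_obs):
    # Weighted behavior cloning: weights are a softmax of the (detached)
    # advantage over the batch; actor(obs) is assumed to return a
    # torch.distributions object.
    with torch.no_grad():
        adv = (reward(discriminator, obs)
               + GAMMA * value_net(next_obs).squeeze(-1)
               - value_net(obs).squeeze(-1))
        w = torch.softmax(adv, dim=0)
    return -(w * actor(obs).log_prob(action)).sum()
```

Each of these three losses would be driven by its own optimizer, which is consistent with the separate actor/value/discriminator optimizers configured in the example conf files below.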

5 changes: 5 additions & 0 deletions alf/bin/train_play_test.py
@@ -697,6 +697,11 @@ def test_causal_bc_pendulum(self):
            conf_file='./hybrid_rl/causal_bc_pendulum_conf.py',
            extra_train_params=OFF_POLICY_TRAIN_PARAMS)

+   def test_smodice_pendulum(self):
+       self._test(
+           conf_file='./smodice_pendulum_conf.py',
+           extra_train_params=OFF_POLICY_TRAIN_PARAMS)
+
    def test_iql_pendulum(self):
        self._test(
            conf_file='./hybrid_rl/iql_pendulum_conf.py',
1 change: 1 addition & 0 deletions alf/environments/make_penv.py
@@ -30,6 +30,7 @@ def gen_penv():
cmd = (f"g++ -O3 -Wall -shared -std=c++17 -fPIC -fvisibility=hidden "
f"`{python} -m pybind11 --includes` parallel_environment.cpp "
f"-o _penv`{python}-config --extension-suffix` -lrt")

ret = os.system(cmd)
assert ret == 0, "Fail to execute " + cmd

2 changes: 1 addition & 1 deletion alf/examples/data_collection_carla_conf.py
@@ -27,7 +27,7 @@
# This is an example config file for data collection in CARLA.

# the desired replay buffer size for collection
-# 100 is just an example. Should set it to he actual desired size.
+# 100 is just an example. Should set it to the actual desired size.
replay_buffer_length = 100

# the desired environment for data collection
77 changes: 77 additions & 0 deletions alf/examples/smodice_bipedal_walker_conf.py
@@ -0,0 +1,77 @@
# Copyright (c) 2022 Horizon Robotics and ALF Contributors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from functools import partial
import torch

import alf
from alf.algorithms.agent import Agent
from alf.algorithms.smodice_algorithm import SmodiceAlgorithm
from alf.utils import math_ops

from alf.examples import sac_bipedal_walker_conf

# default params
lr = 1e-4
encoding_dim = 256
fc_layers_params = (encoding_dim, ) * 2
activation = torch.relu_

offline_buffer_length = None
offline_buffer_dir = [
"/home/haichaozhang/data/DATA/sac_bipedal_baseline/train/algorithm/ckpt-80000-replay_buffer"
]

alf.config('Agent', rl_algorithm_cls=SmodiceAlgorithm, optimizer=None)

proj_net = partial(
alf.networks.StableNormalProjectionNetwork,
state_dependent_std=True,
squash_mean=False,
scale_distribution=True,
min_std=1e-3,
max_std=10)

actor_network_cls = partial(
alf.networks.ActorDistributionNetwork,
fc_layer_params=fc_layers_params,
activation=activation,
continuous_projection_net_ctor=proj_net)

v_network_cls = partial(
alf.networks.ValueNetwork,
fc_layer_params=fc_layers_params,
activation=activation)

action_spec = alf.get_action_spec()
discriminator_network_cls = partial(
alf.networks.CriticNetwork, joint_fc_layer_params=fc_layers_params)

alf.config(
'SmodiceAlgorithm',
actor_network_cls=actor_network_cls,
v_network_cls=v_network_cls,
discriminator_network_cls=discriminator_network_cls,
actor_optimizer=alf.optimizers.Adam(lr=lr),
# add weight decay to the v_net, following the SMODICE paper
value_optimizer=alf.optimizers.Adam(lr=lr, weight_decay=1e-4),
discriminator_optimizer=alf.optimizers.Adam(lr=lr),
gradient_penalty_weight=0.1,
)

# training config
alf.config(
"TrainerConfig",
offline_buffer_dir=offline_buffer_dir,
offline_buffer_length=offline_buffer_length)
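The `gradient_penalty_weight=0.1` setting above points to a gradient penalty on the discriminator, which SMODICE uses to regularize its training. Below is a minimal sketch of the usual interpolation-based penalty (WGAN-GP style); it assumes a state-only discriminator for brevity, whereas the `CriticNetwork` configured here takes (observation, action) pairs, and the exact penalty form in `smodice_algorithm.py` may differ (e.g., penalizing ‖∇‖² rather than (‖∇‖ − 1)²).

```python
import torch


def discriminator_gradient_penalty(discriminator, expert_obs, offline_obs,
                                   weight=0.1):
    # Evaluate the discriminator on random interpolations of expert and
    # offline states and penalize the gradient norm there; `weight` plays
    # the role of gradient_penalty_weight in the config above.
    alpha = torch.rand(expert_obs.size(0), 1)
    mixed = alpha * expert_obs + (1 - alpha) * offline_obs
    mixed.requires_grad_(True)
    out = discriminator(mixed).sum()
    grad, = torch.autograd.grad(out, mixed, create_graph=True)
    return weight * ((grad.norm(2, dim=-1) - 1.0)**2).mean()
```

In training, this term would simply be added to the binary cross-entropy discriminator loss sketched earlier.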
103 changes: 103 additions & 0 deletions alf/examples/smodice_pendulum_conf.py
@@ -0,0 +1,103 @@
# Copyright (c) 2022 Horizon Robotics and ALF Contributors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from functools import partial
import torch

import alf
from alf.algorithms.agent import Agent
from alf.algorithms.smodice_algorithm import SmodiceAlgorithm
from alf.utils import math_ops

# default params
lr = 1e-4
encoding_dim = 256
fc_layers_params = (encoding_dim, ) * 2
activation = torch.relu_

offline_buffer_length = None
offline_buffer_dir = [
"./hybrid_rl/replay_buffer_data/pendulum_replay_buffer_from_sac_10k"
]

env_name = "Pendulum-v0"

alf.config(
"create_environment", env_name=env_name, num_parallel_environments=1)

alf.config('Agent', rl_algorithm_cls=SmodiceAlgorithm)

alf.config(
'TrainerConfig',
algorithm_ctor=Agent,
whole_replay_buffer_training=False,
clear_replay_buffer=False)

proj_net = partial(
alf.networks.StableNormalProjectionNetwork,
state_dependent_std=True,
squash_mean=False,
scale_distribution=True,
min_std=1e-3,
max_std=10)

actor_network_cls = partial(
alf.networks.ActorDistributionNetwork,
fc_layer_params=fc_layers_params,
activation=activation,
continuous_projection_net_ctor=proj_net)

v_network_cls = partial(
alf.networks.ValueNetwork,
fc_layer_params=fc_layers_params,
activation=activation)

action_spec = alf.get_action_spec()
discriminator_network_cls = partial(
alf.networks.CriticNetwork, joint_fc_layer_params=fc_layers_params)

alf.config(
'SmodiceAlgorithm',
actor_network_cls=actor_network_cls,
v_network_cls=v_network_cls,
discriminator_network_cls=discriminator_network_cls,
actor_optimizer=alf.optimizers.Adam(lr=lr),
# add weight decay to the v_net, following the SMODICE paper
value_optimizer=alf.optimizers.Adam(lr=lr, weight_decay=1e-4),
discriminator_optimizer=alf.optimizers.Adam(lr=lr),
gradient_penalty_weight=0.1,
)

num_iterations = 1000000

# training config
alf.config(
"TrainerConfig",
initial_collect_steps=1000,
num_updates_per_train_iter=1,
num_iterations=num_iterations,
# setting rl_train_after_update_steps larger than num_iterations would
# disable RL training; it is 0 here so that RL and offline training run jointly
rl_train_after_update_steps=0,
mini_batch_size=256,
mini_batch_length=2,
unroll_length=1,
offline_buffer_dir=offline_buffer_dir,
offline_buffer_length=offline_buffer_length,
num_checkpoints=1,
debug_summaries=True,
evaluate=True,
eval_interval=1000,
num_eval_episodes=3,
)
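A note on the `TrainerConfig` above: `mini_batch_length=2` makes each sampled mini-batch contain two consecutive time steps, which is what allows forming the (s, a, s′) transitions consumed by the value backup and the weighted-BC update sketched earlier. The layout below ([batch, time, ...]) is an assumption about how the replay buffer delivers experience, for illustration only.

```python
import torch

# Illustrative shapes: Pendulum-v0 has a 3-dim observation and a 1-dim action.
batch_size, obs_dim, act_dim = 256, 3, 1
observation = torch.randn(batch_size, 2, obs_dim)  # time axis == mini_batch_length
action = torch.randn(batch_size, 2, act_dim)

obs = observation[:, 0]       # s_t
act = action[:, 0]            # a_t
next_obs = observation[:, 1]  # s_{t+1}
# (obs, act, next_obs) is exactly what the value and weighted-BC losses above consume.
```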