
Commit 3a34eeb

Initial commit
0 parents  commit 3a34eeb

11 files changed: +810 -0 lines changed

.gitattributes

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
# Auto detect text files and perform LF normalization
* text=auto

.idea/REINFORCE.iml

Lines changed: 12 additions & 0 deletions
Some generated files are not rendered by default.

.idea/misc.xml

Lines changed: 4 additions & 0 deletions
Some generated files are not rendered by default.

.idea/modules.xml

Lines changed: 8 additions & 0 deletions
Some generated files are not rendered by default.

.idea/workspace.xml

Lines changed: 349 additions & 0 deletions
Some generated files are not rendered by default.

NormalizedActions.py

Lines changed: 65 additions & 0 deletions
@@ -0,0 +1,65 @@

#-*- coding: UTF-8 -*-
"""
filename: NormalizedActions.py
function: gym action wrapper that rescales agent actions from [-1, 1]
          to the environment's action range
date: 2017/8/7
author:
[ASCII-art signature]
"""
import gym


class NormalizedActions(gym.ActionWrapper):

    def _action(self, action):
        action = (action + 1) / 2  # [-1, 1] => [0, 1]
        action *= (self.action_space.high - self.action_space.low)
        action += self.action_space.low
        return action

    def _reverse_action(self, action):
        action -= self.action_space.low
        action /= (self.action_space.high - self.action_space.low)
        action = action * 2 - 1
        return action

"""
░░░░░░░░░▄░░░░░░░░░░░░░░▄░░░░
░░░░░░░░▌▒█░░░░░░░░░░░▄▀▒▌░░░
░░░░░░░░▌▒▒█░░░░░░░░▄▀▒▒▒▐░░░
░░░░░░░▐▄▀▒▒▀▀▀▀▄▄▄▀▒▒▒▒▒▐░░░
░░░░░▄▄▀▒░▒▒▒▒▒▒▒▒▒█▒▒▄█▒▐░░░
░░░▄▀▒▒▒░░░▒▒▒░░░▒▒▒▀██▀▒▌░░░
░░▐▒▒▒▄▄▒▒▒▒░░░▒▒▒▒▒▒▒▀▄▒▒▌░░
░░▌░░▌█▀▒▒▒▒▒▄▀█▄▒▒▒▒▒▒▒█▒▐░░
░▐░░░▒▒▒▒▒▒▒▒▌██▀▒▒░░░▒▒▒▀▄▌░
░▌░▒▄██▄▒▒▒▒▒▒▒▒▒░░░░░░▒▒▒▒▌░
▀▒▀▐▄█▄█▌▄░▀▒▒░░░░░░░░░░▒▒▒▐░
▐▒▒▐▀▐▀▒░▄▄▒▄▒▒▒▒▒▒░▒░▒░▒▒▒▒▌
▐▒▒▒▀▀▄▄▒▒▒▄▒▒▒▒▒▒▒▒░▒░▒░▒▒▐░
░▌▒▒▒▒▒▒▀▀▀▒▒▒▒▒▒░▒░▒░▒░▒▒▒▌░
░▐▒▒▒▒▒▒▒▒▒▒▒▒▒▒░▒░▒░▒▒▄▒▒▐░░
░░▀▄▒▒▒▒▒▒▒▒▒▒▒░▒░▒░▒▄▒▒▒▒▌░░
░░░░▀▄▒▒▒▒▒▒▒▒▒▒▄▄▄▀▒▒▒▒▄▀░░░
░░░░░░▀▄▄▄▄▄▄▀▀▀▒▒▒▒▒▄▄▀░░░░░
░░░░░░░░░▒▒▒▒▒▒▒▒▒▒▀▀░░░░░░░░
"""

NormalizedActions.pyc

1.86 KB
Binary file not shown.

README.md

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
# REINFORCE_mx

main.py

Lines changed: 133 additions & 0 deletions
@@ -0,0 +1,133 @@

#-*- coding: UTF-8 -*-
"""
filename: main.py
function: the code implementing the REINFORCE algorithm in MXNet Gluon
date: 2018/3/13
author:
[ASCII-art signature]
"""
from __future__ import print_function
import numpy as np
import mxnet as mx
from mxnet import nd, autograd, gluon
import argparse, math, os
import gym
from gym import wrappers
from NormalizedActions import NormalizedActions

# argument parser
parser = argparse.ArgumentParser(description='MXNet Gluon REINFORCE example')
# parser.add_argument('--env_name', type=str, default='CartPole-v0')
parser.add_argument('--env_name', type=str, default='InvertedPendulum-v1')
parser.add_argument('--gamma', type=float, default=0.99, metavar='G',
                    help='discount factor for reward (default: 0.99)')
parser.add_argument('--exploration_end', type=int, default=100, metavar='N',
                    help='number of episodes with noise (default: 100)')
parser.add_argument('--seed', type=int, default=123, metavar='N',
                    help='random seed (default: 123)')
parser.add_argument('--num_steps', type=int, default=1000, metavar='N',
                    help='max episode length (default: 1000)')
parser.add_argument('--num_episodes', type=int, default=2000, metavar='N',
                    help='number of episodes (default: 2000)')
parser.add_argument('--hidden_size', type=int, default=128, metavar='N',
                    help='hidden layer size (default: 128)')
parser.add_argument('--render', action='store_true',
                    help='render the environment')
parser.add_argument('--ckpt_freq', type=int, default=100,
                    help='model saving frequency')
parser.add_argument('--display', type=bool, default=False,
                    help='display or not')
args = parser.parse_args()

# global variables
env_name = args.env_name
env = gym.make(env_name)
if type(env.action_space) != gym.spaces.discrete.Discrete:
    from reinforce_continuous import REINFORCE
    env = NormalizedActions(gym.make(env_name))
else:
    # from reinforce_discrete import REINFORCE
    raise NotImplementedError()

if args.display:
    env = wrappers.Monitor(env, '/tmp/{}-experiment'.format(env_name), force=True)

env.seed(args.seed)
mx.random.seed(args.seed)
np.random.seed(args.seed)

agent = REINFORCE(args.hidden_size, env.observation_space.shape[0], env.action_space)

dir = 'ckpt_' + env_name
if not os.path.exists(dir):
    os.mkdir(dir)

for i_episode in range(args.num_episodes):
    # state = torch.Tensor([env.reset()])
    state = nd.array([env.reset()])
    entropies = []
    log_probs = []
    rewards = []
    # roll out one episode
    for t in range(args.num_steps):
        action, log_prob, entropy = agent.select_action(state)

        # NDArray has no .numpy(); convert with .asnumpy() before stepping the env
        next_state, reward, done, _ = env.step(action.asnumpy()[0])

        entropies.append(entropy)
        log_probs.append(log_prob)
        rewards.append(reward)
        state = nd.array([next_state])

        if done:
            break

    agent.update_parameters(rewards, log_probs, entropies, args.gamma)

    # if i_episode % args.ckpt_freq == 0:
    #     torch.save(agent.model.state_dict(), os.path.join(dir, 'reinforce-' + str(i_episode) + '.pkl'))

    print("Episode: {}, reward: {}".format(i_episode, np.sum(rewards)))

env.close()

"""
░░░░░░░░░▄░░░░░░░░░░░░░░▄░░░░
░░░░░░░░▌▒█░░░░░░░░░░░▄▀▒▌░░░
░░░░░░░░▌▒▒█░░░░░░░░▄▀▒▒▒▐░░░
░░░░░░░▐▄▀▒▒▀▀▀▀▄▄▄▀▒▒▒▒▒▐░░░
░░░░░▄▄▀▒░▒▒▒▒▒▒▒▒▒█▒▒▄█▒▐░░░
░░░▄▀▒▒▒░░░▒▒▒░░░▒▒▒▀██▀▒▌░░░
░░▐▒▒▒▄▄▒▒▒▒░░░▒▒▒▒▒▒▒▀▄▒▒▌░░
░░▌░░▌█▀▒▒▒▒▒▄▀█▄▒▒▒▒▒▒▒█▒▐░░
░▐░░░▒▒▒▒▒▒▒▒▌██▀▒▒░░░▒▒▒▀▄▌░
░▌░▒▄██▄▒▒▒▒▒▒▒▒▒░░░░░░▒▒▒▒▌░
▀▒▀▐▄█▄█▌▄░▀▒▒░░░░░░░░░░▒▒▒▐░
▐▒▒▐▀▐▀▒░▄▄▒▄▒▒▒▒▒▒░▒░▒░▒▒▒▒▌
▐▒▒▒▀▀▄▄▒▒▒▄▒▒▒▒▒▒▒▒░▒░▒░▒▒▐░
░▌▒▒▒▒▒▒▀▀▀▒▒▒▒▒▒░▒░▒░▒░▒▒▒▌░
░▐▒▒▒▒▒▒▒▒▒▒▒▒▒▒░▒░▒░▒▒▄▒▒▐░░
░░▀▄▒▒▒▒▒▒▒▒▒▒▒░▒░▒░▒▄▒▒▒▒▌░░
░░░░▀▄▒▒▒▒▒▒▒▒▒▒▄▄▄▀▒▒▒▒▄▀░░░
░░░░░░▀▄▄▄▄▄▄▀▀▀▒▒▒▒▒▄▄▀░░░░░
░░░░░░░░░▒▒▒▒▒▒▒▒▒▒▀▀░░░░░░░░
"""

reinforce_continuous.py

Lines changed: 125 additions & 0 deletions
@@ -0,0 +1,125 @@

#-*- coding: UTF-8 -*-
"""
filename: reinforce_continuous.py
function: the REINFORCE algorithm for continuous action spaces
date: 2017/8/7
author:
[ASCII-art signature]
"""
from __future__ import print_function
import numpy as np
import mxnet as mx
from mxnet import nd, autograd, gluon

import math

# set ctx
data_ctx = mx.cpu()
model_ctx = mx.cpu()


def normal(x, mu, sigma_sq):
    # density of N(mu, sigma_sq) evaluated at x
    a = nd.exp(-1 * nd.power(x - mu, 2) / (2 * sigma_sq))
    b = 1 / nd.sqrt(2 * sigma_sq * math.pi)
    return a * b


class Policy(gluon.Block):
    def __init__(self, hidden_size, num_inputs, action_space):
        super(Policy, self).__init__()
        self.action_space = action_space
        num_outputs = action_space.shape[0]
        with self.name_scope():
            self.dense0 = gluon.nn.Dense(hidden_size)
            self.dense1 = gluon.nn.Dense(num_outputs)
            self.dense2 = gluon.nn.Dense(num_outputs)

    def forward(self, inputs):
        x = inputs
        x = nd.relu(self.dense0(x))
        mu = self.dense1(x)
        sigma_sq = self.dense2(x)

        return mu, sigma_sq


class REINFORCE:
    def __init__(self, hidden_size, num_inputs, action_space):
        self.action_space = action_space
        self.model = Policy(hidden_size, num_inputs, action_space)
        self.model.collect_params().initialize(mx.init.Normal(sigma=0.01), ctx=model_ctx)
        self.optimizer = gluon.Trainer(self.model.collect_params(), 'sgd', {'learning_rate': 0.01})

    def select_action(self, state):
        with autograd.record():
            mu, sigma_sq = self.model(state.as_in_context(model_ctx))
            # sigma_sq = nd.softrelu(sigma_sq)
            # softplus keeps the variance positive
            sigma_sq = nd.log(1 + nd.exp(sigma_sq))

            eps = nd.random.normal(0, 1, mu.shape, dtype=np.float32)
            # reparameterized sample and its probability under N(mu, sigma_sq)
            action = mu + nd.sqrt(sigma_sq) * eps
            prob = normal(action, mu, sigma_sq)

            # differential entropy of the Gaussian policy: 0.5 * (log(2*pi*sigma_sq) + 1)
            entropy = 0.5 * (nd.log(2 * math.pi * sigma_sq) + 1)
            log_prob = nd.log(prob)

        return action, log_prob, entropy

    def update_parameters(self, rewards, log_probs, entropies, gamma):
        # loss = myloss(rewards, log_probs, entropies, gamma, sample_weight=None)
        with autograd.record():
            R = nd.zeros((1, 1))
            loss = 0
            for i in reversed(range(len(rewards))):
                R = gamma * R + rewards[i]
                loss = loss - (log_probs[i] * R).sum() - (0.0001 * entropies[i]).sum()
        self.model.collect_params().zero_grad()
        loss.backward()
        grads = [i.grad(data_ctx) for i in self.model.collect_params().values()]
        # Gradient clipping. Note that the gradient here is accumulated over the whole batch,
        # so the clipping norm is effectively scaled by num_steps and batch_size.
        gluon.utils.clip_global_norm(grads, 40)
        self.optimizer.step(batch_size=len(rewards))


"""
░░░░░░░░░▄░░░░░░░░░░░░░░▄░░░░
░░░░░░░░▌▒█░░░░░░░░░░░▄▀▒▌░░░
░░░░░░░░▌▒▒█░░░░░░░░▄▀▒▒▒▐░░░
░░░░░░░▐▄▀▒▒▀▀▀▀▄▄▄▀▒▒▒▒▒▐░░░
░░░░░▄▄▀▒░▒▒▒▒▒▒▒▒▒█▒▒▄█▒▐░░░
░░░▄▀▒▒▒░░░▒▒▒░░░▒▒▒▀██▀▒▌░░░
░░▐▒▒▒▄▄▒▒▒▒░░░▒▒▒▒▒▒▒▀▄▒▒▌░░
░░▌░░▌█▀▒▒▒▒▒▄▀█▄▒▒▒▒▒▒▒█▒▐░░
░▐░░░▒▒▒▒▒▒▒▒▌██▀▒▒░░░▒▒▒▀▄▌░
░▌░▒▄██▄▒▒▒▒▒▒▒▒▒░░░░░░▒▒▒▒▌░
▀▒▀▐▄█▄█▌▄░▀▒▒░░░░░░░░░░▒▒▒▐░
▐▒▒▐▀▐▀▒░▄▄▒▄▒▒▒▒▒▒░▒░▒░▒▒▒▒▌
▐▒▒▒▀▀▄▄▒▒▒▄▒▒▒▒▒▒▒▒░▒░▒░▒▒▐░
░▌▒▒▒▒▒▒▀▀▀▒▒▒▒▒▒░▒░▒░▒░▒▒▒▌░
░▐▒▒▒▒▒▒▒▒▒▒▒▒▒▒░▒░▒░▒▒▄▒▒▐░░
░░▀▄▒▒▒▒▒▒▒▒▒▒▒░▒░▒░▒▄▒▒▒▒▌░░
░░░░▀▄▒▒▒▒▒▒▒▒▒▒▄▄▄▀▒▒▒▒▄▀░░░
░░░░░░▀▄▄▄▄▄▄▀▀▀▒▒▒▒▒▄▄▀░░░░░
░░░░░░░░░▒▒▒▒▒▒▒▒▒▒▀▀░░░░░░░░
"""
