
Commit e2463ec

add actor critic for cart pole
1 parent d9fe91f commit e2463ec

File tree

3 files changed: +240 −3 lines changed


cartpole_actorcritic.py

Lines changed: 225 additions & 0 deletions
@@ -0,0 +1,225 @@
import numpy as np
np.set_printoptions(precision=4, suppress=True)
import matplotlib.pyplot as plt
import random
import sys

# import gym

# env = gym.make('CartPole-v1')
"""
Observation:
    Type: Box(4)
    Num   Observation             Min                   Max
    0     Cart Position           -4.8                  4.8
    1     Cart Velocity           -Inf                  Inf
    2     Pole Angle              -0.418 rad (-24 deg)  0.418 rad (24 deg)
    3     Pole Angular Velocity   -Inf                  Inf
Actions:
    Type: Discrete(2)
    Num   Action
    0     Push cart to the left
    1     Push cart to the right
    Note: The amount by which the velocity is reduced or increased is not
    fixed; it depends on the angle the pole is pointing. This is because
    the center of gravity of the pole changes the amount of energy needed
    to move the cart underneath it.
Reward:
    Reward is 1 for every step taken, including the termination step.
Starting State:
    All observations are assigned a uniform random value in [-0.05, 0.05].
Episode Termination:
    Pole Angle is more than 12 degrees.
    Cart Position is more than 2.4 (center of the cart reaches the edge of
    the display).
    Episode length is greater than 200.
Solved Requirements:
    Considered solved when the average return is greater than or equal to
    195.0 over 100 consecutive trials.
"""

X_range = [-4.8, 4.8]
v_range = [-10, 10]  # [-100000, 100000] #[float('-inf'), float('inf')]
theta_range = [-24, 24]
anglev_range = [-10, 10]  # [-100000, 100000] #[float('-inf'), float('inf')]
start_range = [-0.05, 0.05]

terminating_cond = [2.4, 12, 200]

action_set = [0, 1]  # left, right

M = 3  # order of the Fourier basis (4M+1 features per state)
softmax_sigma = 0.1  # temperature of the softmax policy
# gamma = 1

def in_radian(ang):
    return ang*np.pi/180

def transition(action, x, x_dot, theta, theta_dot):
    gravity = 9.8
    masscart = 1.0
    masspole = 0.1
    total_mass = masspole + masscart
    length = 0.5  # actually half the pole's length
    polemass_length = masspole * length
    force_mag = 10.0
    tau = 0.02

    force = force_mag if action == 1 else -force_mag
    costheta = np.cos(theta)  # theta in radians
    sintheta = np.sin(theta)

    # from gym https://github.com/openai/gym/blob/master/gym/envs/classic_control/cartpole.py
    temp = (force + polemass_length * theta_dot ** 2 * sintheta) / total_mass
    thetaacc = (gravity * sintheta - costheta * temp) / (length * (4.0 / 3.0 - masspole * costheta ** 2 / total_mass))
    xacc = temp - polemass_length * thetaacc * costheta / total_mass

    # Euler
    x = x + tau * x_dot
    x_dot = x_dot + tau * xacc
    theta = theta + tau * theta_dot
    theta_dot = theta_dot + tau * thetaacc

    # semi-implicit Euler
    # x_dot = x_dot + tau * xacc
    # x = x + tau * x_dot
    # theta_dot = theta_dot + tau * thetaacc
    # theta = theta + tau * theta_dot

    return x, x_dot, theta, theta_dot

def is_terminating(x, x_dot, theta, theta_dot, step):
    if x <= -terminating_cond[0] or x >= terminating_cond[0] or theta <= -in_radian(terminating_cond[1]) or theta >= in_radian(terminating_cond[1]) or step >= terminating_cond[2]:
        return True
    return False

def reward(x, x_dot, theta, theta_dot, step):
    if is_terminating(x, x_dot, theta, theta_dot, step):
        return 0
    return 1

def normalize(x, x_dot, theta, theta_dot, cosineflag=True):
    if cosineflag:
        # map each state variable to [0, 1]
        x = (x-X_range[0])/(X_range[1]-X_range[0])
        theta = (theta-theta_range[0])/(theta_range[1]-theta_range[0])
        x_dot = (x_dot - v_range[0])/(v_range[1] - v_range[0])
        theta_dot = (theta_dot - anglev_range[0])/(anglev_range[1] - anglev_range[0])

    else:
        # map each state variable to [-1, 1]
        x = 2*(x-X_range[0])/(X_range[1]-X_range[0]) - 1
        theta = 2*(theta-theta_range[0])/(theta_range[1]-theta_range[0]) - 1
        x_dot = 2*(x_dot - v_range[0])/(v_range[1] - v_range[0]) - 1
        theta_dot = 2*(theta_dot - anglev_range[0])/(anglev_range[1] - anglev_range[0]) - 1

    return x, x_dot, theta, theta_dot

def fourier(x, x_dot, theta, theta_dot, cosineflag=False):  # 4M+1 features
    # normalize
    x, x_dot, theta, theta_dot = normalize(x, x_dot, theta, theta_dot, cosineflag)
    phi = [1]
    if cosineflag:
        for i in range(1, M+1):
            phi.append(np.cos(i*np.pi*x))
        for i in range(1, M+1):
            phi.append(np.cos(i*np.pi*x_dot))
        for i in range(1, M+1):
            phi.append(np.cos(i*np.pi*theta))
        for i in range(1, M+1):
            phi.append(np.cos(i*np.pi*theta_dot))
    else:
        for i in range(1, M+1):
            phi.append(np.sin(i*np.pi*x))
        for i in range(1, M+1):
            phi.append(np.sin(i*np.pi*x_dot))
        for i in range(1, M+1):
            phi.append(np.sin(i*np.pi*theta))
        for i in range(1, M+1):
            phi.append(np.sin(i*np.pi*theta_dot))
    return np.array(phi)

def softmax_action(policy_params, x, x_dot, theta, theta_dot):

    phi_s = fourier(x, x_dot, theta, theta_dot)  # (4M+1,)
    # print(policy_params.shape, phi_s.shape)
    policy_val = np.dot(phi_s.T, policy_params)  # (4M+1,) . (4M+1, 2) -> (2,)
    policy_exp = np.exp(softmax_sigma*policy_val)
    policy_exp /= np.sum(policy_exp)
    # print(policy_exp, x, x_dot, theta, theta_dot)
    return policy_exp  # action probabilities, shape (2,)

def ACTOR_CRITIC(alpha_w, alpha_theta, gamma=1.0):
    policy_params = np.random.normal(0, 0.1, (4*M+1, len(action_set)))  # np.ones((4*M+1,len(action_set)))*(-0.01)
    value_params = np.ones(4*M+1)*0.01
    episode_length, avg_return = [], []

    for iter in range(5000):
        policy_params_temp = policy_params.copy()
        # run episode
        # initial state
        x = np.random.uniform(start_range[0], start_range[1])
        theta = np.random.uniform(start_range[0], start_range[1])
        x_dot = np.random.uniform(start_range[0], start_range[1])
        theta_dot = np.random.uniform(start_range[0], start_range[1])
        step = 1
        _return = 0

        # #using gym
        # x, x_dot, theta, theta_dot = env.reset()

        # run episode
        while not is_terminating(x, x_dot, theta, theta_dot, step):
            # choose action by sampling from the softmax policy
            curr_action = random.choices(action_set, weights=softmax_action(policy_params, x, x_dot, theta, theta_dot))[0]

            # #using gym
            # observation, curr_reward, done, info = env.step(curr_action)
            # next_x, next_x_dot, next_theta, next_theta_dot = observation

            # next state
            next_x, next_x_dot, next_theta, next_theta_dot = transition(curr_action, x, x_dot, theta, theta_dot)
            # reward
            curr_reward = reward(next_x, next_x_dot, next_theta, next_theta_dot, step)
            _return += curr_reward*gamma**(step-1)
            step += 1
            print(x, x_dot, theta, theta_dot, curr_action, softmax_action(policy_params, x, x_dot, theta, theta_dot))

            phi_s = fourier(x, x_dot, theta, theta_dot)
            phi_next_s = fourier(next_x, next_x_dot, next_theta, next_theta_dot)
            # one-step TD error; a terminal state has value 0
            if not is_terminating(next_x, next_x_dot, next_theta, next_theta_dot, step):
                delta = curr_reward + gamma*np.dot(phi_next_s, value_params) - np.dot(phi_s, value_params)
            else:
                delta = curr_reward - np.dot(phi_s, value_params)
            # update value params
            value_params += alpha_w*delta*phi_s
            # update policy params (gradient of the log softmax policy)
            policy = softmax_action(policy_params, x, x_dot, theta, theta_dot)
            if curr_action == 0:
                policy_params[:, 0] += alpha_theta*delta*(1-policy[0])*phi_s
                policy_params[:, 1] += alpha_theta*delta*(-1*policy[0])*phi_s
            # print(curr_action, delta, policy)
            if curr_action == 1:
                policy_params[:, 0] += alpha_theta*delta*(-policy[1])*phi_s
                policy_params[:, 1] += alpha_theta*delta*(1-policy[1])*phi_s

            x, x_dot, theta, theta_dot = next_x, next_x_dot, next_theta, next_theta_dot

        episode_length.append(step)
        avg_return.append(_return)

        print("\n EPISODE LENGTH: ", step, "CURR ITER: ", iter)
        if np.mean(avg_return[max(0, iter-100): iter+1]) > 195.0:
            print("Hooray... solved")
            break
        max_diff = np.max(np.abs(policy_params_temp - policy_params))
        print(" Max diff: ", max_diff)
        if max_diff/alpha_theta < 0.001:  # 0.001 works with 1e-6 policy_step
            break

    plt.figure()
    plt.plot(np.arange(len(avg_return)), avg_return)
    plt.xlabel('Iterations')
    plt.ylabel('Avg. return')
    plt.savefig('graph_cartpole_actorcritic')

alpha_w, alpha_theta = 1e-7, 5e-4
ACTOR_CRITIC(alpha_w, alpha_theta)
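For reference, the actor update above applies the standard gradient of the log-softmax policy: with action preferences sigma * theta_a . phi(s), the gradient of log pi(a|s) with respect to the chosen action's column is sigma * (1 - pi(a|s)) * phi(s), and -sigma * pi(b|s) * phi(s) for the other column. The script drops the sigma = softmax_sigma factor, which effectively folds it into alpha_theta. Below is a minimal sketch, separate from the committed file, that checks these expressions against finite differences; phi, theta, sigma, and the chosen action index are illustrative stand-ins rather than values taken from the script.

import numpy as np

# Finite-difference check of the softmax policy gradient used in the actor
# update. phi, theta, sigma and the chosen action are illustrative stand-ins.
sigma = 0.1
rng = np.random.default_rng(0)
phi = rng.normal(size=5)            # stand-in for the Fourier feature vector
theta = rng.normal(size=(5, 2))     # stand-in for policy_params

def log_pi(params, a):
    z = sigma * phi @ params                 # action preferences, shape (2,)
    return z[a] - np.log(np.sum(np.exp(z)))  # log of the softmax probability

a = 0                                        # action whose log-probability we differentiate
pi = np.exp(sigma * phi @ theta)
pi /= pi.sum()

analytic = np.zeros_like(theta)
analytic[:, 0] = sigma * (1 - pi[0]) * phi   # column of the chosen action
analytic[:, 1] = -sigma * pi[1] * phi        # column of the other action

eps = 1e-6
numeric = np.zeros_like(theta)
for i in range(theta.shape[0]):
    for j in range(theta.shape[1]):
        bumped = theta.copy()
        bumped[i, j] += eps
        numeric[i, j] = (log_pi(bumped, a) - log_pi(theta, a)) / eps

print(np.max(np.abs(analytic - numeric)))    # tiny (~1e-6 or smaller) if the gradient is right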

cartpole_reinforce.py

Lines changed: 13 additions & 1 deletion
@@ -3,6 +3,10 @@
 import matplotlib.pyplot as plt
 # import random
 import sys
+
+# import gym
+
+# env = gym.make('CartPole-v1')
 """
 Observation:
     Type: Box(4)
@@ -159,21 +163,29 @@ def REINFORCE(alpha_w, alpha_theta, algo_type='without_baseline', gamma=1.0):
     theta_dot = np.random.uniform(start_range[0], start_range[1])
     step = 1
     state_list, action_list, reward_list = [], [], []
+
+    # #using gym
+    # x, x_dot, theta, theta_dot = env.reset()

     #run epsidoe
     while not is_terminating(x, x_dot, theta, theta_dot, step):
         #choose action
         curr_action = np.argmax(softmax_action(policy_params, x, x_dot, theta, theta_dot))
+
+        # #using gym
+        # observation, curr_reward, done, info = env.step(curr_action)
+        # next_x, next_x_dot, next_theta, mext_theta_dot = observation
+
         #next state
         next_x, next_x_dot, next_theta, mext_theta_dot = transition(curr_action, x, x_dot, theta, theta_dot)
         #reward
         curr_reward = reward(next_x, next_x_dot, next_theta, mext_theta_dot, step)
         step += 1
-
         state_list.append([x, x_dot, theta, theta_dot])
         action_list.append(curr_action)
         reward_list.append(curr_reward)
         x, x_dot, theta, theta_dot = next_x, next_x_dot, next_theta, mext_theta_dot
+
     print(np.array(state_list))
     print(action_list)
     print("\n EPISODE LENGTH: ", len(reward_list), "CURR ITER: ", iter)

mountaincar.py

Lines changed: 2 additions & 2 deletions
@@ -1,14 +1,14 @@
 import gym

-env = gym.make('MountainCar-v0')
+env = gym.make('CartPole-v1')
 observation = env.reset()

 # action-space & observation-space describes what is the valid format of action & state parameters for that particular env to work on with
 print("Action space ", env.action_space) #Discerete(3) -> 0,1,2
 #disceret class; #var in class n, start; #functions sample(), contains(x)
 print("State space ", env.observation_space) #[Output: ] Box(2,)
 #box class; low, high, shape of state
-for t in range(5):
+for t in range(50):
     # env.render()
     action = env.action_space.sample()
     print(observation, action)
