
Commit 216d1de

committed
chore: 📝 Finished MADDPG: Add figures of the training results and modified hyperparams
1 parent 35907fa commit 216d1de


8 files changed (+43, -42 lines)


MultiAgentDeepDeterministicPolicyGradient/config.yaml

Lines changed: 6 additions & 6 deletions
@@ -6,22 +6,22 @@ num_advs: 1
 
 # training hyperparameters
 lr_actor: 0.0001
-lr_critic: 0.001
+lr_critic: 0.003
 # exploration factor
 epsilon: 0.1
 noise_rate: 0.1
 # reward decay
 gamma: 0.95
 # soft update rate
-tau: 0.01
-cuda: False
+tau: 0.05
+cuda: True
 # experience
-batch_size: 256
+batch_size: 10000
 buffer_size: 500000
 
 # settings about saving models and logs
-log_dir: .\MultiAgentDeepDeterministicPolicyGradient\logs
-save_dir: .\MultiAgentDeepDeterministicPolicyGradient\models
+log_dir: ./MultiAgentDeepDeterministicPolicyGradient/logs
+save_dir: ./MultiAgentDeepDeterministicPolicyGradient/models
 
 # evaluation settings
 eval_interval: 1000
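The config paths above are now written with forward slashes, which Python accepts on Windows as well as on POSIX systems. The `load_cfg` helper that consumes this file is called from `train.py` (see its diff below) but is not part of this commit; a minimal sketch of such a loader, assuming PyYAML and the keys shown above, might look like this:

```python
# Hypothetical sketch of a YAML config loader; the repository's actual load_cfg
# (called from train.py) is not included in this diff and may differ.
from types import SimpleNamespace

import yaml


def load_cfg(path: str) -> SimpleNamespace:
    """Read the YAML file and expose its keys as attributes (args.lr_actor, ...)."""
    with open(path, "r", encoding="utf-8") as f:
        cfg = yaml.safe_load(f)
    return SimpleNamespace(**cfg)


# Usage sketch:
# args = load_cfg("MultiAgentDeepDeterministicPolicyGradient/config.yaml")
# args.lr_critic, args.tau, args.batch_size  # -> 0.003, 0.05, 10000
```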

MultiAgentDeepDeterministicPolicyGradient/models.py

Lines changed: 26 additions & 34 deletions
@@ -24,13 +24,13 @@ def __init__(
         self.action_high = action_high
         self.action_dim = action_dim
         self.net = nn.Sequential(
-            nn.Linear(obs_dim, 256),
+            nn.Linear(obs_dim, 64),
             nn.ReLU(),
-            nn.Linear(256, 256),
+            nn.Linear(64, 64),
             nn.ReLU(),
-            nn.Linear(256, 256),
+            nn.Linear(64, 64),
             nn.ReLU(),
-            nn.Linear(256, action_dim),
+            nn.Linear(64, action_dim),
             nn.Tanh(),
         )
 
@@ -46,8 +46,7 @@ def select_action(
     ) -> np.ndarray:
         # random exploration
         if np.random.uniform() < epsilon:
-            mu = np.random.uniform(-self.action_high,
-                                   self.action_high, self.action_dim)
+            mu = np.random.uniform(-self.action_high, self.action_high, self.action_dim)
 
         else:
 
@@ -61,7 +60,7 @@ def select_action(
             noise = noise_rate * self.action_high * np.random.randn(*mu.shape)
             mu += noise
             mu = np.clip(mu, -self.action_high, self.action_high)
-        return mu
+        return mu.copy()
 
 
 class Critic(nn.Module):
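Taken together, the two `select_action` hunks above describe the exploration rule: with probability `epsilon` the agent draws a uniform random action within the bounds, otherwise it takes the actor's output perturbed by Gaussian noise scaled by `noise_rate * action_high`, and the result is clipped to `[-action_high, action_high]`. A standalone sketch of that rule (the function name and the `policy_mu` argument are illustrative, not from the repository):

```python
import numpy as np


def explore(policy_mu: np.ndarray, action_high: float, action_dim: int,
            epsilon: float = 0.1, noise_rate: float = 0.1) -> np.ndarray:
    """Illustrative exploration rule mirroring the select_action hunks above."""
    if np.random.uniform() < epsilon:
        # random exploration: uniform action inside the bounds
        mu = np.random.uniform(-action_high, action_high, action_dim)
    else:
        # actor output plus Gaussian noise scaled by noise_rate * action_high
        mu = policy_mu + noise_rate * action_high * np.random.randn(*policy_mu.shape)
    return np.clip(mu, -action_high, action_high)
```

The defaults of 0.1 for `epsilon` and `noise_rate` match the values in `config.yaml`.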
@@ -75,13 +74,13 @@ def __init__(
         self.action_high = action_high
         # critic should give scores for all agents' actions
         self.net = nn.Sequential(
-            nn.Linear(sum(obs_dims) + sum(action_dims), 256),
+            nn.Linear(sum(obs_dims) + sum(action_dims), 64),
             nn.ReLU(),
-            nn.Linear(256, 256),
+            nn.Linear(64, 64),
             nn.ReLU(),
-            nn.Linear(256, 256),
+            nn.Linear(64, 64),
             nn.ReLU(),
-            nn.Linear(256, 1),
+            nn.Linear(64, 1),
         )
 
     def forward(self, obs: Tensor, actions: Tensor) -> Tensor:
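The Critic's first layer takes `sum(obs_dims) + sum(action_dims)` features, i.e. every agent's observation and action concatenated, which is the centralized-critic part of MADDPG. The file's own `forward` body is not shown in this diff; a minimal sketch of such a forward pass (the class name and the list-of-tensors layout are assumptions) could be:

```python
import torch as t
from torch import Tensor, nn


class CentralCritic(nn.Module):
    """Illustrative centralized critic scoring the joint observation-action vector."""

    def __init__(self, obs_dims, action_dims):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(sum(obs_dims) + sum(action_dims), 64),
            nn.ReLU(),
            nn.Linear(64, 1),
        )

    def forward(self, obs, actions) -> Tensor:
        # obs and actions are lists of per-agent tensors of shape (batch, dim)
        x = t.cat(list(obs) + list(actions), dim=-1)
        return self.net(x)
```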
@@ -153,27 +152,21 @@ def __init__(
         for i in range(n_agents):
             self.actors.append(Actor(action_high, obs_dims[i], action_dims[i]))
             self.critics.append(Critic(action_high, obs_dims, action_dims))
-            self.target_actors.append(
-                Actor(action_high, obs_dims[i], action_dims[i]))
-            self.target_critics.append(
-                Critic(action_high, obs_dims, action_dims))
+            self.target_actors.append(Actor(action_high, obs_dims[i], action_dims[i]))
+            self.target_critics.append(Critic(action_high, obs_dims, action_dims))
             # load_state_dict
             self.target_actors[i].load_state_dict(self.actors[i].state_dict())
-            self.target_critics[i].load_state_dict(
-                self.critics[i].state_dict())
+            self.target_critics[i].load_state_dict(self.critics[i].state_dict())
             # optimizers
-            self.optimizer_a.append(optim.Adam(
-                self.actors[i].parameters(), lr=lr_a))
-            self.optimizer_c.append(optim.Adam(
-                self.critics[i].parameters(), lr=lr_c))
+            self.optimizer_a.append(optim.Adam(self.actors[i].parameters(), lr=lr_a))
+            self.optimizer_c.append(optim.Adam(self.critics[i].parameters(), lr=lr_c))
 
             self.actors[i] = self.actors[i].to(self.device)
             self.critics[i] = self.critics[i].to(self.device)
             self.target_actors[i] = self.target_actors[i].to(self.device)
             self.target_critics[i] = self.target_critics[i].to(self.device)
 
-        self.buffer = MemoryBuffer(
-            mem_capacity, obs_dims, action_dims, self.n_agents)
+        self.buffer = MemoryBuffer(mem_capacity, obs_dims, action_dims, self.n_agents)
         self.writer = SummaryWriter(log_dir=log_dir)
 
     def learn(self):
@@ -260,40 +253,39 @@ def _update_policy(self, transitions: dict):
 
             # comput td target and use the square of td residual as the loss
             q_value = self.critics[i].forward(o, mu)
-            critic_loss = t.mean((q_target - q_value) *(q_target - q_value))
+            critic_loss = t.mean((q_target - q_value) * (q_target - q_value))
 
             # actor loss, Actor's goal is to make Critic's scoring higher
             mu[i] = self.actors[i].forward(o[i])
             actor_loss = -self.critics[i].forward(o, mu).mean()
 
             # then perform gradient descent
             self.optimizer_a[i].zero_grad()
-            self.optimizer_c[i].zero_grad()
-            critic_loss.backward()
             actor_loss.backward()
             self.optimizer_a[i].step()
+            self.optimizer_c[i].zero_grad()
+            critic_loss.backward()
             self.optimizer_c[i].step()
 
             actor_losses.append(actor_loss.item())
             critic_losses.append(critic_loss.item())
 
-        # then soft update the target network
-        self._soft_update_target()
+            # then soft update the target network
+            self._soft_update_target(i)
 
         return actor_losses, critic_losses
 
-    def _soft_update_target(self) -> None:
-        for i in range(self.n_agents):
-            for target_param, param in zip(
+    def _soft_update_target(self,i) -> None:
+
+        for target_param, param in zip(
             self.target_actors[i].parameters(), self.actors[i].parameters()
         ):
             target_param.data.copy_(
                 (1 - self.tau) * target_param.data + self.tau * param.data
             )
 
-        for target_param, param in zip(
-            self.target_critics[i].parameters(
-            ), self.critics[i].parameters()
+        for target_param, param in zip(
+            self.target_critics[i].parameters(), self.critics[i].parameters()
         ):
             target_param.data.copy_(
                 (1 - self.tau) * target_param.data + self.tau * param.data
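The last hunk moves the soft update inside the per-agent loop and passes the agent index explicitly. The update itself is the usual Polyak rule, target <- (1 - tau) * target + tau * source, with tau = 0.05 in the new config. A self-contained sketch of that rule (the helper name is illustrative, not from the repository):

```python
from torch import nn


def soft_update(target: nn.Module, source: nn.Module, tau: float = 0.05) -> None:
    """Polyak averaging: target <- (1 - tau) * target + tau * source."""
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_((1 - tau) * target_param.data + tau * param.data)
```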

MultiAgentDeepDeterministicPolicyGradient/readme.md

Lines changed: 8 additions & 0 deletions
@@ -15,3 +15,11 @@
 ## Optimization ideas
 
 The multi-agent update has to run a for loop over agents; could this be turned into multithreading to speed training up considerably?
+
+## Results
+
+![reward](./figs/reward.jpg)
+
+![aloss](./figs/al.jpg)
+
+![closs](./figs/cl.jpg)

MultiAgentDeepDeterministicPolicyGradient/train.py

Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@
 
 
 if __name__ == "__main__":
-    args = load_cfg(r"MultiAgentDeepDeterministicPolicyGradient\config.yaml")
+    args = load_cfg(r"MultiAgentDeepDeterministicPolicyGradient/config.yaml")
     env, args = make_env(args)
 
     agent = MADDPG(
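With forward slashes the relative path resolves on both Windows and POSIX, and it no longer contains backslash escapes, so the raw-string `r"..."` prefix is now optional rather than required.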

readme.md

Lines changed: 2 additions & 1 deletion
@@ -532,7 +532,8 @@ def forward(self, x: t.Tensor) -> t.Tensor:
 
 Q_{\pi}(s_t,a_t) &= \mathbb{E}[U_t \vert S_t=s_t,A_t= a_t]\\
 
-V_{\pi}(s_t)&=\mathbb{E}*A[Q*{\pi}(s_t,A)], A \sim \pi (\cdot \vert s_t)
+V_{\pi}(s_t)&=\mathbb{E}_
+A[Q_{\pi}(s_t,A)], A \sim \pi (\cdot \vert s_t)
 
 \end{aligned}
 $$
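For readability, the state-value identity this hunk is repairing can be written on a single line, in the same `$$`-block notation the readme already uses:

$$
V_{\pi}(s_t) = \mathbb{E}_{A \sim \pi(\cdot \vert s_t)}\bigl[Q_{\pi}(s_t, A)\bigr]
$$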
