modeling_other_agents.py
from torch.distributions.categorical import Categorical
from configparser import ConfigParser
import torch
import torch.nn as nn
import copy
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load hyperparameters from the repository's config file
config = ConfigParser()
config.read("config.cfg")
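# Example of the config.cfg layout this module expects (the section and key
# names match what is read below; the values here are illustrative
# assumptions, not the repository's actual settings):
#
#   [PARAMS]
#   seed = 0
#
#   [POLICY_ESTIMATION]
#   model = MLE
#   entr_coeff = 0.01
#   adam_lr = 0.01
#   adam_epoch = 100
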
class PolicyEstimation:
    def __init__(self, env):
        # Hyperparameters for policy estimation, read from config.cfg
        self.policy_estimation_model = config['POLICY_ESTIMATION']['model']
        self.entr_coeff = float(config['POLICY_ESTIMATION']['entr_coeff'])  # entropy coefficient for policy estimation
        self.adam_lr = float(config['POLICY_ESTIMATION']['adam_lr'])
        self.adam_epoch = int(config['POLICY_ESTIMATION']['adam_epoch'])
        self.env = env
        self.setTorchSeed()

    def setTorchSeed(self):
        seed = config['PARAMS']['seed']
        try:
            torch.manual_seed(int(seed))
        except ValueError:
            # Non-integer seed in the config: leave the RNG unseeded
            pass

    def maximum_likelihood(self, malfl):
        """
        Entropy-regularized maximum likelihood estimation with the Adam optimizer.
        """
        for agent in [malfl.agent_one, malfl.agent_two]:
            # One unnormalized logit per (state, state, action) triple
            parameters = [
                nn.Parameter(torch.rand(size=[self.env.n_states, self.env.n_states, self.env.n_actions],
                                        device=device),
                             requires_grad=True)]
            optimizer = torch.optim.Adam(parameters, lr=self.adam_lr)
            C = torch.tensor(agent.counter, device=device)  # observed counts collected by this agent
            for epoch in range(self.adam_epoch):
                dist = Categorical(torch.exp(parameters[0]))  # normalizes over the action dimension
                log_probs = torch.log(dist.probs)
                entropy_matrix = dist.entropy()
                loss = -((log_probs * C).sum() + self.entr_coeff * (entropy_matrix * C.sum(-1)).sum())
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            # Store the fitted policy estimate as a normalized numpy array
            agent.estimated_other_pi = Categorical(torch.exp(parameters[0]).cpu()).probs.detach().numpy()

    def estimate_policies_from_trajectories(self, malfl=None):
        if self.policy_estimation_model == "MLE":
            self.maximum_likelihood(malfl)
        elif self.policy_estimation_model == 'TEST':
            # Use the actual policies instead of estimating them
            malfl.agent_two.estimated_other_pi = copy.deepcopy(malfl.agent_one.pi)
            malfl.agent_one.estimated_other_pi = copy.deepcopy(malfl.agent_two.pi)
        else:
            print("select one of ['TEST', 'MLE'] for policy estimation")
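
# A minimal usage sketch (not part of the original module). It assumes a
# config.cfg like the commented example above is present, and it stands in
# hypothetical SimpleNamespace objects for the env and the two agents that
# the rest of the repository would normally provide.
if __name__ == "__main__":
    from types import SimpleNamespace
    import numpy as np

    n_states, n_actions = 3, 2
    env = SimpleNamespace(n_states=n_states, n_actions=n_actions)

    def make_agent():
        # counter[s, s', a]: how often action a was observed for the state pair (s, s')
        return SimpleNamespace(
            counter=np.random.randint(0, 5, size=(n_states, n_states, n_actions)),
            pi=None,
            estimated_other_pi=None,
        )

    malfl = SimpleNamespace(agent_one=make_agent(), agent_two=make_agent())

    estimator = PolicyEstimation(env)
    estimator.estimate_policies_from_trajectories(malfl)
    print(malfl.agent_one.estimated_other_pi.shape)  # (n_states, n_states, n_actions)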