'''
Doctor Strange (Monte Carlo simulation) AGENT

This agent uses the env object. The env object is still under construction;
use it at your own risk.

Doctor Strange sees several possible futures, then takes the average of them
to choose the best move. Since placing a bomb most likely causes death, it
does not do so. Its simulations are entirely random, and it averages over
the possible opponent moves.
'''
import time
import random

class agent:
    def __init__(self, player_num, env):
        self.name = "doctor strange"
        self.player_num = player_num  # player_num is 1-based; subtract 1 when indexing players
        self.env = env
        self.round_time = 3  # the agent thinks for 3 seconds each round

    def give_next_move(self, solid_state):
        '''
        This method is called each time the player needs to choose an action.
        solid_state: a dictionary containing all the information about the board
        '''
        self.board = solid_state["board"]
        self.done = solid_state["done"]
        self.bombs = solid_state["bombs"]
        self.turn = solid_state["turn"]
        self.player = solid_state["players"][self.player_num - 1]
        player1_moves, player2_moves = self.env.get_valid_actions(solid_state)
        if self.player_num == 1:
            my_moves = player1_moves
            enemy_moves = player2_moves
        else:
            my_moves = player2_moves
            enemy_moves = player1_moves
        # Build one MC node per available move; each node holds every state
        # reachable when our move is fixed and the enemy plays any valid move.
        list_MC_nodes = []
        for mm in my_moves:
            list_future_states = []  # our move is fixed within this list
            for em in enemy_moves:
                if self.player_num == 1:
                    joint_move = (mm, em)
                else:
                    joint_move = (em, mm)
                list_future_states.append(self.env.next_state(solid_state, joint_move))
            list_MC_nodes.append(MC_node(list_future_states, mm))
        # Run random simulations until the round's thinking time runs out.
        timeout = time.time() + self.round_time  # round_time seconds from now
        counter = 0
        while time.time() < timeout:
            counter += 1
            temp_MC_node = random.choice(list_MC_nodes)
            temp_reward = self.run_simulation(temp_MC_node.get_a_state())
            temp_MC_node.update_value(temp_reward)
        print("number of simulations: ", counter)
        # Choose the MC node with the highest average reward, and thus the best move.
        optimal_mc = None
        highest_value = 0  # placeholder; overwritten on the first iteration
        for mc_node in list_MC_nodes:
            if optimal_mc is None or mc_node.average_reward > highest_value:
                optimal_mc = mc_node
                highest_value = mc_node.average_reward
        action = optimal_mc.action
        return action
    def run_simulation(self, solid_state):
        '''
        Runs a random simulation until it reaches a terminal state,
        then returns the reward of that terminal state.
        '''
        temp_state = solid_state
        while not temp_state["done"]:
            p1_m, p2_m = self.env.get_valid_actions(temp_state)
            joint_move = (random.choice(p1_m), random.choice(p2_m))
            temp_state = self.env.next_state(temp_state, joint_move)
        # temp_state is now a terminal state
        return temp_state["players"][self.player_num - 1].score  # might need generalising to all players

class MC_node:
    def __init__(self, state_list, action):
        '''
        MC_node holds the immediate successor states of one candidate move.
        '''
        self.state_list = state_list
        self.action = action  # the move our agent plays to reach these states
        self.average_reward = 0
        self.visit_count = 0
        self.probability_of_states = {}
        # for s in state_list:
        #     self.probability_of_states[s] = 0  # would track the more probable enemy moves

    def get_a_state(self):
        '''
        This method needs to be changed so it samples states by probability;
        for now it picks uniformly at random.
        '''
        return random.choice(self.state_list)

    def update_value(self, new_result):
        # Keep an incremental running average of simulation rewards. (The
        # original version summed rewards without dividing by the number of
        # visits, which biased the choice toward frequently sampled nodes.)
        self.visit_count += 1
        self.average_reward += (new_result - self.average_reward) / self.visit_count
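

# Minimal usage sketch (not part of the original file). The env object is
# still under construction, so the module name, constructor, and reset()
# below are hypothetical stand-ins; they only illustrate the call flow the
# agent expects: get_valid_actions(state) -> (p1_moves, p2_moves) and
# next_state(state, joint_move) -> state.
if __name__ == "__main__":
    from bomber_env import Env  # hypothetical module providing the env object

    env = Env()
    solid_state = env.reset()  # assumed to return the solid_state dictionary
    strange = agent(player_num=1, env=env)
    while not solid_state["done"]:
        my_move = strange.give_next_move(solid_state)
        # Play a random valid move for the enemy (player 2) in this sketch.
        enemy_move = random.choice(env.get_valid_actions(solid_state)[1])
        solid_state = env.next_state(solid_state, (my_move, enemy_move))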